From 354161b37c6465a32073eac5f16fa35939af2bb4 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota" <cota@braap.org>
Date: Mon, 27 Jun 2016 15:02:08 -0400
Subject: [PATCH] target-arm: emulate LL/SC using cmpxchg helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Emulating LL/SC with cmpxchg is not correct, since it can suffer from
the ABA problem.  Portable parallel code, however, is written assuming
only cmpxchg--and not LL/SC--is available.  This means that in practice
emulating LL/SC with cmpxchg is a viable alternative.

The appended patch emulates LL/SC pairs in ARM with cmpxchg helpers.
This works in both user and system mode.  In user mode, it avoids
pausing all other CPUs to perform the LL/SC pair.  The resulting
performance and scalability improvement is significant, as the plots
below show.  They plot the throughput of atomic_add-bench compiled for
ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/aNQpB

[Four ASCII plots elided; see the hi-res link above.  Each shows
atomic_add-bench throughput (1000000 ops/thread) vs. number of threads
(1-64), comparing this patch ("cmpxchg") against "master", for the
ranges [0,1], [0,2], [0,128] and [0,1024].  In all four, cmpxchg
outperforms master, and its advantage grows as the range widens.]
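As a plain-C sketch of the emulation idea (not part of the patch: the
emu_ldrex/emu_strex names are made up here, and gcc's __atomic builtins
stand in for the generated TCG ops), a store-exclusive succeeds iff a
compare-and-swap against the value remembered by the load-exclusive
succeeds:

    #include <stdbool.h>
    #include <stdint.h>

    /* Analogue of the per-CPU cpu_exclusive_addr/cpu_exclusive_val. */
    static uint32_t *exclusive_addr;
    static uint32_t exclusive_val;

    static uint32_t emu_ldrex(uint32_t *addr)
    {
        exclusive_addr = addr;
        exclusive_val = __atomic_load_n(addr, __ATOMIC_ACQUIRE);
        return exclusive_val;
    }

    /* Returns 0 on success and 1 on failure, as STREX writes to Rd.
     * The address is compared manually; the value comparison is folded
     * into the cmpxchg.  An A->B->A change of *addr by another thread
     * between the pair goes undetected (ABA), unlike with a real
     * exclusive monitor -- which is fine for code written against
     * cmpxchg semantics. */
    static int emu_strex(uint32_t *addr, uint32_t newval)
    {
        uint32_t expected = exclusive_val;

        if (addr != exclusive_addr) {
            return 1;
        }
        return __atomic_compare_exchange_n(addr, &expected, newval, false,
                                           __ATOMIC_SEQ_CST,
                                           __ATOMIC_SEQ_CST) ? 0 : 1;
    }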
[rth: Enforce alignment for ldrexd.]

Reviewed-by: Alex Bennée
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-23-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson
---
 target-arm/translate.c | 140 +++++++++++++----------------------------
 1 file changed, 45 insertions(+), 95 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index ee2bf57f90..cc97d1ed46 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -1026,9 +1026,6 @@ DO_GEN_LD(8u, MO_UB)
 DO_GEN_LD(16s, MO_SW)
 DO_GEN_LD(16u, MO_UW)
 DO_GEN_LD(32u, MO_UL)
-/* 'a' variants include an alignment check */
-DO_GEN_LD(16ua, MO_UW | MO_ALIGN)
-DO_GEN_LD(32ua, MO_UL | MO_ALIGN)
 DO_GEN_ST(8, MO_UB)
 DO_GEN_ST(16, MO_UW)
 DO_GEN_ST(32, MO_UL)
@@ -7720,45 +7717,30 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
 
 /* Load/Store exclusive instructions are implemented by remembering
    the value/address loaded, and seeing if these are the same
-   when the store is performed. This should be sufficient to implement
+   when the store is performed.  This should be sufficient to implement
    the architecturally mandated semantics, and avoids having to monitor
-   regular stores.
-
-   In system emulation mode only one CPU will be running at once, so
-   this sequence is effectively atomic.  In user emulation mode we
-   throw an exception and handle the atomic operation elsewhere.  */
+   regular stores.  The compare vs the remembered value is done during
+   the cmpxchg operation, but we must compare the addresses manually.  */
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                TCGv_i32 addr, int size)
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
+    TCGMemOp opc = size | MO_ALIGN | s->be_data;
 
     s->is_ldex = true;
 
-    switch (size) {
-    case 0:
-        gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
-        break;
-    case 1:
-        gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s));
-        break;
-    case 2:
-    case 3:
-        gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s));
-        break;
-    default:
-        abort();
-    }
-
     if (size == 3) {
         TCGv_i32 tmp2 = tcg_temp_new_i32();
-        TCGv_i32 tmp3 = tcg_temp_new_i32();
+        TCGv_i64 t64 = tcg_temp_new_i64();
 
-        tcg_gen_addi_i32(tmp2, addr, 4);
-        gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
-        tcg_temp_free_i32(tmp2);
-        tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3);
-        store_reg(s, rt2, tmp3);
+        gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc);
+        tcg_gen_mov_i64(cpu_exclusive_val, t64);
+        tcg_gen_extr_i64_i32(tmp, tmp2, t64);
+        tcg_temp_free_i64(t64);
+
+        store_reg(s, rt2, tmp2);
     } else {
+        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc);
         tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
     }
@@ -7771,23 +7753,15 @@ static void gen_clrex(DisasContext *s)
 {
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
 }
 
-#ifdef CONFIG_USER_ONLY
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
                                 TCGv_i32 addr, int size)
 {
-    tcg_gen_extu_i32_i64(cpu_exclusive_test, addr);
-    tcg_gen_movi_i32(cpu_exclusive_info,
-                     size | (rd << 4) | (rt << 8) | (rt2 << 12));
-    gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv_i32 addr, int size)
-{
-    TCGv_i32 tmp;
-    TCGv_i64 val64, extaddr;
+    TCGv_i32 t0, t1, t2;
+    TCGv_i64 extaddr;
+    TCGv taddr;
     TCGLabel *done_label;
     TCGLabel *fail_label;
+    TCGMemOp opc = size | MO_ALIGN | s->be_data;
 
     /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
          [addr] = {Rt};
@@ -7802,69 +7776,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
     tcg_temp_free_i64(extaddr);
 
-    tmp = tcg_temp_new_i32();
-    switch (size) {
-    case 0:
-        gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
-        break;
-    case 1:
-        gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
-        break;
-    case 2:
-    case 3:
-        gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
-        break;
-    default:
-        abort();
-    }
-
-    val64 = tcg_temp_new_i64();
+    taddr = gen_aa32_addr(s, addr, opc);
+    t0 = tcg_temp_new_i32();
+    t1 = load_reg(s, rt);
     if (size == 3) {
-        TCGv_i32 tmp2 = tcg_temp_new_i32();
-        TCGv_i32 tmp3 = tcg_temp_new_i32();
-        tcg_gen_addi_i32(tmp2, addr, 4);
-        gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
-        tcg_temp_free_i32(tmp2);
-        tcg_gen_concat_i32_i64(val64, tmp, tmp3);
-        tcg_temp_free_i32(tmp3);
-    } else {
-        tcg_gen_extu_i32_i64(val64, tmp);
-    }
-    tcg_temp_free_i32(tmp);
+        TCGv_i64 o64 = tcg_temp_new_i64();
+        TCGv_i64 n64 = tcg_temp_new_i64();
 
-    tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
-    tcg_temp_free_i64(val64);
+        t2 = load_reg(s, rt2);
+        tcg_gen_concat_i32_i64(n64, t1, t2);
+        tcg_temp_free_i32(t2);
+        gen_aa32_frob64(s, n64);
 
-    tmp = load_reg(s, rt);
-    switch (size) {
-    case 0:
-        gen_aa32_st8(s, tmp, addr, get_mem_index(s));
-        break;
-    case 1:
-        gen_aa32_st16(s, tmp, addr, get_mem_index(s));
-        break;
-    case 2:
-    case 3:
-        gen_aa32_st32(s, tmp, addr, get_mem_index(s));
-        break;
-    default:
-        abort();
-    }
-    tcg_temp_free_i32(tmp);
-    if (size == 3) {
-        tcg_gen_addi_i32(addr, addr, 4);
-        tmp = load_reg(s, rt2);
-        gen_aa32_st32(s, tmp, addr, get_mem_index(s));
-        tcg_temp_free_i32(tmp);
+        tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64,
+                                   get_mem_index(s), opc);
+        tcg_temp_free_i64(n64);
+
+        gen_aa32_frob64(s, o64);
+        tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val);
+        tcg_gen_extrl_i64_i32(t0, o64);
+
+        tcg_temp_free_i64(o64);
+    } else {
+        t2 = tcg_temp_new_i32();
+        tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val);
+        tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc);
+        tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2);
+        tcg_temp_free_i32(t2);
     }
-    tcg_gen_movi_i32(cpu_R[rd], 0);
+    tcg_temp_free_i32(t1);
+    tcg_temp_free(taddr);
+    tcg_gen_mov_i32(cpu_R[rd], t0);
+    tcg_temp_free_i32(t0);
     tcg_gen_br(done_label);
+
     gen_set_label(fail_label);
     tcg_gen_movi_i32(cpu_R[rd], 1);
     gen_set_label(done_label);
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
 }
-#endif
 
 /* gen_srs:
  * @env: CPUARMState
-- 
GitLab
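[Editor's note, not part of the patch: the Rd status value produced by
the new 32-bit path can be modelled in plain C as below; cmpxchg32 is a
hypothetical stand-in for the operation emitted by
tcg_gen_atomic_cmpxchg_i32, which yields the value previously in
memory.  The setcond(NE) against the remembered value then gives 0 on
success and 1 on failure, exactly the STREX status written to Rd.]

    #include <stdint.h>

    /* Hypothetical stand-in for the generated atomic cmpxchg: stores
     * newv at addr iff *addr == cmpv, and returns the old value. */
    extern uint32_t cmpxchg32(uint32_t *addr, uint32_t cmpv, uint32_t newv);

    static uint32_t strex_result(uint32_t *taddr, uint32_t exclusive_val,
                                 uint32_t newval)
    {
        uint32_t old = cmpxchg32(taddr, exclusive_val, newval);

        /* 0 if the store happened (old matched the remembered value),
         * 1 otherwise. */
        return old != exclusive_val;
    }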