Message ID | 1467392693-22715-12-git-send-email-rth@twiddle.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jul 01, 2016 at 10:04:37 -0700, Richard Henderson wrote: > From: "Emilio G. Cota" <cota@braap.org> > > The diff here is uglier than necessary. All this does is to turn > > FOO > > into: > > if (s->prefix & PREFIX_LOCK) { > BAR > } else { > FOO > } > > where FOO is the original implementation of an unlocked cmpxchg. > > [rth: Adjust unlocked cmpxchg to use movcond instead of branches. > Adjust helpers to use atomic helpers.] > > Signed-off-by: Emilio G. Cota <cota@braap.org> > Message-Id: <1467054136-10430-6-git-send-email-cota@braap.org> > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > target-i386/mem_helper.c | 96 ++++++++++++++++++++++++++++++++++++++---------- > target-i386/translate.c | 87 +++++++++++++++++++++---------------------- > 2 files changed, 120 insertions(+), 63 deletions(-) > > diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c > index c2f4769..5c0558f 100644 > --- a/target-i386/mem_helper.c > +++ b/target-i386/mem_helper.c > @@ -22,6 +22,8 @@ > #include "exec/helper-proto.h" > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > +#include "qemu/int128.h" > +#include "tcg.h" > > /* broken thread support */ > > @@ -58,20 +60,39 @@ void helper_lock_init(void) > > void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) > { > - uint64_t d; > + uintptr_t ra = GETPC(); > + uint64_t oldv, cmpv, newv; > int eflags; > > eflags = cpu_cc_compute_all(env, CC_OP); > - d = cpu_ldq_data_ra(env, a0, GETPC()); > - if (d == (((uint64_t)env->regs[R_EDX] << 32) | (uint32_t)env->regs[R_EAX])) { > - cpu_stq_data_ra(env, a0, ((uint64_t)env->regs[R_ECX] << 32) > - | (uint32_t)env->regs[R_EBX], GETPC()); > - eflags |= CC_Z; > + > + cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); > + newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); > + > + if (parallel_cpus) { I think here we mean 'if (prefix_locked)', although prefix_locked isn't here. 
This is why in my original patch I defined two helpers ('locked' and 'unlocked'), otherwise we'll emulate cmpxchg atomically even when the LOCK prefix wasn't there. Not a big deal, but could hide bugs. > +#ifdef CONFIG_USER_ONLY > + uint64_t *haddr = g2h(a0); > + cmpv = cpu_to_le64(cmpv); > + newv = cpu_to_le64(newv); > + oldv = atomic_cmpxchg(haddr, cmpv, newv); > + oldv = le64_to_cpu(oldv); > +#else > + int mem_idx = cpu_mmu_index(env, false); > + TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx); > + oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra); > +#endif > } else { > + oldv = cpu_ldq_data_ra(env, a0, ra); > + newv = (cmpv == oldv ? newv : oldv); > /* always do the store */ > - cpu_stq_data_ra(env, a0, d, GETPC()); > - env->regs[R_EDX] = (uint32_t)(d >> 32); > - env->regs[R_EAX] = (uint32_t)d; > + cpu_stq_data_ra(env, a0, newv, ra); > + } > + > + if (oldv == cmpv) { > + eflags |= CC_Z; > + } else { > + env->regs[R_EAX] = (uint32_t)oldv; > + env->regs[R_EDX] = (uint32_t)(oldv >> 32); > eflags &= ~CC_Z; > } > CC_SRC = eflags; > @@ -80,25 +101,60 @@ void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) > #ifdef TARGET_X86_64 > void helper_cmpxchg16b(CPUX86State *env, target_ulong a0) > { > - uint64_t d0, d1; > + uintptr_t ra = GETPC(); > + Int128 oldv, cmpv, newv; > int eflags; > + bool success; > > if ((a0 & 0xf) != 0) { > raise_exception_ra(env, EXCP0D_GPF, GETPC()); > } > eflags = cpu_cc_compute_all(env, CC_OP); > - d0 = cpu_ldq_data_ra(env, a0, GETPC()); > - d1 = cpu_ldq_data_ra(env, a0 + 8, GETPC()); > - if (d0 == env->regs[R_EAX] && d1 == env->regs[R_EDX]) { > - cpu_stq_data_ra(env, a0, env->regs[R_EBX], GETPC()); > - cpu_stq_data_ra(env, a0 + 8, env->regs[R_ECX], GETPC()); > + > + cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]); > + newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]); > + > + if (parallel_cpus) { Ditto. FWIW I tested the x86_64 bits with all the ck_pr regression tests in concurrencykit.
Would be nice to get access to a big-endian machine to test the byte-ordering bits -- I sent a request to the GCC compile farm earlier today for this purpose. Emilio
On Thu, Jul 07, 2016 at 23:08:17 -0400, Emilio G. Cota wrote: > On Fri, Jul 01, 2016 at 10:04:37 -0700, Richard Henderson wrote: > > From: "Emilio G. Cota" <cota@braap.org> > > > > The diff here is uglier than necessary. All this does is to turn > > > > FOO > > > > into: > > > > if (s->prefix & PREFIX_LOCK) { > > BAR > > } else { > > FOO > > } > > > > where FOO is the original implementation of an unlocked cmpxchg. > > > > [rth: Adjust unlocked cmpxchg to use movcond instead of branches. > > Adjust helpers to use atomic helpers.] > > > > Signed-off-by: Emilio G. Cota <cota@braap.org> > > Message-Id: <1467054136-10430-6-git-send-email-cota@braap.org> > > Signed-off-by: Richard Henderson <rth@twiddle.net> > > --- > > target-i386/mem_helper.c | 96 ++++++++++++++++++++++++++++++++++++++---------- > > target-i386/translate.c | 87 +++++++++++++++++++++---------------------- > > 2 files changed, 120 insertions(+), 63 deletions(-) > > > > diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c > > index c2f4769..5c0558f 100644 > > --- a/target-i386/mem_helper.c > > +++ b/target-i386/mem_helper.c > > @@ -22,6 +22,8 @@ > > #include "exec/helper-proto.h" > > #include "exec/exec-all.h" > > #include "exec/cpu_ldst.h" > > +#include "qemu/int128.h" > > +#include "tcg.h" > > > > /* broken thread support */ > > > > @@ -58,20 +60,39 @@ void helper_lock_init(void) > > > > void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) > > { > > - uint64_t d; > > + uintptr_t ra = GETPC(); > > + uint64_t oldv, cmpv, newv; > > int eflags; > > > > eflags = cpu_cc_compute_all(env, CC_OP); > > - d = cpu_ldq_data_ra(env, a0, GETPC()); > > - if (d == (((uint64_t)env->regs[R_EDX] << 32) | (uint32_t)env->regs[R_EAX])) { > > - cpu_stq_data_ra(env, a0, ((uint64_t)env->regs[R_ECX] << 32) > > - | (uint32_t)env->regs[R_EBX], GETPC()); > > - eflags |= CC_Z; > > + > > + cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); > > + newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); 
> > + > > + if (parallel_cpus) { > > I think here we mean 'if (prefix_locked)', although prefix_locked isn't > here. This is why in my original patch I defined two helpers ('locked' > and 'unlocked'), otherwise we'll emulate cmpxchg atomically even when > the LOCK prefix wasn't there. Not a big deal, but could hide bugs. Correction to myself: we'd still need if (prefix_locked) for the whole stop-the-world-and-do-non-atomically thing to work. The concern that we might hide bugs by emulating non-locked cmpxchg atomically persists. [ I wonder why one would ever use a non-locked cmpxchg, though ] E.
diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c index c2f4769..5c0558f 100644 --- a/target-i386/mem_helper.c +++ b/target-i386/mem_helper.c @@ -22,6 +22,8 @@ #include "exec/helper-proto.h" #include "exec/exec-all.h" #include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "tcg.h" /* broken thread support */ @@ -58,20 +60,39 @@ void helper_lock_init(void) void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) { - uint64_t d; + uintptr_t ra = GETPC(); + uint64_t oldv, cmpv, newv; int eflags; eflags = cpu_cc_compute_all(env, CC_OP); - d = cpu_ldq_data_ra(env, a0, GETPC()); - if (d == (((uint64_t)env->regs[R_EDX] << 32) | (uint32_t)env->regs[R_EAX])) { - cpu_stq_data_ra(env, a0, ((uint64_t)env->regs[R_ECX] << 32) - | (uint32_t)env->regs[R_EBX], GETPC()); - eflags |= CC_Z; + + cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]); + newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]); + + if (parallel_cpus) { +#ifdef CONFIG_USER_ONLY + uint64_t *haddr = g2h(a0); + cmpv = cpu_to_le64(cmpv); + newv = cpu_to_le64(newv); + oldv = atomic_cmpxchg(haddr, cmpv, newv); + oldv = le64_to_cpu(oldv); +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx); + oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra); +#endif } else { + oldv = cpu_ldq_data_ra(env, a0, ra); + newv = (cmpv == oldv ? 
newv : oldv); /* always do the store */ - cpu_stq_data_ra(env, a0, d, GETPC()); - env->regs[R_EDX] = (uint32_t)(d >> 32); - env->regs[R_EAX] = (uint32_t)d; + cpu_stq_data_ra(env, a0, newv, ra); + } + + if (oldv == cmpv) { + eflags |= CC_Z; + } else { + env->regs[R_EAX] = (uint32_t)oldv; + env->regs[R_EDX] = (uint32_t)(oldv >> 32); eflags &= ~CC_Z; } CC_SRC = eflags; @@ -80,25 +101,60 @@ void helper_cmpxchg8b(CPUX86State *env, target_ulong a0) #ifdef TARGET_X86_64 void helper_cmpxchg16b(CPUX86State *env, target_ulong a0) { - uint64_t d0, d1; + uintptr_t ra = GETPC(); + Int128 oldv, cmpv, newv; int eflags; + bool success; if ((a0 & 0xf) != 0) { raise_exception_ra(env, EXCP0D_GPF, GETPC()); } eflags = cpu_cc_compute_all(env, CC_OP); - d0 = cpu_ldq_data_ra(env, a0, GETPC()); - d1 = cpu_ldq_data_ra(env, a0 + 8, GETPC()); - if (d0 == env->regs[R_EAX] && d1 == env->regs[R_EDX]) { - cpu_stq_data_ra(env, a0, env->regs[R_EBX], GETPC()); - cpu_stq_data_ra(env, a0 + 8, env->regs[R_ECX], GETPC()); + + cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]); + newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]); + + if (parallel_cpus) { +#ifndef CONFIG_ATOMIC128 + cpu_loop_exit_atomic(ENV_GET_CPU(env), ra); +#elif defined(CONFIG_USER_ONLY) + Int128 *haddr = g2h(a0); + oldv = cmpv; +#ifdef HOST_WORDS_BIGENDIAN + oldv = bswap128(oldv); + newv = bswap128(newv); +#endif + success = __atomic_compare_exchange_16(haddr, &oldv, newv, false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +#ifdef HOST_WORDS_BIGENDIAN + oldv = bswap128(oldv); +#endif +#else + int mem_idx = cpu_mmu_index(env, false); + TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx); + oldv = helper_atomic_cmpxchgo_le_mmu(env, a0, cmpv, newv, oi, ra); + success = int128_eq(oldv, cmpv); +#endif + } else { + uint64_t o0 = cpu_ldq_data_ra(env, a0 + 0, ra); + uint64_t o1 = cpu_ldq_data_ra(env, a0 + 8, ra); + + oldv = int128_make128(o0, o1); + success = int128_eq(oldv, cmpv); + if (!success) { + newv = oldv; + } 
+ + cpu_stq_data_ra(env, a0 + 0, int128_getlo(newv), ra); + cpu_stq_data_ra(env, a0 + 8, int128_gethi(newv), ra); + } + + if (success) { eflags |= CC_Z; } else { - /* always do the store */ - cpu_stq_data_ra(env, a0, d0, GETPC()); - cpu_stq_data_ra(env, a0 + 8, d1, GETPC()); - env->regs[R_EDX] = d1; - env->regs[R_EAX] = d0; + env->regs[R_EAX] = int128_getlo(oldv); + env->regs[R_EDX] = int128_gethi(oldv); eflags &= ~CC_Z; } CC_SRC = eflags; diff --git a/target-i386/translate.c b/target-i386/translate.c index 7dea18b..2244f38 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -5070,57 +5070,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, case 0x1b0: case 0x1b1: /* cmpxchg Ev, Gv */ { - TCGLabel *label1, *label2; - TCGv t0, t1, t2, a0; + TCGv oldv, newv, cmpv; ot = mo_b_d(b, dflag); modrm = cpu_ldub_code(env, s->pc++); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; - t0 = tcg_temp_local_new(); - t1 = tcg_temp_local_new(); - t2 = tcg_temp_local_new(); - a0 = tcg_temp_local_new(); - gen_op_mov_v_reg(ot, t1, reg); - if (mod == 3) { - rm = (modrm & 7) | REX_B(s); - gen_op_mov_v_reg(ot, t0, rm); - } else { + oldv = tcg_temp_new(); + newv = tcg_temp_new(); + cmpv = tcg_temp_new(); + gen_op_mov_v_reg(ot, newv, reg); + tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]); + + if (s->prefix & PREFIX_LOCK) { + if (mod == 3) { + goto illegal_op; + } gen_lea_modrm(env, s, modrm); - tcg_gen_mov_tl(a0, cpu_A0); - gen_op_ld_v(s, ot, t0, a0); - rm = 0; /* avoid warning */ - } - label1 = gen_new_label(); - tcg_gen_mov_tl(t2, cpu_regs[R_EAX]); - gen_extu(ot, t0); - gen_extu(ot, t2); - tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1); - label2 = gen_new_label(); - if (mod == 3) { - gen_op_mov_reg_v(ot, R_EAX, t0); - tcg_gen_br(label2); - gen_set_label(label1); - gen_op_mov_reg_v(ot, rm, t1); + tcg_gen_atomic_cmpxchg_tl(oldv, cpu_A0, cmpv, newv, + s->mem_index, ot | MO_LE); + gen_op_mov_reg_v(ot, R_EAX, oldv); } else { - /* perform no-op store cycle 
like physical cpu; must be - before changing accumulator to ensure idempotency if - the store faults and the instruction is restarted */ - gen_op_st_v(s, ot, t0, a0); - gen_op_mov_reg_v(ot, R_EAX, t0); - tcg_gen_br(label2); - gen_set_label(label1); - gen_op_st_v(s, ot, t1, a0); - } - gen_set_label(label2); - tcg_gen_mov_tl(cpu_cc_src, t0); - tcg_gen_mov_tl(cpu_cc_srcT, t2); - tcg_gen_sub_tl(cpu_cc_dst, t2, t0); + if (mod == 3) { + rm = (modrm & 7) | REX_B(s); + gen_op_mov_v_reg(ot, oldv, rm); + } else { + gen_lea_modrm(env, s, modrm); + gen_op_ld_v(s, ot, oldv, cpu_A0); + rm = 0; /* avoid warning */ + } + gen_extu(ot, oldv); + gen_extu(ot, cmpv); + /* store value = (old == cmp ? new : old); */ + tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); + if (mod == 3) { + gen_op_mov_reg_v(ot, R_EAX, oldv); + gen_op_mov_reg_v(ot, rm, newv); + } else { + /* Perform an unconditional store cycle like physical cpu; + must be before changing accumulator to ensure + idempotency if the store faults and the instruction + is restarted */ + gen_op_st_v(s, ot, newv, cpu_A0); + gen_op_mov_reg_v(ot, R_EAX, oldv); + } + } + tcg_gen_mov_tl(cpu_cc_src, oldv); + tcg_gen_mov_tl(cpu_cc_srcT, cmpv); + tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv); set_cc_op(s, CC_OP_SUBB + ot); - tcg_temp_free(t0); - tcg_temp_free(t1); - tcg_temp_free(t2); - tcg_temp_free(a0); + tcg_temp_free(oldv); + tcg_temp_free(newv); + tcg_temp_free(cmpv); } break; case 0x1c7: /* cmpxchg8b */