Message ID | 20160618040343.19517-3-bobby.prani@gmail.com (mailing list archive)
---|---
State | New, archived
On 18/06/2016 06:03, Pranith Kumar wrote:
> Generate mfence/sfence/lfence instruction on SSE2 enabled
> processors. For older processors, generate a 'lock orl $0,0(%esp)'
> instruction which has full ordering semantics.
>
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> [rth: Check for sse2, fallback to locked memory op otherwise.]
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 317484c..0748652 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -121,6 +121,16 @@ static bool have_cmov;
>  # define have_cmov 0
>  #endif
>
> +/* For 32-bit, we are going to attempt to determine at runtime whether
> +   sse2 support is available. */
> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
> +# define have_sse2 1
> +#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
> +static bool have_sse2;
> +#else
> +# define have_sse2 0
> +#endif
> +
>  /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
>     going to attempt to determine at runtime whether movbe is available. */
>  #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
> @@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
>      }
>  }
>
> +static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
> +{
> +    if (have_sse2) {
> +        tcg_out16(s, 0xae0f);
> +        switch (a0 & TCG_MO_ALL) {
> +        case TCG_MO_LD_LD:
> +            /* lfence */
> +            tcg_out8(s, 0xe8);
> +            break;
> +        case TCG_MO_ST_ST:
> +            /* sfence */
> +            tcg_out8(s, 0xf8);
> +            break;

These two barriers are unnecessary on x86, and so is TCG_MO_LD_ST.

> +        default:
> +            /* mfence */
> +            tcg_out8(s, 0xf0);
> +            break;

Please use lock orl here too, it turns out to be faster.

> +        }
> +    } else {
> +        /* lock orl $0,0(%esp) */
> +        tcg_out8(s, 0xf0);
> +        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
> +        tcg_out8(s, 0);

This is only needed for TCG_MO_ST_LD.

Paolo

> +    }
> +}
> +
>  static inline void tcg_out_push(TCGContext *s, int reg)
>  {
>      tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
> @@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>
> +    case INDEX_op_mb:
> +        assert(args[0] != 0);
> +        tcg_out_mb(s, args[0]);
> +        break;
>      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_mov_i64:
>      case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
> @@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>      { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>
> +    { INDEX_op_mb, { } },
> +
>  #if TCG_TARGET_REG_BITS == 32
>      { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
>      { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> @@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
>         available, we'll use a small forward branch.  */
>      have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_sse2
> +    /* Likewise, almost all hardware supports SSE2, but we do
> +       have a locked memory operation to use as a substitute.  */
> +    have_sse2 = (d & bit_SSE2) != 0;
> +#endif
>  #ifndef have_movbe
>      /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
>         need to probe for it.  */
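For reference, a minimal sketch of where those three comments point: with lfence/sfence dropped and the locked memory operation standing in for mfence, only the store-load case needs to emit anything at all. This is an illustration built on the helper names used in the patch above, not the respun patch itself.

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* On x86, loads are not reordered with loads and stores are not
       reordered with stores, so only a store-load ordering needs code.
       A locked read-modify-write of the stack slot is a full barrier
       and, as noted above, cheaper than mfence.  */
    if (a0 & TCG_MO_ST_LD) {
        /* lock orl $0,0(%esp) */
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}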
Pranith Kumar <bobby.prani@gmail.com> writes:

> Generate mfence/sfence/lfence instruction on SSE2 enabled
> processors. For older processors, generate a 'lock orl $0,0(%esp)'
> instruction which has full ordering semantics.
>
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> [rth: Check for sse2, fallback to locked memory op otherwise.]
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 317484c..0748652 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -121,6 +121,16 @@ static bool have_cmov;
>  # define have_cmov 0
>  #endif
>
> +/* For 32-bit, we are going to attempt to determine at runtime whether
> +   sse2 support is available. */
> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)

Hmm checkpatch.pl warns against including architecture specific defines.
Is the || leg only going to trigger when building 32 bit x86 with custom
compiler flags to force SSE2 code? Perhaps it is worth just leaving this
case to the cpuid code?

> +# define have_sse2 1
> +#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
> +static bool have_sse2;
> +#else
> +# define have_sse2 0
> +#endif

I was going to say the mixing of define and parameter seems a bit icky
but I see other code in this function does the same thing.

> +
>  /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
>     going to attempt to determine at runtime whether movbe is available. */
>  #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
> @@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
>      }
>  }
>
> +static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
> +{
> +    if (have_sse2) {
> +        tcg_out16(s, 0xae0f);
> +        switch (a0 & TCG_MO_ALL) {
> +        case TCG_MO_LD_LD:
> +            /* lfence */
> +            tcg_out8(s, 0xe8);
> +            break;
> +        case TCG_MO_ST_ST:
> +            /* sfence */
> +            tcg_out8(s, 0xf8);
> +            break;
> +        default:
> +            /* mfence */
> +            tcg_out8(s, 0xf0);
> +            break;
> +        }
> +    } else {
> +        /* lock orl $0,0(%esp) */
> +        tcg_out8(s, 0xf0);
> +        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
> +        tcg_out8(s, 0);
> +    }
> +}
> +
>  static inline void tcg_out_push(TCGContext *s, int reg)
>  {
>      tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
> @@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>
> +    case INDEX_op_mb:
> +        assert(args[0] != 0);

Please use tcg_debug_assert for this.

> +        tcg_out_mb(s, args[0]);
> +        break;
>      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_mov_i64:
>      case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
> @@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>      { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>
> +    { INDEX_op_mb, { } },
> +
>  #if TCG_TARGET_REG_BITS == 32
>      { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
>      { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> @@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
>         available, we'll use a small forward branch.  */
>      have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_sse2
> +    /* Likewise, almost all hardware supports SSE2, but we do
> +       have a locked memory operation to use as a substitute.  */
> +    have_sse2 = (d & bit_SSE2) != 0;
> +#endif
>  #ifndef have_movbe
>      /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
>         need to probe for it.  */

--
Alex Bennée
On 06/22/2016 09:25 AM, Alex Bennée wrote:
>
> Pranith Kumar <bobby.prani@gmail.com> writes:
>
>> Generate mfence/sfence/lfence instruction on SSE2 enabled
>> processors. For older processors, generate a 'lock orl $0,0(%esp)'
>> instruction which has full ordering semantics.
>>
>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>> [rth: Check for sse2, fallback to locked memory op otherwise.]
>> Signed-off-by: Richard Henderson <rth@twiddle.net>
>> ---
>>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 47 insertions(+)
>>
>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
>> index 317484c..0748652 100644
>> --- a/tcg/i386/tcg-target.inc.c
>> +++ b/tcg/i386/tcg-target.inc.c
>> @@ -121,6 +121,16 @@ static bool have_cmov;
>>  # define have_cmov 0
>>  #endif
>>
>> +/* For 32-bit, we are going to attempt to determine at runtime whether
>> +   sse2 support is available. */
>> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
>
> Hmm checkpatch.pl warns against including architecture specific defines.
> Is the || leg only going to trigger when building 32 bit x86 with custom
> compiler flags to force SSE2 code?

Yes, e.g. -march=native.

I think checkpatch should be ignored in this situation. There's precedent
elsewhere in the tcg backends. And it's definitely architecture specific
code. ;-)


r~
Richard Henderson <rth@twiddle.net> writes:

> On 06/22/2016 09:25 AM, Alex Bennée wrote:
>>
>> Pranith Kumar <bobby.prani@gmail.com> writes:
>>
>>> Generate mfence/sfence/lfence instruction on SSE2 enabled
>>> processors. For older processors, generate a 'lock orl $0,0(%esp)'
>>> instruction which has full ordering semantics.
>>>
>>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>>> [rth: Check for sse2, fallback to locked memory op otherwise.]
>>> Signed-off-by: Richard Henderson <rth@twiddle.net>
>>> ---
>>>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 47 insertions(+)
>>>
>>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
>>> index 317484c..0748652 100644
>>> --- a/tcg/i386/tcg-target.inc.c
>>> +++ b/tcg/i386/tcg-target.inc.c
>>> @@ -121,6 +121,16 @@ static bool have_cmov;
>>>  # define have_cmov 0
>>>  #endif
>>>
>>> +/* For 32-bit, we are going to attempt to determine at runtime whether
>>> +   sse2 support is available. */
>>> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
>>
>> Hmm checkpatch.pl warns against including architecture specific defines.
>> Is the || leg only going to trigger when building 32 bit x86 with custom
>> compiler flags to force SSE2 code?
>
> Yes, e.g. -march=native.
>
> I think checkpatch should be ignored in this situation. There's precedent
> elsewhere in the tcg backends. And it's definitely architecture specific code.
> ;-)

Fair enough, I bow to your maintainerly view ;-)

--
Alex Bennée
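As an aside on the runtime path discussed above: when neither 64-bit mode nor __SSE2__ settles the question at compile time, the backend falls back to the cpuid probe in tcg_target_init(), reusing the leaf-1 EDX value (`d`) already read for the CMOV check. Below is a stand-alone sketch of that probe, assuming GCC's <cpuid.h>; the function name is made up for illustration and is not part of the backend.

#include <cpuid.h>      /* GCC/Clang: __get_cpuid() and bit_SSE2 */
#include <stdbool.h>

/* Illustrative stand-alone version of the SSE2 probe.  __get_cpuid()
   returns 0 when the requested leaf is unsupported, in which case we
   conservatively report no SSE2 and the backend would fall back to the
   locked memory operation.  */
static bool probe_sse2(void)
{
    unsigned int a, b, c, d;

    if (__get_cpuid(1, &a, &b, &c, &d)) {
        return (d & bit_SSE2) != 0;   /* EDX bit 26 */
    }
    return false;
}

Bit 26 of EDX from CPUID leaf 1 is the architectural SSE2 feature flag, which is exactly what cpuid.h's bit_SSE2 expands to.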
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 317484c..0748652 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -121,6 +121,16 @@ static bool have_cmov;
 # define have_cmov 0
 #endif
 
+/* For 32-bit, we are going to attempt to determine at runtime whether
+   sse2 support is available. */
+#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
+# define have_sse2 1
+#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
+static bool have_sse2;
+#else
+# define have_sse2 0
+#endif
+
 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
    going to attempt to determine at runtime whether movbe is available. */
 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
@@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
     }
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    if (have_sse2) {
+        tcg_out16(s, 0xae0f);
+        switch (a0 & TCG_MO_ALL) {
+        case TCG_MO_LD_LD:
+            /* lfence */
+            tcg_out8(s, 0xe8);
+            break;
+        case TCG_MO_ST_ST:
+            /* sfence */
+            tcg_out8(s, 0xf8);
+            break;
+        default:
+            /* mfence */
+            tcg_out8(s, 0xf0);
+            break;
+        }
+    } else {
+        /* lock orl $0,0(%esp) */
+        tcg_out8(s, 0xf0);
+        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
+        tcg_out8(s, 0);
+    }
+}
+
 static inline void tcg_out_push(TCGContext *s, int reg)
 {
     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
@@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_mb:
+        assert(args[0] != 0);
+        tcg_out_mb(s, args[0]);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
     { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
 
+    { INDEX_op_mb, { } },
+
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
@@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
        available, we'll use a small forward branch.  */
     have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_sse2
+    /* Likewise, almost all hardware supports SSE2, but we do
+       have a locked memory operation to use as a substitute.  */
+    have_sse2 = (d & bit_SSE2) != 0;
+#endif
 #ifndef have_movbe
     /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
        need to probe for it.  */