
[RFC,v3,02/14] tcg/i386: Add support for fence

Message ID 20160618040343.19517-3-bobby.prani@gmail.com (mailing list archive)
State New, archived

Commit Message

Pranith Kumar June 18, 2016, 4:03 a.m. UTC
Generate an mfence/sfence/lfence instruction on SSE2-enabled
processors. For older processors, generate a 'lock orl $0,0(%esp)'
instruction instead, which has full ordering semantics.

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
[rth: Check for sse2, fallback to locked memory op otherwise.]
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
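
For cross-checking against an x86 opcode reference, the byte sequences the
patch emits can be written out as a small compilable sketch (illustrative
variable names only, not part of the patch):

    /* SSE2 fences: a shared 0f ae opcode, selected by the ModRM byte. */
    static const unsigned char lfence_bytes[3] = { 0x0f, 0xae, 0xe8 };
    static const unsigned char sfence_bytes[3] = { 0x0f, 0xae, 0xf8 };
    static const unsigned char mfence_bytes[3] = { 0x0f, 0xae, 0xf0 };
    /* Pre-SSE2 fallback "lock orl $0,0(%esp)": lock prefix, Grp1 or-with-imm8
       opcode, ModRM+SIB addressing (%esp), then the $0 immediate. */
    static const unsigned char lock_orl_bytes[5] = { 0xf0, 0x83, 0x0c, 0x24, 0x00 };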

Comments

Paolo Bonzini June 21, 2016, 7:24 a.m. UTC | #1
On 18/06/2016 06:03, Pranith Kumar wrote:
> Generate mfence/sfence/lfence instruction on SSE2 enabled
> processors. For older processors, generate a 'lock orl $0,0(%esp)'
> instruction which has full ordering semantics.
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> [rth: Check for sse2, fallback to locked memory op otherwise.]
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
> 
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 317484c..0748652 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -121,6 +121,16 @@ static bool have_cmov;
>  # define have_cmov 0
>  #endif
>  
> +/* For 32-bit, we are going to attempt to determine at runtime whether
> +   sse2 support is available.  */
> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
> +# define have_sse2 1
> +#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
> +static bool have_sse2;
> +#else
> +# define have_sse2 0
> +#endif
> +
>  /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
>     going to attempt to determine at runtime whether movbe is available.  */
>  #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
> @@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
>      }
>  }
>  
> +static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
> +{
> +    if (have_sse2) {
> +        tcg_out16(s, 0xae0f);
> +        switch (a0 & TCG_MO_ALL) {
> +        case TCG_MO_LD_LD:
> +            /* lfence */
> +            tcg_out8(s, 0xe8);
> +            break;
> +        case TCG_MO_ST_ST:
> +            /* sfence */
> +            tcg_out8(s, 0xf8);
> +            break;

These two barriers are unnecessary on x86, and so is TCG_MO_LD_ST.

> +        default:
> +            /* mfence */
> +            tcg_out8(s, 0xf0);
> +            break;

Please use lock orl here too, it turns out to be faster.

> +        }
> +    } else {
> +        /* lock orl $0,0(%esp) */
> +        tcg_out8(s, 0xf0);
> +        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
> +        tcg_out8(s, 0);

This is only needed for TCG_MO_ST_LD.
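
Taken together, a minimal sketch of what the function could reduce to,
reusing only helpers already present in this patch (untested):

    static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
    {
        /* x86 is TSO: load/load, store/store and load/store ordering are
           already guaranteed, so only a store->load barrier needs code.  */
        if (a0 & TCG_MO_ST_LD) {
            /* lock orl $0,0(%esp) -- a locked RMW on the stack, which in
               practice is faster than mfence.  */
            tcg_out8(s, 0xf0);
            tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
            tcg_out8(s, 0);
        }
    }

With that, the have_sse2 probing would no longer be needed for the barrier
path at all.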

Paolo

> +    }
> +}
> +
>  static inline void tcg_out_push(TCGContext *s, int reg)
>  {
>      tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
> @@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>  
> +    case INDEX_op_mb:
> +        assert(args[0] != 0);
> +        tcg_out_mb(s, args[0]);
> +        break;
>      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_mov_i64:
>      case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
> @@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>      { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>  
> +    { INDEX_op_mb, { } },
> +
>  #if TCG_TARGET_REG_BITS == 32
>      { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
>      { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> @@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
>             available, we'll use a small forward branch.  */
>          have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_sse2
> +        /* Likewise, almost all hardware supports SSE2, but we do
> +           have a locked memory operation to use as a substitute.  */
> +        have_sse2 = (d & bit_SSE2) != 0;
> +#endif
>  #ifndef have_movbe
>          /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
>             need to probe for it.  */
>
Alex Bennée June 22, 2016, 4:25 p.m. UTC | #2
Pranith Kumar <bobby.prani@gmail.com> writes:

> Generate mfence/sfence/lfence instruction on SSE2 enabled
> processors. For older processors, generate a 'lock orl $0,0(%esp)'
> instruction which has full ordering semantics.
>
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> [rth: Check for sse2, fallback to locked memory op otherwise.]
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 317484c..0748652 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -121,6 +121,16 @@ static bool have_cmov;
>  # define have_cmov 0
>  #endif
>
> +/* For 32-bit, we are going to attempt to determine at runtime whether
> +   sse2 support is available.  */
> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)

Hmm checkpatch.pl warns against including architecture specific defines.
Is the || leg only going to trigger when building 32 bit x86 with custom
compiler flags to force SSE2 code? Perhaps it is worth just leaving this
case to the cpuid code?
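
I.e. roughly (sketch only), so that 32-bit always goes through the runtime
probe:

    #if TCG_TARGET_REG_BITS == 64
    # define have_sse2 1
    #elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
    static bool have_sse2;
    #else
    # define have_sse2 0
    #endif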

> +# define have_sse2 1
> +#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
> +static bool have_sse2;
> +#else
> +# define have_sse2 0
> +#endif

I was going to say the mixing of define and parameter seems a bit icky
but I see other code in this function does the same thing.

> +
>  /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
>     going to attempt to determine at runtime whether movbe is available.  */
>  #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
> @@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
>      }
>  }
>
> +static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
> +{
> +    if (have_sse2) {
> +        tcg_out16(s, 0xae0f);
> +        switch (a0 & TCG_MO_ALL) {
> +        case TCG_MO_LD_LD:
> +            /* lfence */
> +            tcg_out8(s, 0xe8);
> +            break;
> +        case TCG_MO_ST_ST:
> +            /* sfence */
> +            tcg_out8(s, 0xf8);
> +            break;
> +        default:
> +            /* mfence */
> +            tcg_out8(s, 0xf0);
> +            break;
> +        }
> +    } else {
> +        /* lock orl $0,0(%esp) */
> +        tcg_out8(s, 0xf0);
> +        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
> +        tcg_out8(s, 0);
> +    }
> +}
> +
>  static inline void tcg_out_push(TCGContext *s, int reg)
>  {
>      tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
> @@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>
> +    case INDEX_op_mb:
> +        assert(args[0] != 0);

Please use tcg_debug_assert for this.
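
I.e. (sketch):

    case INDEX_op_mb:
        tcg_debug_assert(args[0] != 0);
        tcg_out_mb(s, args[0]);
        break;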

> +        tcg_out_mb(s, args[0]);
> +        break;
>      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_mov_i64:
>      case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
> @@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>      { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>
> +    { INDEX_op_mb, { } },
> +
>  #if TCG_TARGET_REG_BITS == 32
>      { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
>      { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> @@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
>             available, we'll use a small forward branch.  */
>          have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_sse2
> +        /* Likewise, almost all hardware supports SSE2, but we do
> +           have a locked memory operation to use as a substitute.  */
> +        have_sse2 = (d & bit_SSE2) != 0;
> +#endif
>  #ifndef have_movbe
>          /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
>             need to probe for it.  */


--
Alex Bennée
Richard Henderson June 22, 2016, 4:49 p.m. UTC | #3
On 06/22/2016 09:25 AM, Alex Bennée wrote:
> 
> Pranith Kumar <bobby.prani@gmail.com> writes:
> 
>> Generate mfence/sfence/lfence instruction on SSE2 enabled
>> processors. For older processors, generate a 'lock orl $0,0(%esp)'
>> instruction which has full ordering semantics.
>>
>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>> [rth: Check for sse2, fallback to locked memory op otherwise.]
>> Signed-off-by: Richard Henderson <rth@twiddle.net>
>> ---
>>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 47 insertions(+)
>>
>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
>> index 317484c..0748652 100644
>> --- a/tcg/i386/tcg-target.inc.c
>> +++ b/tcg/i386/tcg-target.inc.c
>> @@ -121,6 +121,16 @@ static bool have_cmov;
>>  # define have_cmov 0
>>  #endif
>>
>> +/* For 32-bit, we are going to attempt to determine at runtime whether
>> +   sse2 support is available.  */
>> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
> 
> Hmm checkpatch.pl warns against including architecture specific defines.
> Is the || leg only going to trigger when building 32 bit x86 with custom
> compiler flags to force SSE2 code?

Yes, e.g. -march=native.

I think checkpatch should be ignored in this situation.  There's precedent
elsewhere in the tcg backends.  And it's definitely architecture specific code.
 ;-)


r~
Alex Bennée June 22, 2016, 6:18 p.m. UTC | #4
Richard Henderson <rth@twiddle.net> writes:

> On 06/22/2016 09:25 AM, Alex Bennée wrote:
>>
>> Pranith Kumar <bobby.prani@gmail.com> writes:
>>
>>> Generate mfence/sfence/lfence instruction on SSE2 enabled
>>> processors. For older processors, generate a 'lock orl $0,0(%esp)'
>>> instruction which has full ordering semantics.
>>>
>>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
>>> [rth: Check for sse2, fallback to locked memory op otherwise.]
>>> Signed-off-by: Richard Henderson <rth@twiddle.net>
>>> ---
>>>  tcg/i386/tcg-target.inc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 47 insertions(+)
>>>
>>> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
>>> index 317484c..0748652 100644
>>> --- a/tcg/i386/tcg-target.inc.c
>>> +++ b/tcg/i386/tcg-target.inc.c
>>> @@ -121,6 +121,16 @@ static bool have_cmov;
>>>  # define have_cmov 0
>>>  #endif
>>>
>>> +/* For 32-bit, we are going to attempt to determine at runtime whether
>>> +   sse2 support is available.  */
>>> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
>>
>> Hmm checkpatch.pl warns against including architecture specific defines.
>> Is the || leg only going to trigger when building 32 bit x86 with custom
>> compiler flags to force SSE2 code?
>
> Yes, e.g. -march=native.
>
> I think checkpatch should be ignored in this situation.  There's precedent
> elsewhere in the tcg backends.  And it's definitely architecture specific code.
>  ;-)

Fair enough, I bow to your maintainerly view ;-)

--
Alex Bennée

Patch

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 317484c..0748652 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -121,6 +121,16 @@  static bool have_cmov;
 # define have_cmov 0
 #endif
 
+/* For 32-bit, we are going to attempt to determine at runtime whether
+   sse2 support is available.  */
+#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
+# define have_sse2 1
+#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
+static bool have_sse2;
+#else
+# define have_sse2 0
+#endif
+
 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
    going to attempt to determine at runtime whether movbe is available.  */
 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
@@ -686,6 +696,32 @@  static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
     }
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    if (have_sse2) {
+        tcg_out16(s, 0xae0f);
+        switch (a0 & TCG_MO_ALL) {
+        case TCG_MO_LD_LD:
+            /* lfence */
+            tcg_out8(s, 0xe8);
+            break;
+        case TCG_MO_ST_ST:
+            /* sfence */
+            tcg_out8(s, 0xf8);
+            break;
+        default:
+            /* mfence */
+            tcg_out8(s, 0xf0);
+            break;
+        }
+    } else {
+        /* lock orl $0,0(%esp) */
+        tcg_out8(s, 0xf0);
+        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
+        tcg_out8(s, 0);
+    }
+}
+
 static inline void tcg_out_push(TCGContext *s, int reg)
 {
     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
@@ -2120,6 +2156,10 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_mb:
+        assert(args[0] != 0);
+        tcg_out_mb(s, args[0]);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -2185,6 +2225,8 @@  static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
     { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
 
+    { INDEX_op_mb, { } },
+
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
@@ -2362,6 +2404,11 @@  static void tcg_target_init(TCGContext *s)
            available, we'll use a small forward branch.  */
         have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_sse2
+        /* Likewise, almost all hardware supports SSE2, but we do
+           have a locked memory operation to use as a substitute.  */
+        have_sse2 = (d & bit_SSE2) != 0;
+#endif
 #ifndef have_movbe
         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
            need to probe for it.  */