[3/4] xen/x86: Replace incorrect mandatory barriers with SMP barriers

Message ID 1480932317-22962-4-git-send-email-andrew.cooper3@citrix.com (mailing list archive)
State New, archived

Commit Message

Andrew Cooper Dec. 5, 2016, 10:05 a.m. UTC
Mandatory barriers are only for use with reduced-cacheability memory mappings.

All of these uses are just to deal with shared memory between multiple
processors, so use the smp_*() variants, which are the correct barriers for the purpose.

This is no functional change, because Xen currently assigns the smp_* meaning
to non-smp_* barriers.  This will be fixed in the following patch.
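
For reference, the current x86 mapping being described is roughly as follows
(a sketch pieced together from statements later in this thread, not the
literal contents of xen/include/asm-x86/system.h):

    #define barrier()  asm volatile ( "" ::: "memory" )        /* compiler only */
    #define mb()       asm volatile ( "mfence" ::: "memory" )  /* full fence    */
    #define rmb()      barrier()   /* currently only a compiler barrier on x86  */
    #define wmb()      barrier()   /* likewise                                  */
    #define smp_mb()   mb()        /* the smp_* variants simply alias the above,*/
    #define smp_rmb()  rmb()       /* i.e. the non-smp_* names already carry    */
    #define smp_wmb()  wmb()       /* the smp_* meaning                         */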

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>

Restricting to just the $ARCH maintainers, as this is a project-wide sweep.
---
 xen/arch/x86/acpi/cpu_idle.c             |  8 ++++----
 xen/arch/x86/cpu/mcheck/amd_nonfatal.c   |  4 ++--
 xen/arch/x86/cpu/mcheck/barrier.c        | 10 +++++-----
 xen/arch/x86/cpu/mcheck/mce.c            |  2 +-
 xen/arch/x86/cpu/mcheck/mctelem.c        | 10 +++++-----
 xen/arch/x86/genapic/x2apic.c            |  6 +++---
 xen/arch/x86/hpet.c                      |  6 +++---
 xen/arch/x86/hvm/ioreq.c                 |  4 ++--
 xen/arch/x86/io_apic.c                   |  4 ++--
 xen/arch/x86/irq.c                       |  4 ++--
 xen/arch/x86/mm.c                        | 10 +++++-----
 xen/arch/x86/mm/shadow/multi.c           |  2 +-
 xen/arch/x86/smpboot.c                   | 12 ++++++------
 xen/arch/x86/time.c                      |  8 ++++----
 xen/drivers/passthrough/amd/iommu_init.c |  4 ++--
 xen/include/asm-x86/desc.h               |  8 ++++----
 xen/include/asm-x86/system.h             |  2 +-
 17 files changed, 52 insertions(+), 52 deletions(-)

Comments

Jan Beulich Dec. 5, 2016, 11:47 a.m. UTC | #1
>>> On 05.12.16 at 11:05, <andrew.cooper3@citrix.com> wrote:
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -391,9 +391,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>  
>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>      {
> -        mb();
> +        smp_mb();
>          clflush((void *)&mwait_wakeup(cpu));
> -        mb();
> +        smp_mb();
>      }

Both need to stay as they are afaict: In order for the clflush() to do
what we want we have to order it wrt earlier as well as later writes,
regardless of SMP-ness. Or wait - the SDM has changed in that
respect (and a footnote describes the earlier specified behavior now).
AMD, otoh, continues to require MFENCE for ordering purposes.

> --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
> +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
> @@ -175,7 +175,7 @@ static void mce_amd_work_fn(void *data)
>  			/* Counter enable */
>  			value |= (1ULL << 51);
>  			mca_wrmsr(MSR_IA32_MCx_MISC(4), value);
> -			wmb();
> +			smp_wmb();
>  		}
>  	}
>  
> @@ -240,7 +240,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
>  			value |= (1ULL << 51);
>  			wrmsrl(MSR_IA32_MCx_MISC(4), value);
>  			/* serialize */
> -			wmb();
> +			smp_wmb();
>  			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
>  		}
>  	}

These will need confirming by AMD engineers.

> --- a/xen/arch/x86/cpu/mcheck/barrier.c
> +++ b/xen/arch/x86/cpu/mcheck/barrier.c
> @@ -12,7 +12,7 @@ void mce_barrier_init(struct mce_softirq_barrier *bar)
>  void mce_barrier_dec(struct mce_softirq_barrier *bar)
>  {
>      atomic_inc(&bar->outgen);
> -    wmb();
> +    smp_wmb();
>      atomic_dec(&bar->val);
>  }

This being x86-specific code - do we need a barrier here at all,
between two LOCKed instructions? (Same for two more cases further
down.)

> @@ -433,7 +433,7 @@ mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
>  	}
>  
>  	mctelem_processing_hold(tep);
> -	wmb();
> +	smp_wmb();
>  	spin_unlock(&processing_lock);

Don't we imply unlocks to be write barriers?

> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -1486,7 +1486,7 @@ static int __init timer_irq_works(void)
>      unsigned long t1, flags;
>  
>      t1 = pit0_ticks;
> -    mb();
> +    smp_mb();
>  
>      local_save_flags(flags);
>      local_irq_enable();
> @@ -1501,7 +1501,7 @@ static int __init timer_irq_works(void)
>       * might have cached one ExtINT interrupt.  Finally, at
>       * least one tick may be lost due to delays.
>       */
> -    mb();
> +    smp_mb();
>      if (pit0_ticks - t1 > 4)
>          return 1;

I think these two can be barrier().

> @@ -124,7 +124,7 @@ static void synchronize_tsc_master(unsigned int slave)
>      for ( i = 1; i <= 5; i++ )
>      {
>          tsc_value = rdtsc_ordered();
> -        wmb();
> +        smp_wmb();
>          atomic_inc(&tsc_count);

Same question as above wrt the following LOCKed instruction.

> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -559,7 +559,7 @@ static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
>              return;
>          }
>          udelay(1);
> -        rmb();
> +        smp_rmb();
>          code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
>                                        IOMMU_EVENT_CODE_SHIFT);
>      }
> @@ -664,7 +664,7 @@ void parse_ppr_log_entry(struct amd_iommu *iommu, u32 entry[])
>              return;
>          }
>          udelay(1);
> -        rmb();
> +        smp_rmb();
>          code = get_field_from_reg_u32(entry[1], IOMMU_PPR_LOG_CODE_MASK,
>                                        IOMMU_PPR_LOG_CODE_SHIFT);
>      }

Don't these fall into the "reduced-cacheability memory mappings"
category? Or if the mappings these reads go through are UC,
aren't they unneeded altogether? In any event this would better be
a separate patch Cc-ed to the AMD IOMMU maintainers.

Jan
Andrew Cooper Dec. 5, 2016, 1:29 p.m. UTC | #2
On 05/12/16 11:47, Jan Beulich wrote:
>>>> On 05.12.16 at 11:05, <andrew.cooper3@citrix.com> wrote:
>> --- a/xen/arch/x86/acpi/cpu_idle.c
>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>> @@ -391,9 +391,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>  
>>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>>      {
>> -        mb();
>> +        smp_mb();
>>          clflush((void *)&mwait_wakeup(cpu));
>> -        mb();
>> +        smp_mb();
>>      }
> Both need to stay as they are afaict: In order for the clflush() to do
> what we want we have to order it wrt earlier as well as later writes,
> regardless of SMP-ness. Or wait - the SDM has changed in that
> respect (and a footnote describes the earlier specified behavior now).
> AMD, otoh, continues to require MFENCE for ordering purposes.

mb() == smp_mb().  They are both mfence instructions.

However, if AMD specifically requires mfence, we should explicitly use
that rather than relying on the implementation details of smp_mb().

>
>> --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>> +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>> @@ -175,7 +175,7 @@ static void mce_amd_work_fn(void *data)
>>  			/* Counter enable */
>>  			value |= (1ULL << 51);
>>  			mca_wrmsr(MSR_IA32_MCx_MISC(4), value);
>> -			wmb();
>> +			smp_wmb();
>>  		}
>>  	}
>>  
>> @@ -240,7 +240,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
>>  			value |= (1ULL << 51);
>>  			wrmsrl(MSR_IA32_MCx_MISC(4), value);
>>  			/* serialize */
>> -			wmb();
>> +			smp_wmb();
>>  			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
>>  		}
>>  	}
> These will need confirming by AMD engineers.

I was uncertain whether these were necessary at all, but as identified
in the commit message, this is no functional change as Xen currently has
rmb/wmb as plain barriers, not fence instructions.

>
>> --- a/xen/arch/x86/cpu/mcheck/barrier.c
>> +++ b/xen/arch/x86/cpu/mcheck/barrier.c
>> @@ -12,7 +12,7 @@ void mce_barrier_init(struct mce_softirq_barrier *bar)
>>  void mce_barrier_dec(struct mce_softirq_barrier *bar)
>>  {
>>      atomic_inc(&bar->outgen);
>> -    wmb();
>> +    smp_wmb();
>>      atomic_dec(&bar->val);
>>  }
> This being x86-specific code - do we need a barrier here at all,
> between two LOCKed instructions? (Same for two more cases further
> down.)

Hmm, I think not.  The C semantics guarantees ordering of the
atomic_inc() and atomic_dec(), and they are both writes, so they will be
strictly ordered.
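
As a minimal sketch of that conclusion (an illustration only; the patch
itself keeps an smp_wmb() here):

    void mce_barrier_dec(struct mce_softirq_barrier *bar)
    {
        atomic_inc(&bar->outgen);   /* LOCKed RMW, globally ordered           */
        atomic_dec(&bar->val);      /* LOCKed RMW, cannot pass the inc above  */
    }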

>
>> @@ -433,7 +433,7 @@ mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
>>  	}
>>  
>>  	mctelem_processing_hold(tep);
>> -	wmb();
>> +	smp_wmb();
>>  	spin_unlock(&processing_lock);
> Don't we imply unlocks to be write barriers?

They are, as an unlock is necessarily a write, combined with x86's
ordering guarantees.

Then again, I am not sure how this would interact with TSX, so am not
sure if we should assume or rely on such behaviour.

>
>> --- a/xen/arch/x86/io_apic.c
>> +++ b/xen/arch/x86/io_apic.c
>> @@ -1486,7 +1486,7 @@ static int __init timer_irq_works(void)
>>      unsigned long t1, flags;
>>  
>>      t1 = pit0_ticks;
>> -    mb();
>> +    smp_mb();
>>  
>>      local_save_flags(flags);
>>      local_irq_enable();
>> @@ -1501,7 +1501,7 @@ static int __init timer_irq_works(void)
>>       * might have cached one ExtINT interrupt.  Finally, at
>>       * least one tick may be lost due to delays.
>>       */
>> -    mb();
>> +    smp_mb();
>>      if (pit0_ticks - t1 > 4)
>>          return 1;
> I think these two can be barrier().

These were originally in the previous patch, but I wasn't so certain and
moved them back here.

Thinking about it further, pit0_ticks is only ever updated by the
current cpu, so as long as the compiler doesn't elide the read, it
should be fine.
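
In other words, something like the following would suffice (a sketch of the
weaker alternative being discussed here, not what this patch does):

    t1 = pit0_ticks;
    barrier();              /* compiler-only: don't cache/elide the read */

    /* ... enable interrupts and wait for the PIT to tick a few times ... */

    barrier();              /* force a fresh read of pit0_ticks below */
    if (pit0_ticks - t1 > 4)
        return 1;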

>
>> @@ -124,7 +124,7 @@ static void synchronize_tsc_master(unsigned int slave)
>>      for ( i = 1; i <= 5; i++ )
>>      {
>>          tsc_value = rdtsc_ordered();
>> -        wmb();
>> +        smp_wmb();
>>          atomic_inc(&tsc_count);
> Same question as above wrt the following LOCKed instruction.

I'm not sure the locked instruction is relevant here.  C's
ordering-model is sufficient to make this correct.

>
>> --- a/xen/drivers/passthrough/amd/iommu_init.c
>> +++ b/xen/drivers/passthrough/amd/iommu_init.c
>> @@ -559,7 +559,7 @@ static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
>>              return;
>>          }
>>          udelay(1);
>> -        rmb();
>> +        smp_rmb();
>>          code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
>>                                        IOMMU_EVENT_CODE_SHIFT);
>>      }
>> @@ -664,7 +664,7 @@ void parse_ppr_log_entry(struct amd_iommu *iommu, u32 entry[])
>>              return;
>>          }
>>          udelay(1);
>> -        rmb();
>> +        smp_rmb();
>>          code = get_field_from_reg_u32(entry[1], IOMMU_PPR_LOG_CODE_MASK,
>>                                        IOMMU_PPR_LOG_CODE_SHIFT);
>>      }
> Don't these fall into the "reduced-cacheability memory mappings"
> category? Or if the mappings these reads go through are UC,
> aren't they unneeded altogether? In any event this would better be
> a separate patch Cc-ed to the AMD IOMMU maintainers.

I can't find any requirements in the AMD IOMMU spec about the
cacheability of mappings.

We use UC mappings, although frankly for the starting memset alone we
would probably be better off using WB followed by an explicit cache
flush, then switching to UC.

With UC mappings, we don't need any barriers at all.  I will submit a
patch separately removing them.

~Andrew
Jan Beulich Dec. 5, 2016, 2:03 p.m. UTC | #3
>>> On 05.12.16 at 14:29, <andrew.cooper3@citrix.com> wrote:
> On 05/12/16 11:47, Jan Beulich wrote:
>>>>> On 05.12.16 at 11:05, <andrew.cooper3@citrix.com> wrote:
>>> --- a/xen/arch/x86/acpi/cpu_idle.c
>>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>>> @@ -391,9 +391,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>>  
>>>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>>>      {
>>> -        mb();
>>> +        smp_mb();
>>>          clflush((void *)&mwait_wakeup(cpu));
>>> -        mb();
>>> +        smp_mb();
>>>      }
>> Both need to stay as they are afaict: In order for the clflush() to do
>> what we want we have to order it wrt earlier as well as later writes,
>> regardless of SMP-ness. Or wait - the SDM has changed in that
>> respect (and a footnote describes the earlier specified behavior now).
>> AMD, otoh, continues to require MFENCE for ordering purposes.
> 
> mb() == smp_mb().  They are both mfence instructions.

Of course. But still smp_mb() would be less correct from an
abstract perspective, as here we care only about the local CPU.
That said, ...

> However, if AMD specifically requires mfence, we should explicitly use
> that rather than relying on the implementation details of smp_mb().

... I'd be fine with this.

>>> --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>>> +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>>> @@ -175,7 +175,7 @@ static void mce_amd_work_fn(void *data)
>>>  			/* Counter enable */
>>>  			value |= (1ULL << 51);
>>>  			mca_wrmsr(MSR_IA32_MCx_MISC(4), value);
>>> -			wmb();
>>> +			smp_wmb();
>>>  		}
>>>  	}
>>>  
>>> @@ -240,7 +240,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
>>>  			value |= (1ULL << 51);
>>>  			wrmsrl(MSR_IA32_MCx_MISC(4), value);
>>>  			/* serialize */
>>> -			wmb();
>>> +			smp_wmb();
>>>  			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
>>>  		}
>>>  	}
>> These will need confirming by AMD engineers.
> 
> I was uncertain whether these were necessary at all, but as identified
> in the commit message, this is no functional change as Xen currently has
> rmb/wmb as plain barriers, not fence instructions.

And may hence be subtly broken, if this code was lifted from Linux?

>>> @@ -433,7 +433,7 @@ mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
>>>  	}
>>>  
>>>  	mctelem_processing_hold(tep);
>>> -	wmb();
>>> +	smp_wmb();
>>>  	spin_unlock(&processing_lock);
>> Don't we imply unlocks to be write barriers?
> 
> They are, as an unlock is necessarily a write, combined with x86's
> ordering guarantees.
> 
> Then again, I am not sure how this would interact with TSX, so am not
> sure if we should assume or rely on such behaviour.

Isn't architectural state at the end of a transactional region
indistinguishable as to whether TSX was actually used or the
abort path taken (assuming the two code paths don't differ
in their actions)?

>>> --- a/xen/arch/x86/io_apic.c
>>> +++ b/xen/arch/x86/io_apic.c
>>> @@ -1486,7 +1486,7 @@ static int __init timer_irq_works(void)
>>>      unsigned long t1, flags;
>>>  
>>>      t1 = pit0_ticks;
>>> -    mb();
>>> +    smp_mb();
>>>  
>>>      local_save_flags(flags);
>>>      local_irq_enable();
>>> @@ -1501,7 +1501,7 @@ static int __init timer_irq_works(void)
>>>       * might have cached one ExtINT interrupt.  Finally, at
>>>       * least one tick may be lost due to delays.
>>>       */
>>> -    mb();
>>> +    smp_mb();
>>>      if (pit0_ticks - t1 > 4)
>>>          return 1;
>> I think these two can be barrier().
> 
> These were originally in the previous patch, but I wasn't so certain and
> moved them back here.
> 
> Thinking about it further, pit0_ticks is only ever updated by the
> current cpu, so as long as the compiler doesn't elide the read, it
> should be fine.

That was exactly my thinking when giving the comment.

>>> @@ -124,7 +124,7 @@ static void synchronize_tsc_master(unsigned int slave)
>>>      for ( i = 1; i <= 5; i++ )
>>>      {
>>>          tsc_value = rdtsc_ordered();
>>> -        wmb();
>>> +        smp_wmb();
>>>          atomic_inc(&tsc_count);
>> Same question as above wrt the following LOCKed instruction.
> 
> I'm not sure the locked instruction is relevant here.  C's
> ordering-model is sufficient to make this correct.

I don't follow - the two involved variables are distinct, so I don't
see how C's ordering model helps here at all. We need (at the
machine level) the write to tsc_value to precede the increment
of tsc_count, and I don't think C alone guarantees any such
ordering.

Jan
Andrew Cooper Dec. 5, 2016, 2:24 p.m. UTC | #4
On 05/12/16 14:03, Jan Beulich wrote:
>>>> On 05.12.16 at 14:29, <andrew.cooper3@citrix.com> wrote:
>> On 05/12/16 11:47, Jan Beulich wrote:
>>>>>> On 05.12.16 at 11:05, <andrew.cooper3@citrix.com> wrote:
>>>> --- a/xen/arch/x86/acpi/cpu_idle.c
>>>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>>>> @@ -391,9 +391,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>>>  
>>>>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>>>>      {
>>>> -        mb();
>>>> +        smp_mb();
>>>>          clflush((void *)&mwait_wakeup(cpu));
>>>> -        mb();
>>>> +        smp_mb();
>>>>      }
>>> Both need to stay as they are afaict: In order for the clflush() to do
>>> what we want we have to order it wrt earlier as well as later writes,
>>> regardless of SMP-ness. Or wait - the SDM has changed in that
>>> respect (and a footnote describes the earlier specified behavior now).
>>> AMD, otoh, continues to require MFENCE for ordering purposes.
>> mb() == smp_mb().  They are both mfence instructions.
> Of course. But still smp_mb() would be less correct from an
> abstract perspective

? That is entirely the purpose and intended meaning of the abstraction.

smp_mb() orders operations such that, as visible to other CPUs in the
system, all writes will have completed before any subsequent reads begin.

> , as here we care only about the local CPU.
> That said, ...
>
>> However, if AMD specifically requires mfence, we should explicitly use
>> that rather than relying on the implementation details of smp_mb().
> ... I'd be fine with this.

An earlier version of the series introduced explicit {m,s,l}fence()
defines.  I will reintroduce these.
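
Presumably something along these lines (a hypothetical sketch; the names
and exact form of the helpers are assumptions):

    #define mfence() asm volatile ( "mfence" ::: "memory" )
    #define sfence() asm volatile ( "sfence" ::: "memory" )
    #define lfence() asm volatile ( "lfence" ::: "memory" )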

>
>>>> --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>>>> +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
>>>> @@ -175,7 +175,7 @@ static void mce_amd_work_fn(void *data)
>>>>  			/* Counter enable */
>>>>  			value |= (1ULL << 51);
>>>>  			mca_wrmsr(MSR_IA32_MCx_MISC(4), value);
>>>> -			wmb();
>>>> +			smp_wmb();
>>>>  		}
>>>>  	}
>>>>  
>>>> @@ -240,7 +240,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
>>>>  			value |= (1ULL << 51);
>>>>  			wrmsrl(MSR_IA32_MCx_MISC(4), value);
>>>>  			/* serialize */
>>>> -			wmb();
>>>> +			smp_wmb();
>>>>  			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
>>>>  		}
>>>>  	}
>>> These will need confirming by AMD engineers.
>> I was uncertain whether these were necessary at all, but as identified
>> in the commit message, this is no functional change as Xen currently has
>> rmb/wmb as plain barriers, not fence instructions.
> And may hence be subtly broken, if this code was lifted from Linux?

It doesn't resemble anything in Linux these days.  I don't know if that
means we have lagged, or it was developed independently.

Looking at the Linux code, there are a few mandatory barriers which
should all be SMP barriers instead (guarding updates of shared memory),
but no barriers at all around MSR reads or writes.

>
>>>> @@ -433,7 +433,7 @@ mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
>>>>  	}
>>>>  
>>>>  	mctelem_processing_hold(tep);
>>>> -	wmb();
>>>> +	smp_wmb();
>>>>  	spin_unlock(&processing_lock);
>>> Don't we imply unlocks to be write barriers?
>> They are, as an unlock is necessarily a write, combined with x86's
>> ordering guarantees.
>>
>> Then again, I am not sure how this would interact with TSX, so am not
>> sure if we should assume or rely on such behaviour.
> Isn't architectural state at the end of a transactional region
> indistinguishable as to whether TSX was actually used or the
> abort path taken (assuming the two code paths don't differ
> in their actions)?

I'd hope so, but I haven't had occasion to dig into TSX in detail yet.

>>>> @@ -124,7 +124,7 @@ static void synchronize_tsc_master(unsigned int slave)
>>>>      for ( i = 1; i <= 5; i++ )
>>>>      {
>>>>          tsc_value = rdtsc_ordered();
>>>> -        wmb();
>>>> +        smp_wmb();
>>>>          atomic_inc(&tsc_count);
>>> Same question as above wrt the following LOCKed instruction.
>> I'm not sure the locked instruction is relevant here.  C's
>> ordering-model is sufficient to make this correct.
> I don't follow - the two involved variables are distinct, so I don't
> see how C's ordering model helps here at all. We need (at the
> machine level) the write to tsc_value to precede the increment
> of tsc_count, and I don't think C alone guarantees any such
> ordering.

Sorry - I looked at that code and thought they were both using tsc_value.

Yes, we do need at least a compiler barrier here.
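
That is, at minimum something like the following (a sketch; per the rest of
the series, smp_wmb() on x86 amounts to exactly such a compiler barrier):

    tsc_value = rdtsc_ordered();
    smp_wmb();                  /* make sure tsc_value is stored before ... */
    atomic_inc(&tsc_count);     /* ... the slave is told it may read it     */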

~Andrew
Jan Beulich Dec. 5, 2016, 2:33 p.m. UTC | #5
>>> On 05.12.16 at 15:24, <andrew.cooper3@citrix.com> wrote:
> On 05/12/16 14:03, Jan Beulich wrote:
>>>>> On 05.12.16 at 14:29, <andrew.cooper3@citrix.com> wrote:
>>> On 05/12/16 11:47, Jan Beulich wrote:
>>>>>>> On 05.12.16 at 11:05, <andrew.cooper3@citrix.com> wrote:
>>>>> --- a/xen/arch/x86/acpi/cpu_idle.c
>>>>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>>>>> @@ -391,9 +391,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int 
> ecx)
>>>>>  
>>>>>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>>>>>      {
>>>>> -        mb();
>>>>> +        smp_mb();
>>>>>          clflush((void *)&mwait_wakeup(cpu));
>>>>> -        mb();
>>>>> +        smp_mb();
>>>>>      }
>>>> Both need to stay as they are afaict: In order for the clflush() to do
>>>> what we want we have to order it wrt earlier as well as later writes,
>>>> regardless of SMP-ness. Or wait - the SDM has changed in that
>>>> respect (and a footnote describes the earlier specified behavior now).
>>>> AMD, otoh, continues to require MFENCE for ordering purposes.
>>> mb() == smp_mb().  They are both mfence instructions.
>> Of course. But still smp_mb() would be less correct from an
>> abstract perspective
> 
> ? That is entirely the purpose and intended meaning of the abstraction.
> 
> smp_mb() orders operations such that, as visible to other CPUs in the
> system, all writes will have completed before any subsequent reads begin.

Yes, but this code is not about multiple CPUs, but just the local
one (we want to make sure CLFLUSH does what we want). Hence
using smp_ prefixed barriers would be wrong. But anyway, with
this becoming explicit mfence() invocations, the discussion is all
moot now.
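
Presumably the code then ends up along these lines (a sketch of the
direction agreed here, assuming the explicit mfence() helper mentioned
above):

    if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
    {
        mfence();                            /* order earlier writes vs CLFLUSH */
        clflush((void *)&mwait_wakeup(cpu));
        mfence();                            /* order CLFLUSH vs the MONITOR    */
    }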

Jan

Patch

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 09c8848..b481059 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -391,9 +391,9 @@  void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 
     if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
     {
-        mb();
+        smp_mb();
         clflush((void *)&mwait_wakeup(cpu));
-        mb();
+        smp_mb();
     }
 
     __monitor((void *)&mwait_wakeup(cpu), 0, 0);
@@ -756,10 +756,10 @@  void acpi_dead_idle(void)
              * instruction, hence memory fence is necessary to make sure all 
              * load/store visible before flush cache line.
              */
-            mb();
+            smp_mb();
             clflush(mwait_ptr);
             __monitor(mwait_ptr, 0, 0);
-            mb();
+            smp_mb();
             __mwait(cx->address, 0);
         }
     }
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
index 8a80a9f..fb1d41d 100644
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -175,7 +175,7 @@  static void mce_amd_work_fn(void *data)
 			/* Counter enable */
 			value |= (1ULL << 51);
 			mca_wrmsr(MSR_IA32_MCx_MISC(4), value);
-			wmb();
+			smp_wmb();
 		}
 	}
 
@@ -240,7 +240,7 @@  void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
 			value |= (1ULL << 51);
 			wrmsrl(MSR_IA32_MCx_MISC(4), value);
 			/* serialize */
-			wmb();
+			smp_wmb();
 			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
 		}
 	}
diff --git a/xen/arch/x86/cpu/mcheck/barrier.c b/xen/arch/x86/cpu/mcheck/barrier.c
index 5dce1fb..1a2149f 100644
--- a/xen/arch/x86/cpu/mcheck/barrier.c
+++ b/xen/arch/x86/cpu/mcheck/barrier.c
@@ -12,7 +12,7 @@  void mce_barrier_init(struct mce_softirq_barrier *bar)
 void mce_barrier_dec(struct mce_softirq_barrier *bar)
 {
     atomic_inc(&bar->outgen);
-    wmb();
+    smp_wmb();
     atomic_dec(&bar->val);
 }
 
@@ -24,12 +24,12 @@  void mce_barrier_enter(struct mce_softirq_barrier *bar)
         return;
     atomic_inc(&bar->ingen);
     gen = atomic_read(&bar->outgen);
-    mb();
+    smp_mb();
     atomic_inc(&bar->val);
     while ( atomic_read(&bar->val) != num_online_cpus() &&
             atomic_read(&bar->outgen) == gen )
     {
-            mb();
+            smp_mb();
             mce_panic_check();
     }
 }
@@ -42,12 +42,12 @@  void mce_barrier_exit(struct mce_softirq_barrier *bar)
         return;
     atomic_inc(&bar->outgen);
     gen = atomic_read(&bar->ingen);
-    mb();
+    smp_mb();
     atomic_dec(&bar->val);
     while ( atomic_read(&bar->val) != 0 &&
             atomic_read(&bar->ingen) == gen )
     {
-            mb();
+            smp_mb();
             mce_panic_check();
     }
 }
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 460e336..02883fc 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -388,7 +388,7 @@  mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
         else if ( who == MCA_MCE_SCAN && need_clear)
             mcabanks_set(i, clear_bank);
 
-        wmb();
+        smp_wmb();
     }
 
     if (mig && errcnt > 0) {
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
index 95e83c5..01077fe 100644
--- a/xen/arch/x86/cpu/mcheck/mctelem.c
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -414,9 +414,9 @@  static void mctelem_append_processing(mctelem_class_t which)
 		ltep->mcte_prev = *procltp;
 		*procltp = dangling[target];
 	}
-	wmb();
+	smp_wmb();
 	dangling[target] = NULL;
-	wmb();
+	smp_wmb();
 }
 
 mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
@@ -433,7 +433,7 @@  mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
 	}
 
 	mctelem_processing_hold(tep);
-	wmb();
+	smp_wmb();
 	spin_unlock(&processing_lock);
 	return MCTE2COOKIE(tep);
 }
@@ -444,7 +444,7 @@  void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
 
 	spin_lock(&processing_lock);
 	mctelem_processing_release(tep);
-	wmb();
+	smp_wmb();
 	spin_unlock(&processing_lock);
 }
 
@@ -460,6 +460,6 @@  void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
 	spin_lock(&processing_lock);
 	if (tep == mctctl.mctc_processing_head[target])
 		mctelem_processing_release(tep);
-	wmb();
+	smp_wmb();
 	spin_unlock(&processing_lock);
 }
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index d894a98..c63af0c 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -107,12 +107,12 @@  static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
      * CPU is seen by notified remote CPUs. The WRMSR contained within
      * apic_icr_write() can otherwise be executed early.
      * 
-     * The reason mb() is sufficient here is subtle: the register arguments
+     * The reason smp_mb() is sufficient here is subtle: the register arguments
      * to WRMSR must depend on a memory read executed after the barrier. This
      * is guaranteed by cpu_physical_id(), which reads from a global array (and
      * so cannot be hoisted above the barrier even by a clever compiler).
      */
-    mb();
+    smp_mb();
 
     local_irq_save(flags);
 
@@ -136,7 +136,7 @@  static void send_IPI_mask_x2apic_cluster(const cpumask_t *cpumask, int vector)
     const cpumask_t *cluster_cpus;
     unsigned long flags;
 
-    mb(); /* See above for an explanation. */
+    smp_mb(); /* See above for an explanation. */
 
     local_irq_save(flags);
 
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index f78054d..c5ef534 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -190,9 +190,9 @@  static void handle_hpet_broadcast(struct hpet_event_channel *ch)
     {
         s_time_t deadline;
 
-        rmb();
+        smp_rmb();
         deadline = per_cpu(timer_deadline, cpu);
-        rmb();
+        smp_rmb();
         if ( !cpumask_test_cpu(cpu, ch->cpumask) )
             continue;
 
@@ -610,7 +610,7 @@  void __init hpet_broadcast_init(void)
         hpet_events[i].shift = 32;
         hpet_events[i].next_event = STIME_MAX;
         spin_lock_init(&hpet_events[i].lock);
-        wmb();
+        smp_wmb();
         hpet_events[i].event_handler = handle_hpet_broadcast;
 
         hpet_events[i].msi.msi_attrib.maskbit = 1;
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 88071ab..6c00c0b 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -92,7 +92,7 @@  static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
     {
         unsigned int state = p->state;
 
-        rmb();
+        smp_rmb();
         switch ( state )
         {
         case STATE_IOREQ_NONE:
@@ -1272,7 +1272,7 @@  static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
     }
 
     /* Make the ioreq_t visible /before/ write_pointer. */
-    wmb();
+    smp_wmb();
     pg->ptrs.write_pointer += qw ? 2 : 1;
 
     /* Canonicalize read/write pointers to prevent their overflow. */
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index 33e5927..185b956 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1486,7 +1486,7 @@  static int __init timer_irq_works(void)
     unsigned long t1, flags;
 
     t1 = pit0_ticks;
-    mb();
+    smp_mb();
 
     local_save_flags(flags);
     local_irq_enable();
@@ -1501,7 +1501,7 @@  static int __init timer_irq_works(void)
      * might have cached one ExtINT interrupt.  Finally, at
      * least one tick may be lost due to delays.
      */
-    mb();
+    smp_mb();
     if (pit0_ticks - t1 > 4)
         return 1;
 
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 8c1545a..de72b0d 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -757,9 +757,9 @@  void irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
     
     ASSERT(spin_is_locked(&desc->lock));
     desc->status &= ~IRQ_MOVE_PENDING;
-    wmb();
+    smp_wmb();
     cpumask_copy(desc->arch.pending_mask, mask);
-    wmb();
+    smp_wmb();
     desc->status |= IRQ_MOVE_PENDING;
 }
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 14552a1..a2672b1 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -460,7 +460,7 @@  void share_xen_page_with_guest(
     page->u.inuse.type_info |= PGT_validated | 1;
 
     page_set_owner(page, d);
-    wmb(); /* install valid domain ptr before updating refcnt. */
+    smp_wmb(); /* install valid domain ptr before updating refcnt. */
     ASSERT((page->count_info & ~PGC_xen_heap) == 0);
 
     /* Only add to the allocation list if the domain isn't dying. */
@@ -2281,7 +2281,7 @@  static int alloc_page_type(struct page_info *page, unsigned long type,
     }
 
     /* No need for atomic update of type_info here: noone else updates it. */
-    wmb();
+    smp_wmb();
     switch ( rc )
     {
     case 0:
@@ -2388,7 +2388,7 @@  static int __put_final_page_type(
         if ( !(shadow_mode_enabled(page_get_owner(page)) &&
                (page->count_info & PGC_page_table)) )
             page->tlbflush_timestamp = tlbflush_current_time();
-        wmb();
+        smp_wmb();
         page->u.inuse.type_info--;
     }
     else if ( rc == -EINTR )
@@ -2398,13 +2398,13 @@  static int __put_final_page_type(
         if ( !(shadow_mode_enabled(page_get_owner(page)) &&
                (page->count_info & PGC_page_table)) )
             page->tlbflush_timestamp = tlbflush_current_time();
-        wmb();
+        smp_wmb();
         page->u.inuse.type_info |= PGT_validated;
     }
     else
     {
         BUG_ON(rc != -ERESTART);
-        wmb();
+        smp_wmb();
         get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 2696396..7268300 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3021,7 +3021,7 @@  static int sh_page_fault(struct vcpu *v,
      * will make sure no inconsistent mapping being translated into
      * shadow page table. */
     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
-    rmb();
+    smp_rmb();
     rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 0aa377a..ba0fffe 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -77,7 +77,7 @@  static enum cpu_state {
     CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
     CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
 } cpu_state;
-#define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0)
+#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
 
 void *stack_base[NR_CPUS];
 
@@ -124,7 +124,7 @@  static void synchronize_tsc_master(unsigned int slave)
     for ( i = 1; i <= 5; i++ )
     {
         tsc_value = rdtsc_ordered();
-        wmb();
+        smp_wmb();
         atomic_inc(&tsc_count);
         while ( atomic_read(&tsc_count) != (i<<1) )
             cpu_relax();
@@ -149,7 +149,7 @@  static void synchronize_tsc_slave(unsigned int slave)
     {
         while ( atomic_read(&tsc_count) != ((i<<1)-1) )
             cpu_relax();
-        rmb();
+        smp_rmb();
         /*
          * If a CPU has been physically hotplugged, we may as well write
          * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
@@ -552,13 +552,13 @@  static int do_boot_cpu(int apicid, int cpu)
         }
         else if ( cpu_state == CPU_STATE_DEAD )
         {
-            rmb();
+            smp_rmb();
             rc = cpu_error;
         }
         else
         {
             boot_error = 1;
-            mb();
+            smp_mb();
             if ( bootsym(trampoline_cpu_started) == 0xA5 )
                 /* trampoline started but...? */
                 printk("Stuck ??\n");
@@ -576,7 +576,7 @@  static int do_boot_cpu(int apicid, int cpu)
 
     /* mark "stuck" area as not stuck */
     bootsym(trampoline_cpu_started) = 0;
-    mb();
+    smp_mb();
 
     smpboot_restore_warm_reset_vector();
 
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index dda89d8..0039e23 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -977,10 +977,10 @@  static void __update_vcpu_system_time(struct vcpu *v, int force)
 
     /* 1. Update guest kernel version. */
     _u.version = u->version = version_update_begin(u->version);
-    wmb();
+    smp_wmb();
     /* 2. Update all other guest kernel fields. */
     *u = _u;
-    wmb();
+    smp_wmb();
     /* 3. Update guest kernel version. */
     u->version = version_update_end(u->version);
 
@@ -1006,10 +1006,10 @@  bool_t update_secondary_system_time(struct vcpu *v,
         smap_policy_change(v, saved_policy);
         return 0;
     }
-    wmb();
+    smp_wmb();
     /* 2. Update all other userspace fields. */
     __copy_to_guest(user_u, u, 1);
-    wmb();
+    smp_wmb();
     /* 3. Update userspace version. */
     u->version = version_update_end(u->version);
     __copy_field_to_guest(user_u, u, version);
diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
index ea9f7e7..4fcf6fa 100644
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -559,7 +559,7 @@  static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[])
             return;
         }
         udelay(1);
-        rmb();
+        smp_rmb();
         code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
                                       IOMMU_EVENT_CODE_SHIFT);
     }
@@ -664,7 +664,7 @@  void parse_ppr_log_entry(struct amd_iommu *iommu, u32 entry[])
             return;
         }
         udelay(1);
-        rmb();
+        smp_rmb();
         code = get_field_from_reg_u32(entry[1], IOMMU_PPR_LOG_CODE_MASK,
                                       IOMMU_PPR_LOG_CODE_SHIFT);
     }
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index da924bf..9956aae 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -128,10 +128,10 @@  static inline void _write_gate_lower(volatile idt_entry_t *gate,
 #define _set_gate(gate_addr,type,dpl,addr)               \
 do {                                                     \
     (gate_addr)->a = 0;                                  \
-    wmb(); /* disable gate /then/ rewrite */             \
+    smp_wmb(); /* disable gate /then/ rewrite */         \
     (gate_addr)->b =                                     \
         ((unsigned long)(addr) >> 32);                   \
-    wmb(); /* rewrite /then/ enable gate */              \
+    smp_wmb(); /* rewrite /then/ enable gate */          \
     (gate_addr)->a =                                     \
         (((unsigned long)(addr) & 0xFFFF0000UL) << 32) | \
         ((unsigned long)(dpl) << 45) |                   \
@@ -174,11 +174,11 @@  static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
 #define _set_tssldt_desc(desc,addr,limit,type)           \
 do {                                                     \
     (desc)[0].b = (desc)[1].b = 0;                       \
-    wmb(); /* disable entry /then/ rewrite */            \
+    smp_wmb(); /* disable entry /then/ rewrite */        \
     (desc)[0].a =                                        \
         ((u32)(addr) << 16) | ((u32)(limit) & 0xFFFF);   \
     (desc)[1].a = (u32)(((unsigned long)(addr)) >> 32);  \
-    wmb(); /* rewrite /then/ enable entry */             \
+    smp_wmb(); /* rewrite /then/ enable entry */         \
     (desc)[0].b =                                        \
         ((u32)(addr) & 0xFF000000U) |                    \
         ((u32)(type) << 8) | 0x8000U |                   \
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index eb498f5..9cb6fd7 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -183,7 +183,7 @@  static always_inline unsigned long __xadd(
 #define smp_wmb()       wmb()
 
 #define set_mb(var, value) do { xchg(&var, value); } while (0)
-#define set_wmb(var, value) do { var = value; wmb(); } while (0)
+#define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
 
 #define local_irq_disable()     asm volatile ( "cli" : : : "memory" )
 #define local_irq_enable()      asm volatile ( "sti" : : : "memory" )