diff mbox

[3/8] x86/time: introduce and use rdtsc_ordered()

Message ID 576149AB02000078000F539D@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich June 15, 2016, 10:27 a.m. UTC
Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and
use it in trivial call sites") and earlier ones it builds upon, let's
make sure timing loops don't have their rdtsc()-s re-ordered, as that
would harm precision of the result (values were observed to be several
hundred clocks off without this adjustment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
x86/time: introduce and use rdtsc_ordered()

Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and
use it in trivial call sites") and earlier ones it builds upon, let's
make sure timing loops don't have their rdtsc()-s re-ordered, as that
would harm precision of the result (values were observed to be several
hundred clocks off without this adjustment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -1137,7 +1137,7 @@ static int __init calibrate_APIC_clock(v
     /*
      * We wrapped around just now. Let's start:
      */
-    t1 = rdtsc();
+    t1 = rdtsc_ordered();
     tt1 = apic_read(APIC_TMCCT);
 
     /*
@@ -1147,7 +1147,7 @@ static int __init calibrate_APIC_clock(v
         wait_8254_wraparound();
 
     tt2 = apic_read(APIC_TMCCT);
-    t2 = rdtsc();
+    t2 = rdtsc_ordered();
 
     /*
      * The APIC bus clock counter is 32 bits only, it
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -541,6 +541,9 @@ static void init_amd(struct cpuinfo_x86
 			wrmsr_amd_safe(0xc001100d, l, h & ~1);
 	}
 
+	/* MFENCE stops RDTSC speculation */
+	__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+
 	switch(c->x86)
 	{
 	case 0xf ... 0x17:
--- a/xen/arch/x86/delay.c
+++ b/xen/arch/x86/delay.c
@@ -21,10 +21,10 @@ void __udelay(unsigned long usecs)
     unsigned long ticks = usecs * (cpu_khz / 1000);
     unsigned long s, e;
 
-    s = rdtsc();
+    s = rdtsc_ordered();
     do
     {
         rep_nop();
-        e = rdtsc();
+        e = rdtsc_ordered();
     } while ((e-s) < ticks);
 }
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -123,7 +123,7 @@ static void synchronize_tsc_master(unsig
 
     for ( i = 1; i <= 5; i++ )
     {
-        tsc_value = rdtsc();
+        tsc_value = rdtsc_ordered();
         wmb();
         atomic_inc(&tsc_count);
         while ( atomic_read(&tsc_count) != (i<<1) )
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -257,10 +257,10 @@ static u64 init_pit_and_calibrate_tsc(vo
     outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
     outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
 
-    start = rdtsc();
+    start = rdtsc_ordered();
     for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
         continue;
-    end = rdtsc();
+    end = rdtsc_ordered();
 
     /* Error if the CTC doesn't behave itself. */
     if ( count == 0 )
@@ -760,7 +760,7 @@ s_time_t get_s_time_fixed(u64 at_tsc)
     if ( at_tsc )
         tsc = at_tsc;
     else
-        tsc = rdtsc();
+        tsc = rdtsc_ordered();
     delta = tsc - t->local_tsc_stamp;
     now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
 
@@ -933,7 +933,7 @@ int cpu_frequency_change(u64 freq)
     /* TSC-extrapolated time may be bogus after frequency change. */
     /*t->stime_local_stamp = get_s_time();*/
     t->stime_local_stamp = t->stime_master_stamp;
-    curr_tsc = rdtsc();
+    curr_tsc = rdtsc_ordered();
     t->local_tsc_stamp = curr_tsc;
     set_time_scale(&t->tsc_scale, freq);
     local_irq_enable();
@@ -1248,7 +1248,7 @@ static void time_calibration_tsc_rendezv
             if ( r->master_stime == 0 )
             {
                 r->master_stime = read_platform_stime();
-                r->master_tsc_stamp = rdtsc();
+                r->master_tsc_stamp = rdtsc_ordered();
             }
             atomic_inc(&r->semaphore);
 
@@ -1274,7 +1274,7 @@ static void time_calibration_tsc_rendezv
         }
     }
 
-    c->local_tsc_stamp = rdtsc();
+    c->local_tsc_stamp = rdtsc_ordered();
     c->stime_local_stamp = get_s_time_fixed(c->local_tsc_stamp);
     c->stime_master_stamp = r->master_stime;
 
@@ -1304,7 +1304,7 @@ static void time_calibration_std_rendezv
         mb(); /* receive signal /then/ read r->master_stime */
     }
 
-    c->local_tsc_stamp = rdtsc();
+    c->local_tsc_stamp = rdtsc_ordered();
     c->stime_local_stamp = get_s_time_fixed(c->local_tsc_stamp);
     c->stime_master_stamp = r->master_stime;
 
@@ -1338,7 +1338,7 @@ void time_latch_stamps(void) {
 
     local_irq_save(flags);
     ap_bringup_ref.master_stime = read_platform_stime();
-    tsc = rdtsc();
+    tsc = rdtsc_ordered();
     local_irq_restore(flags);
 
     ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
@@ -1356,7 +1356,7 @@ void init_percpu_time(void)
 
     local_irq_save(flags);
     now = read_platform_stime();
-    tsc = rdtsc();
+    tsc = rdtsc_ordered();
     local_irq_restore(flags);
 
     t->stime_master_stamp = now;
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -16,6 +16,7 @@ XEN_CPUFEATURE(XTOPOLOGY,       (FSCAPIN
 XEN_CPUFEATURE(CPUID_FAULTING,  (FSCAPINTS+0)*32+ 6) /* cpuid faulting */
 XEN_CPUFEATURE(CLFLUSH_MONITOR, (FSCAPINTS+0)*32+ 7) /* clflush reqd with monitor */
 XEN_CPUFEATURE(APERFMPERF,      (FSCAPINTS+0)*32+ 8) /* APERFMPERF */
+XEN_CPUFEATURE(MFENCE_RDTSC,    (FSCAPINTS+0)*32+ 9) /* MFENCE synchronizes RDTSC */
 
 #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */
 
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -80,6 +80,22 @@ static inline uint64_t rdtsc(void)
     return ((uint64_t)high << 32) | low;
 }
 
+static inline uint64_t rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory access.
+	 * The Intel SDM and the AMD APM are both vague on this point, but
+	 * empirically an RDTSC instruction can be speculatively executed
+	 * before prior loads.  An RDTSC immediately after an appropriate
+	 * barrier appears to be ordered as a normal load, that is, it
+	 * provides the same ordering guarantees as reading from a global
+	 * memory location that some other imaginary CPU is updating
+	 * continuously with a time stamp.
+	 */
+	alternative("lfence", "mfence", X86_FEATURE_MFENCE_RDTSC);
+	return rdtsc();
+}
+
 #define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
 #define write_tsc(val) ({                                       \
     /* Reliable TSCs are in lockstep across all CPUs. We should \

Comments

Andrew Cooper June 20, 2016, 12:59 p.m. UTC | #1
On 15/06/16 11:27, Jan Beulich wrote:
> Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and
> use it in trivial call sites") and earlier ones it builds upon, let's
> make sure timing loops don't have their rdtsc()-s re-ordered, as that
> would harm precision of the result (values were observed to be several
> hundred clocks off without this adjustment).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Jan Beulich June 20, 2016, 1:06 p.m. UTC | #2
>>> On 20.06.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
> On 15/06/16 11:27, Jan Beulich wrote:
>> Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and
>> use it in trivial call sites") and earlier ones it builds upon, let's
>> make sure timing loops don't have their rdtsc()-s re-ordered, as that
>> would harm precision of the result (values were observed to be several
>> hundred clocks off without this adjustment).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

I have these two additional hunks for v2:

@@ -1124,16 +1124,13 @@ static void local_time_calibration(void)
  */
 static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
 {
-#define rdtsc_barrier() mb()
     static DEFINE_SPINLOCK(sync_lock);
     static cycles_t last_tsc;
 
     cycles_t start, now, prev, end;
     int i;
 
-    rdtsc_barrier();
-    start = get_cycles();
-    rdtsc_barrier();
+    start = rdtsc_ordered();
 
     /* The measurement runs for 20 msecs: */
     end = start + tsc_khz * 20ULL;
@@ -1148,9 +1145,7 @@ static void check_tsc_warp(unsigned long
          */
         spin_lock(&sync_lock);
         prev = last_tsc;
-        rdtsc_barrier();
-        now = get_cycles();
-        rdtsc_barrier();
+        now = rdtsc_ordered();
         last_tsc = now;
         spin_unlock(&sync_lock);
 

May I consider those covered as well?

Jan
Andrew Cooper June 20, 2016, 1:07 p.m. UTC | #3
On 20/06/16 14:06, Jan Beulich wrote:
>>>> On 20.06.16 at 14:59, <andrew.cooper3@citrix.com> wrote:
>> On 15/06/16 11:27, Jan Beulich wrote:
>>> Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and
>>> use it in trivial call sites") and earlier ones it builds upon, let's
>>> make sure timing loops don't have their rdtsc()-s re-ordered, as that
>>> would harm precision of the result (values were observed to be several
>>> hundred clocks off without this adjustment).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> I have these two additional hunks for v2:
>
> @@ -1124,16 +1124,13 @@ static void local_time_calibration(void)
>   */
>  static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
>  {
> -#define rdtsc_barrier() mb()
>      static DEFINE_SPINLOCK(sync_lock);
>      static cycles_t last_tsc;
>  
>      cycles_t start, now, prev, end;
>      int i;
>  
> -    rdtsc_barrier();
> -    start = get_cycles();
> -    rdtsc_barrier();
> +    start = rdtsc_ordered();
>  
>      /* The measurement runs for 20 msecs: */
>      end = start + tsc_khz * 20ULL;
> @@ -1148,9 +1145,7 @@ static void check_tsc_warp(unsigned long
>           */
>          spin_lock(&sync_lock);
>          prev = last_tsc;
> -        rdtsc_barrier();
> -        now = get_cycles();
> -        rdtsc_barrier();
> +        now = rdtsc_ordered();
>          last_tsc = now;
>          spin_unlock(&sync_lock);
>  
>
> May I consider those covered as well?

Yes.

I need to dust off my series removing most of the misuse of mb() in the
Xen codebase now that 4.8 is open.

~Andrew
Dario Faggioli July 11, 2016, 11:39 a.m. UTC | #4
On Mon, 2016-06-20 at 13:59 +0100, Andrew Cooper wrote:
> On 15/06/16 11:27, Jan Beulich wrote:
> > Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered()
> > and
> > use it in trivial call sites") and earlier ones it builds upon,
> > let's
> > make sure timing loops don't have their rdtsc()-s re-ordered, as
> > that
> > would harm precision of the result (values were observed to be
> > several
> > hundred clocks off without this adjustment).
> > 
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
>  
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
FWIW:

Reviewed-by: Dario Faggioli <dario.faggioli@citrix.com>
Tested-by: Dario Faggioli <dario.faggioli@citrix.com>

(or Reviewed-and-Tested-by: as you wish :-)).

FTR, during my own investigation, before raising the issue on the
mailing list, I also came to the conclusion that we'd need something
like this. I even tried doing something like this (in a much more hacky
way), and had the feeling that it was making a difference but, of
course, on its own, without the fixes for all the other issues that Jan
found and addressed in this series, it wasn't enough.

Thanks and regards,
Dario
diff mbox

Patch

--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -1137,7 +1137,7 @@  static int __init calibrate_APIC_clock(v
     /*
      * We wrapped around just now. Let's start:
      */
-    t1 = rdtsc();
+    t1 = rdtsc_ordered();
     tt1 = apic_read(APIC_TMCCT);
 
     /*
@@ -1147,7 +1147,7 @@  static int __init calibrate_APIC_clock(v
         wait_8254_wraparound();
 
     tt2 = apic_read(APIC_TMCCT);
-    t2 = rdtsc();
+    t2 = rdtsc_ordered();
 
     /*
      * The APIC bus clock counter is 32 bits only, it
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -541,6 +541,9 @@  static void init_amd(struct cpuinfo_x86
 			wrmsr_amd_safe(0xc001100d, l, h & ~1);
 	}
 
+	/* MFENCE stops RDTSC speculation */
+	__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+
 	switch(c->x86)
 	{
 	case 0xf ... 0x17:
--- a/xen/arch/x86/delay.c
+++ b/xen/arch/x86/delay.c
@@ -21,10 +21,10 @@  void __udelay(unsigned long usecs)
     unsigned long ticks = usecs * (cpu_khz / 1000);
     unsigned long s, e;
 
-    s = rdtsc();
+    s = rdtsc_ordered();
     do
     {
         rep_nop();
-        e = rdtsc();
+        e = rdtsc_ordered();
     } while ((e-s) < ticks);
 }
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -123,7 +123,7 @@  static void synchronize_tsc_master(unsig
 
     for ( i = 1; i <= 5; i++ )
     {
-        tsc_value = rdtsc();
+        tsc_value = rdtsc_ordered();
         wmb();
         atomic_inc(&tsc_count);
         while ( atomic_read(&tsc_count) != (i<<1) )
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -257,10 +257,10 @@  static u64 init_pit_and_calibrate_tsc(vo
     outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
     outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
 
-    start = rdtsc();
+    start = rdtsc_ordered();
     for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
         continue;
-    end = rdtsc();
+    end = rdtsc_ordered();
 
     /* Error if the CTC doesn't behave itself. */
     if ( count == 0 )
@@ -760,7 +760,7 @@  s_time_t get_s_time_fixed(u64 at_tsc)
     if ( at_tsc )
         tsc = at_tsc;
     else
-        tsc = rdtsc();
+        tsc = rdtsc_ordered();
     delta = tsc - t->local_tsc_stamp;
     now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
 
@@ -933,7 +933,7 @@  int cpu_frequency_change(u64 freq)
     /* TSC-extrapolated time may be bogus after frequency change. */
     /*t->stime_local_stamp = get_s_time();*/
     t->stime_local_stamp = t->stime_master_stamp;
-    curr_tsc = rdtsc();
+    curr_tsc = rdtsc_ordered();
     t->local_tsc_stamp = curr_tsc;
     set_time_scale(&t->tsc_scale, freq);
     local_irq_enable();
@@ -1248,7 +1248,7 @@  static void time_calibration_tsc_rendezv
             if ( r->master_stime == 0 )
             {
                 r->master_stime = read_platform_stime();
-                r->master_tsc_stamp = rdtsc();
+                r->master_tsc_stamp = rdtsc_ordered();
             }
             atomic_inc(&r->semaphore);
 
@@ -1274,7 +1274,7 @@  static void time_calibration_tsc_rendezv
         }
     }
 
-    c->local_tsc_stamp = rdtsc();
+    c->local_tsc_stamp = rdtsc_ordered();
     c->stime_local_stamp = get_s_time_fixed(c->local_tsc_stamp);
     c->stime_master_stamp = r->master_stime;
 
@@ -1304,7 +1304,7 @@  static void time_calibration_std_rendezv
         mb(); /* receive signal /then/ read r->master_stime */
     }
 
-    c->local_tsc_stamp = rdtsc();
+    c->local_tsc_stamp = rdtsc_ordered();
     c->stime_local_stamp = get_s_time_fixed(c->local_tsc_stamp);
     c->stime_master_stamp = r->master_stime;
 
@@ -1338,7 +1338,7 @@  void time_latch_stamps(void) {
 
     local_irq_save(flags);
     ap_bringup_ref.master_stime = read_platform_stime();
-    tsc = rdtsc();
+    tsc = rdtsc_ordered();
     local_irq_restore(flags);
 
     ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
@@ -1356,7 +1356,7 @@  void init_percpu_time(void)
 
     local_irq_save(flags);
     now = read_platform_stime();
-    tsc = rdtsc();
+    tsc = rdtsc_ordered();
     local_irq_restore(flags);
 
     t->stime_master_stamp = now;
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -16,6 +16,7 @@  XEN_CPUFEATURE(XTOPOLOGY,       (FSCAPIN
 XEN_CPUFEATURE(CPUID_FAULTING,  (FSCAPINTS+0)*32+ 6) /* cpuid faulting */
 XEN_CPUFEATURE(CLFLUSH_MONITOR, (FSCAPINTS+0)*32+ 7) /* clflush reqd with monitor */
 XEN_CPUFEATURE(APERFMPERF,      (FSCAPINTS+0)*32+ 8) /* APERFMPERF */
+XEN_CPUFEATURE(MFENCE_RDTSC,    (FSCAPINTS+0)*32+ 9) /* MFENCE synchronizes RDTSC */
 
 #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */
 
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -80,6 +80,22 @@  static inline uint64_t rdtsc(void)
     return ((uint64_t)high << 32) | low;
 }
 
+static inline uint64_t rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory access.
+	 * The Intel SDM and the AMD APM are both vague on this point, but
+	 * empirically an RDTSC instruction can be speculatively executed
+	 * before prior loads.  An RDTSC immediately after an appropriate
+	 * barrier appears to be ordered as a normal load, that is, it
+	 * provides the same ordering guarantees as reading from a global
+	 * memory location that some other imaginary CPU is updating
+	 * continuously with a time stamp.
+	 */
+	alternative("lfence", "mfence", X86_FEATURE_MFENCE_RDTSC);
+	return rdtsc();
+}
+
 #define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
 #define write_tsc(val) ({                                       \
     /* Reliable TSCs are in lockstep across all CPUs. We should \