
[1/8] x86/time: improve cross-CPU clock monotonicity (and more)

Message ID 5761496802000078000F5395@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich June 15, 2016, 10:26 a.m. UTC
x86/time: adjust local system time initialization

Using the bare return value from read_platform_stime() is not suitable
when local_time_calibration() is going to use its fast path: Divergence
of several dozen microseconds between NOW() return values on different
CPUs results when platform and local time don't stay in close sync.

Latch local and platform time on the CPU initiating AP bringup, such
that the AP can use these values to seed its stime_local_stamp with as
little of an error as possible. The boot CPU, otoh, can simply
calculate the correct initial value (other CPUs could do so too with
even greater accuracy than the approach being introduced, but that can
work only if all CPUs' TSCs start ticking at the same time, which
generally can't be assumed to be the case on multi-socket systems).

This slightly defers init_percpu_time() (moved ahead by commit
dd2658f966 ["x86/time: initialise time earlier during
start_secondary()"]) in order to reduce as much as possible the gap
between populating the stamps and consuming them.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -328,12 +328,12 @@ void start_secondary(void *unused)
 
     percpu_traps_init();
 
-    init_percpu_time();
-
     cpu_init();
 
     smp_callin();
 
+    init_percpu_time();
+
     setup_secondary_APIC_clock();
 
     /*
@@ -996,6 +996,8 @@ int __cpu_up(unsigned int cpu)
     if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
         return ret;
 
+    time_latch_stamps();
+
     set_cpu_state(CPU_STATE_ONLINE);
     while ( !cpu_online(cpu) )
     {
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1328,21 +1328,51 @@ static void time_calibration(void *unuse
                      &r, 1);
 }
 
+static struct {
+    s_time_t local_stime, master_stime;
+} ap_bringup_ref;
+
+void time_latch_stamps(void) {
+    unsigned long flags;
+    u64 tsc;
+
+    local_irq_save(flags);
+    ap_bringup_ref.master_stime = read_platform_stime();
+    tsc = rdtsc();
+    local_irq_restore(flags);
+
+    ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
+}
+
 void init_percpu_time(void)
 {
     struct cpu_time *t = &this_cpu(cpu_time);
     unsigned long flags;
+    u64 tsc;
     s_time_t now;
 
     /* Initial estimate for TSC rate. */
     t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
 
     local_irq_save(flags);
-    t->local_tsc_stamp = rdtsc();
     now = read_platform_stime();
+    tsc = rdtsc();
     local_irq_restore(flags);
 
     t->stime_master_stamp = now;
+    /*
+     * To avoid a discontinuity (TSC and platform clock can't be expected
+     * to be in perfect sync), initialization here needs to match up with
+     * local_time_calibration()'s decision whether to use its fast path.
+     */
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        if ( system_state < SYS_STATE_smp_boot )
+            now = get_s_time_fixed(tsc);
+        else
+            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
+    }
+    t->local_tsc_stamp    = tsc;
     t->stime_local_stamp  = now;
 }
 
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -40,6 +40,7 @@ int time_suspend(void);
 int time_resume(void);
 
 void init_percpu_time(void);
+void time_latch_stamps(void);
 
 struct ioreq;
 int hwdom_pit_access(struct ioreq *ioreq);

Comments

Jan Beulich June 15, 2016, 10:32 a.m. UTC | #1
>>> On 15.06.16 at 12:26, <JBeulich@suse.com> wrote:

The title of this was (of course) meant to be

x86/time: adjust local system time initialization

Jan

> Using the bare return value from read_platform_stime() is not suitable
> when local_time_calibration() is going to use its fast path: Divergence
> of several dozen microseconds between NOW() return values on different
> CPUs results when platform and local time don't stay in close sync.
> 
> Latch local and platform time on the CPU initiating AP bringup, such
> that the AP can use these values to seed its stime_local_stamp with as
> little of an error as possible. The boot CPU, otoh, can simply
> calculate the correct initial value (other CPUs could do so too with
> even greater accuracy than the approach being introduced, but that can
> work only if all CPUs' TSCs start ticking at the same time, which
> generally can't be assumed to be the case on multi-socket systems).
> 
> This slightly defers init_percpu_time() (moved ahead by commit
> dd2658f966 ["x86/time: initialise time earlier during
> start_secondary()"]) in order to reduce as much as possible the gap
> between populating the stamps and consuming them.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -328,12 +328,12 @@ void start_secondary(void *unused)
>  
>      percpu_traps_init();
>  
> -    init_percpu_time();
> -
>      cpu_init();
>  
>      smp_callin();
>  
> +    init_percpu_time();
> +
>      setup_secondary_APIC_clock();
>  
>      /*
> @@ -996,6 +996,8 @@ int __cpu_up(unsigned int cpu)
>      if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
>          return ret;
>  
> +    time_latch_stamps();
> +
>      set_cpu_state(CPU_STATE_ONLINE);
>      while ( !cpu_online(cpu) )
>      {
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1328,21 +1328,51 @@ static void time_calibration(void *unuse
>                       &r, 1);
>  }
>  
> +static struct {
> +    s_time_t local_stime, master_stime;
> +} ap_bringup_ref;
> +
> +void time_latch_stamps(void) {
> +    unsigned long flags;
> +    u64 tsc;
> +
> +    local_irq_save(flags);
> +    ap_bringup_ref.master_stime = read_platform_stime();
> +    tsc = rdtsc();
> +    local_irq_restore(flags);
> +
> +    ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
> +}
> +
>  void init_percpu_time(void)
>  {
>      struct cpu_time *t = &this_cpu(cpu_time);
>      unsigned long flags;
> +    u64 tsc;
>      s_time_t now;
>  
>      /* Initial estimate for TSC rate. */
>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
>  
>      local_irq_save(flags);
> -    t->local_tsc_stamp = rdtsc();
>      now = read_platform_stime();
> +    tsc = rdtsc();
>      local_irq_restore(flags);
>  
>      t->stime_master_stamp = now;
> +    /*
> +     * To avoid a discontinuity (TSC and platform clock can't be expected
> +     * to be in perfect sync), initialization here needs to match up with
> +     * local_time_calibration()'s decision whether to use its fast path.
> +     */
> +    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
> +    {
> +        if ( system_state < SYS_STATE_smp_boot )
> +            now = get_s_time_fixed(tsc);
> +        else
> +            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
> +    }
> +    t->local_tsc_stamp    = tsc;
>      t->stime_local_stamp  = now;
>  }
>  
> --- a/xen/include/asm-x86/time.h
> +++ b/xen/include/asm-x86/time.h
> @@ -40,6 +40,7 @@ int time_suspend(void);
>  int time_resume(void);
>  
>  void init_percpu_time(void);
> +void time_latch_stamps(void);
>  
>  struct ioreq;
>  int hwdom_pit_access(struct ioreq *ioreq);
Joao Martins June 15, 2016, 10:51 p.m. UTC | #2
On 06/15/2016 11:26 AM, Jan Beulich wrote:
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1328,21 +1328,51 @@ static void time_calibration(void *unuse
>                       &r, 1);
>  }
>  
> +static struct {
> +    s_time_t local_stime, master_stime;
> +} ap_bringup_ref;
> +
> +void time_latch_stamps(void) {
                                ^ style: shouldn't this bracket be on a new line?

> +    unsigned long flags;
> +    u64 tsc;
> +
> +    local_irq_save(flags);
> +    ap_bringup_ref.master_stime = read_platform_stime();
> +    tsc = rdtsc();
> +    local_irq_restore(flags);
> +
> +    ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
> +}
> +
>  void init_percpu_time(void)
>  {
>      struct cpu_time *t = &this_cpu(cpu_time);
>      unsigned long flags;
> +    u64 tsc;
>      s_time_t now;
>  
>      /* Initial estimate for TSC rate. */
>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
>  
>      local_irq_save(flags);
> -    t->local_tsc_stamp = rdtsc();
>      now = read_platform_stime();
Do we need to take another read from the platform timer here? We could
just reuse the one on ap_bringup_ref.master_stime. See also below.

> +    tsc = rdtsc();
>      local_irq_restore(flags);
>  
>      t->stime_master_stamp = now;
> +    /*
> +     * To avoid a discontinuity (TSC and platform clock can't be expected
> +     * to be in perfect sync), initialization here needs to match up with
> +     * local_time_calibration()'s decision whether to use its fast path.
> +     */
> +    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
> +    {
> +        if ( system_state < SYS_STATE_smp_boot )
> +            now = get_s_time_fixed(tsc);
> +        else
> +            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
> +    }
> +    t->local_tsc_stamp    = tsc;

As far as my current experimentation goes (both v1 of this patch and the whole series
on v2), the source of the remaining warps could be fixed with this seeding. Though I
think this seeding might not yet be correct. Essentially, on the boot CPU you do
get_s_time_fixed(tsc), which basically gives you a value strictly calculated from the
TSC since boot_tsc_stamp. For the other CPUs you append the skew between TSC and
platform timer, calculated before AP bringup, to a value just read from the platform
timer - which I assume you take as an approximation of the skew between TSC and
platform timer.

So, before this patch cpu_time is filled like this:

CPU 0: M0 , T0
CPU 1: M1 , T1
CPU 2: M2 , T2

With this patch it goes like:

CPU 0: L0 , T0
CPU 1: L1 = (M1 + (L - M)), T1
CPU 2: L2 = (M2 + (L - M)), T2

Given that,

1) values separated by commas are respectively local stime, tsc that are
written in cpu_time
2) M2 > M1 > M0 as values read from platform timer.
3) L and M solely as values calculated on CPU doing AP bringup.

After this seeding, local_time_calibration will increment the deltas between
calibrations every second, based entirely on a monotonically increasing TSC. Although
I see two issues here: 1) the correction is appended to a new read from the platform
timer, which might already be offset from the one taken on the boot CPU, and 2) the
skew you calculate doesn't account for the current tsc. Which leads to local stime and
tsc still not being a valid pair for the secondary CPUs. So it would be possible that
get_s_time(S0) on CPU0, immediately followed by get_s_time(S1) on CPU1 [S1 > S0],
ends up returning a value backwards, IOW L0 + scale(S0 - T0) is bigger than
L1 + scale(S1 - T1). That is independent of the deltas being added on every
calibration.
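
(For reference, get_s_time() on each CPU extrapolates from that CPU's
(stime, tsc) stamp pair roughly as in the simplified sketch below - field
names as used in this patch - so any mismatch in the seeded pair shows up
directly in the values returned:)

s_time_t get_s_time_fixed(u64 at_tsc)
{
    struct cpu_time *t = &this_cpu(cpu_time);
    /* Extrapolate from this CPU's last (stime, tsc) calibration stamp. */
    u64 delta = at_tsc - t->local_tsc_stamp;

    return t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
}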

So how about we do the seeding in another way?

1) Relying on individual CPU TSC like you do on CPU 0:

     t->stamp.master_stime = now;
+    t->stamp.local_tsc = 0;
+    t->stamp.local_stime = 0;
+
     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
     {
         now = get_s_time_fixed(tsc);
     }

(...)

Or 2) taking into account the skew between platform timer and tsc we take on
init_percpu_time. Diff below based on this series:

@@ -1394,11 +1508,14 @@ void init_percpu_time(void)
     t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;

     local_irq_save(flags);
-    now = read_platform_stime();
+    now = ap_bringup_ref.master_stime;
     tsc = rdtsc_ordered();
     local_irq_restore(flags);

     t->stamp.master_stime = now;
+    t->stamp.local_tsc = boot_tsc_stamp;
+    t->stamp.local_stime = 0;
+

     /*
      * To avoid a discontinuity (TSC and platform clock can't be expected
      * to be in perfect sync), initialization here needs to match up with
@@ -1406,10 +1523,12 @@ void init_percpu_time(void)
      */
     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
     {
+        u64 stime = get_s_time_fixed(tsc);
+
         if ( system_state < SYS_STATE_smp_boot )
-            now = get_s_time_fixed(tsc);
+            now = stime;
         else
-            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
+            now += stime - ap_bringup_ref.master_stime;
     }

The second option follows similar thinking to this patch, but either way the
local_stime will match the tsc in init_percpu_time, thus being a matched pair. I have
a test similar to check_tsc_warp but with get_s_time(), and I no longer observe time
going backwards. But without the above I still observe it even over short periods.
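
(Roughly along the lines of the sketch below - simplified, not the actual
test code, and the names are made up: every CPU hammers NOW() and compares
it against a globally latched last value, under a lock so the comparison
itself can't race:)

static DEFINE_SPINLOCK(warp_lock);
static s_time_t last_stime, max_warp;

static void stime_warp_check(void *unused)
{
    unsigned int i;

    for ( i = 0; i < 1000000; i++ )
    {
        s_time_t now;

        spin_lock(&warp_lock);
        now = NOW();
        if ( now < last_stime && (last_stime - now) > max_warp )
            max_warp = last_stime - now;   /* record the largest warp seen */
        last_stime = now;
        spin_unlock(&warp_lock);
    }
}
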
Thoughts? (Sorry for the long answer)

Joao
Jan Beulich June 16, 2016, 8:27 a.m. UTC | #3
>>> On 16.06.16 at 00:51, <joao.m.martins@oracle.com> wrote:
> On 06/15/2016 11:26 AM, Jan Beulich wrote:
>> --- a/xen/arch/x86/time.c
>> +++ b/xen/arch/x86/time.c
>> @@ -1328,21 +1328,51 @@ static void time_calibration(void *unuse
>>                       &r, 1);
>>  }
>>  
>> +static struct {
>> +    s_time_t local_stime, master_stime;
>> +} ap_bringup_ref;
>> +
>> +void time_latch_stamps(void) {
>                                 ^ style: shouldn't this bracket be on a new line?

Oh, yes, of course.

>> +    unsigned long flags;
>> +    u64 tsc;
>> +
>> +    local_irq_save(flags);
>> +    ap_bringup_ref.master_stime = read_platform_stime();
>> +    tsc = rdtsc();
>> +    local_irq_restore(flags);
>> +
>> +    ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
>> +}
>> +
>>  void init_percpu_time(void)
>>  {
>>      struct cpu_time *t = &this_cpu(cpu_time);
>>      unsigned long flags;
>> +    u64 tsc;
>>      s_time_t now;
>>  
>>      /* Initial estimate for TSC rate. */
>>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
>>  
>>      local_irq_save(flags);
>> -    t->local_tsc_stamp = rdtsc();
>>      now = read_platform_stime();
> Do we need to take another read from the platform timer here? We could
> just reuse the one on ap_bringup_ref.master_stime. See also below.

Yes, for this model of approximation of the local stime delta by
master stime delta we obviously need to read the master clock
here again (or else we have no way to calculate the master
delta, which we want to use as the correcting value).
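
I.e. the AP's local stamp then gets seeded (roughly) as

    stime_local_stamp(AP) = master_stime(AP)
                            + (local_stime(ref) - master_stime(ref))

i.e. the fresh platform read plus the latched local-vs-master offset, which
approximates get_s_time_fixed(tsc) on the AP to the extent that local and
master time tick at the same rate over the bringup window.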

>> +    tsc = rdtsc();
>>      local_irq_restore(flags);
>>  
>>      t->stime_master_stamp = now;
>> +    /*
>> +     * To avoid a discontinuity (TSC and platform clock can't be expected
>> +     * to be in perfect sync), initialization here needs to match up with
>> +     * local_time_calibration()'s decision whether to use its fast path.
>> +     */
>> +    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>> +    {
>> +        if ( system_state < SYS_STATE_smp_boot )
>> +            now = get_s_time_fixed(tsc);
>> +        else
>> +            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
>> +    }
>> +    t->local_tsc_stamp    = tsc;
> 
> As far as my current experimentation (both v1 of this patch and the whole 
> series on
> v2), the source of the remaining warps could be fixed with this seeding. 
> Though I
> think this seeding might not yet be correct. Essentially the boot CPU you do
> get_s_time_fixed(tsc), which basically gives you a value strictly calculated 
> with TSC
> since boot_tsc_stamp. For the other CPUs you append the time skew between 
> TSC and
> platform timer calculated before AP bringup, with a value just read on the 
> platform
> timer. Which I assume you take this as an approximation skew between TSC and 
> platform
> timer.

Correct.

> So, before this patch cpu_time is filled like this:
> 
> CPU 0: M0 , T0
> CPU 1: M1 , T1
> CPU 2: M2 , T2
> 
> With this patch it goes like:
> 
> CPU 0: L0 , T0
> CPU 1: L1 = (M1 + (L - M)), T1
> CPU 2: L2 = (M2 + (L - M)), T2
> 
> Given that,
> 
> 1) values separated by commas are respectively local stime, tsc that are
> written in cpu_time
> 2) M2 > M1 > M0 as values read from platform timer.
> 3) L and M solely as values calculated on CPU doing AP bringup.
> 
> After this seeding, local_time_calibration will increment the deltas between
> calibrations every second, based entirely on a monotonically increasing TSC. 
> Altough
> I see two issues here: 1) appending to a new read from platform time which 
> might
> already be offsetted by the one taken from the boot CPU and 2) the skew you 
> calculate
> don't account for the current tsc. Which leads to local stime and tsc still 
> not being
> a valid pair for the secondary CPUs.

That's true if platform clock and TSC, even over this small time
period, diverge enough. The calculated local time indeed is only an
approximation to match the TSC just read.

> So it would be possible that 
> get_s_time(S0) on
> CPU0 immediately followed by a get_s_time(S1) on CPU1 [S1 > S0] ends up 
> returning a
> value backwards IOW L0 + scale(S0 - T0) is bigger than L1 + scale(S1 - T1). 
> That is
> independently of the deltas being added on every calibration.
> 
> So how about we do the seeding in another way?
> 
> 1) Relying on individual CPU TSC like you do on CPU 0:
> 
>      t->stamp.master_stime = now;
> +    t->stamp.local_tsc = 0;
> +    t->stamp.local_stime = 0;
> +
>      if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>      {
>          now = get_s_time_fixed(tsc);
>      }

As said before elsewhere, such a model can only work if all TSCs
start ticking at the same time. At the latest, this assumption breaks
when a CPU gets physically hotplugged.

> Or 2) taking into account the skew between platform timer and tsc we take on
> init_percpu_time. Diff below based on this series:
> 
> @@ -1394,11 +1508,14 @@ void init_percpu_time(void)
>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
> 
>      local_irq_save(flags);
> -    now = read_platform_stime();
> +    now = ap_bringup_ref.master_stime;
>      tsc = rdtsc_ordered();
>      local_irq_restore(flags);
> 
>      t->stamp.master_stime = now;
> +    t->stamp.local_tsc = boot_tsc_stamp;

Again - this is invalid. It indeed works fine on a single socket system
(where all TSCs start ticking at the same time), and gives really good
results. But due to hotplug (and to a lesser degree multi-socket, but
likely to a somewhat higher degree multi-node, systems) we just can't
use boot_tsc_stamp as reference for APs.

> +    t->stamp.local_stime = 0;
> +
> 
>      /*
>       * To avoid a discontinuity (TSC and platform clock can't be expected
>       * to be in perfect sync), initialization here needs to match up with
> @@ -1406,10 +1523,12 @@ void init_percpu_time(void)
>       */
>      if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>      {
> +        u64 stime = get_s_time_fixed(tsc);
> +
>          if ( system_state < SYS_STATE_smp_boot )
> -            now = get_s_time_fixed(tsc);
> +            now = stime;
>          else
> -            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
> +            now += stime - ap_bringup_ref.master_stime;

That seems to contradict your desire to keep out of all calculations
the skew between platform timer and TSC.

>      }
> 
> Second option follows a similar thinking of this patch, but on both ways the
> local_stime will match the tsc on init_percpu_time, thus being a matched 
> pair. I have
> a test similar to check_tsc_warp but with get_s_time() and I no longer 
> observe time
> going backwards. But without the above I still observe it even on short 
> periods.

Right - I'm not claiming the series eliminates all backwards moves.
But I simply can't see a model to fully eliminate them. I.e. my plan
was, once the backwards moves are small enough, to maybe
indeed compensate them by maintaining a global variable tracking
the most recently returned value. There are issues with such an
approach too, though: HT effects can result in one hyperthread
making it just past that check of the global, then hardware
switching to the other hyperthread, NOW() producing a slightly
larger value there, and hardware switching back to the first
hyperthread only after the second one consumed the result of
NOW(). Dario's use would be unaffected by this aiui, as his NOW()
invocations are globally serialized through a spinlock, but arbitrary
NOW() invocations on two hyperthreads can't be made such that
the invoking party can be guaranteed to see strictly monotonic
values.

And btw., similar considerations apply for two fully independent
CPUs, if one runs at a much higher P-state than the other (i.e.
the faster one could overtake the slower one between the
monotonicity check in NOW() and the callers consuming the returned
values). So in the end I'm not sure it's worth the performance hit
such a global monotonicity check would incur, and therefore I didn't
make a respective patch part of this series.
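
(For concreteness, such a clamp might look roughly like the sketch below -
purely illustrative, names made up, and deliberately not something being
proposed as part of this series:)

static s_time_t last_now;   /* hypothetical global, not in this series */

s_time_t monotonic_now(void)
{
    s_time_t now = get_s_time(), old = read_atomic(&last_now);

    while ( now > old )
    {
        s_time_t cur = cmpxchg(&last_now, old, now);

        if ( cur == old )
            return now;        /* we advanced the global */
        old = cur;             /* raced with someone else - retry */
    }

    return old;                /* clamp to the largest value returned so far */
}

And as per the above, even this wouldn't cover the hyperthread / P-state
cases, since the check and the consumption of the returned value aren't
atomic.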

Jan
Joao Martins June 16, 2016, 8:27 p.m. UTC | #4
>>> +    unsigned long flags;
>>> +    u64 tsc;
>>> +
>>> +    local_irq_save(flags);
>>> +    ap_bringup_ref.master_stime = read_platform_stime();
>>> +    tsc = rdtsc();
>>> +    local_irq_restore(flags);
>>> +
>>> +    ap_bringup_ref.local_stime = get_s_time_fixed(tsc);
>>> +}
>>> +
>>>  void init_percpu_time(void)
>>>  {
>>>      struct cpu_time *t = &this_cpu(cpu_time);
>>>      unsigned long flags;
>>> +    u64 tsc;
>>>      s_time_t now;
>>>  
>>>      /* Initial estimate for TSC rate. */
>>>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
>>>  
>>>      local_irq_save(flags);
>>> -    t->local_tsc_stamp = rdtsc();
>>>      now = read_platform_stime();
>> Do we need to take another read from the platform timer here? We could
>> just reuse the one on ap_bringup_ref.master_stime. See also below.
> 
> Yes, for this model of approximation of the local stime delta by
> master stime delta we obviously need to read the master clock
> here again (or else we have no way to calculate the master
> delta, which we want to use as the correcting value).
Ah, correct.

>> So it would be possible that 
>> get_s_time(S0) on
>> CPU0 immediately followed by a get_s_time(S1) on CPU1 [S1 > S0] ends up 
>> returning a
>> value backwards IOW L0 + scale(S0 - T0) is bigger than L1 + scale(S1 - T1). 
>> That is
>> independently of the deltas being added on every calibration.
>>
>> So how about we do the seeding in another way?
>>
>> 1) Relying on individual CPU TSC like you do on CPU 0:
>>
>>      t->stamp.master_stime = now;
>> +    t->stamp.local_tsc = 0;
>> +    t->stamp.local_stime = 0;
>> +
>>      if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>>      {
>>          now = get_s_time_fixed(tsc);
>>      }
> 
> As said before elsewhere, such a model can only work if all TSCs
> start ticking at the same time. The latest this assumption breaks 
> is when a CPU gets physically hotplugged.
Agreed.

>> Or 2) taking into account the skew between platform timer and tsc we take on
>> init_percpu_time. Diff below based on this series:
>>
>> @@ -1394,11 +1508,14 @@ void init_percpu_time(void)
>>      t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
>>
>>      local_irq_save(flags);
>> -    now = read_platform_stime();
>> +    now = ap_bringup_ref.master_stime;
>>      tsc = rdtsc_ordered();
>>      local_irq_restore(flags);
>>
>>      t->stamp.master_stime = now;
>> +    t->stamp.local_tsc = boot_tsc_stamp;
> 
> Again - this is invalid. It indeed works fine on a single socket system
> (where all TSCs start ticking at the same time), and gives really good
> results. But due to hotplug (and to a lesser degree multi-socket, but
> likely to a somewhat higher degree multi-node, systems) we just can't
> use boot_tsc_stamp as reference for APs.
It also gives really good results on my multi-socket system, at least on my old Intel
multi-socket box (Westmere-based; the invariant TSC was introduced on its predecessor,
Nehalem). Btw, Linux seems to treat Intel boxes as having synchronized TSCs across
cores and sockets [0][1]. Though CPU hotplug is the troublesome case.

[0] arch/x86/kernel/tsc.c#L1082
[1] arch/x86/kernel/cpu/intel.c#L85

> 
>> +    t->stamp.local_stime = 0;
>> +
>>
>>      /*
>>       * To avoid a discontinuity (TSC and platform clock can't be expected
>>       * to be in perfect sync), initialization here needs to match up with
>> @@ -1406,10 +1523,12 @@ void init_percpu_time(void)
>>       */
>>      if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
>>      {
>> +        u64 stime = get_s_time_fixed(tsc);
>> +
>>          if ( system_state < SYS_STATE_smp_boot )
>> -            now = get_s_time_fixed(tsc);
>> +            now = stime;
>>          else
>> -            now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
>> +            now += stime - ap_bringup_ref.master_stime;
> 
> That seems to contradict your desire to keep out of all calculations
> the skew between platform timer and TSC.
> 
That's indeed my desire (in my related series about using the TSC as clocksource and
the TSC stable bit), but being aware of TSC limitations: I was trying to see if there
was common ground between this seeding with the platform timer while accounting for
the potentially unreliable TSCs of individual CPUs. But of course, for any of these
solutions to have get_s_time return monotonically increasing values, they can only
work on a truly invariant-TSC box.

>>      }
>>
>> Second option follows a similar thinking of this patch, but on both ways the
>> local_stime will match the tsc on init_percpu_time, thus being a matched 
>> pair. I have
>> a test similar to check_tsc_warp but with get_s_time() and I no longer 
>> observe time
>> going backwards. But without the above I still observe it even on short 
>> periods.
> 
> Right - I'm not claiming the series eliminates all backwards moves.
> But I simply can't see a model to fully eliminate them. 
Totally agree with you.

> I.e. my plan was, once the backwards moves are small enough, to maybe
> indeed compensate them by maintaining a global variable tracking
> the most recently returned value. There are issues with such an
> approach too, though: HT effects can result in one hyperthread
> making it just past that check of the global, then hardware
> switching to the other hyperthread, NOW() producing a slightly
> larger value there, and hardware switching back to the first
> hyperthread only after the second one consumed the result of
> NOW(). Dario's use would be unaffected by this aiui, as his NOW()
> invocations are globally serialized through a spinlock, but arbitrary
> NOW() invocations on two hyperthreads can't be made such that
> the invoking party can be guaranteed to see strictly montonic
> values.
> 
> And btw., similar considerations apply for two fully independent
> CPUs, if one runs at a much higher P-state than the other (i.e.
> the faster one could overtake the slower one between the
> montonicity check in NOW() and the callers consuming the returned
> values). So in the end I'm not sure it's worth the performance hit
> such a global montonicity check would incur, and therefore I didn't
> make a respective patch part of this series.
> 

Hm, guests' pvclock should have faced similar issues too, as their
local stamps for each vcpu diverge. Linux commit 489fb49 ("x86, paravirt: Add a
global synchronization point for pvclock") describes a fix for situations similar to
the scenarios you just described - which led to having a global variable keep track of
the most recent timestamp. One important chunk of that commit is pasted below for
convenience:

--
/*
 * Assumption here is that last_value, a global accumulator, always goes
 * forward. If we are less than that, we should not be much smaller.
 * We assume there is an error marging we're inside, and then the correction
 * does not sacrifice accuracy.
 *
 * For reads: global may have changed between test and return,
 * but this means someone else updated poked the clock at a later time.
 * We just need to make sure we are not seeing a backwards event.
 *
 * For updates: last_value = ret is not enough, since two vcpus could be
 * updating at the same time, and one of them could be slightly behind,
 * making the assumption that last_value always go forward fail to hold.
 */
 last = atomic64_read(&last_value);
 do {
     if (ret < last)
         return last;
     last = atomic64_cmpxchg(&last_value, last, ret);
 } while (unlikely(last != ret));
--

Though I am not sure what other options we would have with such a heavy reliance on
TSC right now.

Joao
Jan Beulich June 17, 2016, 7:32 a.m. UTC | #5
>>> On 16.06.16 at 22:27, <joao.m.martins@oracle.com> wrote:
>> I.e. my plan was, once the backwards moves are small enough, to maybe
>> indeed compensate them by maintaining a global variable tracking
>> the most recently returned value. There are issues with such an
>> approach too, though: HT effects can result in one hyperthread
>> making it just past that check of the global, then hardware
>> switching to the other hyperthread, NOW() producing a slightly
>> larger value there, and hardware switching back to the first
>> hyperthread only after the second one consumed the result of
>> NOW(). Dario's use would be unaffected by this aiui, as his NOW()
>> invocations are globally serialized through a spinlock, but arbitrary
>> NOW() invocations on two hyperthreads can't be made such that
>> the invoking party can be guaranteed to see strictly montonic
>> values.
>> 
>> And btw., similar considerations apply for two fully independent
>> CPUs, if one runs at a much higher P-state than the other (i.e.
>> the faster one could overtake the slower one between the
>> montonicity check in NOW() and the callers consuming the returned
>> values). So in the end I'm not sure it's worth the performance hit
>> such a global montonicity check would incur, and therefore I didn't
>> make a respective patch part of this series.
>> 
> 
> Hm, guests pvclock should have faced similar issues too as their
> local stamps for each vcpu diverge. Linux commit 489fb49 ("x86, paravirt: 
> Add a
> global synchronization point for pvclock") depicts a fix to similar 
> situations to the
> scenarios you just described - which lead to have a global variable to keep 
> track of
> most recent timestamp. One important chunk of that commit is pasted below 
> for
> convenience:
> 
> --
> /*
>  * Assumption here is that last_value, a global accumulator, always goes
>  * forward. If we are less than that, we should not be much smaller.
>  * We assume there is an error marging we're inside, and then the correction
>  * does not sacrifice accuracy.
>  *
>  * For reads: global may have changed between test and return,
>  * but this means someone else updated poked the clock at a later time.
>  * We just need to make sure we are not seeing a backwards event.
>  *
>  * For updates: last_value = ret is not enough, since two vcpus could be
>  * updating at the same time, and one of them could be slightly behind,
>  * making the assumption that last_value always go forward fail to hold.
>  */
>  last = atomic64_read(&last_value);
>  do {
>      if (ret < last)
>          return last;
>      last = atomic64_cmpxchg(&last_value, last, ret);
>  } while (unlikely(last != ret));
> --

Meaning they decided it's worth the overhead. But (having read
through the entire description) they don't even discuss whether this
indeed eliminates _all_ apparent backward moves due to effects
like the ones named above.

Plus, the contention they're facing is limited to a single VM, i.e. likely
much more narrow than that on an entire physical system. So for
us to do the same in the hypervisor, quite a bit more of a win would
be needed to outweigh the cost.

Jan
Joao Martins June 21, 2016, 12:05 p.m. UTC | #6
On 06/17/2016 08:32 AM, Jan Beulich wrote:
>>>> On 16.06.16 at 22:27, <joao.m.martins@oracle.com> wrote:
>>> I.e. my plan was, once the backwards moves are small enough, to maybe
>>> indeed compensate them by maintaining a global variable tracking
>>> the most recently returned value. There are issues with such an
>>> approach too, though: HT effects can result in one hyperthread
>>> making it just past that check of the global, then hardware
>>> switching to the other hyperthread, NOW() producing a slightly
>>> larger value there, and hardware switching back to the first
>>> hyperthread only after the second one consumed the result of
>>> NOW(). Dario's use would be unaffected by this aiui, as his NOW()
>>> invocations are globally serialized through a spinlock, but arbitrary
>>> NOW() invocations on two hyperthreads can't be made such that
>>> the invoking party can be guaranteed to see strictly montonic
>>> values.
>>>
>>> And btw., similar considerations apply for two fully independent
>>> CPUs, if one runs at a much higher P-state than the other (i.e.
>>> the faster one could overtake the slower one between the
>>> montonicity check in NOW() and the callers consuming the returned
>>> values). So in the end I'm not sure it's worth the performance hit
>>> such a global montonicity check would incur, and therefore I didn't
>>> make a respective patch part of this series.
>>>
>>
>> Hm, guests pvclock should have faced similar issues too as their
>> local stamps for each vcpu diverge. Linux commit 489fb49 ("x86, paravirt: 
>> Add a
>> global synchronization point for pvclock") depicts a fix to similar 
>> situations to the
>> scenarios you just described - which lead to have a global variable to keep 
>> track of
>> most recent timestamp. One important chunk of that commit is pasted below 
>> for
>> convenience:
>>
>> --
>> /*
>>  * Assumption here is that last_value, a global accumulator, always goes
>>  * forward. If we are less than that, we should not be much smaller.
>>  * We assume there is an error marging we're inside, and then the correction
>>  * does not sacrifice accuracy.
>>  *
>>  * For reads: global may have changed between test and return,
>>  * but this means someone else updated poked the clock at a later time.
>>  * We just need to make sure we are not seeing a backwards event.
>>  *
>>  * For updates: last_value = ret is not enough, since two vcpus could be
>>  * updating at the same time, and one of them could be slightly behind,
>>  * making the assumption that last_value always go forward fail to hold.
>>  */
>>  last = atomic64_read(&last_value);
>>  do {
>>      if (ret < last)
>>          return last;
>>      last = atomic64_cmpxchg(&last_value, last, ret);
>>  } while (unlikely(last != ret));
>> --
> 
> Meaning they decided it's worth the overhead. But (having read
> through the entire description) they don't even discuss whether this
> indeed eliminates _all_ apparent backward moves due to effects
> like the ones named above.
>
> Plus, the contention they're facing is limited to a single VM, i.e. likely
> much more narrow than that on an entire physical system. So for
> us to do the same in the hypervisor, quite a bit more of win would
> be needed to outweigh the cost.
> 
The experimental details look very unclear too - likely running the time
warp test for 5 days would get some of these cases cleared out. But,
as you say, it should be much narrower than that of an entire system.

BTW, it was implicit in the discussion, but my apologies for not
formally/explicitly stating it. So FWIW:

Tested-by: Joao Martins <joao.m.martins@oracle.com>

This series is certainly a way forward in improving cross-CPU monotonicity,
and I am indeed seeing fewer occurrences of time going backwards on my systems.

Joao
Jan Beulich June 21, 2016, 12:28 p.m. UTC | #7
>>> On 21.06.16 at 14:05, <joao.m.martins@oracle.com> wrote:

> 
> On 06/17/2016 08:32 AM, Jan Beulich wrote:
>>>>> On 16.06.16 at 22:27, <joao.m.martins@oracle.com> wrote:
>>>> I.e. my plan was, once the backwards moves are small enough, to maybe
>>>> indeed compensate them by maintaining a global variable tracking
>>>> the most recently returned value. There are issues with such an
>>>> approach too, though: HT effects can result in one hyperthread
>>>> making it just past that check of the global, then hardware
>>>> switching to the other hyperthread, NOW() producing a slightly
>>>> larger value there, and hardware switching back to the first
>>>> hyperthread only after the second one consumed the result of
>>>> NOW(). Dario's use would be unaffected by this aiui, as his NOW()
>>>> invocations are globally serialized through a spinlock, but arbitrary
>>>> NOW() invocations on two hyperthreads can't be made such that
>>>> the invoking party can be guaranteed to see strictly montonic
>>>> values.
>>>>
>>>> And btw., similar considerations apply for two fully independent
>>>> CPUs, if one runs at a much higher P-state than the other (i.e.
>>>> the faster one could overtake the slower one between the
>>>> montonicity check in NOW() and the callers consuming the returned
>>>> values). So in the end I'm not sure it's worth the performance hit
>>>> such a global montonicity check would incur, and therefore I didn't
>>>> make a respective patch part of this series.
>>>>
>>>
>>> Hm, guests pvclock should have faced similar issues too as their
>>> local stamps for each vcpu diverge. Linux commit 489fb49 ("x86, paravirt: 
>>> Add a
>>> global synchronization point for pvclock") depicts a fix to similar 
>>> situations to the
>>> scenarios you just described - which lead to have a global variable to keep 
>>> track of
>>> most recent timestamp. One important chunk of that commit is pasted below 
>>> for
>>> convenience:
>>>
>>> --
>>> /*
>>>  * Assumption here is that last_value, a global accumulator, always goes
>>>  * forward. If we are less than that, we should not be much smaller.
>>>  * We assume there is an error marging we're inside, and then the correction
>>>  * does not sacrifice accuracy.
>>>  *
>>>  * For reads: global may have changed between test and return,
>>>  * but this means someone else updated poked the clock at a later time.
>>>  * We just need to make sure we are not seeing a backwards event.
>>>  *
>>>  * For updates: last_value = ret is not enough, since two vcpus could be
>>>  * updating at the same time, and one of them could be slightly behind,
>>>  * making the assumption that last_value always go forward fail to hold.
>>>  */
>>>  last = atomic64_read(&last_value);
>>>  do {
>>>      if (ret < last)
>>>          return last;
>>>      last = atomic64_cmpxchg(&last_value, last, ret);
>>>  } while (unlikely(last != ret));
>>> --
>> 
>> Meaning they decided it's worth the overhead. But (having read
>> through the entire description) they don't even discuss whether this
>> indeed eliminates _all_ apparent backward moves due to effects
>> like the ones named above.
>>
>> Plus, the contention they're facing is limited to a single VM, i.e. likely
>> much more narrow than that on an entire physical system. So for
>> us to do the same in the hypervisor, quite a bit more of win would
>> be needed to outweigh the cost.
>> 
> Experimental details look very unclear too - likely running the time
> warp test for 5 days would get some of these cases cleared out. But
> as you say it should be much more narrow that of an entire system.
> 
> BTW It was implicit in the discussion but my apologies for not
> formally/explicitly stating. So FWIW:
> 
> Tested-by: Joao Martins <joao.m.martins@oracle.com>

Thanks, but this ...

> This series is certainly a way forward into improving cross-CPU monotonicity,
> and I am seeing indeed less occurrences of time going backwards on my 
> systems.

... leaves me guessing whether the above was meant for just this
patch, or the entire series.

Jan
Joao Martins June 21, 2016, 1:57 p.m. UTC | #8
On 06/21/2016 01:28 PM, Jan Beulich wrote:
>>>> On 21.06.16 at 14:05, <joao.m.martins@oracle.com> wrote:
> 
>>
>> On 06/17/2016 08:32 AM, Jan Beulich wrote:
>>>>>> On 16.06.16 at 22:27, <joao.m.martins@oracle.com> wrote:
>>>>> I.e. my plan was, once the backwards moves are small enough, to maybe
>>>>> indeed compensate them by maintaining a global variable tracking
>>>>> the most recently returned value. There are issues with such an
>>>>> approach too, though: HT effects can result in one hyperthread
>>>>> making it just past that check of the global, then hardware
>>>>> switching to the other hyperthread, NOW() producing a slightly
>>>>> larger value there, and hardware switching back to the first
>>>>> hyperthread only after the second one consumed the result of
>>>>> NOW(). Dario's use would be unaffected by this aiui, as his NOW()
>>>>> invocations are globally serialized through a spinlock, but arbitrary
>>>>> NOW() invocations on two hyperthreads can't be made such that
>>>>> the invoking party can be guaranteed to see strictly montonic
>>>>> values.
>>>>>
>>>>> And btw., similar considerations apply for two fully independent
>>>>> CPUs, if one runs at a much higher P-state than the other (i.e.
>>>>> the faster one could overtake the slower one between the
>>>>> montonicity check in NOW() and the callers consuming the returned
>>>>> values). So in the end I'm not sure it's worth the performance hit
>>>>> such a global montonicity check would incur, and therefore I didn't
>>>>> make a respective patch part of this series.
>>>>>
>>>>
>>>> Hm, guests pvclock should have faced similar issues too as their
>>>> local stamps for each vcpu diverge. Linux commit 489fb49 ("x86, paravirt: 
>>>> Add a
>>>> global synchronization point for pvclock") depicts a fix to similar 
>>>> situations to the
>>>> scenarios you just described - which lead to have a global variable to keep 
>>>> track of
>>>> most recent timestamp. One important chunk of that commit is pasted below 
>>>> for
>>>> convenience:
>>>>
>>>> --
>>>> /*
>>>>  * Assumption here is that last_value, a global accumulator, always goes
>>>>  * forward. If we are less than that, we should not be much smaller.
>>>>  * We assume there is an error marging we're inside, and then the correction
>>>>  * does not sacrifice accuracy.
>>>>  *
>>>>  * For reads: global may have changed between test and return,
>>>>  * but this means someone else updated poked the clock at a later time.
>>>>  * We just need to make sure we are not seeing a backwards event.
>>>>  *
>>>>  * For updates: last_value = ret is not enough, since two vcpus could be
>>>>  * updating at the same time, and one of them could be slightly behind,
>>>>  * making the assumption that last_value always go forward fail to hold.
>>>>  */
>>>>  last = atomic64_read(&last_value);
>>>>  do {
>>>>      if (ret < last)
>>>>          return last;
>>>>      last = atomic64_cmpxchg(&last_value, last, ret);
>>>>  } while (unlikely(last != ret));
>>>> --
>>>
>>> Meaning they decided it's worth the overhead. But (having read
>>> through the entire description) they don't even discuss whether this
>>> indeed eliminates _all_ apparent backward moves due to effects
>>> like the ones named above.
>>>
>>> Plus, the contention they're facing is limited to a single VM, i.e. likely
>>> much more narrow than that on an entire physical system. So for
>>> us to do the same in the hypervisor, quite a bit more of win would
>>> be needed to outweigh the cost.
>>>
>> Experimental details look very unclear too - likely running the time
>> warp test for 5 days would get some of these cases cleared out. But
>> as you say it should be much more narrow that of an entire system.
>>
>> BTW It was implicit in the discussion but my apologies for not
>> formally/explicitly stating. So FWIW:
>>
>> Tested-by: Joao Martins <joao.m.martins@oracle.com>
> 
> Thanks, but this ...
> 
>> This series is certainly a way forward into improving cross-CPU monotonicity,
>> and I am seeing indeed less occurrences of time going backwards on my 
>> systems.
> 
> ... leaves me guessing whether the above was meant for just this
> patch, or the entire series.
> 
Ah sorry, a little ambiguous on my end - it is meant for the entire series.

Joao
Andrew Cooper Aug. 2, 2016, 7:30 p.m. UTC | #9
On 15/06/16 11:26, Jan Beulich wrote:
> Using the bare return value from read_platform_stime() is not suitable
> when local_time_calibration() is going to use its fast path: Divergence
> of several dozen microseconds between NOW() return values on different
> CPUs results when platform and local time don't stay in close sync.
>
> Latch local and platform time on the CPU initiating AP bringup, such
> that the AP can use these values to seed its stime_local_stamp with as
> little of an error as possible. The boot CPU, otoh, can simply
> calculate the correct initial value (other CPUs could do so too with
> even greater accuracy than the approach being introduced, but that can
> work only if all CPUs' TSCs start ticking at the same time, which
> generally can't be assumed to be the case on multi-socket systems).
>
> This slightly defers init_percpu_time() (moved ahead by commit
> dd2658f966 ["x86/time: initialise time earlier during
> start_secondary()"]) in order to reduce as much as possible the gap
> between populating the stamps and consuming them.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Subject to the style issue spotted by Joao, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>