diff mbox

[5/8] x86/time: correctly honor late clearing of TSC related feature flags

Message ID 57614A0402000078000F53A5@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich June 15, 2016, 10:28 a.m. UTC
As such clearing of flags may have an impact on the selected rendezvous
function, handle such in a central place.

But don't allow such feature flags to be cleared during CPU hotplug:
Platform and local system times may have diverged significantly by
then, potentially causing noticably (even if only temporary) strange
behavior. As we're anyway expecting only sufficiently similar CPUs to
appear during hotplug, this shouldn't be introducing new limitations.

Reported-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
x86/time: correctly honor late clearing of TSC related feature flags

As such clearing of flags may have an impact on the selected rendezvous
function, handle such in a central place.

But don't allow such feature flags to be cleared during CPU hotplug:
Platform and local system times may have diverged significantly by
then, potentially causing noticably (even if only temporary) strange
behavior. As we're anyway expecting only sufficiently similar CPUs to
appear during hotplug, this shouldn't be introducing new limitations.

Reported-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -1135,8 +1135,8 @@ static int mwait_idle_cpu_init(struct no
 		}
 
 		if (state > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
-		    !pm_idle_save)
-			setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+		    !pm_idle_save && system_state < SYS_STATE_active)
+			clear_tsc_cap(X86_FEATURE_TSC_RELIABLE);
 
 		cx = dev->states + dev->count;
 		cx->type = state;
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
                      &r, 1);
 }
 
+void __init clear_tsc_cap(unsigned int feature)
+{
+    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
+
+    if ( feature )
+        setup_clear_cpu_cap(feature);
+
+    /* If we have constant-rate TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
+        if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
+            rendezvous_fn = time_calibration_tsc_rendezvous;
+    }
+
+    time_calibration_rendezvous_fn = rendezvous_fn;
+}
+
 static struct {
     s_time_t local_stime, master_stime;
 } ap_bringup_ref;
@@ -1482,7 +1500,7 @@ static int __init verify_tsc_reliability
         {
             printk("%s: TSC warp detected, disabling TSC_RELIABLE\n",
                    __func__);
-            setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+            clear_tsc_cap(X86_FEATURE_TSC_RELIABLE);
         }
     }
 
@@ -1495,13 +1513,7 @@ int __init init_xen_time(void)
 {
     tsc_check_writability();
 
-    /* If we have constant-rate TSCs then scale factor can be shared. */
-    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
-    {
-        /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
-        if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
-            time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
-    }
+    clear_tsc_cap(0);
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -71,6 +71,7 @@ void tsc_get_info(struct domain *d, uint
 void force_update_vcpu_system_time(struct vcpu *v);
 
 int host_tsc_is_safe(void);
+void clear_tsc_cap(unsigned int feature);
 void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
                      uint32_t *ecx, uint32_t *edx);

Comments

Andrew Cooper June 20, 2016, 2:32 p.m. UTC | #1
On 15/06/16 11:28, Jan Beulich wrote:
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>                       &r, 1);
>  }
>  
> +void __init clear_tsc_cap(unsigned int feature)
> +{
> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;

This should read time_calibration_rendezvous_fn rather than assuming
time_calibration_std_rendezvous.

Otherwise, there is a risk that it could be reset back to
time_calibration_std_rendezvous.

~Andrew
Jan Beulich June 20, 2016, 3:20 p.m. UTC | #2
>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
> On 15/06/16 11:28, Jan Beulich wrote:
>> --- a/xen/arch/x86/time.c
>> +++ b/xen/arch/x86/time.c
>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>                       &r, 1);
>>  }
>>  
>> +void __init clear_tsc_cap(unsigned int feature)
>> +{
>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
> 
> This should read time_calibration_rendezvous_fn rather than assuming
> time_calibration_std_rendezvous.
> 
> Otherwise, there is a risk that it could be reset back to
> time_calibration_std_rendezvous.

But that's the purpose: We may need to switch back.

Jan
Andrew Cooper July 4, 2016, 3:39 p.m. UTC | #3
On 20/06/16 16:20, Jan Beulich wrote:
>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>> On 15/06/16 11:28, Jan Beulich wrote:
>>> --- a/xen/arch/x86/time.c
>>> +++ b/xen/arch/x86/time.c
>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>                       &r, 1);
>>>  }
>>>  
>>> +void __init clear_tsc_cap(unsigned int feature)
>>> +{
>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>> This should read time_calibration_rendezvous_fn rather than assuming
>> time_calibration_std_rendezvous.
>>
>> Otherwise, there is a risk that it could be reset back to
>> time_calibration_std_rendezvous.
> But that's the purpose: We may need to switch back.

Under what circumstances could we ever move from re-syncing back to not
re-syncing?

Even in a scenario with pcpu hotplug, where we lose the final pcpu which
was causing re-syncing, we don't know that one of the intermediate pcpus
hasn't gone and come back, with a different TSC base.

~Andrew
Jan Beulich July 4, 2016, 3:53 p.m. UTC | #4
>>> On 04.07.16 at 17:39, <andrew.cooper3@citrix.com> wrote:
> On 20/06/16 16:20, Jan Beulich wrote:
>>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>>> On 15/06/16 11:28, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/time.c
>>>> +++ b/xen/arch/x86/time.c
>>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>>                       &r, 1);
>>>>  }
>>>>  
>>>> +void __init clear_tsc_cap(unsigned int feature)
>>>> +{
>>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>>> This should read time_calibration_rendezvous_fn rather than assuming
>>> time_calibration_std_rendezvous.
>>>
>>> Otherwise, there is a risk that it could be reset back to
>>> time_calibration_std_rendezvous.
>> But that's the purpose: We may need to switch back.
> 
> Under what circumstances could we ever move from re-syncing back to not
> re-syncing?

verify_tsc_reliability() may result in X86_FEATURE_TSC_RELIABLE
getting cleared. That's an initcall, which means it runs after
init_xen_time(), and hence after the rendezvous function got
established initially.

Jan
Andrew Cooper Aug. 2, 2016, 7:08 p.m. UTC | #5
On 04/07/16 16:53, Jan Beulich wrote:
>>>> On 04.07.16 at 17:39, <andrew.cooper3@citrix.com> wrote:
>> On 20/06/16 16:20, Jan Beulich wrote:
>>>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>>>> On 15/06/16 11:28, Jan Beulich wrote:
>>>>> --- a/xen/arch/x86/time.c
>>>>> +++ b/xen/arch/x86/time.c
>>>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>>>                       &r, 1);
>>>>>  }
>>>>>  
>>>>> +void __init clear_tsc_cap(unsigned int feature)
>>>>> +{
>>>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>>>> This should read time_calibration_rendezvous_fn rather than assuming
>>>> time_calibration_std_rendezvous.
>>>>
>>>> Otherwise, there is a risk that it could be reset back to
>>>> time_calibration_std_rendezvous.
>>> But that's the purpose: We may need to switch back.
>> Under what circumstances could we ever move from re-syncing back to not
>> re-syncing?
> verify_tsc_reliability() may result in X86_FEATURE_TSC_RELIABLE
> getting cleared. That's an initcall, which means it runs after
> init_xen_time(), and hence after the rendezvous function got
> established initially.

Right, but that isn't important.

There will never be a case where, once TSC_RELIABLE is cleared, it is
safe to revert back to std_rendezvous, even if TSC_RELIABLE is
subsequently re-set.

Therefore, this function must never accidentally revert
time_calibration_rendezvous_fn from time_calibration_tsc_rendezvous back
to time_calibration_std_rendezvous.

~Andrew
Jan Beulich Aug. 3, 2016, 9:43 a.m. UTC | #6
>>> On 02.08.16 at 21:08, <andrew.cooper3@citrix.com> wrote:
> On 04/07/16 16:53, Jan Beulich wrote:
>>>>> On 04.07.16 at 17:39, <andrew.cooper3@citrix.com> wrote:
>>> On 20/06/16 16:20, Jan Beulich wrote:
>>>>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>>>>> On 15/06/16 11:28, Jan Beulich wrote:
>>>>>> --- a/xen/arch/x86/time.c
>>>>>> +++ b/xen/arch/x86/time.c
>>>>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>>>>                       &r, 1);
>>>>>>  }
>>>>>>  
>>>>>> +void __init clear_tsc_cap(unsigned int feature)
>>>>>> +{
>>>>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>>>>> This should read time_calibration_rendezvous_fn rather than assuming
>>>>> time_calibration_std_rendezvous.
>>>>>
>>>>> Otherwise, there is a risk that it could be reset back to
>>>>> time_calibration_std_rendezvous.
>>>> But that's the purpose: We may need to switch back.
>>> Under what circumstances could we ever move from re-syncing back to not
>>> re-syncing?
>> verify_tsc_reliability() may result in X86_FEATURE_TSC_RELIABLE
>> getting cleared. That's an initcall, which means it runs after
>> init_xen_time(), and hence after the rendezvous function got
>> established initially.
> 
> Right, but that isn't important.
> 
> There will never be a case where, once TSC_RELIABLE is cleared, it is
> safe to revert back to std_rendezvous, even if TSC_RELIABLE is
> subsequently re-set.

You've got this backwards: TSC_RELIABLE may get _cleared_ late.
Nothing can ever set it late, due to the use of setup_clear_cpu_cap().
Reverting back to time_calibration_std_rendezvous() would only be
possible if CONSTANT_TSC got cleared late, ...

> Therefore, this function must never accidentally revert
> time_calibration_rendezvous_fn from time_calibration_tsc_rendezvous back
> to time_calibration_std_rendezvous.

... yet I can't see what would be wrong with that especially when
that still happened at boot time. Or else how would it be safe to
switch the other direction? (If neither switch was safe, then all we
could do upon finding the TSC unreliable in verify_tsc_reliability()
would be to panic().)

Jan
Andrew Cooper Aug. 31, 2016, 1:42 p.m. UTC | #7
On 03/08/16 10:43, Jan Beulich wrote:
>>>> On 02.08.16 at 21:08, <andrew.cooper3@citrix.com> wrote:
>> On 04/07/16 16:53, Jan Beulich wrote:
>>>>>> On 04.07.16 at 17:39, <andrew.cooper3@citrix.com> wrote:
>>>> On 20/06/16 16:20, Jan Beulich wrote:
>>>>>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>>>>>> On 15/06/16 11:28, Jan Beulich wrote:
>>>>>>> --- a/xen/arch/x86/time.c
>>>>>>> +++ b/xen/arch/x86/time.c
>>>>>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>>>>>                       &r, 1);
>>>>>>>  }
>>>>>>>  
>>>>>>> +void __init clear_tsc_cap(unsigned int feature)
>>>>>>> +{
>>>>>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>>>>>> This should read time_calibration_rendezvous_fn rather than assuming
>>>>>> time_calibration_std_rendezvous.
>>>>>>
>>>>>> Otherwise, there is a risk that it could be reset back to
>>>>>> time_calibration_std_rendezvous.
>>>>> But that's the purpose: We may need to switch back.
>>>> Under what circumstances could we ever move from re-syncing back to not
>>>> re-syncing?
>>> verify_tsc_reliability() may result in X86_FEATURE_TSC_RELIABLE
>>> getting cleared. That's an initcall, which means it runs after
>>> init_xen_time(), and hence after the rendezvous function got
>>> established initially.
>> Right, but that isn't important.
>>
>> There will never be a case where, once TSC_RELIABLE is cleared, it is
>> safe to revert back to std_rendezvous, even if TSC_RELIABLE is
>> subsequently re-set.
> You've got this backwards: TSC_RELIABLE may get _cleared_ late.

Quite - I haven't got this backwards.

> Nothing can ever set it late, due to the use of setup_clear_cpu_cap().
> Reverting back to time_calibration_std_rendezvous() would only be
> possible if CONSTANT_TSC got cleared late, ...

time_calibration_rendezvous_fn defaults to
time_calibration_std_rendezvous(), i.e. defaults to the assumption that
the TSCs are invariant.

We then later call clear_caps(TSC_RELIABLE), and the default changes to
time_calibration_tsc_rendezvous().

We then later call clear_tsc_cap(CONSTANT_TSC), or indeed that
CONSTANT_TSC was never set in the first place, and the default switches
back to time_calibration_std_rendezvous() because of the aformentioned bug.

Once the switch to time_calibration_tsc_rendezvous() is made, it is
never safe to switch back.

Therefore, your function must read time_calibration_rendezvous_fn and
not assume time_calibration_std_rendezvous().

~Andrew
Jan Beulich Aug. 31, 2016, 2:03 p.m. UTC | #8
>>> On 31.08.16 at 15:42, <andrew.cooper3@citrix.com> wrote:
> On 03/08/16 10:43, Jan Beulich wrote:
>>>>> On 02.08.16 at 21:08, <andrew.cooper3@citrix.com> wrote:
>>> On 04/07/16 16:53, Jan Beulich wrote:
>>>>>>> On 04.07.16 at 17:39, <andrew.cooper3@citrix.com> wrote:
>>>>> On 20/06/16 16:20, Jan Beulich wrote:
>>>>>>>>> On 20.06.16 at 16:32, <andrew.cooper3@citrix.com> wrote:
>>>>>>> On 15/06/16 11:28, Jan Beulich wrote:
>>>>>>>> --- a/xen/arch/x86/time.c
>>>>>>>> +++ b/xen/arch/x86/time.c
>>>>>>>> @@ -1358,6 +1358,24 @@ static void time_calibration(void *unuse
>>>>>>>>                       &r, 1);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +void __init clear_tsc_cap(unsigned int feature)
>>>>>>>> +{
>>>>>>>> +    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
>>>>>>> This should read time_calibration_rendezvous_fn rather than assuming
>>>>>>> time_calibration_std_rendezvous.
>>>>>>>
>>>>>>> Otherwise, there is a risk that it could be reset back to
>>>>>>> time_calibration_std_rendezvous.
>>>>>> But that's the purpose: We may need to switch back.
>>>>> Under what circumstances could we ever move from re-syncing back to not
>>>>> re-syncing?
>>>> verify_tsc_reliability() may result in X86_FEATURE_TSC_RELIABLE
>>>> getting cleared. That's an initcall, which means it runs after
>>>> init_xen_time(), and hence after the rendezvous function got
>>>> established initially.
>>> Right, but that isn't important.
>>>
>>> There will never be a case where, once TSC_RELIABLE is cleared, it is
>>> safe to revert back to std_rendezvous, even if TSC_RELIABLE is
>>> subsequently re-set.
>> You've got this backwards: TSC_RELIABLE may get _cleared_ late.
> 
> Quite - I haven't got this backwards.
> 
>> Nothing can ever set it late, due to the use of setup_clear_cpu_cap().
>> Reverting back to time_calibration_std_rendezvous() would only be
>> possible if CONSTANT_TSC got cleared late, ...
> 
> time_calibration_rendezvous_fn defaults to
> time_calibration_std_rendezvous(), i.e. defaults to the assumption that
> the TSCs are invariant.
> 
> We then later call clear_caps(TSC_RELIABLE), and the default changes to
> time_calibration_tsc_rendezvous().
> 
> We then later call clear_tsc_cap(CONSTANT_TSC), or indeed that
> CONSTANT_TSC was never set in the first place, and the default switches
> back to time_calibration_std_rendezvous() because of the aformentioned bug.
> 
> Once the switch to time_calibration_tsc_rendezvous() is made, it is
> never safe to switch back.

You still don't explain why - I don't see what's wrong with doing
so namely when there wasn't a whole lot of skew gained yet. Plus
I don't see why running with tsc_rendezvous is fine when one of
the two pre-conditions for switching to it isn't met, but we find
out only after having brought up APs.

Jan

> Therefore, your function must read time_calibration_rendezvous_fn and
> not assume time_calibration_std_rendezvous().
> 
> ~Andrew
diff mbox

Patch

--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -1135,8 +1135,8 @@  static int mwait_idle_cpu_init(struct no
 		}
 
 		if (state > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
-		    !pm_idle_save)
-			setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+		    !pm_idle_save && system_state < SYS_STATE_active)
+			clear_tsc_cap(X86_FEATURE_TSC_RELIABLE);
 
 		cx = dev->states + dev->count;
 		cx->type = state;
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1358,6 +1358,24 @@  static void time_calibration(void *unuse
                      &r, 1);
 }
 
+void __init clear_tsc_cap(unsigned int feature)
+{
+    void (*rendezvous_fn)(void *) = time_calibration_std_rendezvous;
+
+    if ( feature )
+        setup_clear_cpu_cap(feature);
+
+    /* If we have constant-rate TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
+        if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
+            rendezvous_fn = time_calibration_tsc_rendezvous;
+    }
+
+    time_calibration_rendezvous_fn = rendezvous_fn;
+}
+
 static struct {
     s_time_t local_stime, master_stime;
 } ap_bringup_ref;
@@ -1482,7 +1500,7 @@  static int __init verify_tsc_reliability
         {
             printk("%s: TSC warp detected, disabling TSC_RELIABLE\n",
                    __func__);
-            setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+            clear_tsc_cap(X86_FEATURE_TSC_RELIABLE);
         }
     }
 
@@ -1495,13 +1513,7 @@  int __init init_xen_time(void)
 {
     tsc_check_writability();
 
-    /* If we have constant-rate TSCs then scale factor can be shared. */
-    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
-    {
-        /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */
-        if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
-            time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
-    }
+    clear_tsc_cap(0);
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -71,6 +71,7 @@  void tsc_get_info(struct domain *d, uint
 void force_update_vcpu_system_time(struct vcpu *v);
 
 int host_tsc_is_safe(void);
+void clear_tsc_cap(unsigned int feature);
 void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
                      uint32_t *ecx, uint32_t *edx);