[2/5] x86/time: implement tsc as clocksource

Message ID 1458231136-13457-3-git-send-email-joao.m.martins@oracle.com (mailing list archive)
State New, archived

Commit Message

Joao Martins March 17, 2016, 4:12 p.m. UTC
Introduce support for using the TSC as platform time source, which is
the highest-resolution time source and the cheapest to read (~20 ns).
There are, however, several problems associated with its use: there is
no complete (architecturally defined) guarantee that all machines will
provide a reliable and monotonic TSC across all CPUs, on different
sockets and in different P/C states.  I believe Intel is the only
vendor that guarantees this.  For this reason the TSC is given lower
priority than the HPET unless the administrator changes the
"clocksource" boot option to "tsc".  Upon initialization we also check
for time warps and for the relevant CPU features, i.e. TSC_RELIABLE,
CONSTANT_TSC and NONSTOP_TSC.  If none of these conditions are met,
initialization fails so that we fall back to the next available
clocksource.

It is also worth noting that with clocksource=tsc there is no need to
synchronise with another clocksource, and I could verify that a great
portion of the time skew was eliminated, with far fewer time warps
occurring.  With the HPET I observed ~500 warps of around 27 us over a
period of 1h; with the TSC this drops to ~50 warps in the same period,
each warp being < 100 ns.  The warps that remain are related only to
cross-CPU calibration (i.e. how long it takes to rendezvous with the
master), which a later patch in this series aims to address.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
Cc: Keir Fraser <keir@xen.org>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>

Changes since RFC:
 - Spelling fixes in the commit message.
 - Remove the unused clocksource_is_tsc variable and instead introduce it
   in the patch that uses it.
 - Move plt_tsc from second to last in the available clocksources.
---
 xen/arch/x86/time.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 2 deletions(-)

Comments

Andrew Cooper March 18, 2016, 8:21 p.m. UTC | #1
On 17/03/16 16:12, Joao Martins wrote:
> [...]

Some style corrections, but no functional problems.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> [...]
> diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
> index 687e39b..1311c58 100644
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -432,6 +432,63 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
>  }
>  
>  /************************************************************
> + * PLATFORM TIMER 4: TSC
> + */
> +static u64 tsc_freq;
> +static unsigned long tsc_max_warp;
> +static void tsc_check_reliability(void);
> +
> +static int __init init_tsctimer(struct platform_timesource *pts)
> +{
> +    bool_t tsc_reliable = 0;
> +
> +    tsc_check_reliability();
> +
> +    if ( tsc_max_warp > 0 )
> +    {
> +        tsc_reliable = 0;
> +        printk("TSC: didn't passed warp test\n");

printk(XENLOG_INFO "TSC ...
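i.e. the full line presumably becoming:

printk(XENLOG_INFO "TSC: didn't pass warp test\n");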

> +    }
> +    else if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ||
> +              ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
> +                boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) )

You don't need extra spaces on inner brackets, so

boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ||
(boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
 boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) )

is fine.

> +    {
> +        tsc_reliable = 1;
> +    }
> +    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
> +    {
> +        tsc_reliable = (max_cstate <= 2);
> +
> +        if (tsc_reliable)

Please add some extra ones here though.
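i.e. presumably:

if ( tsc_reliable )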

> +            printk("TSC: no deep Cstates, deemed reliable\n");
> +        else
> +            printk("TSC: deep Cstates possible, so not reliable\n");

XENLOG_INFO as well please.

~Andrew
Joao Martins March 21, 2016, 11:43 a.m. UTC | #2
On 03/18/2016 08:21 PM, Andrew Cooper wrote:
> On 17/03/16 16:12, Joao Martins wrote:
>> [...]
> 
> Some style corrections, but no functional problems.
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> 
I've fixed all your comments for v2, thanks!

Joao Martins March 22, 2016, 12:41 p.m. UTC | #3
On 03/18/2016 08:21 PM, Andrew Cooper wrote:
> On 17/03/16 16:12, Joao Martins wrote:
>> [...]
> 
> Some style corrections, but no functional problems.
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> 
I found one issue in the TSC clocksource initialization path with respect
to the reliability check. This check runs when the platform timer is
initialized, but not all CPUs are up at that point (init_xen_time()),
which means that the check will always succeed. So for clocksource=tsc I
need to defer initialization to a later point (in verify_tsc_reliability())
and, if successful, switch to that clocksource. Unless you disagree, v2
would have this and just requires one additional preparatory change prior
to this patch.
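
Roughly, what I have in mind is along these lines (illustrative sketch
only, not the actual v2 code; try_platform_timer() is a made-up name for
whatever helper the preparatory patch ends up adding):

static int __init verify_tsc_reliability(void)
{
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
    {
        /* All boot-time CPUs are online by now, unlike in init_xen_time(). */
        tsc_check_reliability();
        if ( tsc_max_warp )
            setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
        else if ( !strcmp(opt_clocksource, "tsc") )
            /* Made-up helper: re-run init_tsctimer() and switch over. */
            try_platform_timer(&plt_tsc);
    }

    return 0;
}
__initcall(verify_tsc_reliability);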

Joao

Jan Beulich March 22, 2016, 12:46 p.m. UTC | #4
>>> On 22.03.16 at 13:41, <joao.m.martins@oracle.com> wrote:

> 
> [...]
> I found one issue in the TSC clocksource initialization path with respect
> to the reliability check. This check runs when the platform timer is
> initialized, but not all CPUs are up at that point (init_xen_time()),
> which means that the check will always succeed. So for clocksource=tsc I
> need to defer initialization to a later point (in verify_tsc_reliability())
> and, if successful, switch to that clocksource. Unless you disagree, v2
> would have this and just requires one additional preparatory change prior
> to this patch.

Hmm, that's suspicious when thinking about CPU hotplug: What
do you intend to do when a CPU comes online later, failing the
check? So far we don't do runtime switching of the clock source,
and I'm not sure that's a good idea to do when there are running
guests.

Jan
Joao Martins March 22, 2016, 3:51 p.m. UTC | #5
On 03/22/2016 12:46 PM, Jan Beulich wrote:
>>>> On 22.03.16 at 13:41, <joao.m.martins@oracle.com> wrote:
> 
>> [...]
> 
> Hmm, that's suspicious when thinking about CPU hotplug: What
> do you intend to do when a CPU comes online later, failing the
> check?
Good point, but I am not sure whether that would happen. The initcall
verify_tsc_reliability seems to be called only for the boot processor. Or
are you saying that initcalls are also called when hotplugging CPUs later
on? If that's the case then my suggestion wouldn't work as you point out -
or rather, not without having runtime switching support, as you point out
below.

> So far we don't do runtime switching of the clock source,
> and I'm not sure that's a good idea to do when there are running
> guests.
Totally agree, but I am proposing to do this in the initialization phase,
where no guests would be running yet. We would start with HPET, then
switch to TSC only if that check has passed (and this would happen once).

A simpler alternative would be to call init_xen_time after all CPUs are
brought up (which would also keep this patch as is), but I am not sure
about the repercussions of that.

Joao
Jan Beulich March 22, 2016, 4:02 p.m. UTC | #6
>>> On 22.03.16 at 16:51, <joao.m.martins@oracle.com> wrote:

> 
> On 03/22/2016 12:46 PM, Jan Beulich wrote:
>>>>> On 22.03.16 at 13:41, <joao.m.martins@oracle.com> wrote:
>> 
>>> [...]
>> 
>> Hmm, that's suspicious when thinking about CPU hotplug: What
>> do you intend to do when a CPU comes online later, failing the
>> check?
> Good point, but I am not sure whether that would happen. The initcall
> verify_tsc_reliability seems to be called only for the boot processor. Or
> are you saying that initcalls are also called when hotplugging CPUs later
> on? If that's the case then my suggestion wouldn't work as you point out -
> or rather, not without having runtime switching support, as you point out
> below.

Looks like I didn't express myself clearly enough: "failing the check"
wasn't meant to imply the failure would actually occur, but rather
that failure would occur if the check was run on that CPU. IOW the
case of a CPU getting hotplugged which doesn't satisfy the needs
for selecting TSC as the clock source.

>> So far we don't do runtime switching of the clock source,
>> and I'm not sure that's a good idea to do when there are running
>> guests.
> Totally agree, but I am proposing to do this in the initialization phase,
> where no guests would be running yet. We would start with HPET, then
> switch to TSC only if that check has passed (and this would happen once).
> 
> A simpler alternative would be to call init_xen_time after all CPUs are
> brought up (which would also keep this patch as is), but I am not sure
> about the repercussions of that.

I don't see how either would help with the hotplug case.

Jan
Joao Martins March 22, 2016, 8:40 p.m. UTC | #7
On 03/22/2016 04:02 PM, Jan Beulich wrote:
>>>> On 22.03.16 at 16:51, <joao.m.martins@oracle.com> wrote:
> 
>> [...]
> 
> Looks like I didn't express myself clearly enough: "failing the check"
> wasn't meant to imply the failure would actually occur, but rather
> that failure would occur if the check was run on that CPU. IOW the
> case of a CPU getting hotplugged which doesn't satisfy the needs
> for selecting TSC as the clock source.
Ah, I see. I believe this wouldn't be an issue with the current rendezvous
mechanism (std_rendezvous), as the delta is computed between the
local_tsc_stamp taken on that (hotplugged) CPU during calibration and the
rdtsc() on that same CPU in the guest, even though CPU0's TSC (the master)
serves as system_time.
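
(For reference, per-CPU system time is derived roughly like this - a
simplification of get_s_time() in xen/arch/x86/time.c - so only the
stamps taken locally on each CPU enter the computation:)

static s_time_t get_s_time(void)
{
    const struct cpu_time *t = &this_cpu(cpu_time);
    u64 delta = rdtsc() - t->local_tsc_stamp;  /* stamp taken on this CPU */

    return t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
}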

However, it can be a problem with PVCLOCK_TSC_STABLE_BIT, as the hotplugged
CPU could have its TSC drifted, and since setting this flag relies on the
synchronization of TSCs we would need to clear the flag entirely.

>> [...]
> 
> I don't see how either would help with the hotplug case.
This was in response to what I thought you meant with your earlier question
(which I misunderstood). But my question is still valid, I believe: the
reason for choosing between my suggested approaches is that
tsc_check_reliability() requires all CPUs to be up so that the check is
performed correctly.

Joao
Jan Beulich March 23, 2016, 7:28 a.m. UTC | #8
>>> On 22.03.16 at 21:40, <joao.m.martins@oracle.com> wrote:
> On 03/22/2016 04:02 PM, Jan Beulich wrote:
>>>>> On 22.03.16 at 16:51, <joao.m.martins@oracle.com> wrote:
>>> On 03/22/2016 12:46 PM, Jan Beulich wrote:
>>>>>>> On 22.03.16 at 13:41, <joao.m.martins@oracle.com> wrote:
>> [...]
> Ah, I see. I believe this wouldn't be an issue with the current rendezvous
> mechanism (std_rendezvous), as the delta is computed between the
> local_tsc_stamp taken on that (hotplugged) CPU during calibration and the
> rdtsc() on that same CPU in the guest, even though CPU0's TSC (the master)
> serves as system_time.
> 
> However, it can be a problem with PVCLOCK_TSC_STABLE_BIT, as the hotplugged
> CPU could have its TSC drifted, and since setting this flag relies on the
> synchronization of TSCs we would need to clear the flag entirely.

Except that we can't, after guests already got started, validly clear
that flag afaics. The only option I see here would be to never set
this flag if CPU hotplug is possible - by looking at the hot-pluggable
CPU count and, if non-zero, perhaps allowing a command line
override to indicate no hotplug is intended (it may well be that such
is already implicitly available).
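
Something along these lines, perhaps (a sketch only - use_tsc_stable_bit
is a made-up flag name, and comparing present against possible CPUs is
just one way of detecting that no further CPUs can be hotplugged):

/* Never advertise the stable bit if more CPUs may appear later. */
if ( num_present_cpus() == nr_cpu_ids )
    use_tsc_stable_bit = 1;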

>> [...]
>> 
>> I don't see how either would help with the hotplug case.
> This was in response to what I thought you meant with your earlier question
> (which I misunderstood). But my question is still valid, I believe: the
> reason for choosing between my suggested approaches is that
> tsc_check_reliability() requires all CPUs to be up so that the check is
> performed correctly.

Sure - it seems quite obvious that all boot time available CPUs
should be checked.

Jan
Joao Martins March 23, 2016, 12:05 p.m. UTC | #9
On 03/23/2016 07:28 AM, Jan Beulich wrote:
>>>> On 22.03.16 at 21:40, <joao.m.martins@oracle.com> wrote:
>> On 03/22/2016 04:02 PM, Jan Beulich wrote:
>>>>>> On 22.03.16 at 16:51, <joao.m.martins@oracle.com> wrote:
>>>> On 03/22/2016 12:46 PM, Jan Beulich wrote:
>>>>>>>> On 22.03.16 at 13:41, <joao.m.martins@oracle.com> wrote:
>> [...]
> 
> Except that we can't, after guests already got started, validly clear
> that flag afaics.
Correct.

> The only option I see here would be to never set
> this flag if CPU hotplug is possible - by looking at the hot-pluggable
> CPU count and, if non-zero, perhaps allowing a command line
> override to indicate no hotplug is intended (it may well be that such
> is already implicitly available).
OK, will add this then to allow the flag only if the conditions above are met.
Thanks for the pointer!

>> [...]
> 
> Sure - it seems quite obvious that all boot time available CPUs
> should be checked.
Cool, so I will go with moving init_xen_time right after all CPUs are up but
before initcalls are invoked.

Joao
Jan Beulich March 23, 2016, 2:05 p.m. UTC | #10
>>> On 23.03.16 at 13:05, <joao.m.martins@oracle.com> wrote:
> On 03/23/2016 07:28 AM, Jan Beulich wrote:
>> Sure - it seems quite obvious that all boot time available CPUs
>> should be checked.
> Cool, so I will go with moving init_xen_time right after all CPUs are up but
> before initcalls are invoked.

I think your other alternative seemed more reasonable; I wouldn't
be very happy to see init_xen_time() moved around.

Jan

Patch

diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 687e39b..1311c58 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -432,6 +432,63 @@  uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 }
 
 /************************************************************
+ * PLATFORM TIMER 4: TSC
+ */
+static u64 tsc_freq;
+static unsigned long tsc_max_warp;
+static void tsc_check_reliability(void);
+
+static int __init init_tsctimer(struct platform_timesource *pts)
+{
+    bool_t tsc_reliable = 0;
+
+    tsc_check_reliability();
+
+    if ( tsc_max_warp > 0 )
+    {
+        tsc_reliable = 0;
+        printk("TSC: didn't passed warp test\n");
+    }
+    else if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ||
+              ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+                boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) )
+    {
+        tsc_reliable = 1;
+    }
+    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        tsc_reliable = (max_cstate <= 2);
+
+        if (tsc_reliable)
+            printk("TSC: no deep Cstates, deemed reliable\n");
+        else
+            printk("TSC: deep Cstates possible, so not reliable\n");
+    }
+
+    pts->frequency = tsc_freq;
+    return tsc_reliable;
+}
+
+static u64 read_tsc(void)
+{
+    return rdtsc();
+}
+
+static void resume_tsctimer(struct platform_timesource *pts)
+{
+}
+
+static struct platform_timesource __initdata plt_tsc =
+{
+    .id = "tsc",
+    .name = "TSC",
+    .read_counter = read_tsc,
+    .counter_bits = 64,
+    .init = init_tsctimer,
+    .resume = resume_tsctimer,
+};
+
+/************************************************************
  * GENERIC PLATFORM TIMER INFRASTRUCTURE
  */
 
@@ -536,7 +593,7 @@  static void resume_platform_timer(void)
 static void __init init_platform_timer(void)
 {
     static struct platform_timesource * __initdata plt_timers[] = {
-        &plt_hpet, &plt_pmtimer, &plt_pit
+        &plt_hpet, &plt_pmtimer, &plt_pit, &plt_tsc
     };
 
     struct platform_timesource *pts = NULL;
@@ -1181,7 +1238,7 @@  static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
     }
 }
 
-static unsigned long tsc_max_warp, tsc_check_count;
+static unsigned long tsc_check_count;
 static cpumask_t tsc_check_cpumask;
 
 static void tsc_check_slave(void *unused)
@@ -1465,6 +1522,7 @@  void __init early_time_init(void)
     struct cpu_time *t = &this_cpu(cpu_time);
     u64 tmp = init_pit_and_calibrate_tsc();
 
+    tsc_freq = tmp;
     set_time_scale(&t->tsc_scale, tmp);
     t->local_tsc_stamp = boot_tsc_stamp;
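
For completeness: even with this patch applied, the TSC is only selected
when asked for explicitly on the hypervisor command line, e.g.
(illustrative GRUB entry; paths and other options will differ):

multiboot /boot/xen.gz clocksource=tsc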