diff mbox

tsc: use kvmclock for calibration

Message ID 1344513463-7329-1-git-send-email-kraxel@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gerd Hoffmann Aug. 9, 2012, 11:57 a.m. UTC
Use kvmclock for tsc calibration when running on kvm.  Without this the
tsc frequency calibrated by seabios can be *way* off in case the virtual
machine is booted on a loaded host.  I've seen seabios calibrating 27
instead of ca. 2800 MHz, resulting in timeouts being too short by a factor
of 100.  Which in turn leads to disk I/O errors due to timeouts, especially
as I/O requests tend to take a bit longer than usual on a loaded box ...

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/clock.c    |    9 +++++
 src/paravirt.c |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/paravirt.h |    1 +
 3 files changed, 100 insertions(+), 0 deletions(-)

Comments

Avi Kivity Aug. 9, 2012, 12:53 p.m. UTC | #1
On 08/09/2012 02:57 PM, Gerd Hoffmann wrote:
> Use kvmclock for tsc calibration when running on kvm.  Without this the
> tsc frequency calibrated by seabios can be *way* off in case the virtual
> machine is booted on a loaded host.  I've seen seabios calibrating 27
> instead of ca. 2800 MHz, resulting in timeouts being to short by factor
> 100.  Which in turn leads to disk I/O errors due to timeouts, especially
> as I/O requests tend to take a bit longer than usual on a loaded box ...

> +
> +struct pvclock_vcpu_time_info {
> +	u32   version;
> +	u32   pad0;
> +	u64   tsc_timestamp;
> +	u64   system_time;
> +	u32   tsc_to_system_mul;
> +	s8    tsc_shift;
> +	u8    flags;
> +	u8    pad[2];
> +} PACKED;
> +
> +
> +u64 kvm_tsc_khz(void)
> +{
> +    u32 eax, ebx, ecx, edx, msr;
> +    struct pvclock_vcpu_time_info time;
> +    u32 addr = (u32)(&time);
> +    u64 khz;
> +
> +    /* check presence and figure msr number */
> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
> +        msr = MSR_KVM_SYSTEM_TIME;
> +    } else {
> +        return 0;
> +    }
> +
> +    /* ask kvm hypervisor to fill struct */
> +    memset(&time, 0, sizeof(time));
> +    wrmsr(msr, addr | 1);

How can this work?  There is a 64-byte alignment requirement.

> +    wrmsr(msr, 0);
> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
> +        return 0;
> +
> +    /* go figure tsc frequency */
> +    khz = pvclock_tsc_khz(&time);
> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
> +            msr, (u32)khz / 1000);
> +    return khz;

That's a meaningless number.  You can be migrated to a cpu or a machine
with very different tsc.

You want accurate time on kvm, don't use the tsc.
Fred . Aug. 9, 2012, 1:25 p.m. UTC | #2
It should be kHz not khz.

-            msr, (u32)khz / 1000);
+            msr, (u32)kHz / 1000);

On Thu, Aug 9, 2012 at 2:53 PM, Avi Kivity <avi@redhat.com> wrote:
> On 08/09/2012 02:57 PM, Gerd Hoffmann wrote:
>> Use kvmclock for tsc calibration when running on kvm.  Without this the
>> tsc frequency calibrated by seabios can be *way* off in case the virtual
>> machine is booted on a loaded host.  I've seen seabios calibrating 27
>> instead of ca. 2800 MHz, resulting in timeouts being to short by factor
>> 100.  Which in turn leads to disk I/O errors due to timeouts, especially
>> as I/O requests tend to take a bit longer than usual on a loaded box ...
>
>> +
>> +struct pvclock_vcpu_time_info {
>> +     u32   version;
>> +     u32   pad0;
>> +     u64   tsc_timestamp;
>> +     u64   system_time;
>> +     u32   tsc_to_system_mul;
>> +     s8    tsc_shift;
>> +     u8    flags;
>> +     u8    pad[2];
>> +} PACKED;
>> +
>> +
>> +u64 kvm_tsc_khz(void)
>> +{
>> +    u32 eax, ebx, ecx, edx, msr;
>> +    struct pvclock_vcpu_time_info time;
>> +    u32 addr = (u32)(&time);
>> +    u64 khz;
>> +
>> +    /* check presence and figure msr number */
>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
>> +        msr = MSR_KVM_SYSTEM_TIME;
>> +    } else {
>> +        return 0;
>> +    }
>> +
>> +    /* ask kvm hypervisor to fill struct */
>> +    memset(&time, 0, sizeof(time));
>> +    wrmsr(msr, addr | 1);
>
> How can this work?  There is a 64-byte alignment requirement.
>
>> +    wrmsr(msr, 0);
>> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
>> +        return 0;
>> +
>> +    /* go figure tsc frequency */
>> +    khz = pvclock_tsc_khz(&time);
>> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
>> +            msr, (u32)khz / 1000);
>> +    return khz;
>
> That's a meaningless number.  You can be migrated to a cpu or a machine
> with very different tsc.
>
> You want accurate time on kvm, don't use the tsc.
>
>
> --
> error compiling committee.c: too many arguments to function
>
> _______________________________________________
> SeaBIOS mailing list
> SeaBIOS@seabios.org
> http://www.seabios.org/mailman/listinfo/seabios
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gerd Hoffmann Aug. 9, 2012, 1:57 p.m. UTC | #3
Hi,

>> +u64 kvm_tsc_khz(void)
>> +{
>> +    u32 eax, ebx, ecx, edx, msr;
>> +    struct pvclock_vcpu_time_info time;
>> +    u32 addr = (u32)(&time);
>> +    u64 khz;
>> +
>> +    /* check presence and figure msr number */
>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
>> +        msr = MSR_KVM_SYSTEM_TIME;
>> +    } else {
>> +        return 0;
>> +    }
>> +
>> +    /* ask kvm hypervisor to fill struct */
>> +    memset(&time, 0, sizeof(time));
>> +    wrmsr(msr, addr | 1);
> 
> How can this work?

It did in my testing, although maybe by pure luck ...

> There is a 64-byte alignment requirement.

64 bytes?  Sure?  The whole struct is only 32 bytes in size ...

Easily fixable though, just need to grab some memory with memalign
instead of using the stack.

>> +    wrmsr(msr, 0);
>> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
>> +        return 0;
>> +
>> +    /* go figure tsc frequency */
>> +    khz = pvclock_tsc_khz(&time);
>> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
>> +            msr, (u32)khz / 1000);
>> +    return khz;
> 
> That's a meaningless number.  You can be migrated to a cpu or a machine
> with very different tsc.

> You want accurate time on kvm, don't use the tsc.

seabios uses the tsc for timeout calculations only, so it doesn't need
to be 100% accurate.  The order of magnitude should be correct though.
The Linux kernel uses the value for delay loops too, so using it for the
given purpose can't be *that* horrible after all ...

It is certainly an improvement over the current code which tries to
calibrate the tsc and gets totally broken results in case the busy host
happens to schedule the guest in the middle of calibration.

So what do you suggest?  The options I see are:

  (1) Use this patch (with alignment issue fixed of course).
  (2) Do a full kvmclock implementation.  Feels a bit like overkill.
  (3) SeaBIOS can fallback to the PIT for timing on machines which
      have no TSC.  We could do that too in case we detect kvm ...

cheers,
  Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 9, 2012, 2:01 p.m. UTC | #4
On 08/09/2012 04:57 PM, Gerd Hoffmann wrote:
>   Hi,
> 
>>> +u64 kvm_tsc_khz(void)
>>> +{
>>> +    u32 eax, ebx, ecx, edx, msr;
>>> +    struct pvclock_vcpu_time_info time;
>>> +    u32 addr = (u32)(&time);
>>> +    u64 khz;
>>> +
>>> +    /* check presence and figure msr number */
>>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
>>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
>>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
>>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
>>> +        msr = MSR_KVM_SYSTEM_TIME;
>>> +    } else {
>>> +        return 0;
>>> +    }
>>> +
>>> +    /* ask kvm hypervisor to fill struct */
>>> +    memset(&time, 0, sizeof(time));
>>> +    wrmsr(msr, addr | 1);
>> 
>> How can this work?
> 
> It did in my testing, although maybe by pure luck ...
> 
>> There is a 64-byte alignment requirement.
> 
> 64 bytes?  Sure?  The whole struct is only 32 bytes in size ...

er, the documentation says 4 bytes (so stack alignment works).  I
distinctly remember having a large alignment requirement so we don't
cross a page or slot boundary... something's wrong here.

> 
> Easily fixable though, just need to grab some memory with memalign
> instead of using the stack.

> 
>>> +    wrmsr(msr, 0);
>>> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
>>> +        return 0;
>>> +
>>> +    /* go figure tsc frequency */
>>> +    khz = pvclock_tsc_khz(&time);
>>> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
>>> +            msr, (u32)khz / 1000);
>>> +    return khz;
>> 
>> That's a meaningless number.  You can be migrated to a cpu or a machine
>> with very different tsc.
> 
>> You want accurate time on kvm, don't use the tsc.
> 
> seabios uses the tsc for timeout calculations only, so it doesn't need
> to be 100% accurate.  The order of magnitude should be correct though.
> The Linux kernel uses the value for delay loops too, so using it for the
> given purpose can't be *that* horrible after all ...
> 
> It is certainly an improvement over the current code which tries to
> calibrate the tsc and gets totally broken results in case the busy host
> happens to schedule the guest in the middle of calibration.
> 
> So what do you suggest?  The options I see are:
> 
>   (1) Use this patch (with alignment issue fixed of course).
>   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
>   (3) SeaBIOS can fallback to the PIT for timing on machines which
>       have no TSC.  We could do that too in case we detect kvm ...

What sort of timeouts are these?  If seconds, maybe the rtc would be best.
Avi Kivity Aug. 9, 2012, 2:05 p.m. UTC | #5
On 08/09/2012 05:01 PM, Avi Kivity wrote:
> On 08/09/2012 04:57 PM, Gerd Hoffmann wrote:
>>   Hi,
>> 
>>>> +u64 kvm_tsc_khz(void)
>>>> +{
>>>> +    u32 eax, ebx, ecx, edx, msr;
>>>> +    struct pvclock_vcpu_time_info time;
>>>> +    u32 addr = (u32)(&time);
>>>> +    u64 khz;
>>>> +
>>>> +    /* check presence and figure msr number */
>>>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
>>>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
>>>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
>>>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
>>>> +        msr = MSR_KVM_SYSTEM_TIME;
>>>> +    } else {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* ask kvm hypervisor to fill struct */
>>>> +    memset(&time, 0, sizeof(time));
>>>> +    wrmsr(msr, addr | 1);
>>> 
>>> How can this work?
>> 
>> It did in my testing, although maybe by pure luck ...
>> 
>>> There is a 64-byte alignment requirement.
>> 
>> 64 bytes?  Sure?  The whole struct is only 32 bytes in size ...
> 
> er, the documentation says 4 bytes (so stack alignment works).  I
> distinctly remember having a large alignment requirement so we don't
> cross a page or slot boundary... something's wrong here.

	case MSR_KVM_SYSTEM_TIME: {
		kvmclock_reset(vcpu);

		vcpu->arch.time = data;
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

		/* we verify if the enable bit is set... */
		if (!(data & 1))
			break;

		/* ...but clean it before doing the actual write */
		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);

		vcpu->arch.time_page =
				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);

		if (is_error_page(vcpu->arch.time_page))
			vcpu->arch.time_page = NULL;

		break;

So your tests worked by pure luck, but the bug is in kvm.  We need to
grab two pages here.
Gerd Hoffmann Aug. 9, 2012, 2:12 p.m. UTC | #6
Hi,

>> er, the documentation says 4 bytes (so stack alignment works).  I
>> distinctly remember having a large alignment requirement so we don't
>> cross a page or slot boundary... something's wrong here.
> 
> 	case MSR_KVM_SYSTEM_TIME: {
[ ... ]

> So your tests worked by pure luck, but the bug is in kvm.  We need to
> grab two pages here.

Ok, so better use memalign(32,32) to make sure the struct doesn't cross
a page border ...

cheers,
  Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 9, 2012, 2:17 p.m. UTC | #7
On 08/09/2012 05:12 PM, Gerd Hoffmann wrote:
>   Hi,
> 
>>> er, the documentation says 4 bytes (so stack alignment works).  I
>>> distinctly remember having a large alignment requirement so we don't
>>> cross a page or slot boundary... something's wrong here.
>> 
>> 	case MSR_KVM_SYSTEM_TIME: {
> [ ... ]
> 
>> So your tests worked by pure luck, but the bug is in kvm.  We need to
>> grab two pages here.
> 
> Ok, so better use memalign(32,32) to make sure the struct doesn't cross
> a page border ...

No, we need to fix kvm, no need to complicate the guest for that.
Gerd Hoffmann Aug. 9, 2012, 2:18 p.m. UTC | #8
Hi,

>> So what do you suggest?  The options I see are:
>>
>>   (1) Use this patch (with alignment issue fixed of course).
>>   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
>>   (3) SeaBIOS can fallback to the PIT for timing on machines which
>>       have no TSC.  We could do that too in case we detect kvm ...
> 
> What sort of timeouts are these?  If seconds, maybe the rtc would be best.

All sorts of timeouts, from a few milliseconds to seconds.

The problematic ones are the longer timeouts, which wait for I/O stuff
like disk reads to complete.  The stuff with smaller timeouts (like waiting
for the AHCI link to become ready) tends to finish instantly in kvm.

cheers,
  Gerd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 9, 2012, 2:20 p.m. UTC | #9
On 08/09/2012 05:18 PM, Gerd Hoffmann wrote:
>   Hi,
> 
>>> So what do you suggest?  The options I see are:
>>>
>>>   (1) Use this patch (with alignment issue fixed of course).
>>>   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
>>>   (3) SeaBIOS can fallback to the PIT for timing on machines which
>>>       have no TSC.  We could do that too in case we detect kvm ...
>> 
>> What sort of timeouts are these?  If seconds, maybe the rtc would be best.
> 
> All sorts of timeouts, from a few miliseconds to seconds.
> 
> The problematic ones are the longer timeouts, which wait for I/O stuff
> like disk reads complete.  The stuff with smaller timeouts (like waiting
> for AHCI link become ready) tend to finish instantly in kvm.

That's not guaranteed.  The AHCI adapter might be real hardware.  Or the
emulation may change.

What's wrong with having a full kvmclock implementation?  Instead of
issuing rdtsc call a function pointer.
Marcelo Tosatti Aug. 9, 2012, 6:59 p.m. UTC | #10
On Thu, Aug 09, 2012 at 03:53:24PM +0300, Avi Kivity wrote:
> On 08/09/2012 02:57 PM, Gerd Hoffmann wrote:
> > Use kvmclock for tsc calibration when running on kvm.  Without this the
> > tsc frequency calibrated by seabios can be *way* off in case the virtual
> > machine is booted on a loaded host.  I've seen seabios calibrating 27
> > instead of ca. 2800 MHz, resulting in timeouts being to short by factor
> > 100.  Which in turn leads to disk I/O errors due to timeouts, especially
> > as I/O requests tend to take a bit longer than usual on a loaded box ...
> 
> > +
> > +struct pvclock_vcpu_time_info {
> > +	u32   version;
> > +	u32   pad0;
> > +	u64   tsc_timestamp;
> > +	u64   system_time;
> > +	u32   tsc_to_system_mul;
> > +	s8    tsc_shift;
> > +	u8    flags;
> > +	u8    pad[2];
> > +} PACKED;
> > +
> > +
> > +u64 kvm_tsc_khz(void)
> > +{
> > +    u32 eax, ebx, ecx, edx, msr;
> > +    struct pvclock_vcpu_time_info time;
> > +    u32 addr = (u32)(&time);
> > +    u64 khz;
> > +
> > +    /* check presence and figure msr number */
> > +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
> > +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
> > +        msr = MSR_KVM_SYSTEM_TIME_NEW;
> > +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
> > +        msr = MSR_KVM_SYSTEM_TIME;
> > +    } else {
> > +        return 0;
> > +    }
> > +
> > +    /* ask kvm hypervisor to fill struct */
> > +    memset(&time, 0, sizeof(time));
> > +    wrmsr(msr, addr | 1);
> 
> How can this work?  There is a 64-byte alignment requirement.
> 
> > +    wrmsr(msr, 0);
> > +    if (time.version < 2 || time.tsc_to_system_mul == 0)
> > +        return 0;
> > +
> > +    /* go figure tsc frequency */
> > +    khz = pvclock_tsc_khz(&time);
> > +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
> > +            msr, (u32)khz / 1000);
> > +    return khz;
> 
> That's a meaningless number.  You can be migrated to a cpu or a machine
> with very different tsc.

That's why hardware tsc frequency scaling exists, along with the software
equivalent for that on kvm.

> You want accurate time on kvm, don't use the tsc.
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Aug. 9, 2012, 7:02 p.m. UTC | #11
On Thu, Aug 09, 2012 at 05:20:11PM +0300, Avi Kivity wrote:
> On 08/09/2012 05:18 PM, Gerd Hoffmann wrote:
> >   Hi,
> > 
> >>> So what do you suggest?  The options I see are:
> >>>
> >>>   (1) Use this patch (with alignment issue fixed of course).
> >>>   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
> >>>   (3) SeaBIOS can fallback to the PIT for timing on machines which
> >>>       have no TSC.  We could do that too in case we detect kvm ...
> >> 
> >> What sort of timeouts are these?  If seconds, maybe the rtc would be best.
> > 
> > All sorts of timeouts, from a few miliseconds to seconds.
> > 
> > The problematic ones are the longer timeouts, which wait for I/O stuff
> > like disk reads complete.  The stuff with smaller timeouts (like waiting
> > for AHCI link become ready) tend to finish instantly in kvm.
> 
> That's not guaranteed.  The AHCI adapter might be real hardware.  Or the
> emulation may change.
> 
> What's wrong with having a full kvmclock implementation?  Instead of
> issuing rdtsc call a function pointer.

It's not necessary (someone is going to maintain the kvmclock frequency
retrieval, for which the patch is already here, versus maintenance of a
full kvmclock).

Frequency scaling (or the software equivalent: TSC trapping) is
required for other reasons anyway.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Aug. 9, 2012, 7:09 p.m. UTC | #12
On Thu, Aug 09, 2012 at 05:01:34PM +0300, Avi Kivity wrote:
> On 08/09/2012 04:57 PM, Gerd Hoffmann wrote:
> >   Hi,
> > 
> >>> +u64 kvm_tsc_khz(void)
> >>> +{
> >>> +    u32 eax, ebx, ecx, edx, msr;
> >>> +    struct pvclock_vcpu_time_info time;
> >>> +    u32 addr = (u32)(&time);
> >>> +    u64 khz;
> >>> +
> >>> +    /* check presence and figure msr number */
> >>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
> >>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
> >>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
> >>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
> >>> +        msr = MSR_KVM_SYSTEM_TIME;
> >>> +    } else {
> >>> +        return 0;
> >>> +    }
> >>> +
> >>> +    /* ask kvm hypervisor to fill struct */
> >>> +    memset(&time, 0, sizeof(time));
> >>> +    wrmsr(msr, addr | 1);
> >> 
> >> How can this work?
> > 
> > It did in my testing, although maybe by pure luck ...
> > 
> >> There is a 64-byte alignment requirement.
> > 
> > 64 bytes?  Sure?  The whole struct is only 32 bytes in size ...
> 
> er, the documentation says 4 bytes (so stack alignment works).  I
> distinctly remember having a large alignment requirement so we don't
> cross a page or slot boundary... something's wrong here.
> 
> > 
> > Easily fixable though, just need to grab some memory with memalign
> > instead of using the stack.
> 
> > 
> >>> +    wrmsr(msr, 0);
> >>> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
> >>> +        return 0;
> >>> +
> >>> +    /* go figure tsc frequency */
> >>> +    khz = pvclock_tsc_khz(&time);
> >>> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
> >>> +            msr, (u32)khz / 1000);
> >>> +    return khz;
> >> 
> >> That's a meaningless number.  You can be migrated to a cpu or a machine
> >> with very different tsc.
> > 
> >> You want accurate time on kvm, don't use the tsc.
> > 
> > seabios uses the tsc for timeout calculations only, so it doesn't need
> > to be 100% accurate.  The order of magnitude should be correct though.
> > The Linux kernel uses the value for delay loops too, so using it for the
> > given purpose can't be *that* horrible after all ...
> > 
> > It is certainly an improvement over the current code which tries to
> > calibrate the tsc and gets totally broken results in case the busy host
> > happens to schedule the guest in the middle of calibration.
> > 
> > So what do you suggest?  The options I see are:
> > 
> >   (1) Use this patch (with alignment issue fixed of course).
> >   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
> >   (3) SeaBIOS can fallback to the PIT for timing on machines which
> >       have no TSC.  We could do that too in case we detect kvm ...
> 
> What sort of timeouts are these?  If seconds, maybe the rtc would be best.

I vote for 3 so nobody has to maintain kvmclock code in SeaBIOS and Gerd
can fix the in-kernel PIT issues with GRUB (see Michael's message) while testing.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Aug. 10, 2012, 7:18 a.m. UTC | #13
On Thu, Aug 09, 2012 at 04:09:13PM -0300, Marcelo Tosatti wrote:
> On Thu, Aug 09, 2012 at 05:01:34PM +0300, Avi Kivity wrote:
> > On 08/09/2012 04:57 PM, Gerd Hoffmann wrote:
> > >   Hi,
> > > 
> > >>> +u64 kvm_tsc_khz(void)
> > >>> +{
> > >>> +    u32 eax, ebx, ecx, edx, msr;
> > >>> +    struct pvclock_vcpu_time_info time;
> > >>> +    u32 addr = (u32)(&time);
> > >>> +    u64 khz;
> > >>> +
> > >>> +    /* check presence and figure msr number */
> > >>> +    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
> > >>> +    if (eax & KVM_FEATURE_CLOCKSOURCE2) {
> > >>> +        msr = MSR_KVM_SYSTEM_TIME_NEW;
> > >>> +    } else if (eax & KVM_FEATURE_CLOCKSOURCE) {
> > >>> +        msr = MSR_KVM_SYSTEM_TIME;
> > >>> +    } else {
> > >>> +        return 0;
> > >>> +    }
> > >>> +
> > >>> +    /* ask kvm hypervisor to fill struct */
> > >>> +    memset(&time, 0, sizeof(time));
> > >>> +    wrmsr(msr, addr | 1);
> > >> 
> > >> How can this work?
> > > 
> > > It did in my testing, although maybe by pure luck ...
> > > 
> > >> There is a 64-byte alignment requirement.
> > > 
> > > 64 bytes?  Sure?  The whole struct is only 32 bytes in size ...
> > 
> > er, the documentation says 4 bytes (so stack alignment works).  I
> > distinctly remember having a large alignment requirement so we don't
> > cross a page or slot boundary... something's wrong here.
> > 
> > > 
> > > Easily fixable though, just need to grab some memory with memalign
> > > instead of using the stack.
> > 
> > > 
> > >>> +    wrmsr(msr, 0);
> > >>> +    if (time.version < 2 || time.tsc_to_system_mul == 0)
> > >>> +        return 0;
> > >>> +
> > >>> +    /* go figure tsc frequency */
> > >>> +    khz = pvclock_tsc_khz(&time);
> > >>> +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
> > >>> +            msr, (u32)khz / 1000);
> > >>> +    return khz;
> > >> 
> > >> That's a meaningless number.  You can be migrated to a cpu or a machine
> > >> with very different tsc.
> > > 
> > >> You want accurate time on kvm, don't use the tsc.
> > > 
> > > seabios uses the tsc for timeout calculations only, so it doesn't need
> > > to be 100% accurate.  The order of magnitude should be correct though.
> > > The Linux kernel uses the value for delay loops too, so using it for the
> > > given purpose can't be *that* horrible after all ...
> > > 
> > > It is certainly an improvement over the current code which tries to
> > > calibrate the tsc and gets totally broken results in case the busy host
> > > happens to schedule the guest in the middle of calibration.
> > > 
> > > So what do you suggest?  The options I see are:
> > > 
> > >   (1) Use this patch (with alignment issue fixed of course).
> > >   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
> > >   (3) SeaBIOS can fallback to the PIT for timing on machines which
> > >       have no TSC.  We could do that too in case we detect kvm ...
> > 
> > What sort of timeouts are these?  If seconds, maybe the rtc would be best.
> 
> I vote for 3 so nobody has to maintain kvmclock code in SeaBIOS and Gerd
That or pm timer.

> can fix the in-kernel PIT issues with GRUB (see Michaels message) while testing.
> 
What message exactly?

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov Aug. 10, 2012, 7:30 a.m. UTC | #14
On Fri, Aug 10, 2012 at 10:18:00AM +0300, Gleb Natapov wrote:
> > can fix the in-kernel PIT issues with GRUB (see Michaels message) while testing.
> > 
> What message exactly?
> 
found it.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity Aug. 12, 2012, 9:01 a.m. UTC | #15
On 08/09/2012 09:59 PM, Marcelo Tosatti wrote:
>> 
>> > +    wrmsr(msr, 0);
>> > +    if (time.version < 2 || time.tsc_to_system_mul == 0)
>> > +        return 0;
>> > +
>> > +    /* go figure tsc frequency */
>> > +    khz = pvclock_tsc_khz(&time);
>> > +    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
>> > +            msr, (u32)khz / 1000);
>> > +    return khz;
>> 
>> That's a meaningless number.  You can be migrated to a cpu or a machine
>> with very different tsc.
> 
> Thats why there exists hardware tsc frequency scaling and the software
> equivalent for that on kvm.
> 

The hardware is only available on a minority of processors in the field.
 The software equivalent isn't there.
Avi Kivity Aug. 12, 2012, 10:56 a.m. UTC | #16
On 08/09/2012 10:02 PM, Marcelo Tosatti wrote:
> On Thu, Aug 09, 2012 at 05:20:11PM +0300, Avi Kivity wrote:
>> On 08/09/2012 05:18 PM, Gerd Hoffmann wrote:
>> >   Hi,
>> > 
>> >>> So what do you suggest?  The options I see are:
>> >>>
>> >>>   (1) Use this patch (with alignment issue fixed of course).
>> >>>   (2) Do a full kvmclock implementation.  Feels a bit like overkill.
>> >>>   (3) SeaBIOS can fallback to the PIT for timing on machines which
>> >>>       have no TSC.  We could do that too in case we detect kvm ...
>> >> 
>> >> What sort of timeouts are these?  If seconds, maybe the rtc would be best.
>> > 
>> > All sorts of timeouts, from a few miliseconds to seconds.
>> > 
>> > The problematic ones are the longer timeouts, which wait for I/O stuff
>> > like disk reads complete.  The stuff with smaller timeouts (like waiting
>> > for AHCI link become ready) tend to finish instantly in kvm.
>> 
>> That's not guaranteed.  The AHCI adapter might be real hardware.  Or the
>> emulation may change.
>> 
>> What's wrong with having a full kvmclock implementation?  Instead of
>> issuing rdtsc call a function pointer.
> 
> Its not necessary (someone is going to maintain the kvmclock frequency
> retrieve, which patch is already here, versus maintainance of 
> full kvmclock).

The frequency is meaningless.

> 
> Frequency scaling (or the software equivalent: TSC trapping) are
> required for other reasons anyway.

One thing we can do is enable TSC trapping, then disable it if the guest
activates kvmclock.  That gives us accurate time either way.
diff mbox

Patch

diff --git a/src/clock.c b/src/clock.c
index 69e9f17..5883b1a 100644
--- a/src/clock.c
+++ b/src/clock.c
@@ -13,6 +13,7 @@ 
 #include "bregs.h" // struct bregs
 #include "biosvar.h" // GET_GLOBAL
 #include "usb-hid.h" // usb_check_event
+#include "paravirt.h" // kvm clock
 
 // RTC register flags
 #define RTC_A_UIP 0x80
@@ -80,6 +81,14 @@  calibrate_tsc(void)
         return;
     }
 
+    if (kvm_para_available()) {
+        u32 khz = kvm_tsc_khz();
+        if (khz != 0) {
+            SET_GLOBAL(cpu_khz, khz);
+            return;
+        }
+    }
+
     // Setup "timer2"
     u8 orig = inb(PORT_PS2_CTRLB);
     outb((orig & ~PPCB_SPKR) | PPCB_T2GATE, PORT_PS2_CTRLB);
diff --git a/src/paravirt.c b/src/paravirt.c
index 2a98d53..942ce11 100644
--- a/src/paravirt.c
+++ b/src/paravirt.c
@@ -12,6 +12,7 @@ 
 #include "ioport.h" // outw
 #include "paravirt.h" // qemu_cfg_port_probe
 #include "smbios.h" // struct smbios_structure_header
+#include "biosvar.h" // GET_GLOBAL
 
 int qemu_cfg_present;
 
@@ -346,3 +347,92 @@  void qemu_cfg_romfile_setup(void)
         dprintf(3, "Found fw_cfg file: %s (size=%d)\n", file->name, file->size);
     }
 }
+
+#define KVM_CPUID_SIGNATURE       0x40000000 // cpuid leaf; not referenced by the code added in this patch
+#define KVM_CPUID_FEATURES        0x40000001 // cpuid leaf queried by kvm_tsc_khz()
+#define KVM_FEATURE_CLOCKSOURCE            0 // feature bit number in eax (per KVM cpuid docs)
+#define KVM_FEATURE_CLOCKSOURCE2           3 // feature bit number in eax (per KVM cpuid docs)
+#define MSR_KVM_SYSTEM_TIME             0x12 // msr selected for KVM_FEATURE_CLOCKSOURCE
+#define MSR_KVM_SYSTEM_TIME_NEW   0x4b564d01 // msr selected for KVM_FEATURE_CLOCKSOURCE2
+// Time info block the kvm hypervisor is asked to fill in (see kvm_tsc_khz).
+struct pvclock_vcpu_time_info {
+	u32   version;            // kvm_tsc_khz() rejects values below 2
+	u32   pad0;
+	u64   tsc_timestamp;      // not read by the code in this patch
+	u64   system_time;        // not read by the code in this patch
+	u32   tsc_to_system_mul;  // divisor in pvclock_tsc_khz(); 0 is rejected by the caller
+	s8    tsc_shift;          // signed shift count applied in pvclock_tsc_khz()
+	u8    flags;              // not read by the code in this patch
+	u8    pad[2];
+} PACKED;
+
+/*
+ * do_div(n, base) is NOT a C function. It wants to return two values
+ * (the quotient and the remainder), but since that doesn't work very
+ * well in C, what it does is:
+ *
+ * - modifies the 64-bit dividend n _in_place_ (n becomes the quotient)
+ * - returns the 32-bit remainder as the value of the expression
+ *
+ * This ends up being the most efficient "calling convention" on x86.
+ * base is copied to an unsigned long and used as the divl operand.
+ * NOTE(review): the "A" asm constraints presumably assume 32-bit x86.
+ */
+#define do_div(n, base)                                                 \
+    ({                                                                  \
+        unsigned long __upper, __low, __high, __mod, __base;            \
+        __base = (base);                                                \
+        asm("" : "=a" (__low), "=d" (__high) : "A" (n));                \
+        __upper = __high;                                               \
+        if (__high) {                                                   \
+            __upper = __high % (__base);                                \
+            __high = __high / (__base);                                 \
+        }                                                               \
+        asm("divl %2" : "=a" (__low), "=d" (__mod)                      \
+            : "rm" (__base), "0" (__low), "1" (__upper));               \
+        asm("" : "=A" (n) : "a" (__low), "d" (__high));                 \
+        __mod;                                                          \
+    })
+// Derive the TSC rate in kHz from the pvclock scaling factors (mul must be non-zero).
+static u64 pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+    s8 shift = src->tsc_shift;
+    u64 khz = 1000000ULL << 32;
+
+    do_div(khz, src->tsc_to_system_mul);
+    if (shift >= 0)
+        return khz >> shift;
+    // negative shift count: scale up by its magnitude instead
+    return khz << -shift;
+}
+// Return the TSC frequency in kHz as reported by kvmclock, or 0 if kvmclock is unavailable.
+u64 kvm_tsc_khz(void)
+{
+    u32 eax, ebx, ecx, edx, msr;
+    struct pvclock_vcpu_time_info time;
+    u32 addr = (u32)(&time);
+    u64 khz;
+
+    /* check presence and figure msr number; KVM_FEATURE_* are bit numbers, not masks */
+    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+    if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
+        msr = MSR_KVM_SYSTEM_TIME_NEW;
+    } else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) {
+        msr = MSR_KVM_SYSTEM_TIME;
+    } else {
+        return 0;
+    }
+
+    /* ask kvm hypervisor to fill struct (bit 0 = enable), then write 0 to switch it off */
+    memset(&time, 0, sizeof(time));
+    wrmsr(msr, addr | 1);
+    wrmsr(msr, 0);
+    if (time.version < 2 || time.tsc_to_system_mul == 0)
+        return 0;
+
+    /* go figure tsc frequency; a zero multiplier was rejected above so the divide is safe */
+    khz = pvclock_tsc_khz(&time);
+    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
+            msr, (u32)khz / 1000);
+    return khz;
+}
diff --git a/src/paravirt.h b/src/paravirt.h
index a284c41..eedfcc3 100644
--- a/src/paravirt.h
+++ b/src/paravirt.h
@@ -27,6 +27,7 @@  static inline int kvm_para_available(void)
 
     return 0;
 }
+extern u64 kvm_tsc_khz(void);
 
 #define QEMU_CFG_SIGNATURE              0x00
 #define QEMU_CFG_ID                     0x01