Message ID | 1442591670-5216-2-git-send-email-rkrcmar@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Sep 18, 2015 at 05:54:29PM +0200, Radim Kr?má? wrote: > Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. > The purpose of that flags was to start counting system time from 0 when > the KVM clock has been initialized. > We can achieve the same by selecting one read as the initial point. > > A simple subtraction will work unless the KVM clock count overflows > earlier (has smaller width) than scheduler's cycle count. We should be > safe till x86_128. > > Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, > setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might > regress on older ones. > > I presume we don't need to change kvm_clock_read instead of introducing > kvm_sched_clock_read. A problem could arise in case sched_clock is > expected to return the same value as get_cycles, but we should have > merged those clocks in that case. > > Signed-off-by: Radim Kr?má? <rkrcmar@redhat.com> > --- > arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- > 1 file changed, 35 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index 2c7aafa70702..ef5b3d2cecce 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -32,6 +32,7 @@ > static int kvmclock = 1; > static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; > static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; > +static cycle_t kvm_sched_clock_offset; > > static int parse_no_kvmclock(char *arg) > { > @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) > return kvm_clock_read(); > } > > +static cycle_t kvm_sched_clock_read(void) > +{ > + return kvm_clock_read() - kvm_sched_clock_offset; > +} > + > +static inline void kvm_sched_clock_init(bool stable) > +{ > + if (!stable) { > + pv_time_ops.sched_clock = kvm_clock_read; > + return; > + } > + > + kvm_sched_clock_offset = kvm_clock_read(); > + pv_time_ops.sched_clock = kvm_sched_clock_read; > + set_sched_clock_stable(); > + > + printk("kvm-clock: using sched offset of %llu cycles\n", > + kvm_sched_clock_offset); > + > + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); > +} > + > /* > * If we don't do that, there is the possibility that the guest > * will calibrate under heavy load - thus, getting a lower lpj - > @@ -248,7 +272,17 @@ void __init kvmclock_init(void) > memblock_free(mem, size); > return; > } > - pv_time_ops.sched_clock = kvm_clock_read; > + > + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) > + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); > + > + cpu = get_cpu(); > + vcpu_time = &hv_clock[cpu].pvti; > + flags = pvclock_read_flags(vcpu_time); > + > + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); > + put_cpu(); > + > x86_platform.calibrate_tsc = kvm_get_tsc_khz; > x86_platform.get_wallclock = kvm_get_wallclock; > x86_platform.set_wallclock = kvm_set_wallclock; > @@ -265,16 +299,6 @@ void __init kvmclock_init(void) > kvm_get_preset_lpj(); > clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); > pv_info.name = "KVM"; > - > - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) > - pvclock_set_flags(~0); > - > - cpu = get_cpu(); > - vcpu_time = &hv_clock[cpu].pvti; > - flags = pvclock_read_flags(vcpu_time); > - if (flags & PVCLOCK_COUNTS_FROM_ZERO) > - set_sched_clock_stable(); > - put_cpu(); > } > > int __init kvm_setup_vsyscall_timeinfo(void) > -- > 2.5.2 ACK -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 18/09/2015 17:54, Radim Kr?má? wrote: > + kvm_sched_clock_offset = kvm_clock_read(); > + pv_time_ops.sched_clock = kvm_sched_clock_read; > + set_sched_clock_stable(); > + > + printk("kvm-clock: using sched offset of %llu cycles\n", Ok to change this to KERN_DEBUG or KERN_INFO? Paolo > + kvm_sched_clock_offset); > + -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..ef5b3d2cecce 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk("kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void)
Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flags was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Kr?má? <rkrcmar@redhat.com> --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-)