Message ID: 1560332419-17195-1-git-send-email-wanpengli@tencent.com
State: New, archived
Series: [v3,1/2] KVM: LAPIC: Optimize timer latency consider world switch time
On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
>
> The advance lapic timer tries to hide the hypervisor overhead between
> the host emulated timer firing and the guest becoming aware that the
> timer has fired. However, even after further optimizations,
> kvm-unit-tests/tscdeadline_latency still observes ~1000 cycles of
> latency, because we lose the time between the end of
> wait_lapic_expire() and the guest seeing the fired timer. There is
> code between the end of wait_lapic_expire() and the world switch, and
> the world switch itself also has overhead. Treating the deadline as
> met once guest_tsc equals the target deadline inside
> wait_lapic_expire() is therefore too late; the guest still sees the
> latency between the end of wait_lapic_expire() and the completion of
> vmentry. This patch takes that time into consideration.
>
> The vmentry+vmexit time measured by kvm-unit-tests/vmexit.flat is
> 1800 cycles on my 2.5GHz Skylake server. The vmentry_advance_ns
> module parameter defaults to 300ns; it can be tuned/reworked in the
> future. This patch reduces average cyclictest latency from 3us to 2us
> on the Skylake server. (guest w/ nohz=off, idle=poll, host w/
> preemption_timer=N; in my testing, cyclictest latency is not very
> sensitive to this optimization when preemption_timer=Y.)
>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> v2 -> v3:
>  * read-only module parameter
>  * get_vmentry_advance_cycles() not inline
> v1 -> v2:
>  * rename get_vmentry_advance_delta to get_vmentry_advance_cycles
>  * cache vmentry_advance_cycles by setting param bit 0
>  * add param max limit
>
>  arch/x86/kvm/lapic.c   | 33 ++++++++++++++++++++++++++++++---
>  arch/x86/kvm/lapic.h   |  3 +++
>  arch/x86/kvm/vmx/vmx.c |  2 +-
>  arch/x86/kvm/x86.c     |  8 ++++++++
>  arch/x86/kvm/x86.h     |  2 ++
>  5 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index fcf42a3..c6d76f9 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1531,6 +1531,33 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
>  	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
>  }
>
> +u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> +{
> +	u64 cycles;
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
> +	do_div(cycles, 1000000);
> +
> +	apic->lapic_timer.vmentry_advance_cycles = cycles;
> +
> +	return cycles;
> +}
> +
> +u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	if (unlikely(!vmentry_advance_ns))
> +		return 0;
> +
> +	if (likely(apic->lapic_timer.vmentry_advance_cycles))
> +		return apic->lapic_timer.vmentry_advance_cycles;
> +
> +	return compute_vmentry_advance_cycles(vcpu);

If vmentry_advance_ns is read-only, then we don't need to be able to
compute lapic_timer.vmentry_advance_cycles on demand, e.g. it can be set
during kvm_create_lapic() and recomputed in kvm_set_tsc_khz().

Alternatively, it could be handled purely in kvm_set_tsc_khz() if the call
to kvm_create_lapic() were moved before kvm_set_tsc_khz().

[...]

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 553c292..4b983bb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
>  static int __read_mostly lapic_timer_advance_ns = -1;
>  module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
>
> +/*
> + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> + */
> +u32 __read_mostly vmentry_advance_ns = 300;

Enabling this by default makes me nervous, e.g. nothing guarantees that
future versions of KVM and/or CPUs will continue to have 300ns of overhead
between wait_lapic_expire() and VM-Enter.

If we want it enabled by default so that it gets tested, the default
value should be extremely conservative, e.g. set the default to a small
percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.

[...]
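For reference, the ns-to-cycles conversion performed by
compute_vmentry_advance_cycles() in the patch above is plain scaling:
cycles = ns * virtual_tsc_khz / 10^6 (the kernel uses do_div() because a
plain 64-bit division is not available on 32-bit builds). A minimal
user-space sketch, assuming the 2.5 GHz TSC (virtual_tsc_khz == 2500000)
from the commit message:

/*
 * Standalone sketch (not kernel code) of the conversion:
 * tsc_khz * 1000 cycles elapse per second and one second is 10^9 ns,
 * so cycles = ns * tsc_khz / 10^6.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t advance_cycles(uint64_t advance_ns, uint64_t tsc_khz)
{
	return advance_ns * tsc_khz / 1000000;
}

int main(void)
{
	/* 300 ns at virtual_tsc_khz == 2500000 (2.5 GHz) -> 750 cycles */
	printf("%llu cycles\n",
	       (unsigned long long)advance_cycles(300, 2500000));
	return 0;
}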
2019-06-12 08:14-0700, Sean Christopherson:
> On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > @@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
> >  static int __read_mostly lapic_timer_advance_ns = -1;
> >  module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
> >
> > +/*
> > + * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
> > + */
> > +u32 __read_mostly vmentry_advance_ns = 300;
>
> Enabling this by default makes me nervous, e.g. nothing guarantees that
> future versions of KVM and/or CPUs will continue to have 300ns of overhead
> between wait_lapic_expire() and VM-Enter.
>
> If we want it enabled by default so that it gets tested, the default
> value should be extremely conservative, e.g. set the default to a small
> percentage (25%?) of the latency of VM-Enter itself on modern CPUs,
> VM-Enter latency being the min between VMLAUNCH and VMLOAD+VMRUN+VMSAVE.

I share the sentiment. We definitely must not enter the guest before the
deadline has expired and CPUs are approaching 5 GHz (in turbo), so 300 ns
would be too much even today.

I wrote a simple testcase for rough timing and there are 267 cycles
(111 ns @ 2.4 GHz) between doing rdtsc() right after
kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
possible (see the attached kvm-unit-test). That is on a Haswell, where
vmexit.flat reports 2120 cycles for a vmcall.

This would linearly (likely incorrect method in this case) translate to
230 cycles on a machine with 1800 cycles for a vmcall, which is less than
50 ns @ 5 GHz. I wouldn't go above 25 ns for a hard-coded default.

(We could also do a similar measurement when initializing KVM and have a
dynamic default, but I'm thinking it's going to be way too much code for
the benefit.)

---
1: This is how the TSC is read in KVM.

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index da24f1858acc..a7251ac0109b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6449,6 +6449,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	    vcpu->arch.apic->lapic_timer.timer_advance_ns)
 		kvm_wait_lapic_expire(vcpu);
 
+	vcpu->last_seen_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
 	/*
 	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
 	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6200d5a51f13..5e0ce8ca31e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7201,6 +7201,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_SEND_IPI:
 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 		break;
+	case KVM_HC_LAST_SEEN_TSC:
+		ret = vcpu->last_seen_tsc;
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index abafddb9fe2c..7f70fe7a28b1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -323,6 +323,8 @@ struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+
+	u64 last_seen_tsc;
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e5..dfbc6e9ad7a1 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -28,6 +28,7 @@
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
 #define KVM_HC_SEND_IPI		10
+#define KVM_HC_LAST_SEEN_TSC	11
 
 /*
  * hypercalls use architecture specific
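To make the extrapolation above concrete, here is the arithmetic as a
back-of-the-envelope sketch, using only the numbers quoted in this
message (the linear scaling by vmcall cost is the "likely incorrect
method" flagged above):

#include <stdio.h>

int main(void)
{
	const double haswell_entry  = 267;	/* measured entry overhead, cycles */
	const double haswell_vmcall = 2120;	/* vmexit.flat vmcall cost, Haswell */
	const double skylake_vmcall = 1800;	/* vmcall cost from the commit message */

	/* Linear scaling by vmcall cost: ~230 cycles */
	double scaled = haswell_entry * skylake_vmcall / haswell_vmcall;

	/* At 5 GHz, 5 cycles elapse per ns, so ns = cycles / 5: under 50 ns */
	printf("~%.0f cycles, ~%.0f ns @ 5 GHz\n", scaled, scaled / 5.0);
	return 0;
}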
2019-06-12 21:22+0200, Radim Krčmář:
> 2019-06-12 08:14-0700, Sean Christopherson:
> > On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
[...]
> I wrote a simple testcase for rough timing and there are 267 cycles
> (111 ns @ 2.4 GHz) between doing rdtsc() right after
> kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
> possible (see the attached kvm-unit-test).

I forgot to attach it, pasting here as a patch for kvm-unit-tests.

---
diff --git a/x86/Makefile.common b/x86/Makefile.common
index e612dbe..ceed648 100644
--- a/x86/Makefile.common
+++ b/x86/Makefile.common
@@ -58,7 +58,7 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
 			$(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat \
 			$(TEST_DIR)/hyperv_synic.flat $(TEST_DIR)/hyperv_stimer.flat \
 			$(TEST_DIR)/hyperv_connections.flat \
-			$(TEST_DIR)/umip.flat
+			$(TEST_DIR)/umip.flat $(TEST_DIR)/vmentry_latency.flat
 
 ifdef API
 tests-api = api/api-sample api/dirty-log api/dirty-log-perf
diff --git a/x86/vmentry_latency.c b/x86/vmentry_latency.c
new file mode 100644
index 0000000..3859f09
--- /dev/null
+++ b/x86/vmentry_latency.c
@@ -0,0 +1,45 @@
+#include "x86/vm.h"
+
+static u64 get_last_hypervisor_tsc_delta(void)
+{
+	u64 a = 0, b, c, d;
+	u64 tsc;
+
+	/*
+	 * The first vmcall is there to force a vm exit just before measuring.
+	 */
+	asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
+
+	tsc = rdtsc();
+
+	/*
+	 * The second hypercall recovers the value that was stored when vm
+	 * entering to execute the rdtsc()
+	 */
+	a = 11;
+	asm volatile ("vmcall" : "+a"(a), "=b"(b), "=c"(c), "=d"(d));
+
+	return tsc - a;
+}
+
+static void vmentry_latency(void)
+{
+	unsigned i = 1000000;
+	u64 min = -1;
+
+	while (i--) {
+		u64 latency = get_last_hypervisor_tsc_delta();
+		if (latency < min)
+			min = latency;
+	}
+
+	printf("vm entry latency is %"PRIu64" TSC cycles\n", min);
+}
+
+int main(void)
+{
+	setup_vm();
+	vmentry_latency();
+
+	return 0;
+}
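Note that this test only yields meaningful numbers on a host kernel
carrying the instrumentation patch from [1] above: the second vmcall uses
hypercall number 11 (KVM_HC_LAST_SEEN_TSC), which on an unpatched host
falls through to the default case and returns -KVM_ENOSYS, so the
reported delta would be garbage.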
On Wed, 12 Jun 2019 at 23:14, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
> > From: Wanpeng Li <wanpengli@tencent.com>
[...]
> > +u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > +	if (unlikely(!vmentry_advance_ns))
> > +		return 0;
> > +
> > +	if (likely(apic->lapic_timer.vmentry_advance_cycles))
> > +		return apic->lapic_timer.vmentry_advance_cycles;
> > +
> > +	return compute_vmentry_advance_cycles(vcpu);
>
> If vmentry_advance_ns is read-only, then we don't need to be able to
> compute lapic_timer.vmentry_advance_cycles on demand, e.g. it can be set
> during kvm_create_lapic() and recomputed in kvm_set_tsc_khz().
>
> Alternatively, it could be handled purely in kvm_set_tsc_khz() if the call
> to kvm_create_lapic() were moved before kvm_set_tsc_khz().

Hardcoding 25ns, how about something like this:

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fcf42a3..ed62d6b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1531,6 +1531,19 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+	u64 cycles;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
+	do_div(cycles, 1000000);
+
+	apic->lapic_timer.vmentry_advance_cycles = cycles;
+
+	return cycles;
+}
+
 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1544,7 +1557,8 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) +
+		apic->lapic_timer.vmentry_advance_cycles;
 	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
 
 	if (guest_tsc < tsc_deadline)
@@ -1572,7 +1586,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_save(flags);
 
 	now = ktime_get();
-	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) +
+		apic->lapic_timer.vmentry_advance_cycles;
 
 	ns = (tscdeadline - guest_tsc) * 1000000ULL;
 	do_div(ns, this_tsc_khz);
@@ -2329,7 +2344,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 		apic->lapic_timer.timer_advance_adjust_done = true;
 	}
-
+	apic->lapic_timer.vmentry_advance_cycles = 0;
 
 	/*
 	 * APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f974a3d..751cb26 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -33,6 +33,7 @@ struct kvm_timer {
 	u64 expired_tscdeadline;
 	u32 timer_advance_ns;
 	s64 advance_expire_delta;
+	u64 vmentry_advance_cycles;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool hv_timer_in_use;
 	bool timer_advance_adjust_done;
@@ -221,6 +222,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0861c71..b572359 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7041,7 +7041,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 
 	vmx = to_vmx(vcpu);
 	tscl = rdtsc();
-	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	guest_tscl = kvm_read_l1_tsc(vcpu, tscl) +
+		vcpu->arch.apic->lapic_timer.vmentry_advance_cycles;
 	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
 						    ktimer->timer_advance_ns);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 553c292..f6e1366 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 static int __read_mostly lapic_timer_advance_ns = -1;
 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 
+/*
+ * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
+ */
+u32 __read_mostly vmentry_advance_ns = 25;
+module_param(vmentry_advance_ns, uint, S_IRUGO);
+
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
 
@@ -1592,6 +1598,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
 			   &vcpu->arch.virtual_tsc_shift,
 			   &vcpu->arch.virtual_tsc_mult);
+	if (user_tsc_khz != vcpu->arch.virtual_tsc_khz)
+		compute_vmentry_advance_cycles(vcpu);
 	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 
 	/*
@@ -9127,8 +9135,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
-	kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
 		goto fail_free_pio_data;
@@ -9141,6 +9147,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	} else
 		static_key_slow_inc(&kvm_no_apic_vcpu);
 
+	kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
 	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
 				       GFP_KERNEL_ACCOUNT);
 	if (!vcpu->arch.mce_banks) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0..2174355 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,8 @@ extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
+extern unsigned int vmentry_advance_ns;
+
 extern bool enable_vmware_backdoor;
 
 extern struct static_key kvm_no_apic_vcpu;
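The net effect of this version on the busy-wait can be modelled in a few
lines of user-space C. This is a sketch under simplifying assumptions
(guest and host TSC treated as identical; the capping KVM applies via
timer_advance_ns is ignored): biasing the TSC read by
vmentry_advance_cycles simply makes the spin loop give up that many
cycles before the deadline, leaving VM-entry itself to cover the gap.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/*
 * Spin until vmentry_advance_cycles before the deadline, mimicking the
 * biased guest_tsc comparison in kvm_wait_lapic_expire() above.
 */
static void wait_until_deadline(uint64_t tsc_deadline,
				uint64_t vmentry_advance_cycles)
{
	while (__rdtsc() + vmentry_advance_cycles < tsc_deadline)
		;
}

int main(void)
{
	uint64_t deadline = __rdtsc() + 250000;	/* ~100 us at 2.5 GHz */

	/* 25 ns at 2.5 GHz is 62 cycles (25 * 2500000 / 10^6, truncated) */
	wait_until_deadline(deadline, 62);
	printf("stopped ~%lld cycles before the deadline\n",
	       (long long)(deadline - __rdtsc()));
	return 0;
}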
On Thu, 13 Jun 2019 at 03:27, Radim Krčmář <rkrcmar@redhat.com> wrote:
>
> 2019-06-12 21:22+0200, Radim Krčmář:
> > 2019-06-12 08:14-0700, Sean Christopherson:
> > > On Wed, Jun 12, 2019 at 05:40:18PM +0800, Wanpeng Li wrote:
[...]
> > I wrote a simple testcase for rough timing and there are 267 cycles
> > (111 ns @ 2.4 GHz) between doing rdtsc() right after
> > kvm_wait_lapic_expire() [1] and doing rdtsc() in the guest as soon as
> > possible (see the attached kvm-unit-test).
>
> I forgot to attach it, pasting here as a patch for kvm-unit-tests.

Thanks for this, Radim. :)

Regards,
Wanpeng Li

[...]
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fcf42a3..c6d76f9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1531,6 +1531,33 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+	u64 cycles;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	cycles = vmentry_advance_ns * vcpu->arch.virtual_tsc_khz;
+	do_div(cycles, 1000000);
+
+	apic->lapic_timer.vmentry_advance_cycles = cycles;
+
+	return cycles;
+}
+
+u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (unlikely(!vmentry_advance_ns))
+		return 0;
+
+	if (likely(apic->lapic_timer.vmentry_advance_cycles))
+		return apic->lapic_timer.vmentry_advance_cycles;
+
+	return compute_vmentry_advance_cycles(vcpu);
+}
+EXPORT_SYMBOL_GPL(get_vmentry_advance_cycles);
+
 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1544,7 +1571,7 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);
 	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
 
 	if (guest_tsc < tsc_deadline)
@@ -1572,7 +1599,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_save(flags);
 
 	now = ktime_get();
-	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) + get_vmentry_advance_cycles(vcpu);
 
 	ns = (tscdeadline - guest_tsc) * 1000000ULL;
 	do_div(ns, this_tsc_khz);
@@ -2329,7 +2356,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 		apic->lapic_timer.timer_advance_adjust_done = true;
 	}
-
+	apic->lapic_timer.vmentry_advance_cycles = 0;
 
 	/*
 	 * APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f974a3d..fb32e69 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -33,6 +33,7 @@ struct kvm_timer {
 	u64 expired_tscdeadline;
 	u32 timer_advance_ns;
 	s64 advance_expire_delta;
+	u64 vmentry_advance_cycles;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool hv_timer_in_use;
 	bool timer_advance_adjust_done;
@@ -221,6 +222,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+u64 compute_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
+u64 get_vmentry_advance_cycles(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0861c71..0751a44 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7041,7 +7041,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 
 	vmx = to_vmx(vcpu);
 	tscl = rdtsc();
-	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	guest_tscl = kvm_read_l1_tsc(vcpu, tscl) + get_vmentry_advance_cycles(vcpu);
 	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
 						    ktimer->timer_advance_ns);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 553c292..4b983bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,6 +145,12 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 static int __read_mostly lapic_timer_advance_ns = -1;
 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 
+/*
+ * lapic timer vmentry advance (tscdeadline mode only) in nanoseconds.
+ */
+u32 __read_mostly vmentry_advance_ns = 300;
+module_param(vmentry_advance_ns, uint, S_IRUGO);
+
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
 
@@ -1592,6 +1598,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
 			   &vcpu->arch.virtual_tsc_shift,
 			   &vcpu->arch.virtual_tsc_mult);
+	if (vcpu->arch.apic && user_tsc_khz != vcpu->arch.virtual_tsc_khz)
+		compute_vmentry_advance_cycles(vcpu);
 	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 
 	/*
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0..2174355 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,8 @@ extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
+extern unsigned int vmentry_advance_ns;
+
 extern bool enable_vmware_backdoor;
 
 extern struct static_key kvm_no_apic_vcpu;