| Message ID | 20141210205904.415174860@redhat.com (mailing list archive) |
| --- | --- |
| State | New, archived |
On 10/12/2014 21:57, Marcelo Tosatti wrote: > For the hrtimer which emulates the tscdeadline timer in the guest, > add an option to advance expiration, and busy spin on VM-entry waiting > for the actual expiration time to elapse. > > This allows achieving low latencies in cyclictest (or any scenario > which requires strict timing regarding timer expiration). > > Reduces cyclictest avg latency by 50%. > > Note: this option requires tuning to find the appropriate value > for a particular hardware/guest combination. One method is to measure the > average delay between apic_timer_fn and VM-entry. > Another method is to start with 1000ns, and increase the value > in say 500ns increments until avg cyclictest numbers stop decreasing. What values are you using in practice for the parameter? > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > Index: kvm/arch/x86/kvm/lapic.c > =================================================================== > --- kvm.orig/arch/x86/kvm/lapic.c > +++ kvm/arch/x86/kvm/lapic.c > @@ -33,6 +33,7 @@ > #include <asm/page.h> > #include <asm/current.h> > #include <asm/apicdef.h> > +#include <asm/delay.h> > #include <linux/atomic.h> > #include <linux/jump_label.h> > #include "kvm_cache_regs.h" > @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv > { > struct kvm_vcpu *vcpu = apic->vcpu; > wait_queue_head_t *q = &vcpu->wq; > + struct kvm_timer *ktimer = &apic->lapic_timer; > > /* > * Note: KVM_REQ_PENDING_TIMER is implicitly checked in > @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv > > if (waitqueue_active(q)) > wake_up_interruptible(q); > + > + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE) > + ktimer->expired_tscdeadline = ktimer->tscdeadline; > +} > + > +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); > + > + if (kvm_apic_hw_enabled(apic)) { > + int vec = reg & APIC_VECTOR_MASK; > + > + if (kvm_x86_ops->test_posted_interrupt) > + return kvm_x86_ops->test_posted_interrupt(vcpu, vec); > + else { > + if (apic_test_vector(vec, apic->regs + APIC_ISR)) > + return true; > + } One branch here is testing IRR, the other is testing ISR. I think testing ISR is right; on APICv, the above test will cause a busy wait during a higher-priority task (or during an interrupt service routine for the timer itself), just because the timer interrupt was delivered. So, on APICv, if the interrupt is in PIR but it has bits 7:4 <= PPR[7:4], you have a problem. :( There is no APICv hook that lets you get a vmexit when the PPR becomes low enough. > + } > + return false; > +} > + > +void wait_lapic_expire(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + u64 guest_tsc, tsc_deadline; > + > + if (!kvm_vcpu_has_lapic(vcpu)) > + return; > + > + if (!apic_lvtt_tscdeadline(apic)) > + return; This test is wrong, I think. You need to check whether the timer interrupt was a TSC deadline interrupt. Instead, you are checking whether the current mode is TSC-deadline. This can be different if the interrupt could not be delivered immediately after it was received. This is easy to fix: replace the first two tests with "apic->lapic_timer.expired_tscdeadline != 0" and... > + if (!lapic_timer_int_injected(vcpu)) > + return; > + tsc_deadline = apic->lapic_timer.expired_tscdeadline; ... set apic->lapic_timer.expired_tscdeadline to 0 here. But I'm not sure how to solve the above problem with APICv. That's a pity. 
Knowing what values you use in practice for the parameter, would also make it easier to understand the problem. Please report that together with the graphs produced by the unit test you added. Paolo > + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); > + > + while (guest_tsc < tsc_deadline) { > + int delay = min(tsc_deadline - guest_tsc, 1000ULL); > + > + ndelay(delay); > + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); > + } > } > > static void start_apic_timer(struct kvm_lapic *apic) > { > ktime_t now; > + > atomic_set(&apic->lapic_timer.pending, 0); > > if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { > @@ -1137,6 +1186,7 @@ static void start_apic_timer(struct kvm_ > /* lapic timer in tsc deadline mode */ > u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; > u64 ns = 0; > + ktime_t expire; > struct kvm_vcpu *vcpu = apic->vcpu; > unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; > unsigned long flags; > @@ -1151,8 +1201,10 @@ static void start_apic_timer(struct kvm_ > if (likely(tscdeadline > guest_tsc)) { > ns = (tscdeadline - guest_tsc) * 1000000ULL; > do_div(ns, this_tsc_khz); > + expire = ktime_add_ns(now, ns); > + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); > hrtimer_start(&apic->lapic_timer.timer, > - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); > + expire, HRTIMER_MODE_ABS); > } else > apic_timer_expired(apic); > > Index: kvm/arch/x86/kvm/lapic.h > =================================================================== > --- kvm.orig/arch/x86/kvm/lapic.h > +++ kvm/arch/x86/kvm/lapic.h > @@ -14,6 +14,7 @@ struct kvm_timer { > u32 timer_mode; > u32 timer_mode_mask; > u64 tscdeadline; > + u64 expired_tscdeadline; > atomic_t pending; /* accumulated triggered timers */ > }; > > @@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s > > bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); > > +void wait_lapic_expire(struct kvm_vcpu *vcpu); > + > #endif > Index: kvm/arch/x86/kvm/x86.c > =================================================================== > --- kvm.orig/arch/x86/kvm/x86.c > +++ kvm/arch/x86/kvm/x86.c > @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz) > static u32 tsc_tolerance_ppm = 250; > module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); > > +/* lapic timer advance (tscdeadline mode only) in nanoseconds */ > +unsigned int lapic_timer_advance_ns = 0; > +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); > + > static bool backwards_tsc_observed = false; > > #define KVM_NR_SHARED_MSRS 16 > @@ -6311,6 +6315,7 @@ static int vcpu_enter_guest(struct kvm_v > } > > trace_kvm_entry(vcpu->vcpu_id); > + wait_lapic_expire(vcpu); > kvm_x86_ops->run(vcpu); > > /* > Index: kvm/arch/x86/kvm/x86.h > =================================================================== > --- kvm.orig/arch/x86/kvm/x86.h > +++ kvm/arch/x86/kvm/x86.h > @@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void); > > extern unsigned int min_timer_period_us; > > +extern unsigned int lapic_timer_advance_ns; > + > extern struct static_key kvm_no_apic_vcpu; > #endif > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
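[To make the APICv corner case Paolo raises above concrete: a vector that is pending in IRR/PIR is only actually delivered once its priority class rises above the processor priority class. The helper below is an illustrative sketch of that architectural rule, not code from the patch or from KVM:

static bool vector_deliverable(u8 vec, u8 ppr)
{
        /* delivery blocked while vec[7:4] <= PPR[7:4] */
        return (vec >> 4) > (ppr >> 4);
}

So with APICv, testing only "is the timer vector pending in PIR?" can start the busy wait even though delivery is still blocked by PPR, which is the priority-inversion risk discussed in the rest of the thread.]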
On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: > > > On 10/12/2014 21:57, Marcelo Tosatti wrote: > > For the hrtimer which emulates the tscdeadline timer in the guest, > > add an option to advance expiration, and busy spin on VM-entry waiting > > for the actual expiration time to elapse. > > > > This allows achieving low latencies in cyclictest (or any scenario > > which requires strict timing regarding timer expiration). > > > > Reduces cyclictest avg latency by 50%. > > > > Note: this option requires tuning to find the appropriate value > > for a particular hardware/guest combination. One method is to measure the > > average delay between apic_timer_fn and VM-entry. > > Another method is to start with 1000ns, and increase the value > > in say 500ns increments until avg cyclictest numbers stop decreasing. > > What values are you using in practice for the parameter? 7us. > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > > > Index: kvm/arch/x86/kvm/lapic.c > > =================================================================== > > --- kvm.orig/arch/x86/kvm/lapic.c > > +++ kvm/arch/x86/kvm/lapic.c > > @@ -33,6 +33,7 @@ > > #include <asm/page.h> > > #include <asm/current.h> > > #include <asm/apicdef.h> > > +#include <asm/delay.h> > > #include <linux/atomic.h> > > #include <linux/jump_label.h> > > #include "kvm_cache_regs.h" > > @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv > > { > > struct kvm_vcpu *vcpu = apic->vcpu; > > wait_queue_head_t *q = &vcpu->wq; > > + struct kvm_timer *ktimer = &apic->lapic_timer; > > > > /* > > * Note: KVM_REQ_PENDING_TIMER is implicitly checked in > > @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv > > > > if (waitqueue_active(q)) > > wake_up_interruptible(q); > > + > > + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE) > > + ktimer->expired_tscdeadline = ktimer->tscdeadline; > > +} > > + > > +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) > > +{ > > + struct kvm_lapic *apic = vcpu->arch.apic; > > + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); > > + > > + if (kvm_apic_hw_enabled(apic)) { > > + int vec = reg & APIC_VECTOR_MASK; > > + > > + if (kvm_x86_ops->test_posted_interrupt) > > + return kvm_x86_ops->test_posted_interrupt(vcpu, vec); > > + else { > > + if (apic_test_vector(vec, apic->regs + APIC_ISR)) > > + return true; > > + } > > One branch here is testing IRR, the other is testing ISR. I think > testing ISR is right; on APICv, the above test will cause a busy wait > during a higher-priority task (or during an interrupt service routine > for the timer itself), just because the timer interrupt was delivered. Yes. > So, on APICv, if the interrupt is in PIR but it has bits 7:4 <= > PPR[7:4], you have a problem. :( There is no APICv hook that lets you > get a vmexit when the PPR becomes low enough. Well, you simply exit earlier and busy spin for VM-exit time. For Linux guests, there is no problem. > > + } > > + return false; > > +} > > + > > +void wait_lapic_expire(struct kvm_vcpu *vcpu) > > +{ > > + struct kvm_lapic *apic = vcpu->arch.apic; > > + u64 guest_tsc, tsc_deadline; > > + > > + if (!kvm_vcpu_has_lapic(vcpu)) > > + return; > > + > > + if (!apic_lvtt_tscdeadline(apic)) > > + return; > > This test is wrong, I think. You need to check whether the timer > interrupt was a TSC deadline interrupt. Instead, you are checking > whether the current mode is TSC-deadline. This can be different if the > interrupt could not be delivered immediately after it was received. 
> This is easy to fix: replace the first two tests with
> "apic->lapic_timer.expired_tscdeadline != 0" and...

Yes.

> > +	if (!lapic_timer_int_injected(vcpu))
> > +		return;
> > +	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>
> ... set apic->lapic_timer.expired_tscdeadline to 0 here.
>
> But I'm not sure how to solve the above problem with APICv. That's a
> pity. Knowing what values you use in practice for the parameter, would
> also make it easier to understand the problem. Please report that
> together with the graphs produced by the unit test you added.
On 11/12/2014 04:07, Marcelo Tosatti wrote:
>> > So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
>> > PPR[7:4], you have a problem. :( There is no APICv hook that lets you
>> > get a vmexit when the PPR becomes low enough.
> Well, you simply exit earlier and busy spin for VM-exit
> time.

Okay, given how small the jitter is in your plots, a small busy wait is
not the end of the world even if it caused a very short priority
inversion.

Can you add a tracepoint that triggers when you do the busy wait, and
possibly a kvm stat that counts good (no busy wait) vs. bad (busy wait)
invocations of wait_lapic_expire?

Thanks,

Paolo
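[A rough illustration of the instrumentation being requested here; all names are made up for the sketch and this is not code from the patch:

/* arch/x86/kvm/trace.h -- hypothetical event */
TRACE_EVENT(kvm_wait_lapic_expire,
        TP_PROTO(unsigned int vcpu_id, s64 delta),
        TP_ARGS(vcpu_id, delta),

        TP_STRUCT__entry(
                __field(unsigned int, vcpu_id)
                __field(s64, delta)
        ),

        TP_fast_assign(
                __entry->vcpu_id = vcpu_id;
                __entry->delta = delta;
        ),

        TP_printk("vcpu %u: delta %lld cycles", __entry->vcpu_id, __entry->delta)
);

/* in wait_lapic_expire(), just before the spin loop */
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
if (guest_tsc < tsc_deadline)
        vcpu->stat.lapic_timer_spin++;  /* hypothetical new kvm_vcpu_stat field */

A "good" invocation then shows up as delta >= 0 (guest TSC already past the deadline, no spinning), and a "bad" one as delta < 0 (the advance was too large and the vcpu had to busy wait).]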
On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: >> >> >> On 10/12/2014 21:57, Marcelo Tosatti wrote: >>> For the hrtimer which emulates the tscdeadline timer in the guest, >>> add an option to advance expiration, and busy spin on VM-entry waiting >>> for the actual expiration time to elapse. >>> >>> This allows achieving low latencies in cyclictest (or any scenario >>> which requires strict timing regarding timer expiration). >>> >>> Reduces cyclictest avg latency by 50%. >>> >>> Note: this option requires tuning to find the appropriate value >>> for a particular hardware/guest combination. One method is to measure the >>> average delay between apic_timer_fn and VM-entry. >>> Another method is to start with 1000ns, and increase the value >>> in say 500ns increments until avg cyclictest numbers stop decreasing. >> >> What values are you using in practice for the parameter? > > 7us. It takes 7us to get from TSC deadline expiration to the *start* of vmresume? That seems rather extreme. Is it possible that almost all of that latency is from deadline expiration to C-state exit? If so, can we teach the timer code to wake up early to account for that? We're supposed to know our idle exit latency these days. --Andy > >>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> >>> >>> Index: kvm/arch/x86/kvm/lapic.c >>> =================================================================== >>> --- kvm.orig/arch/x86/kvm/lapic.c >>> +++ kvm/arch/x86/kvm/lapic.c >>> @@ -33,6 +33,7 @@ >>> #include <asm/page.h> >>> #include <asm/current.h> >>> #include <asm/apicdef.h> >>> +#include <asm/delay.h> >>> #include <linux/atomic.h> >>> #include <linux/jump_label.h> >>> #include "kvm_cache_regs.h" >>> @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv >>> { >>> struct kvm_vcpu *vcpu = apic->vcpu; >>> wait_queue_head_t *q = &vcpu->wq; >>> + struct kvm_timer *ktimer = &apic->lapic_timer; >>> >>> /* >>> * Note: KVM_REQ_PENDING_TIMER is implicitly checked in >>> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv >>> >>> if (waitqueue_active(q)) >>> wake_up_interruptible(q); >>> + >>> + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE) >>> + ktimer->expired_tscdeadline = ktimer->tscdeadline; >>> +} >>> + >>> +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) >>> +{ >>> + struct kvm_lapic *apic = vcpu->arch.apic; >>> + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); >>> + >>> + if (kvm_apic_hw_enabled(apic)) { >>> + int vec = reg & APIC_VECTOR_MASK; >>> + >>> + if (kvm_x86_ops->test_posted_interrupt) >>> + return kvm_x86_ops->test_posted_interrupt(vcpu, vec); >>> + else { >>> + if (apic_test_vector(vec, apic->regs + APIC_ISR)) >>> + return true; >>> + } >> >> One branch here is testing IRR, the other is testing ISR. I think >> testing ISR is right; on APICv, the above test will cause a busy wait >> during a higher-priority task (or during an interrupt service routine >> for the timer itself), just because the timer interrupt was delivered. > > Yes. > >> So, on APICv, if the interrupt is in PIR but it has bits 7:4 <= >> PPR[7:4], you have a problem. :( There is no APICv hook that lets you >> get a vmexit when the PPR becomes low enough. > > Well, you simply exit earlier and busy spin for VM-exit > time. > > For Linux guests, there is no problem. 
> >>> + } >>> + return false; >>> +} >>> + >>> +void wait_lapic_expire(struct kvm_vcpu *vcpu) >>> +{ >>> + struct kvm_lapic *apic = vcpu->arch.apic; >>> + u64 guest_tsc, tsc_deadline; >>> + >>> + if (!kvm_vcpu_has_lapic(vcpu)) >>> + return; >>> + >>> + if (!apic_lvtt_tscdeadline(apic)) >>> + return; >> >> This test is wrong, I think. You need to check whether the timer >> interrupt was a TSC deadline interrupt. Instead, you are checking >> whether the current mode is TSC-deadline. This can be different if the >> interrupt could not be delivered immediately after it was received. >> This is easy to fix: replace the first two tests with >> "apic->lapic_timer.expired_tscdeadline != 0" and... > > Yes. > >>> + if (!lapic_timer_int_injected(vcpu)) >>> + return; >>> + tsc_deadline = apic->lapic_timer.expired_tscdeadline; >> >> ... set apic->lapic_timer.expired_tscdeadline to 0 here. >> >> But I'm not sure how to solve the above problem with APICv. That's a >> pity. Knowing what values you use in practice for the parameter, would >> also make it easier to understand the problem. Please report that >> together with the graphs produced by the unit test you added. > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> >>
> >>
> >> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> >>> For the hrtimer which emulates the tscdeadline timer in the guest,
> >>> add an option to advance expiration, and busy spin on VM-entry waiting
> >>> for the actual expiration time to elapse.
> >>>
> >>> This allows achieving low latencies in cyclictest (or any scenario
> >>> which requires strict timing regarding timer expiration).
> >>>
> >>> Reduces cyclictest avg latency by 50%.
> >>>
> >>> Note: this option requires tuning to find the appropriate value
> >>> for a particular hardware/guest combination. One method is to measure the
> >>> average delay between apic_timer_fn and VM-entry.
> >>> Another method is to start with 1000ns, and increase the value
> >>> in say 500ns increments until avg cyclictest numbers stop decreasing.
> >>
> >> What values are you using in practice for the parameter?
> >
> > 7us.
>
>
> It takes 7us to get from TSC deadline expiration to the *start* of
> vmresume? That seems rather extreme.
>
> Is it possible that almost all of that latency is from deadline
> expiration to C-state exit? If so, can we teach the timer code to wake
> up early to account for that? We're supposed to know our idle exit
> latency these days.

7us includes:

idle thread wakeup
idle schedout
ksoftirqd schedin
ksoftirqd schedout
qemu schedin
vm-entry

C-states are disabled of course.
On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote: >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: >> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: >> >> >> >> >> >> On 10/12/2014 21:57, Marcelo Tosatti wrote: >> >>> For the hrtimer which emulates the tscdeadline timer in the guest, >> >>> add an option to advance expiration, and busy spin on VM-entry waiting >> >>> for the actual expiration time to elapse. >> >>> >> >>> This allows achieving low latencies in cyclictest (or any scenario >> >>> which requires strict timing regarding timer expiration). >> >>> >> >>> Reduces cyclictest avg latency by 50%. >> >>> >> >>> Note: this option requires tuning to find the appropriate value >> >>> for a particular hardware/guest combination. One method is to measure the >> >>> average delay between apic_timer_fn and VM-entry. >> >>> Another method is to start with 1000ns, and increase the value >> >>> in say 500ns increments until avg cyclictest numbers stop decreasing. >> >> >> >> What values are you using in practice for the parameter? >> > >> > 7us. >> >> >> It takes 7us to get from TSC deadline expiration to the *start* of >> vmresume? That seems rather extreme. >> >> Is it possible that almost all of that latency is from deadline >> expiration to C-state exit? If so, can we teach the timer code to wake >> up early to account for that? We're supposed to know our idle exit >> latency these days. > > 7us includes: > > idle thread wakeup > idle schedout > ksoftirqd schedin > ksoftirqd schedout > qemu schedin > vm-entry Is there some secret way to get perf to profile this? Like some way to tell perf to only record samples after the IRQ fires and before vmresume? Because 7 us seems waaaaay slower than it should be for this. Yes, Rik, I know that we're wasting all kinds of time doing dumb things with xstate, but that shouldn't be anywhere near 7us on modern hardware, unless we're actually taking a device-not-available exception in there. :) There might be a whopping size xstate operations, but those should be 60ns each, perhaps, if the state is dirty? Maybe everything misses cache? > > C-states are disabled of course. > :) --Andy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/12/2014 21:48, Andy Lutomirski wrote: > On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: >> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: >>> >>> >>> On 10/12/2014 21:57, Marcelo Tosatti wrote: >>>> For the hrtimer which emulates the tscdeadline timer in the guest, >>>> add an option to advance expiration, and busy spin on VM-entry waiting >>>> for the actual expiration time to elapse. >>>> >>>> This allows achieving low latencies in cyclictest (or any scenario >>>> which requires strict timing regarding timer expiration). >>>> >>>> Reduces cyclictest avg latency by 50%. >>>> >>>> Note: this option requires tuning to find the appropriate value >>>> for a particular hardware/guest combination. One method is to measure the >>>> average delay between apic_timer_fn and VM-entry. >>>> Another method is to start with 1000ns, and increase the value >>>> in say 500ns increments until avg cyclictest numbers stop decreasing. >>> >>> What values are you using in practice for the parameter? >> >> 7us. > > It takes 7us to get from TSC deadline expiration to the *start* of > vmresume? That seems rather extreme. No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone costs about 1300. > Is it possible that almost all of that latency is from deadline > expiration to C-state exit? No, I don't think so. Marcelo confirmed that C-states are disabled, bt anyway none of the C-state latency matches Marcelo's data: C1 is really small (1 us), C1e is too large (~10 us). To see the effect of C-state exit, go to the plots I made on a normal laptop and see latency jumping up to 200000 or 400000 cycles (respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60 and 80 us). > If so, can we teach the timer code to wake > up early to account for that? What, it doesn't already do that? Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > > On 11/12/2014 21:48, Andy Lutomirski wrote: >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: >>>> >>>> >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote: >>>>> For the hrtimer which emulates the tscdeadline timer in the guest, >>>>> add an option to advance expiration, and busy spin on VM-entry waiting >>>>> for the actual expiration time to elapse. >>>>> >>>>> This allows achieving low latencies in cyclictest (or any scenario >>>>> which requires strict timing regarding timer expiration). >>>>> >>>>> Reduces cyclictest avg latency by 50%. >>>>> >>>>> Note: this option requires tuning to find the appropriate value >>>>> for a particular hardware/guest combination. One method is to measure the >>>>> average delay between apic_timer_fn and VM-entry. >>>>> Another method is to start with 1000ns, and increase the value >>>>> in say 500ns increments until avg cyclictest numbers stop decreasing. >>>> >>>> What values are you using in practice for the parameter? >>> >>> 7us. >> >> It takes 7us to get from TSC deadline expiration to the *start* of >> vmresume? That seems rather extreme. > > No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone > costs about 1300. > I suspect that something's massively wrong with context switching, then -- it deserves to be considerably faster than that. The architecturally expensive bits are vmresume, interrupt delivery, and iret, but iret is only ~300 cycles and interrupt delivery should be under 1k cycles. Throw in a few hundred more cycles for whatever wrmsr idiocy is going on somewhere in the process, and we're still nowhere near 21k cycles. >> Is it possible that almost all of that latency is from deadline >> expiration to C-state exit? > > No, I don't think so. Marcelo confirmed that C-states are disabled, bt > anyway none of the C-state latency matches Marcelo's data: C1 is really > small (1 us), C1e is too large (~10 us). > > To see the effect of C-state exit, go to the plots I made on a normal > laptop and see latency jumping up to 200000 or 400000 cycles > (respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60 > and 80 us). > >> If so, can we teach the timer code to wake >> up early to account for that? > > What, it doesn't already do that? No clue. My one machine that actually cares about this has C states rather heavily tuned, so I wouldn't notice. --Andy > > Paolo
On Thu, Dec 11, 2014 at 01:16:52PM -0800, Andy Lutomirski wrote: > On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > > > On 11/12/2014 21:48, Andy Lutomirski wrote: > >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: > >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: > >>>> > >>>> > >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote: > >>>>> For the hrtimer which emulates the tscdeadline timer in the guest, > >>>>> add an option to advance expiration, and busy spin on VM-entry waiting > >>>>> for the actual expiration time to elapse. > >>>>> > >>>>> This allows achieving low latencies in cyclictest (or any scenario > >>>>> which requires strict timing regarding timer expiration). > >>>>> > >>>>> Reduces cyclictest avg latency by 50%. > >>>>> > >>>>> Note: this option requires tuning to find the appropriate value > >>>>> for a particular hardware/guest combination. One method is to measure the > >>>>> average delay between apic_timer_fn and VM-entry. > >>>>> Another method is to start with 1000ns, and increase the value > >>>>> in say 500ns increments until avg cyclictest numbers stop decreasing. > >>>> > >>>> What values are you using in practice for the parameter? > >>> > >>> 7us. > >> > >> It takes 7us to get from TSC deadline expiration to the *start* of > >> vmresume? That seems rather extreme. > > > > No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone > > costs about 1300. > > > > I suspect that something's massively wrong with context switching, > then -- it deserves to be considerably faster than that. The > architecturally expensive bits are vmresume, interrupt delivery, and > iret, but iret is only ~300 cycles and interrupt delivery should be > under 1k cycles. > > Throw in a few hundred more cycles for whatever wrmsr idiocy is going > on somewhere in the process, and we're still nowhere near 21k cycles. <idle>-0 [003] d..h2.. 1991756745496752: apic_timer_fn <-__run_hrtimer <idle>-0 [003] dN.h2.. 1991756745498732: tick_program_event <-hrtimer_interrupt <idle>-0 [003] d...3.. 1991756745502112: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=20114 next_prio=98 <idle>-0 [003] d...2.. 1991756745502592: __context_tracking_task_switch <-__schedule qemu-system-x86-20114 [003] ....1.. 1991756745503916: kvm_arch_vcpu_load <-kvm_sched_in qemu-system-x86-20114 [003] ....... 1991756745505320: kvm_cpu_has_pending_timer <-kvm_vcpu_block qemu-system-x86-20114 [003] ....... 1991756745506260: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run qemu-system-x86-20114 [003] ....... 1991756745507812: kvm_apic_accept_events <-kvm_arch_vcpu_ioctl_run qemu-system-x86-20114 [003] ....... 1991756745508100: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run qemu-system-x86-20114 [003] ....... 1991756745508872: kvm_apic_accept_events <-vcpu_enter_guest qemu-system-x86-20114 [003] ....1.. 1991756745510040: vmx_save_host_state <-vcpu_enter_guest qemu-system-x86-20114 [003] d...2.. 1991756745511876: kvm_entry: vcpu 1 1991756745511876 - 1991756745496752 = 15124 The timestamps are TSC reads. This is patched to run without ksoftirqd. Consider: The LAPIC is programmed to the next earliest event by hrtimer_interrupt. VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Dec 11, 2014 at 07:27:17PM -0200, Marcelo Tosatti wrote: > On Thu, Dec 11, 2014 at 01:16:52PM -0800, Andy Lutomirski wrote: > > On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > > > > > > On 11/12/2014 21:48, Andy Lutomirski wrote: > > >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: > > >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: > > >>>> > > >>>> > > >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote: > > >>>>> For the hrtimer which emulates the tscdeadline timer in the guest, > > >>>>> add an option to advance expiration, and busy spin on VM-entry waiting > > >>>>> for the actual expiration time to elapse. > > >>>>> > > >>>>> This allows achieving low latencies in cyclictest (or any scenario > > >>>>> which requires strict timing regarding timer expiration). > > >>>>> > > >>>>> Reduces cyclictest avg latency by 50%. > > >>>>> > > >>>>> Note: this option requires tuning to find the appropriate value > > >>>>> for a particular hardware/guest combination. One method is to measure the > > >>>>> average delay between apic_timer_fn and VM-entry. > > >>>>> Another method is to start with 1000ns, and increase the value > > >>>>> in say 500ns increments until avg cyclictest numbers stop decreasing. > > >>>> > > >>>> What values are you using in practice for the parameter? > > >>> > > >>> 7us. > > >> > > >> It takes 7us to get from TSC deadline expiration to the *start* of > > >> vmresume? That seems rather extreme. > > > > > > No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone > > > costs about 1300. > > > > > > > I suspect that something's massively wrong with context switching, > > then -- it deserves to be considerably faster than that. The > > architecturally expensive bits are vmresume, interrupt delivery, and > > iret, but iret is only ~300 cycles and interrupt delivery should be > > under 1k cycles. > > > > Throw in a few hundred more cycles for whatever wrmsr idiocy is going > > on somewhere in the process, and we're still nowhere near 21k cycles. > > > <idle>-0 [003] d..h2.. 1991756745496752: apic_timer_fn > <-__run_hrtimer > <idle>-0 [003] dN.h2.. 1991756745498732: tick_program_event <-hrtimer_interrupt > <idle>-0 [003] d...3.. 1991756745502112: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=20114 next_prio=98 > <idle>-0 [003] d...2.. 1991756745502592: __context_tracking_task_switch <-__schedule > qemu-system-x86-20114 [003] ....1.. 1991756745503916: kvm_arch_vcpu_load <-kvm_sched_in > qemu-system-x86-20114 [003] ....... 1991756745505320: kvm_cpu_has_pending_timer <-kvm_vcpu_block > qemu-system-x86-20114 [003] ....... 1991756745506260: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run > qemu-system-x86-20114 [003] ....... 1991756745507812: kvm_apic_accept_events <-kvm_arch_vcpu_ioctl_run > qemu-system-x86-20114 [003] ....... 1991756745508100: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run > qemu-system-x86-20114 [003] ....... 1991756745508872: kvm_apic_accept_events <-vcpu_enter_guest > qemu-system-x86-20114 [003] ....1.. 1991756745510040: vmx_save_host_state <-vcpu_enter_guest > qemu-system-x86-20114 [003] d...2.. 1991756745511876: kvm_entry: vcpu 1 > > > 1991756745511876 - 1991756745496752 = 15124 > > The timestamps are TSC reads. > > This is patched to run without ksoftirqd. Consider: > > The LAPIC is programmed to the next earliest event by hrtimer_interrupt. > VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT. 
>

model           : 58
model name      : Intel(R) Core(TM) i5-3470S CPU @ 2.90GHz
stepping        : 9
microcode       : 0x1b
cpu MHz         : 2873.492
cache size      : 6144 KB
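[Plugging the trace above together with the cpu MHz reported here gives a rough feel for where the ~7 us figure comes from (approximate, since the trace timestamps are raw TSC reads):

1991756745511876 - 1991756745496752 = 15124 TSC cycles
15124 cycles / 2873.492 MHz         ~= 5.3 us   (apic_timer_fn -> trace_kvm_entry)

i.e. about 5.3 us elapse before the tracepoint just ahead of vmresume, with the remaining VM-entry work accounting for the rest of the advance value.]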
On 12/11/2014 04:07 PM, Andy Lutomirski wrote: > On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: >> On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote: >>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote: >>>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote: >>>>> >>>>> >>>>> On 10/12/2014 21:57, Marcelo Tosatti wrote: >>>>>> For the hrtimer which emulates the tscdeadline timer in the guest, >>>>>> add an option to advance expiration, and busy spin on VM-entry waiting >>>>>> for the actual expiration time to elapse. >>>>>> >>>>>> This allows achieving low latencies in cyclictest (or any scenario >>>>>> which requires strict timing regarding timer expiration). >>>>>> >>>>>> Reduces cyclictest avg latency by 50%. >>>>>> >>>>>> Note: this option requires tuning to find the appropriate value >>>>>> for a particular hardware/guest combination. One method is to measure the >>>>>> average delay between apic_timer_fn and VM-entry. >>>>>> Another method is to start with 1000ns, and increase the value >>>>>> in say 500ns increments until avg cyclictest numbers stop decreasing. >>>>> >>>>> What values are you using in practice for the parameter? >>>> >>>> 7us. >>> >>> >>> It takes 7us to get from TSC deadline expiration to the *start* of >>> vmresume? That seems rather extreme. >>> >>> Is it possible that almost all of that latency is from deadline >>> expiration to C-state exit? If so, can we teach the timer code to wake >>> up early to account for that? We're supposed to know our idle exit >>> latency these days. >> >> 7us includes: >> >> idle thread wakeup >> idle schedout >> ksoftirqd schedin >> ksoftirqd schedout >> qemu schedin >> vm-entry > > Is there some secret way to get perf to profile this? Like some way > to tell perf to only record samples after the IRQ fires and before > vmresume? Because 7 us seems waaaaay slower than it should be for > this. > > Yes, Rik, I know that we're wasting all kinds of time doing dumb > things with xstate, but that shouldn't be anywhere near 7us on modern > hardware, unless we're actually taking a device-not-available > exception in there. :) There might be a whopping size xstate > operations, but those should be 60ns each, perhaps, if the state is > dirty? > > Maybe everything misses cache? I don't expect the xstate stuff to be more than about half a microsecond, with cache misses and failure to optimize XSAVEOPT / XRSTOR across vmenter/vmexit. We get another microsecond or so from not trapping from the guest to the host every time the guest accesses the FPU for the first time. Marcelo already has a hack for that in his tree, and my series merely has something that achieves the same in an automated (and hopefully upstreamable) way. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2014-12-10 15:57-0500, Marcelo Tosatti:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
>
> This allows achieving low latencies in cyclictest (or any scenario
> which requires strict timing regarding timer expiration).
>
> Reduces cyclictest avg latency by 50%.
>
> Note: this option requires tuning to find the appropriate value
> for a particular hardware/guest combination. One method is to measure the
> average delay between apic_timer_fn and VM-entry.
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>
>  	if (waitqueue_active(q))
>  		wake_up_interruptible(q);
> +
> +	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)

timer_mode != timer_mode_mask.
(Please use apic_lvtt_tscdeadline().)

(So the code never waited for tsc_deadline >= guest_tsc ...
 I suppose it was possible to achieve lower latencies thanks to that.)

> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u64 guest_tsc, tsc_deadline;
> +
> +	if (!kvm_vcpu_has_lapic(vcpu))
> +		return;
> +
> +	if (!apic_lvtt_tscdeadline(apic))
> +		return;

(It is better to check expired_tscdeadline here and zero it later.)

> +
> +	if (!lapic_timer_int_injected(vcpu))
> +		return;
> +
> +	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> +	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +
> +	while (guest_tsc < tsc_deadline) {
> +		int delay = min(tsc_deadline - guest_tsc, 1000ULL);
> +
> +		ndelay(delay);

The delay is in nanoseconds, but you feed it a difference in TSC.
(And usually overestimate the time; cpu_relax() loop seems easiest.)
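[A minimal sketch of the unit conversion Radim is pointing out, reusing the virtual_tsc_khz arithmetic the patch itself already uses in start_apic_timer(); illustrative only, not the final code. The cpu_relax() alternative he mentions would instead simply poll read_l1_tsc() in a loop without calling ndelay() at all:

while (guest_tsc < tsc_deadline) {
        u64 delta_ns = (tsc_deadline - guest_tsc) * 1000000ULL;

        do_div(delta_ns, vcpu->arch.virtual_tsc_khz);   /* TSC cycles -> ns */
        ndelay(min_t(u64, delta_ns, 1000ULL));          /* spin in <= 1us steps */
        guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
}
]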
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
 
 	/*
 	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
 
 	if (waitqueue_active(q))
 		wake_up_interruptible(q);
+
+	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+	if (kvm_apic_hw_enabled(apic)) {
+		int vec = reg & APIC_VECTOR_MASK;
+
+		if (kvm_x86_ops->test_posted_interrupt)
+			return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+		else {
+			if (apic_test_vector(vec, apic->regs + APIC_ISR))
+				return true;
+		}
+	}
+	return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (!apic_lvtt_tscdeadline(apic))
+		return;
+
+	if (!lapic_timer_int_injected(vcpu))
+		return;
+
+	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+	while (guest_tsc < tsc_deadline) {
+		int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+		ndelay(delay);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	}
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
+
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1186,7 @@ static void start_apic_timer(struct kvm_
 		/* lapic timer in tsc deadline mode */
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
+		ktime_t expire;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
@@ -1151,8 +1201,10 @@ static void start_apic_timer(struct kvm_
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
+			expire = ktime_add_ns(now, ns);
+			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				      ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS);
 		} else
 			apic_timer_expired(apic);
 
Index: kvm/arch/x86/kvm/lapic.h
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
+	u64 expired_tscdeadline;
 	atomic_t pending;	/* accumulated triggered timers */
 };
 
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
 static u32 tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
 static bool backwards_tsc_observed = false;
 
 #define KVM_NR_SHARED_MSRS 16
@@ -6311,6 +6315,7 @@ static int vcpu_enter_guest(struct kvm_v
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
+	wait_lapic_expire(vcpu);
 	kvm_x86_ops->run(vcpu);
 
 	/*
Index: kvm/arch/x86/kvm/x86.h
===================================================================
--- kvm.orig/arch/x86/kvm/x86.h
+++ kvm/arch/x86/kvm/x86.h
@@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
+extern unsigned int lapic_timer_advance_ns;
+
 extern struct static_key kvm_no_apic_vcpu;
 #endif
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.

This allows achieving low latencies in cyclictest (or any scenario
which requires strict timing regarding timer expiration).

Reduces cyclictest avg latency by 50%.

Note: this option requires tuning to find the appropriate value
for a particular hardware/guest combination. One method is to measure the
average delay between apic_timer_fn and VM-entry.
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
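[Since lapic_timer_advance_ns is added as a writable module parameter (S_IRUGO | S_IWUSR) in x86.c, the tuning loop described above can presumably be done at runtime by writing to /sys/module/kvm/parameters/lapic_timer_advance_ns (standard module_param sysfs behaviour, assuming x86.c is built into kvm.ko as usual), rather than reloading the module for every 500 ns step.]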