
[2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration

Message ID 20141210205904.415174860@redhat.com (mailing list archive)
State New, archived

Commit Message

Marcelo Tosatti Dec. 10, 2014, 8:57 p.m. UTC
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.

This allows achieving low latencies in cyclictest (or any scenario 
which requires strict timing regarding timer expiration).

Reduces cyclictest avg latency by 50%.

Note: this option requires tuning to find the appropriate value 
for a particular hardware/guest combination. One method is to measure the 
average delay between apic_timer_fn and VM-entry. 
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
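
The advance is exposed below as a writable kvm module parameter, so a value
found with either method can be applied at runtime; for example, with the 7us
figure reported later in this thread:

    echo 7000 > /sys/module/kvm/parameters/lapic_timer_advance_ns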




Comments

Paolo Bonzini Dec. 10, 2014, 11:37 p.m. UTC | #1
On 10/12/2014 21:57, Marcelo Tosatti wrote:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
> 
> This allows achieving low latencies in cyclictest (or any scenario 
> which requires strict timing regarding timer expiration).
> 
> Reduces cyclictest avg latency by 50%.
> 
> Note: this option requires tuning to find the appropriate value 
> for a particular hardware/guest combination. One method is to measure the 
> average delay between apic_timer_fn and VM-entry. 
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.

What values are you using in practice for the parameter?

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -33,6 +33,7 @@
>  #include <asm/page.h>
>  #include <asm/current.h>
>  #include <asm/apicdef.h>
> +#include <asm/delay.h>
>  #include <linux/atomic.h>
>  #include <linux/jump_label.h>
>  #include "kvm_cache_regs.h"
> @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
>  {
>  	struct kvm_vcpu *vcpu = apic->vcpu;
>  	wait_queue_head_t *q = &vcpu->wq;
> +	struct kvm_timer *ktimer = &apic->lapic_timer;
>  
>  	/*
>  	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>  
>  	if (waitqueue_active(q))
>  		wake_up_interruptible(q);
> +
> +	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
> +		ktimer->expired_tscdeadline = ktimer->tscdeadline;
> +}
> +
> +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
> +
> +	if (kvm_apic_hw_enabled(apic)) {
> +		int vec = reg & APIC_VECTOR_MASK;
> +
> +		if (kvm_x86_ops->test_posted_interrupt)
> +			return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
> +		else {
> +			if (apic_test_vector(vec, apic->regs + APIC_ISR))
> +				return true;
> +		}

One branch here is testing IRR, the other is testing ISR.  I think
testing ISR is right; on APICv, the above test will cause a busy wait
during a higher-priority task (or during an interrupt service routine
for the timer itself), just because the timer interrupt was delivered.

So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
PPR[7:4], you have a problem. :(  There is no APICv hook that lets you
get a vmexit when the PPR becomes low enough.
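
For reference, the deliverability rule being described, written as a
hypothetical helper (this is not existing KVM code):

	/* A pending vector is only dispensed when its priority class
	 * (bits 7:4) is strictly above the class in PPR[7:4]. */
	static bool vector_deliverable(u8 vec, u8 ppr)
	{
		return (vec >> 4) > (ppr >> 4);
	}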

> +	}
> +	return false;
> +}
> +
> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u64 guest_tsc, tsc_deadline;
> +
> +	if (!kvm_vcpu_has_lapic(vcpu))
> +		return;
> +
> +	if (!apic_lvtt_tscdeadline(apic))
> +		return;

This test is wrong, I think.  You need to check whether the timer
interrupt was a TSC deadline interrupt.  Instead, you are checking
whether the current mode is TSC-deadline.  This can be different if the
interrupt could not be delivered immediately after it was received.
This is easy to fix: replace the first two tests with
"apic->lapic_timer.expired_tscdeadline != 0" and...

> +	if (!lapic_timer_int_injected(vcpu))
> +		return;
> +	tsc_deadline = apic->lapic_timer.expired_tscdeadline;

... set apic->lapic_timer.expired_tscdeadline to 0 here.
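
Put together, the two suggested changes would make the entry checks of
wait_lapic_expire read roughly:

	if (!apic->lapic_timer.expired_tscdeadline)
		return;

	if (!lapic_timer_int_injected(vcpu))
		return;

	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
	apic->lapic_timer.expired_tscdeadline = 0;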

But I'm not sure how to solve the above problem with APICv.  That's a
pity.  Knowing what values you use in practice for the parameter would
also make it easier to understand the problem.  Please report that
together with the graphs produced by the unit test you added.

Paolo

> +	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +
> +	while (guest_tsc < tsc_deadline) {
> +		int delay = min(tsc_deadline - guest_tsc, 1000ULL);
> +
> +		ndelay(delay);
> +		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +	}
>  }
>  
>  static void start_apic_timer(struct kvm_lapic *apic)
>  {
>  	ktime_t now;
> +
>  	atomic_set(&apic->lapic_timer.pending, 0);
>  
>  	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
> @@ -1137,6 +1186,7 @@ static void start_apic_timer(struct kvm_
>  		/* lapic timer in tsc deadline mode */
>  		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
>  		u64 ns = 0;
> +		ktime_t expire;
>  		struct kvm_vcpu *vcpu = apic->vcpu;
>  		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
>  		unsigned long flags;
> @@ -1151,8 +1201,10 @@ static void start_apic_timer(struct kvm_
>  		if (likely(tscdeadline > guest_tsc)) {
>  			ns = (tscdeadline - guest_tsc) * 1000000ULL;
>  			do_div(ns, this_tsc_khz);
> +			expire = ktime_add_ns(now, ns);
> +			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
>  			hrtimer_start(&apic->lapic_timer.timer,
> -				ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
> +				      expire, HRTIMER_MODE_ABS);
>  		} else
>  			apic_timer_expired(apic);
>  
> Index: kvm/arch/x86/kvm/lapic.h
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.h
> +++ kvm/arch/x86/kvm/lapic.h
> @@ -14,6 +14,7 @@ struct kvm_timer {
>  	u32 timer_mode;
>  	u32 timer_mode_mask;
>  	u64 tscdeadline;
> +	u64 expired_tscdeadline;
>  	atomic_t pending;			/* accumulated triggered timers */
>  };
>  
> @@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
>  
>  bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
>  
> +void wait_lapic_expire(struct kvm_vcpu *vcpu);
> +
>  #endif
> Index: kvm/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/x86.c
> +++ kvm/arch/x86/kvm/x86.c
> @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
>  static u32 tsc_tolerance_ppm = 250;
>  module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
>  
> +/* lapic timer advance (tscdeadline mode only) in nanoseconds */
> +unsigned int lapic_timer_advance_ns = 0;
> +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
> +
>  static bool backwards_tsc_observed = false;
>  
>  #define KVM_NR_SHARED_MSRS 16
> @@ -6311,6 +6315,7 @@ static int vcpu_enter_guest(struct kvm_v
>  	}
>  
>  	trace_kvm_entry(vcpu->vcpu_id);
> +	wait_lapic_expire(vcpu);
>  	kvm_x86_ops->run(vcpu);
>  
>  	/*
> Index: kvm/arch/x86/kvm/x86.h
> ===================================================================
> --- kvm.orig/arch/x86/kvm/x86.h
> +++ kvm/arch/x86/kvm/x86.h
> @@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void);
>  
>  extern unsigned int min_timer_period_us;
>  
> +extern unsigned int lapic_timer_advance_ns;
> +
>  extern struct static_key kvm_no_apic_vcpu;
>  #endif
Marcelo Tosatti Dec. 11, 2014, 3:07 a.m. UTC | #2
On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> 
> 
> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> > For the hrtimer which emulates the tscdeadline timer in the guest,
> > add an option to advance expiration, and busy spin on VM-entry waiting
> > for the actual expiration time to elapse.
> > 
> > This allows achieving low latencies in cyclictest (or any scenario 
> > which requires strict timing regarding timer expiration).
> > 
> > Reduces cyclictest avg latency by 50%.
> > 
> > Note: this option requires tuning to find the appropriate value 
> > for a particular hardware/guest combination. One method is to measure the 
> > average delay between apic_timer_fn and VM-entry. 
> > Another method is to start with 1000ns, and increase the value
> > in say 500ns increments until avg cyclictest numbers stop decreasing.
> 
> What values are you using in practice for the parameter?

7us.

> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > Index: kvm/arch/x86/kvm/lapic.c
> > ===================================================================
> > --- kvm.orig/arch/x86/kvm/lapic.c
> > +++ kvm/arch/x86/kvm/lapic.c
> > @@ -33,6 +33,7 @@
> >  #include <asm/page.h>
> >  #include <asm/current.h>
> >  #include <asm/apicdef.h>
> > +#include <asm/delay.h>
> >  #include <linux/atomic.h>
> >  #include <linux/jump_label.h>
> >  #include "kvm_cache_regs.h"
> > @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
> >  {
> >  	struct kvm_vcpu *vcpu = apic->vcpu;
> >  	wait_queue_head_t *q = &vcpu->wq;
> > +	struct kvm_timer *ktimer = &apic->lapic_timer;
> >  
> >  	/*
> >  	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
> > @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
> >  
> >  	if (waitqueue_active(q))
> >  		wake_up_interruptible(q);
> > +
> > +	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
> > +		ktimer->expired_tscdeadline = ktimer->tscdeadline;
> > +}
> > +
> > +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_lapic *apic = vcpu->arch.apic;
> > +	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
> > +
> > +	if (kvm_apic_hw_enabled(apic)) {
> > +		int vec = reg & APIC_VECTOR_MASK;
> > +
> > +		if (kvm_x86_ops->test_posted_interrupt)
> > +			return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
> > +		else {
> > +			if (apic_test_vector(vec, apic->regs + APIC_ISR))
> > +				return true;
> > +		}
> 
> One branch here is testing IRR, the other is testing ISR.  I think
> testing ISR is right; on APICv, the above test will cause a busy wait
> during a higher-priority task (or during an interrupt service routine
> for the timer itself), just because the timer interrupt was delivered.

Yes.

> So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
> PPR[7:4], you have a problem. :(  There is no APICv hook that lets you
> get a vmexit when the PPR becomes low enough.

Well, you simply exit earlier and busy spin for VM-exit
time.

For Linux guests, there is no problem.

> > +	}
> > +	return false;
> > +}
> > +
> > +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_lapic *apic = vcpu->arch.apic;
> > +	u64 guest_tsc, tsc_deadline;
> > +
> > +	if (!kvm_vcpu_has_lapic(vcpu))
> > +		return;
> > +
> > +	if (!apic_lvtt_tscdeadline(apic))
> > +		return;
> 
> This test is wrong, I think.  You need to check whether the timer
> interrupt was a TSC deadline interrupt.  Instead, you are checking
> whether the current mode is TSC-deadline.  This can be different if the
> interrupt could not be delivered immediately after it was received.
> This is easy to fix: replace the first two tests with
> "apic->lapic_timer.expired_tscdeadline != 0" and...

Yes.

> > +	if (!lapic_timer_int_injected(vcpu))
> > +		return;
> > +	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> 
> ... set apic->lapic_timer.expired_tscdeadline to 0 here.
> 
> But I'm not sure how to solve the above problem with APICv.  That's a
> pity.  Knowing what values you use in practice for the parameter would
> also make it easier to understand the problem.  Please report that
> together with the graphs produced by the unit test you added.

Paolo Bonzini Dec. 11, 2014, 6:58 p.m. UTC | #3
On 11/12/2014 04:07, Marcelo Tosatti wrote:
>> > So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
>> > PPR[7:4], you have a problem. :(  There is no APICv hook that lets you
>> > get a vmexit when the PPR becomes low enough.
> Well, you simply exit earlier and busy spin for VM-exit
> time.

Okay, given how small the jitter is in your plots, a small busy wait is
not the end of the world even if it caused a very short priority
inversion.  Can you add a tracepoint that triggers when you do the busy
wait, and possibly a kvm stat that counts good (no busy wait) vs. bad
(busy wait) invocations of wait_lapic_expire?
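
A sketch of such instrumentation, with hypothetical tracepoint and stat
names, placed before the busy-wait loop in wait_lapic_expire:

	s64 delta;

	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
	delta = guest_tsc - tsc_deadline; /* < 0: deadline not passed yet */

	/* tracepoint and stat names here are hypothetical */
	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
	if (delta < 0)
		++vcpu->stat.lapic_timer_advance_spin;	/* bad: busy wait */
	else
		++vcpu->stat.lapic_timer_advance_ok;	/* good: no wait */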

Thanks,

Paolo
Andy Lutomirski Dec. 11, 2014, 8:48 p.m. UTC | #4
On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>
>>
>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>> [...]
>>
>> What values are you using in practice for the parameter?
> 
> 7us.


It takes 7us to get from TSC deadline expiration to the *start* of
vmresume?  That seems rather extreme.

Is it possible that almost all of that latency is from deadline
expiration to C-state exit?  If so, can we teach the timer code to wake
up early to account for that?  We're supposed to know our idle exit
latency these days.

--Andy

Marcelo Tosatti Dec. 11, 2014, 8:58 p.m. UTC | #5
On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> >>
> >>
> >> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> >>> [...]
> >>
> >> What values are you using in practice for the parameter?
> > 
> > 7us.
> 
> 
> It takes 7us to get from TSC deadline expiration to the *start* of
> vmresume?  That seems rather extreme.
> 
> Is it possible that almost all of that latency is from deadline
> expiration to C-state exit?  If so, can we teach the timer code to wake
> up early to account for that?  We're supposed to know our idle exit
> latency these days.

7us includes: 

idle thread wakeup
idle schedout
ksoftirqd schedin
ksoftirqd schedout
qemu schedin
vm-entry

C-states are disabled of course.

Andy Lutomirski Dec. 11, 2014, 9:07 p.m. UTC | #6
On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>> >>
>> >>
>> >> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>> >>> [...]
>> >>
>> >> What values are you using in practice for the parameter?
>> >
>> > 7us.
>>
>>
>> It takes 7us to get from TSC deadline expiration to the *start* of
>> vmresume?  That seems rather extreme.
>>
>> Is it possible that almost all of that latency is from deadline
>> expiration to C-state exit?  If so, can we teach the timer code to wake
>> up early to account for that?  We're supposed to know our idle exit
>> latency these days.
>
> 7us includes:
>
> idle thread wakeup
> idle schedout
> ksoftirqd schedin
> ksoftirqd schedout
> qemu schedin
> vm-entry

Is there some secret way to get perf to profile this?  Like some way
to tell perf to only record samples after the IRQ fires and before
vmresume?  Because 7 us seems waaaaay slower than it should be for
this.

Yes, Rik, I know that we're wasting all kinds of time doing dumb
things with xstate, but that shouldn't be anywhere near 7us on modern
hardware, unless we're actually taking a device-not-available
exception in there. :)  There might be a whopping six xstate
operations, but those should be 60ns each, perhaps, if the state is
dirty?

Maybe everything misses cache?

>
> C-states are disabled of course.
>

:)

--Andy
Paolo Bonzini Dec. 11, 2014, 9:10 p.m. UTC | #7
On 11/12/2014 21:48, Andy Lutomirski wrote:
> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>
>>>
>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>> [...]
>>>
>>> What values are you using in practice for the parameter?
>>
>> 7us.
> 
> It takes 7us to get from TSC deadline expiration to the *start* of
> vmresume?  That seems rather extreme.

No, to the end.  7us is 21000 clock cycles, and the vmexit+vmentry alone
costs about 1300.

> Is it possible that almost all of that latency is from deadline
> expiration to C-state exit?

No, I don't think so.  Marcelo confirmed that C-states are disabled, but
anyway none of the C-state latencies match Marcelo's data: C1 is really
small (1 us), C1e is too large (~10 us).

To see the effect of C-state exit, go to the plots I made on a normal
laptop and see latency jumping up to 200000 or 400000 cycles
(respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60
and 80 us).

> If so, can we teach the timer code to wake
> up early to account for that?

What, it doesn't already do that?

Paolo
Andy Lutomirski Dec. 11, 2014, 9:16 p.m. UTC | #8
On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 11/12/2014 21:48, Andy Lutomirski wrote:
>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>>
>>>>
>>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>>> [...]
>>>>
>>>> What values are you using in practice for the parameter?
>>>
>>> 7us.
>>
>> It takes 7us to get from TSC deadline expiration to the *start* of
>> vmresume?  That seems rather extreme.
>
> No, to the end.  7us is 21000 clock cycles, and the vmexit+vmentry alone
> costs about 1300.
>

I suspect that something's massively wrong with context switching,
then -- it deserves to be considerably faster than that.  The
architecturally expensive bits are vmresume, interrupt delivery, and
iret, but iret is only ~300 cycles and interrupt delivery should be
under 1k cycles.

Throw in a few hundred more cycles for whatever wrmsr idiocy is going
on somewhere in the process, and we're still nowhere near 21k cycles.

>> Is it possible that almost all of that latency is from deadline
>> expiration to C-state exit?
>
> No, I don't think so.  Marcelo confirmed that C-states are disabled, bt
> anyway none of the C-state latency matches Marcelo's data: C1 is really
> small (1 us), C1e is too large (~10 us).
>
> To see the effect of C-state exit, go to the plots I made on a normal
> laptop and see latency jumping up to 200000 or 400000 cycles
> (respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60
> and 80 us).
>
>> If so, can we teach the timer code to wake
>> up early to account for that?
>
> What, it doesn't already do that?

No clue.  My one machine that actually cares about this has C states
rather heavily tuned, so I wouldn't notice.

--Andy

>
> Paolo
Marcelo Tosatti Dec. 11, 2014, 9:27 p.m. UTC | #9
On Thu, Dec 11, 2014 at 01:16:52PM -0800, Andy Lutomirski wrote:
> On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> > On 11/12/2014 21:48, Andy Lutomirski wrote:
> >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> >>>>
> >>>>
> >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> >>>>> [...]
> >>>>
> >>>> What values are you using in practice for the parameter?
> >>>
> >>> 7us.
> >>
> >> It takes 7us to get from TSC deadline expiration to the *start* of
> >> vmresume?  That seems rather extreme.
> >
> > No, to the end.  7us is 21000 clock cycles, and the vmexit+vmentry alone
> > costs about 1300.
> >
> 
> I suspect that something's massively wrong with context switching,
> then -- it deserves to be considerably faster than that.  The
> architecturally expensive bits are vmresume, interrupt delivery, and
> iret, but iret is only ~300 cycles and interrupt delivery should be
> under 1k cycles.
> 
> Throw in a few hundred more cycles for whatever wrmsr idiocy is going
> on somewhere in the process, and we're still nowhere near 21k cycles.


          <idle>-0     [003] d..h2.. 1991756745496752: apic_timer_fn <-__run_hrtimer
          <idle>-0     [003] dN.h2.. 1991756745498732: tick_program_event <-hrtimer_interrupt
          <idle>-0     [003] d...3.. 1991756745502112: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=20114 next_prio=98
          <idle>-0     [003] d...2.. 1991756745502592: __context_tracking_task_switch <-__schedule
 qemu-system-x86-20114 [003] ....1.. 1991756745503916: kvm_arch_vcpu_load <-kvm_sched_in
 qemu-system-x86-20114 [003] ....... 1991756745505320: kvm_cpu_has_pending_timer <-kvm_vcpu_block
 qemu-system-x86-20114 [003] ....... 1991756745506260: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
 qemu-system-x86-20114 [003] ....... 1991756745507812: kvm_apic_accept_events <-kvm_arch_vcpu_ioctl_run
 qemu-system-x86-20114 [003] ....... 1991756745508100: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
 qemu-system-x86-20114 [003] ....... 1991756745508872: kvm_apic_accept_events <-vcpu_enter_guest
 qemu-system-x86-20114 [003] ....1.. 1991756745510040: vmx_save_host_state <-vcpu_enter_guest
 qemu-system-x86-20114 [003] d...2.. 1991756745511876: kvm_entry: vcpu 1


1991756745511876 - 1991756745496752 = 15124

The timestamps are TSC reads.
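
(At the 2873.492 MHz TSC frequency quoted in the follow-up, 15124 cycles is
roughly 5.3 us.)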

This is patched to run without ksoftirqd. Consider:

The LAPIC is programmed to the next earliest event by hrtimer_interrupt.
VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT.

Marcelo Tosatti Dec. 11, 2014, 9:29 p.m. UTC | #10
On Thu, Dec 11, 2014 at 07:27:17PM -0200, Marcelo Tosatti wrote:
> [...]
> 1991756745511876 - 1991756745496752 = 15124
> 
> The timestamps are TSC reads.
> 
> This is patched to run without ksoftirqd. Consider:
> 
> The LAPIC is programmed to the next earliest event by hrtimer_interrupt.
> VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT.
> 

model       : 58
model name  : Intel(R) Core(TM) i5-3470S CPU @ 2.90GHz
stepping    : 9
microcode   : 0x1b
cpu MHz     : 2873.492
cache size  : 6144 KB

Rik van Riel Dec. 11, 2014, 9:37 p.m. UTC | #11
On 12/11/2014 04:07 PM, Andy Lutomirski wrote:
> On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
>>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>>>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>>>
>>>>>
>>>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>>>> [...]
>>>>>
>>>>> What values are you using in practice for the parameter?
>>>>
>>>> 7us.
>>>
>>>
>>> It takes 7us to get from TSC deadline expiration to the *start* of
>>> vmresume?  That seems rather extreme.
>>>
>>> Is it possible that almost all of that latency is from deadline
>>> expiration to C-state exit?  If so, can we teach the timer code to wake
>>> up early to account for that?  We're supposed to know our idle exit
>>> latency these days.
>>
>> 7us includes:
>>
>> idle thread wakeup
>> idle schedout
>> ksoftirqd schedin
>> ksoftirqd schedout
>> qemu schedin
>> vm-entry
> 
> Is there some secret way to get perf to profile this?  Like some way
> to tell perf to only record samples after the IRQ fires and before
> vmresume?  Because 7 us seems waaaaay slower than it should be for
> this.
> 
> Yes, Rik, I know that we're wasting all kinds of time doing dumb
> things with xstate, but that shouldn't be anywhere near 7us on modern
> hardware, unless we're actually taking a device-not-available
> exception in there. :)  There might be a whopping size xstate
> operations, but those should be 60ns each, perhaps, if the state is
> dirty?
> 
> Maybe everything misses cache?

I don't expect the xstate stuff to be more than about half
a microsecond, with cache misses and failure to optimize
XSAVEOPT / XRSTOR across vmenter/vmexit.

We get another microsecond or so from not trapping from the
guest to the host every time the guest accesses the FPU for
the first time. Marcelo already has a hack for that in his
tree, and my series merely has something that achieves the
same in an automated (and hopefully upstreamable) way.


Radim Krčmář Dec. 12, 2014, 6:35 p.m. UTC | #12
2014-12-10 15:57-0500, Marcelo Tosatti:
> [...]
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>  
>  	if (waitqueue_active(q))
>  		wake_up_interruptible(q);
> +
> +	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)

timer_mode != timer_mode_mask.  (Please use apic_lvtt_tscdeadline().)

(So the code never waited for tsc_deadline >= guest_tsc ...
 I suppose it was possible to achieve lower latencies thanks to that.)
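
In other words, the check was presumably meant to read:

	if (apic_lvtt_tscdeadline(apic))
		ktimer->expired_tscdeadline = ktimer->tscdeadline;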

> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u64 guest_tsc, tsc_deadline;
> +
> +	if (!kvm_vcpu_has_lapic(vcpu))
> +		return;
> +
> +	if (!apic_lvtt_tscdeadline(apic))
> +		return;

(It is better to check expired_tscdeadline here and zero it later.)

> +
> +	if (!lapic_timer_int_injected(vcpu))
> +		return;
> +
> +	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> +	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +
> +	while (guest_tsc < tsc_deadline) {
> +		int delay = min(tsc_deadline - guest_tsc, 1000ULL);
> +
> +		ndelay(delay);

The delay is in nanoseconds, but you feed it a difference in TSC.
(And usually overestimate the time;  cpu_relax() loop seems easiest.)
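
The suggested cpu_relax()-based loop would look roughly like:

	while (guest_tsc < tsc_deadline) {
		cpu_relax();
		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
	}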

Patch

Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@ 
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@  static void apic_timer_expired(struct kv
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
 
 	/*
 	 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,58 @@  static void apic_timer_expired(struct kv
 
 	if (waitqueue_active(q))
 		wake_up_interruptible(q);
+
+	if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+	if (kvm_apic_hw_enabled(apic)) {
+		int vec = reg & APIC_VECTOR_MASK;
+
+		if (kvm_x86_ops->test_posted_interrupt)
+			return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+		else {
+			if (apic_test_vector(vec, apic->regs + APIC_ISR))
+				return true;
+		}
+	}
+	return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (!apic_lvtt_tscdeadline(apic))
+		return;
+
+	if (!lapic_timer_int_injected(vcpu))
+		return;
+
+	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+	while (guest_tsc < tsc_deadline) {
+		int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+		ndelay(delay);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	}
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
+
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1186,7 @@  static void start_apic_timer(struct kvm_
 		/* lapic timer in tsc deadline mode */
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
+		ktime_t expire;
 		struct kvm_vcpu *vcpu = apic->vcpu;
 		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
@@ -1151,8 +1201,10 @@  static void start_apic_timer(struct kvm_
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
+			expire = ktime_add_ns(now, ns);
+			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS);
 		} else
 			apic_timer_expired(apic);
 
Index: kvm/arch/x86/kvm/lapic.h
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@  struct kvm_timer {
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
+	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
 };
 
@@ -170,4 +171,6 @@  static inline bool kvm_apic_has_events(s
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@  EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
 static u32 tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
 static bool backwards_tsc_observed = false;
 
 #define KVM_NR_SHARED_MSRS 16
@@ -6311,6 +6315,7 @@  static int vcpu_enter_guest(struct kvm_v
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
+	wait_lapic_expire(vcpu);
 	kvm_x86_ops->run(vcpu);
 
 	/*
Index: kvm/arch/x86/kvm/x86.h
===================================================================
--- kvm.orig/arch/x86/kvm/x86.h
+++ kvm/arch/x86/kvm/x86.h
@@ -170,5 +170,7 @@  extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
+extern unsigned int lapic_timer_advance_ns;
+
 extern struct static_key kvm_no_apic_vcpu;
 #endif