diff mbox

[v2] KVM: lapic: remove timer spikes on target expectation

Message ID 20180422005320.24734-1-anthoine.bourgeois@blade-group.com (mailing list archive)
State New, archived
Headers show

Commit Message

Anthoine Bourgeois April 22, 2018, 12:53 a.m. UTC
Since the commit "8003c9ae204e: add APIC Timer periodic/oneshot mode VMX
preemption timer support", a Windows 10 guest has some erratic timer
spikes after a few hours.  As the uptime of the VM grows, the spikes get
larger.

Here are the results of 150000 iterations of a 1ms timer without any load:
	  Before 8003c9ae204e | After 8003c9ae204e
Max           1834us          |  86000us
Mean          1100us          |   1021us
Deviation       59us          |    149us
Here are the results of 150000 iterations of a 1ms timer with a cpu-z stress test:
	  Before 8003c9ae204e | After 8003c9ae204e
Max          32000us          | 140000us
Mean          1006us          |   1997us
Deviation      140us          |  11095us

This patch partially reverts that commit by removing the target timer
expiration tracking (target_expiration) and going back to plain hrtimer
calls.  The APIC Timer periodic/oneshot mode support is kept because it is
required by the new Windows Spring update.

v2: Check if the tsc deadline is already expired. Thank you Mika.

Cc: Mika Penttilä <mika.penttila@nextfour.com>
Signed-off-by: Anthoine Bourgeois <anthoine.bourgeois@blade-group.com>
---
 arch/x86/kvm/lapic.c | 57 +++++++++++++++++++++++++---------------------------
 arch/x86/kvm/lapic.h |  1 -
 2 files changed, 27 insertions(+), 31 deletions(-)

Comments

Paolo Bonzini April 23, 2018, 5:15 p.m. UTC | #1
On 22/04/2018 02:53, Anthoine Bourgeois wrote:
> Since the commit "8003c9ae204e: add APIC Timer periodic/oneshot mode VMX
> preemption timer support", a Windows 10 guest has some erratic timer
> spikes after few hours.  As the uptime of the VM grows the spikes are
> larger.
> 
> Here the results on a 150000 times 1ms timer without any load:
> 	  Before 8003c9ae204e | After 8003c9ae204e
> Max           1834us          |  86000us
> Mean          1100us          |   1021us
> Deviation       59us          |    149us
> Here the results on a 150000 times 1ms timer with a cpu-z stress test:
> 	  Before 8003c9ae204e | After 8003c9ae204e
> Max          32000us          | 140000us
> Mean          1006us          |   1997us
> Deviation      140us          |  11095us
> 
> The current patch partially revert the previous commit by removing the
> target timer expectation to go back to the straight hrtimer calls.  The
> APIC Timer periodic/oneshot mode support is kept because it is necessary
> on the new Windows Spring update.
> 
> v2: Check if the tsc deadline is already expired. Thank you Mika.
> 
> Cc: Mika Penttilä <mika.penttila@nextfour.com
> Signed-off-by: Anthoine Bourgeois <anthoine.bourgeois@blade-group.com>
> ---
>  arch/x86/kvm/lapic.c | 57 +++++++++++++++++++++++++---------------------------
>  arch/x86/kvm/lapic.h |  1 -
>  2 files changed, 27 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 70dcb5548022..8b5c2a69a3b6 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1173,7 +1173,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
>  
>  static u32 apic_get_tmcct(struct kvm_lapic *apic)
>  {
> -	ktime_t remaining, now;
> +	ktime_t remaining;
>  	s64 ns;
>  	u32 tmcct;
>  
> @@ -1184,8 +1184,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
>  		apic->lapic_timer.period == 0)
>  		return 0;
>  
> -	now = ktime_get();
> -	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
> +	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);

I'm confused, how can this work when the preemption timer is in use
(vcpu->arch.apic->lapic_timer.hv_timer_in_use is true)?

Paolo

>  	if (ktime_to_ns(remaining) < 0)
>  		remaining = 0;
>  
> @@ -1465,32 +1464,50 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
>  
>  static void start_sw_period(struct kvm_lapic *apic)
>  {
> +	ktime_t now;
> +
> +	/* lapic timer in oneshot or periodic mode */
> +	now = ktime_get();
> +	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
> +		    * APIC_BUS_CYCLE_NS * apic->divide_count;
> +
>  	if (!apic->lapic_timer.period)
>  		return;
>  
> +	limit_periodic_timer_frequency(apic);
> +
>  	if (apic_lvtt_oneshot(apic) &&
> -	    ktime_after(ktime_get(),
> -			apic->lapic_timer.target_expiration)) {
> +	    ktime_after(now,
> +			apic->lapic_timer.tscdeadline)) {
>  		apic_timer_expired(apic);
>  		return;
>  	}
>  
>  	hrtimer_start(&apic->lapic_timer.timer,
> -		apic->lapic_timer.target_expiration,
> +		ktime_add_ns(now, apic->lapic_timer.period),
>  		HRTIMER_MODE_ABS_PINNED);
> +
> +	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
> +		   PRIx64 ", "
> +		   "timer initial count 0x%x, period %lldns, "
> +		   "expire @ 0x%016" PRIx64 ".\n", __func__,
> +		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
> +		   kvm_lapic_get_reg(apic, APIC_TMICT),
> +		   apic->lapic_timer.period,
> +		   ktime_to_ns(ktime_add_ns(now,
> +				apic->lapic_timer.period)));
>  }
>  
>  static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
>  {
> -	ktime_t now, remaining;
> +	ktime_t remaining;
>  	u64 ns_remaining_old, ns_remaining_new;
>  
>  	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
>  		* APIC_BUS_CYCLE_NS * apic->divide_count;
>  	limit_periodic_timer_frequency(apic);
>  
> -	now = ktime_get();
> -	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
> +	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
>  	if (ktime_to_ns(remaining) < 0)
>  		remaining = 0;
>  
> @@ -1501,15 +1518,10 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso
>  	apic->lapic_timer.tscdeadline +=
>  		nsec_to_cycles(apic->vcpu, ns_remaining_new) -
>  		nsec_to_cycles(apic->vcpu, ns_remaining_old);
> -	apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
>  }
>  
>  static bool set_target_expiration(struct kvm_lapic *apic)
>  {
> -	ktime_t now;
> -	u64 tscl = rdtsc();
> -
> -	now = ktime_get();
>  	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
>  		* APIC_BUS_CYCLE_NS * apic->divide_count;
>  
> @@ -1520,19 +1532,8 @@ static bool set_target_expiration(struct kvm_lapic *apic)
>  
>  	limit_periodic_timer_frequency(apic);
>  
> -	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
> -		   PRIx64 ", "
> -		   "timer initial count 0x%x, period %lldns, "
> -		   "expire @ 0x%016" PRIx64 ".\n", __func__,
> -		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
> -		   kvm_lapic_get_reg(apic, APIC_TMICT),
> -		   apic->lapic_timer.period,
> -		   ktime_to_ns(ktime_add_ns(now,
> -				apic->lapic_timer.period)));
> -
> -	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
> +	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, rdtsc()) +
>  		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
> -	apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
>  
>  	return true;
>  }
> @@ -1541,9 +1542,6 @@ static void advance_periodic_target_expiration(struct kvm_lapic *apic)
>  {
>  	apic->lapic_timer.tscdeadline +=
>  		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
> -	apic->lapic_timer.target_expiration =
> -		ktime_add_ns(apic->lapic_timer.target_expiration,
> -				apic->lapic_timer.period);
>  }
>  
>  bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
> @@ -2216,7 +2214,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
>  			apic->lapic_timer.tscdeadline = 0;
>  		if (apic_lvtt_oneshot(apic)) {
>  			apic->lapic_timer.tscdeadline = 0;
> -			apic->lapic_timer.target_expiration = 0;
>  		}
>  		atomic_set(&apic->lapic_timer.pending, 0);
>  	}
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index edce055e9fd7..56823b159e9b 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -19,7 +19,6 @@
>  struct kvm_timer {
>  	struct hrtimer timer;
>  	s64 period; 				/* unit: ns */
> -	ktime_t target_expiration;
>  	u32 timer_mode;
>  	u32 timer_mode_mask;
>  	u64 tscdeadline;
>
Anthoine Bourgeois April 23, 2018, 9:05 p.m. UTC | #2
On Mon, Apr 23, 2018 at 07:15:58PM +0200, Paolo Bonzini wrote:
>On 22/04/2018 02:53, Anthoine Bourgeois wrote:
>> Since the commit "8003c9ae204e: add APIC Timer periodic/oneshot mode VMX
>> preemption timer support", a Windows 10 guest has some erratic timer
>> spikes after few hours.  As the uptime of the VM grows the spikes are
>> larger.
>>
>> Here the results on a 150000 times 1ms timer without any load:
>> 	  Before 8003c9ae204e | After 8003c9ae204e
>> Max           1834us          |  86000us
>> Mean          1100us          |   1021us
>> Deviation       59us          |    149us
>> Here the results on a 150000 times 1ms timer with a cpu-z stress test:
>> 	  Before 8003c9ae204e | After 8003c9ae204e
>> Max          32000us          | 140000us
>> Mean          1006us          |   1997us
>> Deviation      140us          |  11095us
>>
>> The current patch partially revert the previous commit by removing the
>> target timer expectation to go back to the straight hrtimer calls.  The
>> APIC Timer periodic/oneshot mode support is kept because it is necessary
>> on the new Windows Spring update.
>>
>> v2: Check if the tsc deadline is already expired. Thank you Mika.
>>
>> Cc: Mika Penttilä <mika.penttila@nextfour.com
>> Signed-off-by: Anthoine Bourgeois <anthoine.bourgeois@blade-group.com>
>> ---
>>  arch/x86/kvm/lapic.c | 57 +++++++++++++++++++++++++---------------------------
>>  arch/x86/kvm/lapic.h |  1 -
>>  2 files changed, 27 insertions(+), 31 deletions(-)
>>
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 70dcb5548022..8b5c2a69a3b6 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -1173,7 +1173,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
>>
>>  static u32 apic_get_tmcct(struct kvm_lapic *apic)
>>  {
>> -	ktime_t remaining, now;
>> +	ktime_t remaining;
>>  	s64 ns;
>>  	u32 tmcct;
>>
>> @@ -1184,8 +1184,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
>>  		apic->lapic_timer.period == 0)
>>  		return 0;
>>
>> -	now = ktime_get();
>> -	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
>> +	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
>
>I'm confused, how can this work when the preemption timer is in use
>(vcpu->arch.apic->lapic_timer.hv_timer_in_use is true)?
>

I don't really know, this hunk is only a revert that works for me.

I'm still looking for the root cause. My guess is that the
target_expiration variable is sometimes miscomputed.
What I see is that the spikes grow linearly over time, at a rate of 1ms
more every 1 minute 30 seconds.

Anthoine
Paolo Bonzini April 24, 2018, 8:19 a.m. UTC | #3
On 23/04/2018 23:05, Anthoine Bourgeois wrote:
>>>
>>>
>>> -    now = ktime_get();
>>> -    remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
>>> +    remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
>>
>> I'm confused, how can this work when the preemption timer is in use
>> (vcpu->arch.apic->lapic_timer.hv_timer_in_use is true)?
>>
> 
> I don't really know, this hunk is only a revert that works for me.

If you don't revert the whole patch, you should try and understand how
the hunks you revert interact with those that you leave in place!

> I'm still seeking what is the root cause. My guest is the
> target_expiration variable is mis compute sometimes.

Yes, that is likely.

Paolo

> What I see is the spikes are linear over time at the rate of 1ms more
> every 1 minutes 30 seconds.
Anthoine Bourgeois April 24, 2018, 10:06 p.m. UTC | #4
On Tue, Apr 24, 2018 at 10:19:27AM +0200, Paolo Bonzini wrote:
>On 23/04/2018 23:05, Anthoine Bourgeois wrote:
>>>>
>>>>
>>>> -    now = ktime_get();
>>>> -    remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
>>>> +    remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
>>>
>>> I'm confused, how can this work when the preemption timer is in use
>>> (vcpu->arch.apic->lapic_timer.hv_timer_in_use is true)?
>>>
>>
>> I don't really know, this hunk is only a revert that works for me.
>
>If you don't revert the whole patch, you should try and understand how
>the hunks you revert interact with those that you leave in place!
>
>> I'm still seeking what is the root cause. My guest is the
>> target_expiration variable is mis compute sometimes.
>
>Yes, that is likely.

In start_sw_period, the target_expiration is sometimes in the past (i.e.
less than ktime_get()). Most of the time hrtimer_start is quick to call
the timer function, within a few microseconds, but sometimes it can take
milliseconds. Maybe a check like the one start_sw_tscdeadline does, to
expire the timer immediately, would be good.

On the other hand, I saw that target_expiration is sometimes further in
the future than the next period (I compare target_expiration > now + period
in start_sw_period). Do you think this is normal, or should
target_expiration be adjusted to avoid a missed timer?

Anthoine
Wanpeng Li April 30, 2018, 2:07 a.m. UTC | #5
2018-04-25 6:06 GMT+08:00 Anthoine Bourgeois
<anthoine.bourgeois@blade-group.com>:
> On Tue, Apr 24, 2018 at 10:19:27AM +0200, Paolo Bonzini wrote:
> On another way, I saw that target_expiration is sometimes in a future
> farer than the next period (I compare target_expiration > now + period
> in start_sw_period). Do you think it is a normal thing or
> target_expiration should be adjust to avoid a timer miss ?

How do you handle this in v4?

Regards,
Wanpeng Li
Anthoine Bourgeois April 30, 2018, 9:10 a.m. UTC | #6
On Mon, Apr 30, 2018 at 10:07:08AM +0800, Wanpeng Li wrote:
>2018-04-25 6:06 GMT+08:00 Anthoine Bourgeois
><anthoine.bourgeois@blade-group.com>:
>> On Tue, Apr 24, 2018 at 10:19:27AM +0200, Paolo Bonzini wrote:
>> On another way, I saw that target_expiration is sometimes in a future
>> farer than the next period (I compare target_expiration > now + period
>> in start_sw_period). Do you think it is a normal thing or
>> target_expiration should be adjust to avoid a timer miss ?
>
>How you handle this in v4?

I didn't handle this in any version of the patch. It is hard to
reproduce and the jitter was always close to the deadline.
I never proved that this case distorts my tests. I will keep digging in
that direction, but it will be a separate patch if necessary.

Thank you for your patch on min_timer_period_us.

Anthoine
diff mbox

Patch

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 70dcb5548022..8b5c2a69a3b6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1173,7 +1173,7 @@  static void apic_send_ipi(struct kvm_lapic *apic)
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
 {
-	ktime_t remaining, now;
+	ktime_t remaining;
 	s64 ns;
 	u32 tmcct;
 
@@ -1184,8 +1184,7 @@  static u32 apic_get_tmcct(struct kvm_lapic *apic)
 		apic->lapic_timer.period == 0)
 		return 0;
 
-	now = ktime_get();
-	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
 	if (ktime_to_ns(remaining) < 0)
 		remaining = 0;
 
@@ -1465,32 +1464,50 @@  static void start_sw_tscdeadline(struct kvm_lapic *apic)
 
 static void start_sw_period(struct kvm_lapic *apic)
 {
+	ktime_t now;
+
+	/* lapic timer in oneshot or periodic mode */
+	now = ktime_get();
+	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+		    * APIC_BUS_CYCLE_NS * apic->divide_count;
+
 	if (!apic->lapic_timer.period)
 		return;
 
+	limit_periodic_timer_frequency(apic);
+
 	if (apic_lvtt_oneshot(apic) &&
-	    ktime_after(ktime_get(),
-			apic->lapic_timer.target_expiration)) {
+	    ktime_after(now,
+			apic->lapic_timer.tscdeadline)) {
 		apic_timer_expired(apic);
 		return;
 	}
 
 	hrtimer_start(&apic->lapic_timer.timer,
-		apic->lapic_timer.target_expiration,
+		ktime_add_ns(now, apic->lapic_timer.period),
 		HRTIMER_MODE_ABS_PINNED);
+
+	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+		   PRIx64 ", "
+		   "timer initial count 0x%x, period %lldns, "
+		   "expire @ 0x%016" PRIx64 ".\n", __func__,
+		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+		   kvm_lapic_get_reg(apic, APIC_TMICT),
+		   apic->lapic_timer.period,
+		   ktime_to_ns(ktime_add_ns(now,
+				apic->lapic_timer.period)));
 }
 
 static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
 {
-	ktime_t now, remaining;
+	ktime_t remaining;
 	u64 ns_remaining_old, ns_remaining_new;
 
 	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
 		* APIC_BUS_CYCLE_NS * apic->divide_count;
 	limit_periodic_timer_frequency(apic);
 
-	now = ktime_get();
-	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
 	if (ktime_to_ns(remaining) < 0)
 		remaining = 0;
 
@@ -1501,15 +1518,10 @@  static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso
 	apic->lapic_timer.tscdeadline +=
 		nsec_to_cycles(apic->vcpu, ns_remaining_new) -
 		nsec_to_cycles(apic->vcpu, ns_remaining_old);
-	apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
 }
 
 static bool set_target_expiration(struct kvm_lapic *apic)
 {
-	ktime_t now;
-	u64 tscl = rdtsc();
-
-	now = ktime_get();
 	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
 		* APIC_BUS_CYCLE_NS * apic->divide_count;
 
@@ -1520,19 +1532,8 @@  static bool set_target_expiration(struct kvm_lapic *apic)
 
 	limit_periodic_timer_frequency(apic);
 
-	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
-		   PRIx64 ", "
-		   "timer initial count 0x%x, period %lldns, "
-		   "expire @ 0x%016" PRIx64 ".\n", __func__,
-		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
-		   kvm_lapic_get_reg(apic, APIC_TMICT),
-		   apic->lapic_timer.period,
-		   ktime_to_ns(ktime_add_ns(now,
-				apic->lapic_timer.period)));
-
-	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, rdtsc()) +
 		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
-	apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
 
 	return true;
 }
@@ -1541,9 +1542,6 @@  static void advance_periodic_target_expiration(struct kvm_lapic *apic)
 {
 	apic->lapic_timer.tscdeadline +=
 		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
-	apic->lapic_timer.target_expiration =
-		ktime_add_ns(apic->lapic_timer.target_expiration,
-				apic->lapic_timer.period);
 }
 
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
@@ -2216,7 +2214,6 @@  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 			apic->lapic_timer.tscdeadline = 0;
 		if (apic_lvtt_oneshot(apic)) {
 			apic->lapic_timer.tscdeadline = 0;
-			apic->lapic_timer.target_expiration = 0;
 		}
 		atomic_set(&apic->lapic_timer.pending, 0);
 	}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index edce055e9fd7..56823b159e9b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -19,7 +19,6 @@ 
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
-	ktime_t target_expiration;
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;