
[4/8] KVM: x86: replace hrtimer based timer emulation

Message ID 20090706015812.786509491@localhost.localdomain (mailing list archive)
State New, archived

Commit Message

Marcelo Tosatti July 6, 2009, 1:55 a.m. UTC
Replace hrtimer based timer emulation with host timebase (ktime_t)
comparisons on guest entry.

This avoids host load when guests are scheduled out, removes a
spinlock acquisition on entry (i8254.c's inject_lock), and makes future
improvements easier.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
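
For reference, the core of the replacement, condensed from the timer.c hunk
further down (same names, not a separate implementation): a timer's next
expiry is derived purely from host-timebase bookkeeping, so nothing has to
fire on the host while the guest is scheduled out, and expired events are
injected on the next guest entry.

/*
 * Condensed from the new timer.c below: for a periodic timer the next
 * expiry follows from when the count was loaded and how many events the
 * guest has already acked, so no hrtimer needs to run in the meantime.
 */
ktime_t kvm_timer_next_event(struct kvm_timer *ktimer)
{
	if (!ktimer->periodic)
		return ktime_add_ns(ktimer->count_load_time, ktimer->period);

	/* expiry of the first event the guest has not yet acked */
	return ktime_add_ns(ktimer->count_load_time,
			    (ktimer->acked_events + 1) * ktimer->period);
}

/*
 * On guest entry, kvm_inject_pending_timer_irqs() walks vcpu->arch.timers
 * and injects every timer whose next event is at or before ktime_get().
 */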

Comments

Gleb Natapov July 8, 2009, 12:58 p.m. UTC | #1
Excellent patch series.

On Sun, Jul 05, 2009 at 10:55:15PM -0300, Marcelo Tosatti wrote:
>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
>  {
> -	int ret;
> +	ktime_t now, expires;
>  
> -	ret = pit_has_pending_timer(vcpu);
> -	ret |= apic_has_pending_timer(vcpu);
> +	expires = kvm_vcpu_next_timer_event(vcpu);
> +	now = ktime_get();
> +	if (expires.tv64 <= now.tv64) {
> +		if (kvm_arch_interrupt_allowed(vcpu))
> +			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
You shouldn't unhalt the vcpu here. Not every timer event generates an
interrupt (the vector can be masked in the pic/ioapic), and a timer event can
generate an NMI instead of an interrupt. Leaving this code out probably means
that you can't remove the kvm_inject_pending_timer_irqs() call from
__vcpu_run(). (A rough sketch of the alternative follows the quoted hunk
below.)

> +		return 1;
> +	}
>  
> -	return ret;
> +	return 0;
>  }
>  EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
>  
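
A rough sketch of the more conservative variant suggested above (hypothetical,
untested, not part of the posted patch): report only that a timer event is
due, and leave unhalting to the delivery path, which knows whether the event
becomes a maskable interrupt, an NMI, or nothing at all.

/*
 * Hypothetical variant (not in the posted patch): only report that a timer
 * event is due; the vcpu is unhalted by whatever path actually delivers an
 * interrupt, since a due event may be masked or delivered as an NMI.
 */
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
	ktime_t now = ktime_get();
	ktime_t expires = kvm_vcpu_next_timer_event(vcpu);

	return expires.tv64 <= now.tv64;
}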

--
			Gleb.
Avi Kivity July 8, 2009, 1:41 p.m. UTC | #2
On 07/06/2009 04:55 AM, Marcelo Tosatti wrote:
> Replace hrtimer based timer emulation with host timebase (ktime_t)
> comparisons on guest entry.
>
> This avoids host load when guests are scheduled out, removes a
> spinlock acquision on entry (i8254.c's inject_lock), and makes future
> improvements easier.
>    

I wonder if we're really winning with this.  Guests should be 
scheduled-out-but-not-halted rarely, and in all other cases we need to 
keep the timer.  A timer comparison on each guest entry might be 
expensive (maybe not so much with tsc based timers).
Marcelo Tosatti July 8, 2009, 4:24 p.m. UTC | #3
On Wed, Jul 08, 2009 at 04:41:22PM +0300, Avi Kivity wrote:
> On 07/06/2009 04:55 AM, Marcelo Tosatti wrote:
>> Replace hrtimer based timer emulation with host timebase (ktime_t)
>> comparisons on guest entry.
>>
>> This avoids host load when guests are scheduled out, removes a
>> spinlock acquision on entry (i8254.c's inject_lock), and makes future
>> improvements easier.
>>    
>
> I wonder if we're really winning with this.  Guests should be  
> scheduled-out-but-not-halted rarely, and in all other cases we need to  
> keep the timer.  A timer comparison on each guest entry might be  
> expensive (maybe not so much with tsc based timers).

Any activity outside of guest mode that takes more than the period of
the timer (think 1000Hz) causes unnecessary host load.

Booting a RHEL5 UP guest without VNC or serial output on an idle host:

timer_int_normal=95416 timer_interrupt_accumulated=873

(and it continues to increase at that rate, roughly 1%).

Now factor in multiple guests and a loaded host, and you'll probably see
more than that.
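
The mechanism behind those numbers is the hrtimer callback this series
removes (condensed below from the timer.c hunk further down): it fires and
rearms on the host for every tick, whether or not the vcpu is in guest mode.

/*
 * Condensed from the removed __kvm_timer_fn()/kvm_timer_fn() pair: each
 * tick marks the event pending, wakes the vcpu and, for periodic timers,
 * rearms the hrtimer -- all of it host work, even while the guest is
 * scheduled out.
 */
static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
{
	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
		atomic_inc(&ktimer->pending);
		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
	}

	if (waitqueue_active(&vcpu->wq))
		wake_up_interruptible(&vcpu->wq);

	if (ktimer->periodic) {
		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
		return 1;		/* wrapper returns HRTIMER_RESTART */
	}
	return 0;			/* wrapper returns HRTIMER_NORESTART */
}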

Avi Kivity July 8, 2009, 4:36 p.m. UTC | #4
On 07/08/2009 07:24 PM, Marcelo Tosatti wrote:
>> I wonder if we're really winning with this.  Guests should be
>> scheduled-out-but-not-halted rarely, and in all other cases we need to
>> keep the timer.  A timer comparison on each guest entry might be
>> expensive (maybe not so much with tsc based timers).
>>      
>
> Any activity outside of guest mode that takes more than the period of
> the timer (think 1000Hz) causes unnecessary host load.
>
> Booting a RHEL5 UP without VNC or serial output on an idle host:
>
> timer_int_normal=95416 timer_interrupt_accumulated=873
>
> (and it continues to increase in that rate, roughly 1%).
>
> Now factor in multiple guests, loaded host, and you'll probably see more
> than that.
>    

If you're using qcow, you may be seeing the non-aio accesses.
Otherwise, I can't think what could cause 1ms of latency. A random host
process? A long mmu resync?

Patch

Index: kvm-new/arch/x86/kvm/x86.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/x86.c
+++ kvm-new/arch/x86/kvm/x86.c
@@ -3461,8 +3461,6 @@  static int vcpu_enter_guest(struct kvm_v
 		goto out;
 
 	if (vcpu->requests) {
-		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
-			__kvm_migrate_timers(vcpu);
 		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
 			kvm_write_guest_time(vcpu);
 		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
@@ -3482,6 +3480,9 @@  static int vcpu_enter_guest(struct kvm_v
 		}
 	}
 
+
+	kvm_inject_pending_timer_irqs(vcpu);
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3499,6 +3500,8 @@  static int vcpu_enter_guest(struct kvm_v
 		goto out;
 	}
 
+	kvm_vcpu_arm_exit(vcpu);
+
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else
@@ -3564,6 +3567,8 @@  static int vcpu_enter_guest(struct kvm_v
 
 	preempt_enable();
 
+	kvm_vcpu_cleanup_timer(vcpu);
+
 	down_read(&vcpu->kvm->slots_lock);
 
 	/*
@@ -3627,10 +3632,6 @@  static int __vcpu_run(struct kvm_vcpu *v
 		if (r <= 0)
 			break;
 
-		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-		if (kvm_cpu_has_pending_timer(vcpu))
-			kvm_inject_pending_timer_irqs(vcpu);
-
 		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 			r = -EINTR;
 			kvm_run->exit_reason = KVM_EXIT_INTR;
@@ -4579,6 +4580,8 @@  int kvm_arch_vcpu_setup(struct kvm_vcpu 
 	if (r < 0)
 		goto free_vcpu;
 
+	kvm_vcpu_init_armed_exit(vcpu);
+
 	if (kvm->arch.vpit && kvm_vcpu_is_bsp(vcpu))
 		kvm_timer_vcpu_bind(&kvm->arch.vpit->pit_state.pit_timer, vcpu);
 
Index: kvm-new/virt/kvm/kvm_main.c
===================================================================
--- kvm-new.orig/virt/kvm/kvm_main.c
+++ kvm-new/virt/kvm/kvm_main.c
@@ -1656,11 +1656,19 @@  void mark_page_dirty(struct kvm *kvm, gf
 	}
 }
 
+#ifndef KVM_ARCH_HAVE_TIMER_EVENT
+ktime_t kvm_vcpu_next_timer_event(struct kvm_vcpu *vcpu)
+{
+	return (ktime_t) { .tv64 = KTIME_MAX };
+}
+#endif
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
+	ktime_t expires;
 	DEFINE_WAIT(wait);
 
 	for (;;) {
@@ -1677,8 +1685,9 @@  void kvm_vcpu_block(struct kvm_vcpu *vcp
 		if (signal_pending(current))
 			break;
 
+		expires = kvm_vcpu_next_timer_event(vcpu);
 		vcpu_put(vcpu);
-		schedule();
+		schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
 		vcpu_load(vcpu);
 	}
 
Index: kvm-new/arch/x86/kvm/i8254.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/i8254.c
+++ kvm-new/arch/x86/kvm/i8254.c
@@ -224,15 +224,6 @@  static void pit_latch_status(struct kvm 
 	}
 }
 
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
-	if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
-		return kvm_timer_has_pending(&pit->pit_state.pit_timer);
-	return 0;
-}
-
 static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
@@ -548,6 +539,36 @@  static const struct kvm_io_device_ops sp
 	.write    = speaker_ioport_write,
 };
 
+static void pit_inject(struct kvm_timer *ktimer)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm = ktimer->kvm;
+
+	mutex_lock(&kvm->irq_lock);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+	mutex_unlock(&kvm->irq_lock);
+
+	/*
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation is simplified, only
+	 * propagating PIT interrupts to all VCPUs when they have set
+	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+	 * VCPU0, and only if its LVT0 is in EXTINT mode.
+	 */
+	if (kvm->arch.vapics_in_nmi_mode > 0)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvm_apic_nmi_wd_deliver(vcpu);
+}
+
+struct kvm_timer_ops kpit_ops = {
+	.inject = pit_inject,
+	.name = "pit",
+};
+
 /* Caller must have writers lock on slots_lock */
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
@@ -573,7 +594,7 @@  struct kvm_pit *kvm_create_pit(struct kv
 
 	pit_state = &pit->pit_state;
 	pit_state->pit = pit;
-	kvm_timer_init(kvm, &pit_state->pit_timer);
+	kvm_timer_init(kvm, &pit_state->pit_timer, &kpit_ops);
 
 	pit_state->irq_ack_notifier.gsi = 0;
 	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
@@ -610,50 +631,3 @@  void kvm_free_pit(struct kvm *kvm)
 	}
 }
 
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	mutex_lock(&kvm->irq_lock);
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-	mutex_unlock(&kvm->irq_lock);
-
-	/*
-	 * Provides NMI watchdog support via Virtual Wire mode.
-	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-	 *
-	 * Note: Our Virtual Wire implementation is simplified, only
-	 * propagating PIT interrupts to all VCPUs when they have set
-	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-	 * VCPU0, and only if its LVT0 is in EXTINT mode.
-	 */
-	if (kvm->arch.vapics_in_nmi_mode > 0)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_kpit_state *ps;
-
-	if (vcpu && pit) {
-		int inject = 0;
-		ps = &pit->pit_state;
-
-		/* Try to inject pending interrupts when
-		 * last one has been acked.
-		 */
-		spin_lock(&ps->inject_lock);
-		if (kvm_timer_has_pending(&ps->pit_timer) && ps->irq_ack) {
-			ps->irq_ack = 0;
-			inject = 1;
-		}
-		spin_unlock(&ps->inject_lock);
-		if (inject)
-			__inject_pit_timer_intr(kvm);
-	}
-}
Index: kvm-new/arch/x86/kvm/lapic.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/lapic.c
+++ kvm-new/arch/x86/kvm/lapic.c
@@ -875,16 +875,6 @@  int kvm_lapic_enabled(struct kvm_vcpu *v
  *----------------------------------------------------------------------
  */
 
-int apic_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *lapic = vcpu->arch.apic;
-
-	if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
-		return kvm_timer_has_pending(&lapic->lapic_timer);
-
-	return 0;
-}
-
 static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
 	u32 reg = apic_get_reg(apic, lvt_type);
@@ -912,6 +902,20 @@  static const struct kvm_io_device_ops ap
 	.write    = apic_mmio_write,
 };
 
+void inject_lapic_timer(struct kvm_timer *ktimer)
+{
+	struct kvm_vcpu *vcpu = ktimer->vcpu;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic)
+		kvm_apic_local_deliver(apic, APIC_LVTT);
+}
+
+struct kvm_timer_ops lapic_timer_ops = {
+	.inject = inject_lapic_timer,
+	.name = "lapic",
+};
+
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic;
@@ -935,7 +939,7 @@  int kvm_create_lapic(struct kvm_vcpu *vc
 	memset(apic->regs, 0, PAGE_SIZE);
 	apic->vcpu = vcpu;
 
-	kvm_timer_init(vcpu->kvm, &apic->lapic_timer);
+	kvm_timer_init(vcpu->kvm, &apic->lapic_timer, &lapic_timer_ops);
 	kvm_timer_vcpu_bind(&apic->lapic_timer, vcpu);
 
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
@@ -982,14 +986,6 @@  int kvm_apic_accept_pic_intr(struct kvm_
 	return r;
 }
 
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	if (apic && kvm_timer_has_pending(&apic->lapic_timer))
-		kvm_apic_local_deliver(apic, APIC_LVTT);
-}
-
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
 	int vector = kvm_apic_has_interrupt(vcpu);
Index: kvm-new/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm-new.orig/arch/x86/include/asm/kvm_host.h
+++ kvm-new/arch/x86/include/asm/kvm_host.h
@@ -377,6 +377,7 @@  struct kvm_vcpu_arch {
 	u64 *mce_banks;
 
 	struct list_head timers;
+	struct hrtimer exit_timer;
 };
 
 struct kvm_mem_alias {
@@ -800,4 +801,7 @@  int kvm_unmap_hva(struct kvm *kvm, unsig
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 
+#define KVM_ARCH_HAVE_TIMER_EVENT
+ktime_t kvm_vcpu_next_timer_event(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
Index: kvm-new/arch/x86/kvm/irq.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/irq.c
+++ kvm-new/arch/x86/kvm/irq.c
@@ -26,18 +26,19 @@ 
 #include "i8254.h"
 #include "x86.h"
 
-/*
- * check if there are pending timer events
- * to be processed.
- */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	int ret;
+	ktime_t now, expires;
 
-	ret = pit_has_pending_timer(vcpu);
-	ret |= apic_has_pending_timer(vcpu);
+	expires = kvm_vcpu_next_timer_event(vcpu);
+	now = ktime_get();
+	if (expires.tv64 <= now.tv64) {
+		if (kvm_arch_interrupt_allowed(vcpu))
+			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		return 1;
+	}
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
@@ -86,36 +87,3 @@  int kvm_cpu_get_interrupt(struct kvm_vcp
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	kvm_inject_apic_timer_irqs(vcpu);
-	kvm_inject_pit_timer_irqs(vcpu);
-	/* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-
-static void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	if (!apic)
-		return;
-
-	kvm_migrate_timer(&apic->lapic_timer);
-}
-
-static void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
-	if (!kvm_vcpu_is_bsp(vcpu) || !pit)
-		return;
-
-	kvm_migrate_timer(&pit->pit_state.pit_timer);
-}
-
-void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
-{
-	__kvm_migrate_apic_timer(vcpu);
-	__kvm_migrate_pit_timer(vcpu);
-}
Index: kvm-new/arch/x86/kvm/svm.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/svm.c
+++ kvm-new/arch/x86/kvm/svm.c
@@ -738,7 +738,6 @@  static void svm_vcpu_load(struct kvm_vcp
 		delta = vcpu->arch.host_tsc - tsc_this;
 		svm->vmcb->control.tsc_offset += delta;
 		vcpu->cpu = cpu;
-		kvm_migrate_timers(vcpu);
 	}
 
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
Index: kvm-new/arch/x86/kvm/vmx.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/vmx.c
+++ kvm-new/arch/x86/kvm/vmx.c
@@ -703,7 +703,6 @@  static void vmx_vcpu_load(struct kvm_vcp
 
 	if (vcpu->cpu != cpu) {
 		vcpu_clear(vmx);
-		kvm_migrate_timers(vcpu);
 		vpid_sync_vcpu_all(vmx);
 		local_irq_disable();
 		list_add(&vmx->local_vcpus_link,
Index: kvm-new/arch/x86/kvm/kvm_timer.h
===================================================================
--- kvm-new.orig/arch/x86/kvm/kvm_timer.h
+++ kvm-new/arch/x86/kvm/kvm_timer.h
@@ -1,26 +1,41 @@ 
+struct kvm_timer_ops;
 
 struct kvm_timer {
-	struct hrtimer timer;
-	s64 period; 			/* unit: ns */
-	atomic_t pending;		/* accumulated triggered timers */
+	ktime_t count_load_time;
+	ktime_t inject_time;
+	u64 period; 				/* unit: ns */
+	u64 acked_events;
+
+	bool can_inject;
 	bool reinject;
 	bool periodic;
+
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	struct list_head vcpu_timer;
+	struct kvm_timer_ops *ops;
 };
 
-void kvm_timer_init(struct kvm *kvm, struct kvm_timer *ktimer);
+struct kvm_timer_ops {
+	void (*inject)(struct kvm_timer *);
+	char *name;
+};
+
+void kvm_timer_init(struct kvm *kvm, struct kvm_timer *ktimer,
+		    struct kvm_timer_ops *ops);
+void kvm_timer_vcpu_bind(struct kvm_timer *ktimer, struct kvm_vcpu *vcpu);
 void kvm_timer_start(struct kvm_timer *ktimer, u64 interval, bool periodic);
 void kvm_timer_cancel(struct kvm_timer *ktimer);
-void kvm_timer_vcpu_bind(struct kvm_timer *ktimer, struct kvm_vcpu *vcpu);
-
 int kvm_timer_has_pending(struct kvm_timer *ktimer);
 void kvm_timer_ack(struct kvm_timer *ktimer);
 void kvm_timer_reset(struct kvm_timer *ktimer);
 
 void kvm_migrate_timer(struct kvm_timer *ktimer);
 
+void kvm_vcpu_init_armed_exit(struct kvm_vcpu *vcpu);
 
-ktime_t kvm_timer_remaining(struct kvm_timer *ktimer);
+void kvm_vcpu_arm_exit(struct kvm_vcpu *vcpu);
+void kvm_vcpu_cleanup_timer(struct kvm_vcpu *vcpu);
 
+ktime_t kvm_timer_next_event(struct kvm_timer *ktimer);
+ktime_t kvm_timer_remaining(struct kvm_timer *ktimer);
Index: kvm-new/arch/x86/kvm/timer.c
===================================================================
--- kvm-new.orig/arch/x86/kvm/timer.c
+++ kvm-new/arch/x86/kvm/timer.c
@@ -1,107 +1,176 @@ 
+/*
+ *
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/hrtimer.h>
 #include <asm/atomic.h>
 #include "kvm_timer.h"
 
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
-{
-	int restart_timer = 0;
-	wait_queue_head_t *q = &vcpu->wq;
-
-	/*
-	 * There is a race window between reading and incrementing, but we do
-	 * not care about potentially loosing timer events in the !reinject
-	 * case anyway.
-	 */
-	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
-		atomic_inc(&ktimer->pending);
-		/* FIXME: this code should not know anything about vcpus */
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-	}
-
-	if (waitqueue_active(q))
-		wake_up_interruptible(q);
-
-	if (ktimer->periodic) {
-		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
-		restart_timer = 1;
-	}
-
-	return restart_timer;
-}
-
-static enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-	int restart_timer;
-	struct kvm_vcpu *vcpu;
-	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
-	vcpu = ktimer->vcpu;
-	if (!vcpu)
-		return HRTIMER_NORESTART;
-
-	restart_timer = __kvm_timer_fn(vcpu, ktimer);
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
 
-void kvm_timer_init(struct kvm *kvm, struct kvm_timer *ktimer)
+void kvm_timer_init(struct kvm *kvm, struct kvm_timer *ktimer,
+		    struct kvm_timer_ops *ops)
 {
 	ktimer->kvm = kvm;
-	hrtimer_init(&ktimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	ktimer->timer.function = kvm_timer_fn;
 	INIT_LIST_HEAD(&ktimer->vcpu_timer);
+	ktimer->ops = ops;
+	ktimer->can_inject = false;
 }
 
 void kvm_timer_vcpu_bind(struct kvm_timer *ktimer, struct kvm_vcpu *vcpu)
 {
 	ktimer->vcpu = vcpu;
-	list_add(&ktimer->vcpu_timer, &vcpu->arch.timers);
 }
 
 void kvm_timer_start(struct kvm_timer *ktimer, u64 interval, bool periodic)
 {
-	hrtimer_cancel(&ktimer->timer);
-	atomic_set(&ktimer->pending, 0);
 	ktimer->periodic = periodic;
 	ktimer->period = interval;
-	hrtimer_start(&ktimer->timer, ktime_add_ns(ktime_get(), interval),
-			HRTIMER_MODE_ABS);
+	ktimer->count_load_time = ktime_get();
+	ktimer->acked_events = 0;
+	ktimer->can_inject = true;
+
+	WARN_ON(interval == 0);
+
+	list_add(&ktimer->vcpu_timer, &ktimer->vcpu->arch.timers);
 }
 
 void kvm_timer_cancel(struct kvm_timer *ktimer)
 {
-	hrtimer_cancel(&ktimer->timer);
-	atomic_set(&ktimer->pending, 0);
+	if (!list_empty(&ktimer->vcpu_timer))
+		list_del_init(&ktimer->vcpu_timer);
 }
 
-int kvm_timer_has_pending(struct kvm_timer *ktimer)
+void kvm_timer_reset(struct kvm_timer *ktimer)
 {
-	return atomic_read(&ktimer->pending);
+	ktimer->can_inject = true;
 }
 
 void kvm_timer_ack(struct kvm_timer *ktimer)
 {
-	if (atomic_dec_return(&ktimer->pending) < 0)
-		atomic_inc(&ktimer->pending);
+	ktimer->acked_events++;
+	ktimer->can_inject = true;
 }
 
-void kvm_timer_reset(struct kvm_timer *ktimer)
+static ktime_t periodic_timer_next_event(struct kvm_timer *ktimer)
 {
-	atomic_set(&ktimer->pending, 0);
+	ktime_t last_acked_event;
+
+	last_acked_event = ktime_add_ns(ktimer->count_load_time,
+					ktimer->acked_events * ktimer->period);
+
+	return ktime_add_ns(last_acked_event, ktimer->period);
 }
 
-void kvm_migrate_timer(struct kvm_timer *ktimer)
+ktime_t kvm_timer_next_event(struct kvm_timer *ktimer)
 {
-	if (hrtimer_cancel(&ktimer->timer))
-		hrtimer_start_expires(&ktimer->timer, HRTIMER_MODE_ABS);
+	if (!ktimer->periodic)
+		return ktime_add_ns(ktimer->count_load_time, ktimer->period);
+	else
+		return periodic_timer_next_event(ktimer);
 }
 
 ktime_t kvm_timer_remaining(struct kvm_timer *ktimer)
 {
-	return hrtimer_expires_remaining(&ktimer->timer);
+	ktime_t now = ktime_get();
+
+	return ktime_sub(kvm_timer_next_event(ktimer), now);
 }
 
+struct kvm_timer *kvm_vcpu_injectable_timer_event(struct kvm_vcpu *vcpu)
+{
+	struct kvm_timer *ktimer, *ktimer_expire = NULL;
+	ktime_t expires = { .tv64 = KTIME_MAX };
+
+	list_for_each_entry(ktimer, &vcpu->arch.timers, vcpu_timer) {
+		ktime_t this_expires = { .tv64 = KTIME_MAX };
+
+		if (ktimer->can_inject)
+			this_expires = kvm_timer_next_event(ktimer);
+
+		if (this_expires.tv64 < expires.tv64) {
+			expires = this_expires;
+			ktimer_expire = ktimer;
+		}
+	}
+
+	return ktimer_expire;
+}
+
+/*
+ * when the next vcpu timer expires, in host timebase.
+ */
+ktime_t kvm_vcpu_next_timer_event(struct kvm_vcpu *vcpu)
+{
+	ktime_t expires = { .tv64 = KTIME_MAX };
+	struct kvm_timer *ktimer = kvm_vcpu_injectable_timer_event(vcpu);
+
+	if (!ktimer)
+		return expires;
+
+	return kvm_timer_next_event(ktimer);
+}
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_timer *ktimer, *n;
+	ktime_t now = ktime_get();
+
+	list_for_each_entry_safe(ktimer, n, &vcpu->arch.timers, vcpu_timer) {
+		ktime_t expire;
+
+		if (!ktimer->can_inject)
+			continue;
+
+		expire = kvm_timer_next_event(ktimer);
+		if (ktime_to_ns(now) < ktime_to_ns(expire))
+			continue;
+
+		ktimer->can_inject = false;
+		ktimer->ops->inject(ktimer);
+		if (!ktimer->periodic)
+			list_del_init(&ktimer->vcpu_timer);
+	}
+}
+
+/* arm/disarm exit */
+
+static enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+	return HRTIMER_NORESTART;
+}
+
+void kvm_vcpu_init_armed_exit(struct kvm_vcpu *vcpu)
+{
+	hrtimer_init(&vcpu->arch.exit_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	vcpu->arch.exit_timer.function = kvm_timer_fn;
+}
+
+void kvm_vcpu_arm_exit(struct kvm_vcpu *vcpu)
+{
+	ktime_t expire;
+	ktime_t now;
+	struct kvm_timer *ktimer = kvm_vcpu_injectable_timer_event(vcpu);
+
+	if (!ktimer)
+		return;
+
+	now = ktime_get();
+	expire = kvm_timer_next_event(ktimer);
+
+	if (expire.tv64 != KTIME_MAX)
+		hrtimer_start(&vcpu->arch.exit_timer, expire, HRTIMER_MODE_ABS);
+}
+
+void kvm_vcpu_cleanup_timer(struct kvm_vcpu *vcpu)
+{
+	hrtimer_cancel(&vcpu->arch.exit_timer);
+}
+
+
Index: kvm-new/arch/x86/kvm/irq.h
===================================================================
--- kvm-new.orig/arch/x86/kvm/irq.h
+++ kvm-new/arch/x86/kvm/irq.h
@@ -94,9 +94,5 @@  void kvm_pic_reset(struct kvm_kpic_state
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
-void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-
-int pit_has_pending_timer(struct kvm_vcpu *vcpu);
-int apic_has_pending_timer(struct kvm_vcpu *vcpu);
 
 #endif
Index: kvm-new/include/linux/kvm_host.h
===================================================================
--- kvm-new.orig/include/linux/kvm_host.h
+++ kvm-new/include/linux/kvm_host.h
@@ -30,15 +30,14 @@ 
  * vcpu->requests bit members
  */
 #define KVM_REQ_TLB_FLUSH          0
-#define KVM_REQ_MIGRATE_TIMER      1
-#define KVM_REQ_REPORT_TPR_ACCESS  2
-#define KVM_REQ_MMU_RELOAD         3
-#define KVM_REQ_TRIPLE_FAULT       4
-#define KVM_REQ_PENDING_TIMER      5
-#define KVM_REQ_UNHALT             6
-#define KVM_REQ_MMU_SYNC           7
-#define KVM_REQ_KVMCLOCK_UPDATE    8
-#define KVM_REQ_KICK               9
+#define KVM_REQ_REPORT_TPR_ACCESS  1
+#define KVM_REQ_MMU_RELOAD         2
+#define KVM_REQ_TRIPLE_FAULT       3
+#define KVM_REQ_PENDING_TIMER      4
+#define KVM_REQ_UNHALT             5
+#define KVM_REQ_MMU_SYNC           6
+#define KVM_REQ_KVMCLOCK_UPDATE    7
+#define KVM_REQ_KICK               8
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -469,11 +468,6 @@  static inline hpa_t pfn_to_hpa(pfn_t pfn
 	return (hpa_t)pfn << PAGE_SHIFT;
 }
 
-static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
-{
-	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
-}
-
 enum kvm_stat_kind {
 	KVM_STAT_VM,
 	KVM_STAT_VCPU,