diff mbox

[RFC,3/7] KVM: timer: synchronize tsc-deadline timestamp for guest

Message ID 1512722390-3654-4-git-send-email-quan.xu0@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Quan Xu Dec. 8, 2017, 8:39 a.m. UTC
From: Ben Luo <bn0418@gmail.com>

In general, a KVM guest programs the tsc-deadline timestamp into the
MSR_IA32_TSC_DEADLINE MSR. This causes a VM-exit, and
KVM then handles this timer for the guest.

The tsc-deadline timestamp is mostly recorded in a shared page,
which causes fewer VM-exits. We introduce a periodically running kthread
that scans the shared page and synchronizes the timer settings for the
guest on a dedicated CPU.

Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
Signed-off-by: Quan Xu <quan.xu0@gmail.com>
Signed-off-by: Ben Luo <bn0418@gmail.com>
---
 arch/x86/kvm/lapic.c |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/lapic.h |    5 ++
 2 files changed, 143 insertions(+), 0 deletions(-)

Comments

Konrad Rzeszutek Wilk Dec. 8, 2017, 3:06 p.m. UTC | #1
On Fri, Dec 08, 2017 at 04:39:46PM +0800, Quan Xu wrote:
> From: Ben Luo <bn0418@gmail.com>
> 
> In general, KVM guest programs tsc-deadline timestamp to
> MSR_IA32_TSC_DEADLINE MSR. This will cause a VM-exit, and
> then KVM handles this timer for guest.
> 
> The tsc-deadline timestamp is mostly recorded in share page
> with less VM-exit. We Introduce a periodically working kthread
> to scan share page and synchronize timer setting for guest
> on a dedicated CPU.

That sounds like a race. Meaning the guest may put too small window
and this 'working thread to scan' may not get to it fast enough?

Meaning we miss the deadline to inject the timer in the guest.

Or is this part of the PV MSR semantics - that it will only work
for a certain range of values, and anything less than, say, 1ms
should not use the PV MSR?
Quan Xu Dec. 14, 2017, 1:54 a.m. UTC | #2
On 2017/12/08 23:06, Konrad Rzeszutek Wilk wrote:
> On Fri, Dec 08, 2017 at 04:39:46PM +0800, Quan Xu wrote:
>> From: Ben Luo <bn0418@gmail.com>
>>
>> In general, KVM guest programs tsc-deadline timestamp to
>> MSR_IA32_TSC_DEADLINE MSR. This will cause a VM-exit, and
>> then KVM handles this timer for guest.
>>
>> The tsc-deadline timestamp is mostly recorded in share page
>> with less VM-exit. We Introduce a periodically working kthread
>> to scan share page and synchronize timer setting for guest
>> on a dedicated CPU.
> That sounds like a race. Meaning the guest may put too small window
> and this 'working thread to scan' may not get to it fast enough?
yes, you are right. So ..
> .
> Meaning we miss the deadline to inject the timer in the guest.
>
> Or is this part of this PV MSR semantics - that it will only work
> for certain amount of values and anything less than say 1ms
> should not use the PV MSR?

..
for these timers, we have to program the tsc-deadline timestamps
to MSR_IA32_TSC_DEADLINE as normal. This causes a VM-exit, and KVM then
signals the working thread through an IPI to program the timer, instead of
registering it on the current CPU (patch 0004).

more detail in patch 0007.

Quan
Alibaba Cloud
diff mbox

Patch

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 55c9ba3..20a23bb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -36,6 +36,10 @@ 
 #include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
+#include <linux/ktime.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
 #include "trace.h"
@@ -70,6 +74,12 @@ 
 #define APIC_BROADCAST			0xFF
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 
+static struct hrtimer pv_sync_timer; /* periodically wakes the scan kthread */
+static long pv_timer_period_ns = PVTIMER_PERIOD_NS; /* scan period, in ns */
+static struct task_struct *pv_timer_polling_worker; /* the scan kthread */
+
+module_param(pv_timer_period_ns, long, 0644);
+
 static inline int apic_test_vector(int vec, void *bitmap)
 {
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -2542,8 +2552,130 @@  void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	}
 }
 
+static enum hrtimer_restart pv_sync_timer_callback(struct hrtimer *timer)
+{
+	hrtimer_forward_now(timer, ns_to_ktime(pv_timer_period_ns));
+	wake_up_process(pv_timer_polling_worker); /* kick the scan kthread */
+
+	return HRTIMER_RESTART; /* expiry was moved forward above: requeue */
+}
+
+/*
+ * Sync one vCPU's PV timer page: publish next_sync_tsc, then swap expire_tsc
+ * with 0 and arm (or fire) the tsc-deadline timer. @data: struct kvm_vcpu.
+ */
+void kvm_apic_sync_pv_timer(void *data)
+{
+	struct kvm_vcpu *vcpu = data;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned long flags, this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	u64 guest_tsc, expire_tsc, rem_tsc;
+	s64 rem_ns;
+
+	if (!lapic_in_kernel(vcpu) || !pv_timer_enabled(vcpu))
+		return;
+
+	local_irq_save(flags);
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+	/*
+	 * ns until the next scan; an already expired timer counts from the
+	 * following period. Convert to TSC cycles in 64 bit: do_div() needs
+	 * a u64 dividend and ns * kHz does not fit a 32-bit long.
+	 */
+	rem_ns = ktime_to_ns(hrtimer_get_remaining(&pv_sync_timer));
+	if (rem_ns <= 0)
+		rem_ns += pv_timer_period_ns;
+	rem_tsc = (rem_ns > 0 ? (u64)rem_ns : 0) * this_tsc_khz;
+	do_div(rem_tsc, 1000000);
+
+	/* guest_tsc and rem_tsc must be computed before next_sync_tsc is set */
+	smp_wmb();
+	kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+		offsetof(struct pvtimer_vcpu_event_info, next_sync_tsc),
+		guest_tsc + rem_tsc, 8);
+
+	/* make sure next_sync_tsc is visible */
+	smp_wmb();
+	expire_tsc = kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+			offsetof(struct pvtimer_vcpu_event_info, expire_tsc),
+			0UL, 8);
+
+	/* make sure expire_tsc is visible */
+	smp_wmb();
+	if (expire_tsc > guest_tsc) {
+		/* caller runs on the dedicated CPU: the timer is armed there */
+		kvm_set_lapic_tscdeadline_msr(apic->vcpu, expire_tsc);
+	} else if (expire_tsc) {
+		/* deliver immediately if expired */
+		kvm_apic_local_deliver(apic, APIC_LVTT);
+	}
+	local_irq_restore(flags);
+}
+
+/* Kthread: on each pv_sync_timer wake-up, sync every vCPU of every VM. */
+static int pv_timer_polling(void *arg)
+{
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+	mm_segment_t oldfs = get_fs();
+
+	while (!kthread_should_stop()) {
+		spin_lock(&kvm_lock);
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			/* run on the VM's mm while syncing its vCPUs */
+			set_fs(USER_DS);
+			use_mm(kvm->mm);
+			kvm_for_each_vcpu(i, vcpu, kvm)
+				kvm_apic_sync_pv_timer(vcpu);
+			unuse_mm(kvm->mm);
+			set_fs(oldfs);
+		}
+		spin_unlock(&kvm_lock);
+
+		/*
+		 * Go to sleep until the next wake-up. The task state has to
+		 * be TASK_INTERRUPTIBLE when schedule() is called; with
+		 * TASK_RUNNING it returns right away and the loop spins.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!kthread_should_stop())
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+
+	return 0;
+}
+
+/* Create the scan kthread, bind it to PVTIMER_SYNC_CPU, start the timer. */
+static void kvm_pv_timer_init(void)
+{
+	ktime_t ktime = ktime_set(0, pv_timer_period_ns);
+
+	hrtimer_init(&pv_sync_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	pv_sync_timer.function = &pv_sync_timer_callback;
+
+	/* kthread for pv_timer sync buffer */
+	pv_timer_polling_worker = kthread_create(pv_timer_polling, NULL,
+						"pv_timer_polling_worker/%d",
+						PVTIMER_SYNC_CPU);
+	if (IS_ERR(pv_timer_polling_worker)) {
+		pr_warn_once("kvm: failed to create thread for pv_timer\n");
+		pv_timer_polling_worker = NULL;
+		return;
+	}
+
+	kthread_bind(pv_timer_polling_worker, PVTIMER_SYNC_CPU);
+	wake_up_process(pv_timer_polling_worker);
+	/* start in the same pinned mode the timer was initialised with */
+	hrtimer_start(&pv_sync_timer, ktime, HRTIMER_MODE_REL_PINNED);
+}
+
 void kvm_lapic_init(void)
 {
+	kvm_pv_timer_init();
+
 	/* do not patch jump label more than once per second */
 	jump_label_rate_limit(&apic_hw_disabled, HZ);
 	jump_label_rate_limit(&apic_sw_disabled, HZ);
@@ -2551,6 +2683,12 @@  void kvm_lapic_init(void)
 
 void kvm_lapic_exit(void)
 {
+	if (pv_timer_polling_worker) { /* NULL if creation failed */
+		hrtimer_cancel(&pv_sync_timer); /* stop the wake-ups first */
+		kthread_stop(pv_timer_polling_worker);
+		pv_timer_polling_worker = NULL;
+	}
+
 	static_key_deferred_flush(&apic_hw_disabled);
 	static_key_deferred_flush(&apic_sw_disabled);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 539a738..4588d59 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,6 +16,9 @@ 
 #define APIC_BUS_CYCLE_NS       1
 #define APIC_BUS_FREQUENCY      (1000000000ULL / APIC_BUS_CYCLE_NS)
 
+#define PVTIMER_SYNC_CPU	(NR_CPUS - 1) /* CPU the pv_timer kthread is bound to */
+#define PVTIMER_PERIOD_NS	250000L /* default pv_timer scan period, in ns */
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
@@ -213,6 +216,8 @@  static inline bool pv_timer_enabled(struct kvm_vcpu *vcpu)
 	return vcpu->arch.pv_timer.msr_val & KVM_MSR_ENABLED;
 }
 
+void kvm_apic_sync_pv_timer(void *data);
+
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void wait_lapic_expire(struct kvm_vcpu *vcpu);