
[3/4] KVM host implementation

Message ID 20210831015919.13006-3-skyele@sjtu.edu.cn (mailing list archive)
State New, archived
Series [1/4] KVM: x86: Introduce .pcpu_is_idle() stub infrastructure

Commit Message

Tianqiang Xu Aug. 31, 2021, 1:59 a.m. UTC
The host OS sets the 'is_idle' field of kvm_steal_time to 1 if
cpu_rq(this_cpu)->nr_running is 1 before a vCPU is scheduled out.
In that case there is no other task to run on this pCPU, so
is_idle == 1 means that the pCPU where the preempted vCPU most
recently ran is idle.
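
The 'is_idle' flag is assumed to live in the steal-time area shared
with the guest; the field itself is added elsewhere in this series, so
the layout below is only an illustrative sketch (exact placement may
differ):

	/*
	 * Illustrative layout only: the real definition lives in
	 * arch/x86/include/uapi/asm/kvm_para.h, and the placement of
	 * 'is_idle' is decided by an earlier patch in this series.
	 */
	struct kvm_steal_time {
		__u64 steal;
		__u32 version;
		__u32 flags;
		__u8  preempted;
		__u8  is_idle;	/* new: pCPU had no other runnable task */
		__u8  u8_pad[2];
		__u32 pad[11];
	};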

The host OS invokes get_cpu_nr_running() to obtain the value of
cpu_rq(this_cpu)->nr_running.
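
For reference, get_cpu_nr_running() is presumably exported by the
scheduler patch earlier in this series; a minimal sketch, assuming it
lives in kernel/sched/core.c:

	/*
	 * Hypothetical helper (name taken from this patch): report the
	 * number of runnable tasks on @cpu's runqueue so KVM can tell
	 * whether the pCPU is otherwise idle.
	 */
	int get_cpu_nr_running(int cpu)
	{
		return cpu_rq(cpu)->nr_running;
	}
	EXPORT_SYMBOL_GPL(get_cpu_nr_running);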

--
Authors: Tianqiang Xu, Dingji Li, Zeyu Mi
	 Shanghai Jiao Tong University

Signed-off-by: Tianqiang Xu <skyele@sjtu.edu.cn>
---
 arch/x86/include/asm/qspinlock.h |  1 -
 arch/x86/kvm/x86.c               | 88 +++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 2 deletions(-)
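
For completeness, the guest-side consumer of 'is_idle' is not part of
this patch; it presumably arrives later in the series behind the
pcpu_is_idle() hook touched in qspinlock.h below. A hedged sketch,
modeled on the existing __kvm_vcpu_is_preempted() steal-time reader
(names and placement are illustrative):

	/*
	 * Hypothetical guest-side reader in arch/x86/kernel/kvm.c:
	 * report whether the pCPU that last ran @cpu's vCPU had no other
	 * runnable task when that vCPU was preempted.
	 */
	static bool kvm_pcpu_is_idle(long cpu)
	{
		struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

		return READ_ONCE(src->is_idle) == KVM_PCPU_IS_IDLE;
	}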

Comments

Peter Zijlstra Aug. 31, 2021, 7:16 a.m. UTC | #1
On Tue, Aug 31, 2021 at 09:59:18AM +0800, Tianqiang Xu wrote:
> @@ -4304,8 +4374,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  	idx = srcu_read_lock(&vcpu->kvm->srcu);
>  	if (kvm_xen_msr_enabled(vcpu->kvm))
>  		kvm_xen_runstate_set_preempted(vcpu);
> -	else
> +	else {
>  		kvm_steal_time_set_preempted(vcpu);
> +
> +		if (get_cpu_nr_running(smp_processor_id()) <= 1)
> +			kvm_steal_time_set_is_idle(vcpu);
> +		else
> +			kvm_steal_time_clear_is_idle(vcpu);
> +	}
>  	srcu_read_unlock(&vcpu->kvm->srcu, idx);


This cannot be right. The CPU could long since be running tasks again,
but as long as this vCPU crud doesn't run, the guest keeps thinking it's
physically idle.
Peter Zijlstra Aug. 31, 2021, 7:17 a.m. UTC | #2
On Tue, Aug 31, 2021 at 09:16:09AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 31, 2021 at 09:59:18AM +0800, Tianqiang Xu wrote:
> > @@ -4304,8 +4374,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >  	idx = srcu_read_lock(&vcpu->kvm->srcu);
> >  	if (kvm_xen_msr_enabled(vcpu->kvm))
> >  		kvm_xen_runstate_set_preempted(vcpu);
> > -	else
> > +	else {
> >  		kvm_steal_time_set_preempted(vcpu);
> > +
> > +		if (get_cpu_nr_running(smp_processor_id()) <= 1)
> > +			kvm_steal_time_set_is_idle(vcpu);
> > +		else
> > +			kvm_steal_time_clear_is_idle(vcpu);
> > +	}
> >  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> 
> 
> This cannot be right. The CPU could long since be running tasks again,
> but as long as this vCPU crud doesn't run, the guest keeps thinking it's
> physically idle.

More fundamentally, a blocked task doesn't have a CPU. So unless you've
pinned your vCPU threads to physical CPUs, the whole thing is bonkers.

Patch

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index c32f2eb6186c..1832dd8308ca 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -61,7 +61,6 @@  static inline bool vcpu_is_preempted(long cpu)
 {
 	return pv_vcpu_is_preempted(cpu);
 }
-#endif
 
 #define pcpu_is_idle pcpu_is_idle
 static inline bool pcpu_is_idle(long cpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5d5c5ed7dd4..1fb1ab3d6fca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3181,6 +3181,72 @@  static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 	static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
+static void kvm_steal_time_set_is_idle(struct kvm_vcpu *vcpu)
+{
+	struct kvm_host_map map;
+	struct kvm_steal_time *st;
+
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	if (vcpu->arch.st.is_idle)
+		return;
+
+	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+			&vcpu->arch.st.cache, true))
+		return;
+
+	st = map.hva +
+		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+	st->is_idle = vcpu->arch.st.is_idle = KVM_PCPU_IS_IDLE;
+
+	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+}
+
+static void kvm_steal_time_clear_is_idle(struct kvm_vcpu *vcpu)
+{
+	struct kvm_host_map map;
+	struct kvm_steal_time *st;
+
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	if (vcpu->arch.st.is_idle)
+		return;
+
+	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+			&vcpu->arch.st.cache, false))
+		return;
+
+	st = map.hva +
+		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH))
+		xchg(&st->is_idle, 0);
+	else
+		st->is_idle = 0;
+
+	vcpu->arch.st.is_idle = 0;
+
+	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+}
+
+
+static DEFINE_PER_CPU(struct kvm_vcpu *, this_cpu_pre_run_vcpu);
+
+static void vcpu_load_update_pre_vcpu_callback(struct kvm_vcpu *new_vcpu, struct kvm_steal_time *st)
+{
+	struct kvm_vcpu *old_vcpu = __this_cpu_read(this_cpu_pre_run_vcpu);
+
+	if (!old_vcpu)
+		return;
+	if (old_vcpu != new_vcpu)
+		kvm_steal_time_clear_is_idle(old_vcpu);
+	else
+		st->is_idle = new_vcpu->arch.st.is_idle = KVM_PCPU_IS_IDLE;
+}
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
 	struct kvm_host_map map;
@@ -3219,6 +3285,8 @@  static void record_steal_time(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.st.preempted = 0;
 
+	vcpu_load_update_pre_vcpu_callback(vcpu, st);
+
 	if (st->version & 1)
 		st->version += 1;  /* first time write, random junk */
 
@@ -4290,6 +4358,8 @@  static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
 }
 
+extern int get_cpu_nr_running(int cpu);
+
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	int idx;
@@ -4304,8 +4374,14 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	if (kvm_xen_msr_enabled(vcpu->kvm))
 		kvm_xen_runstate_set_preempted(vcpu);
-	else
+	else {
 		kvm_steal_time_set_preempted(vcpu);
+
+		if (get_cpu_nr_running(smp_processor_id()) <= 1)
+			kvm_steal_time_set_is_idle(vcpu);
+		else
+			kvm_steal_time_clear_is_idle(vcpu);
+	}
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 	static_call(kvm_x86_vcpu_put)(vcpu);
@@ -9693,6 +9769,8 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	local_irq_enable();
 	preempt_enable();
 
+	__this_cpu_write(this_cpu_pre_run_vcpu, vcpu);
+
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
 	/*
@@ -11253,6 +11331,14 @@  void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	int cpu;
+	struct kvm_vcpu *vcpu;
+
+	for_each_possible_cpu(cpu) {
+		vcpu = per_cpu(this_cpu_pre_run_vcpu, cpu);
+		if (vcpu && vcpu->kvm == kvm)
+			per_cpu(this_cpu_pre_run_vcpu, cpu) = NULL;
+	}
+
 	if (current->mm == kvm->mm) {
 		/*
 		 * Free memory regions allocated on behalf of userspace,