[RFC,21/73] KVM: x86/PVM: Implement vcpu_run() callbacks

Message ID 20240226143630.33643-22-jiangshanlai@gmail.com (mailing list archive)
State New, archived
Series KVM: x86/PVM: Introduce a new hypervisor

Commit Message

Lai Jiangshan Feb. 26, 2024, 2:35 p.m. UTC
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

In the vcpu_run() callback, the hypervisor prepares for VM enter in the
switcher and records the exit reason after VM exit. Before VM enter, the
guest registers are placed on the host SP0 stack, the guest and host
hardware CR3 values are stored in the TSS for the switcher, and the guest
xsave state is loaded into hardware. After VM exit, the guest registers
are saved from the entry stack and the host xsave state is restored.
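
In outline, each run does the following (a condensed sketch of the flow
implemented below; all function and field names are from this patch):

	/* interrupts stay off across the inner noinstr section */
	pvm_load_guest_xsave_state(vcpu);	/* guest xsave state into hardware */
	pvm_set_host_cr3(pvm);			/* host_cr3/enter_cr3 into the TSS */
	load_regs(vcpu, sp0_regs);		/* guest regs onto the host SP0 stack */
	ret_regs = switcher_enter_guest();	/* VM enter; returns at VM exit */
	save_regs(vcpu, ret_regs);		/* guest regs back from the entry stack */
	pvm->exit_vector = ret_regs->orig_ax >> 32;	/* exit reason from switcher */
	pvm_load_host_xsave_state(vcpu);	/* restore the host xsave state */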

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
---
 arch/x86/kvm/pvm/pvm.c | 163 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/pvm/pvm.h |   5 ++
 2 files changed, 168 insertions(+)

Patch

diff --git a/arch/x86/kvm/pvm/pvm.c b/arch/x86/kvm/pvm/pvm.c
index 52b3b47ffe42..00a50ed0c118 100644
--- a/arch/x86/kvm/pvm/pvm.c
+++ b/arch/x86/kvm/pvm/pvm.c
@@ -16,8 +16,11 @@ 
 #include <asm/gsseg.h>
 #include <asm/io_bitmap.h>
 #include <asm/pvm_para.h>
+#include <asm/mmu_context.h>
 
 #include "cpuid.h"
+#include "lapic.h"
+#include "trace.h"
 #include "x86.h"
 #include "pvm.h"
 
@@ -204,6 +207,31 @@  static void pvm_switch_to_host(struct vcpu_pvm *pvm)
 	preempt_enable();
 }
 
+static void pvm_set_host_cr3_for_hypervisor(struct vcpu_pvm *pvm)
+{
+	unsigned long cr3;
+
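+	/*
+	 * With PCID enabled, set the NOFLUSH bit so that loading the host
+	 * CR3 on VM exit does not flush the host's TLB entries.
+	 */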
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 = __get_current_cr3_fast() | X86_CR3_PCID_NOFLUSH;
+	else
+		cr3 = __get_current_cr3_fast();
+	this_cpu_write(cpu_tss_rw.tss_ex.host_cr3, cr3);
+}
+
+// Set tss_ex.host_cr3 for VMExit.
+// Set tss_ex.enter_cr3 for VMEnter.
+static void pvm_set_host_cr3(struct vcpu_pvm *pvm)
+{
+	pvm_set_host_cr3_for_hypervisor(pvm);
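+	/* The guest runs on the KVM MMU root; the switcher loads it as the guest CR3. */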
+	this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, pvm->vcpu.arch.mmu->root.hpa);
+}
+
+static void pvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+			     int root_level)
+{
+	/* Nothing to do. Guest cr3 will be prepared in pvm_set_host_cr3(). */
+}
+
 DEFINE_PER_CPU(struct vcpu_pvm *, active_pvm_vcpu);
 
 /*
@@ -262,6 +290,136 @@  static bool cpu_has_pvm_wbinvd_exit(void)
 	return true;
 }
 
+static int pvm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+	return 1;
+}
+
+// Save guest registers from the host sp0 or IST stack.
+static __always_inline void save_regs(struct kvm_vcpu *vcpu, struct pt_regs *guest)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = guest->ax;
+	vcpu->arch.regs[VCPU_REGS_RCX] = guest->cx;
+	vcpu->arch.regs[VCPU_REGS_RDX] = guest->dx;
+	vcpu->arch.regs[VCPU_REGS_RBX] = guest->bx;
+	vcpu->arch.regs[VCPU_REGS_RSP] = guest->sp;
+	vcpu->arch.regs[VCPU_REGS_RBP] = guest->bp;
+	vcpu->arch.regs[VCPU_REGS_RSI] = guest->si;
+	vcpu->arch.regs[VCPU_REGS_RDI] = guest->di;
+	vcpu->arch.regs[VCPU_REGS_R8] = guest->r8;
+	vcpu->arch.regs[VCPU_REGS_R9] = guest->r9;
+	vcpu->arch.regs[VCPU_REGS_R10] = guest->r10;
+	vcpu->arch.regs[VCPU_REGS_R11] = guest->r11;
+	vcpu->arch.regs[VCPU_REGS_R12] = guest->r12;
+	vcpu->arch.regs[VCPU_REGS_R13] = guest->r13;
+	vcpu->arch.regs[VCPU_REGS_R14] = guest->r14;
+	vcpu->arch.regs[VCPU_REGS_R15] = guest->r15;
+	vcpu->arch.regs[VCPU_REGS_RIP] = guest->ip;
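+	/* RFLAGS and the hardware CS/SS live in struct vcpu_pvm, not arch.regs. */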
+	pvm->rflags = guest->flags;
+	pvm->hw_cs = guest->cs;
+	pvm->hw_ss = guest->ss;
+}
+
+// Load guest registers onto the host sp0 stack.
+static __always_inline void load_regs(struct kvm_vcpu *vcpu, struct pt_regs *guest)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+	guest->ss = pvm->hw_ss;
+	guest->sp = vcpu->arch.regs[VCPU_REGS_RSP];
+	guest->flags = (pvm->rflags & SWITCH_ENTER_EFLAGS_ALLOWED) | SWITCH_ENTER_EFLAGS_FIXED;
+	guest->cs = pvm->hw_cs;
+	guest->ip = vcpu->arch.regs[VCPU_REGS_RIP];
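+	/* Not a syscall frame; the switcher writes the exit reason here on exit. */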
+	guest->orig_ax = -1;
+	guest->di = vcpu->arch.regs[VCPU_REGS_RDI];
+	guest->si = vcpu->arch.regs[VCPU_REGS_RSI];
+	guest->dx = vcpu->arch.regs[VCPU_REGS_RDX];
+	guest->cx = vcpu->arch.regs[VCPU_REGS_RCX];
+	guest->ax = vcpu->arch.regs[VCPU_REGS_RAX];
+	guest->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+	guest->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+	guest->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+	guest->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+	guest->bx = vcpu->arch.regs[VCPU_REGS_RBX];
+	guest->bp = vcpu->arch.regs[VCPU_REGS_RBP];
+	guest->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+	guest->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+	guest->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+	guest->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+}
+
+static noinstr void pvm_vcpu_run_noinstr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+	struct pt_regs *sp0_regs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+	struct pt_regs *ret_regs;
+
+	guest_state_enter_irqoff();
+
+	// Load guest registers onto the host sp0 stack for the switcher.
+	load_regs(vcpu, sp0_regs);
+
+	// Call into switcher and enter guest.
+	ret_regs = switcher_enter_guest();
+
+	// Get the guest registers from the host sp0 stack.
+	save_regs(vcpu, ret_regs);
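+	// The switcher packs the exit vector and error code into orig_ax.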
+	pvm->exit_vector = (ret_regs->orig_ax >> 32);
+	pvm->exit_error_code = (u32)ret_regs->orig_ax;
+
+	guest_state_exit_irqoff();
+}
+
+/*
+ * PVM wrappers for kvm_load_{guest|host}_xsave_state().
+ *
+ * PKU is currently disabled for the guest under shadow paging, and to
+ * avoid overhead, host CR4.PKE is left unchanged when entering and
+ * exiting the guest, even when the host has CR4.PKE enabled.
+ *
+ * These wrappers fix up PKRU when host CR4.PKE is enabled.
+ */
+static inline void pvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+	kvm_load_guest_xsave_state(vcpu);
+
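+	/* Guest PKU is disabled, so the guest runs with PKRU == 0. */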
+	if (cpu_feature_enabled(X86_FEATURE_PKU)) {
+		if (vcpu->arch.host_pkru)
+			write_pkru(0);
+	}
+}
+
+static inline void pvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+	kvm_load_host_xsave_state(vcpu);
+
+	if (cpu_feature_enabled(X86_FEATURE_PKU)) {
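+		/* Avoid the WRPKRU when the hardware value already matches. */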
+		if (rdpkru() != vcpu->arch.host_pkru)
+			write_pkru(vcpu->arch.host_pkru);
+	}
+}
+
+static fastpath_t pvm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+	trace_kvm_entry(vcpu);
+
+	pvm_load_guest_xsave_state(vcpu);
+
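+	/* If the lapic timer expiry was advanced, wait out the remainder here. */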
+	kvm_wait_lapic_expire(vcpu);
+
+	pvm_set_host_cr3(pvm);
+
+	pvm_vcpu_run_noinstr(vcpu);
+
+	pvm_load_host_xsave_state(vcpu);
+
+	return EXIT_FASTPATH_NONE;
+}
+
 static void reset_segment(struct kvm_segment *var, int seg)
 {
 	memset(var, 0, sizeof(*var));
@@ -520,6 +678,11 @@  static struct kvm_x86_ops pvm_x86_ops __initdata = {
 	.vcpu_load = pvm_vcpu_load,
 	.vcpu_put = pvm_vcpu_put,
 
+	.load_mmu_pgd = pvm_load_mmu_pgd,
+
+	.vcpu_pre_run = pvm_vcpu_pre_run,
+	.vcpu_run = pvm_vcpu_run,
+
 	.vcpu_after_set_cpuid = pvm_vcpu_after_set_cpuid,
 
 	.sched_in = pvm_sched_in,
diff --git a/arch/x86/kvm/pvm/pvm.h b/arch/x86/kvm/pvm/pvm.h
index 6584314487bc..349f4eac98ec 100644
--- a/arch/x86/kvm/pvm/pvm.h
+++ b/arch/x86/kvm/pvm/pvm.h
@@ -28,10 +28,15 @@  int host_mmu_init(void);
 struct vcpu_pvm {
 	struct kvm_vcpu vcpu;
 
+	// Guest RFLAGS; turned into hardware RFLAGS by the switcher.
+	unsigned long rflags;
+
 	unsigned long switch_flags;
 
 	u16 host_ds_sel, host_es_sel;
 
+	u32 exit_vector;
+	u32 exit_error_code;
 	u32 hw_cs, hw_ss;
 
 	int loaded_cpu_state;