diff mbox series

[RFC,04/73] x86/entry: Implement direct switching for the switcher

Message ID 20240226143630.33643-5-jiangshanlai@gmail.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86/PVM: Introduce a new hypervisor | expand

Commit Message

Lai Jiangshan Feb. 26, 2024, 2:35 p.m. UTC
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

While the VM is running, all VM exits taken in the switcher will be
forwarded to the hypervisor, which handles the VM exit and then returns
to the switcher to re-enter the VM. In some situations, the switcher can
handle the VM exit directly without involving the hypervisor. This is
referred to as direct switching, and it can reduce the overhead of
guest/host state switching. Currently, for simplicity, only the syscall
event from user mode and the ERETU synthetic instruction are allowed for
direct switching.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
---
 arch/x86/entry/entry_64_switcher.S | 145 ++++++++++++++++++++++++++++-
 arch/x86/include/asm/ptrace.h      |   2 +
 arch/x86/include/asm/switcher.h    |  60 ++++++++++++
 arch/x86/kernel/asm-offsets_64.c   |  23 +++++
 4 files changed, 229 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/arch/x86/entry/entry_64_switcher.S b/arch/x86/entry/entry_64_switcher.S
index 2b99a46421cc..6f166d15635c 100644
--- a/arch/x86/entry/entry_64_switcher.S
+++ b/arch/x86/entry/entry_64_switcher.S
@@ -75,7 +75,7 @@  SYM_FUNC_START(switcher_enter_guest)
 
 	/* Switch to guest GSBASE and return to guest */
 	swapgs
-	jmp	native_irq_return_iret
+	jmp	.L_switcher_return_to_guest
 
 SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
 	/* switch back to host cr3 when still on sp0/ist stack */
@@ -99,6 +99,23 @@  SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
 SYM_FUNC_END(switcher_enter_guest)
 EXPORT_SYMBOL_GPL(switcher_enter_guest)
 
+.macro canonical_rcx
+	/*
+	 * If width of "canonical tail" ever becomes variable, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 *
+	 * Change top bits to match most significant bit (47th or 56th bit
+	 * depending on paging mode) in the address.
+	 */
+#ifdef CONFIG_X86_5LEVEL
+	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+		    "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
+.endm
+
 SYM_CODE_START(entry_SYSCALL_64_switcher)
 	UNWIND_HINT_ENTRY
 	ENDBR
@@ -117,7 +134,133 @@  SYM_INNER_LABEL(entry_SYSCALL_64_switcher_safe_stack, SYM_L_GLOBAL)
 	pushq	%r11					/* pt_regs->flags */
 	pushq	$__USER_CS				/* pt_regs->cs */
 	pushq	%rcx					/* pt_regs->ip */
+	pushq	%rdi					/* put rdi on ORIG_RAX */
+
+	/* check if it can do direct switch from umod to smod */
+	testq	$SWITCH_FLAGS_NO_DS_TO_SMOD, TSS_extra(switch_flags)
+	jnz	.L_switcher_check_return_umod_instruction
+
+	/* Now it must be umod, start to do direct switch from umod to smod */
+	movq	TSS_extra(pvcs), %rdi
+	movl	%r11d, PVCS_eflags(%rdi)
+	movq	%rcx, PVCS_rip(%rdi)
+	movq	%rcx, PVCS_rcx(%rdi)
+	movq	%r11, PVCS_r11(%rdi)
+	movq	RSP-ORIG_RAX(%rsp), %rcx
+	movq	%rcx, PVCS_rsp(%rdi)
+
+	/* switch umod to smod (switch_flags & cr3) */
+	xorb	$SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+	movq	TSS_extra(smod_cr3), %rcx
+	movq	%rcx, %cr3
+
+	/* load smod registers from TSS_extra to sp0 stack or %r11 */
+	movq	TSS_extra(smod_rsp), %rcx
+	movq	%rcx, RSP-ORIG_RAX(%rsp)
+	movq	TSS_extra(smod_entry), %rcx
+	movq	%rcx, RIP-ORIG_RAX(%rsp)
+	movq	TSS_extra(smod_gsbase), %r11
+
+	/* switch host gsbase to guest gsbase, TSS_extra can't be used afterwards */
+	swapgs
+
+	/* save guest gsbase as user_gsbase and switch to smod_gsbase */
+	rdgsbase %rcx
+	movq	%rcx, PVCS_user_gsbase(%rdi)
+	wrgsbase %r11
+
+	/* restore umod rdi and smod rflags/r11, rip/rcx and rsp for sysretq */
+	popq	%rdi
+	movq	$SWITCH_ENTER_EFLAGS_FIXED, %r11
+	movq	RIP-RIP(%rsp), %rcx
+
+.L_switcher_sysretq:
+	UNWIND_HINT_IRET_REGS
+	/* now everything is ready for sysretq except for %rsp */
+	movq	RSP-RIP(%rsp), %rsp
+	/* No instruction can be added between setting the guest %rsp and doing sysretq */
+SYM_INNER_LABEL(entry_SYSRETQ_switcher_unsafe_stack, SYM_L_GLOBAL)
+	sysretq
+
+.L_switcher_check_return_umod_instruction:
+	UNWIND_HINT_IRET_REGS offset=8
+
+	/* check if it can do direct switch from smod to umod */
+	testq	$SWITCH_FLAGS_NO_DS_TO_UMOD, TSS_extra(switch_flags)
+	jnz	.L_switcher_return_to_hypervisor
+
+	/*
+	 * Now it must be smod; check if it is the return-umod instruction.
+	 * The switcher and the PVM specification define a SYSCALL instruction
+	 * at TSS_extra(retu_rip) - 2 in smod as the return-umod instruction.
+	 */
+	cmpq	%rcx, TSS_extra(retu_rip)
+	jne	.L_switcher_return_to_hypervisor
+
+	/* only handle for the most common cs/ss */
+	movq	TSS_extra(pvcs), %rdi
+	cmpl	$((__USER_DS << 16) | __USER_CS), PVCS_user_cs(%rdi)
+	jne	.L_switcher_return_to_hypervisor
+
+	/* The switcher and the PVM specification require the smod RSP to be saved */
+	movq	RSP-ORIG_RAX(%rsp), %rcx
+	movq	%rcx, TSS_extra(smod_rsp)
+
+	/* switch smod to umod (switch_flags & cr3) */
+	xorb	$SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+	movq	TSS_extra(umod_cr3), %rcx
+	movq	%rcx, %cr3
+
+	/* switch host gsbase to guest gsbase, TSS_extra can't be used afterwards */
+	swapgs
+
+	/* write umod gsbase */
+	movq	PVCS_user_gsbase(%rdi), %rcx
+	canonical_rcx
+	wrgsbase %rcx
+
+	/* load sp, flags, ip to sp0 stack and cx, r11, rdi to registers */
+	movq	PVCS_rsp(%rdi), %rcx
+	movq	%rcx, RSP-ORIG_RAX(%rsp)
+	movl	PVCS_eflags(%rdi), %r11d
+	movq	%r11, EFLAGS-ORIG_RAX(%rsp)
+	movq	PVCS_rip(%rdi), %rcx
+	movq	%rcx, RIP-ORIG_RAX(%rsp)
+	movq	PVCS_rcx(%rdi), %rcx
+	movq	PVCS_r11(%rdi), %r11
+	popq	%rdi		// saved rdi (on ORIG_RAX)
+
+.L_switcher_return_to_guest:
+	/*
+	 * Now the RSP points to an IRET frame with guest state on the
+	 * top of the sp0 stack.  Check if it can do sysretq.
+	 */
+	UNWIND_HINT_IRET_REGS
+
+	andq	$SWITCH_ENTER_EFLAGS_ALLOWED, EFLAGS-RIP(%rsp)
+	orq	$SWITCH_ENTER_EFLAGS_FIXED, EFLAGS-RIP(%rsp)
+	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), EFLAGS-RIP(%rsp)
+	jnz	native_irq_return_iret
+	cmpq	%r11, EFLAGS-RIP(%rsp)
+	jne	native_irq_return_iret
+
+	cmpq	%rcx, RIP-RIP(%rsp)
+	jne	native_irq_return_iret
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the guest take over
+	 * the host, since guest controls RSP.
+	 */
+	canonical_rcx
+	cmpq	%rcx, RIP-RIP(%rsp)
+	je	.L_switcher_sysretq
+
+	/* RCX matched RIP only before RCX was canonicalized; restore RCX and do IRET. */
+	movq	RIP-RIP(%rsp), %rcx
+	jmp	native_irq_return_iret
 
+.L_switcher_return_to_hypervisor:
+	popq	%rdi					/* saved rdi */
 	pushq	$0					/* pt_regs->orig_ax */
 	movl	$SWITCH_EXIT_REASONS_SYSCALL, 4(%rsp)
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9eeeb5fdd387..322697877a2d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -198,6 +198,8 @@  static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
 	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_switcher &&
 		      regs->ip <  (unsigned long)entry_SYSCALL_64_switcher_safe_stack);
 
+	ret = ret || (regs->ip == (unsigned long)entry_SYSRETQ_switcher_unsafe_stack);
+
 	return ret;
 }
 #endif
diff --git a/arch/x86/include/asm/switcher.h b/arch/x86/include/asm/switcher.h
index dbf1970ca62f..35a60f4044c4 100644
--- a/arch/x86/include/asm/switcher.h
+++ b/arch/x86/include/asm/switcher.h
@@ -8,6 +8,40 @@ 
 #define SWITCH_EXIT_REASONS_SYSCALL		1024
 #define SWITCH_EXIT_REASONS_FAILED_VMETNRY	1025
 
+/*
+ * SWITCH_FLAGS control the way how the switcher code works,
+ *	mostly dictate whether it should directly do the guest ring
+ *	switch or just go back to hypervisor.
+ *
+ * SMOD and UMOD
+ *	Current vcpu mode. Use two parity bits to simplify direct-switch
+ *	flags checking.
+ *
+ * NO_DS_CR3
+ *	Direct switching is disallowed because smod_cr3 or umod_cr3 has
+ *	not been prepared yet.
+ */
+#define SWITCH_FLAGS_SMOD			_BITULL(0)
+#define SWITCH_FLAGS_UMOD			_BITULL(1)
+#define SWITCH_FLAGS_NO_DS_CR3			_BITULL(2)
+
+#define SWITCH_FLAGS_MOD_TOGGLE			(SWITCH_FLAGS_SMOD | SWITCH_FLAGS_UMOD)
+
+/*
+ * Direct switching disabling bits are all the bits other than
+ * SWITCH_FLAGS_SMOD or SWITCH_FLAGS_UMOD. Bits 8-63 are defined by the driver
+ * using the switcher. Direct switching is enabled if all the disabling bits
+ * are cleared.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_SMOD: not to direct switch to smod due to any
+ * disabling bit or smod bit being set.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_UMOD: not to direct switch to umod due to any
+ * disabling bit or umod bit being set.
+ */
+#define SWITCH_FLAGS_NO_DS_TO_SMOD		(~SWITCH_FLAGS_UMOD)
+#define SWITCH_FLAGS_NO_DS_TO_UMOD		(~SWITCH_FLAGS_SMOD)
+
 /* Bits allowed to be set in the underlying eflags */
 #define SWITCH_ENTER_EFLAGS_ALLOWED	(X86_EFLAGS_FIXED | X86_EFLAGS_IF |\
 					 X86_EFLAGS_TF | X86_EFLAGS_RF |\
@@ -24,6 +58,7 @@ 
 #include <linux/cache.h>
 
 struct pt_regs;
+struct pvm_vcpu_struct;
 
 /*
  * Extra per CPU control structure lives in the struct tss_struct.
@@ -46,6 +81,31 @@  struct tss_extra {
 	unsigned long host_rsp;
 	/* Prepared guest CR3 to be loaded before VM enter. */
 	unsigned long enter_cr3;
+
+	/*
+	 * Direct switching flag indicates whether direct switching
+	 * is allowed.
+	 */
+	unsigned long switch_flags ____cacheline_aligned;
+	/*
+	 * Guest supervisor mode hardware CR3 for direct switching of guest
+	 * user mode syscall.
+	 */
+	unsigned long smod_cr3;
+	/*
+	 * Guest user mode hardware CR3 for direct switching of guest ERETU
+	 * synthetic instruction.
+	 */
+	unsigned long umod_cr3;
+	/*
+	 * The current PVCS for saving and restoring guest user mode context
+	 * in direct switching.
+	 */
+	struct pvm_vcpu_struct *pvcs;
+	unsigned long retu_rip;
+	unsigned long smod_entry;
+	unsigned long smod_gsbase;
+	unsigned long smod_rsp;
 } ____cacheline_aligned;
 
 extern struct pt_regs *switcher_enter_guest(void);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1485cbda6dc4..8230bd27f0b3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,6 +4,7 @@ 
 #endif
 
 #include <asm/ia32.h>
+#include <asm/pvm_para.h>
 
 #if defined(CONFIG_KVM_GUEST)
 #include <asm/kvm_para.h>
@@ -65,6 +66,28 @@  int main(void)
 	ENTRY(host_cr3);
 	ENTRY(host_rsp);
 	ENTRY(enter_cr3);
+	ENTRY(switch_flags);
+	ENTRY(smod_cr3);
+	ENTRY(umod_cr3);
+	ENTRY(pvcs);
+	ENTRY(retu_rip);
+	ENTRY(smod_entry);
+	ENTRY(smod_gsbase);
+	ENTRY(smod_rsp);
+	BLANK();
+#undef ENTRY
+
+#define ENTRY(entry) OFFSET(PVCS_ ## entry, pvm_vcpu_struct, entry)
+	ENTRY(event_flags);
+	ENTRY(event_errcode);
+	ENTRY(user_cs);
+	ENTRY(user_ss);
+	ENTRY(user_gsbase);
+	ENTRY(rsp);
+	ENTRY(eflags);
+	ENTRY(rip);
+	ENTRY(rcx);
+	ENTRY(r11);
 	BLANK();
 #undef ENTRY