@@ -75,7 +75,7 @@ SYM_FUNC_START(switcher_enter_guest)
/* Switch to guest GSBASE and return to guest */
swapgs
- jmp native_irq_return_iret
+ jmp .L_switcher_return_to_guest
SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
/* switch back to host cr3 when still on sp0/ist stack */
@@ -99,6 +99,23 @@ SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
SYM_FUNC_END(switcher_enter_guest)
EXPORT_SYMBOL_GPL(switcher_enter_guest)
+/*
+ * Canonicalize the virtual address in %rcx by sign-extending it from the
+ * most significant implemented address bit, so a later SYSRET cannot be
+ * handed a non-canonical RIP. Clobbers only %rcx (and EFLAGS).
+ */
+.macro canonical_rcx
+ /*
+ * If width of "canonical tail" ever becomes variable, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+#ifdef CONFIG_X86_5LEVEL
+ /* 4-level (48-bit VA) by default; patched to 57-bit when LA57 is present */
+ ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+ /* Fixed paging depth: shift width is known at build time */
+ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
+.endm
+
SYM_CODE_START(entry_SYSCALL_64_switcher)
UNWIND_HINT_ENTRY
ENDBR
@@ -117,7 +134,133 @@ SYM_INNER_LABEL(entry_SYSCALL_64_switcher_safe_stack, SYM_L_GLOBAL)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+ pushq %rdi /* put rdi on ORIG_RAX */
+
+ /* check if it can do direct switch from umod to smod */
+ testq $SWITCH_FLAGS_NO_DS_TO_SMOD, TSS_extra(switch_flags)
+ jnz .L_switcher_check_return_umod_instruction
+
+ /* Now it must be umod, start to do direct switch from umod to smod */
+ movq TSS_extra(pvcs), %rdi
+ movl %r11d, PVCS_eflags(%rdi)
+ movq %rcx, PVCS_rip(%rdi)
+ movq %rcx, PVCS_rcx(%rdi)
+ movq %r11, PVCS_r11(%rdi)
+ movq RSP-ORIG_RAX(%rsp), %rcx
+ movq %rcx, PVCS_rsp(%rdi)
+
+ /* switch umod to smod (switch_flags & cr3) */
+ xorb $SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+ movq TSS_extra(smod_cr3), %rcx
+ movq %rcx, %cr3
+
+ /* load smod registers from TSS_extra to sp0 stack or %r11 */
+ movq TSS_extra(smod_rsp), %rcx
+ movq %rcx, RSP-ORIG_RAX(%rsp)
+ movq TSS_extra(smod_entry), %rcx
+ movq %rcx, RIP-ORIG_RAX(%rsp)
+ movq TSS_extra(smod_gsbase), %r11
+
+ /* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+ swapgs
+
+ /* save guest gsbase as user_gsbase and switch to smod_gsbase */
+ rdgsbase %rcx
+ movq %rcx, PVCS_user_gsbase(%rdi)
+ wrgsbase %r11
+
+ /* restore umod rdi and smod rflags/r11, rip/rcx and rsp for sysretq */
+ popq %rdi
+ movq $SWITCH_ENTER_EFLAGS_FIXED, %r11
+ movq RIP-RIP(%rsp), %rcx
+
+.L_switcher_sysretq:
+ UNWIND_HINT_IRET_REGS
+ /* now everything is ready for sysretq except for %rsp */
+ movq RSP-RIP(%rsp), %rsp
+ /* No instruction can be added between setting the guest %rsp and doing sysretq */
+SYM_INNER_LABEL(entry_SYSRETQ_switcher_unsafe_stack, SYM_L_GLOBAL)
+ sysretq
+
+.L_switcher_check_return_umod_instruction:
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /* check if it can do direct switch from smod to umod */
+ testq $SWITCH_FLAGS_NO_DS_TO_UMOD, TSS_extra(switch_flags)
+ jnz .L_switcher_return_to_hypervisor
+
+ /*
+ * Now it must be smod, check if it is the return-umod instruction.
+ * The switcher and the PVM specification define a SYSCALL instruction
+ * at TSS_extra(retu_rip) - 2 in smod as the return-umod instruction.
+ */
+ cmpq %rcx, TSS_extra(retu_rip)
+ jne .L_switcher_return_to_hypervisor
+
+ /* only handle the most common cs/ss */
+ movq TSS_extra(pvcs), %rdi
+ cmpl $((__USER_DS << 16) | __USER_CS), PVCS_user_cs(%rdi)
+ jne .L_switcher_return_to_hypervisor
+
+ /* The switcher and the PVM specification require the smod RSP to be saved */
+ movq RSP-ORIG_RAX(%rsp), %rcx
+ movq %rcx, TSS_extra(smod_rsp)
+
+ /* switch smod to umod (switch_flags & cr3) */
+ xorb $SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+ movq TSS_extra(umod_cr3), %rcx
+ movq %rcx, %cr3
+
+ /* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+ swapgs
+
+ /* write umod gsbase */
+ movq PVCS_user_gsbase(%rdi), %rcx
+ canonical_rcx
+ wrgsbase %rcx
+
+ /* load sp, flags, ip to sp0 stack and cx, r11, rdi to registers */
+ movq PVCS_rsp(%rdi), %rcx
+ movq %rcx, RSP-ORIG_RAX(%rsp)
+ movl PVCS_eflags(%rdi), %r11d
+ movq %r11, EFLAGS-ORIG_RAX(%rsp)
+ movq PVCS_rip(%rdi), %rcx
+ movq %rcx, RIP-ORIG_RAX(%rsp)
+ movq PVCS_rcx(%rdi), %rcx
+ movq PVCS_r11(%rdi), %r11
+ popq %rdi // saved rdi (on ORIG_RAX)
+
+.L_switcher_return_to_guest:
+ /*
+ * Now the RSP points to an IRET frame with guest state on the
+ * top of the sp0 stack. Check if it can do sysretq.
+ */
+ UNWIND_HINT_IRET_REGS
+
+ andq $SWITCH_ENTER_EFLAGS_ALLOWED, EFLAGS-RIP(%rsp)
+ orq $SWITCH_ENTER_EFLAGS_FIXED, EFLAGS-RIP(%rsp)
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), EFLAGS-RIP(%rsp)
+ jnz native_irq_return_iret
+ cmpq %r11, EFLAGS-RIP(%rsp)
+ jne native_irq_return_iret
+
+ cmpq %rcx, RIP-RIP(%rsp)
+ jne native_irq_return_iret
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the guest take over
+ * the host, since guest controls RSP.
+ */
+ canonical_rcx
+ cmpq %rcx, RIP-RIP(%rsp)
+ je .L_switcher_sysretq
+
+ /* RCX matched RIP only before RCX was canonicalized; restore RCX and do IRET. */
+ movq RIP-RIP(%rsp), %rcx
+ jmp native_irq_return_iret
+.L_switcher_return_to_hypervisor:
+ popq %rdi /* saved rdi */
pushq $0 /* pt_regs->orig_ax */
movl $SWITCH_EXIT_REASONS_SYSCALL, 4(%rsp)
@@ -198,6 +198,8 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_switcher &&
regs->ip < (unsigned long)entry_SYSCALL_64_switcher_safe_stack);
+ ret = ret || (regs->ip == (unsigned long)entry_SYSRETQ_switcher_unsafe_stack);
+
return ret;
}
#endif
@@ -8,6 +8,40 @@
#define SWITCH_EXIT_REASONS_SYSCALL 1024
#define SWITCH_EXIT_REASONS_FAILED_VMETNRY 1025
+/*
+ * SWITCH_FLAGS control how the switcher code works, mostly dictating
+ * whether it should directly do the guest ring switch or just go back
+ * to the hypervisor.
+ *
+ * SMOD and UMOD
+ * Current vcpu mode. Use two parity bits to simplify direct-switch
+ * flags checking.
+ *
+ * NO_DS_CR3
+ * Not to direct switch due to smod_cr3 or umod_cr3 not having been
+ * prepared.
+ */
+#define SWITCH_FLAGS_SMOD _BITULL(0)
+#define SWITCH_FLAGS_UMOD _BITULL(1)
+#define SWITCH_FLAGS_NO_DS_CR3 _BITULL(2)
+
+#define SWITCH_FLAGS_MOD_TOGGLE (SWITCH_FLAGS_SMOD | SWITCH_FLAGS_UMOD)
+
+/*
+ * Direct switching disabling bits are all the bits other than
+ * SWITCH_FLAGS_SMOD or SWITCH_FLAGS_UMOD. Bits 8-63 are defined by the driver
+ * using the switcher. Direct switching is enabled if all the disabling bits
+ * are cleared.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_SMOD: not to direct switch to smod due to any
+ * disabling bit or smod bit being set.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_UMOD: not to direct switch to umod due to any
+ * disabling bit or umod bit being set.
+ */
+#define SWITCH_FLAGS_NO_DS_TO_SMOD (~SWITCH_FLAGS_UMOD)
+#define SWITCH_FLAGS_NO_DS_TO_UMOD (~SWITCH_FLAGS_SMOD)
+
/* Bits allowed to be set in the underlying eflags */
#define SWITCH_ENTER_EFLAGS_ALLOWED (X86_EFLAGS_FIXED | X86_EFLAGS_IF |\
X86_EFLAGS_TF | X86_EFLAGS_RF |\
@@ -24,6 +58,7 @@
#include <linux/cache.h>
struct pt_regs;
+struct pvm_vcpu_struct;
/*
* Extra per CPU control structure lives in the struct tss_struct.
@@ -46,6 +81,31 @@ struct tss_extra {
unsigned long host_rsp;
/* Prepared guest CR3 to be loaded before VM enter. */
unsigned long enter_cr3;
+
+ /*
+ * Direct switching flag indicates whether direct switching
+ * is allowed.
+ */
+ unsigned long switch_flags ____cacheline_aligned;
+ /*
+ * Guest supervisor mode hardware CR3 for direct switching of guest
+ * user mode syscall.
+ */
+ unsigned long smod_cr3;
+ /*
+ * Guest user mode hardware CR3 for direct switching of guest ERETU
+ * synthetic instruction.
+ */
+ unsigned long umod_cr3;
+ /*
+ * The current PVCS for saving and restoring guest user mode context
+ * in direct switching.
+ */
+ struct pvm_vcpu_struct *pvcs;
+ unsigned long retu_rip;
+ unsigned long smod_entry;
+ unsigned long smod_gsbase;
+ unsigned long smod_rsp;
} ____cacheline_aligned;
extern struct pt_regs *switcher_enter_guest(void);
@@ -4,6 +4,7 @@
#endif
#include <asm/ia32.h>
+#include <asm/pvm_para.h>
#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
@@ -65,6 +66,28 @@ int main(void)
ENTRY(host_cr3);
ENTRY(host_rsp);
ENTRY(enter_cr3);
+ ENTRY(switch_flags);
+ ENTRY(smod_cr3);
+ ENTRY(umod_cr3);
+ ENTRY(pvcs);
+ ENTRY(retu_rip);
+ ENTRY(smod_entry);
+ ENTRY(smod_gsbase);
+ ENTRY(smod_rsp);
+ BLANK();
+#undef ENTRY
+
+#define ENTRY(entry) OFFSET(PVCS_ ## entry, pvm_vcpu_struct, entry)
+ ENTRY(event_flags);
+ ENTRY(event_errcode);
+ ENTRY(user_cs);
+ ENTRY(user_ss);
+ ENTRY(user_gsbase);
+ ENTRY(rsp);
+ ENTRY(eflags);
+ ENTRY(rip);
+ ENTRY(rcx);
+ ENTRY(r11);
BLANK();
#undef ENTRY