diff mbox series

KVM's SYSCALL emulation for GenuineIntel is buggy

Message ID CALCETrU8k1mL=Uy_QNbT7fjtCLO8N3xgZb6zLyfdwHx6SUFPoA@mail.gmail.com (mailing list archive)
State New, archived
Headers show
Series KVM's SYSCALL emulation for GenuineIntel is buggy | expand

Commit Message

Andy Lutomirski June 29, 2019, 5:16 a.m. UTC
If I do SYSCALL with EFLAGS.TF set from compat mode on Intel hardware
with -cpu host and no other funny business, the guest kernel seems to
get #DB with the stored IP pointing at the SYSCALL instruction.  This
is wrong -- SYSCALL raises #UD in this situation, and #UD is a *fault*,
so there shouldn't be a single-step trap.

Unless I'm missing something in the code, emulate_ud() is mishandled
in general -- it seems to cause inject_emulated_exception() to
return false here:

    if (ctxt->have_exception) {
        r = EMULATE_DONE;
        if (inject_emulated_exception(vcpu))
            return r;

and then we land here:

        if (r == EMULATE_DONE && ctxt->tf)
            kvm_vcpu_do_singlestep(vcpu, &r);

if TF was set, which is wrong.

You can test this by applying the attached patch, building x86
selftests, and running syscall_arg_fault_32 in a VM.  It hangs.  It
should complete successfully, and it does on bare metal.

Comments

Paolo Bonzini June 30, 2019, 6:55 a.m. UTC | #1
On 29/06/19 07:16, Andy Lutomirski wrote:
> If I do SYSCALL with EFLAGS.TF set from compat mode on Intel hardware
> with -cpu host and no other funny business, the guest kernel seems to
> get #DB with the stored IP pointing at the SYSCALL instruction.  This
> is wrong -- SYSCALL is #UD, which is a *fault*, so there shouldn't be
> a single-step trap.

Yeah, the emulator doesn't try too hard to emulate Intel vs. AMD
differences.  But emulate_ud()'s mishandling, as you describe below,
does look like a real bug:

> Unless I'm missing something in the code, emulate_ud() is mishandled
> in general -- it seems to cause inject_emulated_exception() to
> return false here:
> 
>     if (ctxt->have_exception) {
>         r = EMULATE_DONE;
>         if (inject_emulated_exception(vcpu))
>             return r;
> 
> and then we land here:
> 
>         if (r == EMULATE_DONE && ctxt->tf)
>             kvm_vcpu_do_singlestep(vcpu, &r);
> 
> if TF was set, which is wrong.
> 
> You can test this by applying the attached patch, building x86
> selftests, and running syscall_arg_fault_32 in a VM.  It hangs.  It
> should complete successfully, and it does on bare metal.

Ok, this is helpful.  inject_emulated_exception() should return one of
three results: vmexit (currently true), fault (the case that is handled
incorrectly today), or none (currently false).  Thanks!

Paolo
diff mbox series

Patch

commit fae8e860584b5a8c2253b522cb478e92b8b0c281
Author: Andy Lutomirski <luto@kernel.org>
Date:   Fri Jun 28 19:54:34 2019 -0700

    selftests/x86: Test SYSCALL and SYSENTER manually with TF set
    
    Make sure that we exercise both variants of the nasty
    TF-in-compat-syscall case regardless of which vendor's CPU is running
    the tests.
    
    Also change the intentional signal after SYSCALL to use ud2, which
    is a lot more comprehensible.
    
    This crashes the kernel due to an FSGSBASE bug right now.
    
    Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
    Cc: "Bae, Chang Seok" <chang.seok.bae@intel.com>
    Signed-off-by: Andy Lutomirski <luto@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 186520198de7..fa07d526fe39 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,8 +12,9 @@  CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
-			protection_keys test_vdso test_vsyscall mov_ss_trap
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
+			protection_keys test_vdso test_vsyscall mov_ss_trap \
+			syscall_arg_fault
+TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
 TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
index 4e25d38c8bbd..939de3c94976 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -15,9 +15,30 @@ 
 #include <setjmp.h>
 #include <errno.h>
 
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
 /* Our sigaltstack scratch space. */
 static unsigned char altstack_data[SIGSTKSZ];
 
+static unsigned long get_eflags(void)
+{
+	unsigned long eflags;
+	asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+	return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+	asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+		      : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 		       int flags)
 {
@@ -35,13 +56,22 @@  static sigjmp_buf jmpbuf;
 
 static volatile sig_atomic_t n_errs;
 
+#ifdef __x86_64__
+#define REG_AX REG_RAX
+#define REG_IP REG_RIP
+#else
+#define REG_AX REG_EAX
+#define REG_IP REG_EIP
+#endif
+
 static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 {
 	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	long ax = (long)ctx->uc_mcontext.gregs[REG_AX];
 
-	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
-		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
-		       ctx->uc_mcontext.gregs[REG_EAX]);
+	if (ax != -EFAULT && ax != -ENOSYS) {
+		printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
+		       (unsigned long)ax);
 		n_errs++;
 	} else {
 		printf("[OK]\tSeems okay\n");
@@ -50,9 +80,21 @@  static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 	siglongjmp(jmpbuf, 1);
 }
 
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+}
+
 static void sigill(int sig, siginfo_t *info, void *ctx_void)
 {
-	printf("[SKIP]\tIllegal instruction\n");
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+	if (*ip == 0x0b0f) {
+		/* one of the ud2 instructions faulted */
+		printf("[OK]\tSYSCALL returned normally\n");
+	} else {
+		printf("[SKIP]\tIllegal instruction\n");
+	}
 	siglongjmp(jmpbuf, 1);
 }
 
@@ -120,9 +162,46 @@  int main()
 			"movl $-1, %%ebp\n\t"
 			"movl $-1, %%esp\n\t"
 			"syscall\n\t"
-			"pushl $0"	/* make sure we segfault cleanly */
+			"ud2"		/* make sure we recover cleanly */
+			: : : "memory", "flags");
+	}
+
+	printf("[RUN]\tSYSENTER with TF and invalid state\n");
+	sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+	printf("[RUN]\tSYSCALL with TF and invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"ud2"		/* make sure we recover cleanly */
 			: : : "memory", "flags");
 	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
 
 	return 0;
 }