
[v2,12/12] x86/kvm/emulate: Avoid RET for fastops

Message ID 20241111125219.361243118@infradead.org (mailing list archive)
State New
Series x86/kvm/emulate: Avoid RET for FASTOPs

Commit Message

Peter Zijlstra Nov. 11, 2024, 11:59 a.m. UTC
Since there is only a single fastop() function, convert the FASTOP
stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return
thunks and all that jazz.

Specifically FASTOPs rely on the return thunk to preserve EFLAGS,
which not all of them can trivially do (call depth tracking suffers
here).
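
For illustration, a sketch of what this means for a generated fastop
stub; the stub name and body here are representative only, the actual
change is the __FOP_RET hunk below:

	/* before: each stub exits through RET, i.e. the return thunk */
	fop_add:
		endbr64				/* with IBT enabled */
		add %edx, %eax
	11:	ret				/* becomes jmp __x86_return_thunk */
		int3

	/* after: each stub jumps straight back into the single fastop() */
	fop_add:
		endbr64
		add %edx, %eax
	11:	jmp fastop_return
		int3				/* stop straight-line speculation */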

Objtool strenuously complains about this:

 - indirect call without a .rodata, fails to determine JUMP_TABLE,
   annotate
 - fastop functions fall through, exception
 - unreachable instruction after fastop_return, save/restore

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kvm/emulate.c              |   20 +++++++++++++++-----
 include/linux/objtool_types.h       |    1 +
 tools/include/linux/objtool_types.h |    1 +
 tools/objtool/check.c               |   11 ++++++++++-
 4 files changed, 27 insertions(+), 6 deletions(-)

Comments

Peter Zijlstra Nov. 11, 2024, 4:27 p.m. UTC | #1
On Mon, Nov 11, 2024 at 12:59:47PM +0100, Peter Zijlstra wrote:

> +/*
> + * All the FASTOP magic above relies on there being *one* instance of this
> + * so it can JMP back, avoiding RET and its various thunks.
> + */
> +static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
>  {
>  	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
>  
>  	if (!(ctxt->d & ByteOp))
>  		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
>  
> -	asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
> +	asm("push %[flags]; popf \n\t"
> +	    UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
> +	    ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
> +	    JMP_NOSPEC
> +	    "fastop_return: \n\t"
> +	    UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
> +	    "pushf; pop %[flags]\n"
>  	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
>  	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
>  	    : "c"(ctxt->src2.val));

So Andrew is telling me the compiler is free to mess this up... Notably:

  https://github.com/llvm/llvm-project/issues/92161

In light of that, I wrote the below hack. It makes objtool sad (it
doesn't like STT_FUNC calling STT_NOTYPE), but it should work if we ever
run into the compiler being daft like that (it should fail to compile
because of the duplicate fastop_return label, so it's not a silent
failure).
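
To see why it's loud rather than silent, consider a hypothetical
reduction (not from this series) of the same pattern, a global label
defined inside an asm statement:

	static inline int op(int x)
	{
		/* every inlined copy re-defines the label ... */
		asm("demo_return: add $1, %0" : "+r"(x));
		return x;
	}

	int user(int a, int b)
	{
		/* ... so a second expansion fails to assemble:
		 * "Error: symbol `demo_return' is already defined"
		 */
		return op(a) + op(b);
	}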

Wear protective eye gear before continuing...

---
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -429,9 +429,9 @@ static inline void call_depth_return_thu
 
 #ifdef CONFIG_X86_64
 
-#define __CS_PREFIX						\
+#define __CS_PREFIX(reg)					\
 	".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n"		\
-	".ifc %V[thunk_target],\\rs\n"				\
+	".ifc " reg ",\\rs\n"					\
 	".byte 0x2e\n"						\
 	".endif\n"						\
 	".endr\n"
@@ -441,12 +441,12 @@ static inline void call_depth_return_thu
  * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
  */
 # define CALL_NOSPEC						\
-	__CS_PREFIX						\
+	__CS_PREFIX("%V[thunk_target]")				\
 	"call __x86_indirect_thunk_%V[thunk_target]\n"
 
-# define JMP_NOSPEC						\
-	__CS_PREFIX						\
-	"jmp __x86_indirect_thunk_%V[thunk_target]\n"
+# define __JMP_NOSPEC(reg)					\
+	__CS_PREFIX(reg)					\
+	"jmp __x86_indirect_thunk_" reg "\n"
 
 # define THUNK_TARGET(addr) [thunk_target] "r" (addr)
 
@@ -478,10 +478,10 @@ static inline void call_depth_return_thu
 	"call *%[thunk_target]\n",				\
 	X86_FEATURE_RETPOLINE_LFENCE)
 
-# define JMP_NOSPEC						\
+# define __JMP_NOSPEC(reg)					\
 	ALTERNATIVE_2(						\
 	ANNOTATE_RETPOLINE_SAFE					\
-	"jmp *%[thunk_target]\n",				\
+	"jmp *%%" reg "\n",					\
 	"       jmp    901f;\n"					\
 	"       .align 16\n"					\
 	"901:	call   903f;\n"					\
@@ -490,22 +490,25 @@ static inline void call_depth_return_thu
 	"       jmp    902b;\n"					\
 	"       .align 16\n"					\
 	"903:	lea    4(%%esp), %%esp;\n"			\
-	"       pushl  %[thunk_target];\n"			\
+	"       pushl  %%" reg "\n"				\
 	"       ret;\n",					\
 	X86_FEATURE_RETPOLINE,					\
 	"lfence;\n"						\
 	ANNOTATE_RETPOLINE_SAFE					\
-	"jmp *%[thunk_target]\n",				\
+	"jmp *%%" reg "\n",					\
 	X86_FEATURE_RETPOLINE_LFENCE)
 
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #endif
+
 #else /* No retpoline for C / inline asm */
 # define CALL_NOSPEC "call *%[thunk_target]\n"
-# define JMP_NOSPEC "jmp *%[thunk_target]\n"
+# define __JMP_NOSPEC(reg) "jmp *%%" reg "\n"
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #endif
 
+# define JMP_NOSPEC __JMP_NOSPEC("%V[thunk_target]")
+
 /* The Spectre V2 mitigation variants */
 enum spectre_v2_mitigation {
 	SPECTRE_V2_NONE,
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5039,23 +5039,45 @@ static void fetch_possible_mmx_operand(s
 }
 
 /*
+ * Stub written in asm in order to ensure GCC doesn't duplicate the
+ * fastop_return: label.
+ *
+ * Custom calling convention.
+ *
+ * __fastop:
+ * ax = ctxt->dst.val
+ * dx = ctxt->src.val
+ * cx = ctxt->src2.val
+ * di = flags
+ * si = fop
+ */
+asm (ASM_FUNC_ALIGN
+     "__fastop: \n\t"
+     "push %" _ASM_DI "\n\t"
+     "popf \n\t"
+     UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
+     ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
+     __JMP_NOSPEC(_ASM_SI)
+     "fastop_return: \n\t"
+     UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
+     "pushf \n\t"
+     "pop %" _ASM_DI "\n\t"
+     ASM_RET
+     ".type __fastop, @notype \n\t"
+     ".size __fastop, . - __fastop \n\t");
+
+/*
  * All the FASTOP magic above relies on there being *one* instance of this
  * so it can JMP back, avoiding RET and its various thunks.
  */
-static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
 
 	if (!(ctxt->d & ByteOp))
 		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 
-	asm("push %[flags]; popf \n\t"
-	    UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
-	    ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
-	    JMP_NOSPEC
-	    "fastop_return: \n\t"
-	    UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
-	    "pushf; pop %[flags]\n"
+	asm("call __fastop"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
 	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
 	    : "c"(ctxt->src2.val));
Sean Christopherson Nov. 11, 2024, 5:26 p.m. UTC | #2
KVM: x86:

On Mon, Nov 11, 2024, Peter Zijlstra wrote:
> Since there is only a single fastop() function, convert the FASTOP
> stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return
> thunks and all that jazz.
> 
> Specifically FASTOPs rely on the return thunk to preserve EFLAGS,
> which not all of them can trivially do (call depth tracking suffers
> here).

Maybe add an example?  Mostly as a reminder of how to reproduce the call depth
issues.

  E.g. booting with "retbleed=force,stuff spectre_v2=retpoline,generic" causes
  KVM-Unit-Test's "emulator" test to fail due to flags being clobbered.
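
  (For background, a simplified sketch rather than the exact listing: the
  call-depth-tracking return thunk in arch/x86/lib/retpoline.S updates a
  per-CPU depth counter with a shift before deciding whether to stuff the
  RSB, and that shift rewrites EFLAGS, which is exactly what the fastops
  were relying on RET to preserve:

	__x86_return_skl:			/* simplified */
		/* shlq updates ZF and friends => caller flags are gone */
		shlq	$5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
		jz	1f			/* depth exhausted */
		ret
		int3
	1:	/* ... stuff the RSB, then ret ... */
  )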

> Objtool strenuously complains about this:
> 
>  - indirect call without a .rodata, fails to determine JUMP_TABLE,
>    annotate
>  - fastop functions fall through, exception
>  - unreachable instruction after fastop_return, save/restore
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

The original patch works, but with the fixup KVM fails emulation of an ADC and
generates:

  ------------[ cut here ]------------
  Unpatched return thunk in use. This should not happen!
  WARNING: CPU: 4 PID: 1452 at arch/x86/kernel/cpu/bugs.c:3063 __warn_thunk+0x26/0x30
  Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel kvm
  CPU: 4 UID: 1000 PID: 1452 Comm: qemu Not tainted 6.12.0-rc5-22582d7d68a6-rev/fastops-miti #11
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:__warn_thunk+0x26/0x30
  Code: 5e ff 7e 05 0f 1f 44 00 00 80 3d 5d 06 5c 01 00 74 05 e9 bd 7d a0 00 48 c7 c7 10 4a 13 82 c6 05 48 06 5c 01 01 e8 31 48 04 00 <0f> 0b e9 a3 7d a0 00 cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90
  RSP: 0018:ffffc90001743c78 EFLAGS: 00010287
  RAX: 0000000000000000 RBX: ffff88811877a040 RCX: 0000000000000027
  RDX: ffff88846f91b208 RSI: 0000000000000001 RDI: ffff88846f91b200
  RBP: ffffc90001743cc8 R08: ffffffff825195c8 R09: 0000000000000003
  R10: ffffffff824395e0 R11: ffffffff824e95e0 R12: 0000000000000000
  R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
  FS:  00007f3027400700(0000) GS:ffff88846f900000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f3029ba3001 CR3: 000000010cdc0000 CR4: 0000000000352eb0
  Call Trace:
   <TASK>
   ? __warn+0x85/0x120
   ? __warn_thunk+0x26/0x30
   ? report_bug+0x17d/0x190
   ? handle_bug+0x53/0x90
   ? exc_invalid_op+0x14/0x70
   ? asm_exc_invalid_op+0x16/0x20
   ? __warn_thunk+0x26/0x30
   ? __warn_thunk+0x26/0x30
   warn_thunk_thunk+0x16/0x30
   ? __kvm_mmu_topup_memory_cache+0x57/0x150 [kvm]
   init_emulate_ctxt+0xae/0x110 [kvm]
   x86_decode_emulated_instruction+0x25/0x80 [kvm]
   x86_emulate_instruction+0x2cb/0x6f0 [kvm]
   vmx_handle_exit+0x394/0x6e0 [kvm_intel]
   kvm_arch_vcpu_ioctl_run+0xf40/0x1db0 [kvm]
   kvm_vcpu_ioctl+0x2e9/0x870 [kvm]
   ? futex_wake+0x81/0x180
   ? call_depth_return_thunk+0x6c/0x90
   ? call_depth_return_thunk+0x66/0x90
   ? call_depth_return_thunk+0x60/0x90
   ? call_depth_return_thunk+0x5a/0x90
   __x64_sys_ioctl+0x8a/0xc0
   do_syscall_64+0x5b/0x170
   entry_SYSCALL_64_after_hwframe+0x71/0x79
  RIP: 0033:0x7f30290cedeb
  Code: 0f 92 c0 84 c0 75 b0 49 8d 3c 1c e8 ff 47 04 00 85 c0 78 b1 48 83 c4 08 4c 89 e0 5b 41 5c c3 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a0 ff ff ff f7 d8 64 89 01 48
  RSP: 002b:00007f30273ff748 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  RAX: ffffffffffffffda RBX: 000000000000ae80 RCX: 00007f30290cedeb
  RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000e
  RBP: 0000555587e2f1e0 R08: 00007f302923fc10 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
  R13: 00007f3029b90780 R14: 00007ffea5ab9640 R15: 00007f30273ffa00
   </TASK>
  ---[ end trace 0000000000000000 ]---
Peter Zijlstra Nov. 11, 2024, 6:28 p.m. UTC | #3
On Mon, Nov 11, 2024 at 09:26:44AM -0800, Sean Christopherson wrote:
> KVM: x86:
> 
> On Mon, Nov 11, 2024, Peter Zijlstra wrote:
> > Since there is only a single fastop() function, convert the FASTOP
> > stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return
> > thunks and all that jazz.
> > 
> > Specifically FASTOPs rely on the return thunk to preserve EFLAGS,
> > which not all of them can trivially do (call depth tracking suffers
> > here).
> 
> Maybe add an example?  Mostly as a reminder of how to reproduce the call depth
> issues.
> 
>   E.g. booting with "retbleed=force,stuff spectre_v2=retpoline,generic" causes
>   KVM-Unit-Test's "emulator" test to fail due to flags being clobbered.
> 
> > Objtool strenuously complains about this:
> > 
> >  - indirect call without a .rodata, fails to determine JUMP_TABLE,
> >    annotate
> >  - fastop functions fall through, exception
> >  - unreachable instruction after fastop_return, save/restore
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> The original patch works, but with the fixup KVM fails emulation of an ADC and
> generates:

Bah, I'll go chase it down. Thanks!

Patch

--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -285,8 +285,8 @@  static void invalidate_registers(struct
  * different operand sizes can be reached by calculation, rather than a jump
  * table (which would be bigger than the code).
  *
- * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR
- * and 1 for the straight line speculation INT3, leaves 7 bytes for the
+ * The 16 byte alignment, considering 5 bytes for the JMP, 4 for ENDBR
+ * and 1 for the straight line speculation INT3, leaves 6 bytes for the
  * body of the function.  Currently none is larger than 4.
  */
 static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
@@ -304,7 +304,7 @@  static int fastop(struct x86_emulate_ctx
 	__FOP_FUNC(#name)
 
 #define __FOP_RET(name) \
-	"11: " ASM_RET \
+	"11: jmp fastop_return; int3 \n\t" \
 	".size " name ", .-" name "\n\t"
 
 #define FOP_RET(name) \
@@ -5071,14 +5071,24 @@  static void fetch_possible_mmx_operand(s
 		kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
 }
 
-static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+/*
+ * All the FASTOP magic above relies on there being *one* instance of this
+ * so it can JMP back, avoiding RET and its various thunks.
+ */
+static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
 
 	if (!(ctxt->d & ByteOp))
 		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 
-	asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
+	asm("push %[flags]; popf \n\t"
+	    UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
+	    ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
+	    JMP_NOSPEC
+	    "fastop_return: \n\t"
+	    UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
+	    "pushf; pop %[flags]\n"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
 	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
 	    : "c"(ctxt->src2.val));
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -64,5 +64,6 @@  struct unwind_hint {
 #define ANNOTYPE_UNRET_BEGIN		5
 #define ANNOTYPE_IGNORE_ALTS		6
 #define ANNOTYPE_INTRA_FUNCTION_CALLS	7
+#define ANNOTYPE_JUMP_TABLE		8
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
--- a/tools/include/linux/objtool_types.h
+++ b/tools/include/linux/objtool_types.h
@@ -64,5 +64,6 @@  struct unwind_hint {
 #define ANNOTYPE_UNRET_BEGIN		5
 #define ANNOTYPE_IGNORE_ALTS		6
 #define ANNOTYPE_INTRA_FUNCTION_CALLS	7
+#define ANNOTYPE_JUMP_TABLE		8
 
 #endif /* _LINUX_OBJTOOL_TYPES_H */
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2386,6 +2386,14 @@  static int __annotate_late(struct objtoo
 		insn->unret = 1;
 		break;
 
+	/*
+	 * Must be after add_jump_table(); for it doesn't set a sane
+	 * _jump_table value.
+	 */
+	case ANNOTYPE_JUMP_TABLE:
+		insn->_jump_table = (void *)1;
+		break;
+
 	default:
 		break;
 	}
@@ -3459,7 +3467,8 @@  static int validate_branch(struct objtoo
 		if (func && insn_func(insn) && func != insn_func(insn)->pfunc) {
 			/* Ignore KCFI type preambles, which always fall through */
 			if (!strncmp(func->name, "__cfi_", 6) ||
-			    !strncmp(func->name, "__pfx_", 6))
+			    !strncmp(func->name, "__pfx_", 6) ||
+			    !strcmp(insn_func(insn)->name, "fastop"))
 				return 0;
 
 			WARN("%s() falls through to next function %s()",