Message ID | 20241111125219.361243118@infradead.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | x86/kvm/emulate: Avoid RET for FASTOPs | expand |
On Mon, Nov 11, 2024 at 12:59:47PM +0100, Peter Zijlstra wrote: > +/* > + * All the FASTOP magic above relies on there being *one* instance of this > + * so it can JMP back, avoiding RET and it's various thunks. > + */ > +static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) > { > ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; > > if (!(ctxt->d & ByteOp)) > fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; > > - asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" > + asm("push %[flags]; popf \n\t" > + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) > + ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE) > + JMP_NOSPEC > + "fastop_return: \n\t" > + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) > + "pushf; pop %[flags]\n" > : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), > [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT > : "c"(ctxt->src2.val)); So Andrew is telling me the compiler is free to mess this up... Notably: https://github.com/llvm/llvm-project/issues/92161 In lieu of that, I wrote the below hack. It makes objtool sad (it don't like STT_FUNC calling STT_NOTYPE), but it should work if we ever run into the compiler being daft like that (it should fail to compile because of the duplicate fastop_return label, so it's not a silent failure). Wear protective eye gear before continuing... --- --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -429,9 +429,9 @@ static inline void call_depth_return_thu #ifdef CONFIG_X86_64 -#define __CS_PREFIX \ +#define __CS_PREFIX(reg) \ ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ - ".ifc %V[thunk_target],\\rs\n" \ + ".ifc " reg ",\\rs\n" \ ".byte 0x2e\n" \ ".endif\n" \ ".endr\n" @@ -441,12 +441,12 @@ static inline void call_depth_return_thu * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ # define CALL_NOSPEC \ - __CS_PREFIX \ + __CS_PREFIX("%V[thunk_target]") \ "call __x86_indirect_thunk_%V[thunk_target]\n" -# define JMP_NOSPEC \ - __CS_PREFIX \ - "jmp __x86_indirect_thunk_%V[thunk_target]\n" +# define __JMP_NOSPEC(reg) \ + __CS_PREFIX(reg) \ + "jmp __x86_indirect_thunk_" reg "\n" # define THUNK_TARGET(addr) [thunk_target] "r" (addr) @@ -478,10 +478,10 @@ static inline void call_depth_return_thu "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_LFENCE) -# define JMP_NOSPEC \ +# define __JMP_NOSPEC(reg) \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ - "jmp *%[thunk_target]\n", \ + "jmp *%%" reg "\n", \ " jmp 901f;\n" \ " .align 16\n" \ "901: call 903f;\n" \ @@ -490,22 +490,25 @@ static inline void call_depth_return_thu " jmp 902b;\n" \ " .align 16\n" \ "903: lea 4(%%esp), %%esp;\n" \ - " pushl %[thunk_target];\n" \ + " pushl %%" reg "\n" \ " ret;\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ - "jmp *%[thunk_target]\n", \ + "jmp *%%" reg "\n", \ X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif + #else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" -# define JMP_NOSPEC "jmp *%[thunk_target]\n" +# define __JMP_NOSPEC(reg) "jmp *%%" reg "\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +# define JMP_NOSPEC __JMP_NOSPEC("%V[thunk_target]") + /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5039,23 +5039,45 @@ static void fetch_possible_mmx_operand(s } /* + * Stub written in asm in order to ensure GCC doesn't duplicate the + * fastop_return: label. + * + * Custom calling convention. 
+ * + * __fastop: + * ax = ctxt->dst.val + * dx = ctxt->src.val + * cx = ctxt->src2.val + * di = flags + * si = fop + */ +asm (ASM_FUNC_ALIGN + "__fastop: \n\t" + "push %" _ASM_DI "\n\t" + "popf \n\t" + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) + ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE) + __JMP_NOSPEC(_ASM_SI) + "fastop_return: \n\t" + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) + "pushf \n\t" + "pop %" _ASM_DI "\n\t" + ASM_RET + ".type __fastop, @notype \n\t" + ".size __fastop, . - __fastop \n\t"); + +/* * All the FASTOP magic above relies on there being *one* instance of this * so it can JMP back, avoiding RET and it's various thunks. */ -static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf \n\t" - UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) - ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE) - JMP_NOSPEC - "fastop_return: \n\t" - UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) - "pushf; pop %[flags]\n" + asm("call __fastop" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val));
KVM: x86: On Mon, Nov 11, 2024, Peter Zijlstra wrote: > Since there is only a single fastop() function, convert the FASTOP > stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return > thunks and all that jazz. > > Specifically FASTOPs rely on the return thunk to preserve EFLAGS, > which not all of them can trivially do (call depth tracing suffers > here). Maybe add an example? Mostly as a reminder of how to reproduce the call depth issues. E.g. booting with "retbleed=force,stuff spectre_v2=retpoline,generic" causes KVM-Unit-Test's "emulator" test to fail due to flags being clobbered. > Objtool strenuously complains about this: > > - indirect call without a .rodata, fails to determine JUMP_TABLE, > annotate > - fastop functions fall through, exception > - unreachable instruction after fastop_return, save/restore > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> The original patch works, but with the fixup KVM fails emulation of an ADC and generates: ------------[ cut here ]------------ Unpatched return thunk in use. This should not happen! 
WARNING: CPU: 4 PID: 1452 at arch/x86/kernel/cpu/bugs.c:3063 __warn_thunk+0x26/0x30 Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel kvm CPU: 4 UID: 1000 PID: 1452 Comm: qemu Not tainted 6.12.0-rc5-22582d7d68a6-rev/fastops-miti #11 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:__warn_thunk+0x26/0x30 Code: 5e ff 7e 05 0f 1f 44 00 00 80 3d 5d 06 5c 01 00 74 05 e9 bd 7d a0 00 48 c7 c7 10 4a 13 82 c6 05 48 06 5c 01 01 e8 31 48 04 00 <0f> 0b e9 a3 7d a0 00 cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffc90001743c78 EFLAGS: 00010287 RAX: 0000000000000000 RBX: ffff88811877a040 RCX: 0000000000000027 RDX: ffff88846f91b208 RSI: 0000000000000001 RDI: ffff88846f91b200 RBP: ffffc90001743cc8 R08: ffffffff825195c8 R09: 0000000000000003 R10: ffffffff824395e0 R11: ffffffff824e95e0 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f3027400700(0000) GS:ffff88846f900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3029ba3001 CR3: 000000010cdc0000 CR4: 0000000000352eb0 Call Trace: <TASK> ? __warn+0x85/0x120 ? __warn_thunk+0x26/0x30 ? report_bug+0x17d/0x190 ? handle_bug+0x53/0x90 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? __warn_thunk+0x26/0x30 ? __warn_thunk+0x26/0x30 warn_thunk_thunk+0x16/0x30 ? __kvm_mmu_topup_memory_cache+0x57/0x150 [kvm] init_emulate_ctxt+0xae/0x110 [kvm] x86_decode_emulated_instruction+0x25/0x80 [kvm] x86_emulate_instruction+0x2cb/0x6f0 [kvm] vmx_handle_exit+0x394/0x6e0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xf40/0x1db0 [kvm] kvm_vcpu_ioctl+0x2e9/0x870 [kvm] ? futex_wake+0x81/0x180 ? call_depth_return_thunk+0x6c/0x90 ? call_depth_return_thunk+0x66/0x90 ? call_depth_return_thunk+0x60/0x90 ? 
call_depth_return_thunk+0x5a/0x90 __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x5b/0x170 entry_SYSCALL_64_after_hwframe+0x71/0x79 RIP: 0033:0x7f30290cedeb Code: 0f 92 c0 84 c0 75 b0 49 8d 3c 1c e8 ff 47 04 00 85 c0 78 b1 48 83 c4 08 4c 89 e0 5b 41 5c c3 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f30273ff748 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 000000000000ae80 RCX: 00007f30290cedeb RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000e RBP: 0000555587e2f1e0 R08: 00007f302923fc10 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f3029b90780 R14: 00007ffea5ab9640 R15: 00007f30273ffa00 </TASK> ---[ end trace 0000000000000000 ]---
On Mon, Nov 11, 2024 at 09:26:44AM -0800, Sean Christopherson wrote: > KVM: x86: > > On Mon, Nov 11, 2024, Peter Zijlstra wrote: > > Since there is only a single fastop() function, convert the FASTOP > > stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return > > thunks and all that jazz. > > > > Specifically FASTOPs rely on the return thunk to preserve EFLAGS, > > which not all of them can trivially do (call depth tracing suffers > > here). > > Maybe add an example? Mostly as a reminder of how to reproduce the call depth > issues. > > E.g. booting with "retbleed=force,stuff spectre_v2=retpoline,generic" causes > KVM-Unit-Test's "emulator" test to fail due to flags being clobbered. > > > Objtool strenuously complains about this: > > > > - indirect call without a .rodata, fails to determine JUMP_TABLE, > > annotate > > - fastop functions fall through, exception > > - unreachable instruction after fastop_return, save/restore > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > The original patch works, but with the fixup KVM fails emulation of an ADC and > generates: Bah, I'll go chase it down. Thanks!
--- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -285,8 +285,8 @@ static void invalidate_registers(struct * different operand sizes can be reached by calculation, rather than a jump * table (which would be bigger than the code). * - * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR - * and 1 for the straight line speculation INT3, leaves 7 bytes for the + * The 16 byte alignment, considering 5 bytes for the JMP, 4 for ENDBR + * and 1 for the straight line speculation INT3, leaves 6 bytes for the * body of the function. Currently none is larger than 4. */ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); @@ -304,7 +304,7 @@ static int fastop(struct x86_emulate_ctx __FOP_FUNC(#name) #define __FOP_RET(name) \ - "11: " ASM_RET \ + "11: jmp fastop_return; int3 \n\t" \ ".size " name ", .-" name "\n\t" #define FOP_RET(name) \ @@ -5071,14 +5071,24 @@ static void fetch_possible_mmx_operand(s kvm_read_mmx_reg(op->addr.mm, &op->mm_val); } -static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) +/* + * All the FASTOP magic above relies on there being *one* instance of this + * so it can JMP back, avoiding RET and it's various thunks. 
+ */ +static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" + asm("push %[flags]; popf \n\t" + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) + ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE) + JMP_NOSPEC + "fastop_return: \n\t" + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) + "pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALLS 7 +#define ANNOTYPE_JUMP_TABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALLS 7 +#define ANNOTYPE_JUMP_TABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2386,6 +2386,14 @@ static int __annotate_late(struct objtoo insn->unret = 1; break; + /* + * Must be after add_jump_table(); for it doesn't set a sane + * _jump_table value. + */ + case ANNOTYPE_JUMP_TABLE: + insn->_jump_table = (void *)1; + break; + default: break; } @@ -3459,7 +3467,8 @@ static int validate_branch(struct objtoo if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strcmp(insn_func(insn)->name, "fastop")) return 0; WARN("%s() falls through to next function %s()",
Since there is only a single fastop() function, convert the FASTOP stuff from CALL_NOSPEC+RET to JMP_NOSPEC+JMP, avoiding the return thunks and all that jazz. Specifically FASTOPs rely on the return thunk to preserve EFLAGS, which not all of them can trivially do (call depth tracing suffers here). Objtool strenuously complains about this: - indirect call without a .rodata, fails to determine JUMP_TABLE, annotate - fastop functions fall through, exception - unreachable instruction after fastop_return, save/restore Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- arch/x86/kvm/emulate.c | 20 +++++++++++++++----- include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 11 ++++++++++- 4 files changed, 27 insertions(+), 6 deletions(-)