Message ID | 20200714120917.11253-46-joro@8bytes.org (mailing list archive)
---|---
State | New, archived
Series | x86: SEV-ES Guest Support
On Tue, Jul 14, 2020 at 02:08:47PM +0200, Joerg Roedel wrote:

> @@ -489,6 +490,9 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
>  	this_cpu_write(nmi_cr2, read_cr2());
>  nmi_restart:
>
> +	/* Needs to happen before DR7 is accessed */
> +	sev_es_ist_enter(regs);
> +
>  	this_cpu_write(nmi_dr7, local_db_save());
>
>  	nmi_enter();
> @@ -502,6 +506,8 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
>
>  	local_db_restore(this_cpu_read(nmi_dr7));
>
> +	sev_es_ist_exit();
> +
>  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
>  		write_cr2(this_cpu_read(nmi_cr2));
>  	if (this_cpu_dec_return(nmi_state))

I really hate all this #VC stuff :-(

So the above will make the NMI do 4 unconditional extra CALL+RET, a LOAD
(which will potentially miss) and a compare and branch. How's that a win
for normal people?

Can we please turn all these sev_es_*() hooks into something like:

	DECLARE_STATIC_KEY_FALSE(sev_es_enabled_key);

	static __always_inline void sev_es_foo()
	{
		if (static_branch_unlikely(&sev_es_enabled_key))
			__sev_es_foo();
	}

So that normal people will only see an extra NOP?

> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
> index d415368f16ec..2a7cc72db1d5 100644
> --- a/arch/x86/kernel/sev-es.c
> +++ b/arch/x86/kernel/sev-es.c
> @@ -78,6 +78,67 @@ static void __init sev_es_setup_vc_stacks(int cpu)
>  	tss->x86_tss.ist[IST_INDEX_VC] = CEA_ESTACK_TOP(&cea->estacks, VC);
>  }
>
> +static bool on_vc_stack(unsigned long sp)

noinstr or __always_inline

> +{
> +	return ((sp >= __this_cpu_ist_bot_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
> +}
> +
> +/*
> + * This function handles the case when an NMI or an NMI-like exception
> + * like #DB is raised in the #VC exception handler entry code. In this

I've yet to find where you handle the NMI-like cases..

> + * case the IST entry for VC must be adjusted, so that any subsequent VC
> + * exception will not overwrite the stack contents of the interrupted VC
> + * handler.
> + *
> + * The IST entry is adjusted unconditionally so that it can also be
> + * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
> + * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
> + * too early.

Is this comment accurate? I cannot find the patch touching
nmi_enter()/nmi_exit().

> + */
> +void noinstr sev_es_ist_enter(struct pt_regs *regs)
> +{
> +	unsigned long old_ist, new_ist;
> +	unsigned long *p;
> +
> +	if (!sev_es_active())
> +		return;
> +
> +	/* Read old IST entry */
> +	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> +
> +	/* Make room on the IST stack */
> +	if (on_vc_stack(regs->sp))
> +		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
> +	else
> +		new_ist = old_ist - sizeof(old_ist);
> +
> +	/* Store old IST entry */
> +	p = (unsigned long *)new_ist;
> +	*p = old_ist;
> +
> +	/* Set new IST entry */
> +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
> +}
> +
> +void noinstr sev_es_ist_exit(void)
> +{
> +	unsigned long ist;
> +	unsigned long *p;
> +
> +	if (!sev_es_active())
> +		return;
> +
> +	/* Read IST entry */
> +	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> +
> +	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
> +		return;
> +
> +	/* Read back old IST entry and write it to the TSS */
> +	p = (unsigned long *)ist;
> +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
> +}

That's pretty disgusting :-(
On Wed, Jul 15, 2020 at 11:47:02AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 14, 2020 at 02:08:47PM +0200, Joerg Roedel wrote:
>
> 	DECLARE_STATIC_KEY_FALSE(sev_es_enabled_key);
>
> 	static __always_inline void sev_es_foo()
> 	{
> 		if (static_branch_unlikely(&sev_es_enabled_key))
> 			__sev_es_foo();
> 	}
>
> So that normal people will only see an extra NOP?

Yes, that is a good idea, I will use a static key for these cases.

> > +static bool on_vc_stack(unsigned long sp)
>
> noinstr or __always_inline

Will add __always_inline, thanks.

> > +/*
> > + * This function handles the case when an NMI or an NMI-like exception
> > + * like #DB is raised in the #VC exception handler entry code. In this
>
> I've yet to find where you handle the NMI-like cases..

The comment is not 100% accurate anymore, I will update it. Initially
#DB was an NMI-like case, but I figured that with .text.noinstr and the
way the #VC entry code switches stacks, no special #DB handling is
necessary anymore.

> > + * case the IST entry for VC must be adjusted, so that any subsequent VC
> > + * exception will not overwrite the stack contents of the interrupted VC
> > + * handler.
> > + *
> > + * The IST entry is adjusted unconditionally so that it can also be
> > + * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
> > + * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
> > + * too early.
>
> Is this comment accurate? I cannot find the patch touching
> nmi_enter()/nmi_exit().

Right, will update that too. I had the SEV-ES NMI stack adjustment in
nmi_enter()/nmi_exit() first, but needed to move it out because the
possible DR7 access needs the #VC stack to be adjusted already.

> > + */
> > +void noinstr sev_es_ist_enter(struct pt_regs *regs)
> > +{
> > +	unsigned long old_ist, new_ist;
> > +	unsigned long *p;
> > +
> > +	if (!sev_es_active())
> > +		return;
> > +
> > +	/* Read old IST entry */
> > +	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> > +
> > +	/* Make room on the IST stack */
> > +	if (on_vc_stack(regs->sp))
> > +		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
> > +	else
> > +		new_ist = old_ist - sizeof(old_ist);
> > +
> > +	/* Store old IST entry */
> > +	p = (unsigned long *)new_ist;
> > +	*p = old_ist;
> > +
> > +	/* Set new IST entry */
> > +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
> > +}
> > +
> > +void noinstr sev_es_ist_exit(void)
> > +{
> > +	unsigned long ist;
> > +	unsigned long *p;
> > +
> > +	if (!sev_es_active())
> > +		return;
> > +
> > +	/* Read IST entry */
> > +	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> > +
> > +	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
> > +		return;
> > +
> > +	/* Read back old IST entry and write it to the TSS */
> > +	p = (unsigned long *)ist;
> > +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
> > +}
>
> That's pretty disgusting :-(

Yeah, but it's needed because ... IST :(

I am open to suggestions on how to make it less disgusting. Or maybe
you like it more if you think of it as a software implementation of
what hardware should actually do to make IST less painful.

Regards,

	Joerg
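Concretely, the static-key gating agreed on above could look like the
sketch below, following the pattern from Peter's mail. The key name
`sev_es_enable_key` and the out-of-line `__sev_es_ist_enter()` and
`__sev_es_ist_exit()` helpers are assumed here for illustration; they
are not part of the patch as posted.

```c
#include <linux/jump_label.h>
#include <linux/ptrace.h>

/* Defined once in sev-es.c; enabled during boot only if SEV-ES is active. */
DECLARE_STATIC_KEY_FALSE(sev_es_enable_key);

extern void __sev_es_ist_enter(struct pt_regs *regs);
extern void __sev_es_ist_exit(void);

static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
{
	/* Compiles down to a patched-out NOP when the key is disabled. */
	if (static_branch_unlikely(&sev_es_enable_key))
		__sev_es_ist_enter(regs);
}

static __always_inline void sev_es_ist_exit(void)
{
	if (static_branch_unlikely(&sev_es_enable_key))
		__sev_es_ist_exit();
}
```

With wrappers like these, non-SEV-ES machines pay neither the CALL+RET
pairs nor the `sev_es_active()` load that Peter objects to: after a
one-time `static_branch_enable(&sev_es_enable_key)` during SEV-ES
setup, the NMI path on ordinary hardware executes a single NOP per hook.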
On Wed, Jul 15, 2020 at 12:26:53PM +0200, Joerg Roedel wrote:
> On Wed, Jul 15, 2020 at 11:47:02AM +0200, Peter Zijlstra wrote:
> > That's pretty disgusting :-(
>
> Yeah, but it's needed because ... IST :(
>
> I am open to suggestions on how to make it less disgusting. Or maybe
> you like it more if you think of it as a software implementation of
> what hardware should actually do to make IST less painful.

No bright ideas, sadly..
```diff
diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
index 824e9e6b067c..330140a189be 100644
--- a/arch/x86/include/asm/sev-es.h
+++ b/arch/x86/include/asm/sev-es.h
@@ -77,4 +77,12 @@ static inline u64 lower_bits(u64 val, unsigned int bits)
 extern void vc_no_ghcb(void);
 extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern void sev_es_ist_enter(struct pt_regs *regs);
+extern void sev_es_ist_exit(void);
+#else
+static inline void sev_es_ist_enter(struct pt_regs *regs) { }
+static inline void sev_es_ist_exit(void) { }
+#endif
+
 #endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d7c5e44b26f7..d94a5bb0bebc 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,6 +33,7 @@
 #include <asm/reboot.h>
 #include <asm/cache.h>
 #include <asm/nospec-branch.h>
+#include <asm/sev-es.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -489,6 +490,9 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	this_cpu_write(nmi_cr2, read_cr2());
 nmi_restart:
 
+	/* Needs to happen before DR7 is accessed */
+	sev_es_ist_enter(regs);
+
 	this_cpu_write(nmi_dr7, local_db_save());
 
 	nmi_enter();
@@ -502,6 +506,8 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	local_db_restore(this_cpu_read(nmi_dr7));
 
+	sev_es_ist_exit();
+
 	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
 		write_cr2(this_cpu_read(nmi_cr2));
 	if (this_cpu_dec_return(nmi_state))
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index d415368f16ec..2a7cc72db1d5 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -78,6 +78,67 @@ static void __init sev_es_setup_vc_stacks(int cpu)
 	tss->x86_tss.ist[IST_INDEX_VC] = CEA_ESTACK_TOP(&cea->estacks, VC);
 }
 
+static bool on_vc_stack(unsigned long sp)
+{
+	return ((sp >= __this_cpu_ist_bot_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI or an NMI-like exception
+ * like #DB is raised in the #VC exception handler entry code. In this
+ * case the IST entry for VC must be adjusted, so that any subsequent VC
+ * exception will not overwrite the stack contents of the interrupted VC
+ * handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can also be
+ * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
+ * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
+ * too early.
+ */
+void noinstr sev_es_ist_enter(struct pt_regs *regs)
+{
+	unsigned long old_ist, new_ist;
+	unsigned long *p;
+
+	if (!sev_es_active())
+		return;
+
+	/* Read old IST entry */
+	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+	/* Make room on the IST stack */
+	if (on_vc_stack(regs->sp))
+		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
+	else
+		new_ist = old_ist - sizeof(old_ist);
+
+	/* Store old IST entry */
+	p = (unsigned long *)new_ist;
+	*p = old_ist;
+
+	/* Set new IST entry */
+	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
+
+void noinstr sev_es_ist_exit(void)
+{
+	unsigned long ist;
+	unsigned long *p;
+
+	if (!sev_es_active())
+		return;
+
+	/* Read IST entry */
+	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+		return;
+
+	/* Read back old IST entry and write it to the TSS */
+	p = (unsigned long *)ist;
+	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
+}
+
 /* Needed in vc_early_forward_exception */
 void do_early_exception(struct pt_regs *regs, int trapnr);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 907ac2b378a8..59d17e541df9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,6 +59,7 @@
 #include <asm/umip.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
+#include <asm/sev-es.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -733,6 +734,7 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
 
 static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
 {
+	/*
 	 * Disable breakpoints during exception handling; recursive exceptions
 	 * are exceedingly 'fun'.
```
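To make the enter/exit pairing above easier to reason about, here is a
small stand-alone model of the mechanism (a sketch, not kernel code):
the TSS IST slot becomes a plain variable, and each enter pushes the
previous slot value just below the new stack top, forming a linked list
that exit pops in LIFO order. All names here are made up for
illustration.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) & ~((uintptr_t)(a) - 1))

static uint64_t vc_stack[512];	/* stands in for the per-CPU #VC IST stack */
static uintptr_t vc_ist;	/* stands in for cpu_tss_rw.x86_tss.ist[IST_INDEX_VC] */

static int on_vc_stack(uintptr_t sp)
{
	return sp >= (uintptr_t)&vc_stack[0] && sp < (uintptr_t)&vc_stack[512];
}

static void ist_enter(uintptr_t interrupted_sp)
{
	uintptr_t old_ist = vc_ist, new_ist;

	/* Make room below the interrupted frame if it lives on the #VC stack */
	if (on_vc_stack(interrupted_sp))
		new_ist = ALIGN_DOWN(interrupted_sp, 8) - sizeof(old_ist);
	else
		new_ist = old_ist - sizeof(old_ist);

	*(uintptr_t *)new_ist = old_ist;	/* push the old slot value */
	vc_ist = new_ist;
}

static void ist_exit(void)
{
	vc_ist = *(uintptr_t *)vc_ist;		/* pop it back */
}

int main(void)
{
	uintptr_t top = (uintptr_t)&vc_stack[512];

	vc_ist = top;
	ist_enter((uintptr_t)&top);	/* NMI while off the #VC stack */
	ist_enter(vc_ist - 64);		/* nested NMI while on the #VC stack */
	ist_exit();
	ist_exit();
	assert(vc_ist == top);		/* balanced pairs restore the slot */
	puts("IST slot restored");
	return 0;
}
```

This also shows why the adjustment is unconditional: exit blindly
dereferences the current slot value, so every enter must have pushed a
value there, whether or not the interrupted context was on the #VC
stack.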