[v4,45/75] x86/sev-es: Adjust #VC IST Stack on entering NMI handler

Message ID 20200714120917.11253-46-joro@8bytes.org (mailing list archive)
State New, archived
Series x86: SEV-ES Guest Support

Commit Message

Joerg Roedel July 14, 2020, 12:08 p.m. UTC
From: Joerg Roedel <jroedel@suse.de>

When an NMI hits in the #VC handler entry code before it has switched to
another stack, any subsequent #VC exception in the NMI code-path will
overwrite the interrupted #VC handler's stack.

Make sure this doesn't happen by explicitly adjusting the #VC IST entry
in the NMI handler for the time it can cause #VC exceptions.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/include/asm/sev-es.h |  8 +++++
 arch/x86/kernel/nmi.c         |  6 ++++
 arch/x86/kernel/sev-es.c      | 61 +++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c       |  2 ++
 4 files changed, 77 insertions(+)

Comments

Peter Zijlstra July 15, 2020, 9:47 a.m. UTC | #1
On Tue, Jul 14, 2020 at 02:08:47PM +0200, Joerg Roedel wrote:

> @@ -489,6 +490,9 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
>  	this_cpu_write(nmi_cr2, read_cr2());
>  nmi_restart:
>  
> +	/* Needs to happen before DR7 is accessed */
> +	sev_es_ist_enter(regs);
> +
>  	this_cpu_write(nmi_dr7, local_db_save());
>  
>  	nmi_enter();
> @@ -502,6 +506,8 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
>  
>  	local_db_restore(this_cpu_read(nmi_dr7));
>  
> +	sev_es_ist_exit();
> +
>  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
>  		write_cr2(this_cpu_read(nmi_cr2));
>  	if (this_cpu_dec_return(nmi_state))

I really hate all this #VC stuff :-(

So the above will make the NMI do 4 unconditional extra CALL+RET, a LOAD
(which will potentially miss) and a compare and branch.

How's that a win for normal people? Can we please turn all these
sev_es_*() hooks into something like:

DECLARE_STATIC_KEY_FALSE(sev_es_enabled_key);

static __always_inline void sev_es_foo()
{
	if (static_branch_unlikely(&sev_es_enabled_key))
		__sev_es_foo();
}

So that normal people will only see an extra NOP?
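
Concretely, for the two hooks added here that could look something like
the below (just a sketch; the __sev_es_ist_*() out-of-line names are an
assumption, not code from this series):

DECLARE_STATIC_KEY_FALSE(sev_es_enabled_key);

/* Out-of-line implementations, only reached when SEV-ES is active */
extern void __sev_es_ist_enter(struct pt_regs *regs);
extern void __sev_es_ist_exit(void);

static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
{
	if (static_branch_unlikely(&sev_es_enabled_key))
		__sev_es_ist_enter(regs);
}

static __always_inline void sev_es_ist_exit(void)
{
	if (static_branch_unlikely(&sev_es_enabled_key))
		__sev_es_ist_exit();
}

The key would then be flipped once at boot, e.g. with
static_branch_enable(&sev_es_enabled_key), when SEV-ES is detected.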

> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
> index d415368f16ec..2a7cc72db1d5 100644
> --- a/arch/x86/kernel/sev-es.c
> +++ b/arch/x86/kernel/sev-es.c
> @@ -78,6 +78,67 @@ static void __init sev_es_setup_vc_stacks(int cpu)
>  	tss->x86_tss.ist[IST_INDEX_VC] = CEA_ESTACK_TOP(&cea->estacks, VC);
>  }
>  
> +static bool on_vc_stack(unsigned long sp)

noinstr or __always_inline

> +{
> +	return ((sp >= __this_cpu_ist_bot_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
> +}
> +
> +/*
> + * This function handles the case when an NMI or an NMI-like exception
> + * like #DB is raised in the #VC exception handler entry code. In this

I've yet to find you handle the NMI-like cases..

> + * case the IST entry for VC must be adjusted, so that any subsequent VC
> + * exception will not overwrite the stack contents of the interrupted VC
> + * handler.
> + *
> + * The IST entry is adjusted unconditionally so that it can also be
> + * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
> + * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
> + * too early.

Is this comment accurate? I cannot find the patch touching
nmi_enter/exit().

> + */
> +void noinstr sev_es_ist_enter(struct pt_regs *regs)
> +{
> +	unsigned long old_ist, new_ist;
> +	unsigned long *p;
> +
> +	if (!sev_es_active())
> +		return;
> +
> +	/* Read old IST entry */
> +	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> +
> +	/* Make room on the IST stack */
> +	if (on_vc_stack(regs->sp))
> +		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
> +	else
> +		new_ist = old_ist - sizeof(old_ist);
> +
> +	/* Store old IST entry */
> +	p       = (unsigned long *)new_ist;
> +	*p      = old_ist;
> +
> +	/* Set new IST entry */
> +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
> +}
> +
> +void noinstr sev_es_ist_exit(void)
> +{
> +	unsigned long ist;
> +	unsigned long *p;
> +
> +	if (!sev_es_active())
> +		return;
> +
> +	/* Read IST entry */
> +	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> +
> +	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
> +		return;
> +
> +	/* Read back old IST entry and write it to the TSS */
> +	p = (unsigned long *)ist;
> +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
> +}

That's pretty disgusting :-(
Joerg Roedel July 15, 2020, 10:26 a.m. UTC | #2
On Wed, Jul 15, 2020 at 11:47:02AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 14, 2020 at 02:08:47PM +0200, Joerg Roedel wrote:

> DECLARE_STATIC_KEY_FALSE(sev_es_enabled_key);
> 
> static __always_inline void sev_es_foo()
> {
> 	if (static_branch_unlikely(&sev_es_enabled_key))
> 		__sev_es_foo();
> }
> 
> So that normal people will only see an extra NOP?

Yes, that is a good idea, I will use a static key for these cases.

> > +static bool on_vc_stack(unsigned long sp)
> 
> noinstr or __always_inline

Will add __always_inline, thanks.
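
For reference, the annotated helper would then be (a sketch; same body as
the function above, with only the inlining annotation added):

static __always_inline bool on_vc_stack(unsigned long sp)
{
	return (sp >= __this_cpu_ist_bot_va(VC)) &&
	       (sp <  __this_cpu_ist_top_va(VC));
}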

> > +/*
> > + * This function handles the case when an NMI or an NMI-like exception
> > + * like #DB is raised in the #VC exception handler entry code. In this
> 
> I've yet to find you handle the NMI-like cases..

The comment is not 100% accurate anymore; I will update it. Initially
#DB was an NMI-like case, but I figured that with .text.noinstr and the
way the #VC entry code switches stacks, no special #DB handling is
necessary anymore.

> > + * case the IST entry for VC must be adjusted, so that any subsequent VC
> > + * exception will not overwrite the stack contents of the interrupted VC
> > + * handler.
> > + *
> > + * The IST entry is adjusted unconditionally so that it can also be
> > + * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
> > + * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
> > + * too early.
> 
> Is this comment accurate? I cannot find the patch touching
> nmi_enter/exit().

Right, will update that too. I had the sev-es NMI stack adjustment in
nmi_enter/exit first, but needed to move it out because the possible DR7
access needs the #VC stack already adjusted.

> > + */
> > +void noinstr sev_es_ist_enter(struct pt_regs *regs)
> > +{
> > +	unsigned long old_ist, new_ist;
> > +	unsigned long *p;
> > +
> > +	if (!sev_es_active())
> > +		return;
> > +
> > +	/* Read old IST entry */
> > +	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> > +
> > +	/* Make room on the IST stack */
> > +	if (on_vc_stack(regs->sp))
> > +		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
> > +	else
> > +		new_ist = old_ist - sizeof(old_ist);
> > +
> > +	/* Store old IST entry */
> > +	p       = (unsigned long *)new_ist;
> > +	*p      = old_ist;
> > +
> > +	/* Set new IST entry */
> > +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
> > +}
> > +
> > +void noinstr sev_es_ist_exit(void)
> > +{
> > +	unsigned long ist;
> > +	unsigned long *p;
> > +
> > +	if (!sev_es_active())
> > +		return;
> > +
> > +	/* Read IST entry */
> > +	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
> > +
> > +	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
> > +		return;
> > +
> > +	/* Read back old IST entry and write it to the TSS */
> > +	p = (unsigned long *)ist;
> > +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
> > +}
> 
> That's pretty disgusting :-(

Yeah, but it's needed because ... IST :(
I am open to suggestions on how to make it less disgusting. Or maybe
you like it more if you think of it as a software implementation of what
hardware should actually do to make IST less painful.
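
To illustrate, here is a minimal user-space model of the push/pop scheme
(a sketch only: plain C, a byte array standing in for the per-CPU #VC IST
stack, and simplified names that are not kernel code):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define VC_STACK_SIZE	4096

static unsigned char vc_stack[VC_STACK_SIZE];
static unsigned long vc_ist;	/* stands in for cpu_tss_rw.x86_tss.ist[IST_INDEX_VC] */

static unsigned long vc_top(void) { return (unsigned long)vc_stack + VC_STACK_SIZE; }
static unsigned long vc_bot(void) { return (unsigned long)vc_stack; }

static int on_vc_stack(unsigned long sp)
{
	return sp >= vc_bot() && sp < vc_top();
}

/* Models sev_es_ist_enter(): save the current IST value just below the
   interrupted frame (or below the current IST value) and retarget the IST */
static void ist_enter(unsigned long interrupted_sp)
{
	unsigned long old_ist = vc_ist, new_ist;

	if (on_vc_stack(interrupted_sp))
		new_ist = (interrupted_sp & ~7UL) - sizeof(old_ist);
	else
		new_ist = old_ist - sizeof(old_ist);

	memcpy((void *)new_ist, &old_ist, sizeof(old_ist));
	vc_ist = new_ist;
}

/* Models sev_es_ist_exit(): pop the saved value back into the IST slot */
static void ist_exit(void)
{
	unsigned long old_ist;

	memcpy(&old_ist, (void *)vc_ist, sizeof(old_ist));
	vc_ist = old_ist;
}

int main(void)
{
	vc_ist = vc_top();

	/* NMI interrupts a #VC handler running near the top of the VC stack */
	unsigned long vc_sp = vc_top() - 128;

	ist_enter(vc_sp);		/* NMI entry */
	assert(vc_ist < vc_sp);		/* a nested #VC now lands below the live frame */
	ist_exit();			/* NMI exit */
	assert(vc_ist == vc_top());	/* original IST entry restored */

	puts("IST adjust/restore round-trip OK");
	return 0;
}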

Regards,

	Joerg
Peter Zijlstra July 15, 2020, 10:56 a.m. UTC | #3
On Wed, Jul 15, 2020 at 12:26:53PM +0200, Joerg Roedel wrote:
> On Wed, Jul 15, 2020 at 11:47:02AM +0200, Peter Zijlstra wrote:

> > That's pretty disgusting :-(
> 
> Yeah, but it's needed because ... IST :(
> I am open to suggestions on how to make it less disgusting. Or maybe
> you like it more if you think of it as a software implementation of what
> hardware should actually do to make IST less painful.

No bright ideas, sadly..

Patch

diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
index 824e9e6b067c..330140a189be 100644
--- a/arch/x86/include/asm/sev-es.h
+++ b/arch/x86/include/asm/sev-es.h
@@ -77,4 +77,12 @@  static inline u64 lower_bits(u64 val, unsigned int bits)
 extern void vc_no_ghcb(void);
 extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern void sev_es_ist_enter(struct pt_regs *regs);
+extern void sev_es_ist_exit(void);
+#else
+static inline void sev_es_ist_enter(struct pt_regs *regs) { }
+static inline void sev_es_ist_exit(void) { }
+#endif
+
 #endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d7c5e44b26f7..d94a5bb0bebc 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,6 +33,7 @@ 
 #include <asm/reboot.h>
 #include <asm/cache.h>
 #include <asm/nospec-branch.h>
+#include <asm/sev-es.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -489,6 +490,9 @@  DEFINE_IDTENTRY_RAW(exc_nmi)
 	this_cpu_write(nmi_cr2, read_cr2());
 nmi_restart:
 
+	/* Needs to happen before DR7 is accessed */
+	sev_es_ist_enter(regs);
+
 	this_cpu_write(nmi_dr7, local_db_save());
 
 	nmi_enter();
@@ -502,6 +506,8 @@  DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	local_db_restore(this_cpu_read(nmi_dr7));
 
+	sev_es_ist_exit();
+
 	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
 		write_cr2(this_cpu_read(nmi_cr2));
 	if (this_cpu_dec_return(nmi_state))
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index d415368f16ec..2a7cc72db1d5 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -78,6 +78,67 @@  static void __init sev_es_setup_vc_stacks(int cpu)
 	tss->x86_tss.ist[IST_INDEX_VC] = CEA_ESTACK_TOP(&cea->estacks, VC);
 }
 
+static bool on_vc_stack(unsigned long sp)
+{
+	return ((sp >= __this_cpu_ist_bot_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI or an NMI-like exception
+ * like #DB is raised in the #VC exception handler entry code. In this
+ * case the IST entry for VC must be adjusted, so that any subsequent VC
+ * exception will not overwrite the stack contents of the interrupted VC
+ * handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can also be
+ * unconditionally back-adjusted in sev_es_nmi_exit(). Otherwise a
+ * nested nmi_exit() call (#VC->NMI->#DB) may back-adjust the IST entry
+ * too early.
+ */
+void noinstr sev_es_ist_enter(struct pt_regs *regs)
+{
+	unsigned long old_ist, new_ist;
+	unsigned long *p;
+
+	if (!sev_es_active())
+		return;
+
+	/* Read old IST entry */
+	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+	/* Make room on the IST stack */
+	if (on_vc_stack(regs->sp))
+		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
+	else
+		new_ist = old_ist - sizeof(old_ist);
+
+	/* Store old IST entry */
+	p       = (unsigned long *)new_ist;
+	*p      = old_ist;
+
+	/* Set new IST entry */
+	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
+
+void noinstr sev_es_ist_exit(void)
+{
+	unsigned long ist;
+	unsigned long *p;
+
+	if (!sev_es_active())
+		return;
+
+	/* Read IST entry */
+	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+		return;
+
+	/* Read back old IST entry and write it to the TSS */
+	p = (unsigned long *)ist;
+	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *p);
+}
+
 /* Needed in vc_early_forward_exception */
 void do_early_exception(struct pt_regs *regs, int trapnr);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 907ac2b378a8..59d17e541df9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,6 +59,7 @@ 
 #include <asm/umip.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
+#include <asm/sev-es.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -733,6 +734,7 @@  static bool is_sysenter_singlestep(struct pt_regs *regs)
 
 static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
 {
+
 	/*
 	 * Disable breakpoints during exception handling; recursive exceptions
 	 * are exceedingly 'fun'.