diff mbox series

[RESEND,5/6] KVM: x86/VMX: add kvm_vmx_reinject_nmi_irq() for NMI/IRQ reinjection

Message ID 20221110061545.1531-6-xin3.li@intel.com (mailing list archive)
State New, archived
Headers show
Series x86/traps,VMX: implement software based NMI/IRQ dispatch for VMX NMI/IRQ reinjection | expand

Commit Message

Li, Xin3 Nov. 10, 2022, 6:15 a.m. UTC
To eliminate dispatching NMI/IRQ through the IDT, add
kvm_vmx_reinject_nmi_irq(), which calls external_interrupt()
for IRQ reinjection.

Lastly replace calling a NMI/IRQ handler in an IDT descriptor
with calling kvm_vmx_reinject_nmi_irq().

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
 arch/x86/include/asm/traps.h |  2 ++
 arch/x86/kernel/traps.c      | 23 +++++++++++++++++++++++
 arch/x86/kvm/vmx/vmenter.S   | 33 ---------------------------------
 arch/x86/kvm/vmx/vmx.c       | 19 +++++++------------
 4 files changed, 32 insertions(+), 45 deletions(-)

Comments

Peter Zijlstra Nov. 10, 2022, 9:03 a.m. UTC | #1
On Wed, Nov 09, 2022 at 10:15:44PM -0800, Xin Li wrote:
> To eliminate dispatching NMI/IRQ through the IDT, add
> kvm_vmx_reinject_nmi_irq(), which calls external_interrupt()
> for IRQ reinjection.
> 
> Lastly replace calling a NMI/IRQ handler in an IDT descriptor
> with calling kvm_vmx_reinject_nmi_irq().
> 
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> Signed-off-by: Xin Li <xin3.li@intel.com>

Idem.


> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +/*
> + * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
> + * call thus the values in the pt_regs structure are not used in
> + * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
> + * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
> + * we simply allocate a zeroed pt_regs structure on current stack
> + * to call external_interrupt().
> + */
> +void kvm_vmx_reinject_nmi_irq(u32 vector)

noinstr ?

> +{
> +	struct pt_regs irq_regs;
> +
> +	memset(&irq_regs, 0, sizeof(irq_regs));
> +
> +	if (vector == NMI_VECTOR)
> +		return exc_nmi(&irq_regs);
> +
> +	external_interrupt(&irq_regs, vector);
> +}
> +EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
> +#endif

> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 63247c57c72c..b457e4888468 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -46,6 +46,7 @@
>  #include <asm/mshyperv.h>
>  #include <asm/mwait.h>
>  #include <asm/spec-ctrl.h>
> +#include <asm/traps.h>
>  #include <asm/virtext.h>
>  #include <asm/vmx.h>
>  
> @@ -6758,15 +6759,11 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
>  	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
>  }
>  
> -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> -
> -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> -					unsigned long entry)
> +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 vector)
>  {
> -	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> -
> -	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
> -	vmx_do_interrupt_nmi_irqoff(entry);
> +	kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
> +				   KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
> +	kvm_vmx_reinject_nmi_irq(vector);
>  	kvm_after_interrupt(vcpu);
>  }
>  
> @@ -6792,7 +6789,6 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
>  
>  static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
>  {
> -	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
>  	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
>  
>  	/* if exit due to PF check for async PF */
> @@ -6806,20 +6802,19 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
>  		kvm_machine_check();
>  	/* We need to handle NMIs before interrupts are enabled */
>  	else if (is_nmi(intr_info))
> -		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> +		handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
>  }
>  
>  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
>  {
>  	u32 intr_info = vmx_get_intr_info(vcpu);
>  	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> -	gate_desc *desc = (gate_desc *)host_idt_base + vector;
>  
>  	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
>  	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
>  		return;
>  
> -	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> +	handle_interrupt_nmi_irqoff(vcpu, vector);
>  	vcpu->arch.at_instruction_boundary = true;
>  }

How does any of this work? You're calling into entry/noinstr code from a
random context.
Li, Xin3 Nov. 10, 2022, 8:53 p.m. UTC | #2
> > To eliminate dispatching NMI/IRQ through the IDT, add
> > kvm_vmx_reinject_nmi_irq(), which calls external_interrupt() for IRQ
> > reinjection.
> >
> > Lastly replace calling a NMI/IRQ handler in an IDT descriptor with
> > calling kvm_vmx_reinject_nmi_irq().
> >
> > Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> > Signed-off-by: Xin Li <xin3.li@intel.com>
> 
> Idem.
> 
> 
> > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > +/*
> > + * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
> > + * call thus the values in the pt_regs structure are not used in
> > + * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
> > + * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
> > + * we simply allocate a zeroed pt_regs structure on current stack
> > + * to call external_interrupt().
> > + */
> > +void kvm_vmx_reinject_nmi_irq(u32 vector)
> 
> noinstr ?

Seems it's not needed to me.

> 
> > +{
> > +	struct pt_regs irq_regs;
> > +
> > +	memset(&irq_regs, 0, sizeof(irq_regs));
> > +
> > +	if (vector == NMI_VECTOR)
> > +		return exc_nmi(&irq_regs);
> > +
> > +	external_interrupt(&irq_regs, vector); }
> > +EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
> > +#endif
> 
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > 63247c57c72c..b457e4888468 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -46,6 +46,7 @@
> >  #include <asm/mshyperv.h>
> >  #include <asm/mwait.h>
> >  #include <asm/spec-ctrl.h>
> > +#include <asm/traps.h>
> >  #include <asm/virtext.h>
> >  #include <asm/vmx.h>
> >
> > @@ -6758,15 +6759,11 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> >  	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));  }
> >
> > -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> > -
> > -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> > -					unsigned long entry)
> > +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32
> > +vector)
> >  {
> > -	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> > -
> > -	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> > -	vmx_do_interrupt_nmi_irqoff(entry);
> > +	kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
> > +				   KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> > +	kvm_vmx_reinject_nmi_irq(vector);
> >  	kvm_after_interrupt(vcpu);
> >  }
> >
> > @@ -6792,7 +6789,6 @@ static void handle_nm_fault_irqoff(struct
> > kvm_vcpu *vcpu)
> >
> >  static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)  {
> > -	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
> >  	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
> >
> >  	/* if exit due to PF check for async PF */ @@ -6806,20 +6802,19 @@
> > static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> >  		kvm_machine_check();
> >  	/* We need to handle NMIs before interrupts are enabled */
> >  	else if (is_nmi(intr_info))
> > -		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> > +		handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
> >  }
> >
> >  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> > {
> >  	u32 intr_info = vmx_get_intr_info(vcpu);
> >  	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > -	gate_desc *desc = (gate_desc *)host_idt_base + vector;
> >
> >  	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> >  	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> >  		return;
> >
> > -	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> > +	handle_interrupt_nmi_irqoff(vcpu, vector);
> >  	vcpu->arch.at_instruction_boundary = true;  }
> 
> How does any of this work? You're calling into entry/noinstr code from a
> random context.

Can you please elaborate your concern a bit more?

We are here in handle_external_interrupt_irqoff () because an external
interrupt happened when a guest was running and the CPU vm-exits to host
to dispatch to the IRQ handler with IRQ disabled.
Peter Zijlstra Nov. 11, 2022, 9:15 a.m. UTC | #3
On Thu, Nov 10, 2022 at 08:53:30PM +0000, Li, Xin3 wrote:

> > >  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> > > {
> > >  	u32 intr_info = vmx_get_intr_info(vcpu);
> > >  	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > > -	gate_desc *desc = (gate_desc *)host_idt_base + vector;
> > >
> > >  	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> > >  	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> > >  		return;
> > >
> > > -	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> > > +	handle_interrupt_nmi_irqoff(vcpu, vector);
> > >  	vcpu->arch.at_instruction_boundary = true;  }
> > 
> > How does any of this work? You're calling into entry/noinstr code from a
> > random context.
> 
> Can you please elaborate your concern a bit more?
> 
> We are here in handle_external_interrupt_irqoff () because an external
> interrupt happened when a guest was running and the CPU vm-exits to host
> to dispatch to the IRQ handler with IRQ disabled.

I don't speak virt (but this all sounds disguisting) -- but what appears
to be the case is you calling into entry code from regular kernel
context, which is odd at best.

Specifically, going by the fact that all this is not noinstr code, the
assumption is that RCU/lockdep/etc.. is all set-up and running. This
means you should not be calling DEFINE_IDTENTRY_*(func) functions
because those will try and set all that up again.

Granted, irqentry_{enter,exit}() do nest, but *yuck*.
Paolo Bonzini Nov. 11, 2022, 12:04 p.m. UTC | #4
On 11/11/22 10:15, Peter Zijlstra wrote:
> I don't speak virt (but this all sounds disguisting)

Yes, it is.  AMD does not need this, they just hold onto the interrupt
until the host has issued both STGI (for NMIs) and STI (for IRQs).

On Intel you can optionally make it hold onto IRQs, but NMIs are always
eaten by the VMEXIT and have to be reinjected manually.

> -- but what appears to be the case is you calling into entry code
> from regular kernel context, which is odd at best.
> 
> Specifically, going by the fact that all this is not noinstr code,
> the assumption is that RCU/lockdep/etc.. is all set-up and running.

Indeed it is.  This is called long after the noinstr area has been left:

vcpu_enter_guest
   ...
   static_call(kvm_x86_vcpu_run) -> vmx_vcpu_run
     vmx_vcpu_enter_exit (noinstr)
       guest_state_enter_irqoff
       __vmx_vcpu_run
       guest_state_exit_irqoff
   ...
   static_call(kvm_x86_handle_exit_irqoff) -> vmx_handle_exit_irqoff
     handle_external_interrupt_irqoff
   ...

Paolo

> This means you should not be calling DEFINE_IDTENTRY_*(func)
> functions because those will try and set all that up again.
> 
> Granted, irqentry_{enter,exit}() do nest, but*yuck*.
Peter Zijlstra Nov. 11, 2022, 12:19 p.m. UTC | #5
On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> On 11/11/22 10:15, Peter Zijlstra wrote:
> > I don't speak virt (but this all sounds disguisting)
> 
> Yes, it is.  AMD does not need this, they just hold onto the interrupt
> until the host has issued both STGI (for NMIs) and STI (for IRQs).

Right -- Cooper just explained that to me.

> On Intel you can optionally make it hold onto IRQs, but NMIs are always
> eaten by the VMEXIT and have to be reinjected manually.

That 'optionally' thing worries me -- as in, KVM is currently
opting-out?

Since much of this is about preparing for FRED, can we pretty *PLEASE*
fix this VMX NMI hole instead so that all this nonsense isn't needed
anymore, please, really please?

Have FRED imply a GI flag for VMX or something and then we can all
forget about these reinjection horrors.
Paolo Bonzini Nov. 11, 2022, 12:48 p.m. UTC | #6
On 11/11/22 13:19, Peter Zijlstra wrote:
> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>> eaten by the VMEXIT and have to be reinjected manually.
> 
> That 'optionally' thing worries me -- as in, KVM is currently
> opting-out?

Yes, because "If the “process posted interrupts” VM-execution control is 
1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 
26.2.1.1, checks on VM-Execution Control Fields).  Ipse dixit.  Posted 
interrupts are available and used on all processors since I think Ivy 
Bridge.

(sorry about splitting the replies across two threads)

Paolo
Peter Zijlstra Nov. 11, 2022, 2:23 p.m. UTC | #7
On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> On 11/11/22 13:19, Peter Zijlstra wrote:
> > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > On Intel you can optionally make it hold onto IRQs, but NMIs are always
> > > eaten by the VMEXIT and have to be reinjected manually.
> > 
> > That 'optionally' thing worries me -- as in, KVM is currently
> > opting-out?
> 
> Yes, because "If the “process posted interrupts” VM-execution control is 1,
> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
> checks on VM-Execution Control Fields).  Ipse dixit.  Posted interrupts are
> available and used on all processors since I think Ivy Bridge.

(imagine the non-coc compliant reaction here)

So instead of fixing it, they made it worse :-(

And now FRED is arguably making it worse again, and people wonder why I
hate virt...
Andrew Cooper Nov. 11, 2022, 4:35 p.m. UTC | #8
On 11/11/2022 14:23, Peter Zijlstra wrote:
> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>> On 11/11/22 13:19, Peter Zijlstra wrote:
>>> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>>>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>>>> eaten by the VMEXIT and have to be reinjected manually.
>>> That 'optionally' thing worries me -- as in, KVM is currently
>>> opting-out?
>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>> checks on VM-Execution Control Fields).  Ipse dixit.  Posted interrupts are
>> available and used on all processors since I think Ivy Bridge.

On server SKUs.  Client only got "virtual interrupt processing" fairly
recently IIRC, which is the CPU-side property which matters.

> (imagine the non-coc compliant reaction here)
>
> So instead of fixing it, they made it worse :-(
>
> And now FRED is arguably making it worse again, and people wonder why I
> hate virt...

The only FRED-compatible fix is to send a self-NMI, because because you
may need a CSL change too.

VT-x *does* hold the NMI latch (for VMEXIT_REASON NMI), so it's self-NMI
and then enable_nmi()s.

Except the IRET to self won't work - it will need to be ERETS-to-self. 
Which I think is fine.

But what isn't fine is the fact that a self-NMI doesn't deliver
synchronously, so you need to wait until it is pending, before enabling
NMIs.  (Well, actually you need to ensure that it's definitely delivered
before re-entering the VM).

And I'm totally out of ideas here...

~Andrew
Li, Xin3 Nov. 11, 2022, 6:06 p.m. UTC | #9
> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> > On 11/11/22 13:19, Peter Zijlstra wrote:
> > > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > > On Intel you can optionally make it hold onto IRQs, but NMIs are
> > > > always eaten by the VMEXIT and have to be reinjected manually.
> > >
> > > That 'optionally' thing worries me -- as in, KVM is currently
> > > opting-out?
> >
> > Yes, because "If the “process posted interrupts” VM-execution control
> > is 1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM
> > 26.2.1.1, checks on VM-Execution Control Fields).  Ipse dixit.  Posted
> > interrupts are available and used on all processors since I think Ivy Bridge.
> 
> (imagine the non-coc compliant reaction here)
> 
> So instead of fixing it, they made it worse :-(
> 
> And now FRED is arguably making it worse again, and people wonder why I
> hate virt...

Maybe I take it wrong, but FRED doesn't make anything worse. Fred entry
code will call external_interrupt() immediately for IRQs.

You really really don't like the context how VMX dispatches NMI/IRQs (which has
been there for a long time), right?

As you said, what if we do not call DEFINE_IDTENTRY_*(func) but the internal
IRQ handlers __func?  That way we take out the setup for RCU/lockdep/etc.
Peter Zijlstra Nov. 11, 2022, 7:33 p.m. UTC | #10
On Fri, Nov 11, 2022 at 06:06:12PM +0000, Li, Xin3 wrote:
> > On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
> > > On 11/11/22 13:19, Peter Zijlstra wrote:
> > > > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
> > > > > On Intel you can optionally make it hold onto IRQs, but NMIs are
> > > > > always eaten by the VMEXIT and have to be reinjected manually.
> > > >
> > > > That 'optionally' thing worries me -- as in, KVM is currently
> > > > opting-out?
> > >
> > > Yes, because "If the “process posted interrupts” VM-execution control
> > > is 1, the “acknowledge interrupt on exit” VM-exit control is 1" (SDM
> > > 26.2.1.1, checks on VM-Execution Control Fields).  Ipse dixit.  Posted
> > > interrupts are available and used on all processors since I think Ivy Bridge.
> > 
> > (imagine the non-coc compliant reaction here)
> > 
> > So instead of fixing it, they made it worse :-(
> > 
> > And now FRED is arguably making it worse again, and people wonder why I
> > hate virt...
> 
> Maybe I take it wrong, but FRED doesn't make anything worse. Fred entry
> code will call external_interrupt() immediately for IRQs.

But what about NMIs, afaict this is all horribly broken for NMIs.

So the whole VMX thing latches the NMI (which stops NMI recursion),
right?

But then you drop out of noinstr code, which means any random exception
can happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
nonsense like *SAN). This exception will do IRET and clear the NMI
latch, all before you get to run any of the NMI code.

Note how the normal NMI code is very careful to clear DR7 and both
kprobes and hw_breakpoint know not to accept noinstr code as targets.

You threw all that out the window.

Also, NMI is IST, and with FRED it will run on a different stack as
well, directly calling external_interrupt() doesn't honour that either.

> You really really don't like the context how VMX dispatches NMI/IRQs (which has
> been there for a long time), right?

I really really hate this with a passion. The fact that it's been this
way is no justification for keeping it. Crap is crap.

Intel should have taken an example of SVM in this regard, and not
doubled down and extended this NMI hole to regular IRQs. These are
exactly the kind of exception delivery trainwrecks FRED is supposed to
fix, except in this case it appears it doesn't :/
H. Peter Anvin Nov. 11, 2022, 10:15 p.m. UTC | #11
On November 11, 2022 6:23:13 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>> On 11/11/22 13:19, Peter Zijlstra wrote:
>> > On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>> > > On Intel you can optionally make it hold onto IRQs, but NMIs are always
>> > > eaten by the VMEXIT and have to be reinjected manually.
>> > 
>> > That 'optionally' thing worries me -- as in, KVM is currently
>> > opting-out?
>> 
>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>> checks on VM-Execution Control Fields).  Ipse dixit.  Posted interrupts are
>> available and used on all processors since I think Ivy Bridge.
>
>(imagine the non-coc compliant reaction here)
>
>So instead of fixing it, they made it worse :-(
>
>And now FRED is arguably making it worse again, and people wonder why I
>hate virt...

I object to saying that FRED is making it worse. Xin's patchset gets rid of low-level assembly magic across the board and at least makes it obvious what the code actually *does*, right now, as opposed to being buried in highly questionable assembly.

It would also, regardless, be good to narrow down the set of possible events that may have to be reinjected to the absolute minimum, *and* document that in the code.

That being said, if there are better ways of doing it, we should, and you are certainly right that we may not have properly dug into if this code is even exercised on platforms which will have FRED.
H. Peter Anvin Nov. 11, 2022, 10:22 p.m. UTC | #12
On November 11, 2022 8:35:30 AM PST, Andrew Cooper <Andrew.Cooper3@citrix.com> wrote:
>On 11/11/2022 14:23, Peter Zijlstra wrote:
>> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>>> On 11/11/22 13:19, Peter Zijlstra wrote:
>>>> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>>>>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>>>>> eaten by the VMEXIT and have to be reinjected manually.
>>>> That 'optionally' thing worries me -- as in, KVM is currently
>>>> opting-out?
>>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>>> checks on VM-Execution Control Fields).  Ipse dixit.  Posted interrupts are
>>> available and used on all processors since I think Ivy Bridge.
>
>On server SKUs.  Client only got "virtual interrupt processing" fairly
>recently IIRC, which is the CPU-side property which matters.
>
>> (imagine the non-coc compliant reaction here)
>>
>> So instead of fixing it, they made it worse :-(
>>
>> And now FRED is arguably making it worse again, and people wonder why I
>> hate virt...
>
>The only FRED-compatible fix is to send a self-NMI, because because you
>may need a CSL change too.
>
>VT-x *does* hold the NMI latch (for VMEXIT_REASON NMI), so it's self-NMI
>and then enable_nmi()s.
>
>Except the IRET to self won't work - it will need to be ERETS-to-self. 
>Which I think is fine.
>
>But what isn't fine is the fact that a self-NMI doesn't deliver
>synchronously, so you need to wait until it is pending, before enabling
>NMIs.  (Well, actually you need to ensure that it's definitely delivered
>before re-entering the VM).
>
>And I'm totally out of ideas here...
>
>~Andrew
>

There is no fundamental reason to do a CSL/IST change if you happen to know a priori that the stack is in a valid state to have the NMI frame on it; that is:

1. Not deep into a nested I/O layer;
2. Valid, and not in flux in any way.

Since this reinject will always be in a well-defined location, that's fine.

So I think *that* concern is not actually an issue.

Again, note that this is not a FRED-specific problem.
Andrew Cooper Nov. 12, 2022, 12:08 a.m. UTC | #13
On 11/11/2022 22:22, H. Peter Anvin wrote:
> On November 11, 2022 8:35:30 AM PST, Andrew Cooper <Andrew.Cooper3@citrix.com> wrote:
>> On 11/11/2022 14:23, Peter Zijlstra wrote:
>>> On Fri, Nov 11, 2022 at 01:48:26PM +0100, Paolo Bonzini wrote:
>>>> On 11/11/22 13:19, Peter Zijlstra wrote:
>>>>> On Fri, Nov 11, 2022 at 01:04:27PM +0100, Paolo Bonzini wrote:
>>>>>> On Intel you can optionally make it hold onto IRQs, but NMIs are always
>>>>>> eaten by the VMEXIT and have to be reinjected manually.
>>>>> That 'optionally' thing worries me -- as in, KVM is currently
>>>>> opting-out?
>>>> Yes, because "If the “process posted interrupts” VM-execution control is 1,
>>>> the “acknowledge interrupt on exit” VM-exit control is 1" (SDM 26.2.1.1,
>>>> checks on VM-Execution Control Fields).  Ipse dixit.  Posted interrupts are
>>>> available and used on all processors since I think Ivy Bridge.
>> On server SKUs.  Client only got "virtual interrupt processing" fairly
>> recently IIRC, which is the CPU-side property which matters.
>>
>>> (imagine the non-coc compliant reaction here)
>>>
>>> So instead of fixing it, they made it worse :-(
>>>
>>> And now FRED is arguably making it worse again, and people wonder why I
>>> hate virt...
>> The only FRED-compatible fix is to send a self-NMI, because because you
>> may need a CSL change too.
>>
>> VT-x *does* hold the NMI latch (for VMEXIT_REASON NMI), so it's self-NMI
>> and then enable_nmi()s.
>>
>> Except the IRET to self won't work - it will need to be ERETS-to-self. 
>> Which I think is fine.
>>
>> But what isn't fine is the fact that a self-NMI doesn't deliver
>> synchronously, so you need to wait until it is pending, before enabling
>> NMIs.  (Well, actually you need to ensure that it's definitely delivered
>> before re-entering the VM).
>>
>> And I'm totally out of ideas here...
>>
>> ~Andrew
>>
> There is no fundamental reason to do a CSL/IST change if you happen to know a priori that the stack is in a valid state to have the NMI frame on it; that is:
>
> 1. Not deep into a nested I/O layer;
> 2. Valid, and not in flux in any way.

3. The NMI handler doesn't depend on being run on the alternate stack.

> Since this reinject will always be in a well-defined location, that's fine.
>
> So I think *that* concern is not actually an issue.
>
> Again, note that this is not a FRED-specific problem.

Hmm yeah.  On further consideration, I don't think FRED is relevant here
(outside of a few minor details).

The VMExit behaviour is simply that of the NMI handler but without an
exception frame on the stack.  The early asm is walking on egg-shells
with respect to the NMI latch, just like the regular NMI handler is.


Peter is correct that once you leave the VMExit handler's noinstr
region, a plethora of things can re-enable NMIs behind your back.  And
this happening in practice will end up with you logically taking NMIs
out of order.

Whether this matters or not is a different question.  Right now, NMI is
"just" an edge triggered interrupt, but a theoretical future with NMI
vectors might have some fun causality bugs to contend with.


If the out-of-order NMIs isn't a major concern, then a self-NMI is the
simple way to invoke the NMI handler in a context it can cope with. 
Otherwise, the VMExit handler's instr region has to do the handoff when
it's in the same state that the NMI handler is expecting.

~Andrew
Li, Xin3 Nov. 12, 2022, 6:35 a.m. UTC | #14
> > > So instead of fixing it, they made it worse :-(
> > >
> > > And now FRED is arguably making it worse again, and people wonder
> > > why I hate virt...
> >
> > Maybe I take it wrong, but FRED doesn't make anything worse. Fred
> > entry code will call external_interrupt() immediately for IRQs.
> 
> But what about NMIs, afaict this is all horribly broken for NMIs.

NMIs are NOT handled by external_interrupt(), which is introduced in patch 4.

The NMI handling is added to kvm_vmx_reinject_nmi_irq() in patch 5 purely for VMX only.

> 
> So the whole VMX thing latches the NMI (which stops NMI recursion), right?
> 
> But then you drop out of noinstr code, which means any random exception can
> happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> before you get to run any of the NMI code.
> 
> Note how the normal NMI code is very careful to clear DR7 and both kprobes
> and hw_breakpoint know not to accept noinstr code as targets.
> 
> You threw all that out the window.
> 
> Also, NMI is IST, and with FRED it will run on a different stack as well, directly
> calling external_interrupt() doesn't honour that either.
> 
> > You really really don't like the context how VMX dispatches NMI/IRQs
> > (which has been there for a long time), right?
> 
> I really really hate this with a passion. The fact that it's been this way is no
> justification for keeping it. Crap is crap.
> 
> Intel should have taken an example of SVM in this regard, and not doubled
> down and extended this NMI hole to regular IRQs. These are exactly the kind of
> exception delivery trainwrecks FRED is supposed to fix, except in this case it
> appears it doesn't :/
Li, Xin3 Nov. 14, 2022, 4:39 a.m. UTC | #15
> > > > > > On Intel you can optionally make it hold onto IRQs, but NMIs
> > > > > > are always eaten by the VMEXIT and have to be reinjected manually.
> > > > >
> > > > > That 'optionally' thing worries me -- as in, KVM is currently
> > > > > opting-out?
> > > >
> > > > Yes, because "If the “process posted interrupts” VM-execution
> > > > control is 1, the “acknowledge interrupt on exit” VM-exit control
> > > > is 1" (SDM 26.2.1.1, checks on VM-Execution Control Fields).  Ipse
> > > > dixit.  Posted interrupts are available and used on all processors since I
> think Ivy Bridge.
> > >
> > > (imagine the non-coc compliant reaction here)
> > >
> > > So instead of fixing it, they made it worse :-(
> > >
> > > And now FRED is arguably making it worse again, and people wonder
> > > why I hate virt...
> >
> > Maybe I take it wrong, but FRED doesn't make anything worse. Fred
> > entry code will call external_interrupt() immediately for IRQs.
> 
> But what about NMIs, afaict this is all horribly broken for NMIs.
> 
> So the whole VMX thing latches the NMI (which stops NMI recursion), right?
> 
> But then you drop out of noinstr code, which means any random exception can
> happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> before you get to run any of the NMI code.

What you said here implies that we have this problem in the existing code.
Because a fake iret stack is created to call the NMI handler in the IDT NMI
descriptor, which lastly executes the IRET instruction.

> 
> Note how the normal NMI code is very careful to clear DR7 and both kprobes
> and hw_breakpoint know not to accept noinstr code as targets.
> 
> You threw all that out the window.
> 
> Also, NMI is IST, and with FRED it will run on a different stack as well, directly
> calling external_interrupt() doesn't honour that either.
> 
> > You really really don't like the context how VMX dispatches NMI/IRQs
> > (which has been there for a long time), right?
> 
> I really really hate this with a passion. The fact that it's been this way is no
> justification for keeping it. Crap is crap.
> 
> Intel should have taken an example of SVM in this regard, and not doubled
> down and extended this NMI hole to regular IRQs. These are exactly the kind of
> exception delivery trainwrecks FRED is supposed to fix, except in this case it
> appears it doesn't :/
Peter Zijlstra Nov. 14, 2022, 9:08 a.m. UTC | #16
On Mon, Nov 14, 2022 at 04:39:40AM +0000, Li, Xin3 wrote:

> > But what about NMIs, afaict this is all horribly broken for NMIs.
> > 
> > So the whole VMX thing latches the NMI (which stops NMI recursion), right?
> > 
> > But then you drop out of noinstr code, which means any random exception can
> > happen (kprobes #BP, hw_breakpoint #DB, or even #PF due to random
> > nonsense like *SAN). This exception will do IRET and clear the NMI latch, all
> > before you get to run any of the NMI code.
> 
> What you said here implies that we have this problem in the existing code.
> Because a fake iret stack is created to call the NMI handler in the IDT NMI
> descriptor, which lastly executes the IRET instruction.

I can't follow; of course the IDT handler terminates with IRET, it has
to no?

And yes, the current code appears to suffer the same defect.
Li, Xin3 Nov. 15, 2022, 7:50 a.m. UTC | #17
> > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > >
> > > So the whole VMX thing latches the NMI (which stops NMI recursion),
> right?
> > >
> > > But then you drop out of noinstr code, which means any random
> > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even #PF
> > > due to random nonsense like *SAN). This exception will do IRET and
> > > clear the NMI latch, all before you get to run any of the NMI code.
> >
> > What you said here implies that we have this problem in the existing code.
> > Because a fake iret stack is created to call the NMI handler in the
> > IDT NMI descriptor, which lastly executes the IRET instruction.
> 
> I can't follow; of course the IDT handler terminates with IRET, it has to no?

With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS field
to control whether to unblock NMI. If bit 28 of the field (above the selector)
is 1, ERETS/ERETU unblocks NMIs.

> 
> And yes, the current code appears to suffer the same defect.
Peter Zijlstra Nov. 15, 2022, 9:17 a.m. UTC | #18
On Tue, Nov 15, 2022 at 07:50:49AM +0000, Li, Xin3 wrote:
> > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > >
> > > > So the whole VMX thing latches the NMI (which stops NMI recursion),
> > right?
> > > >
> > > > But then you drop out of noinstr code, which means any random
> > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even #PF
> > > > due to random nonsense like *SAN). This exception will do IRET and
> > > > clear the NMI latch, all before you get to run any of the NMI code.
> > >
> > > What you said here implies that we have this problem in the existing code.
> > > Because a fake iret stack is created to call the NMI handler in the
> > > IDT NMI descriptor, which lastly executes the IRET instruction.
> > 
> > I can't follow; of course the IDT handler terminates with IRET, it has to no?
> 
> With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS field
> to control whether to unblock NMI. If bit 28 of the field (above the selector)
> is 1, ERETS/ERETU unblocks NMIs.

Yes, I know that. It is one of the many improvements FRED brings.
Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
hardware exception frame, but that's still up in the air I believe :/

Anyway.. given there is interrupt priority and NMI is pretty much on top
of everything else the reinject crap *should* run NMI first. That way
NMI runs with the latch disabled and whatever other pending interrupts
will run later.

But that all is still broken because afaict the current code also leaves
noinstr -- and once you leave noinstr (or use a static_key, static_call
or anything else that *can* change at runtime) you can't guarantee
nothing.
Li, Xin3 Nov. 17, 2022, 3:37 a.m. UTC | #19
> > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > >
> > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > recursion),
> > > right?
> > > > >
> > > > > But then you drop out of noinstr code, which means any random
> > > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even
> > > > > #PF due to random nonsense like *SAN). This exception will do
> > > > > IRET and clear the NMI latch, all before you get to run any of the NMI
> code.
> > > >
> > > > What you said here implies that we have this problem in the existing
> code.
> > > > Because a fake iret stack is created to call the NMI handler in
> > > > the IDT NMI descriptor, which lastly executes the IRET instruction.
> > >
> > > I can't follow; of course the IDT handler terminates with IRET, it has to no?
> >
> > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS
> > field to control whether to unblock NMI. If bit 28 of the field (above
> > the selector) is 1, ERETS/ERETU unblocks NMIs.
> 
> Yes, I know that. It is one of the many improvements FRED brings.
> Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
> hardware exception frame, but that's still up in the air I believe :/
> 
> Anyway.. given there is interrupt priority and NMI is pretty much on top of
> everything else the reinject crap *should* run NMI first. That way NMI runs
> with the latch disabled and whatever other pending interrupts will run later.
> 
> But that all is still broken because afaict the current code also leaves noinstr --
> and once you leave noinstr (or use a static_key, static_call or anything else that
> *can* change at runtime) you can't guarantee nothing.

For NMI, HPA asked me to use "int $2", as it switches to the NMI IST stack to
execute the NMI handler, essentially like how HW deals with a NMI in host. and
I tested it with NMI watchdog, it looks working fine.

For IRQs, we still use the dispatch table, but with a new func added in
DEFINE_IDTENTRY_SYSVEC with the noinstr entry/exit code removed:

#define DEFINE_IDTENTRY_SYSVEC(func)                                    \
static void __##func(struct pt_regs *regs);                             \
                                                                        \
__visible noinstr void func(struct pt_regs *regs)                       \
{                                                                       \
        irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        kvm_set_cpu_l1tf_flush_l1d();                                   \
        run_sysvec_on_irqstack_cond(__##func, regs);                    \
        instrumentation_end();                                          \
        irqentry_exit(regs, state);                                     \
}                                                                       \
                                                                        \
+void dispatch_table_##func(struct pt_regs *regs)                        \
+{                                                                       \
+        kvm_set_cpu_l1tf_flush_l1d();                                   \
+        run_sysvec_on_irqstack_cond(__##func, regs);                    \
+}
+                                                                       \
                                                                        \
static noinline void __##func(struct pt_regs *regs)

How do you think?
Sean Christopherson Nov. 17, 2022, 3:51 p.m. UTC | #20
On Thu, Nov 17, 2022, Li, Xin3 wrote:
> 
> > > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > > recursion), right?
> > > > > >
> > > > > > But then you drop out of noinstr code, which means any random
> > > > > > exception can happen (kprobes #BP, hw_breakpoint #DB, or even
> > > > > > #PF due to random nonsense like *SAN). This exception will do
> > > > > > IRET and clear the NMI latch, all before you get to run any of the
> > > > > > NMI code.
> > > > >
> > > > > What you said here implies that we have this problem in the existing code.
> > > > > Because a fake iret stack is created to call the NMI handler in
> > > > > the IDT NMI descriptor, which lastly executes the IRET instruction.
> > > >
> > > > I can't follow; of course the IDT handler terminates with IRET, it has to no?
> > > >
> > > > And yes, the current code appears to suffer the same defect.

That defect isn't going to be fixed simply by changing how KVM forwards NMIs
though.  IIUC, _everything_ between VM-Exit and the invocation of the NMI handler
needs to be noinstr.  On VM-Exit due to NMI, NMIs are blocked.  If a #BP/#DB/#PF
occurs before KVM gets to kvm_x86_handle_exit_irqoff(), the subsequent IRET will
unblock NMIs before the original NMI is serviced, i.e. a second NMI could come in
at anytime regardless of how KVM forwards the NMI to the kernel.

Is there any way to solve this without tagging everything noinstr?  There is a
metric shit ton of code between VM-Exit and the handling of NMIs, and much of that
code is common helpers.  It might be possible to hoist NMI handler much earlier,
though we'd need to do a super thorough audit to ensure all necessary host state
is restored.

> > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped CS
> > > field to control whether to unblock NMI. If bit 28 of the field (above
> > > the selector) is 1, ERETS/ERETU unblocks NMIs.

Side topic, there's a bug in the ISE docs.  Section "9.4.2 NMI Blocking" states
that bit 16 holds the "unblock NMI" magic, which I'm guessing is a holdover from
an earlier revision of FRED.

  As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU each unblocks NMIs
  if bit 16 of the popped CS field is 1. The following items detail how this behavior may be
  changed in VMX non-root operation, depending on the settings of certain VM-execution
  controls:

> > Yes, I know that. It is one of the many improvements FRED brings.
> > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in the
> > hardware exception frame, but that's still up in the air I believe :/
> > 
> > Anyway.. given there is interrupt priority and NMI is pretty much on top of
> > everything else the reinject crap *should* run NMI first. That way NMI runs
> > with the latch disabled and whatever other pending interrupts will run later.
> >
> > But that all is still broken because afaict the current code also leaves noinstr --
> > and once you leave noinstr (or use a static_key, static_call or anything else that
> > *can* change at runtime) you can't guarantee nothing.
> 
> For NMI, HPA asked me to use "int $2", as it switches to the NMI IST stack to
> execute the NMI handler, essentially like how HW deals with a NMI in host. and
> I tested it with NMI watchdog, it looks working fine.

Heh, well yeah, because that's how KVM used to handle NMIs back before I reworked
NMI handling to use the direct call method.  Ironically, that original change was
done in part to try and make it _easier_ to deal with FRED (back before FRED was
publicly disclosed).

If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST entry can
be reverted too.

  a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
  1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn")
Li, Xin3 Nov. 18, 2022, 12:05 a.m. UTC | #21
> > > > > > > But what about NMIs, afaict this is all horribly broken for NMIs.
> > > > > > > So the whole VMX thing latches the NMI (which stops NMI
> > > > > > > recursion), right?
> > > > > > >
> > > > > > > But then you drop out of noinstr code, which means any
> > > > > > > random exception can happen (kprobes #BP, hw_breakpoint #DB,
> > > > > > > or even #PF due to random nonsense like *SAN). This
> > > > > > > exception will do IRET and clear the NMI latch, all before
> > > > > > > you get to run any of the NMI code.
> > > > > >
> > > > > > What you said here implies that we have this problem in the existing
> code.
> > > > > > Because a fake iret stack is created to call the NMI handler
> > > > > > in the IDT NMI descriptor, which lastly executes the IRET instruction.
> > > > >
> > > > > I can't follow; of course the IDT handler terminates with IRET, it has to
> no?
> > > > >
> > > > > And yes, the current code appears to suffer the same defect.
> 
> That defect isn't going to be fixed simply by changing how KVM forwards NMIs
> though.  IIUC, _everything_ between VM-Exit and the invocation of the NMI
> handler needs to be noinstr.  On VM-Exit due to NMI, NMIs are blocked.  If a
> #BP/#DB/#PF occurs before KVM gets to kvm_x86_handle_exit_irqoff(), the
> subsequent IRET will unblock NMIs before the original NMI is serviced, i.e. a
> second NMI could come in at anytime regardless of how KVM forwards the NMI
> to the kernel.
> 
> Is there any way to solve this without tagging everything noinstr?  There is a
> metric shit ton of code between VM-Exit and the handling of NMIs, and much
> of that code is common helpers.  It might be possible to hoist NMI handler
> much earlier, though we'd need to do a super thorough audit to ensure all
> necessary host state is restored.

As NMI is the only vector with this potential issue, it sounds a good idea
to only promote its handling.


> > > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the popped
> > > > CS field to control whether to unblock NMI. If bit 28 of the field
> > > > (above the selector) is 1, ERETS/ERETU unblocks NMIs.
> 
> Side topic, there's a bug in the ISE docs.  Section "9.4.2 NMI Blocking" states
> that bit 16 holds the "unblock NMI" magic, which I'm guessing is a holdover
> from an earlier revision of FRED.

Good catch, the latest 3.0 draft spec changed it to bit 28, but section 9.4.2
didn't get a proper update.

> 
>   As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU each
> unblocks NMIs
>   if bit 16 of the popped CS field is 1. The following items detail how this
> behavior may be
>   changed in VMX non-root operation, depending on the settings of certain VM-
> execution
>   controls:
> 
> > > Yes, I know that. It is one of the many improvements FRED brings.
> > > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in
> > > the hardware exception frame, but that's still up in the air I
> > > believe :/
> > >
> > > Anyway.. given there is interrupt priority and NMI is pretty much on
> > > top of everything else the reinject crap *should* run NMI first.
> > > That way NMI runs with the latch disabled and whatever other pending
> interrupts will run later.
> > >
> > > But that all is still broken because afaict the current code also
> > > leaves noinstr -- and once you leave noinstr (or use a static_key,
> > > static_call or anything else that
> > > *can* change at runtime) you can't guarantee nothing.
> >
> > For NMI, HPA asked me to use "int $2", as it switches to the NMI IST
> > stack to execute the NMI handler, essentially like how HW deals with a
> > NMI in host. and I tested it with NMI watchdog, it looks working fine.
> 
> Heh, well yeah, because that's how KVM used to handle NMIs back before I
> reworked NMI handling to use the direct call method.  Ironically, that original
> change was done in part to try and make it _easier_ to deal with FRED (back
> before FRED was publicly disclosed).
> 
> If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST entry
> can be reverted too.
> 
>   a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
>   1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call instead of
> INTn")

Sure, I'm just thinking to put asm("int $2") in a new function exc_raise_nmi()
defined in arch/x86/kernel/traps.c. Thus we will need to change it only
whenever we have any better facility, and no KVM VMX change required.
Li, Xin3 Nov. 22, 2022, 1 p.m. UTC | #22
> > > > > > And yes, the current code appears to suffer the same defect.
> >
> > That defect isn't going to be fixed simply by changing how KVM
> > forwards NMIs though.  IIUC, _everything_ between VM-Exit and the
> > invocation of the NMI handler needs to be noinstr.  On VM-Exit due to
> > NMI, NMIs are blocked.  If a #BP/#DB/#PF occurs before KVM gets to
> > kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock NMIs
> > before the original NMI is serviced, i.e. a second NMI could come in
> > at anytime regardless of how KVM forwards the NMI to the kernel.
> >
> > Is there any way to solve this without tagging everything noinstr?
> > There is a metric shit ton of code between VM-Exit and the handling of
> > NMIs, and much of that code is common helpers.  It might be possible
> > to hoist NMI handler much earlier, though we'd need to do a super
> > thorough audit to ensure all necessary host state is restored.
> 
> As NMI is the only vector with this potential issue, it sounds a good idea to only
> promote its handling.
> 

Hi Peter/Sean,

I prefer to move _everything_ between VM-Exit and the invocation of the NMI
handler into the noinstr section in the next patch set, how do you think?

Xin

> 
> > > > > With FRED, ERETS/ERETU replace IRET, and use bit 28 of the
> > > > > popped CS field to control whether to unblock NMI. If bit 28 of
> > > > > the field (above the selector) is 1, ERETS/ERETU unblocks NMIs.
> >
> > Side topic, there's a bug in the ISE docs.  Section "9.4.2 NMI
> > Blocking" states that bit 16 holds the "unblock NMI" magic, which I'm
> > guessing is a holdover from an earlier revision of FRED.
> 
> Good catch, the latest 3.0 draft spec changed it to bit 28, but section 9.4.2
> didn't get a proper update.
> 
> >
> >   As specified in Section 6.1.3 and Section 6.2.3, ERETS and ERETU
> > each unblocks NMIs
> >   if bit 16 of the popped CS field is 1. The following items detail
> > how this behavior may be
> >   changed in VMX non-root operation, depending on the settings of
> > certain VM- execution
> >   controls:
> >
> > > > Yes, I know that. It is one of the many improvements FRED brings.
> > > > Ideally the IBT WAIT-FOR-ENDBR state also gets squirreled away in
> > > > the hardware exception frame, but that's still up in the air I
> > > > believe :/
> > > >
> > > > Anyway.. given there is interrupt priority and NMI is pretty much
> > > > on top of everything else the reinject crap *should* run NMI first.
> > > > That way NMI runs with the latch disabled and whatever other
> > > > pending
> > interrupts will run later.
> > > >
> > > > But that all is still broken because afaict the current code also
> > > > leaves noinstr -- and once you leave noinstr (or use a static_key,
> > > > static_call or anything else that
> > > > *can* change at runtime) you can't guarantee nothing.
> > >
> > > For NMI, HPA asked me to use "int $2", as it switches to the NMI IST
> > > stack to execute the NMI handler, essentially like how HW deals with
> > > a NMI in host. and I tested it with NMI watchdog, it looks working fine.
> >
> > Heh, well yeah, because that's how KVM used to handle NMIs back before
> > I reworked NMI handling to use the direct call method.  Ironically,
> > that original change was done in part to try and make it _easier_ to
> > deal with FRED (back before FRED was publicly disclosed).
> >
> > If KVM reverts to INTn, the fix to route KVM=>NMI through the non-IST
> > entry can be reverted too.
> >
> >   a217a6593cec ("KVM/VMX: Invoke NMI non-IST entry instead of IST entry")
> >   1a5488ef0dcf ("KVM: VMX: Invoke NMI handler via indirect call
> > instead of
> > INTn")
> 
> Sure, I'm just thinking to put asm("int $2") in a new function exc_raise_nmi()
> defined in arch/x86/kernel/traps.c. Thus we will need to change it only
> whenever we have any better facility, and no KVM VMX change required.
Sean Christopherson Nov. 22, 2022, 8:52 p.m. UTC | #23
On Tue, Nov 22, 2022, Li, Xin3 wrote:
> > > > > > > And yes, the current code appears to suffer the same defect.
> > >
> > > That defect isn't going to be fixed simply by changing how KVM
> > > forwards NMIs though.  IIUC, _everything_ between VM-Exit and the
> > > invocation of the NMI handler needs to be noinstr.  On VM-Exit due to
> > > NMI, NMIs are blocked.  If a #BP/#DB/#PF occurs before KVM gets to
> > > kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock NMIs
> > > before the original NMI is serviced, i.e. a second NMI could come in
> > > at anytime regardless of how KVM forwards the NMI to the kernel.
> > >
> > > Is there any way to solve this without tagging everything noinstr?
> > > There is a metric shit ton of code between VM-Exit and the handling of
> > > NMIs, and much of that code is common helpers.  It might be possible
> > > to hoist NMI handler much earlier, though we'd need to do a super
> > > thorough audit to ensure all necessary host state is restored.
> > 
> > As NMI is the only vector with this potential issue, it sounds a good idea to only
> > promote its handling.
> > 
> 
> Hi Peter/Sean,
> 
> I prefer to move _everything_ between VM-Exit and the invocation of the NMI
> handler into the noinstr section in the next patch set, how do you think?

That's likely going to be beyond painful and will have a _lot_ of collateral
damage in the sense that other paths will end up calling noinstr function just
because of VMX.  E.g. hw_breakpoint_restore(), fpu_sync_guest_vmexit_xfd_state(),
kvm_get_apic_mode(), multiple static calls in KVM... the list goes on and on and on.

The ongoing maintenance for that would also be quite painful.

Actually, SVM already enables NMIs far earlier, which means the probability of
breaking something by moving NMI handling earlier is lower.  Not zero, as I don't
trust that SVM gets all the corner right, but definitely low.

E.g. this should be doable

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cea8c07f5229..2fec93f38960 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (unlikely(vmx->exit_reason.failed_vmentry))
                return EXIT_FASTPATH_NONE;
 
+       <handle NMIs here>
+
        vmx->loaded_vmcs->launched = 1;
 
        vmx_recover_nmi_blocking(vmx);


thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
svm_vcpu_run() don't need to be noinstr.

Another wart that needs to be addressed is trace_kvm_exit().  IIRC, tracepoints
must be outside of noinstr, though maybe I'm misremembering that.  And conversely,
SVM doesn't trace exits that are handled in the fastpath.  Best option is probably
to move VMX's trace_kvm_exit() call to vmx_handle_exit(), and then figure out a
common way to trace exits that are handled in the fastpath (which can reside outside
of the noinstr section).
Li, Xin3 Nov. 23, 2022, 8:31 a.m. UTC | #24
> > > > > > > > And yes, the current code appears to suffer the same defect.
> > > >
> > > > That defect isn't going to be fixed simply by changing how KVM
> > > > forwards NMIs though.  IIUC, _everything_ between VM-Exit and the
> > > > invocation of the NMI handler needs to be noinstr.  On VM-Exit due
> > > > to NMI, NMIs are blocked.  If a #BP/#DB/#PF occurs before KVM gets
> > > > to kvm_x86_handle_exit_irqoff(), the subsequent IRET will unblock
> > > > NMIs before the original NMI is serviced, i.e. a second NMI could
> > > > come in at anytime regardless of how KVM forwards the NMI to the
> kernel.
> > > >
> > > > Is there any way to solve this without tagging everything noinstr?
> > > > There is a metric shit ton of code between VM-Exit and the
> > > > handling of NMIs, and much of that code is common helpers.  It
> > > > might be possible to hoist NMI handler much earlier, though we'd
> > > > need to do a super thorough audit to ensure all necessary host state is
> restored.
> > >
> > > As NMI is the only vector with this potential issue, it sounds a
> > > good idea to only promote its handling.
> > >
> >
> > Hi Peter/Sean,
> >
> > I prefer to move _everything_ between VM-Exit and the invocation of
> > the NMI handler into the noinstr section in the next patch set, how do you
> think?
> 
> That's likely going to be beyond painful and will have a _lot_ of collateral
> damage in the sense that other paths will end up calling noinstr function just
> because of VMX.  E.g. hw_breakpoint_restore(),
> fpu_sync_guest_vmexit_xfd_state(),
> kvm_get_apic_mode(), multiple static calls in KVM... the list goes on and on
> and on.
> 
> The ongoing maintenance for that would also be quite painful.

This is very complicated and I'm lost after reading the code around it.

Maybe some comments would help to explain the steps and order after VM exit.
(of course some people know it quite well)

> 
> Actually, SVM already enables NMIs far earlier, which means the probability of
> breaking something by moving NMI handling earlier is lower.  Not zero, as I
> don't trust that SVM gets all the corner right, but definitely low.
> 
> E.g. this should be doable

Do we need to recover *all* host states before switching to NMI stack and
handler?  if not what is the minimal set? If we restore the minimal set and
do "int $2" as early as possible, is it a quick and dirty approach?

> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> cea8c07f5229..2fec93f38960 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> *vcpu)
>         if (unlikely(vmx->exit_reason.failed_vmentry))
>                 return EXIT_FASTPATH_NONE;
> 
> +       <handle NMIs here>
> +
>         vmx->loaded_vmcs->launched = 1;
> 
>         vmx_recover_nmi_blocking(vmx);
> 
> 
> thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
> svm_vcpu_run() don't need to be noinstr.

This sounds reasonable to me, however from Documentation/core-api/entry.rst,
we do need it. Maybe we could argue guest is logically orthogonal to host,
and the CPU is borrowed to another OS... (which also kind of explains nested) 

> 
> Another wart that needs to be addressed is trace_kvm_exit().  IIRC, tracepoints
> must be outside of noinstr, though maybe I'm misremembering that.  And
> conversely, SVM doesn't trace exits that are handled in the fastpath.  Best
> option is probably to move VMX's trace_kvm_exit() call to vmx_handle_exit(),
> and then figure out a common way to trace exits that are handled in the
> fastpath (which can reside outside of the noinstr section).
Peter Zijlstra Nov. 23, 2022, 9:16 a.m. UTC | #25
On Tue, Nov 22, 2022 at 08:52:35PM +0000, Sean Christopherson wrote:

> Another wart that needs to be addressed is trace_kvm_exit().  IIRC, tracepoints
> must be outside of noinstr, though maybe I'm misremembering that.

You are not, that is correct. Another point to be careful with is usage
of jump_label and static_call, both can be used in noinstr *provided*
they don't actually ever change -- so boot time setup only.

If either of them were to change, text_poke_bp() has a clue in the name.
Sean Christopherson Nov. 23, 2022, 7:18 p.m. UTC | #26
On Wed, Nov 23, 2022, Peter Zijlstra wrote:
> On Tue, Nov 22, 2022 at 08:52:35PM +0000, Sean Christopherson wrote:
> 
> > Another wart that needs to be addressed is trace_kvm_exit().  IIRC, tracepoints
> > must be outside of noinstr, though maybe I'm misremembering that.
> 
> You are not, that is correct. Another point to be careful with is usage
> of jump_label and static_call, both can be used in noinstr *provided*
> they don't actually ever change -- so boot time setup only.
> 
> If either of them were to change, text_poke_bp() has a clue in the name.

I think we're mostly ok on that front.  kvm_wait_lapic_expire() consumes multiple
static keys that can change at will, but that can be kept outside of the noinstr
section.  

Thanks!
Sean Christopherson Nov. 23, 2022, 8:42 p.m. UTC | #27
On Wed, Nov 23, 2022, Li, Xin3 wrote:
> > Actually, SVM already enables NMIs far earlier, which means the probability of
> > breaking something by moving NMI handling earlier is lower.  Not zero, as I
> > don't trust that SVM gets all the corner right, but definitely low.

Duh.  Moving NMI handling much earlier in VMX's VM-Exit path _must_ be safe,
otherwise we already have problems.  On VMX, NMIs are enabled right up until
VM-Enter (VMLAUNCH/VMRESUME), and unless the VM-Exit is due to an NMI, NMIs are
enabled immediately after VM-Exit.

This case is problematic because the CPU can end up running the NMI handler with
NMIs enabled, not because it might run with incorrect state loaded.

> > E.g. this should be doable
> 
> Do we need to recover *all* host states before switching to NMI stack and
> handler?  

As above, no.

> if not what is the minimal set?

The absolute minimal set is what is context switched via the VMCS.

> If we restore the minimal set and do "int $2" as early as possible, is it a
> quick and dirty approach?

No, it is simply "the approach".  If there are other problems with respect to
noinstr, then they can/should be fixed separately.

> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > cea8c07f5229..2fec93f38960 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -7249,6 +7249,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> > *vcpu)
> >         if (unlikely(vmx->exit_reason.failed_vmentry))
> >                 return EXIT_FASTPATH_NONE;
> > 
> > +       <handle NMIs here>
> > +
> >         vmx->loaded_vmcs->launched = 1;
> > 
> >         vmx_recover_nmi_blocking(vmx);
> > 
> > 
> > thouh we'd like want a fair bit of refactoring so that all of vmx_vcpu_run() and
> > svm_vcpu_run() don't need to be noinstr.

For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.

> This sounds reasonable to me, however from Documentation/core-api/entry.rst,
> we do need it. 

Why do you say that?


I believe this is what we should end up with.  Compile tested only, and needs to
split up into 4+ patches.  I'll test and massage this into a proper series next
week (US holiday the rest of this week).

---
 arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
 arch/x86/kvm/vmx/vmenter.S    |  4 +--
 arch/x86/kvm/vmx/vmx.c        | 51 ++++++++++++++++++-----------------
 arch/x86/kvm/vmx/vmx.h        |  2 +-
 arch/x86/kvm/x86.h            |  6 ++---
 5 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index c09174f73a34..af9bd0374915 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
  * 1	  0	  register in vcpu->arch
  * 1	  1	  register in vcpu->arch, needs to be stored back
  */
-static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
-					     enum kvm_reg reg)
+static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+						      enum kvm_reg reg)
 {
 	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
 }
 
-static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
-					 enum kvm_reg reg)
+static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+						  enum kvm_reg reg)
 {
 	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
-					       enum kvm_reg reg)
+static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+							enum kvm_reg reg)
 {
 	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
 }
 
-static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
-					   enum kvm_reg reg)
+static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+						    enum kvm_reg reg)
 {
 	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
 	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..b104dfd282ed 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -318,7 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
 
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
 	/*
 	 * Unconditionally create a stack frame, getting the correct RSP on the
 	 * stack (for x86-64) would take two instructions anyways, and RBP can
@@ -349,4 +349,4 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
 	mov %_ASM_BP, %_ASM_SP
 	pop %_ASM_BP
 	RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cea8c07f5229..b721fde4f5af 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5064,8 +5064,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	vect_info = vmx->idt_vectoring_info;
 	intr_info = vmx_get_intr_info(vcpu);
 
+	/*
+	 * Machine checks are handled by handle_exception_irqoff(), or by
+	 * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
+	 * vmx_vcpu_enter_exit().
+	 */
 	if (is_machine_check(intr_info) || is_nmi(intr_info))
-		return 1; /* handled by handle_exception_nmi_irqoff() */
+		return 1;
 
 	/*
 	 * Queue the exception here instead of in handle_nm_fault_irqoff().
@@ -6755,18 +6760,6 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
-					unsigned long entry)
-{
-	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
-	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
-	vmx_do_interrupt_nmi_irqoff(entry);
-	kvm_after_interrupt(vcpu);
-}
-
 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -6787,9 +6780,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 		rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 }
 
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
 {
-	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
 	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
 	/* if exit due to PF check for async PF */
@@ -6801,11 +6793,10 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 	/* Handle machine checks before interrupts are enabled */
 	else if (is_machine_check(intr_info))
 		kvm_machine_check();
-	/* We need to handle NMIs before interrupts are enabled */
-	else if (is_nmi(intr_info))
-		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
 }
 
+void vmx_do_interrupt_irqoff(unsigned long entry);
+
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 intr_info = vmx_get_intr_info(vcpu);
@@ -6816,7 +6807,10 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+	vmx_do_interrupt_irqoff(gate_offset(desc));
+	kvm_after_interrupt(vcpu);
+
 	vcpu->arch.at_instruction_boundary = true;
 }
 
@@ -6830,7 +6824,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
 		handle_external_interrupt_irqoff(vcpu);
 	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
-		handle_exception_nmi_irqoff(vmx);
+		handle_exception_irqoff(vmx);
 }
 
 /*
@@ -7091,6 +7085,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 	vmx_enable_fb_clear(vmx);
 
+	if (unlikely(vmx->fail))
+		vmx->exit_reason.full = 0xdead;
+	else
+		vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+
+	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+	    is_nmi(vmx_get_intr_info(vcpu))) {
+		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+		asm("int $2");
+		kvm_after_interrupt(vcpu);
+	}
+
 	guest_state_exit_irqoff();
 }
 
@@ -7232,12 +7238,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = 0;
 
-	if (unlikely(vmx->fail)) {
-		vmx->exit_reason.full = 0xdead;
+	if (unlikely(vmx->fail))
 		return EXIT_FASTPATH_NONE;
-	}
 
-	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
 	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
 		kvm_machine_check();
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..eb52cfde5ec2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -680,7 +680,7 @@ static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
 	return vmx->exit_qualification;
 }
 
-static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..44d1827f0a30 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -382,13 +382,13 @@ enum kvm_intr_type {
 	KVM_HANDLING_NMI,
 };
 
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
-					enum kvm_intr_type intr)
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+						 enum kvm_intr_type intr)
 {
 	WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
 }
 
-static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
 {
 	WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
 }

base-commit: 0fa32dad1e78629cb42999dacd82489503fdf4c2
--
Li, Xin3 Nov. 24, 2022, 3:40 a.m. UTC | #28
> > > thouh we'd like want a fair bit of refactoring so that all of
> > > vmx_vcpu_run() and
> > > svm_vcpu_run() don't need to be noinstr.
> 
> For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
> 
> > This sounds reasonable to me, however from
> > Documentation/core-api/entry.rst, we do need it.
> 
> Why do you say that?
>

Copy/Paste from Documentation/core-api/entry.rst:

KVM
---

Entering or exiting guest mode is very similar to syscalls. From the host
kernel point of view the CPU goes off into user space when entering the
guest and returns to the kernel on exit.

kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
The state operations have the same ordering.

Task work handling is done separately for guest at the boundary of the
vcpu_run() loop via xfer_to_guest_mode_handle_work() which is a subset of
the work handled on return to user space.

Do not nest KVM entry/exit transitions because doing so is nonsensical.

> 
> I believe this is what we should end up with.  Compile tested only, and needs to
> split up into 4+ patches.  I'll test and massage this into a proper series next
> week (US holiday the rest of this week).

Great, thanks for doing it quickly.
Xin

> 
> ---
>  arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
>  arch/x86/kvm/vmx/vmenter.S    |  4 +--
>  arch/x86/kvm/vmx/vmx.c        | 51 ++++++++++++++++++-----------------
>  arch/x86/kvm/vmx/vmx.h        |  2 +-
>  arch/x86/kvm/x86.h            |  6 ++---
>  5 files changed, 41 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/kvm/kvm_cache_regs.h
> b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..af9bd0374915
> 100644
> --- a/arch/x86/kvm/kvm_cache_regs.h
> +++ b/arch/x86/kvm/kvm_cache_regs.h
> @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
>   * 1	  0	  register in vcpu->arch
>   * 1	  1	  register in vcpu->arch, needs to be stored back
>   */
> -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> -					     enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> +						      enum kvm_reg reg)
>  {
>  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);  }
> 
> -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> -					 enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> +						  enum kvm_reg reg)
>  {
>  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);  }
> 
> -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> -					       enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_available(struct kvm_vcpu
> *vcpu,
> +							enum kvm_reg reg)
>  {
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);  }
> 
> -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> -					   enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> +						    enum kvm_reg reg)
>  {
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); diff --git
> a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index
> 0b5db4de4d09..b104dfd282ed 100644
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -318,7 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
>  	RET
>  SYM_FUNC_END(vmread_error_trampoline)
> 
> -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_START(vmx_do_interrupt_irqoff)
>  	/*
>  	 * Unconditionally create a stack frame, getting the correct RSP on the
>  	 * stack (for x86-64) would take two instructions anyways, and RBP can
> @@ -349,4 +349,4 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
>  	mov %_ASM_BP, %_ASM_SP
>  	pop %_ASM_BP
>  	RET
> -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_END(vmx_do_interrupt_irqoff)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> cea8c07f5229..b721fde4f5af 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5064,8 +5064,13 @@ static int handle_exception_nmi(struct kvm_vcpu
> *vcpu)
>  	vect_info = vmx->idt_vectoring_info;
>  	intr_info = vmx_get_intr_info(vcpu);
> 
> +	/*
> +	 * Machine checks are handled by handle_exception_irqoff(), or by
> +	 * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
> +	 * vmx_vcpu_enter_exit().
> +	 */
>  	if (is_machine_check(intr_info) || is_nmi(intr_info))
> -		return 1; /* handled by handle_exception_nmi_irqoff() */
> +		return 1;
> 
>  	/*
>  	 * Queue the exception here instead of in handle_nm_fault_irqoff().
> @@ -6755,18 +6760,6 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
>  	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));  }
> 
> -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> -
> -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> -					unsigned long entry)
> -{
> -	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> -
> -	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> -	vmx_do_interrupt_nmi_irqoff(entry);
> -	kvm_after_interrupt(vcpu);
> -}
> -
>  static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)  {
>  	/*
> @@ -6787,9 +6780,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu
> *vcpu)
>  		rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);  }
> 
> -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> +static void handle_exception_irqoff(struct vcpu_vmx *vmx)
>  {
> -	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
>  	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
> 
>  	/* if exit due to PF check for async PF */ @@ -6801,11 +6793,10 @@
> static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
>  	/* Handle machine checks before interrupts are enabled */
>  	else if (is_machine_check(intr_info))
>  		kvm_machine_check();
> -	/* We need to handle NMIs before interrupts are enabled */
> -	else if (is_nmi(intr_info))
> -		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
>  }
> 
> +void vmx_do_interrupt_irqoff(unsigned long entry);
> +
>  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)  {
>  	u32 intr_info = vmx_get_intr_info(vcpu); @@ -6816,7 +6807,10 @@
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
>  	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
>  		return;
> 
> -	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> +	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> +	vmx_do_interrupt_irqoff(gate_offset(desc));
> +	kvm_after_interrupt(vcpu);
> +
>  	vcpu->arch.at_instruction_boundary = true;  }
> 
> @@ -6830,7 +6824,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu
> *vcpu)
>  	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
>  		handle_external_interrupt_irqoff(vcpu);
>  	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
> -		handle_exception_nmi_irqoff(vmx);
> +		handle_exception_irqoff(vmx);
>  }
> 
>  /*
> @@ -7091,6 +7085,18 @@ static noinstr void vmx_vcpu_enter_exit(struct
> kvm_vcpu *vcpu,
> 
>  	vmx_enable_fb_clear(vmx);
> 
> +	if (unlikely(vmx->fail))
> +		vmx->exit_reason.full = 0xdead;
> +	else
> +		vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
> +
> +	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
> +	    is_nmi(vmx_get_intr_info(vcpu))) {
> +		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
> +		asm("int $2");
> +		kvm_after_interrupt(vcpu);
> +	}
> +
>  	guest_state_exit_irqoff();
>  }
> 
> @@ -7232,12 +7238,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> *vcpu)
> 
>  	vmx->idt_vectoring_info = 0;
> 
> -	if (unlikely(vmx->fail)) {
> -		vmx->exit_reason.full = 0xdead;
> +	if (unlikely(vmx->fail))
>  		return EXIT_FASTPATH_NONE;
> -	}
> 
> -	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
>  	if (unlikely((u16)vmx->exit_reason.basic ==
> EXIT_REASON_MCE_DURING_VMENTRY))
>  		kvm_machine_check();
> 
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index
> a3da84f4ea45..eb52cfde5ec2 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -680,7 +680,7 @@ static inline unsigned long vmx_get_exit_qual(struct
> kvm_vcpu *vcpu)
>  	return vmx->exit_qualification;
>  }
> 
> -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
> +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index
> 9de72586f406..44d1827f0a30 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -382,13 +382,13 @@ enum kvm_intr_type {
>  	KVM_HANDLING_NMI,
>  };
> 
> -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> -					enum kvm_intr_type intr)
> +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> +						 enum kvm_intr_type intr)
>  {
>  	WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);  }
> 
> -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
>  {
>  	WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);  }
> 
> base-commit: 0fa32dad1e78629cb42999dacd82489503fdf4c2
> --
Peter Zijlstra Nov. 24, 2022, 9:46 a.m. UTC | #29
On Wed, Nov 23, 2022 at 08:42:51PM +0000, Sean Christopherson wrote:
>  arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
>  arch/x86/kvm/vmx/vmenter.S    |  4 +--
>  arch/x86/kvm/vmx/vmx.c        | 51 ++++++++++++++++++-----------------
>  arch/x86/kvm/vmx/vmx.h        |  2 +-
>  arch/x86/kvm/x86.h            |  6 ++---
>  5 files changed, 41 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
> index c09174f73a34..af9bd0374915 100644
> --- a/arch/x86/kvm/kvm_cache_regs.h
> +++ b/arch/x86/kvm/kvm_cache_regs.h
> @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
>   * 1	  0	  register in vcpu->arch
>   * 1	  1	  register in vcpu->arch, needs to be stored back
>   */
> -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> -					     enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> +						      enum kvm_reg reg)
>  {
>  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
>  }
>  
> -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> -					 enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> +						  enum kvm_reg reg)
>  {
>  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
>  }
>  
> -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> -					       enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> +							enum kvm_reg reg)
>  {
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
>  }
>  
> -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> -					   enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> +						    enum kvm_reg reg)
>  {
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
>  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);

You'll have to consider include/asm-generic/bitops/instrumented-non-atomic.h
and friend, and the above should probably switch to using:

  arch_test_bit(), arch___set_bit() resp.

to avoid the explicit instrumentation.
Sean Christopherson Nov. 28, 2022, 4:26 p.m. UTC | #30
On Thu, Nov 24, 2022, Li, Xin3 wrote:
> > > > thouh we'd like want a fair bit of refactoring so that all of
> > > > vmx_vcpu_run() and
> > > > svm_vcpu_run() don't need to be noinstr.
> > 
> > For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
> > 
> > > This sounds reasonable to me, however from
> > > Documentation/core-api/entry.rst, we do need it.
> > 
> > Why do you say that?
> >
> 
> Copy/Paste from Documentation/core-api/entry.rst:

I'm very confused.  What do you mean by "we do need it".  What is "it"?  And what
does "it" have to do with the below documentation?  The documentation does nothing
more than explain how KVM handles task work.
 
> KVM
> ---
> 
> Entering or exiting guest mode is very similar to syscalls. From the host
> kernel point of view the CPU goes off into user space when entering the
> guest and returns to the kernel on exit.
> 
> kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
> and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
> The state operations have the same ordering.
> 
> Task work handling is done separately for guest at the boundary of the
> vcpu_run() loop via xfer_to_guest_mode_handle_work() which is a subset of
> the work handled on return to user space.
> 
> Do not nest KVM entry/exit transitions because doing so is nonsensical.
Sean Christopherson Nov. 28, 2022, 7:05 p.m. UTC | #31
On Thu, Nov 24, 2022, Peter Zijlstra wrote:
> On Wed, Nov 23, 2022 at 08:42:51PM +0000, Sean Christopherson wrote:
> >  arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
> >  arch/x86/kvm/vmx/vmenter.S    |  4 +--
> >  arch/x86/kvm/vmx/vmx.c        | 51 ++++++++++++++++++-----------------
> >  arch/x86/kvm/vmx/vmx.h        |  2 +-
> >  arch/x86/kvm/x86.h            |  6 ++---
> >  5 files changed, 41 insertions(+), 38 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
> > index c09174f73a34..af9bd0374915 100644
> > --- a/arch/x86/kvm/kvm_cache_regs.h
> > +++ b/arch/x86/kvm/kvm_cache_regs.h
> > @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
> >   * 1	  0	  register in vcpu->arch
> >   * 1	  1	  register in vcpu->arch, needs to be stored back
> >   */
> > -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> > -					     enum kvm_reg reg)
> > +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> > +						      enum kvm_reg reg)
> >  {
> >  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> >  }
> >  
> > -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> > -					 enum kvm_reg reg)
> > +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> > +						  enum kvm_reg reg)
> >  {
> >  	return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
> >  }
> >  
> > -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> > -					       enum kvm_reg reg)
> > +static __always_inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> > +							enum kvm_reg reg)
> >  {
> >  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> >  }
> >  
> > -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> > -					   enum kvm_reg reg)
> > +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> > +						    enum kvm_reg reg)
> >  {
> >  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> >  	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
> 
> You'll have to consider include/asm-generic/bitops/instrumented-non-atomic.h
> and friend, and the above should probably switch to using:
> 
>   arch_test_bit(), arch___set_bit() resp.
> 
> to avoid the explicit instrumentation.

Well that's just mean.  I'll figure out a solution, thanks for the heads up!
diff mbox series

Patch

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 89c4233e19db..4c56a8d31762 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -57,4 +57,6 @@  void __noreturn handle_stack_overflow(struct pt_regs *regs,
 		unsigned long vector __maybe_unused)
 typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));
 
+void kvm_vmx_reinject_nmi_irq(u32 vector);
+
 #endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c1eb3bd335ce..9abf91534b13 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1528,6 +1528,29 @@  __visible noinstr void external_interrupt(struct pt_regs *regs,
 	common_interrupt(regs, vector);
 }
 
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * KVM VMX reinjects NMI/IRQ on its current stack, it's a sync
+ * call thus the values in the pt_regs structure are not used in
+ * executing NMI/IRQ handlers, except cs.RPL and flags.IF, which
+ * are both always 0 in the VMX NMI/IRQ reinjection context. Thus
+ * we simply allocate a zeroed pt_regs structure on current stack
+ * to call external_interrupt().
+ */
+void kvm_vmx_reinject_nmi_irq(u32 vector)
+{
+	struct pt_regs irq_regs;
+
+	memset(&irq_regs, 0, sizeof(irq_regs));
+
+	if (vector == NMI_VECTOR)
+		return exc_nmi(&irq_regs);
+
+	external_interrupt(&irq_regs, vector);
+}
+EXPORT_SYMBOL_GPL(kvm_vmx_reinject_nmi_irq);
+#endif
+
 void __init trap_init(void)
 {
 	/* Init cpu_entry_area before IST entries are set up */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8477d8bdd69c..0c1608b329cd 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -317,36 +317,3 @@  SYM_FUNC_START(vmread_error_trampoline)
 
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
-
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
-	/*
-	 * Unconditionally create a stack frame, getting the correct RSP on the
-	 * stack (for x86-64) would take two instructions anyways, and RBP can
-	 * be used to restore RSP to make objtool happy (see below).
-	 */
-	push %_ASM_BP
-	mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
-	 * creating the synthetic interrupt stack frame for the IRQ/NMI.
-	 */
-	and  $-16, %rsp
-	push $__KERNEL_DS
-	push %rbp
-#endif
-	pushf
-	push $__KERNEL_CS
-	CALL_NOSPEC _ASM_ARG1
-
-	/*
-	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
-	 * the correct value.  objtool doesn't know the callee will IRET and,
-	 * without the explicit restore, thinks the stack is getting walloped.
-	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
-	 */
-	mov %_ASM_BP, %_ASM_SP
-	pop %_ASM_BP
-	RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..b457e4888468 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -46,6 +46,7 @@ 
 #include <asm/mshyperv.h>
 #include <asm/mwait.h>
 #include <asm/spec-ctrl.h>
+#include <asm/traps.h>
 #include <asm/virtext.h>
 #include <asm/vmx.h>
 
@@ -6758,15 +6759,11 @@  static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
-					unsigned long entry)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 vector)
 {
-	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
-	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
-	vmx_do_interrupt_nmi_irqoff(entry);
+	kvm_before_interrupt(vcpu, vector == NMI_VECTOR ?
+				   KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
+	kvm_vmx_reinject_nmi_irq(vector);
 	kvm_after_interrupt(vcpu);
 }
 
@@ -6792,7 +6789,6 @@  static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
 
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
-	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
 	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
 	/* if exit due to PF check for async PF */
@@ -6806,20 +6802,19 @@  static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 		kvm_machine_check();
 	/* We need to handle NMIs before interrupts are enabled */
 	else if (is_nmi(intr_info))
-		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
+		handle_interrupt_nmi_irqoff(&vmx->vcpu, NMI_VECTOR);
 }
 
 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 intr_info = vmx_get_intr_info(vcpu);
 	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-	gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
 	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
 	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
-	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+	handle_interrupt_nmi_irqoff(vcpu, vector);
 	vcpu->arch.at_instruction_boundary = true;
 }