Message ID | 1239616545-25199-14-git-send-email-gleb@redhat.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Gleb Natapov wrote: > Signed-off-by: Gleb Natapov <gleb@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > 2 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 8b6f6e9..057a612 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -766,6 +766,7 @@ enum { > #define HF_GIF_MASK (1 << 0) > #define HF_HIF_MASK (1 << 1) > #define HF_VINTR_MASK (1 << 2) > +#define HF_NMI_MASK (1 << 3) > > /* > * Hardware virtualization extension instructions may fault if a > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c605477..cd60fd7 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > return 1; > } > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > +{ > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > + return 0; > +} > + > static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > { > if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) > @@ -2111,6 +2118,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, > [SVM_EXIT_VINTR] = interrupt_window_interception, > /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ > [SVM_EXIT_CPUID] = cpuid_interception, > + [SVM_EXIT_IRET] = iret_interception, > [SVM_EXIT_INVD] = emulate_on_interception, > [SVM_EXIT_HLT] = halt_interception, > [SVM_EXIT_INVLPG] = invlpg_interception, > @@ -2218,6 +2226,11 @@ static void pre_svm_run(struct vcpu_svm *svm) > new_asid(svm, svm_data); > } > > +static void svm_inject_nmi(struct vcpu_svm *svm) > +{ > + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; > + svm->vcpu.arch.hflags |= HF_NMI_MASK; > +} > > static inline void 
svm_inject_irq(struct vcpu_svm *svm, int irq) > { > @@ -2269,6 +2282,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) > vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; > } > > +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && > + !(svm->vcpu.arch.hflags & HF_NMI_MASK); > +} > + > static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) > { > struct vcpu_svm *svm = to_svm(vcpu); > @@ -2284,16 +2305,37 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) > svm_inject_irq(to_svm(vcpu), 0x0); > } > > +static void enable_nmi_window(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + > + if (svm->vcpu.arch.hflags & HF_NMI_MASK) > + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); > + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) > + enable_irq_window(vcpu); > +} > + > static void svm_intr_inject(struct kvm_vcpu *vcpu) > { > /* try to reinject previous events if any */ > + if (vcpu->arch.nmi_injected) { > + svm_inject_nmi(to_svm(vcpu)); > + return; > + } > + > if (vcpu->arch.interrupt.pending) { > svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); > return; > } > > /* try to inject new event if pending */ > - if (kvm_cpu_has_interrupt(vcpu)) { > + if (vcpu->arch.nmi_pending) { > + if (svm_nmi_allowed(vcpu)) { > + vcpu->arch.nmi_pending = false; > + vcpu->arch.nmi_injected = true; > + svm_inject_nmi(vcpu); > + } > + } else if (kvm_cpu_has_interrupt(vcpu)) { Strictly spoken, this 'else' is incorrect: If we have an NMI pending while the NMI window is closed _but_ the guest decided to open the IRQ window, there is no reason why we shouldn't inject an IRQ. Only if we actually injected an NMI, pending IRQs should be skipped for this run. 
> if (svm_interrupt_allowed(vcpu)) { > kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); > svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); > @@ -2312,7 +2354,10 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > svm_intr_inject(vcpu); > > - if (kvm_cpu_has_interrupt(vcpu) || req_int_win) > + /* enable NMI/IRQ window open exits if needed */ > + if (vcpu->arch.nmi_pending) > + enable_nmi_window(vcpu); > + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) > enable_irq_window(vcpu); > > out: Jan
This patch does expose some problems on real HW. The first NMI completes w/o problems. However, if I try to boot the kernel w/ nmi_watchdog=1 or to trigger two NMIs from the monitor, the kernel gets stuck somewhere.
Gleb Natapov wrote: > Signed-off-by: Gleb Natapov <gleb@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > 2 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 8b6f6e9..057a612 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -766,6 +766,7 @@ enum { > #define HF_GIF_MASK (1 << 0) > #define HF_HIF_MASK (1 << 1) > #define HF_VINTR_MASK (1 << 2) > +#define HF_NMI_MASK (1 << 3) > > /* > * Hardware virtualization extension instructions may fault if a > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c605477..cd60fd7 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > return 1; > } > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > +{ > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > + return 0; > +} First, this must return 1 (or set an exit reason, but there is no reason to escape to user space here). And second, I think a corner case is not handled the same way as on real iron: If there is already the next NMI waiting, we will inject it before iret, not after its execution as it should be. No easy solution for this yet. Maybe emulating iret, but there is no implementation, specifically for protected mode. Maybe setting a breakpoint. Or maybe enforcing a single step exception. Nothing trivial in this list. On the other hand, this may only be a slight imprecision of the virtualization. Need to think about it. Jan
Jan Kiszka wrote: > First, this must return 1 (or set an exit reason, but there is no reason > to escape to user space here). And second, I think a corner case is not > handled the same way as on real iron: If there is already the next NMI > waiting, we will inject it before iret, not after its execution as it > should be. > Yes, good catch. > No easy solution for this yet. Maybe emulating iret, but there is no > implementation, specifically for protected mode. That will be a disaster, IRET is horribly complicated. > Maybe setting a > breakpoint. Or maybe enforcing a single step exception. Single step looks good, except that it may conflict with a guest debugging itself (guest debugging an NMI handler?!). > Nothing trivial > in this list. On the other hand, this may only be a slight imprecision > of the virtualization. Need to think about it. > It may cause a stack overflow if we have a huge stream of NMIs (if an NMI triggers an NMI in the handler). Of course that's a totally unrealistic scenario.
Avi Kivity wrote: > Jan Kiszka wrote: >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> > > Yes, good catch. > >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. > > That will be a disaster, IRET is horribly complicated. Yeah, I know... > >> Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. > > Single step looks good, except that it may conflict with a guest > debugging itself (guest debugging an NMI handler?!). But that should be solvable without too much effort. BTW, guest single-stepping in and out of interrupt handlers is not properly working anyway. We only set TF in current eflags but do not bother about the CPU state that will get loaded next. Given some rainy days (or a paying customer) I think I'll look into this once again. Same goes for suppressing IRQ injection while single-stepping just as QEMU does, which may even come earlier as someone already asked for it. > >> Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. >> > > It may cause a stack overflow if we have a huge stream of NMIs (if an > NMI triggers an NMI in the handler). Of course that's a totally > unrealistic scenario. > Good point. But as it is a corner case, I think we can fly without a complete solution first, fixing it in a second step. Jan
On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > Gleb Natapov wrote: > > Signed-off-by: Gleb Natapov <gleb@redhat.com> > > --- > > arch/x86/include/asm/kvm_host.h | 1 + > > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > > 2 files changed, 48 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > index 8b6f6e9..057a612 100644 > > --- a/arch/x86/include/asm/kvm_host.h > > +++ b/arch/x86/include/asm/kvm_host.h > > @@ -766,6 +766,7 @@ enum { > > #define HF_GIF_MASK (1 << 0) > > #define HF_HIF_MASK (1 << 1) > > #define HF_VINTR_MASK (1 << 2) > > +#define HF_NMI_MASK (1 << 3) > > > > /* > > * Hardware virtualization extension instructions may fault if a > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > > index c605477..cd60fd7 100644 > > --- a/arch/x86/kvm/svm.c > > +++ b/arch/x86/kvm/svm.c > > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > > return 1; > > } > > > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > > +{ > > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > > + return 0; > > +} > > First, this must return 1 (or set an exit reason, but there is no reason > to escape to user space here). And second, I think a corner case is not > handled the same way as on real iron: If there is already the next NMI > waiting, we will inject it before iret, not after its execution as it > should be. > > No easy solution for this yet. Maybe emulating iret, but there is no > implementation, specifically for protected mode. Maybe setting a > breakpoint. Or maybe enforcing a single step exception. Nothing trivial > in this list. On the other hand, this may only be a slight imprecision > of the virtualization. Need to think about it. 
> What about this: Instead of clearing HF_NMI_MASK in iret_interception() we can set another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and HF_IRET) if HF_IRET is set, but do that after checking for NMI injection. The pending NMI will be injected on the next entry. Also not how real HW works, but may be better than the current situation. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > >> Gleb Natapov wrote: >> >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>> --- >>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>> 2 files changed, 48 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>> index 8b6f6e9..057a612 100644 >>> --- a/arch/x86/include/asm/kvm_host.h >>> +++ b/arch/x86/include/asm/kvm_host.h >>> @@ -766,6 +766,7 @@ enum { >>> #define HF_GIF_MASK (1 << 0) >>> #define HF_HIF_MASK (1 << 1) >>> #define HF_VINTR_MASK (1 << 2) >>> +#define HF_NMI_MASK (1 << 3) >>> >>> /* >>> * Hardware virtualization extension instructions may fault if a >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>> index c605477..cd60fd7 100644 >>> --- a/arch/x86/kvm/svm.c >>> +++ b/arch/x86/kvm/svm.c >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> return 1; >>> } >>> >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> +{ >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>> + return 0; >>> +} >>> >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. 
>> >> > What about this: > Instead of clearing HF_NMI_MASK in iret_interception() we can set > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > HF_IRET) if HF_IRET is set, but do that after checking for NMI > injection. The pending NMI will be injected on the next entry. > Also not how real HW works, but may be better then current situation. > There may not be a next entry if the guest is in a tight loop. Given NMIs are used for watchdogs, that's not good. btw, injection before IRET is executed is broken if interrupt stack tables are used, since the injection will reset rsp instead of nesting.
On Sun, Apr 19, 2009 at 04:21:29PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >> >>> Gleb Natapov wrote: >>> >>>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>>> --- >>>> arch/x86/include/asm/kvm_host.h | 1 + >>>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>>> 2 files changed, 48 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>>> index 8b6f6e9..057a612 100644 >>>> --- a/arch/x86/include/asm/kvm_host.h >>>> +++ b/arch/x86/include/asm/kvm_host.h >>>> @@ -766,6 +766,7 @@ enum { >>>> #define HF_GIF_MASK (1 << 0) >>>> #define HF_HIF_MASK (1 << 1) >>>> #define HF_VINTR_MASK (1 << 2) >>>> +#define HF_NMI_MASK (1 << 3) >>>> /* >>>> * Hardware virtualization extension instructions may fault if a >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>>> index c605477..cd60fd7 100644 >>>> --- a/arch/x86/kvm/svm.c >>>> +++ b/arch/x86/kvm/svm.c >>>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>> return 1; >>>> } >>>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run >>>> *kvm_run) >>>> +{ >>>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>>> + return 0; >>>> +} >>>> >>> First, this must return 1 (or set an exit reason, but there is no reason >>> to escape to user space here). And second, I think a corner case is not >>> handled the same way as on real iron: If there is already the next NMI >>> waiting, we will inject it before iret, not after its execution as it >>> should be. >>> >>> No easy solution for this yet. Maybe emulating iret, but there is no >>> implementation, specifically for protected mode. Maybe setting a >>> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >>> in this list. 
On the other hand, this may only be a slight imprecision >>> of the virtualization. Need to think about it. >>> >>> >> What about this: >> Instead of clearing HF_NMI_MASK in iret_interception() we can set >> another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and >> HF_IRET) if HF_IRET is set, but do that after checking for NMI >> injection. The pending NMI will be injected on the next entry. >> Also not how real HW works, but may be better then current situation. >> > > There may not be a next entry if the guest is in a tight loop. Given > NMIs are used for watchdogs, that's not good. > We don't exit a guest after kvm time slice ends? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >> Gleb Natapov wrote: >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>> --- >>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>> 2 files changed, 48 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>> index 8b6f6e9..057a612 100644 >>> --- a/arch/x86/include/asm/kvm_host.h >>> +++ b/arch/x86/include/asm/kvm_host.h >>> @@ -766,6 +766,7 @@ enum { >>> #define HF_GIF_MASK (1 << 0) >>> #define HF_HIF_MASK (1 << 1) >>> #define HF_VINTR_MASK (1 << 2) >>> +#define HF_NMI_MASK (1 << 3) >>> >>> /* >>> * Hardware virtualization extension instructions may fault if a >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>> index c605477..cd60fd7 100644 >>> --- a/arch/x86/kvm/svm.c >>> +++ b/arch/x86/kvm/svm.c >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> return 1; >>> } >>> >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> +{ >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>> + return 0; >>> +} >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. 
>> > What about this: > Instead of clearing HF_NMI_MASK in iret_interception() we can set > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > HF_IRET) if HF_IRET is set, but do that after checking for NMI > injection. The pending NMI will be injected on the next entry. > Also not how real HW works, but may be better then current situation. It's OK as a first step towards correct NMI emulation. Additionally, you could enable the IRQ window interception in case the is an NMI pending. The resulting behavior should then much like the VNMI mask emulation for vmx. The next step should then be setting TF in the eflags stored on the guest's stack before returning *if* there is already the next NMI pending. But I wonder how much additional effort this will actually mean (compared to the band-aid work)... :) Jan
Gleb Natapov wrote: >> There may not be a next entry if the guest is in a tight loop. Given >> NMIs are used for watchdogs, that's not good. >> >> > We don't exit a guest after kvm time slice ends? > There are no time slices any more. If there's only one thread for a vcpu, you might have no exits at all with a tickless kernel.
On Sun, Apr 19, 2009 at 03:27:22PM +0200, Jan Kiszka wrote: > Gleb Natapov wrote: > > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > >> Gleb Natapov wrote: > >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> > >>> --- > >>> arch/x86/include/asm/kvm_host.h | 1 + > >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > >>> 2 files changed, 48 insertions(+), 2 deletions(-) > >>> > >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > >>> index 8b6f6e9..057a612 100644 > >>> --- a/arch/x86/include/asm/kvm_host.h > >>> +++ b/arch/x86/include/asm/kvm_host.h > >>> @@ -766,6 +766,7 @@ enum { > >>> #define HF_GIF_MASK (1 << 0) > >>> #define HF_HIF_MASK (1 << 1) > >>> #define HF_VINTR_MASK (1 << 2) > >>> +#define HF_NMI_MASK (1 << 3) > >>> > >>> /* > >>> * Hardware virtualization extension instructions may fault if a > >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > >>> index c605477..cd60fd7 100644 > >>> --- a/arch/x86/kvm/svm.c > >>> +++ b/arch/x86/kvm/svm.c > >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > >>> return 1; > >>> } > >>> > >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > >>> +{ > >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > >>> + return 0; > >>> +} > >> First, this must return 1 (or set an exit reason, but there is no reason > >> to escape to user space here). And second, I think a corner case is not > >> handled the same way as on real iron: If there is already the next NMI > >> waiting, we will inject it before iret, not after its execution as it > >> should be. > >> > >> No easy solution for this yet. Maybe emulating iret, but there is no > >> implementation, specifically for protected mode. Maybe setting a > >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial > >> in this list. 
On the other hand, this may only be a slight imprecision > >> of the virtualization. Need to think about it. > >> > > What about this: > > Instead of clearing HF_NMI_MASK in iret_interception() we can set > > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > > HF_IRET) if HF_IRET is set, but do that after checking for NMI > > injection. The pending NMI will be injected on the next entry. > > Also not how real HW works, but may be better then current situation. > > It's OK as a first step towards correct NMI emulation. Additionally, you > could enable the IRQ window interception in case the is an NMI pending. > The resulting behavior should then much like the VNMI mask emulation for > vmx. > Yeah, but the question is if IRQ windows is already opened will exit happens before or after IRET. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 19, 2009 at 04:28:09PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> There may not be a next entry if the guest is in a tight loop. Given >>> NMIs are used for watchdogs, that's not good. >>> >>> >> We don't exit a guest after kvm time slice ends? >> > > There are no time slices any more. If there's only once thread for a > vcpu, you might have no exits at all with a tickless kernel. > Well, KVM may request some kind of event (timer) that will cause exit to VCPU. This looks hacky though. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 03:27:22PM +0200, Jan Kiszka wrote: >> Gleb Natapov wrote: >>> On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >>>> Gleb Natapov wrote: >>>>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>>>> --- >>>>> arch/x86/include/asm/kvm_host.h | 1 + >>>>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>>>> 2 files changed, 48 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>>>> index 8b6f6e9..057a612 100644 >>>>> --- a/arch/x86/include/asm/kvm_host.h >>>>> +++ b/arch/x86/include/asm/kvm_host.h >>>>> @@ -766,6 +766,7 @@ enum { >>>>> #define HF_GIF_MASK (1 << 0) >>>>> #define HF_HIF_MASK (1 << 1) >>>>> #define HF_VINTR_MASK (1 << 2) >>>>> +#define HF_NMI_MASK (1 << 3) >>>>> >>>>> /* >>>>> * Hardware virtualization extension instructions may fault if a >>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>>>> index c605477..cd60fd7 100644 >>>>> --- a/arch/x86/kvm/svm.c >>>>> +++ b/arch/x86/kvm/svm.c >>>>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>>> return 1; >>>>> } >>>>> >>>>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>>> +{ >>>>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>>>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>>>> + return 0; >>>>> +} >>>> First, this must return 1 (or set an exit reason, but there is no reason >>>> to escape to user space here). And second, I think a corner case is not >>>> handled the same way as on real iron: If there is already the next NMI >>>> waiting, we will inject it before iret, not after its execution as it >>>> should be. >>>> >>>> No easy solution for this yet. Maybe emulating iret, but there is no >>>> implementation, specifically for protected mode. Maybe setting a >>>> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >>>> in this list. 
On the other hand, this may only be a slight imprecision >>>> of the virtualization. Need to think about it. >>>> >>> What about this: >>> Instead of clearing HF_NMI_MASK in iret_interception() we can set >>> another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and >>> HF_IRET) if HF_IRET is set, but do that after checking for NMI >>> injection. The pending NMI will be injected on the next entry. >>> Also not how real HW works, but may be better then current situation. >> It's OK as a first step towards correct NMI emulation. Additionally, you >> could enable the IRQ window interception in case the is an NMI pending. >> The resulting behavior should then much like the VNMI mask emulation for >> vmx. >> > Yeah, but the question is if IRQ windows is already opened will exit > happens before or after IRET. > Hey, this is band-aid, it won't heal all cases. ;) Jan
Gleb Natapov wrote: >> It's OK as a first step towards correct NMI emulation. Additionally, you >> could enable the IRQ window interception in case the is an NMI pending. >> The resulting behavior should then much like the VNMI mask emulation for >> vmx. >> >> > Yeah, but the question is if IRQ windows is already opened will exit > happens before or after IRET. > You mean if the NMI handler enabled interrupts?
On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> It's OK as a first step towards correct NMI emulation. Additionally, you >>> could enable the IRQ window interception in case the is an NMI pending. >>> The resulting behavior should then much like the VNMI mask emulation for >>> vmx. >>> >>> >> Yeah, but the question is if IRQ windows is already opened will exit >> happens before or after IRET. >> > > You mean if the NMI handler enabled interrupts? > Yes. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: >> >>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>> could enable the IRQ window interception in case the is an NMI pending. >>>> The resulting behavior should then much like the VNMI mask emulation for >>>> vmx. >>>> >>>> >>>> >>> Yeah, but the question is if IRQ windows is already opened will exit >>> happens before or after IRET. >>> >>> >> You mean if the NMI handler enabled interrupts? >> >> > Yes. > > Then the guest deserves whatever it gets...
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 04:28:09PM +0300, Avi Kivity wrote: >> Gleb Natapov wrote: >>>> There may not be a next entry if the guest is in a tight loop. Given >>>> NMIs are used for watchdogs, that's not good. >>>> >>>> >>> We don't exit a guest after kvm time slice ends? >>> >> There are no time slices any more. If there's only once thread for a >> vcpu, you might have no exits at all with a tickless kernel. >> > Well, KVM may request some kind of even (timer) that will cause exit to > VCPU. This looks hacky though. We already spent too many electrons and brain cycles on possibly "much simpler" workarounds. I think injecting and handling a single-step, even while there is guest debugging going on or the guest itself single-steps or both, will not be more complicated - but "more correct". Jan
On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: >> >>> Gleb Natapov wrote: >>> >>>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>>> could enable the IRQ window interception in case the is an NMI pending. >>>>> The resulting behavior should then much like the VNMI mask emulation for >>>>> vmx. >>>>> >>>>> >>>> Yeah, but the question is if IRQ windows is already opened will exit >>>> happens before or after IRET. >>>> >>> You mean if the NMI handler enabled interrupts? >>> >>> >> Yes. >> >> > > Then the guest deserves whatever it gets... > I suspect windows may do this since it uses NMI for task switching. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jan Kiszka wrote: > We already spent to much electrons and brain cycles on possibly "much > simpler" workarounds. I think injecting and handling a single-step, even > while there is guest debugging going on or the guest itself single-steps > or both, will not be more complicated - but "more correct". > I agree. I'm still worried about interactions between the IRET single stepping code and other things which use the debug registers.
On Sun, Apr 19, 2009 at 04:49:18PM +0300, Avi Kivity wrote: > Jan Kiszka wrote: >> We already spent to much electrons and brain cycles on possibly "much >> simpler" workarounds. I think injecting and handling a single-step, even >> while there is guest debugging going on or the guest itself single-steps >> or both, will not be more complicated - but "more correct". >> > > I agree. I'm still worried about interactions between the IRET single > stepping code and other things which use the debug registers. > I don't disagree too. Just throwing other ideas :) -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > Jan Kiszka wrote: >> We already spent to much electrons and brain cycles on possibly "much >> simpler" workarounds. I think injecting and handling a single-step, even >> while there is guest debugging going on or the guest itself single-steps >> or both, will not be more complicated - but "more correct". >> > > I agree. I'm still worried about interactions between the IRET single > stepping code and other things which use the debug registers. The interaction is inside KVM, so under our control. We may simply save the related states, hook into the guest-debugging parts that evaluate single step exceptions, and handle this case with highest prio, transparently to users of lower prio. In theory - you never really know until you tried to implement it... Jan
Gleb Natapov <gleb@redhat.com> writes: > On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: >> Gleb Natapov wrote: >>> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: >>> >>>> Gleb Natapov wrote: >>>> >>>>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>>>> could enable the IRQ window interception in case the is an NMI pending. >>>>>> The resulting behavior should then much like the VNMI mask emulation for >>>>>> vmx. >>>>>> >>>>>> >>>>> Yeah, but the question is if IRQ windows is already opened will exit >>>>> happens before or after IRET. >>>>> >>>> You mean if the NMI handler enabled interrupts? >>>> >>>> >>> Yes. >>> >>> >> >> Then the guest deserves whatever it gets... >> > I suspect windows may do this since it uses NMI for task switching. Could you elaborate on that? How/why does it use NMIs for task switching? Regards,
On Sun, Apr 19, 2009 at 04:07:52PM +0200, Julian Stecklina wrote: > Gleb Natapov <gleb@redhat.com> writes: > > > On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: > >>> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > >>> > >>>> Gleb Natapov wrote: > >>>> > >>>>>> It's OK as a first step towards correct NMI emulation. Additionally, you > >>>>>> could enable the IRQ window interception in case the is an NMI pending. > >>>>>> The resulting behavior should then much like the VNMI mask emulation for > >>>>>> vmx. > >>>>>> > >>>>>> > >>>>> Yeah, but the question is if IRQ windows is already opened will exit > >>>>> happens before or after IRET. > >>>>> > >>>> You mean if the NMI handler enabled interrupts? > >>>> > >>>> > >>> Yes. > >>> > >>> > >> > >> Then the guest deserves whatever it gets... > >> > > I suspect windows may do this since it uses NMI for task switching. > > Could you elaborate on that? How/why does it use NMIs for task > switching? > During WHQL testing (or if you just enable verifier on windows 2003) windows changes hibernate to not power down a PC, but resume immediately. During this immediate resume it sends NMI to non-boot CPUs while IDT for nmi is configured as a task gate. I am not sure it actually calls IRET after that. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: >> Could you elaborate on that? How/why does it use NMIs for task >> switching? >> >> > During WHQL testing (or if you just enable verifier on windows 2003) > windows changes hibernate to not power down a PC, but resume > immediately. During this immediate resume it sends NMI to non-boot CPUs > while IDT for nmi is configured as a task gate. I am not sure it > actually calls IRET after that. > If it doesn't call IRET, it will never see another NMI. But of course it will execute IRET, as part of normal execution. You can't do anything without it.
On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> Could you elaborate on that? How/why does it use NMIs for task >>> switching? >>> >>> >> During WHQL testing (or if you just enable verifier on windows 2003) >> windows changes hibernate to not power down a PC, but resume >> immediately. During this immediate resume it sends NMI to non-boot CPUs >> while IDT for nmi is configured as a task gate. I am not sure it >> actually calls IRET after that. >> > > If it doesn't call IRET, it will never see another NMI. > > But of course it will execute IRET, as part of normal execution. You > can't do anything without it. > Boot CPU can send INIT after task switch (and I think this is what happens). -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: >> >>>> Could you elaborate on that? How/why does it use NMIs for task >>>> switching? >>>> >>>> >>>> >>> During WHQL testing (or if you just enable verifier on windows 2003) >>> windows changes hibernate to not power down a PC, but resume >>> immediately. During this immediate resume it sends NMI to non-boot CPUs >>> while IDT for nmi is configured as a task gate. I am not sure it >>> actually calls IRET after that. >>> >>> >> If it doesn't call IRET, it will never see another NMI. >> >> But of course it will execute IRET, as part of normal execution. You >> can't do anything without it. >> >> > Boot CPU can send INIT after task switch (and I think this is what > happens). > But eventually it will execute IRET. (We need to fix INIT to clear the NMI blocking flag, not that it matters so much)
On Sun, Apr 19, 2009 at 05:57:56PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: >> >>> Gleb Natapov wrote: >>> >>>>> Could you elaborate on that? How/why does it use NMIs for task >>>>> switching? >>>>> >>>>> >>>> During WHQL testing (or if you just enable verifier on windows 2003) >>>> windows changes hibernate to not power down a PC, but resume >>>> immediately. During this immediate resume it sends NMI to non-boot CPUs >>>> while IDT for nmi is configured as a task gate. I am not sure it >>>> actually calls IRET after that. >>>> >>> If it doesn't call IRET, it will never see another NMI. >>> >>> But of course it will execute IRET, as part of normal execution. You >>> can't do anything without it. >>> >>> >> Boot CPU can send INIT after task switch (and I think this is what >> happens). >> > > But eventually it will execute IRET. > Yes :) But I strongly suspect that NMI window will be opened after SIPI even before first IRET. > (We need to fix INIT to clear the NMI blocking flag, not that it matters > so much) If we reset intercept mask on INIT, but don't clear NMI blocking flag we will never receive NMIs on the vcpu. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b6f6e9..057a612 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -766,6 +766,7 @@ enum { #define HF_GIF_MASK (1 << 0) #define HF_HIF_MASK (1 << 1) #define HF_VINTR_MASK (1 << 2) +#define HF_NMI_MASK (1 << 3) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c605477..cd60fd7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + return 0; +} + static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) @@ -2111,6 +2118,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2218,6 +2226,11 @@ static void pre_svm_run(struct vcpu_svm *svm) new_asid(svm, svm_data); } +static void svm_inject_nmi(struct vcpu_svm *svm) +{ + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + svm->vcpu.arch.hflags |= HF_NMI_MASK; +} static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { @@ -2269,6 +2282,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return 
!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2284,16 +2305,37 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) svm_inject_irq(to_svm(vcpu), 0x0); } +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm->vcpu.arch.hflags & HF_NMI_MASK) + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + enable_irq_window(vcpu); +} + static void svm_intr_inject(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ + if (vcpu->arch.nmi_injected) { + svm_inject_nmi(to_svm(vcpu)); + return; + } + if (vcpu->arch.interrupt.pending) { svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); return; } /* try to inject new event if pending */ - if (kvm_cpu_has_interrupt(vcpu)) { + if (vcpu->arch.nmi_pending) { + if (svm_nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + svm_inject_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { if (svm_interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); @@ -2312,7 +2354,10 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm_intr_inject(vcpu); - if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) enable_irq_window(vcpu); out:
Signed-off-by: Gleb Natapov <gleb@redhat.com> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-)