Message ID | 1239616545-25199-14-git-send-email-gleb@redhat.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Gleb Natapov wrote: > Signed-off-by: Gleb Natapov <gleb@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > 2 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 8b6f6e9..057a612 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -766,6 +766,7 @@ enum { > #define HF_GIF_MASK (1 << 0) > #define HF_HIF_MASK (1 << 1) > #define HF_VINTR_MASK (1 << 2) > +#define HF_NMI_MASK (1 << 3) > > /* > * Hardware virtualization extension instructions may fault if a > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c605477..cd60fd7 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > return 1; > } > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > +{ > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > + return 0; > +} > + > static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > { > if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) > @@ -2111,6 +2118,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, > [SVM_EXIT_VINTR] = interrupt_window_interception, > /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ > [SVM_EXIT_CPUID] = cpuid_interception, > + [SVM_EXIT_IRET] = iret_interception, > [SVM_EXIT_INVD] = emulate_on_interception, > [SVM_EXIT_HLT] = halt_interception, > [SVM_EXIT_INVLPG] = invlpg_interception, > @@ -2218,6 +2226,11 @@ static void pre_svm_run(struct vcpu_svm *svm) > new_asid(svm, svm_data); > } > > +static void svm_inject_nmi(struct vcpu_svm *svm) > +{ > + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; > + svm->vcpu.arch.hflags |= HF_NMI_MASK; > +} > > static inline void 
svm_inject_irq(struct vcpu_svm *svm, int irq) > { > @@ -2269,6 +2282,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) > vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; > } > > +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && > + !(svm->vcpu.arch.hflags & HF_NMI_MASK); > +} > + > static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) > { > struct vcpu_svm *svm = to_svm(vcpu); > @@ -2284,16 +2305,37 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) > svm_inject_irq(to_svm(vcpu), 0x0); > } > > +static void enable_nmi_window(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + > + if (svm->vcpu.arch.hflags & HF_NMI_MASK) > + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); > + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) > + enable_irq_window(vcpu); > +} > + > static void svm_intr_inject(struct kvm_vcpu *vcpu) > { > /* try to reinject previous events if any */ > + if (vcpu->arch.nmi_injected) { > + svm_inject_nmi(to_svm(vcpu)); > + return; > + } > + > if (vcpu->arch.interrupt.pending) { > svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); > return; > } > > /* try to inject new event if pending */ > - if (kvm_cpu_has_interrupt(vcpu)) { > + if (vcpu->arch.nmi_pending) { > + if (svm_nmi_allowed(vcpu)) { > + vcpu->arch.nmi_pending = false; > + vcpu->arch.nmi_injected = true; > + svm_inject_nmi(vcpu); > + } > + } else if (kvm_cpu_has_interrupt(vcpu)) { Strictly spoken, this 'else' is incorrect: If we have an NMI pending while the NMI window is closed _but_ the guest decided to open the IRQ window, there is no reason why we shouldn't inject an IRQ. Only if we actually injected an NMI, pending IRQs should be skipped for this run. 
> if (svm_interrupt_allowed(vcpu)) { > kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); > svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); > @@ -2312,7 +2354,10 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > svm_intr_inject(vcpu); > > - if (kvm_cpu_has_interrupt(vcpu) || req_int_win) > + /* enable NMI/IRQ window open exits if needed */ > + if (vcpu->arch.nmi_pending) > + enable_nmi_window(vcpu); > + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) > enable_irq_window(vcpu); > > out: Jan
This patch does expose some problems on real HW. The first NMI completes w/o problems. However, if I try to boot the kernel w/ nmi_watchdog=1 or to trigger two NMIs from the monitor, the kernel gets stuck somewhere.
Gleb Natapov wrote: > Signed-off-by: Gleb Natapov <gleb@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > 2 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 8b6f6e9..057a612 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -766,6 +766,7 @@ enum { > #define HF_GIF_MASK (1 << 0) > #define HF_HIF_MASK (1 << 1) > #define HF_VINTR_MASK (1 << 2) > +#define HF_NMI_MASK (1 << 3) > > /* > * Hardware virtualization extension instructions may fault if a > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c605477..cd60fd7 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > return 1; > } > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > +{ > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > + return 0; > +} First, this must return 1 (or set an exit reason, but there is no reason to escape to user space here). And second, I think a corner case is not handled the same way as on real iron: If there is already the next NMI waiting, we will inject it before iret, not after its execution as it should be. No easy solution for this yet. Maybe emulating iret, but there is no implementation, specifically for protected mode. Maybe setting a breakpoint. Or maybe enforcing a single step exception. Nothing trivial in this list. On the other hand, this may only be a slight imprecision of the virtualization. Need to think about it. Jan
Jan Kiszka wrote: > First, this must return 1 (or set an exit reason, but there is no reason > to escape to user space here). And second, I think a corner case is not > handled the same way as on real iron: If there is already the next NMI > waiting, we will inject it before iret, not after its execution as it > should be. > Yes, good catch. > No easy solution for this yet. Maybe emulating iret, but there is no > implementation, specifically for protected mode. That will be a disaster, IRET is horribly complicated. > Maybe setting a > breakpoint. Or maybe enforcing a single step exception. Single step looks good, except that it may conflict with a guest debugging itself (guest debugging an NMI handler?!). > Nothing trivial > in this list. On the other hand, this may only be a slight imprecision > of the virtualization. Need to think about it. > It may cause a stack overflow if we have a huge stream of NMIs (if an NMI triggers an NMI in the handler). Of course that's a totally unrealistic scenario.
Avi Kivity wrote: > Jan Kiszka wrote: >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> > > Yes, good catch. > >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. > > That will be a disaster, IRET is horribly complicated. Yeah, I know... > >> Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. > > Single step looks good, except that it may conflict with a guest > debugging itself (guest debugging an NMI handler?!). But that should be solvable without too much effort. BTW, guest single-stepping in and out of interrupt handlers is not properly working anyway. We only set TF in current eflags but do not bother about the CPU state that will get loaded next. Given some rainy days (or a paying customer) I think I'll look into this once again. Same goes for suppressing IRQ injection while single-stepping just as QEMU does, which may even come earlier as someone already asked for it. > >> Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. >> > > It may cause a stack overflow if we have a huge stream of NMIs (if an > NMI triggers an NMI in the handler). Of course that's a totally > unrealistic scenario. > Good point. But as it is a corner case, I think we can fly without a complete solution first, fixing it in a second step. Jan
On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > Gleb Natapov wrote: > > Signed-off-by: Gleb Natapov <gleb@redhat.com> > > --- > > arch/x86/include/asm/kvm_host.h | 1 + > > arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > > 2 files changed, 48 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > index 8b6f6e9..057a612 100644 > > --- a/arch/x86/include/asm/kvm_host.h > > +++ b/arch/x86/include/asm/kvm_host.h > > @@ -766,6 +766,7 @@ enum { > > #define HF_GIF_MASK (1 << 0) > > #define HF_HIF_MASK (1 << 1) > > #define HF_VINTR_MASK (1 << 2) > > +#define HF_NMI_MASK (1 << 3) > > > > /* > > * Hardware virtualization extension instructions may fault if a > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > > index c605477..cd60fd7 100644 > > --- a/arch/x86/kvm/svm.c > > +++ b/arch/x86/kvm/svm.c > > @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > > return 1; > > } > > > > +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > > +{ > > + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > > + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > > + return 0; > > +} > > First, this must return 1 (or set an exit reason, but there is no reason > to escape to user space here). And second, I think a corner case is not > handled the same way as on real iron: If there is already the next NMI > waiting, we will inject it before iret, not after its execution as it > should be. > > No easy solution for this yet. Maybe emulating iret, but there is no > implementation, specifically for protected mode. Maybe setting a > breakpoint. Or maybe enforcing a single step exception. Nothing trivial > in this list. On the other hand, this may only be a slight imprecision > of the virtualization. Need to think about it. 
> What about this: Instead of clearing HF_NMI_MASK in iret_interception() we can set another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and HF_IRET) if HF_IRET is set, but do that after checking for NMI injection. The pending NMI will be injected on the next entry. Also not how real HW works, but may be better than the current situation. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > >> Gleb Natapov wrote: >> >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>> --- >>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>> 2 files changed, 48 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>> index 8b6f6e9..057a612 100644 >>> --- a/arch/x86/include/asm/kvm_host.h >>> +++ b/arch/x86/include/asm/kvm_host.h >>> @@ -766,6 +766,7 @@ enum { >>> #define HF_GIF_MASK (1 << 0) >>> #define HF_HIF_MASK (1 << 1) >>> #define HF_VINTR_MASK (1 << 2) >>> +#define HF_NMI_MASK (1 << 3) >>> >>> /* >>> * Hardware virtualization extension instructions may fault if a >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>> index c605477..cd60fd7 100644 >>> --- a/arch/x86/kvm/svm.c >>> +++ b/arch/x86/kvm/svm.c >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> return 1; >>> } >>> >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> +{ >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>> + return 0; >>> +} >>> >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. 
>> >> > What about this: > Instead of clearing HF_NMI_MASK in iret_interception() we can set > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > HF_IRET) if HF_IRET is set, but do that after checking for NMI > injection. The pending NMI will be injected on the next entry. > Also not how real HW works, but may be better then current situation. > There may not be a next entry if the guest is in a tight loop. Given NMIs are used for watchdogs, that's not good. btw, injection before IRET is executed is broken if interrupt stack tables are used, since the injection will reset rsp instead of nesting.
On Sun, Apr 19, 2009 at 04:21:29PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >> >>> Gleb Natapov wrote: >>> >>>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>>> --- >>>> arch/x86/include/asm/kvm_host.h | 1 + >>>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>>> 2 files changed, 48 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>>> index 8b6f6e9..057a612 100644 >>>> --- a/arch/x86/include/asm/kvm_host.h >>>> +++ b/arch/x86/include/asm/kvm_host.h >>>> @@ -766,6 +766,7 @@ enum { >>>> #define HF_GIF_MASK (1 << 0) >>>> #define HF_HIF_MASK (1 << 1) >>>> #define HF_VINTR_MASK (1 << 2) >>>> +#define HF_NMI_MASK (1 << 3) >>>> /* >>>> * Hardware virtualization extension instructions may fault if a >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>>> index c605477..cd60fd7 100644 >>>> --- a/arch/x86/kvm/svm.c >>>> +++ b/arch/x86/kvm/svm.c >>>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>> return 1; >>>> } >>>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run >>>> *kvm_run) >>>> +{ >>>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>>> + return 0; >>>> +} >>>> >>> First, this must return 1 (or set an exit reason, but there is no reason >>> to escape to user space here). And second, I think a corner case is not >>> handled the same way as on real iron: If there is already the next NMI >>> waiting, we will inject it before iret, not after its execution as it >>> should be. >>> >>> No easy solution for this yet. Maybe emulating iret, but there is no >>> implementation, specifically for protected mode. Maybe setting a >>> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >>> in this list. 
On the other hand, this may only be a slight imprecision >>> of the virtualization. Need to think about it. >>> >>> >> What about this: >> Instead of clearing HF_NMI_MASK in iret_interception() we can set >> another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and >> HF_IRET) if HF_IRET is set, but do that after checking for NMI >> injection. The pending NMI will be injected on the next entry. >> Also not how real HW works, but may be better then current situation. >> > > There may not be a next entry if the guest is in a tight loop. Given > NMIs are used for watchdogs, that's not good. > We don't exit a guest after kvm time slice ends? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >> Gleb Natapov wrote: >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>> --- >>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>> 2 files changed, 48 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>> index 8b6f6e9..057a612 100644 >>> --- a/arch/x86/include/asm/kvm_host.h >>> +++ b/arch/x86/include/asm/kvm_host.h >>> @@ -766,6 +766,7 @@ enum { >>> #define HF_GIF_MASK (1 << 0) >>> #define HF_HIF_MASK (1 << 1) >>> #define HF_VINTR_MASK (1 << 2) >>> +#define HF_NMI_MASK (1 << 3) >>> >>> /* >>> * Hardware virtualization extension instructions may fault if a >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>> index c605477..cd60fd7 100644 >>> --- a/arch/x86/kvm/svm.c >>> +++ b/arch/x86/kvm/svm.c >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> return 1; >>> } >>> >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>> +{ >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>> + return 0; >>> +} >> First, this must return 1 (or set an exit reason, but there is no reason >> to escape to user space here). And second, I think a corner case is not >> handled the same way as on real iron: If there is already the next NMI >> waiting, we will inject it before iret, not after its execution as it >> should be. >> >> No easy solution for this yet. Maybe emulating iret, but there is no >> implementation, specifically for protected mode. Maybe setting a >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >> in this list. On the other hand, this may only be a slight imprecision >> of the virtualization. Need to think about it. 
>> > What about this: > Instead of clearing HF_NMI_MASK in iret_interception() we can set > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > HF_IRET) if HF_IRET is set, but do that after checking for NMI > injection. The pending NMI will be injected on the next entry. > Also not how real HW works, but may be better then current situation. It's OK as a first step towards correct NMI emulation. Additionally, you could enable the IRQ window interception in case the is an NMI pending. The resulting behavior should then much like the VNMI mask emulation for vmx. The next step should then be setting TF in the eflags stored on the guest's stack before returning *if* there is already the next NMI pending. But I wonder how much additional effort this will actually mean (compared to the band-aid work)... :) Jan
Gleb Natapov wrote: >> There may not be a next entry if the guest is in a tight loop. Given >> NMIs are used for watchdogs, that's not good. >> >> > We don't exit a guest after kvm time slice ends? > There are no time slices any more. If there's only one thread for a vcpu, you might have no exits at all with a tickless kernel.
On Sun, Apr 19, 2009 at 03:27:22PM +0200, Jan Kiszka wrote: > Gleb Natapov wrote: > > On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: > >> Gleb Natapov wrote: > >>> Signed-off-by: Gleb Natapov <gleb@redhat.com> > >>> --- > >>> arch/x86/include/asm/kvm_host.h | 1 + > >>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- > >>> 2 files changed, 48 insertions(+), 2 deletions(-) > >>> > >>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > >>> index 8b6f6e9..057a612 100644 > >>> --- a/arch/x86/include/asm/kvm_host.h > >>> +++ b/arch/x86/include/asm/kvm_host.h > >>> @@ -766,6 +766,7 @@ enum { > >>> #define HF_GIF_MASK (1 << 0) > >>> #define HF_HIF_MASK (1 << 1) > >>> #define HF_VINTR_MASK (1 << 2) > >>> +#define HF_NMI_MASK (1 << 3) > >>> > >>> /* > >>> * Hardware virtualization extension instructions may fault if a > >>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > >>> index c605477..cd60fd7 100644 > >>> --- a/arch/x86/kvm/svm.c > >>> +++ b/arch/x86/kvm/svm.c > >>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > >>> return 1; > >>> } > >>> > >>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) > >>> +{ > >>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); > >>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; > >>> + return 0; > >>> +} > >> First, this must return 1 (or set an exit reason, but there is no reason > >> to escape to user space here). And second, I think a corner case is not > >> handled the same way as on real iron: If there is already the next NMI > >> waiting, we will inject it before iret, not after its execution as it > >> should be. > >> > >> No easy solution for this yet. Maybe emulating iret, but there is no > >> implementation, specifically for protected mode. Maybe setting a > >> breakpoint. Or maybe enforcing a single step exception. Nothing trivial > >> in this list. 
On the other hand, this may only be a slight imprecision > >> of the virtualization. Need to think about it. > >> > > What about this: > > Instead of clearing HF_NMI_MASK in iret_interception() we can set > > another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and > > HF_IRET) if HF_IRET is set, but do that after checking for NMI > > injection. The pending NMI will be injected on the next entry. > > Also not how real HW works, but may be better then current situation. > > It's OK as a first step towards correct NMI emulation. Additionally, you > could enable the IRQ window interception in case the is an NMI pending. > The resulting behavior should then much like the VNMI mask emulation for > vmx. > Yeah, but the question is if IRQ windows is already opened will exit happens before or after IRET. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Apr 19, 2009 at 04:28:09PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> There may not be a next entry if the guest is in a tight loop. Given >>> NMIs are used for watchdogs, that's not good. >>> >>> >> We don't exit a guest after kvm time slice ends? >> > > There are no time slices any more. If there's only once thread for a > vcpu, you might have no exits at all with a tickless kernel. > Well, KVM may request some kind of event (timer) that will cause exit to VCPU. This looks hacky though. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 03:27:22PM +0200, Jan Kiszka wrote: >> Gleb Natapov wrote: >>> On Fri, Apr 17, 2009 at 09:55:45PM +0200, Jan Kiszka wrote: >>>> Gleb Natapov wrote: >>>>> Signed-off-by: Gleb Natapov <gleb@redhat.com> >>>>> --- >>>>> arch/x86/include/asm/kvm_host.h | 1 + >>>>> arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- >>>>> 2 files changed, 48 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >>>>> index 8b6f6e9..057a612 100644 >>>>> --- a/arch/x86/include/asm/kvm_host.h >>>>> +++ b/arch/x86/include/asm/kvm_host.h >>>>> @@ -766,6 +766,7 @@ enum { >>>>> #define HF_GIF_MASK (1 << 0) >>>>> #define HF_HIF_MASK (1 << 1) >>>>> #define HF_VINTR_MASK (1 << 2) >>>>> +#define HF_NMI_MASK (1 << 3) >>>>> >>>>> /* >>>>> * Hardware virtualization extension instructions may fault if a >>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>>>> index c605477..cd60fd7 100644 >>>>> --- a/arch/x86/kvm/svm.c >>>>> +++ b/arch/x86/kvm/svm.c >>>>> @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>>> return 1; >>>>> } >>>>> >>>>> +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) >>>>> +{ >>>>> + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); >>>>> + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; >>>>> + return 0; >>>>> +} >>>> First, this must return 1 (or set an exit reason, but there is no reason >>>> to escape to user space here). And second, I think a corner case is not >>>> handled the same way as on real iron: If there is already the next NMI >>>> waiting, we will inject it before iret, not after its execution as it >>>> should be. >>>> >>>> No easy solution for this yet. Maybe emulating iret, but there is no >>>> implementation, specifically for protected mode. Maybe setting a >>>> breakpoint. Or maybe enforcing a single step exception. Nothing trivial >>>> in this list. 
On the other hand, this may only be a slight imprecision >>>> of the virtualization. Need to think about it. >>>> >>> What about this: >>> Instead of clearing HF_NMI_MASK in iret_interception() we can set >>> another flag (HF_IRET) and on guest entry clear HF_NMI_MASK (and >>> HF_IRET) if HF_IRET is set, but do that after checking for NMI >>> injection. The pending NMI will be injected on the next entry. >>> Also not how real HW works, but may be better then current situation. >> It's OK as a first step towards correct NMI emulation. Additionally, you >> could enable the IRQ window interception in case the is an NMI pending. >> The resulting behavior should then much like the VNMI mask emulation for >> vmx. >> > Yeah, but the question is if IRQ windows is already opened will exit > happens before or after IRET. > Hey, this is band-aid, it won't heal all cases. ;) Jan
Gleb Natapov wrote: >> It's OK as a first step towards correct NMI emulation. Additionally, you >> could enable the IRQ window interception in case the is an NMI pending. >> The resulting behavior should then much like the VNMI mask emulation for >> vmx. >> >> > Yeah, but the question is if IRQ windows is already opened will exit > happens before or after IRET. > You mean if the NMI handler enabled interrupts?
On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> It's OK as a first step towards correct NMI emulation. Additionally, you >>> could enable the IRQ window interception in case the is an NMI pending. >>> The resulting behavior should then much like the VNMI mask emulation for >>> vmx. >>> >>> >> Yeah, but the question is if IRQ windows is already opened will exit >> happens before or after IRET. >> > > You mean if the NMI handler enabled interrupts? > Yes. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: >> >>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>> could enable the IRQ window interception in case the is an NMI pending. >>>> The resulting behavior should then much like the VNMI mask emulation for >>>> vmx. >>>> >>>> >>>> >>> Yeah, but the question is if IRQ windows is already opened will exit >>> happens before or after IRET. >>> >>> >> You mean if the NMI handler enabled interrupts? >> >> > Yes. > > Then the guest deserves whatever it gets...
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 04:28:09PM +0300, Avi Kivity wrote: >> Gleb Natapov wrote: >>>> There may not be a next entry if the guest is in a tight loop. Given >>>> NMIs are used for watchdogs, that's not good. >>>> >>>> >>> We don't exit a guest after kvm time slice ends? >>> >> There are no time slices any more. If there's only once thread for a >> vcpu, you might have no exits at all with a tickless kernel. >> > Well, KVM may request some kind of even (timer) that will cause exit to > VCPU. This looks hacky though. We already spent too many electrons and brain cycles on possibly "much simpler" workarounds. I think injecting and handling a single-step, even while there is guest debugging going on or the guest itself single-steps or both, will not be more complicated - but "more correct". Jan
On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: >> >>> Gleb Natapov wrote: >>> >>>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>>> could enable the IRQ window interception in case the is an NMI pending. >>>>> The resulting behavior should then much like the VNMI mask emulation for >>>>> vmx. >>>>> >>>>> >>>> Yeah, but the question is if IRQ windows is already opened will exit >>>> happens before or after IRET. >>>> >>> You mean if the NMI handler enabled interrupts? >>> >>> >> Yes. >> >> > > Then the guest deserves whatever it gets... > I suspect windows may do this since it uses NMI for task switching. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jan Kiszka wrote: > We already spent to much electrons and brain cycles on possibly "much > simpler" workarounds. I think injecting and handling a single-step, even > while there is guest debugging going on or the guest itself single-steps > or both, will not be more complicated - but "more correct". > I agree. I'm still worried about interactions between the IRET single stepping code and other things which use the debug registers.
On Sun, Apr 19, 2009 at 04:49:18PM +0300, Avi Kivity wrote: > Jan Kiszka wrote: >> We already spent to much electrons and brain cycles on possibly "much >> simpler" workarounds. I think injecting and handling a single-step, even >> while there is guest debugging going on or the guest itself single-steps >> or both, will not be more complicated - but "more correct". >> > > I agree. I'm still worried about interactions between the IRET single > stepping code and other things which use the debug registers. > I don't disagree too. Just throwing other ideas :) -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Avi Kivity wrote: > Jan Kiszka wrote: >> We already spent to much electrons and brain cycles on possibly "much >> simpler" workarounds. I think injecting and handling a single-step, even >> while there is guest debugging going on or the guest itself single-steps >> or both, will not be more complicated - but "more correct". >> > > I agree. I'm still worried about interactions between the IRET single > stepping code and other things which use the debug registers. The interaction is inside KVM, so under our control. We may simply save the related states, hook into the guest-debugging parts that evaluate single step exceptions, and handle this case with highest prio, transparently to users of lower prio. In theory - you never really know until you tried to implement it... Jan
Gleb Natapov <gleb@redhat.com> writes: > On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: >> Gleb Natapov wrote: >>> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: >>> >>>> Gleb Natapov wrote: >>>> >>>>>> It's OK as a first step towards correct NMI emulation. Additionally, you >>>>>> could enable the IRQ window interception in case the is an NMI pending. >>>>>> The resulting behavior should then much like the VNMI mask emulation for >>>>>> vmx. >>>>>> >>>>>> >>>>> Yeah, but the question is if IRQ windows is already opened will exit >>>>> happens before or after IRET. >>>>> >>>> You mean if the NMI handler enabled interrupts? >>>> >>>> >>> Yes. >>> >>> >> >> Then the guest deserves whatever it gets... >> > I suspect windows may do this since it uses NMI for task switching. Could you elaborate on that? How/why does it use NMIs for task switching? Regards,
On Sun, Apr 19, 2009 at 04:07:52PM +0200, Julian Stecklina wrote: > Gleb Natapov <gleb@redhat.com> writes: > > > On Sun, Apr 19, 2009 at 04:43:12PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: > >>> On Sun, Apr 19, 2009 at 04:40:51PM +0300, Avi Kivity wrote: > >>> > >>>> Gleb Natapov wrote: > >>>> > >>>>>> It's OK as a first step towards correct NMI emulation. Additionally, you > >>>>>> could enable the IRQ window interception in case the is an NMI pending. > >>>>>> The resulting behavior should then much like the VNMI mask emulation for > >>>>>> vmx. > >>>>>> > >>>>>> > >>>>> Yeah, but the question is if IRQ windows is already opened will exit > >>>>> happens before or after IRET. > >>>>> > >>>> You mean if the NMI handler enabled interrupts? > >>>> > >>>> > >>> Yes. > >>> > >>> > >> > >> Then the guest deserves whatever it gets... > >> > > I suspect windows may do this since it uses NMI for task switching. > > Could you elaborate on that? How/why does it use NMIs for task > switching? > During WHQL testing (or if you just enable verifier on windows 2003) windows changes hibernate to not power down a PC, but resume immediately. During this immediate resume it sends NMI to non-boot CPUs while IDT for nmi is configured as a task gate. I am not sure it actually calls IRET after that. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: >> Could you elaborate on that? How/why does it use NMIs for task >> switching? >> >> > During WHQL testing (or if you just enable verifier on windows 2003) > windows changes hibernate to not power down a PC, but resume > immediately. During this immediate resume it sends NMI to non-boot CPUs > while IDT for nmi is configured as a task gate. I am not sure it > actually calls IRET after that. > If it doesn't call IRET, it will never see another NMI. But of course it will execute IRET, as part of normal execution. You can't do anything without it.
On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >>> Could you elaborate on that? How/why does it use NMIs for task >>> switching? >>> >>> >> During WHQL testing (or if you just enable verifier on windows 2003) >> windows changes hibernate to not power down a PC, but resume >> immediately. During this immediate resume it sends NMI to non-boot CPUs >> while IDT for nmi is configured as a task gate. I am not sure it >> actually calls IRET after that. >> > > If it doesn't call IRET, it will never see another NMI. > > But of course it will execute IRET, as part of normal execution. You > can't do anything without it. > Boot CPU can send INIT after task switch (and I think this is what happens). -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Gleb Natapov wrote: > On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: > >> Gleb Natapov wrote: >> >>>> Could you elaborate on that? How/why does it use NMIs for task >>>> switching? >>>> >>>> >>>> >>> During WHQL testing (or if you just enable verifier on windows 2003) >>> windows changes hibernate to not power down a PC, but resume >>> immediately. During this immediate resume it sends NMI to non-boot CPUs >>> while IDT for nmi is configured as a task gate. I am not sure it >>> actually calls IRET after that. >>> >>> >> If it doesn't call IRET, it will never see another NMI. >> >> But of course it will execute IRET, as part of normal execution. You >> can't do anything without it. >> >> > Boot CPU can send INIT after task switch (and I think this is what > happens). > But eventually it will execute IRET. (We need to fix INIT to clear the NMI blocking flag, not that it matters so much)
On Sun, Apr 19, 2009 at 05:57:56PM +0300, Avi Kivity wrote: > Gleb Natapov wrote: >> On Sun, Apr 19, 2009 at 05:20:37PM +0300, Avi Kivity wrote: >> >>> Gleb Natapov wrote: >>> >>>>> Could you elaborate on that? How/why does it use NMIs for task >>>>> switching? >>>>> >>>>> >>>> During WHQL testing (or if you just enable verifier on windows 2003) >>>> windows changes hibernate to not power down a PC, but resume >>>> immediately. During this immediate resume it sends NMI to non-boot CPUs >>>> while IDT for nmi is configured as a task gate. I am not sure it >>>> actually calls IRET after that. >>>> >>> If it doesn't call IRET, it will never see another NMI. >>> >>> But of course it will execute IRET, as part of normal execution. You >>> can't do anything without it. >>> >>> >> Boot CPU can send INIT after task switch (and I think this is what >> happens). >> > > But eventually it will execute IRET. > Yes :) But I strongly suspect that NMI window will be opened after SIPI even before first IRET. > (We need to fix INIT to clear the NMI blocking flag, not that it matters > so much) If we reset intercept mask on INIT, but don't clear NMI blocking flag we will never receive NMIs on the vcpu. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b6f6e9..057a612 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -766,6 +766,7 @@ enum { #define HF_GIF_MASK (1 << 0) #define HF_HIF_MASK (1 << 1) #define HF_VINTR_MASK (1 << 2) +#define HF_NMI_MASK (1 << 3) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c605477..cd60fd7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1834,6 +1834,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + return 0; +} + static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) @@ -2111,6 +2118,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2218,6 +2226,11 @@ static void pre_svm_run(struct vcpu_svm *svm) new_asid(svm, svm_data); } +static void svm_inject_nmi(struct vcpu_svm *svm) +{ + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + svm->vcpu.arch.hflags |= HF_NMI_MASK; +} static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { @@ -2269,6 +2282,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return 
!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2284,16 +2305,37 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) svm_inject_irq(to_svm(vcpu), 0x0); } +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm->vcpu.arch.hflags & HF_NMI_MASK) + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + enable_irq_window(vcpu); +} + static void svm_intr_inject(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ + if (vcpu->arch.nmi_injected) { + svm_inject_nmi(to_svm(vcpu)); + return; + } + if (vcpu->arch.interrupt.pending) { svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); return; } /* try to inject new event if pending */ - if (kvm_cpu_has_interrupt(vcpu)) { + if (vcpu->arch.nmi_pending) { + if (svm_nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + svm_inject_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { if (svm_interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr); @@ -2312,7 +2354,10 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm_intr_inject(vcpu); - if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) enable_irq_window(vcpu); out:
Signed-off-by: Gleb Natapov <gleb@redhat.com> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-)