Message ID | 20190326130746.28748-1-vkuznets@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag | expand |
> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > > Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against > SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM. > The check is correct, however, there is a special case when RSM is called > to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still > set and in case VMXE was set before entering SMM we're failing to return. > > Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls > when ops->set_cr() is called from RSM. > > Reported-by: Jon Doron <arilou@gmail.com> > Suggested-by: Liran Alon <liran.alon@oracle.com> > Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Patch looks good to me. Reviewed-by: Liran Alon <liran.alon@oracle.com> > --- > - Instread of putting the temporary HF_SMM_MASK drop to > rsm_enter_protected_mode() (as was suggested by Liran), move it to > emulator_set_cr() modifying its interface. emulate.c seems to be > vcpu-specifics-free at this moment, we may want to keep it this way. > - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic > hangs even with this patch. These hangs, however, seem to be unrelated to > rsm. 
Feel free to share details on these hangs ;) Great work, -Liran > --- > arch/x86/include/asm/kvm_emulate.h | 3 ++- > arch/x86/kvm/emulate.c | 27 ++++++++++++++------------- > arch/x86/kvm/x86.c | 12 +++++++++++- > 3 files changed, 27 insertions(+), 15 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h > index 93c4bf598fb0..6c33caa82fa5 100644 > --- a/arch/x86/include/asm/kvm_emulate.h > +++ b/arch/x86/include/asm/kvm_emulate.h > @@ -203,7 +203,8 @@ struct x86_emulate_ops { > void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); > void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); > ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); > - int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); > + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val, > + bool from_rsm); > int (*cpl)(struct x86_emulate_ctxt *ctxt); > int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); > int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); > diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c > index c338984c850d..a6204105d4d7 100644 > --- a/arch/x86/kvm/emulate.c > +++ b/arch/x86/kvm/emulate.c > @@ -2413,7 +2413,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, > cr3 &= ~0xfff; > } > > - bad = ctxt->ops->set_cr(ctxt, 3, cr3); > + bad = ctxt->ops->set_cr(ctxt, 3, cr3, true); > if (bad) > return X86EMUL_UNHANDLEABLE; > > @@ -2422,20 +2422,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, > * Then enable protected mode. However, PCID cannot be enabled > * if EFER.LMA=0, so set it separately. 
> */ > - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); > + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true); > if (bad) > return X86EMUL_UNHANDLEABLE; > > - bad = ctxt->ops->set_cr(ctxt, 0, cr0); > + bad = ctxt->ops->set_cr(ctxt, 0, cr0, true); > if (bad) > return X86EMUL_UNHANDLEABLE; > > if (cr4 & X86_CR4_PCIDE) { > - bad = ctxt->ops->set_cr(ctxt, 4, cr4); > + bad = ctxt->ops->set_cr(ctxt, 4, cr4, true); > if (bad) > return X86EMUL_UNHANDLEABLE; > if (pcid) { > - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); > + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true); > if (bad) > return X86EMUL_UNHANDLEABLE; > } > @@ -2581,7 +2581,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) > > /* Zero CR4.PCIDE before CR0.PG. */ > if (cr4 & X86_CR4_PCIDE) { > - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); > + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true); > cr4 &= ~X86_CR4_PCIDE; > } > > @@ -2595,11 +2595,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) > /* For the 64-bit case, this will clear EFER.LMA. */ > cr0 = ctxt->ops->get_cr(ctxt, 0); > if (cr0 & X86_CR0_PE) > - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); > + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE), > + true); > > /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ > if (cr4 & X86_CR4_PAE) > - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); > + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true); > > /* And finally go back to 32-bit mode. 
*/ > efer = 0; > @@ -3131,7 +3132,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, > int ret; > u8 cpl; > > - if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) > + if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false)) > return emulate_gp(ctxt, 0); > ctxt->_eip = tss->eip; > ctxt->eflags = tss->eflags | 2; > @@ -3331,7 +3332,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, > write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); > } > > - ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); > + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS, false); > ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); > > if (has_error_code) { > @@ -3633,7 +3634,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) > > static int em_cr_write(struct x86_emulate_ctxt *ctxt) > { > - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) > + if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false)) > return emulate_gp(ctxt, 0); > > /* Disable writeback. 
*/ > @@ -3766,7 +3767,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) > > cr0 = ctxt->ops->get_cr(ctxt, 0); > cr0 &= ~X86_CR0_TS; > - ctxt->ops->set_cr(ctxt, 0, cr0); > + ctxt->ops->set_cr(ctxt, 0, cr0, false); > return X86EMUL_CONTINUE; > } > > @@ -3866,7 +3867,7 @@ static int em_smsw(struct x86_emulate_ctxt *ctxt) > static int em_lmsw(struct x86_emulate_ctxt *ctxt) > { > ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) > - | (ctxt->src.val & 0x0f)); > + | (ctxt->src.val & 0x0f), false); > ctxt->dst.type = OP_NONE; > return X86EMUL_CONTINUE; > } > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index a419656521b6..f2745e3170b6 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -5739,7 +5739,8 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) > return value; > } > > -static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) > +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val, > + bool from_rsm) > { > struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); > int res = 0; > @@ -5755,7 +5756,16 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) > res = kvm_set_cr3(vcpu, val); > break; > case 4: > + /* > + * set_cr4() may forbid to set certain flags (e.g. VMXE) from > + * SMM but we're actually leaving it; temporary drop HF_SMM_MASK > + * when setting CR4. > + */ > + if (from_rsm) > + vcpu->arch.hflags &= ~HF_SMM_MASK; > res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); > + if (from_rsm) > + vcpu->arch.hflags |= HF_SMM_MASK; > break; > case 8: > res = kvm_set_cr8(vcpu, val); > -- > 2.20.1 >
Liran Alon <liran.alon@oracle.com> writes: >> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >> >> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against >> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM. >> The check is correct, however, there is a special case when RSM is called >> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still >> set and in case VMXE was set before entering SMM we're failing to return. >> >> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls >> when ops->set_cr() is called from RSM. >> >> Reported-by: Jon Doron <arilou@gmail.com> >> Suggested-by: Liran Alon <liran.alon@oracle.com> >> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") >> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> > > Patch looks good to me. > Reviewed-by: Liran Alon <liran.alon@oracle.com> Thanks! > >> --- >> - Instread of putting the temporary HF_SMM_MASK drop to >> rsm_enter_protected_mode() (as was suggested by Liran), move it to >> emulator_set_cr() modifying its interface. emulate.c seems to be >> vcpu-specifics-free at this moment, we may want to keep it this way. >> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic >> hangs even with this patch. These hangs, however, seem to be unrelated to >> rsm. > > Feel free to share details on these hangs ;) > You've asked for it) The immediate issue I'm observing is some sort of a lockup which is easy to trigger with e.g. 
"-usb -device usb-tablet" on Qemu command line; it seems we get too many interrupts and combined with preemtion timer for L2 we're not making any progress: kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) kvm_set_irq: gsi 18 level 1 source 0 kvm_msi_set_irq: dst 0 vec 177 (Fixed|physical|level) kvm_apic_accept_irq: apicid 0 vec 177 (Fixed|edge) kvm_fpu: load kvm_entry: vcpu 0 kvm_exit: reason VMRESUME rip 0xfffff80000848115 info 0 0 kvm_entry: vcpu 0 kvm_exit: reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0 kvm_nested_vmexit: rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0 kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0 kvm_entry: vcpu 0 kvm_exit: reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0 kvm_apic: apic_write APIC_EOI = 0x0 kvm_eoi: apicid 0 vector 177 kvm_fpu: unload kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) ... (and the pattern repeats) Maybe it is a usb-only/Qemu-only problem, maybe not.
> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > > Liran Alon <liran.alon@oracle.com> writes: > >>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >>> - Instread of putting the temporary HF_SMM_MASK drop to >>> rsm_enter_protected_mode() (as was suggested by Liran), move it to >>> emulator_set_cr() modifying its interface. emulate.c seems to be >>> vcpu-specifics-free at this moment, we may want to keep it this way. >>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic >>> hangs even with this patch. These hangs, however, seem to be unrelated to >>> rsm. >> >> Feel free to share details on these hangs ;) >> > > You've asked for it) > > The immediate issue I'm observing is some sort of a lockup which is easy > to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it > seems we get too many interrupts and combined with preemtion timer for > L2 we're not making any progress: > > kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) > kvm_set_irq: gsi 18 level 1 source 0 > kvm_msi_set_irq: dst 0 vec 177 (Fixed|physical|level) > kvm_apic_accept_irq: apicid 0 vec 177 (Fixed|edge) > kvm_fpu: load > kvm_entry: vcpu 0 > kvm_exit: reason VMRESUME rip 0xfffff80000848115 info 0 0 > kvm_entry: vcpu 0 > kvm_exit: reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0 > kvm_nested_vmexit: rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0 > kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0 > kvm_entry: vcpu 0 > kvm_exit: reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0 > kvm_apic: apic_write APIC_EOI = 0x0 > kvm_eoi: apicid 0 vector 177 > kvm_fpu: unload > kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) > ... > (and the pattern repeats) > > Maybe it is a usb-only/Qemu-only problem, maybe not. 
> > -- > Vitaly The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt to L1 guest. (I know that now we are running in L1 because next exit is a VMRESUME). However, it is surprising to see that on next entry to guest, no interrupt was injected by vmx_inject_irq(). It may be because L1 guest is currently running with interrupt disabled and therefore only an IRQ-window was requested. (Too bad we don’t have a trace for this…) Next, we got an exit from L1 guest on VMRESUME. As part of it’s handling, active VMCS was changed from vmcs01 to vmcs02. I believe the immediate exit later on preemption-timer was because the immediate-exit-request mechanism was invoked which is now implemented by setting a VMX preemption-timer with value of 0 (Thanks to Sean). (See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)). (Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode() to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window) Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER which will cause eventually L0 to call vmx_check_nested_events() which notices now the pending interrupt that should have been injected before to L1 and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1. Then L1 handles the interrupt by performing an EOI to LAPIC which propagate an EOI to IOAPIC which immediately re-inject the interrupt (after clearing the remote_irr) as the irq-line is still set. i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediate after it clears remote-irr for this pin. Also note that in trace we see only a single kvm_set_irq to level 1 but we don’t see immediately another kvm_set_irq to level 0. This should indicate that in QEMU’s IOAPIC redirection-table, this pin is configured as level-triggered interrupt. 
However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt. To sum up: 1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window(). 2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered. 3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves. i.e. What cause this device to lower the irq-line? (As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler before performing the EOI) 4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip) -Liran
Liran Alon <liran.alon@oracle.com> writes: >> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >> >> Liran Alon <liran.alon@oracle.com> writes: >> >>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >>>> - Instread of putting the temporary HF_SMM_MASK drop to >>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to >>>> emulator_set_cr() modifying its interface. emulate.c seems to be >>>> vcpu-specifics-free at this moment, we may want to keep it this way. >>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic >>>> hangs even with this patch. These hangs, however, seem to be unrelated to >>>> rsm. >>> >>> Feel free to share details on these hangs ;) >>> >> >> You've asked for it) >> >> The immediate issue I'm observing is some sort of a lockup which is easy >> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it >> seems we get too many interrupts and combined with preemtion timer for >> L2 we're not making any progress: >> >> kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) >> kvm_set_irq: gsi 18 level 1 source 0 >> kvm_msi_set_irq: dst 0 vec 177 (Fixed|physical|level) >> kvm_apic_accept_irq: apicid 0 vec 177 (Fixed|edge) >> kvm_fpu: load >> kvm_entry: vcpu 0 >> kvm_exit: reason VMRESUME rip 0xfffff80000848115 info 0 0 >> kvm_entry: vcpu 0 >> kvm_exit: reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0 >> kvm_nested_vmexit: rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0 >> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0 >> kvm_entry: vcpu 0 >> kvm_exit: reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0 >> kvm_apic: apic_write APIC_EOI = 0x0 >> kvm_eoi: apicid 0 vector 177 >> kvm_fpu: unload >> kvm_userspace_exit: reason KVM_EXIT_IOAPIC_EOI (26) >> ... >> (and the pattern repeats) >> >> Maybe it is a usb-only/Qemu-only problem, maybe not. 
>> >> -- >> Vitaly > > The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt to L1 guest. > (I know that now we are running in L1 because next exit is a VMRESUME). > > However, it is surprising to see that on next entry to guest, no interrupt was injected by vmx_inject_irq(). > It may be because L1 guest is currently running with interrupt disabled and therefore only an IRQ-window was requested. > (Too bad we don’t have a trace for this…) > > Next, we got an exit from L1 guest on VMRESUME. As part of it’s handling, active VMCS was changed from vmcs01 to vmcs02. > I believe the immediate exit later on preemption-timer was because the immediate-exit-request mechanism was invoked > which is now implemented by setting a VMX preemption-timer with value of 0 (Thanks to Sean). > (See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)). > (Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode() > to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window) > > Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER which will cause eventually L0 to call > vmx_check_nested_events() which notices now the pending interrupt that should have been injected before to L1 > and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1. > > Then L1 handles the interrupt by performing an EOI to LAPIC which propagate an EOI to IOAPIC which immediately re-inject > the interrupt (after clearing the remote_irr) as the irq-line is still set. i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediate after it clears remote-irr for this pin. > > Also note that in trace we see only a single kvm_set_irq to level 1 but we don’t see immediately another kvm_set_irq to level 0. > This should indicate that in QEMU’s IOAPIC redirection-table, this pin is configured as level-triggered interrupt. 
> However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt. > > To sum up: > 1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window(). > 2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered. > 3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves. i.e. What cause this device to lower the irq-line? > (As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler before performing the EOI) > 4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip) > Thank you Liran, all are valuable suggestions. It seems the isssue doesn't reproduce with 'kernel-irqchip=on' but reproduces with "kernel-irqchip=split". My first guess would then be that we're less picky with in-kernel implementation about the observed edge/level discrepancy. I'll be investigating and share my findings.
On Tue, Mar 26, 2019 at 02:07:46PM +0100, Vitaly Kuznetsov wrote: > Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against > SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM. > The check is correct, however, there is a special case when RSM is called > to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still > set and in case VMXE was set before entering SMM we're failing to return. > > Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls > when ops->set_cr() is called from RSM. > > Reported-by: Jon Doron <arilou@gmail.com> > Suggested-by: Liran Alon <liran.alon@oracle.com> > Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Plumbing from_rsm all the way to set_cr() is pretty heinous. What about going with the idea Jim alluded to, i.e. manually save/restore VMXE during transitions to/from SMM? It's a little more tedious than it aught to be due to the placement and naming of the x86_ops hooks for SMM, but IMO the end result is cleaner even after adding the necessary callbacks. The following patches are compile tested only. Sean Christopherson (2): KVM: x86: Rename pre_{enter,leave}_smm() ops to reference SMM state save KVM: x86: Add kvm_x86_ops callback to allow VMX to stash away CR4.VMXE arch/x86/include/asm/kvm_emulate.h | 3 ++- arch/x86/include/asm/kvm_host.h | 6 ++++-- arch/x86/kvm/emulate.c | 10 ++++++---- arch/x86/kvm/svm.c | 20 ++++++++++++++++---- arch/x86/kvm/vmx/vmx.c | 30 ++++++++++++++++++++++++++---- arch/x86/kvm/vmx/vmx.h | 2 ++ arch/x86/kvm/x86.c | 23 ++++++++++++++++------- 7 files changed, 72 insertions(+), 22 deletions(-)
Sean Christopherson <sean.j.christopherson@intel.com> writes: > On Tue, Mar 26, 2019 at 02:07:46PM +0100, Vitaly Kuznetsov wrote: >> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against >> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM. >> The check is correct, however, there is a special case when RSM is called >> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still >> set and in case VMXE was set before entering SMM we're failing to return. >> >> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls >> when ops->set_cr() is called from RSM. >> >> Reported-by: Jon Doron <arilou@gmail.com> >> Suggested-by: Liran Alon <liran.alon@oracle.com> >> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") >> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> > > Plumbing from_rsm all the way to set_cr() is pretty heinous. What about > going with the idea Jim alluded to, i.e. manually save/restore VMXE during > transitions to/from SMM? It's a little more tedious than it aught to be > due to the placement and naming of the x86_ops hooks for SMM, but IMO the > end result is cleaner even after adding the necessary callbacks. > > The following patches are compile tested only. Thanks and welcome to the 'rsm fixing' party :-) I like your/Jim's approach, however, with these patches as-is the issue persists: KVM: entry failed, hardware error 0x80000021 If you're running a guest on an Intel machine without unrestricted mode support, the failure can be most likely due to the guest entering an invalid state for Intel VT. For example, the guest maybe running in big real mode which is not supported on less recent Intel processors. 
RAX=000000007ffc7301 RBX=0000000000000000 RCX=0000000000000000 RDX=00007ff900000000 RSI=000000007dab4018 RDI=000000007ff9f000 RBP=0000000000000000 RSP=ffffe80000210fc0 R8 =0000000000000000 R9 =0000000000000000 R10=ffffbb8140cfd790 R11=ffffbb8140cfd7d0 R12=ffffbb8140cfd7d0 R13=ffffbb8140cfd790 R14=0000000000000008 R15=0000000000000000 RIP=fffff80007d44125 RFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=1 HLT=0 ... Let me take a look what's wrong. P.S. We probably need a kvm-unit-tests or selftests test as Hyper-V+secureboot is the only known way to test this atm.
On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > P.S. We probably need a kvm-unit-tests or selftests test as > Hyper-V+secureboot is the only known way to test this atm. Yes, please! And the requirement for providing tests of new features should be applied universally, rather than ad hoc.
On 28/03/19 17:40, Jim Mattson wrote: > On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > >> P.S. We probably need a kvm-unit-tests or selftests test as >> Hyper-V+secureboot is the only known way to test this atm. > > Yes, please! And the requirement for providing tests of new features > should be applied universally, rather than ad hoc. > Yes, I agree. Unfortunately all this stuff dates to before the nice selftests framework (and also asking to write tests is one thing, asking to add SMM support to the framework is another... I'm doing that though). Paolo
Jim Mattson <jmattson@google.com> writes: > On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > >> P.S. We probably need a kvm-unit-tests or selftests test as >> Hyper-V+secureboot is the only known way to test this atm. > > Yes, please! And the requirement for providing tests of new features > should be applied universally, rather than ad hoc. FWIW, I have a working proof-of-concept test for KVM selftests doing SMI/RSM. I'm going to clean it up, enhance a bit, maybe add something to state_test and send out (unless someone beats me to it, of course :-)
On 09/04/19 10:21, Vitaly Kuznetsov wrote: > Jim Mattson <jmattson@google.com> writes: > >> On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >> >>> P.S. We probably need a kvm-unit-tests or selftests test as >>> Hyper-V+secureboot is the only known way to test this atm. >> >> Yes, please! And the requirement for providing tests of new features >> should be applied universally, rather than ad hoc. > > FWIW, I have a working proof-of-concept test for KVM selftests doing > SMI/RSM. I'm going to clean it up, enhance a bit, maybe add something to > state_test and send out (unless someone beats me up to it, of course :-) Oh, me too. :) But please send yours, and then we'll see what can be taken from mine. state_test can come later. Paolo
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 93c4bf598fb0..6c33caa82fa5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -203,7 +203,8 @@ struct x86_emulate_ops { void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); - int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val, + bool from_rsm); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c338984c850d..a6204105d4d7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2413,7 +2413,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, cr3 &= ~0xfff; } - bad = ctxt->ops->set_cr(ctxt, 3, cr3); + bad = ctxt->ops->set_cr(ctxt, 3, cr3, true); if (bad) return X86EMUL_UNHANDLEABLE; @@ -2422,20 +2422,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, * Then enable protected mode. However, PCID cannot be enabled * if EFER.LMA=0, so set it separately. 
*/ - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true); if (bad) return X86EMUL_UNHANDLEABLE; - bad = ctxt->ops->set_cr(ctxt, 0, cr0); + bad = ctxt->ops->set_cr(ctxt, 0, cr0, true); if (bad) return X86EMUL_UNHANDLEABLE; if (cr4 & X86_CR4_PCIDE) { - bad = ctxt->ops->set_cr(ctxt, 4, cr4); + bad = ctxt->ops->set_cr(ctxt, 4, cr4, true); if (bad) return X86EMUL_UNHANDLEABLE; if (pcid) { - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true); if (bad) return X86EMUL_UNHANDLEABLE; } @@ -2581,7 +2581,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* Zero CR4.PCIDE before CR0.PG. */ if (cr4 & X86_CR4_PCIDE) { - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true); cr4 &= ~X86_CR4_PCIDE; } @@ -2595,11 +2595,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE), + true); /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true); /* And finally go back to 32-bit mode. 
*/ efer = 0; @@ -3131,7 +3132,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, int ret; u8 cpl; - if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) + if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false)) return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -3331,7 +3332,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); } - ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS, false); ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { @@ -3633,7 +3634,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false)) return emulate_gp(ctxt, 0); /* Disable writeback. */ @@ -3766,7 +3767,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) cr0 = ctxt->ops->get_cr(ctxt, 0); cr0 &= ~X86_CR0_TS; - ctxt->ops->set_cr(ctxt, 0, cr0); + ctxt->ops->set_cr(ctxt, 0, cr0, false); return X86EMUL_CONTINUE; } @@ -3866,7 +3867,7 @@ static int em_smsw(struct x86_emulate_ctxt *ctxt) static int em_lmsw(struct x86_emulate_ctxt *ctxt) { ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) - | (ctxt->src.val & 0x0f)); + | (ctxt->src.val & 0x0f), false); ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a419656521b6..f2745e3170b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5739,7 +5739,8 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) return value; } -static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val, + bool from_rsm) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int 
res = 0; @@ -5755,7 +5756,16 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr3(vcpu, val); break; case 4: + /* + * set_cr4() may forbid to set certain flags (e.g. VMXE) from + * SMM but we're actually leaving it; temporary drop HF_SMM_MASK + * when setting CR4. + */ + if (from_rsm) + vcpu->arch.hflags &= ~HF_SMM_MASK; res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + if (from_rsm) + vcpu->arch.hflags |= HF_SMM_MASK; break; case 8: res = kvm_set_cr8(vcpu, val);
Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM. The check is correct, however, there is a special case when RSM is called to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still set and in case VMXE was set before entering SMM we're failing to return. Resolve the issue by temporarily dropping HF_SMM_MASK around set_cr4() calls when ops->set_cr() is called from RSM. Reported-by: Jon Doron <arilou@gmail.com> Suggested-by: Liran Alon <liran.alon@oracle.com> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> --- - Instead of putting the temporary HF_SMM_MASK drop to rsm_enter_protected_mode() (as was suggested by Liran), move it to emulator_set_cr() modifying its interface. emulate.c seems to be vcpu-specifics-free at this moment, we may want to keep it this way. - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic hangs even with this patch. These hangs, however, seem to be unrelated to rsm. --- arch/x86/include/asm/kvm_emulate.h | 3 ++- arch/x86/kvm/emulate.c | 27 ++++++++++++++------------- arch/x86/kvm/x86.c | 12 +++++++++++- 3 files changed, 27 insertions(+), 15 deletions(-)