KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag

Message ID 20190326130746.28748-1-vkuznets@redhat.com (mailing list archive)
State New, archived
Series KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag

Commit Message

Vitaly Kuznetsov March 26, 2019, 1:07 p.m. UTC
Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
SMM") introduced a check in vmx_set_cr4() that forbids setting VMXE from SMM.
The check is correct; however, there is a special case when RSM is used
to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
set, and if VMXE was set before entering SMM, we fail to return from it.

Resolve the issue by temporarily dropping HF_SMM_MASK around set_cr4() calls
when ops->set_cr() is called from RSM.

Reported-by: Jon Doron <arilou@gmail.com>
Suggested-by: Liran Alon <liran.alon@oracle.com>
Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
- Instead of putting the temporary HF_SMM_MASK drop into
  rsm_enter_protected_mode() (as was suggested by Liran), move it to
  emulator_set_cr(), modifying its interface. emulate.c seems to be
  free of vcpu specifics at the moment; we may want to keep it that way.
- It seems that Hyper-V+UEFI on KVM is still broken: I'm observing sporadic
  hangs even with this patch. These hangs, however, seem to be unrelated to
  rsm.
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/kvm/emulate.c             | 27 ++++++++++++++-------------
 arch/x86/kvm/x86.c                 | 12 +++++++++++-
 3 files changed, 27 insertions(+), 15 deletions(-)
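
For context, the check that RSM trips over looks roughly like this in
vmx_set_cr4() (paraphrased from commit 5bea5123cbf0, not the verbatim code);
during RSM emulation HF_SMM_MASK is still set, so is_smm() returns true and
restoring a CR4 value with VMXE set is rejected:

	if (cr4 & X86_CR4_VMXE) {
		/*
		 * VMX cannot be enabled under SMM; while emulating RSM,
		 * is_smm(vcpu) is still true, so restoring CR4.VMXE fails.
		 */
		if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
			return 1;
	}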

Comments

Liran Alon March 26, 2019, 1:11 p.m. UTC | #1
> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
> The check is correct, however, there is a special case when RSM is called
> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
> set and in case VMXE was set before entering SMM we're failing to return.
> 
> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
> when ops->set_cr() is called from RSM.
> 
> Reported-by: Jon Doron <arilou@gmail.com>
> Suggested-by: Liran Alon <liran.alon@oracle.com>
> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>

Patch looks good to me.
Reviewed-by: Liran Alon <liran.alon@oracle.com>

> ---
> - Instread of putting the temporary HF_SMM_MASK drop to
>  rsm_enter_protected_mode() (as was suggested by Liran), move it to
>  emulator_set_cr() modifying its interface. emulate.c seems to be
>  vcpu-specifics-free at this moment, we may want to keep it this way.
> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>  hangs even with this patch. These hangs, however, seem to be unrelated to
>  rsm.

Feel free to share details on these hangs ;)

Great work,
-Liran

> ---
> arch/x86/include/asm/kvm_emulate.h |  3 ++-
> arch/x86/kvm/emulate.c             | 27 ++++++++++++++-------------
> arch/x86/kvm/x86.c                 | 12 +++++++++++-
> 3 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
> index 93c4bf598fb0..6c33caa82fa5 100644
> --- a/arch/x86/include/asm/kvm_emulate.h
> +++ b/arch/x86/include/asm/kvm_emulate.h
> @@ -203,7 +203,8 @@ struct x86_emulate_ops {
> 	void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
> 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
> 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
> -	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
> +	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
> +		      bool from_rsm);
> 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
> 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
> 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index c338984c850d..a6204105d4d7 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -2413,7 +2413,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
> 		cr3 &= ~0xfff;
> 	}
> 
> -	bad = ctxt->ops->set_cr(ctxt, 3, cr3);
> +	bad = ctxt->ops->set_cr(ctxt, 3, cr3, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> @@ -2422,20 +2422,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
> 	 * Then enable protected mode.	However, PCID cannot be enabled
> 	 * if EFER.LMA=0, so set it separately.
> 	 */
> -	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
> +	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> -	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
> +	bad = ctxt->ops->set_cr(ctxt, 0, cr0, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> 	if (cr4 & X86_CR4_PCIDE) {
> -		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
> +		bad = ctxt->ops->set_cr(ctxt, 4, cr4, true);
> 		if (bad)
> 			return X86EMUL_UNHANDLEABLE;
> 		if (pcid) {
> -			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
> +			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true);
> 			if (bad)
> 				return X86EMUL_UNHANDLEABLE;
> 		}
> @@ -2581,7 +2581,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
> 
> 		/* Zero CR4.PCIDE before CR0.PG.  */
> 		if (cr4 & X86_CR4_PCIDE) {
> -			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
> +			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
> 			cr4 &= ~X86_CR4_PCIDE;
> 		}
> 
> @@ -2595,11 +2595,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
> 	/* For the 64-bit case, this will clear EFER.LMA.  */
> 	cr0 = ctxt->ops->get_cr(ctxt, 0);
> 	if (cr0 & X86_CR0_PE)
> -		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
> +		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE),
> +				  true);
> 
> 	/* Now clear CR4.PAE (which must be done before clearing EFER.LME).  */
> 	if (cr4 & X86_CR4_PAE)
> -		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
> +		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true);
> 
> 	/* And finally go back to 32-bit mode.  */
> 	efer = 0;
> @@ -3131,7 +3132,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
> 	int ret;
> 	u8 cpl;
> 
> -	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
> +	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false))
> 		return emulate_gp(ctxt, 0);
> 	ctxt->_eip = tss->eip;
> 	ctxt->eflags = tss->eflags | 2;
> @@ -3331,7 +3332,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
> 		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
> 	}
> 
> -	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
> +	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS, false);
> 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
> 
> 	if (has_error_code) {
> @@ -3633,7 +3634,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
> 
> static int em_cr_write(struct x86_emulate_ctxt *ctxt)
> {
> -	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
> +	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false))
> 		return emulate_gp(ctxt, 0);
> 
> 	/* Disable writeback. */
> @@ -3766,7 +3767,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
> 
> 	cr0 = ctxt->ops->get_cr(ctxt, 0);
> 	cr0 &= ~X86_CR0_TS;
> -	ctxt->ops->set_cr(ctxt, 0, cr0);
> +	ctxt->ops->set_cr(ctxt, 0, cr0, false);
> 	return X86EMUL_CONTINUE;
> }
> 
> @@ -3866,7 +3867,7 @@ static int em_smsw(struct x86_emulate_ctxt *ctxt)
> static int em_lmsw(struct x86_emulate_ctxt *ctxt)
> {
> 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
> -			  | (ctxt->src.val & 0x0f));
> +			  | (ctxt->src.val & 0x0f), false);
> 	ctxt->dst.type = OP_NONE;
> 	return X86EMUL_CONTINUE;
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a419656521b6..f2745e3170b6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5739,7 +5739,8 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
> 	return value;
> }
> 
> -static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
> +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
> +			   bool from_rsm)
> {
> 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
> 	int res = 0;
> @@ -5755,7 +5756,16 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
> 		res = kvm_set_cr3(vcpu, val);
> 		break;
> 	case 4:
> +		/*
> +		 * set_cr4() may forbid setting certain flags (e.g. VMXE) from
> +		 * SMM but we're actually leaving it; temporarily drop HF_SMM_MASK
> +		 * when setting CR4.
> +		 */
> +		if (from_rsm)
> +			vcpu->arch.hflags &= ~HF_SMM_MASK;
> 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
> +		if (from_rsm)
> +			vcpu->arch.hflags |= HF_SMM_MASK;
> 		break;
> 	case 8:
> 		res = kvm_set_cr8(vcpu, val);
> -- 
> 2.20.1
>
Vitaly Kuznetsov March 26, 2019, 1:48 p.m. UTC | #2
Liran Alon <liran.alon@oracle.com> writes:

>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>> 
>> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
>> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
>> The check is correct, however, there is a special case when RSM is called
>> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
>> set and in case VMXE was set before entering SMM we're failing to return.
>> 
>> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
>> when ops->set_cr() is called from RSM.
>> 
>> Reported-by: Jon Doron <arilou@gmail.com>
>> Suggested-by: Liran Alon <liran.alon@oracle.com>
>> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>
> Patch looks good to me.
> Reviewed-by: Liran Alon <liran.alon@oracle.com>

Thanks!

>
>> ---
>> - Instread of putting the temporary HF_SMM_MASK drop to
>>  rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>  emulator_set_cr() modifying its interface. emulate.c seems to be
>>  vcpu-specifics-free at this moment, we may want to keep it this way.
>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>  hangs even with this patch. These hangs, however, seem to be unrelated to
>>  rsm.
>
> Feel free to share details on these hangs ;)
>

You've asked for it)

The immediate issue I'm observing is some sort of lockup which is easy
to trigger with e.g. "-usb -device usb-tablet" on the Qemu command line; it
seems we get too many interrupts and, combined with the preemption timer for
L2, we're not making any progress:

kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
kvm_set_irq:          gsi 18 level 1 source 0
kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
kvm_fpu:              load
kvm_entry:            vcpu 0
kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
kvm_entry:            vcpu 0
kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
kvm_entry:            vcpu 0
kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
kvm_apic:             apic_write APIC_EOI = 0x0
kvm_eoi:              apicid 0 vector 177
kvm_fpu:              unload
kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
...
(and the pattern repeats)

Maybe it is a usb-only/Qemu-only problem, maybe not.
Liran Alon March 26, 2019, 3:02 p.m. UTC | #3
> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Liran Alon <liran.alon@oracle.com> writes:
> 
>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>> - Instread of putting the temporary HF_SMM_MASK drop to
>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>> emulator_set_cr() modifying its interface. emulate.c seems to be
>>> vcpu-specifics-free at this moment, we may want to keep it this way.
>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>> hangs even with this patch. These hangs, however, seem to be unrelated to
>>> rsm.
>> 
>> Feel free to share details on these hangs ;)
>> 
> 
> You've asked for it)
> 
> The immediate issue I'm observing is some sort of a lockup which is easy
> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
> seems we get too many interrupts and combined with preemtion timer for
> L2 we're not making any progress:
> 
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> kvm_set_irq:          gsi 18 level 1 source 0
> kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
> kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
> kvm_fpu:              load
> kvm_entry:            vcpu 0
> kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
> kvm_entry:            vcpu 0
> kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
> kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
> kvm_entry:            vcpu 0
> kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
> kvm_apic:             apic_write APIC_EOI = 0x0
> kvm_eoi:              apicid 0 vector 177
> kvm_fpu:              unload
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> ...
> (and the pattern repeats)
> 
> Maybe it is a usb-only/Qemu-only problem, maybe not.
> 
> -- 
> Vitaly

The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt into the L1 guest.
(I know that we are running in L1 at this point because the next exit is a VMRESUME.)

However, it is surprising to see that on the next entry to the guest, no interrupt was injected by vmx_inject_irq().
It may be because the L1 guest is currently running with interrupts disabled and therefore only an IRQ-window was requested.
(Too bad we don’t have a trace for this…)

Next, we got an exit from the L1 guest on VMRESUME. As part of its handling, the active VMCS was changed from vmcs01 to vmcs02.
I believe the immediate exit later on the preemption-timer was because the immediate-exit-request mechanism was invoked,
which is now implemented by setting a VMX preemption-timer with a value of 0 (thanks to Sean).
(See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0).)
(Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode()
to request KVM_REQ_EVENT when vmcs01 has requested an IRQ-window.)

Therefore, when entering L2, you immediately get an exit on PREEMPTION_TIMER, which will eventually cause L0 to call
vmx_check_nested_events(). That now notices the pending interrupt that should have been injected into L1 earlier
and exits from L2 to L1 on EXTERNAL_INTERRUPT with vector 0xb1.

Then L1 handles the interrupt by performing an EOI to the LAPIC, which propagates an EOI to the IOAPIC, which immediately re-injects
the interrupt (after clearing the remote_irr) as the irq-line is still set. I.e. QEMU's ioapic_eoi_broadcast() calls ioapic_service() immediately after it clears remote-irr for this pin.
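
A heavily simplified, self-contained paraphrase of that behaviour (NOT the
verbatim QEMU code; names and the service() stub are only illustrative):

	#include <stdint.h>
	#include <stdio.h>

	#define NUM_PINS     24
	#define REMOTE_IRR   (1ULL << 14)
	#define TRIG_LEVEL   (1ULL << 15)

	struct ioapic_sketch {
		uint64_t redirtbl[NUM_PINS];	/* redirection table entries */
		uint32_t irr;			/* "irq line asserted" bit per pin */
	};

	/* stand-in for QEMU's ioapic_service(): deliver the pin's vector again */
	static void service(struct ioapic_sketch *s, int pin)
	{
		printf("re-injecting vector 0x%llx for pin %d\n",
		       (unsigned long long)(s->redirtbl[pin] & 0xff), pin);
	}

	/*
	 * EOI for a level-triggered vector clears Remote-IRR and, because the
	 * irq line is still asserted, the pin is serviced again right away,
	 * re-delivering the same vector.
	 */
	static void eoi_broadcast_sketch(struct ioapic_sketch *s, uint8_t vector)
	{
		for (int pin = 0; pin < NUM_PINS; pin++) {
			uint64_t entry = s->redirtbl[pin];

			if ((entry & 0xff) != vector || !(entry & TRIG_LEVEL))
				continue;

			s->redirtbl[pin] = entry & ~REMOTE_IRR;	/* clear Remote-IRR */
			if (s->irr & (1u << pin))		/* line still high ... */
				service(s, pin);		/* ... so re-inject now */
		}
	}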

Also note that in the trace we see only a single kvm_set_irq to level 1, but we don't immediately see another kvm_set_irq to level 0.
This should indicate that in QEMU's IOAPIC redirection-table, this pin is configured as a level-triggered interrupt.
However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt.

To sum up:
1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window() (a rough sketch follows after this list).
2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered.
3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves, i.e. what causes this device to lower the irq-line?
(As we don't see any I/O port or MMIO access by the L1 guest interrupt-handler before performing the EOI.)
4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip)
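
A rough sketch of (1), illustrative only (this tracepoint does not exist
today; it would go into arch/x86/kvm/trace.h and be called from
vcpu_enter_guest() right before the enable_{smi,nmi,irq}_window() callbacks):

	TRACE_EVENT(kvm_enable_event_window,
		TP_PROTO(unsigned int vcpu_id, const char *kind),
		TP_ARGS(vcpu_id, kind),

		TP_STRUCT__entry(
			__field(unsigned int, vcpu_id)
			__string(kind, kind)
		),

		TP_fast_assign(
			__entry->vcpu_id = vcpu_id;
			__assign_str(kind, kind);
		),

		TP_printk("vcpu %u wants %s window", __entry->vcpu_id,
			  __get_str(kind))
	);

	/* e.g. in vcpu_enter_guest(): */
	trace_kvm_enable_event_window(vcpu->vcpu_id, "irq");
	kvm_x86_ops->enable_irq_window(vcpu);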

-Liran
Vitaly Kuznetsov March 27, 2019, 10:08 a.m. UTC | #4
Liran Alon <liran.alon@oracle.com> writes:

>> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>> 
>> Liran Alon <liran.alon@oracle.com> writes:
>> 
>>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>>> - Instread of putting the temporary HF_SMM_MASK drop to
>>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>>> emulator_set_cr() modifying its interface. emulate.c seems to be
>>>> vcpu-specifics-free at this moment, we may want to keep it this way.
>>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>>> hangs even with this patch. These hangs, however, seem to be unrelated to
>>>> rsm.
>>> 
>>> Feel free to share details on these hangs ;)
>>> 
>> 
>> You've asked for it)
>> 
>> The immediate issue I'm observing is some sort of a lockup which is easy
>> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
>> seems we get too many interrupts and combined with preemtion timer for
>> L2 we're not making any progress:
>> 
>> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
>> kvm_set_irq:          gsi 18 level 1 source 0
>> kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
>> kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
>> kvm_fpu:              load
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
>> kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
>> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
>> kvm_apic:             apic_write APIC_EOI = 0x0
>> kvm_eoi:              apicid 0 vector 177
>> kvm_fpu:              unload
>> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
>> ...
>> (and the pattern repeats)
>> 
>> Maybe it is a usb-only/Qemu-only problem, maybe not.
>> 
>> -- 
>> Vitaly
>
> The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt to L1 guest.
> (I know that now we are running in L1 because next exit is a VMRESUME).
>
> However, it is surprising to see that on next entry to guest, no interrupt was injected by vmx_inject_irq().
> It may be because L1 guest is currently running with interrupt disabled and therefore only an IRQ-window was requested.
> (Too bad we don’t have a trace for this…)
>
> Next, we got an exit from L1 guest on VMRESUME. As part of it’s handling, active VMCS was changed from vmcs01 to vmcs02.
> I believe the immediate exit later on preemption-timer was because the immediate-exit-request mechanism was invoked
> which is now implemented by setting a VMX preemption-timer with value of 0 (Thanks to Sean).
> (See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)).
> (Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode()
> to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window)
>
> Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER which will cause eventually L0 to call
> vmx_check_nested_events() which notices now the pending interrupt that should have been injected before to L1
> and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1.
>
> Then L1 handles the interrupt by performing an EOI to LAPIC which propagate an EOI to IOAPIC which immediately re-inject
> the interrupt (after clearing the remote_irr) as the irq-line is still set. i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediate after it clears remote-irr for this pin.
>
> Also note that in trace we see only a single kvm_set_irq to level 1 but we don’t see immediately another kvm_set_irq to level 0.
> This should indicate that in QEMU’s IOAPIC redirection-table, this pin is configured as level-triggered interrupt.
> However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt.
>
> To sum up:
> 1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window().
> 2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered.
> 3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves. i.e. What cause this device to lower the irq-line?
> (As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler before performing the EOI)
> 4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip)
>

Thank you Liran,

all are valuable suggestions. It seems the issue doesn't reproduce with
'kernel-irqchip=on' but reproduces with "kernel-irqchip=split". My first
guess would then be that we're less picky with the in-kernel implementation
about the observed edge/level discrepancy. I'll keep investigating and
share my findings.
Sean Christopherson March 27, 2019, 7:29 p.m. UTC | #5
On Tue, Mar 26, 2019 at 02:07:46PM +0100, Vitaly Kuznetsov wrote:
> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
> The check is correct, however, there is a special case when RSM is called
> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
> set and in case VMXE was set before entering SMM we're failing to return.
>
> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
> when ops->set_cr() is called from RSM.
>
> Reported-by: Jon Doron <arilou@gmail.com>
> Suggested-by: Liran Alon <liran.alon@oracle.com>
> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>

Plumbing from_rsm all the way to set_cr() is pretty heinous.  What about
going with the idea Jim alluded to, i.e. manually save/restore VMXE during
transitions to/from SMM?  It's a little more tedious than it ought to be
due to the placement and naming of the x86_ops hooks for SMM, but IMO the
end result is cleaner even after adding the necessary callbacks (a rough
sketch of the idea follows after the diffstat below).

The following patches are compile tested only.

Sean Christopherson (2):
  KVM: x86: Rename pre_{enter,leave}_smm() ops to reference SMM state
    save
  KVM: x86: Add kvm_x86_ops callback to allow VMX to stash away CR4.VMXE

 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/include/asm/kvm_host.h    |  6 ++++--
 arch/x86/kvm/emulate.c             | 10 ++++++----
 arch/x86/kvm/svm.c                 | 20 ++++++++++++++++----
 arch/x86/kvm/vmx/vmx.c             | 30 ++++++++++++++++++++++++++----
 arch/x86/kvm/vmx/vmx.h             |  2 ++
 arch/x86/kvm/x86.c                 | 23 ++++++++++++++++-------
 7 files changed, 72 insertions(+), 22 deletions(-)
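
Roughly, the idea is to stop vmx_set_cr4() from ever seeing VMXE being set
while HF_SMM_MASK is active, by stashing the bit on SMM entry and restoring
it after the RSM state load. A minimal sketch (the hook names and the
nested.smm.cr4_vmxe field are guesses for illustration, not necessarily what
the two patches actually use):

	static int vmx_pre_smi_save_state(struct kvm_vcpu *vcpu, char *smstate)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		/* Remember CR4.VMXE, then clear it for the SMM handler. */
		vmx->nested.smm.cr4_vmxe = kvm_read_cr4_bits(vcpu, X86_CR4_VMXE);
		if (vmx->nested.smm.cr4_vmxe)
			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu) & ~X86_CR4_VMXE);

		return 0;
	}

	static int vmx_post_rsm_load_state(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		/* HF_SMM_MASK is already clear here, so VMXE may be set again. */
		if (vmx->nested.smm.cr4_vmxe)
			return kvm_set_cr4(vcpu, kvm_read_cr4(vcpu) | X86_CR4_VMXE);

		return 0;
	}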
Vitaly Kuznetsov March 28, 2019, 8:55 a.m. UTC | #6
Sean Christopherson <sean.j.christopherson@intel.com> writes:

> On Tue, Mar 26, 2019 at 02:07:46PM +0100, Vitaly Kuznetsov wrote:
>> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
>> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
>> The check is correct, however, there is a special case when RSM is called
>> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
>> set and in case VMXE was set before entering SMM we're failing to return.
>>
>> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
>> when ops->set_cr() is called from RSM.
>>
>> Reported-by: Jon Doron <arilou@gmail.com>
>> Suggested-by: Liran Alon <liran.alon@oracle.com>
>> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>
> Plumbing from_rsm all the way to set_cr() is pretty heinous.  What about
> going with the idea Jim alluded to, i.e. manually save/restore VMXE during
> transitions to/from SMM?  It's a little more tedious than it aught to be
> due to the placement and naming of the x86_ops hooks for SMM, but IMO the
> end result is cleaner even after adding the necessary callbacks.
>
> The following patches are compile tested only.

Thanks and welcome to the 'rsm fixing' party :-)

I like your/Jim's approach; however, with these patches as-is the issue
persists:

KVM: entry failed, hardware error 0x80000021

If you're running a guest on an Intel machine without unrestricted mode
support, the failure can be most likely due to the guest entering an invalid
state for Intel VT. For example, the guest maybe running in big real mode
which is not supported on less recent Intel processors.

RAX=000000007ffc7301 RBX=0000000000000000 RCX=0000000000000000 RDX=00007ff900000000
RSI=000000007dab4018 RDI=000000007ff9f000 RBP=0000000000000000 RSP=ffffe80000210fc0
R8 =0000000000000000 R9 =0000000000000000 R10=ffffbb8140cfd790 R11=ffffbb8140cfd7d0
R12=ffffbb8140cfd7d0 R13=ffffbb8140cfd790 R14=0000000000000008 R15=0000000000000000
RIP=fffff80007d44125 RFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=1 HLT=0
...

Let me take a look what's wrong.

P.S. We probably need a kvm-unit-tests or selftests test as
Hyper-V+secureboot is the only known way to test this atm.
Jim Mattson March 28, 2019, 4:40 p.m. UTC | #7
On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:

> P.S. We probably need a kvm-unit-tests or selftests test as
> Hyper-V+secureboot is the only known way to test this atm.

Yes, please!  And the requirement for providing tests of new features
should be applied universally, rather than ad hoc.
Paolo Bonzini March 28, 2019, 5:07 p.m. UTC | #8
On 28/03/19 17:40, Jim Mattson wrote:
> On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
>> P.S. We probably need a kvm-unit-tests or selftests test as
>> Hyper-V+secureboot is the only known way to test this atm.
> 
> Yes, please!  And the requirement for providing tests of new features
> should be applied universally, rather than ad hoc.
> 

Yes, I agree.  Unfortunately all this stuff dates from before the nice
selftests framework (and also, asking to write tests is one thing, asking
to add SMM support to the framework is another...  I'm doing that though).

Paolo
Vitaly Kuznetsov April 9, 2019, 8:21 a.m. UTC | #9
Jim Mattson <jmattson@google.com> writes:

> On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
>> P.S. We probably need a kvm-unit-tests or selftests test as
>> Hyper-V+secureboot is the only known way to test this atm.
>
> Yes, please!  And the requirement for providing tests of new features
> should be applied universally, rather than ad hoc.

FWIW, I have a working proof-of-concept test for KVM selftests doing
SMI/RSM. I'm going to clean it up, enhance it a bit, maybe add something to
state_test and send it out (unless someone beats me to it, of course :-)
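
The guest-side core of such a test can be tiny. A minimal sketch (the
selftests harness plumbing is omitted, and the APIC constants are written
out here rather than taken from any header, so nothing below is assumed to
exist in the framework; the guest is assumed to run with x2APIC enabled):

	#include <stdint.h>

	#define APIC_BASE_MSR	0x800
	#define APIC_ICR	0x300
	#define APIC_DEST_SELF	0x40000
	#define APIC_INT_ASSERT	0x04000
	#define APIC_DM_SMI	0x00200

	static inline void wrmsr(uint32_t msr, uint64_t val)
	{
		asm volatile("wrmsr" ::
			     "c"(msr), "a"((uint32_t)val), "d"((uint32_t)(val >> 32)));
	}

	/*
	 * Self-inject an SMI via the x2APIC ICR; a trivial SMI handler placed
	 * at SMBASE + 0x8000 then executes RSM, and the test checks the vCPU
	 * survives the round-trip (with CR4.VMXE set for the nested case).
	 */
	static void self_smi(void)
	{
		wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4),
		      APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
	}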
Paolo Bonzini April 9, 2019, 4:31 p.m. UTC | #10
On 09/04/19 10:21, Vitaly Kuznetsov wrote:
> Jim Mattson <jmattson@google.com> writes:
> 
>> On Thu, Mar 28, 2019 at 1:56 AM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>
>>> P.S. We probably need a kvm-unit-tests or selftests test as
>>> Hyper-V+secureboot is the only known way to test this atm.
>>
>> Yes, please!  And the requirement for providing tests of new features
>> should be applied universally, rather than ad hoc.
> 
> FWIW, I have a working proof-of-concept test for KVM selftests doing
> SMI/RSM. I'm going to clean it up, enhance a bit, maybe add something to
> state_test and send out (unless someone beats me up to it, of course :-)

Oh, me too. :)  But please send yours, and then we'll see what can be
taken from mine.  state_test can come later.

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 93c4bf598fb0..6c33caa82fa5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -203,7 +203,8 @@  struct x86_emulate_ops {
 	void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
-	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
+		      bool from_rsm);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c338984c850d..a6204105d4d7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2413,7 +2413,7 @@  static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
 		cr3 &= ~0xfff;
 	}
 
-	bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+	bad = ctxt->ops->set_cr(ctxt, 3, cr3, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
@@ -2422,20 +2422,20 @@  static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
 	 * Then enable protected mode.	However, PCID cannot be enabled
 	 * if EFER.LMA=0, so set it separately.
 	 */
-	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
-	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+	bad = ctxt->ops->set_cr(ctxt, 0, cr0, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
 	if (cr4 & X86_CR4_PCIDE) {
-		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+		bad = ctxt->ops->set_cr(ctxt, 4, cr4, true);
 		if (bad)
 			return X86EMUL_UNHANDLEABLE;
 		if (pcid) {
-			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true);
 			if (bad)
 				return X86EMUL_UNHANDLEABLE;
 		}
@@ -2581,7 +2581,7 @@  static int em_rsm(struct x86_emulate_ctxt *ctxt)
 
 		/* Zero CR4.PCIDE before CR0.PG.  */
 		if (cr4 & X86_CR4_PCIDE) {
-			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
 			cr4 &= ~X86_CR4_PCIDE;
 		}
 
@@ -2595,11 +2595,12 @@  static int em_rsm(struct x86_emulate_ctxt *ctxt)
 	/* For the 64-bit case, this will clear EFER.LMA.  */
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	if (cr0 & X86_CR0_PE)
-		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE),
+				  true);
 
 	/* Now clear CR4.PAE (which must be done before clearing EFER.LME).  */
 	if (cr4 & X86_CR4_PAE)
-		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true);
 
 	/* And finally go back to 32-bit mode.  */
 	efer = 0;
@@ -3131,7 +3132,7 @@  static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	int ret;
 	u8 cpl;
 
-	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false))
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
@@ -3331,7 +3332,7 @@  static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	}
 
-	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
+	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS, false);
 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
 
 	if (has_error_code) {
@@ -3633,7 +3634,7 @@  static int em_movbe(struct x86_emulate_ctxt *ctxt)
 
 static int em_cr_write(struct x86_emulate_ctxt *ctxt)
 {
-	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false))
 		return emulate_gp(ctxt, 0);
 
 	/* Disable writeback. */
@@ -3766,7 +3767,7 @@  static int em_clts(struct x86_emulate_ctxt *ctxt)
 
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	cr0 &= ~X86_CR0_TS;
-	ctxt->ops->set_cr(ctxt, 0, cr0);
+	ctxt->ops->set_cr(ctxt, 0, cr0, false);
 	return X86EMUL_CONTINUE;
 }
 
@@ -3866,7 +3867,7 @@  static int em_smsw(struct x86_emulate_ctxt *ctxt)
 static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 {
 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
-			  | (ctxt->src.val & 0x0f));
+			  | (ctxt->src.val & 0x0f), false);
 	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a419656521b6..f2745e3170b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5739,7 +5739,8 @@  static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
 	return value;
 }
 
-static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
+			   bool from_rsm)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 	int res = 0;
@@ -5755,7 +5756,16 @@  static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 		res = kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
+		/*
+		 * set_cr4() may forbid setting certain flags (e.g. VMXE) from
+		 * SMM but we're actually leaving it; temporarily drop HF_SMM_MASK
+		 * when setting CR4.
+		 */
+		if (from_rsm)
+			vcpu->arch.hflags &= ~HF_SMM_MASK;
 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		if (from_rsm)
+			vcpu->arch.hflags |= HF_SMM_MASK;
 		break;
 	case 8:
 		res = kvm_set_cr8(vcpu, val);