KVM: nVMX: Fix direct injection of interrupts from L0 to L2

Message ID 511FBD76.8010307@web.de (mailing list archive)
State New, archived

Commit Message

Jan Kiszka Feb. 16, 2013, 5:10 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
vmx_complete_interrupts on L2 exits. This is required because, with
direct interrupt injection from L0 to L2, L0 has to update its pending
events.

Also, we need to allow vmx_cancel_injection when entering L2 if we left
to L0. This condition is indirectly derived from the absence of valid
vectoring info in vmcs12. We now explicitly clear it if we find out that
the L2 exit is not targeting L1 but L0.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 arch/x86/kvm/vmx.c |   43 +++++++++++++++++++++++++++----------------
 1 files changed, 27 insertions(+), 16 deletions(-)

Comments

Gleb Natapov Feb. 17, 2013, 3:07 p.m. UTC | #1
On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
> vmx_complete_interrupts on L2 exits. This is required because, with
> direct interrupt injection from L0 to L2, L0 has to update its pending
> events.
> 
> Also, we need to allow vmx_cancel_injection when entering L2 if we left
> to L0. This condition is indirectly derived from the absence of valid
> vectoring info in vmcs12. We now explicitly clear it if we find out that
> the L2 exit is not targeting L1 but L0.
> 
We really need to overhaul how interrupt injection is emulated in nested
VMX. Why not put pending events into the event queue instead of
get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in the usual way?
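
Roughly like the following untested sketch - nested_vmx_queue_event() is a
made-up name for illustration, and it just reuses the decode logic we
already have in __vmx_complete_interrupts:

/* Sketch only: move a pending vmcs12 event into the generic event
 * queues instead of stashing it in vmcs12. */
static void nested_vmx_queue_event(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 idt_info = vmcs12->idt_vectoring_info_field;
	u8 vector = idt_info & VECTORING_INFO_VECTOR_MASK;

	if (!(idt_info & VECTORING_INFO_VALID_MASK))
		return;

	switch (idt_info & VECTORING_INFO_TYPE_MASK) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		break;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, false);
		break;
	case INTR_TYPE_HARD_EXCEPTION:
		if (idt_info & VECTORING_INFO_DELIVER_CODE_MASK)
			kvm_requeue_exception_e(vcpu, vector,
				vmcs12->idt_vectoring_error_code);
		else
			kvm_requeue_exception(vcpu, vector);
		break;
	}
	vmcs12->idt_vectoring_info_field = 0;
}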

> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  arch/x86/kvm/vmx.c |   43 +++++++++++++++++++++++++++----------------
>  1 files changed, 27 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 68a045ae..464b6a5 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -624,6 +624,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
>  			    struct kvm_segment *var, int seg);
>  static bool guest_state_valid(struct kvm_vcpu *vcpu);
>  static u32 vmx_segment_access_rights(struct kvm_segment *var);
> +static void vmx_complete_interrupts(struct vcpu_vmx *vmx);
>  
>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> @@ -6213,9 +6214,19 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  	else
>  		vmx->nested.nested_run_pending = 0;
>  
> -	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> -		nested_vmx_vmexit(vcpu);
> -		return 1;
> +	if (is_guest_mode(vcpu)) {
> +		if (nested_vmx_exit_handled(vcpu)) {
> +			nested_vmx_vmexit(vcpu);
> +			return 1;
> +		}
> +		/*
> +		 * Now it's clear, we are leaving to L0. Perform the postponed
> +		 * interrupt completion and clear L1's vectoring info field so
> +		 * that we do not overwrite what L0 wants to inject on
> +		 * re-entry.
> +		 */
> +		vmx_complete_interrupts(vmx);
> +		get_vmcs12(vcpu)->idt_vectoring_info_field = 0;
>  	}
>  
>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> @@ -6495,8 +6506,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
>  
>  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>  {
> -	if (is_guest_mode(&vmx->vcpu))
> -		return;
>  	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
>  				  VM_EXIT_INSTRUCTION_LEN,
>  				  IDT_VECTORING_ERROR_CODE);
> @@ -6504,7 +6513,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>  
>  static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>  {
> -	if (is_guest_mode(vcpu))
> +	if (is_guest_mode(vcpu) &&
> +	    get_vmcs12(vcpu)->idt_vectoring_info_field &
> +			VECTORING_INFO_VALID_MASK)
Why skip cancel_injection at all? As far as I see we can lose an injected
irq if we do. Consider:

  io thread                                  vcpu in nested mode
set irr 200
                                          clear irr 200 set isr 200
                                          set 200 in VM_ENTRY_INTR_INFO_FIELD
set irr 250
set KVM_REQ_EVENT
                                          if (KVM_REQ_EVENT)
                                                  vmx_cancel_injection() <- does nothing

                                          clear irr 250 set isr 250
                                          set 250 in VM_ENTRY_INTR_INFO_FIELD
                                          vmentry

So now the APIC state is bogus: isr bit 200 is set, but vector 200 was never
injected and is actually lost forever. The next EOI will clear isr 250, and
isr 200 will block all lower-priority interrupts forever.

--
			Gleb.
Jan Kiszka Feb. 17, 2013, 3:31 p.m. UTC | #2
On 2013-02-17 16:07, Gleb Natapov wrote:
> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
>> vmx_complete_interrupts on L2 exits. This is required because, with
>> direct interrupt injection from L0 to L2, L0 has to update its pending
>> events.
>>
>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
>> to L0. This condition is indirectly derived from the absence of valid
>> vectoring info in vmcs12. We now explicitly clear it if we find out that
>> the L2 exit is not targeting L1 but L0.
>>
> We really need to overhaul how interrupt injection is emulated in nested
> VMX. Why not put pending events into event queue instead of
> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.

I was thinking about the same step but felt unsure so far whether
vmx_complete_interrupts & Co. are free of assumptions about the vmcs
configuration that won't match what L1 does. So I went for a different
path first, specifically to avoid impact on these hairy bits for
non-nested mode.

> 
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  arch/x86/kvm/vmx.c |   43 +++++++++++++++++++++++++++----------------
>>  1 files changed, 27 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 68a045ae..464b6a5 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -624,6 +624,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
>>  			    struct kvm_segment *var, int seg);
>>  static bool guest_state_valid(struct kvm_vcpu *vcpu);
>>  static u32 vmx_segment_access_rights(struct kvm_segment *var);
>> +static void vmx_complete_interrupts(struct vcpu_vmx *vmx);
>>  
>>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>> @@ -6213,9 +6214,19 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>  	else
>>  		vmx->nested.nested_run_pending = 0;
>>  
>> -	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>> -		nested_vmx_vmexit(vcpu);
>> -		return 1;
>> +	if (is_guest_mode(vcpu)) {
>> +		if (nested_vmx_exit_handled(vcpu)) {
>> +			nested_vmx_vmexit(vcpu);
>> +			return 1;
>> +		}
>> +		/*
>> +		 * Now it's clear, we are leaving to L0. Perform the postponed
>> +		 * interrupt completion and clear L1's vectoring info field so
>> +		 * that we do not overwrite what L0 wants to inject on
>> +		 * re-entry.
>> +		 */
>> +		vmx_complete_interrupts(vmx);
>> +		get_vmcs12(vcpu)->idt_vectoring_info_field = 0;
>>  	}
>>  
>>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>> @@ -6495,8 +6506,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
>>  
>>  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>>  {
>> -	if (is_guest_mode(&vmx->vcpu))
>> -		return;
>>  	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
>>  				  VM_EXIT_INSTRUCTION_LEN,
>>  				  IDT_VECTORING_ERROR_CODE);
>> @@ -6504,7 +6513,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>>  
>>  static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>>  {
>> -	if (is_guest_mode(vcpu))
>> +	if (is_guest_mode(vcpu) &&
>> +	    get_vmcs12(vcpu)->idt_vectoring_info_field &
>> +			VECTORING_INFO_VALID_MASK)
> Why skip cancel_injection at all? As far as I see we can lose injected
> irq if we do. Consider:
> 
>   io thread                                  vcpu in nested mode
> set irr 200
>                                           clear irr 200 set isr 200
>                                           set 200 in VM_ENTRY_INTR_INFO_FIELD
> set irr 250
> set KVM_REQ_EVENT
>                                           if (KVM_REQ_EVENT)
>                                                   vmx_cancel_injection() <- does nothing

No, it does cancel as vmcs12's idt_vectoring_info_field signals an
invalid state then. Only if we left L2 with valid vectoring info and are
about to reenter do we skip the cancellation - but in that case we didn't
inject anything from L0 previously anyway.

Jan

> 
>                                           clear irr 250 set isr 250
>                                           set 250 in VM_ENTRY_INTR_INFO_FIELD
>                                           vmentry
> 
> So now APIC state is bogus. isr bit 200 is set but vector 200 was never
> injected and actually is lost forever. Next EOI will clear isr 250 and
> isr 200 will block all lower level interrupt forever.
> 
> --
> 			Gleb.
>
Gleb Natapov Feb. 17, 2013, 4:26 p.m. UTC | #3
On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
> On 2013-02-17 16:07, Gleb Natapov wrote:
> > On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
> >> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>
> >> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
> >> vmx_complete_interrupts on L2 exits. This is required because, with
> >> direct interrupt injection from L0 to L2, L0 has to update its pending
> >> events.
> >>
> >> Also, we need to allow vmx_cancel_injection when entering L2 if we left
> >> to L0. This condition is indirectly derived from the absence of valid
> >> vectoring info in vmcs12. We now explicitly clear it if we find out that
> >> the L2 exit is not targeting L1 but L0.
> >>
> > We really need to overhaul how interrupt injection is emulated in nested
> > VMX. Why not put pending events into event queue instead of
> > get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
> 
> I was thinking about the same step but felt unsure so far if
> vmx_complete_interrupts & Co. do not include any assumptions about the
> vmcs configuration that won't match what L1 does. So I went for a
> different path first, specifically to avoid impact on these hairy bits
> for non-nested mode.
>
Assumptions made by those functions should still be correct since the guest
VMCS configuration is not applied directly to real HW, but we should be
careful of course. For instance, interrupt queues should be cleared
during nested vmexit and the event transferred back to idt_vectoring_info_field.
IIRC this is how nested SVM works, BTW.

And with your patch you did half of the job already :) When exiting to
L0 you transfer event information from get_vmcs12(vcpu)->idt_vectoring_info_field
to our internal event queues anyway. Hmm, but you do not clear the queue
during nested vmexit. So what happens if L2 exits to L0 with an exception
in idt_vectoring_info_field? Now an interrupt is delivered, so a nested
vmexit is done, but the exception is left in the internal queue. I think it
will be delivered into L1 during the next vmentry.
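
I.e. on the emulated vmexit path something like this seems to be missing
(sketch only; the exact spot in nested_vmx_vmexit() is my assumption):

	/* Drop whatever L0 still has queued for L2 so it is not
	 * delivered into L1 after the emulated vmexit. */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);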

> > 
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>  arch/x86/kvm/vmx.c |   43 +++++++++++++++++++++++++++----------------
> >>  1 files changed, 27 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> index 68a045ae..464b6a5 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -624,6 +624,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
> >>  			    struct kvm_segment *var, int seg);
> >>  static bool guest_state_valid(struct kvm_vcpu *vcpu);
> >>  static u32 vmx_segment_access_rights(struct kvm_segment *var);
> >> +static void vmx_complete_interrupts(struct vcpu_vmx *vmx);
> >>  
> >>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
> >>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> >> @@ -6213,9 +6214,19 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >>  	else
> >>  		vmx->nested.nested_run_pending = 0;
> >>  
> >> -	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> >> -		nested_vmx_vmexit(vcpu);
> >> -		return 1;
> >> +	if (is_guest_mode(vcpu)) {
> >> +		if (nested_vmx_exit_handled(vcpu)) {
> >> +			nested_vmx_vmexit(vcpu);
> >> +			return 1;
> >> +		}
> >> +		/*
> >> +		 * Now it's clear, we are leaving to L0. Perform the postponed
> >> +		 * interrupt completion and clear L1's vectoring info field so
> >> +		 * that we do not overwrite what L0 wants to inject on
> >> +		 * re-entry.
> >> +		 */
> >> +		vmx_complete_interrupts(vmx);
> >> +		get_vmcs12(vcpu)->idt_vectoring_info_field = 0;
> >>  	}
> >>  
> >>  	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> >> @@ -6495,8 +6506,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
> >>  
> >>  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
> >>  {
> >> -	if (is_guest_mode(&vmx->vcpu))
> >> -		return;
> >>  	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
> >>  				  VM_EXIT_INSTRUCTION_LEN,
> >>  				  IDT_VECTORING_ERROR_CODE);
> >> @@ -6504,7 +6513,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
> >>  
> >>  static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
> >>  {
> >> -	if (is_guest_mode(vcpu))
> >> +	if (is_guest_mode(vcpu) &&
> >> +	    get_vmcs12(vcpu)->idt_vectoring_info_field &
> >> +			VECTORING_INFO_VALID_MASK)
> > Why skip cancel_injection at all? As far as I see we can lose injected
> > irq if we do. Consider:
> > 
> >   io thread                                  vcpu in nested mode
> > set irr 200
> >                                           clear irr 200 set isr 200
> >                                           set 200 in VM_ENTRY_INTR_INFO_FIELD
> > set irr 250
> > set KVM_REQ_EVENT
> >                                           if (KVM_REQ_EVENT)
> >                                                   vmx_cancel_injection() <- does nothing
> 
> No, it does cancel as vmcs12's idt_vectoring_info_field signals an
> invalid state then. Only if we left L2 with valid vectoring info and are
> about to reenter, we skip the cancellation - but in that case we didn't
> inject anything from L0 previously anyway.
> 
Ah, I misread if() condition.

--
			Gleb.
Jan Kiszka Feb. 17, 2013, 5:01 p.m. UTC | #4
On 2013-02-17 17:26, Gleb Natapov wrote:
> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
>> On 2013-02-17 16:07, Gleb Natapov wrote:
>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
>>>> vmx_complete_interrupts on L2 exits. This is required because, with
>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
>>>> events.
>>>>
>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
>>>> to L0. This condition is indirectly derived from the absence of valid
>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
>>>> the L2 exit is not targeting L1 but L0.
>>>>
>>> We really need to overhaul how interrupt injection is emulated in nested
>>> VMX. Why not put pending events into event queue instead of
>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
>>
>> I was thinking about the same step but felt unsure so far if
>> vmx_complete_interrupts & Co. do not include any assumptions about the
>> vmcs configuration that won't match what L1 does. So I went for a
>> different path first, specifically to avoid impact on these hairy bits
>> for non-nested mode.
>>
> Assumption made by those functions should be still correct since guest
> VMCS configuration is not applied directly to real HW, but we should be
> careful of course. For instance interrupt queues should be cleared
> during nested vmexit and event transfered back to idt_vectoring_info_field.
> IIRC this is how nested SVM works BTW.

Checking __vmx_complete_interrupts, the first issue I find is that type
5 (privileged software exception) is not decoded, thus will be lost if
L2 leaves this way. That's a reason why it might be better to re-inject
the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
I guess.
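
For reference, the decode switch in __vmx_complete_interrupts only knows
these cases (abridged), so type 5 falls into the default branch and the
event is silently dropped:

	switch (type) {
	case INTR_TYPE_NMI_INTR:
	case INTR_TYPE_SOFT_EXCEPTION:
	case INTR_TYPE_HARD_EXCEPTION:
	case INTR_TYPE_SOFT_INTR:
	case INTR_TYPE_EXT_INTR:
		/* decoded into the VCPU event queues */
		...
		break;
	default:
		/* INTR_TYPE_PRIV_SW_EXCEPTION (type 5) lands here */
		break;
	}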

> 
> And with you patch you did a half of the job already :) When exiting to
> L0 you transfer event information from get_vmcs12(vcpu)->idt_vectoring_info_field
> to our internal event queues anyway. Hmm, but you do not clear the queue
> during nested vmexit. So what happens if L2 exits to L0 with an exception
> in idt_vectoring_info_field. Now interrupt is delivered so nested vm exit
> is done, but exception is left in internal queue. I think it will be
> delivered into L1 during next vmentry.

Indeed. The queue is only cleared on L2->L0 exits (via the late
vmx_complete_interrupts). It should be cleared on L2->L1 exits as well.
Will fix.

Jan
Gleb Natapov Feb. 17, 2013, 5:35 p.m. UTC | #5
On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
> On 2013-02-17 17:26, Gleb Natapov wrote:
> > On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
> >> On 2013-02-17 16:07, Gleb Natapov wrote:
> >>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
> >>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>
> >>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
> >>>> vmx_complete_interrupts on L2 exits. This is required because, with
> >>>> direct interrupt injection from L0 to L2, L0 has to update its pending
> >>>> events.
> >>>>
> >>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
> >>>> to L0. This condition is indirectly derived from the absence of valid
> >>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
> >>>> the L2 exit is not targeting L1 but L0.
> >>>>
> >>> We really need to overhaul how interrupt injection is emulated in nested
> >>> VMX. Why not put pending events into event queue instead of
> >>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
> >>
> >> I was thinking about the same step but felt unsure so far if
> >> vmx_complete_interrupts & Co. do not include any assumptions about the
> >> vmcs configuration that won't match what L1 does. So I went for a
> >> different path first, specifically to avoid impact on these hairy bits
> >> for non-nested mode.
> >>
> > Assumption made by those functions should be still correct since guest
> > VMCS configuration is not applied directly to real HW, but we should be
> > careful of course. For instance interrupt queues should be cleared
> > during nested vmexit and event transfered back to idt_vectoring_info_field.
> > IIRC this is how nested SVM works BTW.
> 
> Checking __vmx_complete_interrupts, the first issue I find is that type
> 5 (privileged software exception) is not decoded, thus will be lost if
> L2 leaves this way. That's a reason why it might be better to re-inject
> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
> I guess.
> 
I do not see type 5 in SDM Table 24-15. We handle every type specified
there. Why shouldn't we? SVM and VMX are pretty close in regards to
event injection, which allowed us to move a lot of logic into the common
code.

--
			Gleb.
Jan Kiszka Feb. 17, 2013, 5:39 p.m. UTC | #6
On 2013-02-17 18:35, Gleb Natapov wrote:
> On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
>> On 2013-02-17 17:26, Gleb Natapov wrote:
>>> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
>>>> On 2013-02-17 16:07, Gleb Natapov wrote:
>>>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>
>>>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
>>>>>> vmx_complete_interrupts on L2 exits. This is required because, with
>>>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
>>>>>> events.
>>>>>>
>>>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
>>>>>> to L0. This condition is indirectly derived from the absence of valid
>>>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
>>>>>> the L2 exit is not targeting L1 but L0.
>>>>>>
>>>>> We really need to overhaul how interrupt injection is emulated in nested
>>>>> VMX. Why not put pending events into event queue instead of
>>>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
>>>>
>>>> I was thinking about the same step but felt unsure so far if
>>>> vmx_complete_interrupts & Co. do not include any assumptions about the
>>>> vmcs configuration that won't match what L1 does. So I went for a
>>>> different path first, specifically to avoid impact on these hairy bits
>>>> for non-nested mode.
>>>>
>>> Assumption made by those functions should be still correct since guest
>>> VMCS configuration is not applied directly to real HW, but we should be
>>> careful of course. For instance interrupt queues should be cleared
>>> during nested vmexit and event transfered back to idt_vectoring_info_field.
>>> IIRC this is how nested SVM works BTW.
>>
>> Checking __vmx_complete_interrupts, the first issue I find is that type
>> 5 (privileged software exception) is not decoded, thus will be lost if
>> L2 leaves this way. That's a reason why it might be better to re-inject
>> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
>> I guess.
>>
> I do not see type 5 in SDM Table 24-15. We handle every type specified
> there. Why shouldn't we? SVM and VMX are pretty close in regards to
> event injection, this allowed us to move a lot of logic into the common
> code.

It's a type relevant for event delivery, see SDM Table 24-16.

I think we only handle what we can possibly generate. This assumption
would have to be checked and potentially resolved before we can use the
standard code for nesting as well.

Jan
Gleb Natapov Feb. 17, 2013, 5:51 p.m. UTC | #7
On Sun, Feb 17, 2013 at 06:39:51PM +0100, Jan Kiszka wrote:
> On 2013-02-17 18:35, Gleb Natapov wrote:
> > On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
> >> On 2013-02-17 17:26, Gleb Natapov wrote:
> >>> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
> >>>> On 2013-02-17 16:07, Gleb Natapov wrote:
> >>>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
> >>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>>
> >>>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
> >>>>>> vmx_complete_interrupts on L2 exits. This is required because, with
> >>>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
> >>>>>> events.
> >>>>>>
> >>>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
> >>>>>> to L0. This condition is indirectly derived from the absence of valid
> >>>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
> >>>>>> the L2 exit is not targeting L1 but L0.
> >>>>>>
> >>>>> We really need to overhaul how interrupt injection is emulated in nested
> >>>>> VMX. Why not put pending events into event queue instead of
> >>>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
> >>>>
> >>>> I was thinking about the same step but felt unsure so far if
> >>>> vmx_complete_interrupts & Co. do not include any assumptions about the
> >>>> vmcs configuration that won't match what L1 does. So I went for a
> >>>> different path first, specifically to avoid impact on these hairy bits
> >>>> for non-nested mode.
> >>>>
> >>> Assumption made by those functions should be still correct since guest
> >>> VMCS configuration is not applied directly to real HW, but we should be
> >>> careful of course. For instance interrupt queues should be cleared
> >>> during nested vmexit and event transfered back to idt_vectoring_info_field.
> >>> IIRC this is how nested SVM works BTW.
> >>
> >> Checking __vmx_complete_interrupts, the first issue I find is that type
> >> 5 (privileged software exception) is not decoded, thus will be lost if
> >> L2 leaves this way. That's a reason why it might be better to re-inject
> >> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
> >> I guess.
> >>
> > I do not see type 5 in SDM Table 24-15. We handle every type specified
> > there. Why shouldn't we? SVM and VMX are pretty close in regards to
> > event injection, this allowed us to move a lot of logic into the common
> > code.
> 
> It's a type relevant for event delivery, see 24-16.
> 
> I think we only handle what we can possibly generate. This assumption
> would have to be checked and potentially resolved before we can use the
> standard code for nesting as well.
> 
I cannot find what can generate a "privileged software exception" exit,
but on the Xen ML I found that the undocumented 0xf1 opcode (ICEBP) does it.
We should handle it regardless of nested VMX. Your patch already calls
__vmx_complete_interrupts() on the nested idt_vectoring_info, so all
potential problems that it may cause should be addressed anyway.
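
Handling it could then be a matter of adding something like this to the
decode switch (untested sketch; whether requeueing ICEBP as a plain #DB
with the instruction length is sufficient is an assumption):

	case INTR_TYPE_PRIV_SW_EXCEPTION:
		/* ICEBP (0xf1) is delivered as #DB and needs the
		 * instruction length for re-injection. */
		vmx->vcpu.arch.event_exit_inst_len =
			vmcs_read32(instr_len_field);
		kvm_requeue_exception(&vmx->vcpu, DB_VECTOR);
		break;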

--
			Gleb.
Jan Kiszka Feb. 19, 2013, 10:04 a.m. UTC | #8
On 2013-02-17 18:35, Gleb Natapov wrote:
> On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
>> On 2013-02-17 17:26, Gleb Natapov wrote:
>>> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
>>>> On 2013-02-17 16:07, Gleb Natapov wrote:
>>>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>
>>>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
>>>>>> vmx_complete_interrupts on L2 exits. This is required because, with
>>>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
>>>>>> events.
>>>>>>
>>>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
>>>>>> to L0. This condition is indirectly derived from the absence of valid
>>>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
>>>>>> the L2 exit is not targeting L1 but L0.
>>>>>>
>>>>> We really need to overhaul how interrupt injection is emulated in nested
>>>>> VMX. Why not put pending events into event queue instead of
>>>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
>>>>
>>>> I was thinking about the same step but felt unsure so far if
>>>> vmx_complete_interrupts & Co. do not include any assumptions about the
>>>> vmcs configuration that won't match what L1 does. So I went for a
>>>> different path first, specifically to avoid impact on these hairy bits
>>>> for non-nested mode.
>>>>
>>> Assumption made by those functions should be still correct since guest
>>> VMCS configuration is not applied directly to real HW, but we should be
>>> careful of course. For instance interrupt queues should be cleared
>>> during nested vmexit and event transfered back to idt_vectoring_info_field.
>>> IIRC this is how nested SVM works BTW.
>>
>> Checking __vmx_complete_interrupts, the first issue I find is that type
>> 5 (privileged software exception) is not decoded, thus will be lost if
>> L2 leaves this way. That's a reason why it might be better to re-inject
>> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
>> I guess.
>>
> I do not see type 5 in SDM Table 24-15. We handle every type specified
> there. Why shouldn't we? SVM and VMX are pretty close in regards to
> event injection, this allowed us to move a lot of logic into the common
> code.

I had a look at SVM to check how it deals with this, but I'm not sure
if I understand the logic correctly. SVM does:

static int nested_svm_vmexit(struct vcpu_svm *svm)
{
	...
	/*
	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
	 * to make sure that we do not lose injected events. So check event_inj
	 * here and copy it to exit_int_info if it is valid.
	 * Exit_int_info and event_inj can't be both valid because the case
	 * below only happens on a VMRUN instruction intercept which has
	 * no valid exit_int_info set.
	 */
	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
		struct vmcb_control_area *nc = &nested_vmcb->control;

		nc->exit_int_info     = vmcb->control.event_inj;
		nc->exit_int_info_err = vmcb->control.event_inj_err;
	}

nested_svm_vmexit is only called when we leave L2 toward L1, right? So,
vmcb->control.event_inj might have been set on last VMRUN emulation, and
if that one failed, this value shall become the nested exit_int_info. So
far, so good.

But what if that injection succeeded and we are now exiting L2 past the
execution of VMRUN, e.g. L1 intercepts the execution of some special
instruction in L2? Doesn't the nested exit_int_info now gain a stale
value? Or does the hardware clear the valid bit in EVENTINJ on
successful injection? Didn't find an indication in the spec at first
glance.

Otherwise the logic seems to be like this:
 - EVENTINJ is set to the nested value on VMRUN emulation, and only
   there (that's in contrast to current VMX, but it makes sense)
 - Interrupt completion with state transfer to the VCPU event queues is
   *only* performed on L2-to-L1 exits (that's like VMX is trying to do
   it as well)
 - There is a special case around nested.exit_required that I didn't
   fully get yet, nor can I say how it corresponds to logic in VMX.

Jan
Gleb Natapov Feb. 19, 2013, 1:13 p.m. UTC | #9
Copying Alex. He wrote nested SVM.

On Tue, Feb 19, 2013 at 11:04:01AM +0100, Jan Kiszka wrote:
> On 2013-02-17 18:35, Gleb Natapov wrote:
> > On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
> >> On 2013-02-17 17:26, Gleb Natapov wrote:
> >>> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
> >>>> On 2013-02-17 16:07, Gleb Natapov wrote:
> >>>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
> >>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>>
> >>>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
> >>>>>> vmx_complete_interrupts on L2 exits. This is required because, with
> >>>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
> >>>>>> events.
> >>>>>>
> >>>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
> >>>>>> to L0. This condition is indirectly derived from the absence of valid
> >>>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
> >>>>>> the L2 exit is not targeting L1 but L0.
> >>>>>>
> >>>>> We really need to overhaul how interrupt injection is emulated in nested
> >>>>> VMX. Why not put pending events into event queue instead of
> >>>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
> >>>>
> >>>> I was thinking about the same step but felt unsure so far if
> >>>> vmx_complete_interrupts & Co. do not include any assumptions about the
> >>>> vmcs configuration that won't match what L1 does. So I went for a
> >>>> different path first, specifically to avoid impact on these hairy bits
> >>>> for non-nested mode.
> >>>>
> >>> Assumption made by those functions should be still correct since guest
> >>> VMCS configuration is not applied directly to real HW, but we should be
> >>> careful of course. For instance interrupt queues should be cleared
> >>> during nested vmexit and event transfered back to idt_vectoring_info_field.
> >>> IIRC this is how nested SVM works BTW.
> >>
> >> Checking __vmx_complete_interrupts, the first issue I find is that type
> >> 5 (privileged software exception) is not decoded, thus will be lost if
> >> L2 leaves this way. That's a reason why it might be better to re-inject
> >> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
> >> I guess.
> >>
> > I do not see type 5 in SDM Table 24-15. We handle every type specified
> > there. Why shouldn't we? SVM and VMX are pretty close in regards to
> > event injection, this allowed us to move a lot of logic into the common
> > code.
> 
> I had a look at SVM to check how it deals with this, but I'm not sure
> if I understand the logic correctly. SVM does:
> 
> static int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> 	...
> 	/*
> 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
> 	 * to make sure that we do not lose injected events. So check event_inj
> 	 * here and copy it to exit_int_info if it is valid.
> 	 * Exit_int_info and event_inj can't be both valid because the case
> 	 * below only happens on a VMRUN instruction intercept which has
> 	 * no valid exit_int_info set.
> 	 */
> 	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
> 		struct vmcb_control_area *nc = &nested_vmcb->control;
> 
> 		nc->exit_int_info     = vmcb->control.event_inj;
> 		nc->exit_int_info_err = vmcb->control.event_inj_err;
> 	}
> 
> nested_svm_vmexit is only called when we leave L2 toward L1, right? So,
> vmcb->control.event_inj might have been set on last VMRUN emulation, and
> if that one failed, this value shall become the nested exit_int_info. So
> far, so good.
> 
> But what if that injection succeeded and we are now exiting L2 past the
> execution of VMRUN, e.g. L1 intercepts the execution of some special
> instruction in L2? Doesn't the nested exit_int_info now gain a stale
> value? Or does the hardware clear the valid bit int EVENTINJ on
> successful injection? Didn't find an indication in the spec on first
> glance.
I think it should. Otherwise, even without a nested guest, the event will
be reinjected on the next entry.

> 
> Otherwise the logic seems to be like this:
>  - EVENTINJ is set to the nested value on VMRUN emulation, and only
>    there (that's in contrast to current VMX, but it makes sense)
>  - Interrupt completion with state transfer the VCPU event queues is
>    *only* performed on L2-to-L1 exits (that's like VMX is trying to do
>    it as well)
>  - There is a special case around nested.exit_required that I didn't
>    fully get yet, nor can I say how it corresponds to logic in VMX.
> 
> Jan
> 



--
			Gleb.
Jan Kiszka Feb. 19, 2013, 1:41 p.m. UTC | #10
On 2013-02-19 14:13, Gleb Natapov wrote:
> Copying Alex. He wrote nested SVM.
> 
> On Tue, Feb 19, 2013 at 11:04:01AM +0100, Jan Kiszka wrote:
>> On 2013-02-17 18:35, Gleb Natapov wrote:
>>> On Sun, Feb 17, 2013 at 06:01:05PM +0100, Jan Kiszka wrote:
>>>> On 2013-02-17 17:26, Gleb Natapov wrote:
>>>>> On Sun, Feb 17, 2013 at 04:31:26PM +0100, Jan Kiszka wrote:
>>>>>> On 2013-02-17 16:07, Gleb Natapov wrote:
>>>>>>> On Sat, Feb 16, 2013 at 06:10:14PM +0100, Jan Kiszka wrote:
>>>>>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>>
>>>>>>>> If L1 does not set PIN_BASED_EXT_INTR_MASK, we incorrectly skipped
>>>>>>>> vmx_complete_interrupts on L2 exits. This is required because, with
>>>>>>>> direct interrupt injection from L0 to L2, L0 has to update its pending
>>>>>>>> events.
>>>>>>>>
>>>>>>>> Also, we need to allow vmx_cancel_injection when entering L2 if we left
>>>>>>>> to L0. This condition is indirectly derived from the absence of valid
>>>>>>>> vectoring info in vmcs12. We now explicitly clear it if we find out that
>>>>>>>> the L2 exit is not targeting L1 but L0.
>>>>>>>>
>>>>>>> We really need to overhaul how interrupt injection is emulated in nested
>>>>>>> VMX. Why not put pending events into event queue instead of
>>>>>>> get_vmcs12(vcpu)->idt_vectoring_info_field and inject them in usual way.
>>>>>>
>>>>>> I was thinking about the same step but felt unsure so far if
>>>>>> vmx_complete_interrupts & Co. do not include any assumptions about the
>>>>>> vmcs configuration that won't match what L1 does. So I went for a
>>>>>> different path first, specifically to avoid impact on these hairy bits
>>>>>> for non-nested mode.
>>>>>>
>>>>> Assumption made by those functions should be still correct since guest
>>>>> VMCS configuration is not applied directly to real HW, but we should be
>>>>> careful of course. For instance interrupt queues should be cleared
>>>>> during nested vmexit and event transfered back to idt_vectoring_info_field.
>>>>> IIRC this is how nested SVM works BTW.
>>>>
>>>> Checking __vmx_complete_interrupts, the first issue I find is that type
>>>> 5 (privileged software exception) is not decoded, thus will be lost if
>>>> L2 leaves this way. That's a reason why it might be better to re-inject
>>>> the content of vmcs12 if it is valid. VMX is a bit more hairy than SVM,
>>>> I guess.
>>>>
>>> I do not see type 5 in SDM Table 24-15. We handle every type specified
>>> there. Why shouldn't we? SVM and VMX are pretty close in regards to
>>> event injection, this allowed us to move a lot of logic into the common
>>> code.
>>
>> I had a look at SVM to check how it deals with this, but I'm not sure
>> if I understand the logic correctly. SVM does:
>>
>> static int nested_svm_vmexit(struct vcpu_svm *svm)
>> {
>> 	...
>> 	/*
>> 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
>> 	 * to make sure that we do not lose injected events. So check event_inj
>> 	 * here and copy it to exit_int_info if it is valid.
>> 	 * Exit_int_info and event_inj can't be both valid because the case
>> 	 * below only happens on a VMRUN instruction intercept which has
>> 	 * no valid exit_int_info set.
>> 	 */
>> 	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
>> 		struct vmcb_control_area *nc = &nested_vmcb->control;
>>
>> 		nc->exit_int_info     = vmcb->control.event_inj;
>> 		nc->exit_int_info_err = vmcb->control.event_inj_err;
>> 	}
>>
>> nested_svm_vmexit is only called when we leave L2 toward L1, right? So,
>> vmcb->control.event_inj might have been set on last VMRUN emulation, and
>> if that one failed, this value shall become the nested exit_int_info. So
>> far, so good.
>>
>> But what if that injection succeeded and we are now exiting L2 past the
>> execution of VMRUN, e.g. L1 intercepts the execution of some special
>> instruction in L2? Doesn't the nested exit_int_info now gain a stale
>> value? Or does the hardware clear the valid bit int EVENTINJ on
>> successful injection? Didn't find an indication in the spec on first
>> glance.
> I think it should. Otherwise, even without nested guest, event will be
> reinject on the next entry.

OK... there is apparently no place where event_inj is cleared (except
for cancellation).

Makes me wonder now where a difference between event_inj and
exit_int_info could come from. From the case where we did no physical
VMRUN (nested.exit_required == true)?

Jan

> 
>>
>> Otherwise the logic seems to be like this:
>>  - EVENTINJ is set to the nested value on VMRUN emulation, and only
>>    there (that's in contrast to current VMX, but it makes sense)
>>  - Interrupt completion with state transfer the VCPU event queues is
>>    *only* performed on L2-to-L1 exits (that's like VMX is trying to do
>>    it as well)
>>  - There is a special case around nested.exit_required that I didn't
>>    fully get yet, nor can I say how it corresponds to logic in VMX.
>>
>> Jan
>>
> 
> 
> 
> --
> 			Gleb.
>
Joerg Roedel Feb. 19, 2013, 4:14 p.m. UTC | #11
On Tue, Feb 19, 2013 at 11:04:01AM +0100, Jan Kiszka wrote:
> I had a look at SVM to check how it deals with this, but I'm not sure
> if I understand the logic correctly. SVM does:
> 
> static int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> 	...
> 	/*
> 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
> 	 * to make sure that we do not lose injected events. So check event_inj
> 	 * here and copy it to exit_int_info if it is valid.
> 	 * Exit_int_info and event_inj can't be both valid because the case
> 	 * below only happens on a VMRUN instruction intercept which has
> 	 * no valid exit_int_info set.
> 	 */
> 	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
> 		struct vmcb_control_area *nc = &nested_vmcb->control;
> 
> 		nc->exit_int_info     = vmcb->control.event_inj;
> 		nc->exit_int_info_err = vmcb->control.event_inj_err;
> 	}
> 
> nested_svm_vmexit is only called when we leave L2 toward L1, right?

Right.

> So, vmcb->control.event_inj might have been set on last VMRUN emulation, and
> if that one failed, this value shall become the nested exit_int_info. So
> far, so good.

Important fact here: This L2->L1 exit is emulated in the same real
#vmexit cycle as the VMRUN was emulated. So what happens is:

	1. VMRUN intercept from L1
	2. We emulate the VMRUN and load L2 state into VMCB
	3. On the way back to guest mode (to actually run the L2) we
	   detect a #vmexit condition
	4. So we roll back by calling nested_svm_vmexit()
	5. We enter the guest again which continues execution right
	   after its VMRUN instruction.

So we never actually entered L2, but for L1 it has to look like it was
in L2 and made no progress. But when coming out of a guest, event_inj is
never valid, so the special case above makes sure that the L1
hypervisor re-injects the event and it is not lost.


> But what if that injection succeeded and we are now exiting L2 past the
> execution of VMRUN, e.g. L1 intercepts the execution of some special
> instruction in L2? Doesn't the nested exit_int_info now gain a stale
> value? Or does the hardware clear the valid bit int EVENTINJ on
> successful injection? Didn't find an indication in the spec on first
> glance.

Hardware clears event_inj. If the injection was not successful the event
is reported in exit_int_info.

> Otherwise the logic seems to be like this:
>  - EVENTINJ is set to the nested value on VMRUN emulation, and only
>    there (that's in contrast to current VMX, but it makes sense)
>  - Interrupt completion with state transfer the VCPU event queues is
>    *only* performed on L2-to-L1 exits (that's like VMX is trying to do
>    it as well)
>  - There is a special case around nested.exit_required that I didn't
>    fully get yet, nor can I say how it corresponds to logic in VMX.

Which special case do you mean? There are checks in
nested_svm_check_exception() and nested_svm_intr().


Regards,

	Joerg


Jan Kiszka Feb. 19, 2013, 4:19 p.m. UTC | #12
On 2013-02-19 17:14, Joerg Roedel wrote:
> On Tue, Feb 19, 2013 at 11:04:01AM +0100, Jan Kiszka wrote:
>> I had a look at SVM to check how it deals with this, but I'm not sure
>> if I understand the logic correctly. SVM does:
>>
>> static int nested_svm_vmexit(struct vcpu_svm *svm)
>> {
>> 	...
>> 	/*
>> 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
>> 	 * to make sure that we do not lose injected events. So check event_inj
>> 	 * here and copy it to exit_int_info if it is valid.
>> 	 * Exit_int_info and event_inj can't be both valid because the case
>> 	 * below only happens on a VMRUN instruction intercept which has
>> 	 * no valid exit_int_info set.
>> 	 */
>> 	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
>> 		struct vmcb_control_area *nc = &nested_vmcb->control;
>>
>> 		nc->exit_int_info     = vmcb->control.event_inj;
>> 		nc->exit_int_info_err = vmcb->control.event_inj_err;
>> 	}
>>
>> nested_svm_vmexit is only called when we leave L2 toward L1, right?
> 
> Right.
> 
>> So, vmcb->control.event_inj might have been set on last VMRUN emulation, and
>> if that one failed, this value shall become the nested exit_int_info. So
>> far, so good.
> 
> Important fact here: This L2->L1 exit is emulated in the same real
> #vmexit cycle as the VMRUN was emulated. So what happens is:
> 
> 	1. VMRUN intercept from L1
> 	2. We emulate the VMRUN and load L2 state into VMCB
> 	3. On the way back to guest mode (to actually run the L2) we
> 	   detect a #vmexit condition
> 	4. So we roll-back by calling nested_svm_vmexit()
> 	5. We enter the guest again which continues execution right
> 	   after its VMRUN instruction.
> 
> So we never actually entered L2, but for L1 it has to look like it was
> in L2 and made no progress. But when coming out of a guest event_inj is
> never valid, so without the special case above we make sure that the L1
> hypervisor re-injects the event so it is not lost.

Thanks for explaining. Already assumed this in my other mail. Makes
perfect sense now.

Need to rethink if/how this maps onto VMX and whether there is something
special on that side.

> 
> 
>> But what if that injection succeeded and we are now exiting L2 past the
>> execution of VMRUN, e.g. L1 intercepts the execution of some special
>> instruction in L2? Doesn't the nested exit_int_info now gain a stale
>> value? Or does the hardware clear the valid bit int EVENTINJ on
>> successful injection? Didn't find an indication in the spec on first
>> glance.
> 
> Hardware clears event_inj. If the injection was not successful the event
> is reported in exit_int_info.
> 
>> Otherwise the logic seems to be like this:
>>  - EVENTINJ is set to the nested value on VMRUN emulation, and only
>>    there (that's in contrast to current VMX, but it makes sense)
>>  - Interrupt completion with state transfer the VCPU event queues is
>>    *only* performed on L2-to-L1 exits (that's like VMX is trying to do
>>    it as well)
>>  - There is a special case around nested.exit_required that I didn't
>>    fully get yet, nor can I say how it corresponds to logic in VMX.
> 
> Which special case do you mean? There are checks in
> nested_svm_check_exception() and nested_svm_intr().

What you explained above. It wasn't clear to me at that point when
exit_required is set and what the implications are.

Jan

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68a045ae..464b6a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -624,6 +624,7 @@  static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -6213,9 +6214,19 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	else
 		vmx->nested.nested_run_pending = 0;
 
-	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu);
-		return 1;
+	if (is_guest_mode(vcpu)) {
+		if (nested_vmx_exit_handled(vcpu)) {
+			nested_vmx_vmexit(vcpu);
+			return 1;
+		}
+		/*
+		 * Now it's clear, we are leaving to L0. Perform the postponed
+		 * interrupt completion and clear L1's vectoring info field so
+		 * that we do not overwrite what L0 wants to inject on
+		 * re-entry.
+		 */
+		vmx_complete_interrupts(vmx);
+		get_vmcs12(vcpu)->idt_vectoring_info_field = 0;
 	}
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
@@ -6495,8 +6506,6 @@  static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-	if (is_guest_mode(&vmx->vcpu))
-		return;
 	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
@@ -6504,7 +6513,9 @@  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu))
+	if (is_guest_mode(vcpu) &&
+	    get_vmcs12(vcpu)->idt_vectoring_info_field &
+			VECTORING_INFO_VALID_MASK)
 		return;
 	__vmx_complete_interrupts(to_vmx(vcpu),
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -6710,6 +6721,14 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	vmx->loaded_vmcs->launched = 1;
+
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+
+	vmx_complete_atomic_exit(vmx);
+	vmx_recover_nmi_blocking(vmx);
+
 	if (is_guest_mode(vcpu)) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 		vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
@@ -6719,16 +6738,8 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 			vmcs12->vm_exit_instruction_len =
 				vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 		}
-	}
-
-	vmx->loaded_vmcs->launched = 1;
-
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
-
-	vmx_complete_atomic_exit(vmx);
-	vmx_recover_nmi_blocking(vmx);
-	vmx_complete_interrupts(vmx);
+	} else
+		vmx_complete_interrupts(vmx);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)