diff mbox

[v4,6/6] KVM: nVMX: Enable nested posted interrupt processing

Message ID CACzj_yUUuR6grQyJWLAajay1BgNduUVWRRf5+AKKLjXTUmzZJQ@mail.gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wincy Van Jan. 28, 2015, 4:02 p.m. UTC
If a vcpu has an interrupt pending in vmx non-root mode, we
kick that vcpu to inject the interrupt in a timely manner. With
posted interrupt processing, the kick is not needed, and
interrupts are fully taken care of by hardware.

In nested vmx, this feature avoids many more vmexits
than non-nested vmx.

This patch uses L0's POSTED_INTR_NV to avoid an unexpected
interrupt if L1's vector is different from L0's. If the vcpu
is in hardware's non-root mode, we use a physical IPI to
deliver posted interrupts; otherwise we deliver the
interrupt to L1 and kick that vcpu out of nested
non-root mode.

Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
---
 arch/x86/kvm/vmx.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 81 insertions(+), 3 deletions(-)

        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2353,6 +2365,9 @@ static void nested_vmx_setup_ctls_msrs(struct
vcpu_vmx *vmx)
        vmx->nested.nested_vmx_pinbased_ctls_high |=
                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
+       if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+               vmx->nested.nested_vmx_pinbased_ctls_high |=
+                       PIN_BASED_POSTED_INTR;

        /* exit controls */
        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
@@ -4304,6 +4319,19 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
        return enable_apicv && irqchip_in_kernel(kvm);
 }

+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+                                               int vector)
+{
+       if (is_guest_mode(vcpu) &&
+           vector == to_vmx(vcpu)->nested.posted_intr_nv &&
+           vcpu->mode == IN_GUEST_MODE) {
+               /* the PIR and ON have been set by L1. */
+               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+                               POSTED_INTR_VECTOR);
+               return 0;
+       }
+       return -1;
+}
 /*
  * Send interrupt to vcpu via posted interrupt way.
  * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4316,6 +4344,10 @@ static void vmx_deliver_posted_interrupt(struct
kvm_vcpu *vcpu, int vector)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;

+       r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+       if (!r)
+               return;
+
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
                return;

@@ -6561,6 +6593,7 @@ static inline void nested_release_vmcs12(struct
vcpu_vmx *vmx)
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
+       vmx->nested.posted_intr_nv = -1;
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@ -6589,6 +6622,10 @@ static void free_nested(struct vcpu_vmx *vmx)
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
+       if (vmx->nested.pi_desc_page) {
+               nested_release_page(vmx->nested.pi_desc_page);
+               vmx->nested.pi_desc_page = NULL;
+       }

        nested_free_all_saved_vmcss(vmx);
 }
@@ -8182,6 +8219,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct
kvm *kvm, unsigned int id)
        if (nested)
                nested_vmx_setup_ctls_msrs(vmx);

+       vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;

@@ -8415,6 +8453,19 @@ static bool nested_get_vmcs12_pages(struct
kvm_vcpu *vcpu,
                        return false;
        }

+       if (nested_cpu_has_posted_intr(vmcs12)) {
+               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+                       return false;
+
+               if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+                       nested_release_page(vmx->nested.pi_desc_page);
+               }
+               vmx->nested.pi_desc_page =
+                       nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
+               if (!vmx->nested.pi_desc_page)
+                       return false;
+       }
+
        return true;
 }

@@ -8550,7 +8601,8 @@ static int
nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 {
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
            !nested_cpu_has_apic_reg_virt(vmcs12) &&
-           !nested_cpu_has_vid(vmcs12))
+           !nested_cpu_has_vid(vmcs12) &&
+           !nested_cpu_has_posted_intr(vmcs12))
                return 0;

        /*
@@ -8569,6 +8621,17 @@ static int
nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
           !nested_exit_on_intr(vcpu))
                return -EINVAL;

+       /*
+        * bits 15:8 should be zero in posted_intr_nv,
+        * the descriptor address has been already checked
+        * in nested_get_vmcs12_pages.
+        */
+       if (nested_cpu_has_posted_intr(vmcs12) &&
+          (!nested_cpu_has_vid(vmcs12) ||
+           !nested_exit_intr_ack_set(vcpu) ||
+           vmcs12->posted_intr_nv & 0xff00))
+               return -EINVAL;
+
        /* tpr shadow is needed by all apicv features. */
        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
                return -EINVAL;
@@ -8811,8 +8874,19 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)

        exec_control = vmcs12->pin_based_vm_exec_control;
        exec_control |= vmcs_config.pin_based_exec_ctrl;
-       exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
-                          PIN_BASED_POSTED_INTR);
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+       if (nested_cpu_has_posted_intr(vmcs12)) {
+               /* Note that we use L0's vector to avoid unexpected intr. */
+               vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+               vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+               vmcs_write64(POSTED_INTR_DESC_ADDR,
+                       page_to_phys(vmx->nested.pi_desc_page) +
+                       (unsigned long)(vmcs12->posted_intr_desc_addr &
+                       (PAGE_SIZE - 1)));
+       } else
+               exec_control &= ~PIN_BASED_POSTED_INTR;
+
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);

        vmx->nested.preemption_timer_expired = false;
@@ -9728,6 +9802,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu
*vcpu, u32 exit_reason,
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
+       if (vmx->nested.pi_desc_page) {
+               nested_release_page(vmx->nested.pi_desc_page);
+               vmx->nested.pi_desc_page = NULL;
+       }

        /*
         * We are now running in L2, mmu_notifier will force to reload the
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Paolo Bonzini Feb. 2, 2015, 11:03 a.m. UTC | #1
On 28/01/2015 17:02, Wincy Van wrote:
> +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
> +                                               int vector)
> +{
> +       if (is_guest_mode(vcpu) &&
> +           vector == to_vmx(vcpu)->nested.posted_intr_nv &&
> +           vcpu->mode == IN_GUEST_MODE) {
> +               /* the PIR and ON have been set by L1. */

What happens if there is a L2->L0->L2 exit on the target VCPU, and the
guest exits before apic->send_IPI_mask sends the IPI?

The L1 hypervisor might "know" somehow that there cannot be a concurrent
L2->L1->L2 exit, and not do the equivalent of KVM's

        kvm_make_request(KVM_REQ_EVENT, vcpu);

after it sets ON.

So I think you have to do something like

static bool vmx_is_nested_posted_interrupt(struct kvm_vcpu *vcpu,
					   int vector)
{
	return (is_guest_mode(vcpu) &&
                vector == to_vmx(vcpu)->nested.posted_intr_nv);
}

and in vmx_deliver_posted_interrupt:

	r = 0;
	if (!vmx_is_nested_posted_interrupt(vcpu, vector)) {
	        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
	                return;

	        r = pi_test_and_set_on(&vmx->pi_desc);
	}
        kvm_make_request(KVM_REQ_EVENT, vcpu);
#ifdef CONFIG_SMP
        if (!r && (vcpu->mode == IN_GUEST_MODE))
                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                POSTED_INTR_VECTOR);
        else
#endif
                kvm_vcpu_kick(vcpu);


What do you think?

Paolo

> +               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> +                               POSTED_INTR_VECTOR);
> +               return 0;
> +       }
> +       return -1;
> +}
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wincy Van Feb. 2, 2015, 3:33 p.m. UTC | #2
On Mon, Feb 2, 2015 at 7:03 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 28/01/2015 17:02, Wincy Van wrote:
>> +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
>> +                                               int vector)
>> +{
>> +       if (is_guest_mode(vcpu) &&
>> +           vector == to_vmx(vcpu)->nested.posted_intr_nv &&
>> +           vcpu->mode == IN_GUEST_MODE) {
>> +               /* the PIR and ON have been set by L1. */
>
> What happens if there is a L2->L0->L2 exit on the target VCPU, and the
> guest exits before apic->send_IPI_mask sends the IPI?
>
> The L1 hypervisor might "know" somehow that there cannot be a concurrent
> L2->L1->L2 exit, and not do the equivalent of KVM's
>

In the non-nested case, if a posted interrupt was not accomplished
by hardware, we will sync the PIR to IRR and set RVI to accomplish it.
The current implementation may cause some of the nested posted
interrupts to be delayed for a short time (waiting for a nested vmexit).

>         kvm_make_request(KVM_REQ_EVENT, vcpu);
>
> after it sets ON.
>
> So I think you have to do something like
>
> static bool vmx_is_nested_posted_interrupt(struct kvm_vcpu *vcpu,
>                                            int vector)
> {
>         return (is_guest_mode(vcpu) &&
>                 vector == to_vmx(vcpu)->nested.posted_intr_nv);
> }
>
> and in vmx_deliver_posted_interrupt:
>
>         r = 0;
>         if (!vmx_is_nested_posted_interrupt(vcpu, vector)) {
>                 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
>                         return;
>
>                 r = pi_test_and_set_on(&vmx->pi_desc);
>         }
>         kvm_make_request(KVM_REQ_EVENT, vcpu);
> #ifdef CONFIG_SMP
>         if (!r && (vcpu->mode == IN_GUEST_MODE))
>                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>                                 POSTED_INTR_VECTOR);
>         else
> #endif
>                 kvm_vcpu_kick(vcpu);
>
>
> What do you think?
>

I think that there would be a way to avoid that delay, but it may hurt performance:
When doing nested posted intr, we can set a request bit:

       if (is_guest_mode(vcpu) &&
            vector == to_vmx(vcpu)->nested.posted_intr_nv &&
            vcpu->mode == IN_GUEST_MODE) {
                /* the PIR and ON have been set by L1. */
                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                POSTED_INTR_VECTOR);
+                kvm_make_request(KVM_REQ_ACCOMP_POSTED_INTR, vcpu);
                return 0;
        }

If a posted intr was not accomplished by hardware,  we can check that
bit before checking KVM_REQ_EVENT, and if that bit is set, we can do:

static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (is_guest_mode(vcpu) &&
            vmx->nested.posted_intr_nv != -1 &&
            pi_test_on(vmx->nested.pi_desc))
                kvm_apic_set_irr(vcpu,
                        vmx->nested.posted_intr_nv);
}

Then we will get an nested-vmexit in vmx_check_nested_events, that
posted intr will be handled by L1 immediately.
This mechanism will also emulate the hardware's behavior: If a posted
intr was not accomplished by hardware, we will get an interrupt with
POSTED_INTR_NV.

Would this be better?

Thanks,
Wincy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 2, 2015, 4:14 p.m. UTC | #3
On 02/02/2015 16:33, Wincy Van wrote:
> static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu)
> {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
>         if (is_guest_mode(vcpu) &&
>             vmx->nested.posted_intr_nv != -1 &&
>             pi_test_on(vmx->nested.pi_desc))
>                 kvm_apic_set_irr(vcpu,
>                         vmx->nested.posted_intr_nv);
> }
> 
> Then we will get an nested-vmexit in vmx_check_nested_events, that
> posted intr will be handled by L1 immediately.
> This mechanism will also emulate the hardware's behavior: If a posted
> intr was not accomplished by hardware, we will get an interrupt with
> POSTED_INTR_NV.

Yes.

> Would this be better?

I think you do not even need a new bit.  You can use KVM_REQ_EVENT and
(to complete my suggestion, which was not enough) do the above in
vmx_check_nested_events.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zhang, Yang Z Feb. 3, 2015, 1:21 a.m. UTC | #4
Paolo Bonzini wrote on 2015-02-03:
> 

> 

> On 02/02/2015 16:33, Wincy Van wrote:

>> static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu) {

>>         struct vcpu_vmx *vmx = to_vmx(vcpu);

>>         

>>         if (is_guest_mode(vcpu) &&

>>             vmx->nested.posted_intr_nv != -1 &&

>>             pi_test_on(vmx->nested.pi_desc))

>>                 kvm_apic_set_irr(vcpu,

>>                         vmx->nested.posted_intr_nv); }

>> Then we will get an nested-vmexit in vmx_check_nested_events, that

>> posted intr will be handled by L1 immediately.

>> This mechanism will also emulate the hardware's behavior: If a

>> posted intr was not accomplished by hardware, we will get an


Actually, we cannot say "not accomplished by hardware". It's more like we don't do the job well. See my answer below.

>> interrupt with POSTED_INTR_NV.

> 

> Yes.


This is not enough. From L1's point of view, L2 is in vmx non-root mode. So we should emulate the posted interrupt in L0 correctly, say:
1. clear ON bit
2. ack interrupt
3. sync pir to virr
4. update RVI.
Then let the hardware (virtual interrupt delivery) accomplish the interrupt injection.

Forcing a vmexit is more like a trick. It's better to follow the hardware's behavior unless we cannot do it.

> 

>> Would this be better?

> 

> I think you do not even need a new bit.  You can use KVM_REQ_EVENT and

> (to complete my suggestion, which was not enough) do the above in

> vmx_check_nested_events.

> 

> Paolo



Best regards,
Yang
Wincy Van Feb. 3, 2015, 3:32 a.m. UTC | #5
On Tue, Feb 3, 2015 at 9:21 AM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
> Paolo Bonzini wrote on 2015-02-03:
>>
>>
>> On 02/02/2015 16:33, Wincy Van wrote:
>>> static void vmx_accomp_nested_posted_intr(struct kvm_vcpu *vcpu) {
>>>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>
>>>         if (is_guest_mode(vcpu) &&
>>>             vmx->nested.posted_intr_nv != -1 &&
>>>             pi_test_on(vmx->nested.pi_desc))
>>>                 kvm_apic_set_irr(vcpu,
>>>                         vmx->nested.posted_intr_nv); }
>>> Then we will get an nested-vmexit in vmx_check_nested_events, that
>>> posted intr will be handled by L1 immediately.
>>> This mechanism will also emulate the hardware's behavior: If a
>>> posted intr was not accomplished by hardware, we will get an
>
> Actually, we cannot say "not accomplished by hardware". It more like we don't do the job well. See my below answer.
>

Yes, exactly.

>>> interrupt with POSTED_INTR_NV.
>>
>> Yes.
>
> This is not enough. From L1's point, L2 is in vmx non-root mode. So we should emulate the posted interrupt in L0 correctly, say:
> 1. clear ON bit
> 2. ack interrupt
> 3, syn pir to virr
> 4. update RVI.
> Then let the hardware(virtual interrupt delivery) to accomplish interrupt injection.
>
> Force a vmexit more like a trick. It's better to follow the hardware's behavior unless we cannot do it.
>

Yes, I will try again to do this.


Thanks,
Wincy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ab131f3..85a163c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,7 @@  struct __packed vmcs12 {
        u64 tsc_offset;
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
+       u64 posted_intr_desc_addr;
        u64 ept_pointer;
        u64 eoi_exit_bitmap0;
        u64 eoi_exit_bitmap1;
@@ -334,6 +335,7 @@  struct __packed vmcs12 {
        u32 vmx_preemption_timer_value;
        u32 padding32[7]; /* room for future expansion */
        u16 virtual_processor_id;
+       u16 posted_intr_nv;
        u16 guest_es_selector;
        u16 guest_cs_selector;
        u16 guest_ss_selector;
@@ -406,6 +408,8 @@  struct nested_vmx {
         */
        struct page *apic_access_page;
        struct page *virtual_apic_page;
+       struct page *pi_desc_page;
+       u16 posted_intr_nv;
        u64 msr_ia32_feature_control;

        struct hrtimer preemption_timer;
@@ -621,6 +625,7 @@  static int max_shadow_read_write_fields =

 static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+       FIELD(POSTED_INTR_NV, posted_intr_nv),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
        FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
        FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -646,6 +651,7 @@  static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(TSC_OFFSET, tsc_offset),
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+       FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
        FIELD64(EPT_POINTER, ept_pointer),
        FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
        FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -798,6 +804,7 @@  static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -1150,6 +1157,11 @@  static inline bool nested_cpu_has_vid(struct
vmcs12 *vmcs12)
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 }

+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+       return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool is_exception(u32 intr_info)
 {