diff mbox

[PART1,RFC,6/9] svm: Add interrupt injection via AVIC

Message ID 1455285574-27892-7-git-send-email-suravee.suthikulpanit@amd.com (mailing list archive)
State New, archived
Headers show

Commit Message

Suthikulpanit, Suravee Feb. 12, 2016, 1:59 p.m. UTC
From: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>

VINTR is not supported when enable AVIC. Therefore, we need to inject
interrupt via APIC backing page instead. Also, adding AVIC doorbell
support to signal running vcpu to check IRR for injected interrupts.

This patch also introduces kvm_x86_ops.apicv_intr_pending() to allow SVM
to provide a function hook to query AVIC interrupt pending status.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 arch/x86/include/asm/kvm_host.h  |   2 +
 arch/x86/include/asm/msr-index.h |   1 +
 arch/x86/kvm/lapic.c             |   3 +-
 arch/x86/kvm/svm.c               | 110 +++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/x86.c               |   4 +-
 5 files changed, 114 insertions(+), 6 deletions(-)

Comments

Suthikulpanit, Suravee Feb. 12, 2016, 3:54 p.m. UTC | #1
Hi,

On 02/12/2016 09:16 PM, Borislav Petkov wrote:
>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>> >index b05402e..605b869 100644
>> >--- a/arch/x86/include/asm/msr-index.h
>> >+++ b/arch/x86/include/asm/msr-index.h
>> >@@ -284,6 +284,7 @@
>> >  #define MSR_AMD64_TSC_RATIO		0xc0000104
>> >  #define MSR_AMD64_NB_CFG		0xc001001f
>> >  #define MSR_AMD64_PATCH_LOADER		0xc0010020
>> >+#define MSR_AMD64_AVIC_DOORBELL		0xc001011b
> AFAICT, that MSR is being used only in arch/x86/kvm/lapic.c
>
> Please add it there.

Do you mean in the arch/x86/kvm/svm.c? If so, sure, I'll only put it there.

Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2016, 3:55 p.m. UTC | #2
On 12/02/2016 14:59, Suravee Suthikulpanit wrote:
> +static bool avic_check_irr_pending(struct kvm_vcpu *vcpu)
> +{
> +	int i;
> +	u32 irr;
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	for (i = 0; i < 8; i++) {
> +		irr = *(avic_get_bk_page_entry(svm,
> +					APIC_IRR + (0x10 * i)));
> +		if (irr)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool svm_avic_check_ppr(struct vcpu_svm *svm)
> +{
> +	u32 tpr = *(avic_get_bk_page_entry(svm, APIC_TASKPRI));
> +	u32 ppr = *(avic_get_bk_page_entry(svm, APIC_PROCPRI));
> +
> +	if (ppr && (ppr != tpr))
> +		return true;
> +
> +	return false;
> +}
> +
> +/* Note: Returns true means do not block */
> +static bool svm_apicv_intr_pending (struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!avic)
> +		return false;
> +
> +	if (atomic_read(&svm->avic_pending_cnt))
> +		return true;
> +
> +	return avic_check_irr_pending(vcpu);
> +}
> +
> +static void avic_post_vmrun(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!avic)
> +		return;
> +
> +	if (atomic_read(&svm->avic_pending_cnt)) {
> +		if (svm_avic_check_ppr(svm))
> +			return;
> +		if (avic_check_irr_pending(vcpu))
> +			return;
> +		/*
> +		 * At this point, if there is no interrupt pending.
> +		 * So, we decrement the pending count
> +		 */
> +		atomic_dec(&svm->avic_pending_cnt);
> +	}
> +}
> +
>  static void svm_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4588,6 +4686,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
>  	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
>  		kvm_after_handle_nmi(&svm->vcpu);
>  
> +	avic_post_vmrun(vcpu);
> +
>  	sync_cr8_to_lapic(vcpu);
>  
>  	svm->next_rip = 0;
> @@ -5050,7 +5150,9 @@ static struct kvm_x86_ops svm_x86_ops = {
>  
>  	.sched_in = svm_sched_in,
>  
> +	.apicv_intr_pending = svm_apicv_intr_pending,
>  	.pmu_ops = &amd_pmu_ops,
> +	.deliver_posted_interrupt = svm_deliver_avic_intr,
>  };
>  
>  static int __init svm_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4244c2b..2def290 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8087,7 +8087,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
>  		kvm_x86_ops->check_nested_events(vcpu, false);
>  
> -	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
> +	return (kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu) ||
> +		(kvm_x86_ops->apicv_intr_pending &&
> +		 kvm_x86_ops->apicv_intr_pending(vcpu)));
>  }

I think this is not necessary.  What you need is to make kvm_lapic's
regs field point to the backing page.  Then when the processor writes to
IRR, kvm_apic_has_interrupt (called through kvm_vcpu_has_events) will
see it.

avic_pending_cnt shouldn't be necessary either.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee Feb. 12, 2016, 4:21 p.m. UTC | #3
Hi Paolo,

On 02/12/2016 10:55 PM, Paolo Bonzini wrote:
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >index 4244c2b..2def290 100644
>> >--- a/arch/x86/kvm/x86.c
>> >+++ b/arch/x86/kvm/x86.c
>> >@@ -8087,7 +8087,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>> >  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
>> >  		kvm_x86_ops->check_nested_events(vcpu, false);
>> >
>> >-	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
>> >+	return (kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu) ||
>> >+		(kvm_x86_ops->apicv_intr_pending &&
>> >+		 kvm_x86_ops->apicv_intr_pending(vcpu)));
>> >  }
> I think this is not necessary.  What you need is to make kvm_lapic's
> regs field point to the backing page.  Then when the processor writes to
> IRR, kvm_apic_has_interrupt (called through kvm_vcpu_has_events) will
> see it.
>
> avic_pending_cnt shouldn't be necessary either.
>
> Paolo

So, the other thing I am using the avic_pending_cnt for is for the part 
2 of the series (to enable AVIC support in IOMMU) that I am planning to 
send out later. However, it might be good to discuss this at this point.

When the IOMMU cannot inject interrupts into the guest vcpu due to it is 
not running (therefore, it cannot doorbell the vcpu directly), it logs 
the interrupt in the GA log buffer. Then it generates interrupt to 
notify the IOMMU driver that it needs to handle the log entry. Here, the 
IOMMU driver will end up notifying the SVM to scheduling the VCPU in to 
process interrupt.

Here, I have run into issue where the vcpu often goes into idle (i.e. 
scheduled out), and ended up causing IOMMU to generate a lot of the 
entries in the GA log. This really hurts device pass-through performance 
(e.g. for XGBE NIC).

So, what I ended up experimenting with is to set the avic_pending_cnt to 
a larger value (i.e. avic_ga_log_threshold) whenever we processing the 
GA log entry. The intention is to delay the vcpu schedule out in 
expecting that there might be more interrupts coming in soon. I also 
make this threshold value tunable as a module_param.

This actually works well in my experiment, where I can actually get 
about 5% speed up in my netperf test on XGBE NIC pass-through test.
However, I am not sure if this is an acceptable approach. Actually, I 
think it's similar to the halt_poll_ns, but specifically for IOMMU GA 
log in this case.

Let me know what you think.

Thanks,
Suravee


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov Feb. 12, 2016, 5:14 p.m. UTC | #4
On Fri, Feb 12, 2016 at 10:54:43PM +0700, Suravee Suthikulpanit wrote:
> Do you mean in the arch/x86/kvm/svm.c?

Ah, it is svm.c. Yes, where it is being used.
Paolo Bonzini Feb. 12, 2016, 6:19 p.m. UTC | #5
On 12/02/2016 17:21, Suravee Suthikulpanit wrote:
> Hi Paolo,
> 
> On 02/12/2016 10:55 PM, Paolo Bonzini wrote:
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> >index 4244c2b..2def290 100644
>>> >--- a/arch/x86/kvm/x86.c
>>> >+++ b/arch/x86/kvm/x86.c
>>> >@@ -8087,7 +8087,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>>> >      if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
>>> >          kvm_x86_ops->check_nested_events(vcpu, false);
>>> >
>>> >-    return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
>>> >+    return (kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu) ||
>>> >+        (kvm_x86_ops->apicv_intr_pending &&
>>> >+         kvm_x86_ops->apicv_intr_pending(vcpu)));
>>> >  }
>> I think this is not necessary.  What you need is to make kvm_lapic's
>> regs field point to the backing page.  Then when the processor writes to
>> IRR, kvm_apic_has_interrupt (called through kvm_vcpu_has_events) will
>> see it.
>>
>> avic_pending_cnt shouldn't be necessary either.
>>
>> Paolo
> 
> So, the other thing I am using the avic_pending_cnt for is for the part
> 2 of the series (to enable AVIC support in IOMMU) that I am planning to
> send out later. However, it might be good to discuss this at this point.

It's better to discuss it later.  For now, I would prefer the AVIC
patches to be as clean as possible, and not know about the IOMMU at all.
 Also, there are a lot of assumptions about how to use kvm_lapic's regs
field for APIC virtualization---dating back to when Intel only
virtualized the TPR field.  Deviating for that would be a recipe for
trouble. :)

Regarding the IOMMU, I'm actually very happy with the way the Intel VT-d
posted interrupts patches worked out, so I would be even more happy if
everything you do fits in the same scheme and reuses the same hooks! :D

> When the IOMMU cannot inject interrupts into the guest vcpu due to it is
> not running (therefore, it cannot doorbell the vcpu directly), it logs
> the interrupt in the GA log buffer.

Where is this documented?

> Then it generates interrupt to
> notify the IOMMU driver that it needs to handle the log entry. Here, the
> IOMMU driver will end up notifying the SVM to scheduling the VCPU in to
> process interrupt.
> 
> Here, I have run into issue where the vcpu often goes into idle (i.e.
> scheduled out), and ended up causing IOMMU to generate a lot of the
> entries in the GA log. This really hurts device pass-through performance
> (e.g. for XGBE NIC).
> 
> So, what I ended up experimenting with is to set the avic_pending_cnt to
> a larger value (i.e. avic_ga_log_threshold) whenever we processing the
> GA log entry. The intention is to delay the vcpu schedule out in
> expecting that there might be more interrupts coming in soon. I also
> make this threshold value tunable as a module_param.
> 
> This actually works well in my experiment, where I can actually get
> about 5% speed up in my netperf test on XGBE NIC pass-through test.
> However, I am not sure if this is an acceptable approach. Actually, I
> think it's similar to the halt_poll_ns, but specifically for IOMMU GA
> log in this case.

Have you retested now that the halt_poll_ns mechanism is dynamic and
enabled by default?  If I read patch 9 right, halt_poll_ns would delay
vcpu_put and IsRunning=0.  Hopefully this is enough to avoid this kind
of notification and make the issue moot.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2016, 6:21 p.m. UTC | #6
On 12/02/2016 18:14, Borislav Petkov wrote:
> On Fri, Feb 12, 2016 at 10:54:43PM +0700, Suravee Suthikulpanit wrote:
>> > Do you mean in the arch/x86/kvm/svm.c?
> Ah, it is svm.c. Yes, where it is being used.

Hmm, currently things such as MSR_VM_HSAVE_PA are defined in msr-index.h.

It's okay for me to move them to kvm_host.h or similar, but they should
all be treated the same.  Right now this means adding the doorbell MSR
to msr-index.h.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov Feb. 12, 2016, 6:30 p.m. UTC | #7
On Fri, Feb 12, 2016 at 07:21:17PM +0100, Paolo Bonzini wrote:
> Hmm, currently things such as MSR_VM_HSAVE_PA are defined in msr-index.h.

But that one is used in 3 files AFAICT.

> It's okay for me to move them to kvm_host.h or similar, but they should
> all be treated the same.  Right now this means adding the doorbell MSR
> to msr-index.h.

Only if it is used in multiple files. The doorbell thing is used once in
svm.c.

We don't want to make msr-index.h an encyclopedia of any MSR ever
defined :-)
Paolo Bonzini Feb. 12, 2016, 6:56 p.m. UTC | #8
On 12/02/2016 19:30, Borislav Petkov wrote:
> On Fri, Feb 12, 2016 at 07:21:17PM +0100, Paolo Bonzini wrote:
>> Hmm, currently things such as MSR_VM_HSAVE_PA are defined in msr-index.h.
> 
> But that one is used in 3 files AFAICT.

Ok, next examples: MSR_VM_CR and MSR_VM_IGNNE. :)

>> It's okay for me to move them to kvm_host.h or similar, but they should
>> all be treated the same.  Right now this means adding the doorbell MSR
>> to msr-index.h.
> 
> Only if it is used in multiple files. The doorbell thing is used once in
> svm.c.
> 
> We don't want to make msr-index.h an encyclopedia of any MSR ever
> defined :-)

Are you okay with moving all the SVM MSRs to virtext.h instead?

Thanks,

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee Feb. 12, 2016, 7:36 p.m. UTC | #9
Hi

On 2/13/16 01:19, Paolo Bonzini wrote:
>> When the IOMMU cannot inject interrupts into the guest vcpu due to it is
>> >not running (therefore, it cannot doorbell the vcpu directly), it logs
>> >the interrupt in the GA log buffer.
> Where is this documented?
>

http://support.amd.com/TechDocs/48882_IOMMU.pdf

Regards,
Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee Feb. 19, 2016, 11:57 a.m. UTC | #10
Hi Paolo,

On 2/13/16 01:19, Paolo Bonzini wrote:
>
>
> On 12/02/2016 17:21, Suravee Suthikulpanit wrote:
>> Hi Paolo,
>>
>> On 02/12/2016 10:55 PM, Paolo Bonzini wrote:
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>> index 4244c2b..2def290 100644
>>>>> --- a/arch/x86/kvm/x86.c
>>>>> +++ b/arch/x86/kvm/x86.c
>>>>> @@ -8087,7 +8087,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>>>>>       if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
>>>>>           kvm_x86_ops->check_nested_events(vcpu, false);
>>>>>
>>>>> -    return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
>>>>> +    return (kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu) ||
>>>>> +        (kvm_x86_ops->apicv_intr_pending &&
>>>>> +         kvm_x86_ops->apicv_intr_pending(vcpu)));
>>>>>   }
>>> I think this is not necessary.  What you need is to make kvm_lapic's
>>> regs field point to the backing page.  Then when the processor writes to
>>> IRR, kvm_apic_has_interrupt (called through kvm_vcpu_has_events) will
>>> see it.
>>>
>>> avic_pending_cnt shouldn't be necessary either.
>>>
>>> Paolo

Actually, I also found out during another benchmark (running tar xf 
linux.tar.gz) on the VM w/ multiple cpus, that the performance is quite 
bad due to large amount of AVIC_INCOMP_IPI vmexit for to target not 
running. The same issue does not happen with 1 vcpu, or taskset the tar 
process to one vcpu, or if I put in the logic above in 
kvm_arch_vcpu_runnable() to delay the halting.

>>
>> So, the other thing I am using the avic_pending_cnt for is for the part
>> 2 of the series (to enable AVIC support in IOMMU) that I am planning to
>> send out later. However, it might be good to discuss this at this point.
>
> It's better to discuss it later.  For now, I would prefer the AVIC
> patches to be as clean as possible, and not know about the IOMMU at all.
>   Also, there are a lot of assumptions about how to use kvm_lapic's regs
> field for APIC virtualization---dating back to when Intel only
> virtualized the TPR field.  Deviating for that would be a recipe for
> trouble. :)
>
> Regarding the IOMMU, I'm actually very happy with the way the Intel VT-d
> posted interrupts patches worked out, so I would be even more happy if
> everything you do fits in the same scheme and reuses the same hooks! :D
>
>> When the IOMMU cannot inject interrupts into the guest vcpu due to it is
>> not running (therefore, it cannot doorbell the vcpu directly), it logs
>> the interrupt in the GA log buffer.
>>
>> Then it generates interrupt to
>> notify the IOMMU driver that it needs to handle the log entry. Here, the
>> IOMMU driver will end up notifying the SVM to scheduling the VCPU in to
>> process interrupt.
>>
>> Here, I have run into issue where the vcpu often goes into idle (i.e.
>> scheduled out), and ended up causing IOMMU to generate a lot of the
>> entries in the GA log. This really hurts device pass-through performance
>> (e.g. for XGBE NIC).
>>
>> So, what I ended up experimenting with is to set the avic_pending_cnt to
>> a larger value (i.e. avic_ga_log_threshold) whenever we processing the
>> GA log entry. The intention is to delay the vcpu schedule out in
>> expecting that there might be more interrupts coming in soon. I also
>> make this threshold value tunable as a module_param.
>>
>> This actually works well in my experiment, where I can actually get
>> about 5% speed up in my netperf test on XGBE NIC pass-through test.
>> However, I am not sure if this is an acceptable approach. Actually, I
>> think it's similar to the halt_poll_ns, but specifically for IOMMU GA
>> log in this case.
>
> Have you retested now that the halt_poll_ns mechanism is dynamic and
> enabled by default?  If I read patch 9 right, halt_poll_ns would delay
> vcpu_put and IsRunning=0.  Hopefully this is enough to avoid this kind
> of notification and make the issue moot.
>
> Paolo
>
I assume that the halt_poll_ns mechanism would have been already enabled 
by default as off commit 93c9247cfd1e608e262274616a28632681abb2d3.

So, I have tried playing with halt_poll_ns, halt_poll_ns_[grow|shrink], 
but it doesn't seem to help much.

Thanks,
Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b78328..a7c8852 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -973,6 +973,8 @@  struct kvm_x86_ops {
 	void (*post_block)(struct kvm_vcpu *vcpu);
 	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
 			      uint32_t guest_irq, bool set);
+
+	bool (*apicv_intr_pending)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b05402e..605b869 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -284,6 +284,7 @@ 
 #define MSR_AMD64_TSC_RATIO		0xc0000104
 #define MSR_AMD64_NB_CFG		0xc001001f
 #define MSR_AMD64_PATCH_LOADER		0xc0010020
+#define MSR_AMD64_AVIC_DOORBELL		0xc001011b
 #define MSR_AMD64_OSVW_ID_LENGTH	0xc0010140
 #define MSR_AMD64_OSVW_STATUS		0xc0010141
 #define MSR_AMD64_LS_CFG		0xc0011020
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fc313a0..f6deb04 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1828,7 +1828,8 @@  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
-	if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+	if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic) ||
+	    kvm_x86_ops->apicv_intr_pending)
 		return -1;
 
 	apic_update_ppr(apic);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bedf52b..5d7b049 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -169,6 +169,7 @@  struct vcpu_svm {
 	bool nrips_enabled	: 1;
 
 	struct page *avic_bk_page;
+	atomic_t avic_pending_cnt;
 };
 
 struct __attribute__ ((__packed__))
@@ -232,6 +233,8 @@  static bool npt_enabled = true;
 static bool npt_enabled;
 #endif
 
+static struct kvm_x86_ops svm_x86_ops;
+
 /* allow nested paging (virtualized MMU) for all guests */
 static int npt = true;
 module_param(npt, int, S_IRUGO);
@@ -974,6 +977,9 @@  static __init int svm_hardware_setup(void)
 
 	if (avic) {
 		printk(KERN_INFO "kvm: AVIC enabled\n");
+	} else {
+		svm_x86_ops.deliver_posted_interrupt = NULL;
+		svm_x86_ops.apicv_intr_pending = NULL;
 	}
 
 	return 0;
@@ -1188,8 +1194,10 @@  static void init_vmcb(struct vcpu_svm *svm)
 
 	enable_gif(svm);
 
-	if (avic)
+	if (avic) {
 		avic_init_vmcb(svm);
+		atomic_set(&svm->avic_pending_cnt, 0);
+	}
 }
 
 static struct svm_avic_phy_ait_entry *
@@ -3059,8 +3067,10 @@  static int clgi_interception(struct vcpu_svm *svm)
 	disable_gif(svm);
 
 	/* After a CLGI no interrupts should come */
-	svm_clear_vintr(svm);
-	svm->vmcb->control.v_irq = 0;
+	if (!avic) {
+		svm_clear_vintr(svm);
+		svm->vmcb->control.v_irq = 0;
+	}
 
 	mark_dirty(svm->vmcb, VMCB_INTR);
 
@@ -3635,6 +3645,9 @@  static int msr_interception(struct vcpu_svm *svm)
 
 static int interrupt_window_interception(struct vcpu_svm *svm)
 {
+	if (avic)
+		BUG_ON(1);
+
 	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.v_irq = 0;
@@ -4190,7 +4203,7 @@  static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 {
 	struct vmcb_control_area *control;
 
-
+	/* The following fields are ignored when AVIC is enabled */
 	control = &svm->vmcb->control;
 	control->int_vector = irq;
 	control->v_intr_prio = 0xf;
@@ -4261,6 +4274,27 @@  static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	return;
 }
 
+static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	kvm_lapic_set_vector(vec, avic_get_bk_page_entry(svm, APIC_IRR));
+
+	/* Note:
+	 * This gives us a hint to check for pending interrupts
+	 * during #VMEXIT.
+	 */
+	atomic_inc(&svm->avic_pending_cnt);
+
+	if (vcpu->mode == IN_GUEST_MODE) {
+		wrmsrl(MSR_AMD64_AVIC_DOORBELL,
+		       __default_cpu_present_to_apicid(vcpu->cpu));
+	} else {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4321,6 +4355,9 @@  static void enable_irq_window(struct kvm_vcpu *vcpu)
 	 * get that intercept, this function will be called again though and
 	 * we'll get the vintr intercept.
 	 */
+	if (avic)
+		return;
+
 	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
@@ -4462,6 +4499,67 @@  static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 	svm_complete_interrupts(svm);
 }
 
+static bool avic_check_irr_pending(struct kvm_vcpu *vcpu)
+{
+	int i;
+	u32 irr;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	for (i = 0; i < 8; i++) {
+		irr = *(avic_get_bk_page_entry(svm,
+					APIC_IRR + (0x10 * i)));
+		if (irr)
+			return true;
+	}
+
+	return false;
+}
+
+static bool svm_avic_check_ppr(struct vcpu_svm *svm)
+{
+	u32 tpr = *(avic_get_bk_page_entry(svm, APIC_TASKPRI));
+	u32 ppr = *(avic_get_bk_page_entry(svm, APIC_PROCPRI));
+
+	if (ppr && (ppr != tpr))
+		return true;
+
+	return false;
+}
+
+/* Note: Returns true means do not block */
+static bool svm_apicv_intr_pending (struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!avic)
+		return false;
+
+	if (atomic_read(&svm->avic_pending_cnt))
+		return true;
+
+	return avic_check_irr_pending(vcpu);
+}
+
+static void avic_post_vmrun(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!avic)
+		return;
+
+	if (atomic_read(&svm->avic_pending_cnt)) {
+		if (svm_avic_check_ppr(svm))
+			return;
+		if (avic_check_irr_pending(vcpu))
+			return;
+		/*
+		 * At this point, if there is no interrupt pending.
+		 * So, we decrement the pending count
+		 */
+		atomic_dec(&svm->avic_pending_cnt);
+	}
+}
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4588,6 +4686,8 @@  static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
 		kvm_after_handle_nmi(&svm->vcpu);
 
+	avic_post_vmrun(vcpu);
+
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
@@ -5050,7 +5150,9 @@  static struct kvm_x86_ops svm_x86_ops = {
 
 	.sched_in = svm_sched_in,
 
+	.apicv_intr_pending = svm_apicv_intr_pending,
 	.pmu_ops = &amd_pmu_ops,
+	.deliver_posted_interrupt = svm_deliver_avic_intr,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4244c2b..2def290 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8087,7 +8087,9 @@  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
 		kvm_x86_ops->check_nested_events(vcpu, false);
 
-	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+	return (kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu) ||
+		(kvm_x86_ops->apicv_intr_pending &&
+		 kvm_x86_ops->apicv_intr_pending(vcpu)));
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)