
[PART2,v6,12/12] svm: Implements update_pi_irte hook to setup posted interrupt

Message ID 1471549364-6672-13-git-send-email-Suravee.Suthikulpanit@amd.com (mailing list archive)
State New, archived

Commit Message

Suthikulpanit, Suravee Aug. 18, 2016, 7:42 p.m. UTC
This patch implements the update_pi_irte function hook to allow SVM to
communicate with the IOMMU driver regarding how to set up the IRTE for
handling posted interrupts.

When AVIC is enabled, SVM needs to update the IOMMU IRTE with the
appropriate host physical APIC ID during vcpu_load/unload. Also, on
vcpu_blocking/unblocking, SVM needs to update the is-running bit in
the IOMMU IRTE. Both are achieved by calling amd_iommu_update_ga().

However, if GA mode is not enabled for the pass-through device, the
IOMMU driver simply returns when amd_iommu_update_ga() is called.

Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
---
 arch/x86/kvm/svm.c        | 247 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/amd-iommu.h |   1 +
 2 files changed, 227 insertions(+), 21 deletions(-)

Comments

Radim Krčmář Aug. 19, 2016, 2:49 p.m. UTC | #1
2016-08-18 14:42-0500, Suravee Suthikulpanit:
> This patch implements the update_pi_irte function hook to allow SVM to
> communicate with the IOMMU driver regarding how to set up the IRTE for
> handling posted interrupts.
>
> When AVIC is enabled, SVM needs to update the IOMMU IRTE with the
> appropriate host physical APIC ID during vcpu_load/unload. Also, on
> vcpu_blocking/unblocking, SVM needs to update the is-running bit in
> the IOMMU IRTE. Both are achieved by calling amd_iommu_update_ga().
>
> However, if GA mode is not enabled for the pass-through device, the
> IOMMU driver simply returns when amd_iommu_update_ga() is called.
> 
> Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
> ---
> diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
> @@ -34,6 +34,7 @@ struct amd_ir_data {
>  	struct msi_msg msi_entry;
>  	void *entry;    /* Pointer to union irte or struct irte_ga */
>  	void *ref;      /* Pointer to the actual irte */
> +	struct list_head node;	/* Used by SVM for per-vcpu ir_list */

Putting a list_head here requires all users of amd-iommu to cooperate,
which is dangerous, but it allows simpler SVM code.  The alternative is
to force wrappers in SVM, which would also allow the IOMMU to keep
struct amd_ir_data as an incomplete type in public headers.

 struct amd_ir_data_wrapper {
 	struct list_head node;
 	struct amd_ir_data *ir_data;
 };
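
SVM would then keep one such wrapper per tracked IRTE on its per-vcpu
list, roughly like this (untested sketch, just to illustrate the shape;
svm->ir_list/ir_list_lock are the fields this patch already adds to
struct vcpu_svm):

 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_ir_data *ir_data)
 {
 	struct amd_ir_data_wrapper *w;
 	unsigned long flags;

 	/* One wrapper per IRTE; freed again in the corresponding _del path. */
 	w = kzalloc(sizeof(*w), GFP_KERNEL);
 	if (!w)
 		return -ENOMEM;
 	w->ir_data = ir_data;

 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 	list_add(&w->node, &svm->ir_list);
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 	return 0;
 }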

(The rant continues below.)

> +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> +			      uint32_t guest_irq, bool set)
> +{
> +	struct kvm_kernel_irq_routing_entry *e;
> +	struct kvm_irq_routing_table *irq_rt;
[...]
> +	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
> +		struct kvm_lapic_irq irq;
> +		struct vcpu_data vcpu_info;
[...]
> +		kvm_set_msi_irq(e, &irq);
> +		if (kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
> +			svm = to_svm(vcpu);
> +			vcpu_info.pi_desc_addr = page_to_phys(svm->avic_backing_page);
> +			vcpu_info.vector = irq.vector;
[...]
> +			struct amd_iommu_pi_data pi;
> +
> +			/* Try to enable guest_mode in IRTE */
> +			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
> +						     vcpu->vcpu_id);
> +			pi.is_guest_mode = true;
> +			pi.vcpu_data = &vcpu_info;
> +			ret = irq_set_vcpu_affinity(host_irq, &pi);
> +			if (!ret && pi.is_guest_mode)
> +				svm_ir_list_add(svm, pi.ir_data);

I missed a bug here the last time:

If ir_data is already inside some VCPU list and the VCPU changes, then
we don't remove ir_data from the previous list and just add it to a new
one.  This was not as bad when we only had wrappers (only resulted in
duplication), but now we corrupt the list it should have been removed from ...

The problem with wrappers is that we don't know what list we should
remove the "struct amd_ir_data" from;  we would need to add another
tracking structure or go through all VCPUs.

Having "struct list_head" in "struct amd_ir_data" would allow us to know
the current list and remove it from there:
One "struct amd_ir_data" should never be used by more than one caller of
amd_iommu_update_ga(), because they would have to be cooperating anyway,
which would mean a single mediator, so we can add a "struct list_head"
into "struct amd_ir_data".

  Minor design note:
  To make the usage of "struct amd_ir_data" safer, we could pass "struct
  list_head" into irq_set_vcpu_affinity(), instead of returning "struct
  amd_ir_data *".

  irq_set_vcpu_affinity() would add "struct amd_ir_data" to the list only
  if ir_data was not already in some list and report whether the list
  was modified.

I think that adding "struct list_head" into "struct amd_ir_data" is
nicer than having wrappers.
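
For illustration, with the embedded list_head the add path could then
look roughly like this (untested sketch; assumes the IOMMU driver does
INIT_LIST_HEAD(&ir->node) when allocating amd_ir_data and that the
removal path uses list_del_init(), and it hand-waves locking against
the old VCPU's list):

 static void svm_ir_list_add(struct vcpu_svm *svm, struct amd_ir_data *ir)
 {
 	unsigned long flags;

 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 	/* Already on some (possibly another VCPU's) list?  Unlink it first. */
 	if (!list_empty(&ir->node))
 		list_del(&ir->node);
 	list_add(&ir->node, &svm->ir_list);
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }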

Joerg, Paolo, what do you think?

Thanks.

> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> @@ -4366,6 +4399,177 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
> +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_ir_data *ir)
> +{
> +	spin_lock_irqsave(&svm->ir_list_lock, flags);
> +	list_for_each_entry(cur, &svm->ir_list, node) {
> +		if (cur != ir)
> +			continue;
> +		found = true;
> +		break;
> +	}

If we're using ir->node, then this can simply check !list_empty(&ir->node),
because we should never add it to a new list while it is still on one.

> +	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
> +
> +	if (found)
> +		return 0;
> +
> +	spin_lock_irqsave(&svm->ir_list_lock, flags);
> +	list_add(&ir->node, &svm->ir_list);
> +	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
> +	return 0;
> +}
Radim Krčmář Aug. 19, 2016, 3:04 p.m. UTC | #2
2016-08-19 16:49+0200, Radim Krčmář:
>   Minor design note:
>   To make the usage of "struct amd_ir_data" safer, we could pass "struct
>   list_head" into irq_set_vcpu_affinity(), instead of returning "struct
>   amd_ir_data *".
> 
>   irq_set_vcpu_affinity() would add "struct amd_ir_data" to the list only
>   if ir_data was not already in some list and report whether the list
>   was modified.

No, this would not simplify the search.  An important point of list_head
in amd_ir_data was to let us know the current list.
Suthikulpanit, Suravee Aug. 22, 2016, 9:19 a.m. UTC | #3
Hi Radim,

On 08/19/2016 09:49 PM, Radim Krčmář wrote:
> 2016-08-18 14:42-0500, Suravee Suthikulpanit:
>> This patch implements the update_pi_irte function hook to allow SVM to
>> communicate with the IOMMU driver regarding how to set up the IRTE for
>> handling posted interrupts.
>>
>> When AVIC is enabled, SVM needs to update the IOMMU IRTE with the
>> appropriate host physical APIC ID during vcpu_load/unload. Also, on
>> vcpu_blocking/unblocking, SVM needs to update the is-running bit in
>> the IOMMU IRTE. Both are achieved by calling amd_iommu_update_ga().
>>
>> However, if GA mode is not enabled for the pass-through device, the
>> IOMMU driver simply returns when amd_iommu_update_ga() is called.
>>
>> Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
>> ---
>> diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
>> @@ -34,6 +34,7 @@ struct amd_ir_data {
>>   	struct msi_msg msi_entry;
>>   	void *entry;    /* Pointer to union irte or struct irte_ga */
>>   	void *ref;      /* Pointer to the actual irte */
>> +	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
>
> Putting a list_head here requires all users of amd-iommu to cooperate,
> which is dangerous, but it allows simpler SVM code.  The alternative is
> to force wrappers in SVM, which would also allow the IOMMU to keep
> struct amd_ir_data as an incomplete type in public headers.
>
>   struct amd_ir_data_wrapper {
>   	struct list_head node;
>   	struct amd_ir_data *ir_data;
>   };
>
> (The rant continues below.)

The struct amd_ir_data is only used by the AMD IOMMU driver (and now the
SVM driver). I don't think it would be that bad since they both already
have to coordinate.  Could you please clarify why you think it could be
dangerous?  Are you thinking of the case where the struct amd_ir_data
could be freed (by the AMD IOMMU driver) while still on the vcpu list
(in SVM)?

>> +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
>> +			      uint32_t guest_irq, bool set)
>> +{
>> +	struct kvm_kernel_irq_routing_entry *e;
>> +	struct kvm_irq_routing_table *irq_rt;
> [...]
>> +	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
>> +		struct kvm_lapic_irq irq;
>> +		struct vcpu_data vcpu_info;
> [...]
>> +		kvm_set_msi_irq(e, &irq);
>> +		if (kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
>> +			svm = to_svm(vcpu);
>> +			vcpu_info.pi_desc_addr = page_to_phys(svm->avic_backing_page);
>> +			vcpu_info.vector = irq.vector;
> [...]
>> +			struct amd_iommu_pi_data pi;
>> +
>> +			/* Try to enable guest_mode in IRTE */
>> +			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
>> +						     vcpu->vcpu_id);
>> +			pi.is_guest_mode = true;
>> +			pi.vcpu_data = &vcpu_info;
>> +			ret = irq_set_vcpu_affinity(host_irq, &pi);
>> +			if (!ret && pi.is_guest_mode)
>> +				svm_ir_list_add(svm, pi.ir_data);
>
> I missed a bug here the last time:
>
> If ir_data is already inside some VCPU list and the VCPU changes, then
> we don't remove ir_data from the previous list and just add it to a new
> one.  This was not as bad when we only had wrappers (only resulted in
> duplication), but now we corrupt the list it should have been removed from ...

Good point here...

> The problem with wrappers is that we don't know what list we should
> remove the "struct amd_ir_data" from;  we would need to add another
> tracking structure or go through all VCPUs.
>
> Having "struct list_head" in "struct amd_ir_data" would allow us to know
> the current list and remove it from there:
> One "struct amd_ir_data" should never be used by more than one caller of
> amd_iommu_update_ga(), because they would have to be cooperating anyway,
> which would mean a single mediator, so we can add a "struct list_head"
> into "struct amd_ir_data".
>
>    Minor design note:
>    To make the usage of "struct amd_ir_data" safer, we could pass "struct
>    list_head" into irq_set_vcpu_affinity(), instead of returning "struct
>    amd_ir_data *".
>
>    irq_set_vcpu_affinity() would add "struct amd_ir_data" to the list only
>    if ir_data was not already in some list and report whether the list
>    was modified.
>
> I think that adding "struct list_head" into "struct amd_ir_data" is
> nicer than having wrappers.
>
> Joerg, Paolo, what do you think?
>

I think modifying irq_set_vcpu_affinity() to also pass struct list_head
seems a bit redundant since it is currently designed to allow passing in
a void *, which leaves the other option where we might just need to pass
in a wrapper (e.g. going back to the previous design where we pass in
struct amd_iommu_pi_data) and also add a pointer to the ir_list in the
wrapper as well. Then, the IOMMU is responsible for adding/deleting
ir_data to/from this list instead of SVM. This should be fine since we
only need to coordinate between SVM and AMD-IOMMU.
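
Roughly, the idea would be to extend the struct with the list pointer,
something like this (sketch only; the field list is trimmed to what this
patch uses, plus the new member, and the lock protecting the list would
probably have to be passed along as well):

 struct amd_iommu_pi_data {
 	u32 ga_tag;
 	bool is_guest_mode;
 	struct vcpu_data *vcpu_data;
 	void *ir_data;			/* set by the IOMMU driver */
 	struct list_head *ir_list;	/* new: per-vcpu list owned by SVM */
 };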

Thanks,
Suravee


Suthikulpanit, Suravee Aug. 22, 2016, 10:09 a.m. UTC | #4
Hi Radim,

On 08/22/2016 04:19 PM, Suravee Suthikulpanit wrote:
>> The problem with wrappers is that we don't know what list we should
>> remove the "struct amd_ir_data" from;  we would need to add another
>> tracking structure or go through all VCPUs.
>>
>> Having "struct list_head" in "struct amd_ir_data" would allow us to know
>> the current list and remove it from there:
>> One "struct amd_ir_data" should never be used by more than one caller of
>> amd_iommu_update_ga(), because they would have to be cooperating anyway,
>> which would mean a single mediator, so we can add a "struct list_head"
>> into "struct amd_ir_data".
>>
>>    Minor design note:
>>    To make the usage of "struct amd_ir_data" safer, we could pass "struct
>>    list_head" into irq_set_vcpu_affinity(), instead of returning "struct
>>    amd_ir_data *".
>>
>>    irq_set_vcpu_affinity() would add "struct amd_ir_data" to the list
>> only
>>    if ir_data was not already in some list and report whether the list
>>    was modified.
>>
>> I think that adding "struct list_head" into "struct amd_ir_data" is
>> nicer than having wrappers.
>>
>> Joerg, Paolo, what do you think?
>>
>
> I think modifying irq_set_vcpu_affinity() to also pass struct list_head
> seems a bit redundant since it is currently designed to allow passing in
> a void *, which leaves the other option where we might just need to pass
> in a wrapper (e.g. going back to the previous design where we pass in
> struct amd_iommu_pi_data) and also add a pointer to the ir_list in the
> wrapper as well. Then, the IOMMU is responsible for adding/deleting
> ir_data to/from this list instead of SVM. This should be fine since we
> only need to coordinate between SVM and AMD-IOMMU.
>
> Thanks,
> Suravee

Actually, thinking about this again, going back to keeping the per-vcpu 
list of struct amd_iommu_pi_data is probably the simplest here.

* We avoid having to expose the amd_ir_data to SVM.
* We can match using amd_ir_data * when traversing the list.
* We can easily add the code to manage the list in SVM. We can make
sure that the struct amd_iommu_pi_data is not already mapped before
adding it to a new per-vcpu list. If it is currently mapped, we can
simply unmap it. Doing this from the IOMMU would be more complicated
and would require lots of parameter passing (see the sketch below).
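
For example, the SVM-side removal could just walk the per-vcpu list and
match on the amd_ir_data pointer -- a rough sketch (assumes SVM allocates
the struct amd_iommu_pi_data entries it keeps on the list, and that they
carry a list_head "node" next to the ir_data pointer):

 static void svm_pi_list_del(struct vcpu_svm *svm, void *ir_data)
 {
 	struct amd_iommu_pi_data *cur, *tmp;
 	unsigned long flags;

 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 	list_for_each_entry_safe(cur, tmp, &svm->ir_list, node) {
 		if (cur->ir_data != ir_data)
 			continue;
 		list_del(&cur->node);
 		kfree(cur);
 		break;
 	}
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }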

Thanks,
Suravee
Radim Krčmář Aug. 22, 2016, 3:16 p.m. UTC | #5
2016-08-22 17:09+0700, Suravee Suthikulpanit:
> On 08/22/2016 04:19 PM, Suravee Suthikulpanit wrote:
>> > he problem with wrappers is that we don't know what list we should
>> > remove the "struct amd_ir_data" from;  we would need to add another
>> > tracking structure or go through all VCPUs.
>> > 
>> > Having "struct list_head" in "struct amd_ir_data" would allow us to know
>> > the current list and remove it from there:
>> > One "struct amd_ir_data" should never be used by more than one caller of
>> > amd_iommu_update_ga(), because they would have to be cooperating anyway,
>> > which would mean a single mediator, so we can add a "struct list_head"
>> > into "struct amd_ir_data".
>> > 
>> >    Minor design note:
>> >    To make the usage of "struct amd_ir_data" safer, we could pass "struct
>> >    list_head" into irq_set_vcpu_affinity(), instead of returning "struct
>> >    amd_ir_data *".
>> > 
>> >    irq_set_vcpu_affinity() would add "struct amd_ir_data" to the list
>> > only
>> >    if ir_data was not already in some list and report whether the list
>> >    was modified.
>> > 
>> > I think that adding "struct list_head" into "struct amd_ir_data" is
>> > nicer than having wrappers.
>> > 
>> > Joerg, Paolo, what do you think?
>> > 
>> 
>> I think modifying irq_set_vcpu_affinity() to also pass struct list_head
>> seems a bit redundant since it is currently designed to allow passing in
>> a void *, which leaves the other option where we might just need to pass
>> in a wrapper (e.g. going back to the previous design where we pass in
>> struct amd_iommu_pi_data) and also add a pointer to the ir_list in the
>> wrapper as well. Then, the IOMMU is responsible for adding/deleting
>> ir_data to/from this list instead of SVM. This should be fine since we
>> only need to coordinate between SVM and AMD-IOMMU.
> 
> Actually, thinking about this again, going back to keeping the per-vcpu list
> of struct amd_iommu_pi_data is probably the simplest here.
> 
> * We avoid having to expose the amd_ir_data to SVM.
> * We can match using amd_ir_data * when traversing the list.
> * We can easily add the code to manage the list in SVM. We can make sure
> that the struct amd_iommu_pi_data is not already mapped before adding it to
> a new per-vcpu list. If it is currently mapped, we can simply unmap it.

Sounds good.
A new SVM-specific wrapper for amd_ir_data instead of reusing
amd_iommu_pi_data would be nicer, IMO -- we could change it without
touching the IOMMU interface and also allocate in svm_pi_list_add.

Updating the lists would become O(N^2), but updates should not occur often
(and N is small), so I think it's still worth it for the saving on every
sched_in/out.
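
I.e. the hot path would stay a plain walk over the per-vcpu list, only
with one extra dereference through the wrapper -- rough sketch (the
wrapper name and fields are made up; cf. avic_update_iommu() in this
patch):

 struct svm_ir_entry {			/* SVM-private wrapper */
 	struct list_head node;
 	struct amd_ir_data *ir_data;
 };

 static int avic_update_iommu(struct kvm_vcpu *vcpu, int cpu,
 			     phys_addr_t pa, bool is_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct svm_ir_entry *e;
 	unsigned long flags;
 	int ret = 0;

 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 	list_for_each_entry(e, &svm->ir_list, node) {
 		ret = amd_iommu_update_ga(cpu, pa & AVIC_HPA_MASK,
 					  is_run, e->ir_data);
 		if (ret)
 			break;
 	}
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 	return ret;
 }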

> Doing this from the IOMMU would be more complicated and require lots of parameter
> passing.

Yeah, doing more than returning amd_ir_data from the IOMMU doesn't make
sense, and not adding a list_head for SVM to amd_ir_data is more
acceptable.

Thanks.
kernel test robot Aug. 23, 2016, 10:51 a.m. UTC | #6
Hi Suravee,

[auto build test WARNING on v4.7-rc7]
[cannot apply to iommu/next kvm/linux-next linux/master v4.8-rc2 v4.8-rc1 next-20160823]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Suravee-Suthikulpanit/iommu-AMD-Introduce-IOMMU-AVIC-support/20160819-111539
config: i386-randconfig-s0-201633 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from arch/x86/kvm/svm.c:37:0:
   include/linux/amd-iommu.h:222:1: error: expected identifier or '(' before '{' token
    {
    ^
>> include/linux/amd-iommu.h:221:1: warning: 'amd_iommu_update_ga' used but never defined
    amd_iommu_update_ga(u32 cpu, u64 base, bool is_run, struct amd_ir_data *ir_data);
    ^~~~~~~~~~~~~~~~~~~

vim +/amd_iommu_update_ga +221 include/linux/amd-iommu.h

bda901f5 Suravee Suthikulpanit 2016-08-18  215  amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
bda901f5 Suravee Suthikulpanit 2016-08-18  216  {
bda901f5 Suravee Suthikulpanit 2016-08-18  217  	return 0;
bda901f5 Suravee Suthikulpanit 2016-08-18  218  }
bda901f5 Suravee Suthikulpanit 2016-08-18  219  
d49c1352 Suravee Suthikulpanit 2016-08-18  220  static inline int
d49c1352 Suravee Suthikulpanit 2016-08-18 @221  amd_iommu_update_ga(u32 cpu, u64 base, bool is_run, struct amd_ir_data *ir_data);
d49c1352 Suravee Suthikulpanit 2016-08-18 @222  {
d49c1352 Suravee Suthikulpanit 2016-08-18  223  	return 0;
d49c1352 Suravee Suthikulpanit 2016-08-18  224  }
d49c1352 Suravee Suthikulpanit 2016-08-18  225  

:::::: The code at line 221 was first introduced by commit
:::::: d49c1352bd32284fa9619171c8a505973cf79c54 iommu/amd: Introduce amd_iommu_update_ga()

:::::: TO: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
:::::: CC: 0day robot <fengguang.wu@intel.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Patch

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c060e05..303007a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -43,6 +43,7 @@ 
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -200,6 +201,16 @@  struct vcpu_svm {
 	struct page *avic_backing_page;
 	u64 *avic_physical_id_cache;
 	bool avic_is_running;
+
+	/*
+	 * Per-vcpu list of struct amd_ir_data:
+	 * This is used mainly to track interrupt remapping table entries
+	 * (IRTEs) to be updated when the vcpu affinity changes. This avoids
+	 * the need to scan for IRTEs and try to match the ga_tag in the
+	 * IOMMU driver (or to use a hashtable).
+	 */
+	struct list_head ir_list;
+	spinlock_t ir_list_lock;
 };
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
@@ -1443,31 +1454,29 @@  free_avic:
 	return err;
 }
 
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu(struct kvm_vcpu *vcpu, int cpu, phys_addr_t pa, bool r)
 {
-	u64 entry;
-	int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+	int ret = 0;
+	unsigned long flags;
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct amd_ir_data *ir;
 
-	if (!kvm_vcpu_apicv_active(vcpu))
-		return;
-
-	svm->avic_is_running = is_run;
-
-	/* ID = 0xff (broadcast), ID > 0xff (reserved) */
-	if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-		return;
-
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-	WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+	if (!kvm_arch_has_assigned_device(vcpu->kvm))
+		return 0;
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	if (is_run)
-		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	/*
+	 * Here, we go through the per-vcpu ir_list to update all existing
+	 * interrupt remapping table entries targeting this vcpu.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_for_each_entry(ir, &svm->ir_list, node) {
+		ret = amd_iommu_update_ga(cpu, (pa & AVIC_HPA_MASK), r, ir);
+		if (ret)
+			break;
+	}
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+	return ret;
 }
 
 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1494,6 +1503,9 @@  static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	avic_update_iommu(vcpu, h_physical_id,
+			  page_to_phys(svm->avic_backing_page),
+			  svm->avic_is_running);
 }
 
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1505,10 +1517,28 @@  static void avic_vcpu_put(struct kvm_vcpu *vcpu)
 		return;
 
 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		avic_update_iommu(vcpu, -1,
+				  page_to_phys(svm->avic_backing_page), 0);
+
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->avic_is_running = is_run;
+	if (is_run)
+		avic_vcpu_load(vcpu, vcpu->cpu);
+	else
+		avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1570,6 +1600,9 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 		err = avic_init_backing_page(&svm->vcpu);
 		if (err)
 			goto free_page4;
+
+		INIT_LIST_HEAD(&svm->ir_list);
+		spin_lock_init(&svm->ir_list_lock);
 	}
 
 	/* We initialize this flag to true to make sure that the is_running
@@ -4366,6 +4399,177 @@  static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 		kvm_vcpu_wake_up(vcpu);
 }
 
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_ir_data *ir)
+{
+	bool found = false;
+	unsigned long flags;
+	struct amd_ir_data *cur;
+
+	/**
+	 * In some cases, the existing irte is updated and re-set,
+	 * so we need to check here if it's already been added
+	 * to the ir_list.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_for_each_entry(cur, &svm->ir_list, node) {
+		if (cur != ir)
+			continue;
+		found = true;
+		break;
+	}
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+
+	if (found)
+		return 0;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_add(&ir->node, &svm->ir_list);
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+	return 0;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_ir_data *ir)
+{
+	unsigned long flags;
+	struct amd_ir_data *cur;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_for_each_entry(cur, &svm->ir_list, node) {
+		if (cur != ir)
+			continue;
+		list_del(&cur->node);
+		break;
+	}
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct kvm_irq_routing_table *irq_rt;
+	int idx, ret = -EINVAL;
+
+	if (!kvm_arch_has_assigned_device(kvm) ||
+	    !irq_remapping_cap(IRQ_POSTING_CAP))
+		return 0;
+
+	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+		 __func__, host_irq, guest_irq, set);
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+		struct kvm_lapic_irq irq;
+		struct vcpu_data vcpu_info;
+		struct kvm_vcpu *vcpu = NULL;
+		struct vcpu_svm *svm = NULL;
+
+		if (e->type != KVM_IRQ_ROUTING_MSI)
+			continue;
+
+		/**
+		 * Note:
+		 * The HW cannot support posting multicast/broadcast
+		 * interrupts to a vCPU. So, we still use interrupt
+		 * remapping for these kinds of interrupts.
+		 *
+		 * For lowest-priority interrupts, we only support
+		 * those with single CPU as the destination, e.g. user
+		 * configures the interrupts via /proc/irq or uses
+		 * irqbalance to make the interrupts single-CPU.
+		 */
+		kvm_set_msi_irq(e, &irq);
+		if (kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+			svm = to_svm(vcpu);
+			vcpu_info.pi_desc_addr = page_to_phys(svm->avic_backing_page);
+			vcpu_info.vector = irq.vector;
+
+			pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+				 irq.vector);
+		} else {
+			set = false;
+
+			pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+				 __func__, irq.vector);
+		}
+
+		trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+					 vcpu_info.vector,
+					 vcpu_info.pi_desc_addr, set);
+
+		/**
+		 * When AVIC is disabled, we fall back to setting up
+		 * the IRTE with legacy mode
+		 */
+		if (set && kvm_vcpu_apicv_active(&svm->vcpu)) {
+			struct amd_iommu_pi_data pi;
+
+			/* Try to enable guest_mode in IRTE */
+			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+						     vcpu->vcpu_id);
+			pi.is_guest_mode = true;
+			pi.vcpu_data = &vcpu_info;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * We save the pointer to the IRTE data (pi.ir_data)
+			 * in the per-vcpu ir_list so that we can reference
+			 * it directly when we update the vcpu scheduling
+			 * information in the IOMMU irte.
+			 */
+			if (!ret && pi.is_guest_mode)
+				svm_ir_list_add(svm, pi.ir_data);
+		} else {
+			/* Use legacy mode in IRTE */
+			struct amd_iommu_pi_data pi;
+
+			/**
+			 * Here, pi is used to:
+			 * - Tell IOMMU to use legacy mode for this interrupt.
+			 * - Retrieve ga_tag of prior interrupt remapping data.
+			 */
+			pi.is_guest_mode = false;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * We need to check if the interrupt was previously
+			 * set up in guest_mode by checking if the ga_tag
+			 * was cached. If so, we need to clean up the per-vcpu
+			 * ir_list.
+			 */
+			if (!ret && pi.ga_tag) {
+				struct kvm_vcpu *vcpu = kvm_get_vcpu_by_id(kvm,
+						AVIC_GATAG_TO_VCPUID(pi.ga_tag));
+
+				if (vcpu)
+					svm_ir_list_del(to_svm(vcpu), pi.ir_data);
+			}
+		}
+
+		if (ret < 0) {
+			pr_err("%s: failed to update PI IRTE\n", __func__);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5192,6 +5396,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 
 	.pmu_ops = &amd_pmu_ops,
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
+	.update_pi_irte = svm_update_pi_irte,
 };
 
 static int __init svm_init(void)
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 7b2e802..cd10393 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -34,6 +34,7 @@  struct amd_ir_data {
 	struct msi_msg msi_entry;
 	void *entry;    /* Pointer to union irte or struct irte_ga */
 	void *ref;      /* Pointer to the actual irte */
+	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
 };
 
 /*