diff mbox series

[RFC,v2,3/9] iommu/arm-smmu-v3: Issue invalidation commands to multiple SMMUs

Message ID 20230822185632.RFC.v2.3.I0f149f177e5478e28dc3223c2d10729d8f28d53a@changeid (mailing list archive)
State New, archived
Headers show
Series Install domain onto multiple smmus | expand

Commit Message

Michael Shavit Aug. 22, 2023, 10:56 a.m. UTC
Assume that devices in the smmu_domain->devices list that belong to the
same SMMU are adjacent to each other in the list.
Batch TLB/ATC invalidation commands for an smmu_domain by the SMMU
devices that the domain is installed to.

Signed-off-by: Michael Shavit <mshavit@google.com>
---

Changes in v2:
- Moved the ARM_SMMU_FEAT_BTM changes into a new preparatory commit

 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |   6 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 134 +++++++++++++-----
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   2 +-
 3 files changed, 104 insertions(+), 38 deletions(-)

Comments

Jason Gunthorpe Aug. 22, 2023, 1:14 p.m. UTC | #1
On Tue, Aug 22, 2023 at 06:56:59PM +0800, Michael Shavit wrote:
> Assume that devices in the smmu_domain->domain list that belong to the
> same SMMU are adjacent to each other in the list.
> Batch TLB/ATC invalidation commands for an smmu_domain by the SMMU
> devices that the domain is installed to.
> 
> Signed-off-by: Michael Shavit <mshavit@google.com>
> ---
> 
> Changes in v2:
> - Moved the ARM_SMMU_FEAT_BTM changes into a new preparatory commit
> 
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |   6 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 134 +++++++++++++-----
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   2 +-
>  3 files changed, 104 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 53f65a89a55f9..fe88a7880ad57 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -112,7 +112,7 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
>  	arm_smmu_write_ctx_desc_devices(smmu_domain, 0, cd);
>  
>  	/* Invalidate TLB entries previously associated with that context */
> -	arm_smmu_tlb_inv_asid(smmu, asid);
> +	arm_smmu_tlb_inv_asid(smmu_domain, asid);
>  
>  	xa_erase(&arm_smmu_asid_xa, asid);
>  	return NULL;
> @@ -252,7 +252,7 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  	 */
>  	arm_smmu_write_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd);
>  
> -	arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
> +	arm_smmu_tlb_inv_asid(smmu_domain, smmu_mn->cd->asid);
>  	arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>  
>  	smmu_mn->cleared = true;
> @@ -340,7 +340,7 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
>  	 * new TLB entry can have been formed.
>  	 */
>  	if (!smmu_mn->cleared) {
> -		arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
> +		arm_smmu_tlb_inv_asid(smmu_domain, cd->asid);
>  		arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>  	}
>  
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index db4df9d6aef10..1d072fd38a2d6 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -960,15 +960,28 @@ static int arm_smmu_page_response(struct device *dev,
>  }
>  
>  /* Context descriptor manipulation functions */
> -void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
> +void arm_smmu_tlb_inv_asid(struct arm_smmu_domain *smmu_domain, u16 asid)
>  {
> +	struct arm_smmu_device *smmu = NULL;
> +	struct arm_smmu_master *master;
>  	struct arm_smmu_cmdq_ent cmd = {
> -		.opcode	= smmu->features & ARM_SMMU_FEAT_E2H ?
> -			CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
>  		.tlbi.asid = asid,
>  	};
> +	unsigned long flags;
>  
> -	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> +	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> +	list_for_each_entry(master, &smmu_domain->devices,
> +			    domain_head) {
> +		if (!smmu)
> +			smmu = master->smmu;
> +		if (smmu != master->smmu ||
> +		    list_is_last(&master->domain_head, &smmu_domain->devices)) {

Finding the end of the list seems too complicated, just:

	/*
	 * Last SMMU a TLBI was issued to. Masters behind the same SMMU are
	 * assumed to be adjacent on the devices list, so comparing against the
	 * previous SMMU is enough to issue one command per SMMU.
	 */
	struct arm_smmu_device *invalidated_smmu = NULL;

	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
		/* Already invalidated via an earlier master on this SMMU. */
		if (master->smmu == invalidated_smmu)
			continue;
		/* The opcode depends on this SMMU's features, so pick it here. */
		cmd.opcode = master->smmu->features & ARM_SMMU_FEAT_E2H ?
				     CMDQ_OP_TLBI_EL2_ASID :
				     CMDQ_OP_TLBI_NH_ASID;
		arm_smmu_cmdq_issue_cmd_with_sync(master->smmu, &cmd);
		invalidated_smmu = master->smmu;
	}

> @@ -1839,28 +1851,56 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
>  	arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
>  
>  	cmds.num = 0;
> -
>  	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>  	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
>  		if (!master->ats_enabled)
>  			continue;
> +		if (!smmu)
> +			smmu = master->smmu;
> +		if (smmu != master->smmu ||
> +		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
> +			ret = arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +			if (ret)
> +				break;
> +			cmds.num = 0;
> +		}
>  
>  		for (i = 0; i < master->num_streams; i++) {
>  			cmd.atc.sid = master->streams[i].id;
> -			arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
> +			arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
>  		}
>  	}

Doesn't the IOTLB invalidate have to come before the ATC invalidate?

So again, use the pattern as above?

>  	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  
> -	return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
> +	return ret;
> +}
> +
> +static void arm_smmu_tlb_inv_vmid(struct arm_smmu_domain *smmu_domain)
> +{
> +	struct arm_smmu_device *smmu = NULL;
> +	struct arm_smmu_master *master;
> +	struct arm_smmu_cmdq_ent cmd = {
> +		.opcode = CMDQ_OP_TLBI_S12_VMALL,
> +		.tlbi.vmid = smmu_domain->s2_cfg.vmid,
> +	};
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> +	list_for_each_entry(master, &smmu_domain->devices,
> +			    domain_head) {
> +		if (!smmu)
> +			smmu = master->smmu;
> +		if (smmu != master->smmu ||
> +		    list_is_last(&master->domain_head, &smmu_domain->devices))
> +			arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> +	}
> +	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  }

I count three of these, so a macro helper is probably a good
idea. Something approx like:

/*
 * Advance from @pos to the next master on @domain's devices list whose SMMU
 * differs from @pos's SMMU.
 *
 * Returns that master, or the list-head sentinel entry when every remaining
 * master shares @pos's SMMU. The sentinel is not a real arm_smmu_master: it
 * may only be tested with list_entry_is_head(), never dereferenced.
 */
static struct arm_smmu_master *smmu_next_entry(struct arm_smmu_master *pos,
					       struct arm_smmu_domain *domain)
{
	struct arm_smmu_device *smmu = pos->smmu;

	do {
		pos = list_next_entry(pos, domain_head);
		/* Check for the head first so pos->smmu is only read on real entries. */
	} while (!list_entry_is_head(pos, &domain->devices, domain_head) &&
		 pos->smmu == smmu);
	return pos;
}

/*
 * for_each_smmu - visit each run of same-SMMU masters on a domain's devices list
 * @pos:    struct arm_smmu_master * cursor; set to the first master of each run
 * @domain: struct arm_smmu_domain * whose devices list is walked
 * @smmu:   struct arm_smmu_device * set to @pos's SMMU for each iteration
 *
 * Relies on masters behind the same SMMU being adjacent on the list; if they
 * are not, an SMMU is visited once per run rather than once overall.
 *
 * @smmu is assigned inside the loop condition, after the head check, so the
 * list-head sentinel is never dereferenced (empty list, or after the last run).
 *
 * NOTE(review): callers presumably need to hold domain->devices_lock while
 * iterating, as the open-coded loops in the patch do -- confirm.
 */
#define for_each_smmu(pos, domain, smmu)                                       \
	for ((pos) = list_first_entry(&(domain)->devices,                      \
				      struct arm_smmu_master, domain_head);    \
	     !list_entry_is_head(pos, &(domain)->devices, domain_head) &&      \
	     ((smmu) = (pos)->smmu, 1);                                        \
	     (pos) = smmu_next_entry(pos, domain))

> @@ -1949,21 +1987,36 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
>  					  size_t granule, bool leaf,
>  					  struct arm_smmu_domain *smmu_domain)
>  {
> +	struct arm_smmu_device *smmu = NULL;
> +	struct arm_smmu_master *master;
>  	struct arm_smmu_cmdq_ent cmd = {
>  		.tlbi = {
>  			.leaf	= leaf,
>  		},
>  	};
> +	unsigned long flags;
>  
> -	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> -		cmd.opcode	= smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
> -				  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
> -		cmd.tlbi.asid	= smmu_domain->cd.asid;
> -	} else {
> -		cmd.opcode	= CMDQ_OP_TLBI_S2_IPA;
> -		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
> +	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> +	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
> +		if (!smmu)
> +			smmu = master->smmu;
> +		if (smmu != master->smmu ||
> +		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
> +			if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> +				cmd.opcode = smmu->features &
> +							     ARM_SMMU_FEAT_E2H ?
> +						     CMDQ_OP_TLBI_EL2_VA :
> +						     CMDQ_OP_TLBI_NH_VA;
> +				cmd.tlbi.asid = smmu_domain->cd.asid;
> +			} else {
> +				cmd.opcode = CMDQ_OP_TLBI_S2_IPA;
> +				cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
> +			}

These calculations based on the smmu_domain shouldn't be in the loop; the
smmu_domain doesn't change.

> -	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
> +	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> +	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
> +		if (!smmu)
> +			smmu = master->smmu;
> +		if (smmu != master->smmu ||
> +		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
> +			if (skip_btm_capable_devices &&
> +			    smmu->features & ARM_SMMU_FEAT_BTM)
> +				continue;
> +			cmd.opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
> +					     CMDQ_OP_TLBI_EL2_VA :
> +					     CMDQ_OP_TLBI_NH_VA;

There are 3 places doing this `if`; maybe it should be in a wrapper around
__arm_smmu_tlb_inv_range?

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 53f65a89a55f9..fe88a7880ad57 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -112,7 +112,7 @@  arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
 	arm_smmu_write_ctx_desc_devices(smmu_domain, 0, cd);
 
 	/* Invalidate TLB entries previously associated with that context */
-	arm_smmu_tlb_inv_asid(smmu, asid);
+	arm_smmu_tlb_inv_asid(smmu_domain, asid);
 
 	xa_erase(&arm_smmu_asid_xa, asid);
 	return NULL;
@@ -252,7 +252,7 @@  static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	 */
 	arm_smmu_write_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd);
 
-	arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
+	arm_smmu_tlb_inv_asid(smmu_domain, smmu_mn->cd->asid);
 	arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
 
 	smmu_mn->cleared = true;
@@ -340,7 +340,7 @@  static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
 	 * new TLB entry can have been formed.
 	 */
 	if (!smmu_mn->cleared) {
-		arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
+		arm_smmu_tlb_inv_asid(smmu_domain, cd->asid);
 		arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
 	}
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index db4df9d6aef10..1d072fd38a2d6 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -960,15 +960,28 @@  static int arm_smmu_page_response(struct device *dev,
 }
 
 /* Context descriptor manipulation functions */
-void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
+void arm_smmu_tlb_inv_asid(struct arm_smmu_domain *smmu_domain, u16 asid)
 {
+	struct arm_smmu_device *smmu = NULL;
+	struct arm_smmu_master *master;
 	struct arm_smmu_cmdq_ent cmd = {
-		.opcode	= smmu->features & ARM_SMMU_FEAT_E2H ?
-			CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
 		.tlbi.asid = asid,
 	};
+	unsigned long flags;
 
-	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices,
+			    domain_head) {
+		if (!smmu)
+			smmu = master->smmu;
+		if (smmu != master->smmu ||
+		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
+			cmd.opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+			CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
+			arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+		}
+	}
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 }
 
 static void arm_smmu_sync_cd(struct arm_smmu_master *master,
@@ -1811,14 +1824,13 @@  int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
 			    unsigned long iova, size_t size)
 {
 	int i;
+	int ret = 0;
 	unsigned long flags;
 	struct arm_smmu_cmdq_ent cmd;
+	struct arm_smmu_device *smmu = NULL;
 	struct arm_smmu_master *master;
 	struct arm_smmu_cmdq_batch cmds;
 
-	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
-		return 0;
-
 	/*
 	 * Ensure that we've completed prior invalidation of the main TLBs
 	 * before we read 'nr_ats_masters' in case of a concurrent call to
@@ -1839,28 +1851,56 @@  int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
 	arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
 
 	cmds.num = 0;
-
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
 		if (!master->ats_enabled)
 			continue;
+		if (!smmu)
+			smmu = master->smmu;
+		if (smmu != master->smmu ||
+		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
+			ret = arm_smmu_cmdq_batch_submit(smmu, &cmds);
+			if (ret)
+				break;
+			cmds.num = 0;
+		}
 
 		for (i = 0; i < master->num_streams; i++) {
 			cmd.atc.sid = master->streams[i].id;
-			arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
+			arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
 		}
 	}
 	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
-	return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
+	return ret;
+}
+
+static void arm_smmu_tlb_inv_vmid(struct arm_smmu_domain *smmu_domain)
+{
+	struct arm_smmu_device *smmu = NULL;
+	struct arm_smmu_master *master;
+	struct arm_smmu_cmdq_ent cmd = {
+		.opcode = CMDQ_OP_TLBI_S12_VMALL,
+		.tlbi.vmid = smmu_domain->s2_cfg.vmid,
+	};
+	unsigned long flags;
+
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices,
+			    domain_head) {
+		if (!smmu)
+			smmu = master->smmu;
+		if (smmu != master->smmu ||
+		    list_is_last(&master->domain_head, &smmu_domain->devices))
+			arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+	}
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 }
 
 /* IO_PGTABLE API */
 static void arm_smmu_tlb_inv_context(void *cookie)
 {
 	struct arm_smmu_domain *smmu_domain = cookie;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	struct arm_smmu_cmdq_ent cmd;
 
 	/*
 	 * NOTE: when io-pgtable is in non-strict mode, we may get here with
@@ -1870,11 +1910,9 @@  static void arm_smmu_tlb_inv_context(void *cookie)
 	 * careful, 007.
 	 */
 	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid);
+		arm_smmu_tlb_inv_asid(smmu_domain, smmu_domain->cd.asid);
 	} else {
-		cmd.opcode	= CMDQ_OP_TLBI_S12_VMALL;
-		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
-		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+		arm_smmu_tlb_inv_vmid(smmu_domain);
 	}
 	arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 }
@@ -1882,9 +1920,9 @@  static void arm_smmu_tlb_inv_context(void *cookie)
 static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 				     unsigned long iova, size_t size,
 				     size_t granule,
-				     struct arm_smmu_domain *smmu_domain)
+				     struct arm_smmu_domain *smmu_domain,
+				     struct arm_smmu_device *smmu)
 {
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	unsigned long end = iova + size, num_pages = 0, tg = 0;
 	size_t inv_range = granule;
 	struct arm_smmu_cmdq_batch cmds;
@@ -1949,21 +1987,36 @@  static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
 					  size_t granule, bool leaf,
 					  struct arm_smmu_domain *smmu_domain)
 {
+	struct arm_smmu_device *smmu = NULL;
+	struct arm_smmu_master *master;
 	struct arm_smmu_cmdq_ent cmd = {
 		.tlbi = {
 			.leaf	= leaf,
 		},
 	};
+	unsigned long flags;
 
-	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		cmd.opcode	= smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
-				  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
-		cmd.tlbi.asid	= smmu_domain->cd.asid;
-	} else {
-		cmd.opcode	= CMDQ_OP_TLBI_S2_IPA;
-		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+		if (!smmu)
+			smmu = master->smmu;
+		if (smmu != master->smmu ||
+		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
+			if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+				cmd.opcode = smmu->features &
+							     ARM_SMMU_FEAT_E2H ?
+						     CMDQ_OP_TLBI_EL2_VA :
+						     CMDQ_OP_TLBI_NH_VA;
+				cmd.tlbi.asid = smmu_domain->cd.asid;
+			} else {
+				cmd.opcode = CMDQ_OP_TLBI_S2_IPA;
+				cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
+			}
+			__arm_smmu_tlb_inv_range(&cmd, iova, size, granule,
+						 smmu_domain, smmu);
+		}
 	}
-	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
 	/*
 	 * Unfortunately, this can't be leaf-only since we may have
@@ -1977,19 +2030,33 @@  void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
 				 bool skip_btm_capable_devices,
 				 struct arm_smmu_domain *smmu_domain)
 {
+	struct arm_smmu_device *smmu = NULL;
+	struct arm_smmu_master *master;
 	struct arm_smmu_cmdq_ent cmd = {
-		.opcode	= smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ?
-			  CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA,
 		.tlbi = {
 			.asid	= asid,
 			.leaf	= leaf,
 		},
 	};
+	unsigned long flags;
 
-	if (skip_btm_capable_devices &&
-	    smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)
-		return;
-	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+		if (!smmu)
+			smmu = master->smmu;
+		if (smmu != master->smmu ||
+		    list_is_last(&master->domain_head, &smmu_domain->devices)) {
+			if (skip_btm_capable_devices &&
+			    smmu->features & ARM_SMMU_FEAT_BTM)
+				continue;
+			cmd.opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+					     CMDQ_OP_TLBI_EL2_VA :
+					     CMDQ_OP_TLBI_NH_VA;
+			__arm_smmu_tlb_inv_range(&cmd, iova, size, granule,
+						 smmu_domain, smmu);
+		}
+	}
+	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 }
 
 static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
@@ -2523,8 +2590,7 @@  static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
-	if (smmu_domain->smmu)
-		arm_smmu_tlb_inv_context(smmu_domain);
+	arm_smmu_tlb_inv_context(smmu_domain);
 }
 
 static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 05599914eb0a0..b0cf9c33e6bcd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -748,7 +748,7 @@  extern struct arm_smmu_ctx_desc quiet_cd;
 
 int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid,
 			    struct arm_smmu_ctx_desc *cd);
-void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
+void arm_smmu_tlb_inv_asid(struct arm_smmu_domain *smmu_domain, u16 asid);
 void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
 				 size_t granule, bool leaf,
 				 bool skip_btm_capable_devices,