diff mbox series

[v5,24/27] iommu/arm-smmu-v3: Bring back SVA BTM support

Message ID 24-v5-9a37e0c884ce+31e3-smmuv3_newapi_p2_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Update SMMUv3 to the modern iommu API (part 2/3) | expand

Commit Message

Jason Gunthorpe March 4, 2024, 11:44 p.m. UTC
BTM support is a feature where the CPU TLB invalidation can be forwarded
to the IOMMU and also invalidate the IOTLB. For this to work the CPU and
IOMMU ASID must be the same.

Retain the prior SVA design here of keeping the ASID allocator for the
IOMMU private to SMMU and force SVA domains to set an ASID that matches
the CPU ASID.

This requires changing the ASID assigned to a S1 domain if it happens to
be overlapping with the required CPU ASID. We hold on to the CPU ASID so
long as the SVA iommu_domain exists, so SVA domain conflict is not
possible.

With the asid per-smmu we no longer have a problem that two per-smmu
iommu_domain's would need to share a CPU ASID entry in the IOMMU's xarray.

Use the same ASID move algorithm for the S1 domains as before with some
streamlining around how the xarray is being used. Do not synchronize the
ASID's if BTM mode is not supported. Just leave BTM features off
everywhere.

Audit all the places that touch cd->asid and think carefully about how the
locking works with the change to the cd->asid by the move algorithm. Use
xarray internal locking during xa_alloc() instead of double locking. Add a
note that concurrent S1 invalidation doesn't fully work.

Note that this is all still dead code, ARM_SMMU_FEAT_BTM is never set.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 133 ++++++++++++++++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  15 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   2 +-
 3 files changed, 129 insertions(+), 21 deletions(-)

Comments

Michael Shavit March 19, 2024, 5:07 p.m. UTC | #1
On Tue, Mar 5, 2024 at 7:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> BTM support is a feature where the CPU TLB invalidation can be forwarded
> to the IOMMU and also invalidate the IOTLB. For this to work the CPU and
> IOMMU ASID must be the same.
>
> Retain the prior SVA design here of keeping the ASID allocator for the
> IOMMU private to SMMU and force SVA domains to set an ASID that matches
> the CPU ASID.
>
> This requires changing the ASID assigned to a S1 domain if it happens to
> be overlapping with the required CPU ASID. We hold on to the CPU ASID so
> long as the SVA iommu_domain exists, so SVA domain conflict is not
> possible.
>
> With the asid per-smmu we no longer have a problem that two per-smmu
> iommu_domain's would need to share a CPU ASID entry in the IOMMU's xarray.
>
> Use the same ASID move algorithm for the S1 domains as before with some
> streamlining around how the xarray is being used. Do not synchronize the
> ASID's if BTM mode is not supported. Just leave BTM features off
> everywhere.
>
> Audit all the places that touch cd->asid and think carefully about how the
> locking works with the change to the cd->asid by the move algorithm. Use
> xarray internal locking during xa_alloc() instead of double locking. Add a
> note that concurrent S1 invalidation doesn't fully work.
>
> Note that this is all still dead code, ARM_SMMU_FEAT_BTM is never set.

Clearly a lot of work/thought has gone into this patch, but I have to
wonder if this complexity is worth carrying forward for a feature that
was never enabled... Is moving this to part 3 or dropping it all
together an option?

>
> Tested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 133 ++++++++++++++++--
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  15 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   2 +-
>  3 files changed, 129 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 3acd699433b7d8..0b5aeaa3a85575 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -15,12 +15,33 @@
>
>  static DEFINE_MUTEX(sva_lock);
>
> -static void __maybe_unused
> -arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
> +static int arm_smmu_realloc_s1_domain_asid(struct arm_smmu_device *smmu,
> +                                          struct arm_smmu_domain *smmu_domain)
>  {
>         struct arm_smmu_master_domain *master_domain;
> +       u32 old_asid = smmu_domain->cd.asid;
>         struct arm_smmu_cd target_cd;
>         unsigned long flags;
> +       int ret;
> +
> +       lockdep_assert_held(&smmu->asid_lock);
> +
> +       /*
> +        * FIXME: The unmap and invalidation path doesn't take any locks but
> +        * this is not fully safe. Since updating the CD tables is not atomic
> +        * there is always a hole where invalidating only one ASID of two active
> +        * ASIDs during unmap will cause the IOTLB to become stale.
> +        *
> +        * This approach is to hopefully shift the racing CPUs to the new ASID
> +        * before we start programming the HW. This increases the chance that
> +        * racing IOPTE changes will pick up an invalidation for the new ASID
> +        * and we achieve eventual consistency. For the brief period where the
> +        * old ASID is still in the CD entries it will become incoherent.
> +        */
> +       ret = xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid, smmu_domain,
> +                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> +       if (ret)
> +               return ret;
>
>         spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>         list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) {
> @@ -36,6 +57,10 @@ arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
>                                         &target_cd);
>         }
>         spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> +
> +       /* Clean the ASID we are about to assign to a new translation */
> +       arm_smmu_tlb_inv_asid(smmu, old_asid);
> +       return 0;
>  }
>
>  static u64 page_size_to_cd(void)
> @@ -148,12 +173,12 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
>         }
>
>         if (!smmu_domain->btm_invalidation) {
> +               ioasid_t asid = READ_ONCE(smmu_domain->cd.asid);
> +
>                 if (!size)
> -                       arm_smmu_tlb_inv_asid(smmu_domain->smmu,
> -                                             smmu_domain->cd.asid);
> +                       arm_smmu_tlb_inv_asid(smmu_domain->smmu, asid);
>                 else
> -                       arm_smmu_tlb_inv_range_asid(start, size,
> -                                                   smmu_domain->cd.asid,
> +                       arm_smmu_tlb_inv_range_asid(start, size, asid,
>                                                     PAGE_SIZE, false,
>                                                     smmu_domain);
>         }
> @@ -182,6 +207,8 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>                 cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid);
>                 if (WARN_ON(!cdptr))
>                         continue;
> +
> +               /* An SVA ASID never changes, no asid_lock required */
>                 arm_smmu_make_sva_cd(&target, master, NULL,
>                                      smmu_domain->cd.asid,
>                                      smmu_domain->btm_invalidation);
> @@ -388,6 +415,8 @@ static void arm_smmu_sva_domain_free(struct iommu_domain *domain)
>          * unnecessary invalidation.
>          */
>         arm_smmu_domain_free_id(smmu_domain);
> +       if (smmu_domain->btm_invalidation)
> +               arm64_mm_context_put(domain->mm);
>
>         /*
>          * Actual free is defered to the SRCU callback
> @@ -401,13 +430,97 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = {
>         .free                   = arm_smmu_sva_domain_free
>  };
>
> +static int arm_smmu_share_asid(struct arm_smmu_device *smmu,
> +                              struct arm_smmu_domain *smmu_domain,
> +                              struct mm_struct *mm)
> +{
> +       struct arm_smmu_domain *old_s1_domain;
> +       int ret;
> +
> +       /*
> +        * Notice that BTM is never currently enabled, this is all dead code.
> +        * The specification cautions:
> +        *
> +        * Note: Arm expects that SMMU stage 2 address spaces are generally
> +        * shared with their respective PE virtual machine stage 2
> +        * configuration. If broadcast invalidation is required to be avoided
> +        * for a particular SMMU stage 2 address space, Arm recommends that a
> +        * hypervisor configures the STE with a VMID that is not allocated for
> +        * virtual machine use on the PEs
> +        *
> +        * However, in Linux, both KVM and SMMU think they own the VMID pool.
> +        * Unfortunately the ARM design is problematic for Linux as we do not
> +        * currently share the S2 table with KVM. This creates a situation where
> +        * the S2 needs to have the same VMID as KVM in order to allow the guest
> +        * to use BTM, however we must still invalidate the S2 directly since it
> +        * is a different radix tree. What Linux would like is something like
> +        * ASET for the STE to disable BTM only for the S2.
> +        *
> +        * Arguably in a system with BTM the driver should prefer to use a S1
> +        * table in all cases execpt when explicitly asked to create a nesting
> +        * parent. Then it should use the VMID of KVM to enable BTM in the
> +        * guest. We cannot optimize away the resulting double invalidation of
> +        * the S2 :( Or we simply ignore BTM entirely as we are doing now.
> +        */
> +       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
> +               return xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid,
> +                               smmu_domain,
> +                               XA_LIMIT(1, (1 << smmu->asid_bits) - 1),
> +                               GFP_KERNEL);
> +
> +       /* At this point the caller ensures we have a mmget() */
> +       smmu_domain->cd.asid = arm64_mm_context_get(mm);
> +
> +       mutex_lock(&smmu->asid_lock);
> +       old_s1_domain = xa_store(&smmu->asid_map, smmu_domain->cd.asid,
> +                                smmu_domain, GFP_KERNEL);
> +       if (xa_err(old_s1_domain)) {
> +               ret = xa_err(old_s1_domain);
> +               goto out_put_asid;
> +       }
> +
> +       /*
> +        * In BTM mode the CPU ASID and the IOMMU ASID have to be the same.
> +        * Unfortunately we run separate allocators for this and the IOMMU
> +        * ASID can already have been assigned to a S1 domain. SVA domains
> +        * always align to their CPU ASIDs. In this case we change
> +        * the S1 domain's ASID, update the CD entry and flush the caches.
> +        *
> +        * This is a bit tricky, all the places writing to a S1 CD, reading the
> +        * S1 ASID, or doing xa_erase must hold the asid_lock or xa_lock to
> +        * avoid IOTLB incoherence.
> +        */
> +       if (old_s1_domain) {
> +               if (WARN_ON(old_s1_domain->domain.type == IOMMU_DOMAIN_SVA)) {
> +                       ret = -EINVAL;
> +                       goto out_restore_s1;
> +               }
> +               ret = arm_smmu_realloc_s1_domain_asid(smmu, old_s1_domain);
> +               if (ret)
> +                       goto out_restore_s1;
> +       }
> +
> +       smmu_domain->btm_invalidation = true;
> +
> +       ret = 0;
> +       goto out_unlock;
> +
> +out_restore_s1:
> +       xa_store(&smmu->asid_map, smmu_domain->cd.asid, old_s1_domain,
> +                GFP_KERNEL);
> +out_put_asid:
> +       arm64_mm_context_put(mm);
> +out_unlock:
> +       mutex_unlock(&smmu->asid_lock);
> +       return ret;
> +}
> +
>  struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>                                                struct mm_struct *mm)
>  {
>         struct arm_smmu_master *master = dev_iommu_priv_get(dev);
>         struct arm_smmu_device *smmu = master->smmu;
>         struct arm_smmu_domain *smmu_domain;
> -       u32 asid;
>         int ret;
>
>         smmu_domain = arm_smmu_domain_alloc();
> @@ -418,12 +531,10 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>         smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
>         smmu_domain->smmu = smmu;
>
> -       ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
> -                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> +       ret = arm_smmu_share_asid(smmu, smmu_domain, mm);
>         if (ret)
>                 goto err_free;
>
> -       smmu_domain->cd.asid = asid;
>         smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
>         ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm);
>         if (ret)
> @@ -433,6 +544,8 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
>
>  err_asid:
>         arm_smmu_domain_free_id(smmu_domain);
> +       if (smmu_domain->btm_invalidation)
> +               arm64_mm_context_put(mm);
>  err_free:
>         kfree(smmu_domain);
>         return ERR_PTR(ret);
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 1a72dd63e0ca14..01737a0fc82828 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1314,6 +1314,8 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
>         typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr =
>                 &pgtbl_cfg->arm_lpae_s1_cfg.tcr;
>
> +       lockdep_assert_held(&master->smmu->asid_lock);
> +
>         memset(target, 0, sizeof(*target));
>
>         target->data[0] = cpu_to_le64(
> @@ -2089,7 +2091,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>          * careful, 007.
>          */
>         if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> -               arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid);
> +               arm_smmu_tlb_inv_asid(smmu, READ_ONCE(smmu_domain->cd.asid));
>         } else {
>                 cmd.opcode      = CMDQ_OP_TLBI_S12_VMALL;
>                 cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
> @@ -2334,17 +2336,10 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
>  static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu,
>                                        struct arm_smmu_domain *smmu_domain)
>  {
> -       int ret;
> -       u32 asid;
>         struct arm_smmu_ctx_desc *cd = &smmu_domain->cd;
>
> -       /* Prevent SVA from modifying the ASID until it is written to the CD */
> -       mutex_lock(&smmu->asid_lock);
> -       ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
> -                      XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> -       cd->asid        = (u16)asid;
> -       mutex_unlock(&smmu->asid_lock);
> -       return ret;
> +       return xa_alloc(&smmu->asid_map, &cd->asid, smmu_domain,
> +                       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
>  }
>
>  static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu,
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 6becdbae905598..a23593f1830106 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -586,7 +586,7 @@ struct arm_smmu_strtab_l1_desc {
>  };
>
>  struct arm_smmu_ctx_desc {
> -       u16                             asid;
> +       u32                             asid;
>  };
>
>  struct arm_smmu_l1_ctx_desc {
> --
> 2.43.2
>
Jason Gunthorpe March 20, 2024, 1:05 p.m. UTC | #2
On Wed, Mar 20, 2024 at 01:07:01AM +0800, Michael Shavit wrote:
> On Tue, Mar 5, 2024 at 7:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > BTM support is a feature where the CPU TLB invalidation can be forwarded
> > to the IOMMU and also invalidate the IOTLB. For this to work the CPU and
> > IOMMU ASID must be the same.
> >
> > Retain the prior SVA design here of keeping the ASID allocator for the
> > IOMMU private to SMMU and force SVA domains to set an ASID that matches
> > the CPU ASID.
> >
> > This requires changing the ASID assigned to a S1 domain if it happens to
> > be overlapping with the required CPU ASID. We hold on to the CPU ASID so
> > long as the SVA iommu_domain exists, so SVA domain conflict is not
> > possible.
> >
> > With the asid per-smmu we no longer have a problem that two per-smmu
> > iommu_domain's would need to share a CPU ASID entry in the IOMMU's xarray.
> >
> > Use the same ASID move algorithm for the S1 domains as before with some
> > streamlining around how the xarray is being used. Do not synchronize the
> > ASID's if BTM mode is not supported. Just leave BTM features off
> > everywhere.
> >
> > Audit all the places that touch cd->asid and think carefully about how the
> > locking works with the change to the cd->asid by the move algorithm. Use
> > xarray internal locking during xa_alloc() instead of double locking. Add a
> > note that concurrent S1 invalidation doesn't fully work.
> >
> > Note that this is all still dead code, ARM_SMMU_FEAT_BTM is never set.
> 
> Clearly a lot of work/thought has gone into this patch, but I have to
> wonder if this complexity is worth carrying forward for a feature that
> was never enabled... Is moving this to part 3 or dropping it all
> together an option?

Yes, we can do any of those. If we drop it here then it will be moved
to this series:

https://lore.kernel.org/linux-iommu/20240208151837.35068-1-shameerali.kolothum.thodi@huawei.com/

Which is the final bit of work to actually enable the feature. We are
getting quite close, it needs some of Nicolin's iommufd enablement
work to cross the finish line, I think.

I agree dead code like this should not be in the kernel..

Will? Please advise

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 3acd699433b7d8..0b5aeaa3a85575 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -15,12 +15,33 @@ 
 
 static DEFINE_MUTEX(sva_lock);
 
-static void __maybe_unused
-arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
+static int arm_smmu_realloc_s1_domain_asid(struct arm_smmu_device *smmu,
+					   struct arm_smmu_domain *smmu_domain)
 {
 	struct arm_smmu_master_domain *master_domain;
+	u32 old_asid = smmu_domain->cd.asid;
 	struct arm_smmu_cd target_cd;
 	unsigned long flags;
+	int ret;
+
+	lockdep_assert_held(&smmu->asid_lock);
+
+	/*
+	 * FIXME: The unmap and invalidation path doesn't take any locks but
+	 * this is not fully safe. Since updating the CD tables is not atomic
+	 * there is always a hole where invalidating only one ASID of two active
+	 * ASIDs during unmap will cause the IOTLB to become stale.
+	 *
+	 * This approach is to hopefully shift the racing CPUs to the new ASID
+	 * before we start programming the HW. This increases the chance that
+	 * racing IOPTE changes will pick up an invalidation for the new ASID
+	 * and we achieve eventual consistency. For the brief period where the
+	 * old ASID is still in the CD entries it will become incoherent.
+	 */
+	ret = xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid, smmu_domain,
+		       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+	if (ret)
+		return ret;
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) {
@@ -36,6 +57,10 @@  arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
 					&target_cd);
 	}
 	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+	/* Clean the ASID we are about to assign to a new translation */
+	arm_smmu_tlb_inv_asid(smmu, old_asid);
+	return 0;
 }
 
 static u64 page_size_to_cd(void)
@@ -148,12 +173,12 @@  static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
 	}
 
 	if (!smmu_domain->btm_invalidation) {
+		ioasid_t asid = READ_ONCE(smmu_domain->cd.asid);
+
 		if (!size)
-			arm_smmu_tlb_inv_asid(smmu_domain->smmu,
-					      smmu_domain->cd.asid);
+			arm_smmu_tlb_inv_asid(smmu_domain->smmu, asid);
 		else
-			arm_smmu_tlb_inv_range_asid(start, size,
-						    smmu_domain->cd.asid,
+			arm_smmu_tlb_inv_range_asid(start, size, asid,
 						    PAGE_SIZE, false,
 						    smmu_domain);
 	}
@@ -182,6 +207,8 @@  static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 		cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid);
 		if (WARN_ON(!cdptr))
 			continue;
+
+		/* An SVA ASID never changes, no asid_lock required */
 		arm_smmu_make_sva_cd(&target, master, NULL,
 				     smmu_domain->cd.asid,
 				     smmu_domain->btm_invalidation);
@@ -388,6 +415,8 @@  static void arm_smmu_sva_domain_free(struct iommu_domain *domain)
 	 * unnecessary invalidation.
 	 */
 	arm_smmu_domain_free_id(smmu_domain);
+	if (smmu_domain->btm_invalidation)
+		arm64_mm_context_put(domain->mm);
 
 	/*
 	 * Actual free is defered to the SRCU callback
@@ -401,13 +430,97 @@  static const struct iommu_domain_ops arm_smmu_sva_domain_ops = {
 	.free			= arm_smmu_sva_domain_free
 };
 
+static int arm_smmu_share_asid(struct arm_smmu_device *smmu,
+			       struct arm_smmu_domain *smmu_domain,
+			       struct mm_struct *mm)
+{
+	struct arm_smmu_domain *old_s1_domain;
+	int ret;
+
+	/*
+	 * Notice that BTM is never currently enabled, this is all dead code.
+	 * The specification cautions:
+	 *
+	 * Note: Arm expects that SMMU stage 2 address spaces are generally
+	 * shared with their respective PE virtual machine stage 2
+	 * configuration. If broadcast invalidation is required to be avoided
+	 * for a particular SMMU stage 2 address space, Arm recommends that a
+	 * hypervisor configures the STE with a VMID that is not allocated for
+	 * virtual machine use on the PEs
+	 *
+	 * However, in Linux, both KVM and SMMU think they own the VMID pool.
+	 * Unfortunately the ARM design is problematic for Linux as we do not
+	 * currently share the S2 table with KVM. This creates a situation where
+	 * the S2 needs to have the same VMID as KVM in order to allow the guest
+	 * to use BTM, however we must still invalidate the S2 directly since it
+	 * is a different radix tree. What Linux would like is something like
+	 * ASET for the STE to disable BTM only for the S2.
+	 *
+	 * Arguably in a system with BTM the driver should prefer to use a S1
+	 * table in all cases execpt when explicitly asked to create a nesting
+	 * parent. Then it should use the VMID of KVM to enable BTM in the
+	 * guest. We cannot optimize away the resulting double invalidation of
+	 * the S2 :( Or we simply ignore BTM entirely as we are doing now.
+	 */
+	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
+		return xa_alloc(&smmu->asid_map, &smmu_domain->cd.asid,
+				smmu_domain,
+				XA_LIMIT(1, (1 << smmu->asid_bits) - 1),
+				GFP_KERNEL);
+
+	/* At this point the caller ensures we have a mmget() */
+	smmu_domain->cd.asid = arm64_mm_context_get(mm);
+
+	mutex_lock(&smmu->asid_lock);
+	old_s1_domain = xa_store(&smmu->asid_map, smmu_domain->cd.asid,
+				 smmu_domain, GFP_KERNEL);
+	if (xa_err(old_s1_domain)) {
+		ret = xa_err(old_s1_domain);
+		goto out_put_asid;
+	}
+
+	/*
+	 * In BTM mode the CPU ASID and the IOMMU ASID have to be the same.
+	 * Unfortunately we run separate allocators for this and the IOMMU
+	 * ASID can already have been assigned to a S1 domain. SVA domains
+	 * always align to their CPU ASIDs. In this case we change
+	 * the S1 domain's ASID, update the CD entry and flush the caches.
+	 *
+	 * This is a bit tricky, all the places writing to a S1 CD, reading the
+	 * S1 ASID, or doing xa_erase must hold the asid_lock or xa_lock to
+	 * avoid IOTLB incoherence.
+	 */
+	if (old_s1_domain) {
+		if (WARN_ON(old_s1_domain->domain.type == IOMMU_DOMAIN_SVA)) {
+			ret = -EINVAL;
+			goto out_restore_s1;
+		}
+		ret = arm_smmu_realloc_s1_domain_asid(smmu, old_s1_domain);
+		if (ret)
+			goto out_restore_s1;
+	}
+
+	smmu_domain->btm_invalidation = true;
+
+	ret = 0;
+	goto out_unlock;
+
+out_restore_s1:
+	xa_store(&smmu->asid_map, smmu_domain->cd.asid, old_s1_domain,
+		 GFP_KERNEL);
+out_put_asid:
+	arm64_mm_context_put(mm);
+out_unlock:
+	mutex_unlock(&smmu->asid_lock);
+	return ret;
+}
+
 struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
 					       struct mm_struct *mm)
 {
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	struct arm_smmu_device *smmu = master->smmu;
 	struct arm_smmu_domain *smmu_domain;
-	u32 asid;
 	int ret;
 
 	smmu_domain = arm_smmu_domain_alloc();
@@ -418,12 +531,10 @@  struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
 	smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
 	smmu_domain->smmu = smmu;
 
-	ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
-		       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+	ret = arm_smmu_share_asid(smmu, smmu_domain, mm);
 	if (ret)
 		goto err_free;
 
-	smmu_domain->cd.asid = asid;
 	smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
 	ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm);
 	if (ret)
@@ -433,6 +544,8 @@  struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
 
 err_asid:
 	arm_smmu_domain_free_id(smmu_domain);
+	if (smmu_domain->btm_invalidation)
+		arm64_mm_context_put(mm);
 err_free:
 	kfree(smmu_domain);
 	return ERR_PTR(ret);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 1a72dd63e0ca14..01737a0fc82828 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1314,6 +1314,8 @@  void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
 	typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr =
 		&pgtbl_cfg->arm_lpae_s1_cfg.tcr;
 
+	lockdep_assert_held(&master->smmu->asid_lock);
+
 	memset(target, 0, sizeof(*target));
 
 	target->data[0] = cpu_to_le64(
@@ -2089,7 +2091,7 @@  static void arm_smmu_tlb_inv_context(void *cookie)
 	 * careful, 007.
 	 */
 	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid);
+		arm_smmu_tlb_inv_asid(smmu, READ_ONCE(smmu_domain->cd.asid));
 	} else {
 		cmd.opcode	= CMDQ_OP_TLBI_S12_VMALL;
 		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
@@ -2334,17 +2336,10 @@  static void arm_smmu_domain_free(struct iommu_domain *domain)
 static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu,
 				       struct arm_smmu_domain *smmu_domain)
 {
-	int ret;
-	u32 asid;
 	struct arm_smmu_ctx_desc *cd = &smmu_domain->cd;
 
-	/* Prevent SVA from modifying the ASID until it is written to the CD */
-	mutex_lock(&smmu->asid_lock);
-	ret = xa_alloc(&smmu->asid_map, &asid, smmu_domain,
-		       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
-	cd->asid	= (u16)asid;
-	mutex_unlock(&smmu->asid_lock);
-	return ret;
+	return xa_alloc(&smmu->asid_map, &cd->asid, smmu_domain,
+			XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
 }
 
 static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 6becdbae905598..a23593f1830106 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -586,7 +586,7 @@  struct arm_smmu_strtab_l1_desc {
 };
 
 struct arm_smmu_ctx_desc {
-	u16				asid;
+	u32				asid;
 };
 
 struct arm_smmu_l1_ctx_desc {