diff mbox series

[v5,6/9] iommu/arm-smmu-v3: Move CD table to arm_smmu_master

Message ID 20230809011204.v5.6.Ice063dcf87d1b777a72e008d9e3406d2bcf6d876@changeid (mailing list archive)
State New, archived
Headers show
Series Refactor the SMMU's CD table ownership | expand

Commit Message

Michael Shavit Aug. 8, 2023, 5:12 p.m. UTC
With this change, each master will now own its own CD table instead of
sharing one with other masters attached to the same domain. Attaching a
stage 1 domain installs CD entries into the master's CD table. SVA
writes its CD entries into each master's CD table if the domain is
shared across masters.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Michael Shavit <mshavit@google.com>
---

Changes in v5:
- Clear the 0th CD entry when the domain is detached. Not clearing it
  caused a bug in arm_smmu_write_ctx_desc which doesn't expect the entry
  to already be set.

Changes in v4:
- Added comment about the cd_table's dependency on the iommu core's
  group mutex.
- Narrowed the range of code for which the domain's init_mutex is held
  on attach since it now only protects the arm_smmu_domain_finalise
  call.

Changes in v2:
- Allocate CD table when it's first needed instead of on probe.

Changes in v1:
- The master's CD table allocation was previously split to a different
  commit. This change now atomically allocates the new CD table, uses
  it, and removes the old one.

 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 ++++++++++-----------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  7 +-
 2 files changed, 49 insertions(+), 50 deletions(-)

Comments

Will Deacon Aug. 9, 2023, 1:50 p.m. UTC | #1
On Wed, Aug 09, 2023 at 01:12:02AM +0800, Michael Shavit wrote:
> @@ -2203,7 +2186,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
>  		ias = min_t(unsigned long, ias, VA_BITS);
>  		oas = smmu->ias;
>  		fmt = ARM_64_LPAE_S1;
> -		finalise_stage_fn = arm_smmu_domain_finalise_s1;
> +		finalise_stage_fn = arm_smmu_domain_finalise_cd;

Why is this a better name? Now we have inconsistency with
arm_smmu_domain_finalise_s2().

>  		break;
>  	case ARM_SMMU_DOMAIN_NESTED:
>  	case ARM_SMMU_DOMAIN_S2:
> @@ -2402,6 +2385,16 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master)
>  	master->domain = NULL;
>  	master->ats_enabled = false;
>  	arm_smmu_install_ste_for_dev(master);
> +	/*
> +	 * The table is uninstalled before clearing the CD to prevent an
> +	 * unnecessary sync in arm_smmu_write_ctx_desc. Although clearing the
> +	 * CD entry isn't strictly required to detach the domain since the
> +	 * table is uninstalled anyway, it's more proper and helps avoid
> +	 * confusion in the call to arm_smmu_write_ctx_desc on the next attach

You can remove the "it's more proper" part.

> +	 * (which expects the entry to be empty).
> +	 */
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab)
> +		arm_smmu_write_ctx_desc(master, 0, NULL);
>  }
>  
>  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
> @@ -2436,22 +2429,14 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>  	if (!smmu_domain->smmu) {
>  		smmu_domain->smmu = smmu;
>  		ret = arm_smmu_domain_finalise(domain, master);
> -		if (ret) {
> +		if (ret)
>  			smmu_domain->smmu = NULL;
> -			goto out_unlock;
> -		}
> -	} else if (smmu_domain->smmu != smmu) {
> -		ret = -EINVAL;
> -		goto out_unlock;
> -	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
> -		   master->ssid_bits != smmu_domain->cd_table.max_cds_bits) {
> +	} else if (smmu_domain->smmu != smmu)
>  		ret = -EINVAL;
> -		goto out_unlock;
> -	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
> -		   smmu_domain->cd_table.stall_enabled != master->stall_enabled) {
> -		ret = -EINVAL;
> -		goto out_unlock;
> -	}

Removing these checks on the domain is pretty nice.

> @@ -2465,6 +2450,22 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>  	if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
>  		master->ats_enabled = arm_smmu_ats_supported(master);
>  
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> +		if (!master->cd_table.cdtab) {
> +			ret = arm_smmu_alloc_cd_tables(master);
> +			if (ret) {
> +				master->domain = NULL;
> +				return ret;
> +			}
> +		}
> +
> +		ret = arm_smmu_write_ctx_desc(master, 0, &smmu_domain->cd);
> +		if (ret) {
> +			master->domain = NULL;
> +			return ret;

Can you leak the cd tables here if you just allocated them?

> @@ -2472,10 +2473,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>  	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  
>  	arm_smmu_enable_ats(master);
> -
> -out_unlock:
> -	mutex_unlock(&smmu_domain->init_mutex);
> -	return ret;
> +	return 0;
>  }
>  
>  static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
> @@ -2719,6 +2717,8 @@ static void arm_smmu_release_device(struct device *dev)
>  	arm_smmu_detach_dev(master);
>  	arm_smmu_disable_pasid(master);
>  	arm_smmu_remove_master(master);
> +	if (master->cd_table.cdtab_dma)

Why are you checking 'cdtab_dma' here instead of just 'cdtab'?

Will
Michael Shavit Aug. 10, 2023, 9:23 a.m. UTC | #2
>
> > @@ -2465,6 +2450,22 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
> >       if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
> >               master->ats_enabled = arm_smmu_ats_supported(master);
> >
> > +     if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> > +             if (!master->cd_table.cdtab) {
> > +                     ret = arm_smmu_alloc_cd_tables(master);
> > +                     if (ret) {
> > +                             master->domain = NULL;
> > +                             return ret;
> > +                     }
> > +             }
> > +
> > +             ret = arm_smmu_write_ctx_desc(master, 0, &smmu_domain->cd);
> > +             if (ret) {
> > +                     master->domain = NULL;
> > +                     return ret;
>
> Can you leak the cd tables here if you just allocated them?

The CD table is only de-allocated when the SMMU device is released, so
this isn't "leaked" any more than on a successful attachment. In a
previous version of this patch, the CD table was even pre-allocated
at probe time, but the allocation was deferred to first attach following
this discussion: https://lore.kernel.org/lkml/ZMOzs1%2FxoEPX2+vA@nvidia.com/

> > @@ -2472,10 +2473,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
> >       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> >
> >       arm_smmu_enable_ats(master);
> > -
> > -out_unlock:
> > -     mutex_unlock(&smmu_domain->init_mutex);
> > -     return ret;
> > +     return 0;
> >  }
> >
> >  static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
> > @@ -2719,6 +2717,8 @@ static void arm_smmu_release_device(struct device *dev)
> >       arm_smmu_detach_dev(master);
> >       arm_smmu_disable_pasid(master);
> >       arm_smmu_remove_master(master);
> > +     if (master->cd_table.cdtab_dma)
>
> Why are you checking 'cdtab_dma' here instead of just 'cdtab'?

cd_table is statically allocated as part of the arm_smmu_master
struct. I suppose it could be allocated by arm_smmu_alloc_cd_tables()
instead?
Michael Shavit Aug. 10, 2023, 9:45 a.m. UTC | #3
On Wed, Aug 9, 2023 at 9:50 PM Will Deacon <will@kernel.org> wrote:
>
> On Wed, Aug 09, 2023 at 01:12:02AM +0800, Michael Shavit wrote:
> > @@ -2203,7 +2186,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
> >               ias = min_t(unsigned long, ias, VA_BITS);
> >               oas = smmu->ias;
> >               fmt = ARM_64_LPAE_S1;
> > -             finalise_stage_fn = arm_smmu_domain_finalise_s1;
> > +             finalise_stage_fn = arm_smmu_domain_finalise_cd;
>
> Why is this a better name? Now we have inconsistency with
> arm_smmu_domain_finalise_s2().

There was a time when s1cfg represented the entire STE and carried
the entire cd table. We've gotten rid of s1cfg, and now only store
arm_smmu_ctx_desc in the arm_smmu_domain for stage 1 domains.
arm_smmu_domain_finalise_cd is IMO clearer, especially given the
historical baggage around `s1`.
Will Deacon Aug. 10, 2023, 2:34 p.m. UTC | #4
On Thu, Aug 10, 2023 at 05:45:03PM +0800, Michael Shavit wrote:
> On Wed, Aug 9, 2023 at 9:50 PM Will Deacon <will@kernel.org> wrote:
> >
> > On Wed, Aug 09, 2023 at 01:12:02AM +0800, Michael Shavit wrote:
> > > @@ -2203,7 +2186,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
> > >               ias = min_t(unsigned long, ias, VA_BITS);
> > >               oas = smmu->ias;
> > >               fmt = ARM_64_LPAE_S1;
> > > -             finalise_stage_fn = arm_smmu_domain_finalise_s1;
> > > +             finalise_stage_fn = arm_smmu_domain_finalise_cd;
> >
> > Why is this a better name? Now we have inconsistency with
> > arm_smmu_domain_finalise_s2().
> 
> There was a time where s1cfg represented the entire STE and carried
> the entire cd table. We've gotten rid of s1cfg, and now only store
> arm_smmu_ctx_desc in the arm_smmu_domain for stage 1 domains.
> arm_smmu_domain_finalise_cd is IMO more clear, especially given the
> historical baggage around `s1`.

Ok, but it's the inconsistency I object to. I don't think it's clear at
all to have arm_smmu_domain_finalise_cd() and arm_smmu_domain_finalise_s2().

The easiest thing is to leave it as-is.

Will
Will Deacon Aug. 10, 2023, 2:38 p.m. UTC | #5
On Thu, Aug 10, 2023 at 05:23:37PM +0800, Michael Shavit wrote:
> >
> > > @@ -2465,6 +2450,22 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
> > >       if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
> > >               master->ats_enabled = arm_smmu_ats_supported(master);
> > >
> > > +     if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> > > +             if (!master->cd_table.cdtab) {
> > > +                     ret = arm_smmu_alloc_cd_tables(master);
> > > +                     if (ret) {
> > > +                             master->domain = NULL;
> > > +                             return ret;
> > > +                     }
> > > +             }
> > > +
> > > +             ret = arm_smmu_write_ctx_desc(master, 0, &smmu_domain->cd);
> > > +             if (ret) {
> > > +                     master->domain = NULL;
> > > +                     return ret;
> >
> > Can you leak the cd tables here if you just allocated them?
> 
> The CD table is only de-allocated when the SMMU device is released, so
> this isn't "leaked" anymore than on a successful attachment. In a
> previous version of this patch, this CD table was even pre-allocated
> at probe time but is deferred to first attach following this
> discussion: https://lore.kernel.org/lkml/ZMOzs1%2FxoEPX2+vA@nvidia.com/

Thanks, that makes sense.

> > > @@ -2472,10 +2473,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
> > >       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> > >
> > >       arm_smmu_enable_ats(master);
> > > -
> > > -out_unlock:
> > > -     mutex_unlock(&smmu_domain->init_mutex);
> > > -     return ret;
> > > +     return 0;
> > >  }
> > >
> > >  static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
> > > @@ -2719,6 +2717,8 @@ static void arm_smmu_release_device(struct device *dev)
> > >       arm_smmu_detach_dev(master);
> > >       arm_smmu_disable_pasid(master);
> > >       arm_smmu_remove_master(master);
> > > +     if (master->cd_table.cdtab_dma)
> >
> > Why are you checking 'cdtab_dma' here instead of just 'cdtab'?
> 
> cd_table is statically allocated as part of the arm_smmu_master
> struct. I suppose it could be allocated by arm_smmu_alloc_cd_tables()
> instead?

I just mean you could check 'master->cd_table.cdtab' like you do in other
places. The DMA pointer is supposed to be opaque.

Will
Jason Gunthorpe Aug. 10, 2023, 2:56 p.m. UTC | #6
On Thu, Aug 10, 2023 at 03:34:49PM +0100, Will Deacon wrote:
> On Thu, Aug 10, 2023 at 05:45:03PM +0800, Michael Shavit wrote:
> > On Wed, Aug 9, 2023 at 9:50 PM Will Deacon <will@kernel.org> wrote:
> > >
> > > On Wed, Aug 09, 2023 at 01:12:02AM +0800, Michael Shavit wrote:
> > > > @@ -2203,7 +2186,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
> > > >               ias = min_t(unsigned long, ias, VA_BITS);
> > > >               oas = smmu->ias;
> > > >               fmt = ARM_64_LPAE_S1;
> > > > -             finalise_stage_fn = arm_smmu_domain_finalise_s1;
> > > > +             finalise_stage_fn = arm_smmu_domain_finalise_cd;
> > >
> > > Why is this a better name? Now we have inconsistency with
> > > arm_smmu_domain_finalise_s2().
> > 
> > There was a time where s1cfg represented the entire STE and carried
> > the entire cd table. We've gotten rid of s1cfg, and now only store
> > arm_smmu_ctx_desc in the arm_smmu_domain for stage 1 domains.
> > arm_smmu_domain_finalise_cd is IMO more clear, especially given the
> > historical baggage around `s1`.
> 
> Ok, but it's the inconsistency I object to. I don't think it's clear at
> all to have arm_smmu_domain_finalise_cd() and arm_smmu_domain_finalise_s2().
> 
> The easiest thing is to leave it as-is.

Well the names have become wonky.

arm_smmu_domain_finalise_cd() is filling in the struct
arm_smmu_ctx_desc which is mostly the precomputed value for the CD
table entry, which is mostly redundant copies of the values of the
underlying pgtbl_cfg :\

But I agree keeping it named s1 is more consistent with the naming I
think we should use - domains are called S1 or S2 domains depending on
their IOPTE format. 

But arm_smmu_domain_assign_asid/vmid is a generally clearer name for
both :\

Jason
Michael Shavit Aug. 15, 2023, 12:10 p.m. UTC | #7
On Wed, Aug 9, 2023 at 1:15 AM Michael Shavit <mshavit@google.com> wrote:
>

> -static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
> +static int arm_smmu_domain_finalise_cd(struct arm_smmu_domain *smmu_domain,
>                                        struct arm_smmu_master *master,
>                                        struct io_pgtable_cfg *pgtbl_cfg)
>  {
> @@ -2115,10 +2110,6 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
>         if (ret)
>                 goto out_unlock;
>
> -       ret = arm_smmu_alloc_cd_tables(smmu_domain, master);
> -       if (ret)
> -               goto out_free_asid;
> -
>         cd->asid        = (u16)asid;
>         cd->ttbr        = pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
>         cd->tcr         = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) |
> @@ -2130,17 +2121,9 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
>                           CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
>         cd->mair        = pgtbl_cfg->arm_lpae_s1_cfg.mair;
>
> -       ret = arm_smmu_write_ctx_desc(master, 0, cd);
> -       if (ret)
> -               goto out_free_cd_tables;
> -
>         mutex_unlock(&arm_smmu_asid_lock);
>         return 0;
>
> -out_free_cd_tables:
> -       arm_smmu_free_cd_tables(smmu_domain);
> -out_free_asid:
> -       arm_smmu_free_asid(cd);
>  out_unlock:
>         mutex_unlock(&arm_smmu_asid_lock);
>         return ret;
...
> @@ -2465,6 +2450,22 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>         if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
>                 master->ats_enabled = arm_smmu_ats_supported(master);
>
> +       if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
> +               if (!master->cd_table.cdtab) {
> +                       ret = arm_smmu_alloc_cd_tables(master);
> +                       if (ret) {
> +                               master->domain = NULL;
> +                               return ret;
> +                       }
> +               }
> +
> +               ret = arm_smmu_write_ctx_desc(master, 0, &smmu_domain->cd);
> +               if (ret) {
> +                       master->domain = NULL;
> +                       return ret;
> +               }
> +       }
> +
>         arm_smmu_install_ste_for_dev(master);
>
>         spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> @@ -2472,10 +2473,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>         spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>
>         arm_smmu_enable_ats(master);

All this talk of locking on the other thread made me realize there's
an issue here. We are no longer holding the arm_smmu_asid_lock while
arm_smmu_write_ctx_desc is called, because that call was moved out of
arm_smmu_domain_finalise_s1. This can race with arm_smmu_share_asid,
which may modify the asid after we've written the CD, but before we've
updated the smmu_domain->devices list.
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 34bd7815aeb8e..3f32f9a191a5f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1025,7 +1025,7 @@  static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid)
 	unsigned int idx;
 	struct arm_smmu_l1_ctx_desc *l1_desc;
 	struct arm_smmu_device *smmu = master->smmu;
-	struct arm_smmu_ctx_desc_cfg *cdcfg = &master->domain->cd_table;
+	struct arm_smmu_ctx_desc_cfg *cdcfg = &master->cd_table;
 
 	if (!cdcfg->l1_desc)
 		return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS;
@@ -1062,7 +1062,7 @@  int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid,
 	u64 val;
 	bool cd_live;
 	__le64 *cdptr;
-	struct arm_smmu_ctx_desc_cfg *cd_table = &master->domain->cd_table;
+	struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
 
 	if (WARN_ON(ssid >= (1 << cd_table->max_cds_bits)))
 		return -E2BIG;
@@ -1125,14 +1125,13 @@  int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid,
 	return 0;
 }
 
-static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain,
-				    struct arm_smmu_master *master)
+static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master)
 {
 	int ret;
 	size_t l1size;
 	size_t max_contexts;
 	struct arm_smmu_device *smmu = master->smmu;
-	struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->cd_table;
+	struct arm_smmu_ctx_desc_cfg *cdcfg = &master->cd_table;
 
 	cdcfg->stall_enabled = master->stall_enabled;
 	cdcfg->max_cds_bits = master->ssid_bits;
@@ -1174,12 +1173,12 @@  static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain,
 	return ret;
 }
 
-static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain)
+static void arm_smmu_free_cd_tables(struct arm_smmu_master *master)
 {
 	int i;
 	size_t size, l1size;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->cd_table;
+	struct arm_smmu_device *smmu = master->smmu;
+	struct arm_smmu_ctx_desc_cfg *cdcfg = &master->cd_table;
 
 	if (cdcfg->l1_desc) {
 		size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
@@ -1287,7 +1286,7 @@  static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 	if (smmu_domain) {
 		switch (smmu_domain->stage) {
 		case ARM_SMMU_DOMAIN_S1:
-			cd_table = &smmu_domain->cd_table;
+			cd_table = &master->cd_table;
 			break;
 		case ARM_SMMU_DOMAIN_S2:
 		case ARM_SMMU_DOMAIN_NESTED:
@@ -2077,14 +2076,10 @@  static void arm_smmu_domain_free(struct iommu_domain *domain)
 
 	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
 
-	/* Free the CD and ASID, if we allocated them */
+	/* Free the ASID or VMID */
 	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-		struct arm_smmu_ctx_desc_cfg *cd_table = &smmu_domain->cd_table;
-
 		/* Prevent SVA from touching the CD while we're freeing it */
 		mutex_lock(&arm_smmu_asid_lock);
-		if (cd_table->cdtab)
-			arm_smmu_free_cd_tables(smmu_domain);
 		arm_smmu_free_asid(&smmu_domain->cd);
 		mutex_unlock(&arm_smmu_asid_lock);
 	} else {
@@ -2096,7 +2091,7 @@  static void arm_smmu_domain_free(struct iommu_domain *domain)
 	kfree(smmu_domain);
 }
 
-static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
+static int arm_smmu_domain_finalise_cd(struct arm_smmu_domain *smmu_domain,
 				       struct arm_smmu_master *master,
 				       struct io_pgtable_cfg *pgtbl_cfg)
 {
@@ -2115,10 +2110,6 @@  static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
 	if (ret)
 		goto out_unlock;
 
-	ret = arm_smmu_alloc_cd_tables(smmu_domain, master);
-	if (ret)
-		goto out_free_asid;
-
 	cd->asid	= (u16)asid;
 	cd->ttbr	= pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
 	cd->tcr		= FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) |
@@ -2130,17 +2121,9 @@  static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
 			  CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
 	cd->mair	= pgtbl_cfg->arm_lpae_s1_cfg.mair;
 
-	ret = arm_smmu_write_ctx_desc(master, 0, cd);
-	if (ret)
-		goto out_free_cd_tables;
-
 	mutex_unlock(&arm_smmu_asid_lock);
 	return 0;
 
-out_free_cd_tables:
-	arm_smmu_free_cd_tables(smmu_domain);
-out_free_asid:
-	arm_smmu_free_asid(cd);
 out_unlock:
 	mutex_unlock(&arm_smmu_asid_lock);
 	return ret;
@@ -2203,7 +2186,7 @@  static int arm_smmu_domain_finalise(struct iommu_domain *domain,
 		ias = min_t(unsigned long, ias, VA_BITS);
 		oas = smmu->ias;
 		fmt = ARM_64_LPAE_S1;
-		finalise_stage_fn = arm_smmu_domain_finalise_s1;
+		finalise_stage_fn = arm_smmu_domain_finalise_cd;
 		break;
 	case ARM_SMMU_DOMAIN_NESTED:
 	case ARM_SMMU_DOMAIN_S2:
@@ -2402,6 +2385,16 @@  static void arm_smmu_detach_dev(struct arm_smmu_master *master)
 	master->domain = NULL;
 	master->ats_enabled = false;
 	arm_smmu_install_ste_for_dev(master);
+	/*
+	 * The table is uninstalled before clearing the CD to prevent an
+	 * unnecessary sync in arm_smmu_write_ctx_desc. Although clearing the
+	 * CD entry isn't strictly required to detach the domain since the
+	 * table is uninstalled anyway, it's more proper and helps avoid
+	 * confusion in the call to arm_smmu_write_ctx_desc on the next attach
+	 * (which expects the entry to be empty).
+	 */
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab)
+		arm_smmu_write_ctx_desc(master, 0, NULL);
 }
 
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -2436,22 +2429,14 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	if (!smmu_domain->smmu) {
 		smmu_domain->smmu = smmu;
 		ret = arm_smmu_domain_finalise(domain, master);
-		if (ret) {
+		if (ret)
 			smmu_domain->smmu = NULL;
-			goto out_unlock;
-		}
-	} else if (smmu_domain->smmu != smmu) {
-		ret = -EINVAL;
-		goto out_unlock;
-	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
-		   master->ssid_bits != smmu_domain->cd_table.max_cds_bits) {
+	} else if (smmu_domain->smmu != smmu)
 		ret = -EINVAL;
-		goto out_unlock;
-	} else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 &&
-		   smmu_domain->cd_table.stall_enabled != master->stall_enabled) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
+
+	mutex_unlock(&smmu_domain->init_mutex);
+	if (ret)
+		return ret;
 
 	master->domain = smmu_domain;
 
@@ -2465,6 +2450,22 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS)
 		master->ats_enabled = arm_smmu_ats_supported(master);
 
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+		if (!master->cd_table.cdtab) {
+			ret = arm_smmu_alloc_cd_tables(master);
+			if (ret) {
+				master->domain = NULL;
+				return ret;
+			}
+		}
+
+		ret = arm_smmu_write_ctx_desc(master, 0, &smmu_domain->cd);
+		if (ret) {
+			master->domain = NULL;
+			return ret;
+		}
+	}
+
 	arm_smmu_install_ste_for_dev(master);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
@@ -2472,10 +2473,7 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
 	arm_smmu_enable_ats(master);
-
-out_unlock:
-	mutex_unlock(&smmu_domain->init_mutex);
-	return ret;
+	return 0;
 }
 
 static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
@@ -2719,6 +2717,8 @@  static void arm_smmu_release_device(struct device *dev)
 	arm_smmu_detach_dev(master);
 	arm_smmu_disable_pasid(master);
 	arm_smmu_remove_master(master);
+	if (master->cd_table.cdtab_dma)
+		arm_smmu_free_cd_tables(master);
 	kfree(master);
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 6066a09c01996..1f3b370257779 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -694,6 +694,8 @@  struct arm_smmu_master {
 	struct arm_smmu_domain		*domain;
 	struct list_head		domain_head;
 	struct arm_smmu_stream		*streams;
+	/* Locked by the iommu core using the group mutex */
+	struct arm_smmu_ctx_desc_cfg	cd_table;
 	unsigned int			num_streams;
 	bool				ats_enabled;
 	bool				stall_enabled;
@@ -720,11 +722,8 @@  struct arm_smmu_domain {
 
 	enum arm_smmu_domain_stage	stage;
 	union {
-		struct {
 		struct arm_smmu_ctx_desc	cd;
-		struct arm_smmu_ctx_desc_cfg	cd_table;
-		};
-		struct arm_smmu_s2_cfg	s2_cfg;
+		struct arm_smmu_s2_cfg		s2_cfg;
 	};
 
 	struct iommu_domain		domain;