diff mbox series

[v2,08/10] iommu/arm-smmu-v3: Shrink the cdtab l1_desc array

Message ID 8-v2-318ed5f6983b+198f-smmuv3_tidy_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Tidy some minor things in the stream table/cd table area | expand

Commit Message

Jason Gunthorpe June 11, 2024, 12:31 a.m. UTC
The top of the 2 level CD table is (at most) 1024 entries big, and two
high order allocations are required. One of __le64 which is programmed
into the HW (8k) and one of struct arm_smmu_l1_ctx_desc which holds the
CPU pointer (16k).

There are two copies of the l2ptr_dma, one is stored in the struct
arm_smmu_l1_ctx_desc, and another is encoded in the __le64 for the HW to
use. Instead of storing two copies just decode the value from the __le64.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 +++++++++------------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 -
 2 files changed, 17 insertions(+), 25 deletions(-)

Comments

Nicolin Chen June 11, 2024, 3:02 a.m. UTC | #1
On Mon, Jun 10, 2024 at 09:31:17PM -0300, Jason Gunthorpe wrote:
> The top of the 2 level CD table is (at most) 1024 entries big, and two
> high order allocations are required. One of __le64 which is programmed
> into the HW (8k) and one of struct arm_smmu_l1_ctx_desc which holds the
> CPU pointer (16k).
> 
> There are two copies of the l2ptr_dma, one is stored in the struct
> arm_smmu_l1_ctx_desc, and another is encoded in the __le64 for the HW to
> use. Instead of storing two copies just decode the value from the __le64.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Will Deacon July 2, 2024, 6:46 p.m. UTC | #2
On Mon, Jun 10, 2024 at 09:31:17PM -0300, Jason Gunthorpe wrote:
> The top of the 2 level CD table is (at most) 1024 entries big, and two
> high order allocations are required. One of __le64 which is programmed
> into the HW (8k) and one of struct arm_smmu_l1_ctx_desc which holds the
> CPU pointer (16k).
> 
> There are two copies of the l2ptr_dma, one is stored in the struct
> arm_smmu_l1_ctx_desc, and another is encoded in the __le64 for the HW to
> use. Instead of storing two copies just decode the value from the __le64.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 +++++++++------------
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 -
>  2 files changed, 17 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 6245e2558e6a6a..dd65e27aebafd4 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1167,31 +1167,19 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
>  	arm_smmu_cmdq_batch_submit(smmu, &cmds);
>  }
>  
> -static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
> -					struct arm_smmu_l1_ctx_desc *l1_desc)
> +static void arm_smmu_write_cd_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma)
>  {
> -	size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
> -
> -	l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size,
> -					    &l1_desc->l2ptr_dma, GFP_KERNEL);
> -	if (!l1_desc->l2ptr) {
> -		dev_warn(smmu->dev,
> -			 "failed to allocate context descriptor table\n");
> -		return -ENOMEM;
> -	}
> -	return 0;
> -}
> -
> -static void arm_smmu_write_cd_l1_desc(__le64 *dst,
> -				      struct arm_smmu_l1_ctx_desc *l1_desc)
> -{
> -	u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
> -		  CTXDESC_L1_DESC_V;
> +	u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V;
>  
>  	/* The HW has 64 bit atomicity with stores to the L2 CD table */
>  	WRITE_ONCE(*dst, cpu_to_le64(val));
>  }
>  
> +static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src)
> +{
> +	return le64_to_cpu(src) & CTXDESC_L1_DESC_L2PTR_MASK;
> +}

I'm assuming this is supposed to be *src, in which case this could be
accessing non-cacheable memory if the SMMU isn't coherent. That's why we
shadow everything.

If the current code isn't causing you any problems, please can we leave
it as-is?

Will
Jason Gunthorpe July 9, 2024, 7:52 p.m. UTC | #3
On Tue, Jul 02, 2024 at 07:46:07PM +0100, Will Deacon wrote:
> On Mon, Jun 10, 2024 at 09:31:17PM -0300, Jason Gunthorpe wrote:
> > The top of the 2 level CD table is (at most) 1024 entries big, and two
> > high order allocations are required. One of __le64 which is programmed
> > into the HW (8k) and one of struct arm_smmu_l1_ctx_desc which holds the
> > CPU pointer (16k).
> > 
> > There are two copies of the l2ptr_dma, one is stored in the struct
> > arm_smmu_l1_ctx_desc, and another is encoded in the __le64 for the HW to
> > use. Instead of storing two copies just decode the value from the __le64.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 +++++++++------------
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 -
> >  2 files changed, 17 insertions(+), 25 deletions(-)
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index 6245e2558e6a6a..dd65e27aebafd4 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -1167,31 +1167,19 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
> >  	arm_smmu_cmdq_batch_submit(smmu, &cmds);
> >  }
> >  
> > -static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
> > -					struct arm_smmu_l1_ctx_desc *l1_desc)
> > +static void arm_smmu_write_cd_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma)
> >  {
> > -	size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
> > -
> > -	l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size,
> > -					    &l1_desc->l2ptr_dma, GFP_KERNEL);
> > -	if (!l1_desc->l2ptr) {
> > -		dev_warn(smmu->dev,
> > -			 "failed to allocate context descriptor table\n");
> > -		return -ENOMEM;
> > -	}
> > -	return 0;
> > -}
> > -
> > -static void arm_smmu_write_cd_l1_desc(__le64 *dst,
> > -				      struct arm_smmu_l1_ctx_desc *l1_desc)
> > -{
> > -	u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
> > -		  CTXDESC_L1_DESC_V;
> > +	u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V;
> >  
> >  	/* The HW has 64 bit atomicity with stores to the L2 CD table */
> >  	WRITE_ONCE(*dst, cpu_to_le64(val));
> >  }
> >  
> > +static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src)
> > +{
> > +	return le64_to_cpu(src) & CTXDESC_L1_DESC_L2PTR_MASK;
> > +}
> 
> I'm assuming this is supposed to be *src,

Uh, wow, surprised that compiles, yes.

> in which case this could be
> accessing non-cacheable memory if the SMMU isn't coherent. That's why we
> shadow everything.

That is a confusing statement. I know it is a DMA coherent allocation,
but the driver does not avoid reading for DMA coherent memory in all
cases, and DMA coherent allocations are well defined to be readable
anyhow.

For instance the driver has always read back from coherent allocations
when working with the STE or CD:

	strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
				     GFP_KERNEL);
	cfg->strtab.linear = strtab;

static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
                                      __le64 *dst)
{
        u64 val = le64_to_cpu(dst[0]);
                              ^^^^ dst points into strtab above

That's the same thing being done here, with no shadowing, no issue.

I know no reason why the first level cd table would be special that it
needs shadowing? It is just wasting memory. Let's fix it.

Jason
Jason Gunthorpe July 9, 2024, 11:53 p.m. UTC | #4
On Tue, Jul 02, 2024 at 07:46:07PM +0100, Will Deacon wrote:

> > +static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src)
> > +{
> > +	return le64_to_cpu(src) & CTXDESC_L1_DESC_L2PTR_MASK;
> > +}
> 
> I'm assuming this is supposed to be *src, in which case this could
> be

Oh I see now, a few patches later it becomes:

static dma_addr_t arm_smmu_cd_l1_get_desc(const struct arm_smmu_cdtab_l1 *src)
{
	return le64_to_cpu(src->l2ptr) & CTXDESC_L1_DESC_L2PTR_MASK;
}

Which makes it fine, it is just a bisection thing.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 6245e2558e6a6a..dd65e27aebafd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1167,31 +1167,19 @@  static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 	arm_smmu_cmdq_batch_submit(smmu, &cmds);
 }
 
-static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
-					struct arm_smmu_l1_ctx_desc *l1_desc)
+static void arm_smmu_write_cd_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma)
 {
-	size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
-
-	l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size,
-					    &l1_desc->l2ptr_dma, GFP_KERNEL);
-	if (!l1_desc->l2ptr) {
-		dev_warn(smmu->dev,
-			 "failed to allocate context descriptor table\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static void arm_smmu_write_cd_l1_desc(__le64 *dst,
-				      struct arm_smmu_l1_ctx_desc *l1_desc)
-{
-	u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
-		  CTXDESC_L1_DESC_V;
+	u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V;
 
 	/* The HW has 64 bit atomicity with stores to the L2 CD table */
 	WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
+static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src)
+{
+	return le64_to_cpu(src) & CTXDESC_L1_DESC_L2PTR_MASK;
+}
+
 struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master,
 					u32 ssid)
 {
@@ -1231,13 +1219,17 @@  struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
 
 		l1_desc = &cd_table->l1_desc[idx];
 		if (!l1_desc->l2ptr) {
-			__le64 *l1ptr;
+			dma_addr_t l2ptr_dma;
 
-			if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+			l1_desc->l2ptr = dma_alloc_coherent(
+				smmu->dev,
+				CTXDESC_L2_ENTRIES * sizeof(struct arm_smmu_cd),
+				&l2ptr_dma, GFP_KERNEL);
+			if (!l1_desc->l2ptr)
 				return NULL;
 
-			l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
-			arm_smmu_write_cd_l1_desc(l1ptr, l1_desc);
+			arm_smmu_write_cd_l1_desc(&cd_table->cdtab[idx],
+						  l2ptr_dma);
 			/* An invalid L1CD can be cached */
 			arm_smmu_sync_cd(master, ssid, false);
 		}
@@ -1415,7 +1407,8 @@  static void arm_smmu_free_cd_tables(struct arm_smmu_master *master)
 
 			dma_free_coherent(smmu->dev, size,
 					  cd_table->l1_desc[i].l2ptr,
-					  cd_table->l1_desc[i].l2ptr_dma);
+					  arm_smmu_cd_l1_get_desc(
+						  &cd_table->cdtab[i]));
 		}
 		kfree(cd_table->l1_desc);
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 3862f6e65c770e..2f7d70d92b1f31 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -603,7 +603,6 @@  struct arm_smmu_ctx_desc {
 
 struct arm_smmu_l1_ctx_desc {
 	struct arm_smmu_cd		*l2ptr;
-	dma_addr_t			l2ptr_dma;
 };
 
 struct arm_smmu_ctx_desc_cfg {