diff mbox series

[v5,04/27] iommu/arm-smmu-v3: Add an ops indirection to the STE code

Message ID 4-v5-9a37e0c884ce+31e3-smmuv3_newapi_p2_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Update SMMUv3 to the modern iommu API (part 2/3) | expand

Commit Message

Jason Gunthorpe March 4, 2024, 11:43 p.m. UTC
Prepare to put the CD code into the same mechanism. Add an ops indirection
around all the STE specific code and make the worker functions independent
of the entry content being processed.

get_used and sync ops are provided to hook the correct code.

Signed-off-by: Michael Shavit <mshavit@google.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
 1 file changed, 104 insertions(+), 68 deletions(-)

Comments

Michael Shavit March 13, 2024, 11:30 a.m. UTC | #1
On Tue, Mar 5, 2024 at 7:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> Prepare to put the CD code into the same mechanism. Add an ops indirection
> around all the STE specific code and make the worker functions independent
> of the entry content being processed.
>
> get_used and sync ops are provided to hook the correct code.
>
> Signed-off-by: Michael Shavit <mshavit@google.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
>  1 file changed, 104 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index c60b067c1f553e..b7f947e36f596f 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -48,8 +48,20 @@ enum arm_smmu_msi_index {
>         ARM_SMMU_MAX_MSIS,
>  };
>
> -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
> -                                     ioasid_t sid);
> +struct arm_smmu_entry_writer_ops;
> +struct arm_smmu_entry_writer {
> +       const struct arm_smmu_entry_writer_ops *ops;
> +       struct arm_smmu_master *master;
> +};
> +
> +struct arm_smmu_entry_writer_ops {
> +       unsigned int num_entry_qwords;
> +       __le64 v_bit;
> +       void (*get_used)(const __le64 *entry, __le64 *used);
> +       void (*sync)(struct arm_smmu_entry_writer *writer);
> +};
> +
> +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))
>
>  static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
>         [EVTQ_MSI_INDEX] = {
> @@ -982,43 +994,42 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
>   * would be nice if this was complete according to the spec, but minimally it
>   * has to capture the bits this driver uses.
>   */
> -static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
> -                                 struct arm_smmu_ste *used_bits)
> +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
>  {
> -       unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0]));
> +       unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]));
>
> -       used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V);
> -       if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V)))
> +       used_bits[0] = cpu_to_le64(STRTAB_STE_0_V);
> +       if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
>                 return;
>
> -       used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
> +       used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
>
>         /* S1 translates */
>         if (cfg & BIT(0)) {
> -               used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> -                                                 STRTAB_STE_0_S1CTXPTR_MASK |
> -                                                 STRTAB_STE_0_S1CDMAX);
> -               used_bits->data[1] |=
> +               used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> +                                           STRTAB_STE_0_S1CTXPTR_MASK |
> +                                           STRTAB_STE_0_S1CDMAX);
> +               used_bits[1] |=
>                         cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
>                                     STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
>                                     STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW |
>                                     STRTAB_STE_1_EATS);
> -               used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
> +               used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
>         }
>
>         /* S2 translates */
>         if (cfg & BIT(1)) {
> -               used_bits->data[1] |=
> +               used_bits[1] |=
>                         cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
> -               used_bits->data[2] |=
> +               used_bits[2] |=
>                         cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
>                                     STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
>                                     STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
> -               used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
> +               used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
>         }
>
>         if (cfg == STRTAB_STE_0_CFG_BYPASS)
> -               used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
> +               used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
>  }
>
>  /*
> @@ -1027,57 +1038,55 @@ static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
>   * unused_update is an intermediate value of entry that has unused bits set to
>   * their new values.
>   */
> -static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry,
> -                                   const struct arm_smmu_ste *target,
> -                                   struct arm_smmu_ste *unused_update)
> +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
> +                                   const __le64 *entry, const __le64 *target,
> +                                   __le64 *unused_update)
>  {
> -       struct arm_smmu_ste target_used = {};
> -       struct arm_smmu_ste cur_used = {};
> +       __le64 target_used[NUM_ENTRY_QWORDS] = {};
> +       __le64 cur_used[NUM_ENTRY_QWORDS] = {};
>         u8 used_qword_diff = 0;
>         unsigned int i;
>
> -       arm_smmu_get_ste_used(entry, &cur_used);
> -       arm_smmu_get_ste_used(target, &target_used);
> +       writer->ops->get_used(entry, cur_used);
> +       writer->ops->get_used(target, target_used);
>
> -       for (i = 0; i != ARRAY_SIZE(target_used.data); i++) {
> +       for (i = 0; i != writer->ops->num_entry_qwords; i++) {
>                 /*
>                  * Check that masks are up to date, the make functions are not
>                  * allowed to set a bit to 1 if the used function doesn't say it
>                  * is used.
>                  */
> -               WARN_ON_ONCE(target->data[i] & ~target_used.data[i]);
> +               WARN_ON_ONCE(target[i] & ~target_used[i]);
>
>                 /* Bits can change because they are not currently being used */
> -               unused_update->data[i] = (entry->data[i] & cur_used.data[i]) |
> -                                        (target->data[i] & ~cur_used.data[i]);
> +               unused_update[i] = (entry[i] & cur_used[i]) |
> +                                  (target[i] & ~cur_used[i]);
>                 /*
>                  * Each bit indicates that a used bit in a qword needs to be
>                  * changed after unused_update is applied.
>                  */
> -               if ((unused_update->data[i] & target_used.data[i]) !=
> -                   target->data[i])
> +               if ((unused_update[i] & target_used[i]) != target[i])
>                         used_qword_diff |= 1 << i;
>         }
>         return used_qword_diff;
>  }
>
> -static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
> -                     struct arm_smmu_ste *entry,
> -                     const struct arm_smmu_ste *target, unsigned int start,
> +static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
> +                     const __le64 *target, unsigned int start,
>                       unsigned int len)
>  {
>         bool changed = false;
>         unsigned int i;
>
>         for (i = start; len != 0; len--, i++) {
> -               if (entry->data[i] != target->data[i]) {
> -                       WRITE_ONCE(entry->data[i], target->data[i]);
> +               if (entry[i] != target[i]) {
> +                       WRITE_ONCE(entry[i], target[i]);
>                         changed = true;
>                 }
>         }
>
>         if (changed)
> -               arm_smmu_sync_ste_for_sid(smmu, sid);
> +               writer->ops->sync(writer);
>         return changed;
>  }
>
> @@ -1107,17 +1116,15 @@ static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
>   * V=0 process. This relies on the IGNORED behavior described in the
>   * specification.
>   */
> -static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
> -                              struct arm_smmu_ste *entry,
> -                              const struct arm_smmu_ste *target)
> +static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer,
> +                                __le64 *entry, const __le64 *target)
>  {
> -       unsigned int num_entry_qwords = ARRAY_SIZE(target->data);
> -       struct arm_smmu_device *smmu = master->smmu;
> -       struct arm_smmu_ste unused_update;
> +       unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
> +       __le64 unused_update[NUM_ENTRY_QWORDS];
>         u8 used_qword_diff;
>
>         used_qword_diff =
> -               arm_smmu_entry_qword_diff(entry, target, &unused_update);
> +               arm_smmu_entry_qword_diff(writer, entry, target, unused_update);
>         if (hweight8(used_qword_diff) == 1) {
>                 /*
>                  * Only one qword needs its used bits to be changed. This is a
> @@ -1133,22 +1140,21 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
>                  * writing it in the next step anyways. This can save a sync
>                  * when the only change is in that qword.
>                  */
> -               unused_update.data[critical_qword_index] =
> -                       entry->data[critical_qword_index];
> -               entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords);
> -               entry_set(smmu, sid, entry, target, critical_qword_index, 1);
> -               entry_set(smmu, sid, entry, target, 0, num_entry_qwords);
> +               unused_update[critical_qword_index] =
> +                       entry[critical_qword_index];
> +               entry_set(writer, entry, unused_update, 0, num_entry_qwords);
> +               entry_set(writer, entry, target, critical_qword_index, 1);
> +               entry_set(writer, entry, target, 0, num_entry_qwords);
>         } else if (used_qword_diff) {
>                 /*
>                  * At least two qwords need their inuse bits to be changed. This
>                  * requires a breaking update, zero the V bit, write all qwords
>                  * but 0, then set qword 0
>                  */
> -               unused_update.data[0] = entry->data[0] &
> -                                       cpu_to_le64(~STRTAB_STE_0_V);
> -               entry_set(smmu, sid, entry, &unused_update, 0, 1);
> -               entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1);
> -               entry_set(smmu, sid, entry, target, 0, 1);
> +               unused_update[0] = entry[0] & (~writer->ops->v_bit);
> +               entry_set(writer, entry, unused_update, 0, 1);
> +               entry_set(writer, entry, target, 1, num_entry_qwords - 1);
> +               entry_set(writer, entry, target, 0, 1);
>         } else {
>                 /*
>                  * No inuse bit changed. Sanity check that all unused bits are 0
> @@ -1156,18 +1162,7 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
>                  * compute_qword_diff().
>                  */
>                 WARN_ON_ONCE(
> -                       entry_set(smmu, sid, entry, target, 0, num_entry_qwords));
> -       }
> -
> -       /* It's likely that we'll want to use the new STE soon */
> -       if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> -               struct arm_smmu_cmdq_ent
> -                       prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> -                                        .prefetch = {
> -                                                .sid = sid,
> -                                        } };
> -
> -               arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +                       entry_set(writer, entry, target, 0, num_entry_qwords));
>         }
>  }
>
> @@ -1440,17 +1435,58 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
>         WRITE_ONCE(*dst, cpu_to_le64(val));
>  }
>
> -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
> +struct arm_smmu_ste_writer {
> +       struct arm_smmu_entry_writer writer;
> +       u32 sid;
> +};
> +
> +static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
>  {
> +       struct arm_smmu_ste_writer *ste_writer =
> +               container_of(writer, struct arm_smmu_ste_writer, writer);
>         struct arm_smmu_cmdq_ent cmd = {
>                 .opcode = CMDQ_OP_CFGI_STE,
>                 .cfgi   = {
> -                       .sid    = sid,
> +                       .sid    = ste_writer->sid,
>                         .leaf   = true,
>                 },
>         };
>
> -       arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> +       arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd);
> +}
> +
> +static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
> +       .sync = arm_smmu_ste_writer_sync_entry,
> +       .get_used = arm_smmu_get_ste_used,
> +       .v_bit = cpu_to_le64(STRTAB_STE_0_V),
> +       .num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
> +};
> +
> +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
> +                              struct arm_smmu_ste *ste,
> +                              const struct arm_smmu_ste *target)
> +{
> +       struct arm_smmu_device *smmu = master->smmu;
> +       struct arm_smmu_ste_writer ste_writer = {
> +               .writer = {
> +                       .ops = &arm_smmu_ste_writer_ops,
> +                       .master = master,
> +               },
> +               .sid = sid,
> +       };
> +
> +       arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data);
> +
> +       /* It's likely that we'll want to use the new STE soon */
> +       if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> +               struct arm_smmu_cmdq_ent
> +                       prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> +                                        .prefetch = {
> +                                                .sid = sid,
> +                                        } };
> +
> +               arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +       }
>  }
>
>  static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)
> --
> 2.43.2
>

Reviewed-by: Michael Shavit <mshavit@google.com>
Nicolin Chen March 15, 2024, 4:22 a.m. UTC | #2
Hi Michael/Jason,

On Mon, Mar 04, 2024 at 07:43:52PM -0400, Jason Gunthorpe wrote:
> Prepare to put the CD code into the same mechanism. Add an ops indirection
> around all the STE specific code and make the worker functions independent
> of the entry content being processed.
> 
> get_used and sync ops are provided to hook the correct code.
> 
> Signed-off-by: Michael Shavit <mshavit@google.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
>  1 file changed, 104 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index c60b067c1f553e..b7f947e36f596f 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -48,8 +48,20 @@ enum arm_smmu_msi_index {
>  	ARM_SMMU_MAX_MSIS,
>  };
>  
> -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
> -				      ioasid_t sid);
> +struct arm_smmu_entry_writer_ops;
> +struct arm_smmu_entry_writer {
> +	const struct arm_smmu_entry_writer_ops *ops;
> +	struct arm_smmu_master *master;
> +};
> +
> +struct arm_smmu_entry_writer_ops {
> +	unsigned int num_entry_qwords;

I vaguely remember some related discussion, yet can't find it
out. So sorry for questioning this, if it's already discussed.
Aren't CD and STE having the same num_entry_qwords in terms of
their values? Feels like we can just use NUM_ENTRY_QWORDS?

> +	__le64 v_bit;
> +	void (*get_used)(const __le64 *entry, __le64 *used);
> +	void (*sync)(struct arm_smmu_entry_writer *writer);
> +};
> +
> +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))

And this seems to be just a fixed "8"? Since both are defined
straightforwardly:

struct arm_smmu_ste {
	__le64 data[8];
};
...
struct arm_smmu_cd {
	__le64 data[8];
};

Might be a bit nitpicking, yet maybe the other way around?

#define NUM_ENTRY_QWORDS 8
...
struct arm_smmu_ste {
	__le64 data[NUM_ENTRY_QWORDS];
};
...
struct arm_smmu_cd {
	__le64 data[NUM_ENTRY_QWORDS];
};

Thanks
Nicolin
Nicolin Chen March 15, 2024, 5:20 a.m. UTC | #3
On Thu, Mar 14, 2024 at 09:23:00PM -0700, Nicolin Chen wrote:
> Hi Michael/Jason,
> 
> On Mon, Mar 04, 2024 at 07:43:52PM -0400, Jason Gunthorpe wrote:
> > Prepare to put the CD code into the same mechanism. Add an ops indirection
> > around all the STE specific code and make the worker functions independent
> > of the entry content being processed.
> > 
> > get_used and sync ops are provided to hook the correct code.
> > 
> > Signed-off-by: Michael Shavit <mshavit@google.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

With the following trivial comments being sent previously, I've
also retested this version with SVA cases covering two different
S1DSS configurations.

This seems to be the only patch not tagged with Tested-by. So,

Tested-by: Nicolin Chen <nicolinc@nvidia.com>

Thanks
Nicolin

> > ---
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
> >  1 file changed, 104 insertions(+), 68 deletions(-)
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index c60b067c1f553e..b7f947e36f596f 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -48,8 +48,20 @@ enum arm_smmu_msi_index {
> >  	ARM_SMMU_MAX_MSIS,
> >  };
> >  
> > -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
> > -				      ioasid_t sid);
> > +struct arm_smmu_entry_writer_ops;
> > +struct arm_smmu_entry_writer {
> > +	const struct arm_smmu_entry_writer_ops *ops;
> > +	struct arm_smmu_master *master;
> > +};
> > +
> > +struct arm_smmu_entry_writer_ops {
> > +	unsigned int num_entry_qwords;
> 
> I vaguely remember some related discussion, yet can't find it
> out. So sorry for questioning this, if it's already discussed.
> Aren't CD and STE having the same num_entry_qwords in terms of
> their values? Feels like we can just use NUM_ENTRY_QWORDS?
> 
> > +	__le64 v_bit;
> > +	void (*get_used)(const __le64 *entry, __le64 *used);
> > +	void (*sync)(struct arm_smmu_entry_writer *writer);
> > +};
> > +
> > +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))
> 
> And this seems to be just a fixed "8"? Since both are defined
> straightforwardly:
> 
> struct arm_smmu_ste {
> 	__le64 data[8];
> };
> ...
> struct arm_smmu_cd {
> 	__le64 data[8];
> };
> 
> Might be a bit nitpicking, yet maybe the other way around?
> 
> #define NUM_ENTRY_QWORDS 8
> ...
> struct arm_smmu_ste {
> 	__le64 data[NUM_ENTRY_QWORDS];
> };
> ...
> struct arm_smmu_cd {
> 	__le64 data[NUM_ENTRY_QWORDS];
> };
> 
> Thanks
> Nicolin
Jason Gunthorpe March 18, 2024, 6:06 p.m. UTC | #4
On Thu, Mar 14, 2024 at 09:22:48PM -0700, Nicolin Chen wrote:
> Hi Michael/Jason,
> 
> On Mon, Mar 04, 2024 at 07:43:52PM -0400, Jason Gunthorpe wrote:
> > Prepare to put the CD code into the same mechanism. Add an ops indirection
> > around all the STE specific code and make the worker functions independent
> > of the entry content being processed.
> > 
> > get_used and sync ops are provided to hook the correct code.
> > 
> > Signed-off-by: Michael Shavit <mshavit@google.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
> >  1 file changed, 104 insertions(+), 68 deletions(-)
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index c60b067c1f553e..b7f947e36f596f 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -48,8 +48,20 @@ enum arm_smmu_msi_index {
> >  	ARM_SMMU_MAX_MSIS,
> >  };
> >  
> > -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
> > -				      ioasid_t sid);
> > +struct arm_smmu_entry_writer_ops;
> > +struct arm_smmu_entry_writer {
> > +	const struct arm_smmu_entry_writer_ops *ops;
> > +	struct arm_smmu_master *master;
> > +};
> > +
> > +struct arm_smmu_entry_writer_ops {
> > +	unsigned int num_entry_qwords;
> 
> I vaguely remember some related discussion, yet can't find it
> out. So sorry for questioning this, if it's already discussed.
> Aren't CD and STE having the same num_entry_qwords in terms of
> their values? Feels like we can just use NUM_ENTRY_QWORDS?

They do, but the structs should reflect the HW and there is nothing in
the HW description that requires these to be the same.

It is true we go through some contortions to keep the sizes separate,
and it is also true they are unlikely to ever change size.

> > +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))
 
> And this seems to be just a fixed "8"? Since both are defined
> straightforwardly:

Yes, again it should flow from the struct which should reflect the HW
layout. Making them the same size is a SW desire..

Jason
Mostafa Saleh March 22, 2024, 6:14 p.m. UTC | #5
Hi Jason,

On Mon, Mar 04, 2024 at 07:43:52PM -0400, Jason Gunthorpe wrote:
> Prepare to put the CD code into the same mechanism. Add an ops indirection
> around all the STE specific code and make the worker functions independent
> of the entry content being processed.
> 
> get_used and sync ops are provided to hook the correct code.
> 
> Signed-off-by: Michael Shavit <mshavit@google.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 172 ++++++++++++--------
>  1 file changed, 104 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index c60b067c1f553e..b7f947e36f596f 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -48,8 +48,20 @@ enum arm_smmu_msi_index {
>  	ARM_SMMU_MAX_MSIS,
>  };
>  
> -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
> -				      ioasid_t sid);
> +struct arm_smmu_entry_writer_ops;
> +struct arm_smmu_entry_writer {
> +	const struct arm_smmu_entry_writer_ops *ops;
> +	struct arm_smmu_master *master;
> +};
> +
> +struct arm_smmu_entry_writer_ops {
> +	unsigned int num_entry_qwords;
> +	__le64 v_bit;
> +	void (*get_used)(const __le64 *entry, __le64 *used);
> +	void (*sync)(struct arm_smmu_entry_writer *writer);
> +};
> +
> +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))
>
>  static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
>  	[EVTQ_MSI_INDEX] = {
> @@ -982,43 +994,42 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
>   * would be nice if this was complete according to the spec, but minimally it
>   * has to capture the bits this driver uses.
>   */
> -static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
> -				  struct arm_smmu_ste *used_bits)
> +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
>  {
> -	unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0]));
> +	unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]));
>  
> -	used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V);
> -	if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V)))
> +	used_bits[0] = cpu_to_le64(STRTAB_STE_0_V);
> +	if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
>  		return;
>  
> -	used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
> +	used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
>  
>  	/* S1 translates */
>  	if (cfg & BIT(0)) {
> -		used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> -						  STRTAB_STE_0_S1CTXPTR_MASK |
> -						  STRTAB_STE_0_S1CDMAX);
> -		used_bits->data[1] |=
> +		used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
> +					    STRTAB_STE_0_S1CTXPTR_MASK |
> +					    STRTAB_STE_0_S1CDMAX);
> +		used_bits[1] |=
>  			cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
>  				    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
>  				    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW |
>  				    STRTAB_STE_1_EATS);
> -		used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
> +		used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
>  	}
>  
>  	/* S2 translates */
>  	if (cfg & BIT(1)) {
> -		used_bits->data[1] |=
> +		used_bits[1] |=
>  			cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
> -		used_bits->data[2] |=
> +		used_bits[2] |=
>  			cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
>  				    STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
>  				    STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
> -		used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
> +		used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
>  	}
>  
>  	if (cfg == STRTAB_STE_0_CFG_BYPASS)
> -		used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
> +		used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
>  }
>  
>  /*
> @@ -1027,57 +1038,55 @@ static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
>   * unused_update is an intermediate value of entry that has unused bits set to
>   * their new values.
>   */
> -static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry,
> -				    const struct arm_smmu_ste *target,
> -				    struct arm_smmu_ste *unused_update)
> +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
> +				    const __le64 *entry, const __le64 *target,
> +				    __le64 *unused_update)
>  {
> -	struct arm_smmu_ste target_used = {};
> -	struct arm_smmu_ste cur_used = {};
> +	__le64 target_used[NUM_ENTRY_QWORDS] = {};
> +	__le64 cur_used[NUM_ENTRY_QWORDS] = {};
This is confusing to me: the function was modified to be generic, so its
args are __le64 * instead of struct arm_smmu_ste *.
But NUM_ENTRY_QWORDS is defined as “(sizeof(struct arm_smmu_ste) / sizeof(u64))”
and in the same function writer->ops->num_entry_qwords is used interchangeably.
I understand that this is not a constant and the compiler would complain.
But since it fails for any num_entry_qwords larger than NUM_ENTRY_QWORDS,
and we know STEs and CDs both have the same size, we can simplify the code and
make it a constant everywhere.

I see in the next patch that this is redefined to be the max between STE and
CD, but again, this is hardware and it never changes, so my opinion is to
simplify the code, as there is no need to generalize this part.

>  	u8 used_qword_diff = 0;
>  	unsigned int i;
>  
> -	arm_smmu_get_ste_used(entry, &cur_used);
> -	arm_smmu_get_ste_used(target, &target_used);
> +	writer->ops->get_used(entry, cur_used);
> +	writer->ops->get_used(target, target_used);
>  
> -	for (i = 0; i != ARRAY_SIZE(target_used.data); i++) {
> +	for (i = 0; i != writer->ops->num_entry_qwords; i++) {
>  		/*
>  		 * Check that masks are up to date, the make functions are not
>  		 * allowed to set a bit to 1 if the used function doesn't say it
>  		 * is used.
>  		 */
> -		WARN_ON_ONCE(target->data[i] & ~target_used.data[i]);
> +		WARN_ON_ONCE(target[i] & ~target_used[i]);
>  
>  		/* Bits can change because they are not currently being used */
> -		unused_update->data[i] = (entry->data[i] & cur_used.data[i]) |
> -					 (target->data[i] & ~cur_used.data[i]);
> +		unused_update[i] = (entry[i] & cur_used[i]) |
> +				   (target[i] & ~cur_used[i]);
>  		/*
>  		 * Each bit indicates that a used bit in a qword needs to be
>  		 * changed after unused_update is applied.
>  		 */
> -		if ((unused_update->data[i] & target_used.data[i]) !=
> -		    target->data[i])
> +		if ((unused_update[i] & target_used[i]) != target[i])
>  			used_qword_diff |= 1 << i;
>  	}
>  	return used_qword_diff;
>  }
>  
> -static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
> -		      struct arm_smmu_ste *entry,
> -		      const struct arm_smmu_ste *target, unsigned int start,
> +static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
> +		      const __le64 *target, unsigned int start,
>  		      unsigned int len)
>  {
>  	bool changed = false;
>  	unsigned int i;
>  
>  	for (i = start; len != 0; len--, i++) {
> -		if (entry->data[i] != target->data[i]) {
> -			WRITE_ONCE(entry->data[i], target->data[i]);
> +		if (entry[i] != target[i]) {
> +			WRITE_ONCE(entry[i], target[i]);
>  			changed = true;
>  		}
>  	}
>  
>  	if (changed)
> -		arm_smmu_sync_ste_for_sid(smmu, sid);
> +		writer->ops->sync(writer);
>  	return changed;
>  }
>  
> @@ -1107,17 +1116,15 @@ static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
>   * V=0 process. This relies on the IGNORED behavior described in the
>   * specification.
>   */
> -static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
> -			       struct arm_smmu_ste *entry,
> -			       const struct arm_smmu_ste *target)
> +static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer,
> +				 __le64 *entry, const __le64 *target)
>  {
> -	unsigned int num_entry_qwords = ARRAY_SIZE(target->data);
> -	struct arm_smmu_device *smmu = master->smmu;
> -	struct arm_smmu_ste unused_update;
> +	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
> +	__le64 unused_update[NUM_ENTRY_QWORDS];
>  	u8 used_qword_diff;
>  
>  	used_qword_diff =
> -		arm_smmu_entry_qword_diff(entry, target, &unused_update);
> +		arm_smmu_entry_qword_diff(writer, entry, target, unused_update);
>  	if (hweight8(used_qword_diff) == 1) {
>  		/*
>  		 * Only one qword needs its used bits to be changed. This is a
> @@ -1133,22 +1140,21 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
>  		 * writing it in the next step anyways. This can save a sync
>  		 * when the only change is in that qword.
>  		 */
> -		unused_update.data[critical_qword_index] =
> -			entry->data[critical_qword_index];
> -		entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords);
> -		entry_set(smmu, sid, entry, target, critical_qword_index, 1);
> -		entry_set(smmu, sid, entry, target, 0, num_entry_qwords);
> +		unused_update[critical_qword_index] =
> +			entry[critical_qword_index];
> +		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
> +		entry_set(writer, entry, target, critical_qword_index, 1);
> +		entry_set(writer, entry, target, 0, num_entry_qwords);
>  	} else if (used_qword_diff) {
>  		/*
>  		 * At least two qwords need their inuse bits to be changed. This
>  		 * requires a breaking update, zero the V bit, write all qwords
>  		 * but 0, then set qword 0
>  		 */
> -		unused_update.data[0] = entry->data[0] &
> -					cpu_to_le64(~STRTAB_STE_0_V);
> -		entry_set(smmu, sid, entry, &unused_update, 0, 1);
> -		entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1);
> -		entry_set(smmu, sid, entry, target, 0, 1);
> +		unused_update[0] = entry[0] & (~writer->ops->v_bit);
> +		entry_set(writer, entry, unused_update, 0, 1);
> +		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
> +		entry_set(writer, entry, target, 0, 1);
>  	} else {
>  		/*
>  		 * No inuse bit changed. Sanity check that all unused bits are 0
> @@ -1156,18 +1162,7 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
>  		 * compute_qword_diff().
>  		 */
>  		WARN_ON_ONCE(
> -			entry_set(smmu, sid, entry, target, 0, num_entry_qwords));
> -	}
> -
> -	/* It's likely that we'll want to use the new STE soon */
> -	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> -		struct arm_smmu_cmdq_ent
> -			prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> -					 .prefetch = {
> -						 .sid = sid,
> -					 } };
> -
> -		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +			entry_set(writer, entry, target, 0, num_entry_qwords));
>  	}
>  }
>  
> @@ -1440,17 +1435,58 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
>  	WRITE_ONCE(*dst, cpu_to_le64(val));
>  }
>  
> -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
> +struct arm_smmu_ste_writer {
> +	struct arm_smmu_entry_writer writer;
> +	u32 sid;
> +};
> +
> +static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
>  {
> +	struct arm_smmu_ste_writer *ste_writer =
> +		container_of(writer, struct arm_smmu_ste_writer, writer);
>  	struct arm_smmu_cmdq_ent cmd = {
>  		.opcode	= CMDQ_OP_CFGI_STE,
>  		.cfgi	= {
> -			.sid	= sid,
> +			.sid	= ste_writer->sid,
>  			.leaf	= true,
>  		},
>  	};
>  
> -	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> +	arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd);
> +}
> +
> +static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
> +	.sync = arm_smmu_ste_writer_sync_entry,
> +	.get_used = arm_smmu_get_ste_used,
> +	.v_bit = cpu_to_le64(STRTAB_STE_0_V),
> +	.num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
> +};
> +
> +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
> +			       struct arm_smmu_ste *ste,
> +			       const struct arm_smmu_ste *target)
> +{
> +	struct arm_smmu_device *smmu = master->smmu;
> +	struct arm_smmu_ste_writer ste_writer = {
> +		.writer = {
> +			.ops = &arm_smmu_ste_writer_ops,
> +			.master = master,
> +		},
> +		.sid = sid,
> +	};
> +
> +	arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data);
> +
> +	/* It's likely that we'll want to use the new STE soon */
> +	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
> +		struct arm_smmu_cmdq_ent
> +			prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
> +					 .prefetch = {
> +						 .sid = sid,
> +					 } };
> +
> +		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
> +	}
>  }
>  
>  static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)
> -- 
> 2.43.2
>
Thanks,
Mostafa
Jason Gunthorpe March 25, 2024, 2:11 p.m. UTC | #6
On Fri, Mar 22, 2024 at 06:14:24PM +0000, Mostafa Saleh wrote:
> > @@ -1027,57 +1038,55 @@ static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
> >   * unused_update is an intermediate value of entry that has unused bits set to
> >   * their new values.
> >   */
> > -static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry,
> > -				    const struct arm_smmu_ste *target,
> > -				    struct arm_smmu_ste *unused_update)
> > +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
> > +				    const __le64 *entry, const __le64 *target,
> > +				    __le64 *unused_update)
> >  {
> > -	struct arm_smmu_ste target_used = {};
> > -	struct arm_smmu_ste cur_used = {};
> > +	__le64 target_used[NUM_ENTRY_QWORDS] = {};
> > +	__le64 cur_used[NUM_ENTRY_QWORDS] = {};
> This is confusing to me: the function was modified to be generic, so its
> args are __le64 * instead of struct arm_smmu_ste *.

Right

> But NUM_ENTRY_QWORDS is defined as “(sizeof(struct arm_smmu_ste) / sizeof(u64))”
> and in the same function writer->ops->num_entry_qwords is used
> interchangeably,

Right

> I understand that this is not a constant and the compiler would complain.
> But since for any other num_entry_qwords larger than NUM_ENTRY_QWORDS it fails,
> and we know STEs and CDs both have the same size, we can simplify the code and
> make it a constant everywhere.

So you say to get rid of num_entry_qwords and just use the constant?

> I see in the next patch that this is redefined to be the max between STE and
> CD, but again, this is hardware and it never changes, so my opinion is to simplify
> the code, as there is no need to generalize this part.

Yes, we need a constant.

It would look like this, it is a little bit simpler:

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index a54062faccde38..d015f41900d802 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -63,9 +63,9 @@ enum arm_smmu_msi_index {
 	ARM_SMMU_MAX_MSIS,
 };
 
-#define NUM_ENTRY_QWORDS                                                \
-	(max(sizeof(struct arm_smmu_ste), sizeof(struct arm_smmu_cd)) / \
-	 sizeof(u64))
+#define NUM_ENTRY_QWORDS 8
+static_assert(sizeof(struct arm_smmu_ste) == NUM_ENTRY_QWORDS * sizeof(u64));
+static_assert(sizeof(struct arm_smmu_cd) == NUM_ENTRY_QWORDS * sizeof(u64));
 
 static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
 	[EVTQ_MSI_INDEX] = {
@@ -1045,7 +1045,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
 	writer->ops->get_used(entry, cur_used);
 	writer->ops->get_used(target, target_used);
 
-	for (i = 0; i != writer->ops->num_entry_qwords; i++) {
+	for (i = 0; i != NUM_ENTRY_QWORDS; i++) {
 		/*
 		 * Check that masks are up to date, the make functions are not
 		 * allowed to set a bit to 1 if the used function doesn't say it
@@ -1114,7 +1114,6 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
 void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
 			  const __le64 *target)
 {
-	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
 	__le64 unused_update[NUM_ENTRY_QWORDS];
 	u8 used_qword_diff;
 
@@ -1137,9 +1136,9 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
 		 */
 		unused_update[critical_qword_index] =
 			entry[critical_qword_index];
-		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
+		entry_set(writer, entry, unused_update, 0, NUM_ENTRY_QWORDS);
 		entry_set(writer, entry, target, critical_qword_index, 1);
-		entry_set(writer, entry, target, 0, num_entry_qwords);
+		entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS);
 	} else if (used_qword_diff) {
 		/*
 		 * At least two qwords need their inuse bits to be changed. This
@@ -1148,7 +1147,7 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
 		 */
 		unused_update[0] = entry[0] & (~writer->ops->v_bit);
 		entry_set(writer, entry, unused_update, 0, 1);
-		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
+		entry_set(writer, entry, target, 1, NUM_ENTRY_QWORDS - 1);
 		entry_set(writer, entry, target, 0, 1);
 	} else {
 		/*
@@ -1157,7 +1156,7 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
 		 * compute_qword_diff().
 		 */
 		WARN_ON_ONCE(
-			entry_set(writer, entry, target, 0, num_entry_qwords));
+			entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS));
 	}
 }
 
@@ -1272,7 +1271,6 @@ static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = {
 	.sync = arm_smmu_cd_writer_sync_entry,
 	.get_used = arm_smmu_get_cd_used,
 	.v_bit = cpu_to_le64(CTXDESC_CD_0_V),
-	.num_entry_qwords = sizeof(struct arm_smmu_cd) / sizeof(u64),
 };
 
 void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
@@ -1460,7 +1458,6 @@ static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
 	.sync = arm_smmu_ste_writer_sync_entry,
 	.get_used = arm_smmu_get_ste_used,
 	.v_bit = cpu_to_le64(STRTAB_STE_0_V),
-	.num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
 };
 
 static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 8ba07b00bf6056..5936dc5f76786a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -779,7 +779,6 @@ struct arm_smmu_entry_writer {
 };
 
 struct arm_smmu_entry_writer_ops {
-	unsigned int num_entry_qwords;
 	__le64 v_bit;
 	void (*get_used)(const __le64 *entry, __le64 *used);
 	void (*sync)(struct arm_smmu_entry_writer *writer);
Mostafa Saleh March 25, 2024, 9:01 p.m. UTC | #7
On Mon, Mar 25, 2024 at 11:11:32AM -0300, Jason Gunthorpe wrote:
> On Fri, Mar 22, 2024 at 06:14:24PM +0000, Mostafa Saleh wrote:
> > > @@ -1027,57 +1038,55 @@ static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
> > >   * unused_update is an intermediate value of entry that has unused bits set to
> > >   * their new values.
> > >   */
> > > -static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry,
> > > -				    const struct arm_smmu_ste *target,
> > > -				    struct arm_smmu_ste *unused_update)
> > > +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
> > > +				    const __le64 *entry, const __le64 *target,
> > > +				    __le64 *unused_update)
> > >  {
> > > -	struct arm_smmu_ste target_used = {};
> > > -	struct arm_smmu_ste cur_used = {};
> > > +	__le64 target_used[NUM_ENTRY_QWORDS] = {};
> > > +	__le64 cur_used[NUM_ENTRY_QWORDS] = {};
> > This is confusing to me: the function was modified to be generic, so its
> > args are __le64 * instead of struct arm_smmu_ste *.
> 
> Right
> 
> > But NUM_ENTRY_QWORDS is defined as “(sizeof(struct arm_smmu_ste) / sizeof(u64))”
> > and in the same function writer->ops->num_entry_qwords is used
> > interchangeably,
> 
> Right
> 
> > I understand that this is not a constant and the compiler would complain.
> > But since for any other num_entry_qwords larger than NUM_ENTRY_QWORDS it fails,
> > and we know STEs and CDs both have the same size, we can simplify the code and
> > make it a constant everywhere.
> 
> So you say to get rid of num_entry_qwords and just use the constant?

In my opinion, yes, that looks easier to understand, and avoids the MAX
stuff as there is no reason for the extra generalisation.

> > I see in the next patch that this is redefined to be the max between STE and
> > CD, but again, this is hardware and it never changes, so my opinion is to simplify
> > the code, as there is no need to generalize this part.
> 
> Yes, we need a constant.
> 
> It would look like this, it is a little bit simpler:
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index a54062faccde38..d015f41900d802 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -63,9 +63,9 @@ enum arm_smmu_msi_index {
>  	ARM_SMMU_MAX_MSIS,
>  };
>  
> -#define NUM_ENTRY_QWORDS                                                \
> -	(max(sizeof(struct arm_smmu_ste), sizeof(struct arm_smmu_cd)) / \
> -	 sizeof(u64))
> +#define NUM_ENTRY_QWORDS 8
> +static_assert(sizeof(struct arm_smmu_ste) == NUM_ENTRY_QWORDS * sizeof(u64));
> +static_assert(sizeof(struct arm_smmu_cd) == NUM_ENTRY_QWORDS * sizeof(u64));
>  
>  static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
>  	[EVTQ_MSI_INDEX] = {
> @@ -1045,7 +1045,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
>  	writer->ops->get_used(entry, cur_used);
>  	writer->ops->get_used(target, target_used);
>  
> -	for (i = 0; i != writer->ops->num_entry_qwords; i++) {
> +	for (i = 0; i != NUM_ENTRY_QWORDS; i++) {
>  		/*
>  		 * Check that masks are up to date, the make functions are not
>  		 * allowed to set a bit to 1 if the used function doesn't say it
> @@ -1114,7 +1114,6 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
>  void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
>  			  const __le64 *target)
>  {
> -	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
>  	__le64 unused_update[NUM_ENTRY_QWORDS];
>  	u8 used_qword_diff;
>  
> @@ -1137,9 +1136,9 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
>  		 */
>  		unused_update[critical_qword_index] =
>  			entry[critical_qword_index];
> -		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
> +		entry_set(writer, entry, unused_update, 0, NUM_ENTRY_QWORDS);
>  		entry_set(writer, entry, target, critical_qword_index, 1);
> -		entry_set(writer, entry, target, 0, num_entry_qwords);
> +		entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS);
>  	} else if (used_qword_diff) {
>  		/*
>  		 * At least two qwords need their inuse bits to be changed. This
> @@ -1148,7 +1147,7 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
>  		 */
>  		unused_update[0] = entry[0] & (~writer->ops->v_bit);
>  		entry_set(writer, entry, unused_update, 0, 1);
> -		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
> +		entry_set(writer, entry, target, 1, NUM_ENTRY_QWORDS - 1);
>  		entry_set(writer, entry, target, 0, 1);
>  	} else {
>  		/*
> @@ -1157,7 +1156,7 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
>  		 * compute_qword_diff().
>  		 */
>  		WARN_ON_ONCE(
> -			entry_set(writer, entry, target, 0, num_entry_qwords));
> +			entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS));
>  	}
>  }
>  
> @@ -1272,7 +1271,6 @@ static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = {
>  	.sync = arm_smmu_cd_writer_sync_entry,
>  	.get_used = arm_smmu_get_cd_used,
>  	.v_bit = cpu_to_le64(CTXDESC_CD_0_V),
> -	.num_entry_qwords = sizeof(struct arm_smmu_cd) / sizeof(u64),
>  };
>  
>  void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
> @@ -1460,7 +1458,6 @@ static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
>  	.sync = arm_smmu_ste_writer_sync_entry,
>  	.get_used = arm_smmu_get_ste_used,
>  	.v_bit = cpu_to_le64(STRTAB_STE_0_V),
> -	.num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
>  };
>  
>  static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 8ba07b00bf6056..5936dc5f76786a 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -779,7 +779,6 @@ struct arm_smmu_entry_writer {
>  };
>  
>  struct arm_smmu_entry_writer_ops {
> -	unsigned int num_entry_qwords;
>  	__le64 v_bit;
>  	void (*get_used)(const __le64 *entry, __le64 *used);
>  	void (*sync)(struct arm_smmu_entry_writer *writer);
> 

Thanks,
Mostafa
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index c60b067c1f553e..b7f947e36f596f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -48,8 +48,20 @@  enum arm_smmu_msi_index {
 	ARM_SMMU_MAX_MSIS,
 };
 
-static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu,
-				      ioasid_t sid);
+struct arm_smmu_entry_writer_ops;
+struct arm_smmu_entry_writer {
+	const struct arm_smmu_entry_writer_ops *ops;
+	struct arm_smmu_master *master;
+};
+
+struct arm_smmu_entry_writer_ops {
+	unsigned int num_entry_qwords;
+	__le64 v_bit;
+	void (*get_used)(const __le64 *entry, __le64 *used);
+	void (*sync)(struct arm_smmu_entry_writer *writer);
+};
+
+#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64))
 
 static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = {
 	[EVTQ_MSI_INDEX] = {
@@ -982,43 +994,42 @@  void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
  * would be nice if this was complete according to the spec, but minimally it
  * has to capture the bits this driver uses.
  */
-static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
-				  struct arm_smmu_ste *used_bits)
+static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
 {
-	unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0]));
+	unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]));
 
-	used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V);
-	if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V)))
+	used_bits[0] = cpu_to_le64(STRTAB_STE_0_V);
+	if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
 		return;
 
-	used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
+	used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
 
 	/* S1 translates */
 	if (cfg & BIT(0)) {
-		used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
-						  STRTAB_STE_0_S1CTXPTR_MASK |
-						  STRTAB_STE_0_S1CDMAX);
-		used_bits->data[1] |=
+		used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
+					    STRTAB_STE_0_S1CTXPTR_MASK |
+					    STRTAB_STE_0_S1CDMAX);
+		used_bits[1] |=
 			cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
 				    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
 				    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW |
 				    STRTAB_STE_1_EATS);
-		used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
+		used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
 	}
 
 	/* S2 translates */
 	if (cfg & BIT(1)) {
-		used_bits->data[1] |=
+		used_bits[1] |=
 			cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
-		used_bits->data[2] |=
+		used_bits[2] |=
 			cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
 				    STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
 				    STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
-		used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
+		used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
 	}
 
 	if (cfg == STRTAB_STE_0_CFG_BYPASS)
-		used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
+		used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
 }
 
 /*
@@ -1027,57 +1038,55 @@  static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent,
  * unused_update is an intermediate value of entry that has unused bits set to
  * their new values.
  */
-static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry,
-				    const struct arm_smmu_ste *target,
-				    struct arm_smmu_ste *unused_update)
+static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
+				    const __le64 *entry, const __le64 *target,
+				    __le64 *unused_update)
 {
-	struct arm_smmu_ste target_used = {};
-	struct arm_smmu_ste cur_used = {};
+	__le64 target_used[NUM_ENTRY_QWORDS] = {};
+	__le64 cur_used[NUM_ENTRY_QWORDS] = {};
 	u8 used_qword_diff = 0;
 	unsigned int i;
 
-	arm_smmu_get_ste_used(entry, &cur_used);
-	arm_smmu_get_ste_used(target, &target_used);
+	writer->ops->get_used(entry, cur_used);
+	writer->ops->get_used(target, target_used);
 
-	for (i = 0; i != ARRAY_SIZE(target_used.data); i++) {
+	for (i = 0; i != writer->ops->num_entry_qwords; i++) {
 		/*
 		 * Check that masks are up to date, the make functions are not
 		 * allowed to set a bit to 1 if the used function doesn't say it
 		 * is used.
 		 */
-		WARN_ON_ONCE(target->data[i] & ~target_used.data[i]);
+		WARN_ON_ONCE(target[i] & ~target_used[i]);
 
 		/* Bits can change because they are not currently being used */
-		unused_update->data[i] = (entry->data[i] & cur_used.data[i]) |
-					 (target->data[i] & ~cur_used.data[i]);
+		unused_update[i] = (entry[i] & cur_used[i]) |
+				   (target[i] & ~cur_used[i]);
 		/*
 		 * Each bit indicates that a used bit in a qword needs to be
 		 * changed after unused_update is applied.
 		 */
-		if ((unused_update->data[i] & target_used.data[i]) !=
-		    target->data[i])
+		if ((unused_update[i] & target_used[i]) != target[i])
 			used_qword_diff |= 1 << i;
 	}
 	return used_qword_diff;
 }
 
-static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
-		      struct arm_smmu_ste *entry,
-		      const struct arm_smmu_ste *target, unsigned int start,
+static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
+		      const __le64 *target, unsigned int start,
 		      unsigned int len)
 {
 	bool changed = false;
 	unsigned int i;
 
 	for (i = start; len != 0; len--, i++) {
-		if (entry->data[i] != target->data[i]) {
-			WRITE_ONCE(entry->data[i], target->data[i]);
+		if (entry[i] != target[i]) {
+			WRITE_ONCE(entry[i], target[i]);
 			changed = true;
 		}
 	}
 
 	if (changed)
-		arm_smmu_sync_ste_for_sid(smmu, sid);
+		writer->ops->sync(writer);
 	return changed;
 }
 
@@ -1107,17 +1116,15 @@  static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid,
  * V=0 process. This relies on the IGNORED behavior described in the
  * specification.
  */
-static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
-			       struct arm_smmu_ste *entry,
-			       const struct arm_smmu_ste *target)
+static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer,
+				 __le64 *entry, const __le64 *target)
 {
-	unsigned int num_entry_qwords = ARRAY_SIZE(target->data);
-	struct arm_smmu_device *smmu = master->smmu;
-	struct arm_smmu_ste unused_update;
+	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
+	__le64 unused_update[NUM_ENTRY_QWORDS];
 	u8 used_qword_diff;
 
 	used_qword_diff =
-		arm_smmu_entry_qword_diff(entry, target, &unused_update);
+		arm_smmu_entry_qword_diff(writer, entry, target, unused_update);
 	if (hweight8(used_qword_diff) == 1) {
 		/*
 		 * Only one qword needs its used bits to be changed. This is a
@@ -1133,22 +1140,21 @@  static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
 		 * writing it in the next step anyways. This can save a sync
 		 * when the only change is in that qword.
 		 */
-		unused_update.data[critical_qword_index] =
-			entry->data[critical_qword_index];
-		entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords);
-		entry_set(smmu, sid, entry, target, critical_qword_index, 1);
-		entry_set(smmu, sid, entry, target, 0, num_entry_qwords);
+		unused_update[critical_qword_index] =
+			entry[critical_qword_index];
+		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
+		entry_set(writer, entry, target, critical_qword_index, 1);
+		entry_set(writer, entry, target, 0, num_entry_qwords);
 	} else if (used_qword_diff) {
 		/*
 		 * At least two qwords need their inuse bits to be changed. This
 		 * requires a breaking update, zero the V bit, write all qwords
 		 * but 0, then set qword 0
 		 */
-		unused_update.data[0] = entry->data[0] &
-					cpu_to_le64(~STRTAB_STE_0_V);
-		entry_set(smmu, sid, entry, &unused_update, 0, 1);
-		entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1);
-		entry_set(smmu, sid, entry, target, 0, 1);
+		unused_update[0] = entry[0] & (~writer->ops->v_bit);
+		entry_set(writer, entry, unused_update, 0, 1);
+		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
+		entry_set(writer, entry, target, 0, 1);
 	} else {
 		/*
 		 * No inuse bit changed. Sanity check that all unused bits are 0
@@ -1156,18 +1162,7 @@  static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
 		 * compute_qword_diff().
 		 */
 		WARN_ON_ONCE(
-			entry_set(smmu, sid, entry, target, 0, num_entry_qwords));
-	}
-
-	/* It's likely that we'll want to use the new STE soon */
-	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
-		struct arm_smmu_cmdq_ent
-			prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
-					 .prefetch = {
-						 .sid = sid,
-					 } };
-
-		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+			entry_set(writer, entry, target, 0, num_entry_qwords));
 	}
 }
 
@@ -1440,17 +1435,58 @@  arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
 	WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
-static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
+struct arm_smmu_ste_writer {
+	struct arm_smmu_entry_writer writer;
+	u32 sid;
+};
+
+static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
 {
+	struct arm_smmu_ste_writer *ste_writer =
+		container_of(writer, struct arm_smmu_ste_writer, writer);
 	struct arm_smmu_cmdq_ent cmd = {
 		.opcode	= CMDQ_OP_CFGI_STE,
 		.cfgi	= {
-			.sid	= sid,
+			.sid	= ste_writer->sid,
 			.leaf	= true,
 		},
 	};
 
-	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+	arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd);
+}
+
+static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
+	.sync = arm_smmu_ste_writer_sync_entry,
+	.get_used = arm_smmu_get_ste_used,
+	.v_bit = cpu_to_le64(STRTAB_STE_0_V),
+	.num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
+};
+
+static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
+			       struct arm_smmu_ste *ste,
+			       const struct arm_smmu_ste *target)
+{
+	struct arm_smmu_device *smmu = master->smmu;
+	struct arm_smmu_ste_writer ste_writer = {
+		.writer = {
+			.ops = &arm_smmu_ste_writer_ops,
+			.master = master,
+		},
+		.sid = sid,
+	};
+
+	arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data);
+
+	/* It's likely that we'll want to use the new STE soon */
+	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
+		struct arm_smmu_cmdq_ent
+			prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
+					 .prefetch = {
+						 .sid = sid,
+					 } };
+
+		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+	}
 }
 
 static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)