diff mbox series

[v5,14/27] iommu/arm-smmu-v3: Make changing domains be hitless for ATS

Message ID 14-v5-9a37e0c884ce+31e3-smmuv3_newapi_p2_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Update SMMUv3 to the modern iommu API (part 2/3) | expand

Commit Message

Jason Gunthorpe March 4, 2024, 11:44 p.m. UTC
The core code allows the domain to be changed on the fly without a forced
stop in BLOCKED/IDENTITY. In this flow the driver should just continually
maintain the ATS with no change while the STE is updated.

ATS relies on a linked list smmu_domain->devices to keep track of which
masters have the domain programmed, but this list is also used by
arm_smmu_share_asid(), unrelated to ATS.

Create two new functions to encapsulate this combined logic:
 arm_smmu_attach_prepare()
 <caller generates and sets the STE>
 arm_smmu_attach_commit()

The two functions can sequence both enabling and disabling ATS across
the STE store. Have every update of the STE use this sequence.

Installing a S1/S2 domain always enables the ATS if the PCIe device
supports it.

The enable flow is now ordered differently to allow it to be hitless:

  1) Add the master to the new smmu_domain->devices list
  2) Program the STE
  3) Enable ATS at PCIe
  4) Remove the master from the old smmu_domain

This flow ensures that invalidations to either domain will generate an ATC
invalidation to the device while the STE is being switched. Thus we don't
need to turn off the ATS anymore for correctness.

The disable flow is the reverse:
 1) Disable ATS at PCIe
 2) Program the STE
 3) Invalidate the ATC
 4) Remove the master from the old smmu_domain

Move the nr_ats_masters adjustments to be close to the list
manipulations. It is a count of the number of ATS enabled masters
currently in the list. This is strictly before and after the STE/CD are
revised, and done under the list's spin_lock.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 198 ++++++++++++++------
 1 file changed, 140 insertions(+), 58 deletions(-)

Comments

Michael Shavit March 21, 2024, 12:26 p.m. UTC | #1
On Tue, Mar 5, 2024 at 7:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> The core code allows the domain to be changed on the fly without a forced
> stop in BLOCKED/IDENTITY. In this flow the driver should just continually
> maintain the ATS with no change while the STE is updated.
>
> ATS relies on a linked list smmu_domain->devices to keep track of which
> masters have the domain programmed, but this list is also used by
> arm_smmu_share_asid(), unrelated to ats.
>
> Create two new functions to encapsulate this combined logic:
>  arm_smmu_attach_prepare()
>  <caller generates and sets the STE>
>  arm_smmu_attach_commit()
>
> The two functions can sequence both enabling ATS and disabling across
> the STE store. Have every update of the STE use this sequence.
>
> Installing a S1/S2 domain always enables the ATS if the PCIe device
> supports it.
>
> The enable flow is now ordered differently to allow it to be hitless:
>
>   1) Add the master to the new smmu_domain->devices list
>   2) Program the STE
>   3) Enable ATS at PCIe
>   4) Remove the master from the old smmu_domain
>
> This flow ensures that invalidations to either domain will generate an ATC
> invalidation to the device while the STE is being switched. Thus we don't
> need to turn off the ATS anymore for correctness.
>
> The disable flow is the reverse:
>  1) Disable ATS at PCIe
>  2) Program the STE
>  3) Invalidate the ATC
>  4) Remove the master from the old smmu_domain
>
> Move the nr_ats_masters adjustments to be close to the list
> manipulations. It is a count of the number of ATS enabled masters
> currently in the list. This is stricly before and after the STE/CD are
> revised, and done under the list's spin_lock.
>
> Tested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 198 ++++++++++++++------
>  1 file changed, 140 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 51a1e7198fd1af..45f2190fc31786 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1527,7 +1527,8 @@ static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target)
>  }
>
>  static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
> -                                     struct arm_smmu_master *master)
> +                                     struct arm_smmu_master *master,
> +                                     bool ats_enabled)
>  {
>         struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
>         struct arm_smmu_device *smmu = master->smmu;
> @@ -1550,7 +1551,7 @@ static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
>                          STRTAB_STE_1_S1STALLD :
>                          0) |
>                 FIELD_PREP(STRTAB_STE_1_EATS,
> -                          master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
> +                          ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
>
>         if (smmu->features & ARM_SMMU_FEAT_E2H) {
>                 /*
> @@ -1578,7 +1579,8 @@ static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
>
>  static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
>                                         struct arm_smmu_master *master,
> -                                       struct arm_smmu_domain *smmu_domain)
> +                                       struct arm_smmu_domain *smmu_domain,
> +                                       bool ats_enabled)
>  {
>         struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
>         const struct io_pgtable_cfg *pgtbl_cfg =
> @@ -1594,7 +1596,7 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
>
>         target->data[1] = cpu_to_le64(
>                 FIELD_PREP(STRTAB_STE_1_EATS,
> -                          master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) |
> +                          ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) |
>                 FIELD_PREP(STRTAB_STE_1_SHCFG,
>                            STRTAB_STE_1_SHCFG_INCOMING));
>
> @@ -2470,22 +2472,16 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
>         return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev));
>  }
>
> -static void arm_smmu_enable_ats(struct arm_smmu_master *master,
> -                               struct arm_smmu_domain *smmu_domain)
> +static void arm_smmu_enable_ats(struct arm_smmu_master *master)
>  {
>         size_t stu;
>         struct pci_dev *pdev;
>         struct arm_smmu_device *smmu = master->smmu;
>
> -       /* Don't enable ATS at the endpoint if it's not enabled in the STE */
> -       if (!master->ats_enabled)
> -               return;
> -
>         /* Smallest Translation Unit: log2 of the smallest supported granule */
>         stu = __ffs(smmu->pgsize_bitmap);
>         pdev = to_pci_dev(master->dev);
>
> -       atomic_inc(&smmu_domain->nr_ats_masters);
>         /*
>          * ATC invalidation of PASID 0 causes the entire ATC to be flushed.
>          */
> @@ -2494,22 +2490,6 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master,
>                 dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
>  }
>
> -static void arm_smmu_disable_ats(struct arm_smmu_master *master,
> -                                struct arm_smmu_domain *smmu_domain)
> -{
> -       if (!master->ats_enabled)
> -               return;
> -
> -       pci_disable_ats(to_pci_dev(master->dev));
> -       /*
> -        * Ensure ATS is disabled at the endpoint before we issue the
> -        * ATC invalidation via the SMMU.
> -        */
> -       wmb();
> -       arm_smmu_atc_inv_master(master);
> -       atomic_dec(&smmu_domain->nr_ats_masters);
> -}
> -
>  static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
>  {
>         int ret;
> @@ -2573,39 +2553,145 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
>         return NULL;
>  }
>
> -static void arm_smmu_detach_dev(struct arm_smmu_master *master)
> +/*
> + * If the domain uses the smmu_domain->devices list return the arm_smmu_domain
> + * structure, otherwise NULL. These domains track attached devices so they can
> + * issue invalidations.
> + */
> +static struct arm_smmu_domain *
> +to_smmu_domain_devices(struct iommu_domain *domain)
>  {
> -       struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev);
> +       /* The domain can be NULL only when processing the first attach */
> +       if (!domain)
> +               return NULL;
> +       if (domain->type & __IOMMU_DOMAIN_PAGING)
> +               return to_smmu_domain(domain);
> +       return NULL;
> +}
> +
> +static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
> +                                         struct iommu_domain *domain)
> +{
> +       struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
>         struct arm_smmu_master_domain *master_domain;
> -       struct arm_smmu_domain *smmu_domain;
>         unsigned long flags;
>
> -       if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING))
> +       if (!smmu_domain)
>                 return;
>
> -       smmu_domain = to_smmu_domain(domain);
> -       arm_smmu_disable_ats(master, smmu_domain);
> -
>         spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>         master_domain = arm_smmu_find_master_domain(smmu_domain, master);
>         if (master_domain) {
>                 list_del(&master_domain->devices_elm);
>                 kfree(master_domain);
> +               if (master->ats_enabled)
> +                       atomic_dec(&smmu_domain->nr_ats_masters);
>         }
>         spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> +}
>
> -       master->ats_enabled = false;
> +struct attach_state {
> +       bool want_ats;
> +       bool disable_ats;
> +};
> +
> +/*
> + * Prepare to attach a domain to a master. If disable_ats is not set this will
> + * turn on ATS if supported. smmu_domain can be NULL if the domain being
> + * attached does not have a page table and does not require invalidation
> + * tracking.
> + */
> +static int arm_smmu_attach_prepare(struct arm_smmu_master *master,
> +                                  struct iommu_domain *domain,
> +                                  struct attach_state *state)
> +{
> +       struct arm_smmu_domain *smmu_domain =
> +               to_smmu_domain_devices(domain);
> +       struct arm_smmu_master_domain *master_domain;
> +       unsigned long flags;
> +
> +       /*
> +        * arm_smmu_share_asid() must not see two domains pointing to the same
> +        * arm_smmu_master_domain contents otherwise it could randomly write one
> +        * or the other to the CD.
> +        */
> +       lockdep_assert_held(&arm_smmu_asid_lock);
> +
> +       state->want_ats = !state->disable_ats && arm_smmu_ats_supported(master);
> +
> +       if (smmu_domain) {
> +               master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
> +               if (!master_domain)
> +                       return -ENOMEM;
> +               master_domain->master = master;
> +
> +               /*
> +                * During prepare we want the current smmu_domain and new
> +                * smmu_domain to be in the devices list before we change any
> +                * HW. This ensures that both domains will send ATS
> +                * invalidations to the master until we are done.
> +                *
> +                * It is tempting to make this list only track masters that are
> +                * using ATS, but arm_smmu_share_asid() also uses this to change
> +                * the ASID of a domain, unrelated to ATS.
> +                *
> +                * Notice if we are re-attaching the same domain then the list
> +                * will have two identical entries and commit will remove only
> +                * one of them.
> +                */
> +               spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> +               if (state->want_ats)
> +                       atomic_inc(&smmu_domain->nr_ats_masters);
> +               list_add(&master_domain->devices_elm, &smmu_domain->devices);
> +               spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> +       }
> +
> +       if (!state->want_ats && master->ats_enabled) {
> +               pci_disable_ats(to_pci_dev(master->dev));
> +               /*
> +                * This is probably overkill, but the config write for disabling
> +                * ATS should complete before the STE is configured to generate
> +                * UR to avoid AER noise.
> +                */
> +               wmb();
> +       }
> +       return 0;
> +}
> +
> +/*
> + * Commit is done after the STE/CD are configured with the EATS setting. It
> + * completes synchronizing the PCI device's ATC and finishes manipulating the
> + * smmu_domain->devices list.
> + */
> +static void arm_smmu_attach_commit(struct arm_smmu_master *master,
> +                                  struct attach_state *state)
> +{
> +       lockdep_assert_held(&arm_smmu_asid_lock);
> +
> +       if (state->want_ats && !master->ats_enabled) {
> +               arm_smmu_enable_ats(master);
> +       } else if (master->ats_enabled) {
> +               /*
> +                * The translation has changed, flush the ATC. At this point the
> +                * SMMU is translating for the new domain and both the old&new
> +                * domain will issue invalidations.
> +                */
> +               arm_smmu_atc_inv_master(master);
> +       }
> +       master->ats_enabled = state->want_ats;
> +
> +       arm_smmu_remove_master_domain(master,
> +                                     iommu_get_domain_for_dev(master->dev));
>  }
>
>  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>  {
>         int ret = 0;
> -       unsigned long flags;
>         struct arm_smmu_ste target;
>         struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>         struct arm_smmu_device *smmu;
>         struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> -       struct arm_smmu_master_domain *master_domain;
> +       struct attach_state state = {};
>         struct arm_smmu_master *master;
>         struct arm_smmu_cd *cdptr;
>
> @@ -2642,11 +2728,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>                         return -ENOMEM;
>         }
>
> -       master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
> -       if (!master_domain)
> -               return -ENOMEM;
> -       master_domain->master = master;
> -
>         /*
>          * Prevent arm_smmu_share_asid() from trying to change the ASID
>          * of either the old or new domain while we are working on it.
> @@ -2655,13 +2736,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>          */
>         mutex_lock(&arm_smmu_asid_lock);
>
> -       arm_smmu_detach_dev(master);
> -
> -       master->ats_enabled = arm_smmu_ats_supported(master);
> -
> -       spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> -       list_add(&master_domain->devices_elm, &smmu_domain->devices);
> -       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
> +       ret = arm_smmu_attach_prepare(master, domain, &state);
> +       if (ret) {
> +               mutex_unlock(&arm_smmu_asid_lock);
> +               return ret;
> +       }
>
>         switch (smmu_domain->stage) {
>         case ARM_SMMU_DOMAIN_S1: {
> @@ -2670,18 +2749,19 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
>                 arm_smmu_make_s1_cd(&target_cd, master, smmu_domain);
>                 arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr,
>                                         &target_cd);
> -               arm_smmu_make_cdtable_ste(&target, master);
> +               arm_smmu_make_cdtable_ste(&target, master, state.want_ats);
>                 arm_smmu_install_ste_for_dev(master, &target);
>                 break;
>         }
>         case ARM_SMMU_DOMAIN_S2:
> -               arm_smmu_make_s2_domain_ste(&target, master, smmu_domain);
> +               arm_smmu_make_s2_domain_ste(&target, master, smmu_domain,
> +                                           state.want_ats);
>                 arm_smmu_install_ste_for_dev(master, &target);
>                 arm_smmu_clear_cd(master, IOMMU_NO_PASID);
>                 break;
>         }
>
> -       arm_smmu_enable_ats(master, smmu_domain);
> +       arm_smmu_attach_commit(master, &state);
>         mutex_unlock(&arm_smmu_asid_lock);
>         return 0;
>  }
> @@ -2715,10 +2795,11 @@ void arm_smmu_remove_pasid(struct arm_smmu_master *master,
>         arm_smmu_clear_cd(master, pasid);
>  }
>
> -static int arm_smmu_attach_dev_ste(struct device *dev,
> -                                  struct arm_smmu_ste *ste)
> +static int arm_smmu_attach_dev_ste(struct iommu_domain *domain,
> +                                  struct device *dev, struct arm_smmu_ste *ste)
>  {
>         struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +       struct attach_state state = {};
>
>         if (arm_smmu_master_sva_enabled(master))
>                 return -EBUSY;
> @@ -2736,9 +2817,10 @@ static int arm_smmu_attach_dev_ste(struct device *dev,
>          * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and
>          * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry).
>          */
> -       arm_smmu_detach_dev(master);
> -
> +       state.disable_ats = true;
> +       arm_smmu_attach_prepare(master, domain, &state);
>         arm_smmu_install_ste_for_dev(master, ste);
> +       arm_smmu_attach_commit(master, &state);
>         mutex_unlock(&arm_smmu_asid_lock);
>
>         /*
> @@ -2756,7 +2838,7 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
>         struct arm_smmu_ste ste;
>
>         arm_smmu_make_bypass_ste(&ste);
> -       return arm_smmu_attach_dev_ste(dev, &ste);
> +       return arm_smmu_attach_dev_ste(domain, dev, &ste);
>  }
>
>  static const struct iommu_domain_ops arm_smmu_identity_ops = {
> @@ -2774,7 +2856,7 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
>         struct arm_smmu_ste ste;
>
>         arm_smmu_make_abort_ste(&ste);
> -       return arm_smmu_attach_dev_ste(dev, &ste);
> +       return arm_smmu_attach_dev_ste(domain, dev, &ste);
>  }
>
>  static const struct iommu_domain_ops arm_smmu_blocked_ops = {
> --
> 2.43.2
>

Overall I think the patch works, but it took me a while to really
digest the big picture. Just to make sure I fully understand:

We're trying to satisfy the following invariants for correctness:
1. Devices cannot get translations from a domain that was detached
after arm_smmu_attach_dev() returns.
2. Devices cannot get cached translations from a domain after
arm_smmu_atc_inv_domain() returns, regardless of whether the domain is
simultaneously being attached/detached from a device.

Apart from point 2 above, the behaviour of translations while in
the middle of an arm_smmu_attach_dev doesn't have well defined
requirements:
1. Before this patch, devices may get translations belonging to the
old domain, then aborts or identity translations, and then the new
domain while arm_smmu_attach_dev() is in progress.
2. After this patch, devices may get an arbitrary mix of translations
belonging to the old domain and the new domain while
arm_smmu_attach_dev() is in progress.

While disabling and re-enabling ATS inside arm_smmu_attach_dev() would
meet the same requirements as this patch, it's not optimal since the
device may still have traffic on other pasids than the one being
manipulated.
Jason Gunthorpe March 21, 2024, 1:28 p.m. UTC | #2
On Thu, Mar 21, 2024 at 08:26:43PM +0800, Michael Shavit wrote:
> Overall I think the patch works, but it took me a while to really
> digest the big picture. Just to make sure I fully understand:
> 
> We're trying to satisfy the following invariants for correctness:
> 1. Devices cannot get translations from a domain that was detached
> after arm_smmu_attach_dev() returns.

Yes, regardless of ATS this is the API requirement.

> 2. Devices cannot get cached translations from a domain after
> arm_smmu_atc_inv_domain() returns, regardless of whether the domain is
> simultaneously being attached/detached from a device.

There is no simultaneously here since the group lock is held by the
core code.
 
> Apart from note point 2. above, the behaviour of translations while in
> the middle of an arm_smmu_attach_dev doesn't have well defined
> requirements:
> 1. Before this patch, devices may get translations belonging to the
> old domain, then aborts or identity translations, and then the new
> domain while arm_smmu_attach_dev() is in progress.

Yes

> 2. Ater this patch, devices may get an arbitrary mix of translations
> belonging to the old domain and the new domain while
> arm_smmu_attach_dev() is in progress.

Yes, but no aborts.

> While disabling and re-enabling ATS inside arm_smmu_attach_dev() would
> meet the same requirements as this patch, it's not optimal since the
> device may still have traffic on other pasids than the one being
> manipulated.

Yes, we can't just disable ATS due to the PASIDs. A SVA PASID might be
present and disabling ATS to change the RID domain would completely
wreck it. Today this scenario is prevented by sva_enable, which is
removed in following patches.

I added this note for context to the commit message:

 This is part of the bigger picture to allow changing the RID domain while
 a PASID is in use. If a SVA PASID is relying on ATS to function then
 changing the RID domain cannot just temporarily toggle ATS off without
 also wrecking the SVA PASID. The new infrastructure here is organized so
 that the PASID attach/detach flows will make use of it as well in
 following patches.

Thanks,
Jason
Michael Shavit March 21, 2024, 2:53 p.m. UTC | #3
On Thu, Mar 21, 2024 at 9:28 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Mar 21, 2024 at 08:26:43PM +0800, Michael Shavit wrote:
> > Overall I think the patch works, but it took me a while to really
> > digest the big picture. Just to make sure I fully understand:
> >
> > We're trying to satisfy the following invariants for correctness:
> > 1. Devices cannot get translations from a domain that was detached
> > after arm_smmu_attach_dev() returns.
>
> Yes, regardless of ATS this is the API requirement.
>
> > 2. Devices cannot get cached translations from a domain after
> > arm_smmu_atc_inv_domain() returns, regardless of whether the domain is
> > simultaneously being attached/detached from a device.
>
> There is no simultaneously here since the group lock is held by the
> core code.

I meant a call to arm_smmu_attach_dev concurrent with
arm_smmu_atc_inv_domain (through tlb_flush_all).

>
> > Apart from note point 2. above, the behaviour of translations while in
> > the middle of an arm_smmu_attach_dev doesn't have well defined
> > requirements:
> > 1. Before this patch, devices may get translations belonging to the
> > old domain, then aborts or identity translations, and then the new
> > domain while arm_smmu_attach_dev() is in progress.
>
> Yes
>
> > 2. Ater this patch, devices may get an arbitrary mix of translations
> > belonging to the old domain and the new domain while
> > arm_smmu_attach_dev() is in progress.
>
> Yes, but no aborts.
>
> > While disabling and re-enabling ATS inside arm_smmu_attach_dev() would
> > meet the same requirements as this patch, it's not optimal since the
> > device may still have traffic on other pasids than the one being
> > manipulated.
>
> Yes, we can't just disable ATS due to the PASIDs. A SVA PASID might be
> present and disabling ATS to change the RID domain would completely
> wreck it. Today this scenario is prevented by sva_enable, which is
> removed in following patches.

What do you mean by wrecking? We might slow it down to a crawl but we
wouldn't be corrupting or destroying anything by disabling ATS while
SVA is running would we?
Oh does disabling ATS abort all transactions rather than making it transparent?

>
> I added this note for context to the commit message:
>
>  This is part of the bigger picture to allow changing the RID domain while
>  a PASID is in use. If a SVA PASID is relying on ATS to function then
>  changing the RID domain cannot just temporarily toggle ATS off without
>  also wrecking the SVA PASID. The new infrastructure here is organized so
>  that the PASID attach/detach flows will make use of it as well in
>  following patches.
>
> Thanks,
> Jason
Michael Shavit March 21, 2024, 2:57 p.m. UTC | #4
On Thu, Mar 21, 2024 at 10:53 PM Michael Shavit <mshavit@google.com> wrote:
>
> On Thu, Mar 21, 2024 at 9:28 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Mar 21, 2024 at 08:26:43PM +0800, Michael Shavit wrote:
> > > Overall I think the patch works, but it took me a while to really
> > > digest the big picture. Just to make sure I fully understand:
> > >
> > > We're trying to satisfy the following invariants for correctness:
> > > 1. Devices cannot get translations from a domain that was detached
> > > after arm_smmu_attach_dev() returns.
> >
> > Yes, regardless of ATS this is the API requirement.
> >
> > > 2. Devices cannot get cached translations from a domain after
> > > arm_smmu_atc_inv_domain() returns, regardless of whether the domain is
> > > simultaneously being attached/detached from a device.
> >
> > There is no simultaneously here since the group lock is held by the
> > core code.
>
> I meant a call to arm_smmu_attach_dev concurrent with
> arm_smmu_atc_inv_domain (through tlb_flush_all).
>
> >
> > > Apart from note point 2. above, the behaviour of translations while in
> > > the middle of an arm_smmu_attach_dev doesn't have well defined
> > > requirements:
> > > 1. Before this patch, devices may get translations belonging to the
> > > old domain, then aborts or identity translations, and then the new
> > > domain while arm_smmu_attach_dev() is in progress.
> >
> > Yes
> >
> > > 2. Ater this patch, devices may get an arbitrary mix of translations
> > > belonging to the old domain and the new domain while
> > > arm_smmu_attach_dev() is in progress.
> >
> > Yes, but no aborts.
> >
> > > While disabling and re-enabling ATS inside arm_smmu_attach_dev() would
> > > meet the same requirements as this patch, it's not optimal since the
> > > device may still have traffic on other pasids than the one being
> > > manipulated.
> >
> > Yes, we can't just disable ATS due to the PASIDs. A SVA PASID might be
> > present and disabling ATS to change the RID domain would completely
> > wreck it. Today this scenario is prevented by sva_enable, which is
> > removed in following patches.
>
> What do you mean by wrecking? We might slow it down to a crawl but we
> wouldn't be corrupting or destroying anything by disabling ATS while
> SVA is running would we?
> Oh does disabling ATS abort all transactions rather than making it transparent?
>
> >
> > I added this note for context to the commit message:
> >
> >  This is part of the bigger picture to allow changing the RID domain while
> >  a PASID is in use. If a SVA PASID is relying on ATS to function then
> >  changing the RID domain cannot just temporarily toggle ATS off without
> >  also wrecking the SVA PASID. The new infrastructure here is organized so
> >  that the PASID attach/detach flows will make use of it as well in
> >  following patches.

Huh ok yeah, I thought this was just enabling/disabling caching of
translations in the ATC but looks like it's disabling translation for
those PCIe devices.
Jason Gunthorpe March 21, 2024, 5:32 p.m. UTC | #5
On Thu, Mar 21, 2024 at 10:53:20PM +0800, Michael Shavit wrote:
> On Thu, Mar 21, 2024 at 9:28 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Mar 21, 2024 at 08:26:43PM +0800, Michael Shavit wrote:
> > > Overall I think the patch works, but it took me a while to really
> > > digest the big picture. Just to make sure I fully understand:
> > >
> > > We're trying to satisfy the following invariants for correctness:
> > > 1. Devices cannot get translations from a domain that was detached
> > > after arm_smmu_attach_dev() returns.
> >
> > Yes, regardless of ATS this is the API requirement.
> >
> > > 2. Devices cannot get cached translations from a domain after
> > > arm_smmu_atc_inv_domain() returns, regardless of whether the domain is
> > > simultaneously being attached/detached from a device.
> >
> > There is no simultaneously here since the group lock is held by the
> > core code.
> 
> I meant a call to arm_smmu_attach_dev concurrent with
> arm_smmu_atc_inv_domain (through tlb_flush_all).

Oh, yes that is the concurrency that drives a lot of this organization.

> > Yes, we can't just disable ATS due to the PASIDs. A SVA PASID might be
> > present and disabling ATS to change the RID domain would completely
> > wreck it. Today this scenario is prevented by sva_enable, which is
> > removed in following patches.
> 
> What do you mean by wrecking? We might slow it down to a crawl but we
> wouldn't be corrupting or destroying anything by disabling ATS while
> SVA is running would we?
> Oh does disabling ATS abort all transactions rather than making it transparent?

As you deduced, without ATS a non-present page will return an abort
back to the device instead of a non-present ATS reply which should
then trigger PRI.

IOW the SVA domain can no longer handle page faults and is effectively
non-functioning if ATS is disabled.
 
Jason
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 51a1e7198fd1af..45f2190fc31786 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1527,7 +1527,8 @@  static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target)
 }
 
 static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
-				      struct arm_smmu_master *master)
+				      struct arm_smmu_master *master,
+				      bool ats_enabled)
 {
 	struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
 	struct arm_smmu_device *smmu = master->smmu;
@@ -1550,7 +1551,7 @@  static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
 			 STRTAB_STE_1_S1STALLD :
 			 0) |
 		FIELD_PREP(STRTAB_STE_1_EATS,
-			   master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+			   ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
 
 	if (smmu->features & ARM_SMMU_FEAT_E2H) {
 		/*
@@ -1578,7 +1579,8 @@  static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
 
 static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
 					struct arm_smmu_master *master,
-					struct arm_smmu_domain *smmu_domain)
+					struct arm_smmu_domain *smmu_domain,
+					bool ats_enabled)
 {
 	struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
 	const struct io_pgtable_cfg *pgtbl_cfg =
@@ -1594,7 +1596,7 @@  static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
 
 	target->data[1] = cpu_to_le64(
 		FIELD_PREP(STRTAB_STE_1_EATS,
-			   master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) |
+			   ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) |
 		FIELD_PREP(STRTAB_STE_1_SHCFG,
 			   STRTAB_STE_1_SHCFG_INCOMING));
 
@@ -2470,22 +2472,16 @@  static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
 	return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev));
 }
 
-static void arm_smmu_enable_ats(struct arm_smmu_master *master,
-				struct arm_smmu_domain *smmu_domain)
+static void arm_smmu_enable_ats(struct arm_smmu_master *master)
 {
 	size_t stu;
 	struct pci_dev *pdev;
 	struct arm_smmu_device *smmu = master->smmu;
 
-	/* Don't enable ATS at the endpoint if it's not enabled in the STE */
-	if (!master->ats_enabled)
-		return;
-
 	/* Smallest Translation Unit: log2 of the smallest supported granule */
 	stu = __ffs(smmu->pgsize_bitmap);
 	pdev = to_pci_dev(master->dev);
 
-	atomic_inc(&smmu_domain->nr_ats_masters);
 	/*
 	 * ATC invalidation of PASID 0 causes the entire ATC to be flushed.
 	 */
@@ -2494,22 +2490,6 @@  static void arm_smmu_enable_ats(struct arm_smmu_master *master,
 		dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
 }
 
-static void arm_smmu_disable_ats(struct arm_smmu_master *master,
-				 struct arm_smmu_domain *smmu_domain)
-{
-	if (!master->ats_enabled)
-		return;
-
-	pci_disable_ats(to_pci_dev(master->dev));
-	/*
-	 * Ensure ATS is disabled at the endpoint before we issue the
-	 * ATC invalidation via the SMMU.
-	 */
-	wmb();
-	arm_smmu_atc_inv_master(master);
-	atomic_dec(&smmu_domain->nr_ats_masters);
-}
-
 static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
 {
 	int ret;
@@ -2573,39 +2553,145 @@  arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
 	return NULL;
 }
 
-static void arm_smmu_detach_dev(struct arm_smmu_master *master)
+/*
+ * If the domain uses the smmu_domain->devices list return the arm_smmu_domain
+ * structure, otherwise NULL. These domains track attached devices so they can
+ * issue invalidations.
+ */
+static struct arm_smmu_domain *
+to_smmu_domain_devices(struct iommu_domain *domain)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev);
+	/* The domain can be NULL only when processing the first attach */
+	if (!domain)
+		return NULL;
+	if (domain->type & __IOMMU_DOMAIN_PAGING)
+		return to_smmu_domain(domain);
+	return NULL;
+}
+
+static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
+					  struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
 	struct arm_smmu_master_domain *master_domain;
-	struct arm_smmu_domain *smmu_domain;
 	unsigned long flags;
 
-	if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING))
+	if (!smmu_domain)
 		return;
 
-	smmu_domain = to_smmu_domain(domain);
-	arm_smmu_disable_ats(master, smmu_domain);
-
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	master_domain = arm_smmu_find_master_domain(smmu_domain, master);
 	if (master_domain) {
 		list_del(&master_domain->devices_elm);
 		kfree(master_domain);
+		if (master->ats_enabled)
+			atomic_dec(&smmu_domain->nr_ats_masters);
 	}
 	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+}
 
-	master->ats_enabled = false;
+struct attach_state {
+	bool want_ats;
+	bool disable_ats;
+};
+
+/*
+ * Prepare to attach a domain to a master. Computes state->want_ats, adds the
+ * master to the new domain's devices list when the domain tracks devices, and
+ * turns ATS off at the PCI device if it is enabled but no longer wanted.
+ * Returns 0, or -ENOMEM if the devices list entry cannot be allocated.
+ */
+static int arm_smmu_attach_prepare(struct arm_smmu_master *master,
+				   struct iommu_domain *domain,
+				   struct attach_state *state)
+{
+	struct arm_smmu_domain *smmu_domain =
+		to_smmu_domain_devices(domain);
+	struct arm_smmu_master_domain *master_domain;
+	unsigned long flags;
+
+	/*
+	 * arm_smmu_share_asid() must not see two domains pointing to the same
+	 * arm_smmu_master_domain contents otherwise it could randomly write one
+	 * or the other to the CD.
+	 */
+	lockdep_assert_held(&arm_smmu_asid_lock);
+
+	state->want_ats = !state->disable_ats && arm_smmu_ats_supported(master);
+
+	if (smmu_domain) {
+		master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
+		if (!master_domain)
+			return -ENOMEM;
+		master_domain->master = master;
+
+		/*
+		 * During prepare we want the current smmu_domain and new
+		 * smmu_domain to be in the devices list before we change any
+		 * HW. This ensures that both domains will send ATS
+		 * invalidations to the master until we are done.
+		 *
+		 * It is tempting to make this list only track masters that are
+		 * using ATS, but arm_smmu_share_asid() also uses this to change
+		 * the ASID of a domain, unrelated to ATS.
+		 *
+		 * Notice if we are re-attaching the same domain then the list
+		 * will have two identical entries and commit will remove only
+		 * one of them.
+		 */
+		spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+		if (state->want_ats)
+			atomic_inc(&smmu_domain->nr_ats_masters);
+		list_add(&master_domain->devices_elm, &smmu_domain->devices);
+		spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+	}
+
+	if (!state->want_ats && master->ats_enabled) {
+		pci_disable_ats(to_pci_dev(master->dev));
+		/*
+		 * This is probably overkill, but the config write for disabling
+		 * ATS should complete before the STE is configured to generate
+		 * UR to avoid AER noise.
+		 */
+		wmb();
+	}
+	return 0;
+}
+
+/*
+ * Commit is done after the STE/CD are configured with the EATS setting. It
+ * completes synchronizing the PCI device's ATC and finishes manipulating the
+ * smmu_domain->devices list.
+ */
+static void arm_smmu_attach_commit(struct arm_smmu_master *master,
+				   struct attach_state *state)
+{
+	lockdep_assert_held(&arm_smmu_asid_lock);
+
+	if (state->want_ats && !master->ats_enabled) {
+		arm_smmu_enable_ats(master);
+	} else if (master->ats_enabled) {
+		/*
+		 * The translation has changed, flush the ATC. At this point the
+		 * SMMU is translating for the new domain and both the old and
+		 * new domains will issue invalidations.
+		 */
+		arm_smmu_atc_inv_master(master);
+	}
+	/* Drop the old entry while ats_enabled still matches its ATS count */
+	arm_smmu_remove_master_domain(master,
+				      iommu_get_domain_for_dev(master->dev));
+	master->ats_enabled = state->want_ats;
 }
 
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
 	int ret = 0;
-	unsigned long flags;
 	struct arm_smmu_ste target;
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct arm_smmu_device *smmu;
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	struct arm_smmu_master_domain *master_domain;
+	struct attach_state state = {};
 	struct arm_smmu_master *master;
 	struct arm_smmu_cd *cdptr;
 
@@ -2642,11 +2728,6 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 			return -ENOMEM;
 	}
 
-	master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
-	if (!master_domain)
-		return -ENOMEM;
-	master_domain->master = master;
-
 	/*
 	 * Prevent arm_smmu_share_asid() from trying to change the ASID
 	 * of either the old or new domain while we are working on it.
@@ -2655,13 +2736,11 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	 */
 	mutex_lock(&arm_smmu_asid_lock);
 
-	arm_smmu_detach_dev(master);
-
-	master->ats_enabled = arm_smmu_ats_supported(master);
-
-	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
-	list_add(&master_domain->devices_elm, &smmu_domain->devices);
-	spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+	ret = arm_smmu_attach_prepare(master, domain, &state);
+	if (ret) {
+		mutex_unlock(&arm_smmu_asid_lock);
+		return ret;
+	}
 
 	switch (smmu_domain->stage) {
 	case ARM_SMMU_DOMAIN_S1: {
@@ -2670,18 +2749,19 @@  static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 		arm_smmu_make_s1_cd(&target_cd, master, smmu_domain);
 		arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr,
 					&target_cd);
-		arm_smmu_make_cdtable_ste(&target, master);
+		arm_smmu_make_cdtable_ste(&target, master, state.want_ats);
 		arm_smmu_install_ste_for_dev(master, &target);
 		break;
 	}
 	case ARM_SMMU_DOMAIN_S2:
-		arm_smmu_make_s2_domain_ste(&target, master, smmu_domain);
+		arm_smmu_make_s2_domain_ste(&target, master, smmu_domain,
+					    state.want_ats);
 		arm_smmu_install_ste_for_dev(master, &target);
 		arm_smmu_clear_cd(master, IOMMU_NO_PASID);
 		break;
 	}
 
-	arm_smmu_enable_ats(master, smmu_domain);
+	arm_smmu_attach_commit(master, &state);
 	mutex_unlock(&arm_smmu_asid_lock);
 	return 0;
 }
@@ -2715,10 +2795,11 @@  void arm_smmu_remove_pasid(struct arm_smmu_master *master,
 	arm_smmu_clear_cd(master, pasid);
 }
 
-static int arm_smmu_attach_dev_ste(struct device *dev,
-				   struct arm_smmu_ste *ste)
+static int arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+				   struct device *dev, struct arm_smmu_ste *ste)
 {
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct attach_state state = {};
 
 	if (arm_smmu_master_sva_enabled(master))
 		return -EBUSY;
@@ -2736,9 +2817,10 @@  static int arm_smmu_attach_dev_ste(struct device *dev,
 	 * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and
 	 * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry).
 	 */
-	arm_smmu_detach_dev(master);
-
+	state.disable_ats = true;
+	arm_smmu_attach_prepare(master, domain, &state);
 	arm_smmu_install_ste_for_dev(master, ste);
+	arm_smmu_attach_commit(master, &state);
 	mutex_unlock(&arm_smmu_asid_lock);
 
 	/*
@@ -2756,7 +2838,7 @@  static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
 	struct arm_smmu_ste ste;
 
 	arm_smmu_make_bypass_ste(&ste);
-	return arm_smmu_attach_dev_ste(dev, &ste);
+	return arm_smmu_attach_dev_ste(domain, dev, &ste);
 }
 
 static const struct iommu_domain_ops arm_smmu_identity_ops = {
@@ -2774,7 +2856,7 @@  static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
 	struct arm_smmu_ste ste;
 
 	arm_smmu_make_abort_ste(&ste);
-	return arm_smmu_attach_dev_ste(dev, &ste);
+	return arm_smmu_attach_dev_ste(domain, dev, &ste);
 }
 
 static const struct iommu_domain_ops arm_smmu_blocked_ops = {