diff mbox series

[v2,08/10] iommufd/device: Use iommu_group_replace_domain()

Message ID 4653f009c3dacae8ebf3a4865aaa944aa9c7cc7e.1675802050.git.nicolinc@nvidia.com (mailing list archive)
State New
Headers show
Series Add IO page table replacement support | expand

Commit Message

Nicolin Chen Feb. 7, 2023, 9:18 p.m. UTC
iommu_group_replace_domain() is introduced to support use cases where an
iommu_group can be attached to a new domain without getting detached from
the old one. This replacement feature will be useful, for cases such as:
1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
   table with a larger table (PASID=N)
2) Nesting mode, when switching the attaching device from an S2 domain
   to an S1 domain, or when switching between relevant S1 domains.
as it allows these cases to switch seamlessly without a DMA disruption.

So, call iommu_group_replace_domain() in the iommufd_device_do_attach().
And add a __iommmufd_device_detach helper to allow the replace routine to
do a partial detach on the current hwpt that's being replaced. Though the
updated locking logic is overcomplicated, it will be eased, once those
iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
allocation/destroy() functions in the coming nesting series, as that'll
depend on a new ->domain_alloc_user op in the iommu core.

Also, block replace operations that are from/to auto_domains, i.e. only
user-allocated hw_pagetables can be replaced or replaced with.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/device.c          | 101 +++++++++++++++++-------
 drivers/iommu/iommufd/iommufd_private.h |   2 +
 2 files changed, 76 insertions(+), 27 deletions(-)

Comments

Yi Liu Feb. 8, 2023, 8:08 a.m. UTC | #1
Hi Nic,

> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, February 8, 2023 5:18 AM
> 
> iommu_group_replace_domain() is introduced to support use cases where
> an
> iommu_group can be attached to a new domain without getting detached
> from
> the old one. This replacement feature will be useful, for cases such as:
> 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
>    table with a larger table (PASID=N)
> 2) Nesting mode, when switching the attaching device from an S2 domain
>    to an S1 domain, or when switching between relevant S1 domains.
> as it allows these cases to switch seamlessly without a DMA disruption.
> 
> So, call iommu_group_replace_domain() in the
> iommufd_device_do_attach().
> And add a __iommmufd_device_detach helper to allow the replace routine
> to
> do a partial detach on the current hwpt that's being replaced. Though the
> updated locking logic is overcomplicated, it will be eased, once those
> iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> allocation/destroy() functions in the coming nesting series, as that'll
> depend on a new ->domain_alloc_user op in the iommu core.
> 
> Also, block replace operations that are from/to auto_domains, i.e. only
> user-allocated hw_pagetables can be replaced or replaced with.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/device.c          | 101 +++++++++++++++++-------
>  drivers/iommu/iommufd/iommufd_private.h |   2 +
>  2 files changed, 76 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/device.c
> b/drivers/iommu/iommufd/device.c
> index b8c3e3baccb5..8a9834fc129a 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -9,6 +9,8 @@
>  #include "io_pagetable.h"
>  #include "iommufd_private.h"
> 
> +MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
> +
>  static bool allow_unsafe_interrupts;
>  module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(
> @@ -194,9 +196,61 @@ static bool
> iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
>  	return false;
>  }
> 
> +/**
> + * __iommmufd_device_detach - Detach a device from idev->hwpt to
> new_hwpt

This function doesn't do anything to make this device attached to new_hwpt.
That is done in iommufd_device_attach_ioas(). new_hwpt here only indicates
whether this detach needs to do some extra work, e.g. removing the reserved
iova from idev->hwpt->ioas. So it may be better to just say "Detach a device
from idev->hwpt", and explain the usage of new_hwpt below.

> + * @idev: device to detach
> + * @new_hwpt: new hw_pagetable to attach (pass in NULL for a simple
> detach)

The new hw_pagetable to be attached.

> + * @detach_group: flag to call iommu_detach_group
> + *
> + * This is a cleanup helper shared by the replace and detach routines.
> Comparing
> + * to a detach routine, a replace routine only needs a partial detach
> procedure:
> + * it does not need the iommu_detach_group(); it will attach the device to
> a new
> + * hw_pagetable after a partial detach from the currently attached
> hw_pagetable,
> + * so certain steps can be skipped if two hw_pagetables have the same
> IOAS.
> + */
> +static void __iommmufd_device_detach(struct iommufd_device *idev,
> +				     struct iommufd_hw_pagetable
> *new_hwpt,
> +				     bool detach_group)
> +{
> +	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> +	struct iommufd_ioas *new_ioas = NULL;
> +
> +	if (new_hwpt)
> +		new_ioas = new_hwpt->ioas;
> +
> +	mutex_lock(&hwpt->devices_lock);
> +	list_del(&idev->devices_item);
> +	if (hwpt->ioas != new_ioas)
> +		mutex_lock(&hwpt->ioas->mutex);

The lock order is mostly hwpt->ioas->mutex and then hwpt->devices_lock.
See iommufd_device_auto_get_domain(). If possible, switch the locking
order here. Also, rename hwpt to cur_hwpt; this may help reviewers
distinguish it from the hwpt in the caller of this function. It looks
like a deadlock at first glance, but not after a closer reading.

> +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> +		if (list_empty(&hwpt->devices)) {
> +			iopt_table_remove_domain(&hwpt->ioas->iopt,
> +						 hwpt->domain);
> +			list_del(&hwpt->hwpt_item);
> +		}
> +		if (detach_group)
> +			iommu_detach_group(hwpt->domain, idev->group);
> +	}
> +	if (hwpt->ioas != new_ioas) {
> +		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev-
> >dev);
> +		mutex_unlock(&hwpt->ioas->mutex);
> +	}
> +	mutex_unlock(&hwpt->devices_lock);
> +
> +	if (hwpt->auto_domain)
> +		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
> +	else
> +		refcount_dec(&hwpt->obj.users);
> +
> +	idev->hwpt = NULL;
> +
> +	refcount_dec(&idev->obj.users);
> +}
> +
>  static int iommufd_device_do_attach(struct iommufd_device *idev,
>  				    struct iommufd_hw_pagetable *hwpt)
>  {
> +	struct iommufd_hw_pagetable *cur_hwpt = idev->hwpt;
>  	phys_addr_t sw_msi_start = PHYS_ADDR_MAX;
>  	int rc;
> 
> @@ -236,7 +290,7 @@ static int iommufd_device_do_attach(struct
> iommufd_device *idev,
>  	 * the group once for the first device that is in the group.
>  	 */
>  	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> -		rc = iommu_attach_group(hwpt->domain, idev->group);
> +		rc = iommu_group_replace_domain(idev->group, hwpt-
> >domain);
>  		if (rc)
>  			goto out_iova;
> 
> @@ -249,6 +303,10 @@ static int iommufd_device_do_attach(struct
> iommufd_device *idev,
>  		}
>  	}
> 
> +	/* Replace the cur_hwpt without iommu_detach_group() */
> +	if (cur_hwpt)
> +		__iommmufd_device_detach(idev, hwpt, false);
> +
>  	idev->hwpt = hwpt;
>  	refcount_inc(&hwpt->obj.users);
>  	list_add(&idev->devices_item, &hwpt->devices);
> @@ -256,7 +314,10 @@ static int iommufd_device_do_attach(struct
> iommufd_device *idev,
>  	return 0;
> 
>  out_detach:
> -	iommu_detach_group(hwpt->domain, idev->group);
> +	if (cur_hwpt)
> +		iommu_group_replace_domain(idev->group, cur_hwpt-
> >domain);
> +	else
> +		iommu_detach_group(hwpt->domain, idev->group);
>  out_iova:
>  	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
>  out_unlock:
> @@ -345,6 +406,13 @@ int iommufd_device_attach(struct iommufd_device
> *idev, u32 *pt_id)
>  		struct iommufd_hw_pagetable *hwpt =
>  			container_of(pt_obj, struct
> iommufd_hw_pagetable, obj);
> 
> +		if (idev->hwpt == hwpt)
> +			goto out_done;
> +		if (idev->hwpt && idev->hwpt->auto_domain) {
> +			rc = -EBUSY;

This means that if the device was attached to an auto-created hwpt, then we
cannot replace it with a user-allocated hwpt? If yes, this means the
replace is not available until user hwpt support lands, which is part of nesting.

> +			goto out_put_pt_obj;
> +		}
> +
>  		mutex_lock(&hwpt->ioas->mutex);
>  		rc = iommufd_device_do_attach(idev, hwpt);
>  		mutex_unlock(&hwpt->ioas->mutex);
> @@ -356,6 +424,8 @@ int iommufd_device_attach(struct iommufd_device
> *idev, u32 *pt_id)
>  		struct iommufd_ioas *ioas =
>  			container_of(pt_obj, struct iommufd_ioas, obj);
> 
> +		if (idev->hwpt)
> +			return -EBUSY;

So we don't allow ioas replacement for physical devices, is that right?
It looks like emulated devices allow it.

>  		rc = iommufd_device_auto_get_domain(idev, ioas);
>  		if (rc)
>  			goto out_put_pt_obj;
> @@ -367,6 +437,7 @@ int iommufd_device_attach(struct iommufd_device
> *idev, u32 *pt_id)
>  	}
> 
>  	refcount_inc(&idev->obj.users);
> +out_done:
>  	*pt_id = idev->hwpt->obj.id;
>  	rc = 0;
> 
> @@ -385,31 +456,7 @@
> EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
>   */
>  void iommufd_device_detach(struct iommufd_device *idev)
>  {
> -	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> -
> -	mutex_lock(&hwpt->ioas->mutex);
> -	mutex_lock(&hwpt->devices_lock);
> -	list_del(&idev->devices_item);
> -	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> -		if (list_empty(&hwpt->devices)) {
> -			iopt_table_remove_domain(&hwpt->ioas->iopt,
> -						 hwpt->domain);
> -			list_del(&hwpt->hwpt_item);
> -		}
> -		iommu_detach_group(hwpt->domain, idev->group);
> -	}
> -	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
> -	mutex_unlock(&hwpt->devices_lock);
> -	mutex_unlock(&hwpt->ioas->mutex);
> -
> -	if (hwpt->auto_domain)
> -		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
> -	else
> -		refcount_dec(&hwpt->obj.users);
> -
> -	idev->hwpt = NULL;
> -
> -	refcount_dec(&idev->obj.users);
> +	__iommmufd_device_detach(idev, NULL, true);
>  }
>  EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h
> b/drivers/iommu/iommufd/iommufd_private.h
> index 593138bb37b8..200c783800ad 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -9,6 +9,8 @@
>  #include <linux/refcount.h>
>  #include <linux/uaccess.h>
> 
> +#include "../iommu-priv.h"
> +
>  struct iommu_domain;
>  struct iommu_group;
>  struct iommu_option;
> --
> 2.39.1

Regards,
Yi Liu
Yi Liu Feb. 8, 2023, 8:12 a.m. UTC | #2
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, February 8, 2023 5:18 AM
> 
> +/**
> + * __iommmufd_device_detach - Detach a device from idev->hwpt to

Nit: s/__iommmufd/__iommufd/

Regards,
Yi Liu
Tian, Kevin Feb. 9, 2023, 4 a.m. UTC | #3
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, February 8, 2023 5:18 AM
> 
> iommu_group_replace_domain() is introduced to support use cases where
> an
> iommu_group can be attached to a new domain without getting detached
> from
> the old one. This replacement feature will be useful, for cases such as:
> 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
>    table with a larger table (PASID=N)
> 2) Nesting mode, when switching the attaching device from an S2 domain
>    to an S1 domain, or when switching between relevant S1 domains.
> as it allows these cases to switch seamlessly without a DMA disruption.
> 
> So, call iommu_group_replace_domain() in the iommufd_device_do_attach().
> And add a __iommmufd_device_detach helper to allow the replace routine
> to
> do a partial detach on the current hwpt that's being replaced. Though the
> updated locking logic is overcomplicated, it will be eased, once those
> iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> allocation/destroy() functions in the coming nesting series, as that'll
> depend on a new ->domain_alloc_user op in the iommu core.

then why not move those changes into this series to make it simpler?

> 
> Also, block replace operations that are from/to auto_domains, i.e. only
> user-allocated hw_pagetables can be replaced or replaced with.

where does this restriction come from? iommu_group_replace_domain()
can switch between any two UNMANAGED domains. What is the extra
problem in iommufd to support from/to auto domains?

> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/device.c          | 101 +++++++++++++++++-------
>  drivers/iommu/iommufd/iommufd_private.h |   2 +
>  2 files changed, 76 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/device.c
> b/drivers/iommu/iommufd/device.c
> index b8c3e3baccb5..8a9834fc129a 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -9,6 +9,8 @@
>  #include "io_pagetable.h"
>  #include "iommufd_private.h"
> 
> +MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
> +
>  static bool allow_unsafe_interrupts;
>  module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(
> @@ -194,9 +196,61 @@ static bool
> iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
>  	return false;
>  }
> 
> +/**
> + * __iommmufd_device_detach - Detach a device from idev->hwpt to
> new_hwpt

'from ... to ...' implies replace semantics. Then this should be called
iommufd_device_replace_hwpt().

> + * @idev: device to detach
> + * @new_hwpt: new hw_pagetable to attach (pass in NULL for a simple
> detach)
> + * @detach_group: flag to call iommu_detach_group
> + *
> + * This is a cleanup helper shared by the replace and detach routines.
> Comparing
> + * to a detach routine, a replace routine only needs a partial detach
> procedure:
> + * it does not need the iommu_detach_group(); it will attach the device to a
> new
> + * hw_pagetable after a partial detach from the currently attached
> hw_pagetable,
> + * so certain steps can be skipped if two hw_pagetables have the same IOAS.
> + */
> +static void __iommmufd_device_detach(struct iommufd_device *idev,
> +				     struct iommufd_hw_pagetable
> *new_hwpt,
> +				     bool detach_group)
> +{
> +	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> +	struct iommufd_ioas *new_ioas = NULL;
> +
> +	if (new_hwpt)
> +		new_ioas = new_hwpt->ioas;
> +
> +	mutex_lock(&hwpt->devices_lock);
> +	list_del(&idev->devices_item);
> +	if (hwpt->ioas != new_ioas)
> +		mutex_lock(&hwpt->ioas->mutex);

I think new_ioas->mutex was meant here.

> +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> +		if (list_empty(&hwpt->devices)) {
> +			iopt_table_remove_domain(&hwpt->ioas->iopt,
> +						 hwpt->domain);
> +			list_del(&hwpt->hwpt_item);
> +		}

I'm not sure how this can be fully shared between detach and replace.
Here some work e.g. above needs to be done before calling
iommu_group_replace_domain() while others can be done afterwards.
Nicolin Chen Feb. 9, 2023, 8:55 p.m. UTC | #4
On Wed, Feb 08, 2023 at 08:08:42AM +0000, Liu, Yi L wrote:
 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, February 8, 2023 5:18 AM
> >
> > iommu_group_replace_domain() is introduced to support use cases where
> > an
> > iommu_group can be attached to a new domain without getting detached
> > from
> > the old one. This replacement feature will be useful, for cases such as:
> > 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
> >    table with a larger table (PASID=N)
> > 2) Nesting mode, when switching the attaching device from an S2 domain
> >    to an S1 domain, or when switching between relevant S1 domains.
> > as it allows these cases to switch seamlessly without a DMA disruption.
> >
> > So, call iommu_group_replace_domain() in the
> > iommufd_device_do_attach().
> > And add a __iommmufd_device_detach helper to allow the replace routine
> > to
> > do a partial detach on the current hwpt that's being replaced. Though the
> > updated locking logic is overcomplicated, it will be eased, once those
> > iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> > allocation/destroy() functions in the coming nesting series, as that'll
> > depend on a new ->domain_alloc_user op in the iommu core.
> >
> > Also, block replace operations that are from/to auto_domains, i.e. only
> > user-allocated hw_pagetables can be replaced or replaced with.
> >
> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > ---
> >  drivers/iommu/iommufd/device.c          | 101 +++++++++++++++++-------
> >  drivers/iommu/iommufd/iommufd_private.h |   2 +
> >  2 files changed, 76 insertions(+), 27 deletions(-)
> >
> > diff --git a/drivers/iommu/iommufd/device.c
> > b/drivers/iommu/iommufd/device.c
> > index b8c3e3baccb5..8a9834fc129a 100644
> > --- a/drivers/iommu/iommufd/device.c
> > +++ b/drivers/iommu/iommufd/device.c
> > @@ -9,6 +9,8 @@
> >  #include "io_pagetable.h"
> >  #include "iommufd_private.h"
> >
> > +MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
> > +
> >  static bool allow_unsafe_interrupts;
> >  module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
> >  MODULE_PARM_DESC(
> > @@ -194,9 +196,61 @@ static bool
> > iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
> >       return false;
> >  }
> >
> > +/**
> > + * __iommmufd_device_detach - Detach a device from idev->hwpt to
> > new_hwpt
> 
> This function doesn't do anything to make this device attached to new_hwpt.
> It is done in the iommufd_device_attach_ioas(). New_hwpt here indicates if
> this detach requires to do some extra thing. E.g. remove reserved iova from
> the idev->hwpt->ioas. So may just say " Detach a device from idev->hwpt",
> and explain the usage of new_hwpt in the below.

Yea. You are right.

> > + * @idev: device to detach
> > + * @new_hwpt: new hw_pagetable to attach (pass in NULL for a simple
> > detach)
> 
> The new hw_pagetable to be attached.

OK.

> > + * @detach_group: flag to call iommu_detach_group
> > + *
> > + * This is a cleanup helper shared by the replace and detach routines.
> > Comparing
> > + * to a detach routine, a replace routine only needs a partial detach
> > procedure:
> > + * it does not need the iommu_detach_group(); it will attach the device to
> > a new
> > + * hw_pagetable after a partial detach from the currently attached
> > hw_pagetable,
> > + * so certain steps can be skipped if two hw_pagetables have the same
> > IOAS.
> > + */
> > +static void __iommmufd_device_detach(struct iommufd_device *idev,
> > +                                  struct iommufd_hw_pagetable
> > *new_hwpt,
> > +                                  bool detach_group)
> > +{
> > +     struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> > +     struct iommufd_ioas *new_ioas = NULL;
> > +
> > +     if (new_hwpt)
> > +             new_ioas = new_hwpt->ioas;
> > +
> > +     mutex_lock(&hwpt->devices_lock);
> > +     list_del(&idev->devices_item);
> > +     if (hwpt->ioas != new_ioas)
> > +             mutex_lock(&hwpt->ioas->mutex);
> 
> The lock order is mostly hwpt->ioas->mutex and then hwpt->devices_lock.
> See the iommufd_device_auto_get_domain(). If possible, may switch the
> order sequence here.

Yea, I know it's a bit strange. Yet...

Our nesting series simplifies this part to:
	if (cur_ioas != new_ioas) {
		mutex_lock(&hwpt->ioas->mutex);
		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
		mutex_unlock(&hwpt->ioas->mutex);
	}

So, here is trying to avoid something like:
	if (cur_ioas != new_ioas)
		mutex_lock(&hwpt->ioas->mutex);
	// doing something
	if (cur_ioas != new_ioas)
		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
	// doing something
	if (cur_ioas != new_ioas)
		mutex_unlock(&hwpt->ioas->mutex);

> Also, rename hwpt to be cur_hwpt, this may help
> reviewers to distinguish it from the hwpt in the caller of this function. It
> looks to be a deadlock at first look, but not after closer reading.

Sure.

> > @@ -345,6 +406,13 @@ int iommufd_device_attach(struct iommufd_device
> > *idev, u32 *pt_id)
> >               struct iommufd_hw_pagetable *hwpt =
> >                       container_of(pt_obj, struct
> > iommufd_hw_pagetable, obj);
> >
> > +             if (idev->hwpt == hwpt)
> > +                     goto out_done;
> > +             if (idev->hwpt && idev->hwpt->auto_domain) {
> > +                     rc = -EBUSY;
> 
> This means if device was attached to an auto_created hwpt, then we
> cannot replace it with a user allocated hwpt? If yes, this means the
> replace is not available until user hwpt support, which is part of nesting.

After aligning with Jason, this limit here might be wrong, as we
should be able to support replacing an IOAS. I'd need to take a
closer look and fix it in v3.

> > +             if (idev->hwpt)
> > +                     return -EBUSY;
> 
> So we don't allow ioas replacement for physical devices. Is it?
> Looks like emulated devices allows it.

This was to avoid an replace with an auto_domain. Similarly, it's
likely wrong, as I replied above.

Thanks
Nic
Nicolin Chen Feb. 9, 2023, 8:56 p.m. UTC | #5
On Wed, Feb 08, 2023 at 08:12:55AM +0000, Liu, Yi L wrote:
> External email: Use caution opening links or attachments
> 
> 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, February 8, 2023 5:18 AM
> >
> > +/**
> > + * __iommmufd_device_detach - Detach a device from idev->hwpt to
> 
> Nit: s/_iommmufd/__iommufd

Will fix it. Thanks!
Nicolin Chen Feb. 9, 2023, 9:13 p.m. UTC | #6
On Thu, Feb 09, 2023 at 04:00:52AM +0000, Tian, Kevin wrote:
 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, February 8, 2023 5:18 AM
> >
> > iommu_group_replace_domain() is introduced to support use cases where
> > an
> > iommu_group can be attached to a new domain without getting detached
> > from
> > the old one. This replacement feature will be useful, for cases such as:
> > 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
> >    table with a larger table (PASID=N)
> > 2) Nesting mode, when switching the attaching device from an S2 domain
> >    to an S1 domain, or when switching between relevant S1 domains.
> > as it allows these cases to switch seamlessly without a DMA disruption.
> >
> > So, call iommu_group_replace_domain() in the iommufd_device_do_attach().
> > And add a __iommmufd_device_detach helper to allow the replace routine
> > to
> > do a partial detach on the current hwpt that's being replaced. Though the
> > updated locking logic is overcomplicated, it will be eased, once those
> > iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> > allocation/destroy() functions in the coming nesting series, as that'll
> > depend on a new ->domain_alloc_user op in the iommu core.
> 
> then why not moving those changes into this series to make it simple?

The simplification depends on the new ->domain_alloc_user op and
its implementation in SMMU driver, which would be introduced by
the nesting series of VT-d and SMMU respectively.

At this point, it's hard to decide the best sequence of our three
series. Putting this replace series first simply because it seems
to be closer to get merged than the other two bigger series.

> > Also, block replace operations that are from/to auto_domains, i.e. only
> > user-allocated hw_pagetables can be replaced or replaced with.
> 
> where does this restriction come from? iommu_group_replace_domain()
> can switch between any two UNMANAGED domains. What is the extra
> problem in iommufd to support from/to auto domains?

It was my misunderstanding. We should have supported that.
Will fix in v3 and add the corresponding support.

> > +/**
> > + * __iommmufd_device_detach - Detach a device from idev->hwpt to
> > new_hwpt
> 
> 'from ... to ...' means a replace semantics. then this should be called
> iommufd_device_replace_hwpt().

Had a hard time to think about the naming, it's indeed a detach-
only routine, but it takes a parameter named new_hwpt...

Perhaps I should just follow Yi's suggestion by rephrasing the
narrative of this function.

> > +static void __iommmufd_device_detach(struct iommufd_device *idev,
> > +                                  struct iommufd_hw_pagetable
> > *new_hwpt,
> > +                                  bool detach_group)
> > +{
> > +     struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> > +     struct iommufd_ioas *new_ioas = NULL;
> > +
> > +     if (new_hwpt)
> > +             new_ioas = new_hwpt->ioas;
> > +
> > +     mutex_lock(&hwpt->devices_lock);
> > +     list_del(&idev->devices_item);
> > +     if (hwpt->ioas != new_ioas)
> > +             mutex_lock(&hwpt->ioas->mutex);
> 
> I think new_ioas->mutext was meant here.

new_hwpt is an input from an replace routine, where it holds the
new_ioas->mutex already. Yi's right that the code here is a bit
confusing. I will try to change it a bit for readability.
 
> > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > +             if (list_empty(&hwpt->devices)) {
> > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > +                                              hwpt->domain);
> > +                     list_del(&hwpt->hwpt_item);
> > +             }
> 
> I'm not sure how this can be fully shared between detach and replace.
> Here some work e.g. above needs to be done before calling
> iommu_group_replace_domain() while others can be done afterwards.

This iopt_table_remove_domain/list_del is supposed to be done in
the hwpt's destroy() actually. We couldn't move it because it'd
need the new domain_alloc_user op and its implementation in ARM
driver. Overall, I think it should be safe to put it behind the
iommu_group_replace_domain().

Thanks
Nic
Jason Gunthorpe Feb. 10, 2023, 12:01 a.m. UTC | #7
On Thu, Feb 09, 2023 at 01:13:07PM -0800, Nicolin Chen wrote:
> On Thu, Feb 09, 2023 at 04:00:52AM +0000, Tian, Kevin wrote:
>  
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Wednesday, February 8, 2023 5:18 AM
> > >
> > > iommu_group_replace_domain() is introduced to support use cases where
> > > an
> > > iommu_group can be attached to a new domain without getting detached
> > > from
> > > the old one. This replacement feature will be useful, for cases such as:
> > > 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
> > >    table with a larger table (PASID=N)
> > > 2) Nesting mode, when switching the attaching device from an S2 domain
> > >    to an S1 domain, or when switching between relevant S1 domains.
> > > as it allows these cases to switch seamlessly without a DMA disruption.
> > >
> > > So, call iommu_group_replace_domain() in the iommufd_device_do_attach().
> > > And add a __iommmufd_device_detach helper to allow the replace routine
> > > to
> > > do a partial detach on the current hwpt that's being replaced. Though the
> > > updated locking logic is overcomplicated, it will be eased, once those
> > > iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> > > allocation/destroy() functions in the coming nesting series, as that'll
> > > depend on a new ->domain_alloc_user op in the iommu core.
> > 
> > then why not moving those changes into this series to make it simple?
> 
> The simplification depends on the new ->domain_alloc_user op and
> its implementation in SMMU driver, which would be introduced by
> the nesting series of VT-d and SMMU respectively.

Since we are not fixing all the drivers at this point, this argument
doesn't hold water.

It is as I said in the other email, there should be no changes to the
normal attach/replace path when adding manual HWPT creation - once
those are removed there should be minimal connection between these two
series.

Jason
Tian, Kevin Feb. 10, 2023, 2:11 a.m. UTC | #8
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, February 10, 2023 5:13 AM
> 
> > > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > > +             if (list_empty(&hwpt->devices)) {
> > > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > > +                                              hwpt->domain);
> > > +                     list_del(&hwpt->hwpt_item);
> > > +             }
> >
> > I'm not sure how this can be fully shared between detach and replace.
> > Here some work e.g. above needs to be done before calling
> > iommu_group_replace_domain() while others can be done afterwards.
> 
> This iopt_table_remove_domain/list_del is supposed to be done in
> the hwpt's destroy() actually. We couldn't move it because it'd
> need the new domain_alloc_user op and its implementation in ARM
> driver. Overall, I think it should be safe to put it behind the
> iommu_group_replace_domain().
> 

My confusion is that we have different flows between detach/attach
and replace.

today with separate detach+attach we have following flow:

	Remove device from current hwpt;
	if (last_device in hwpt) {
		Remove hwpt domain from current iopt;
		if (last_device in group)
			detach group from hwpt domain;
	}

	if (first device in group) {
		attach group to new hwpt domain;
		if (first_device in hwpt)
			Add hwpt domain to new iopt;
	Add device to new hwpt;

but replace flow is different on the detach part:

	if (first device in group) {
		replace group's domain from current hwpt to new hwpt;
		if (first_device in hwpt)
			Add hwpt domain to new iopt;
	}

	Remove device from old hwpt;
	if (last_device in old hwpt)
		Remove hwpt domain from old iopt;

	Add device to new hwpt;

I'm yet to figure out whether we have sufficient lock protection to
prevent other paths from using old iopt/hwpt to find the device
which is already attached to a different domain.
Nicolin Chen Feb. 10, 2023, 8:50 p.m. UTC | #9
On Thu, Feb 09, 2023 at 08:01:00PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 09, 2023 at 01:13:07PM -0800, Nicolin Chen wrote:
> > On Thu, Feb 09, 2023 at 04:00:52AM +0000, Tian, Kevin wrote:
> >  
> > > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > > Sent: Wednesday, February 8, 2023 5:18 AM
> > > >
> > > > iommu_group_replace_domain() is introduced to support use cases where
> > > > an
> > > > iommu_group can be attached to a new domain without getting detached
> > > > from
> > > > the old one. This replacement feature will be useful, for cases such as:
> > > > 1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
> > > >    table with a larger table (PASID=N)
> > > > 2) Nesting mode, when switching the attaching device from an S2 domain
> > > >    to an S1 domain, or when switching between relevant S1 domains.
> > > > as it allows these cases to switch seamlessly without a DMA disruption.
> > > >
> > > > So, call iommu_group_replace_domain() in the iommufd_device_do_attach().
> > > > And add a __iommmufd_device_detach helper to allow the replace routine
> > > > to
> > > > do a partial detach on the current hwpt that's being replaced. Though the
> > > > updated locking logic is overcomplicated, it will be eased, once those
> > > > iopt_table_add/remove_ioas and list_add/del calls are moved to hwpt's
> > > > allocation/destroy() functions in the coming nesting series, as that'll
> > > > depend on a new ->domain_alloc_user op in the iommu core.
> > > 
> > > then why not moving those changes into this series to make it simple?
> > 
> > The simplification depends on the new ->domain_alloc_user op and
> > its implementation in SMMU driver, which would be introduced by
> > the nesting series of VT-d and SMMU respectively.
> 
> Since we are not fixing all the drivers at this point, this argument
> doesn't hold water.
> 
> It is as I said in the other email, there should be no changes to the
> normal attach/replace path when adding manual HWPT creation - once
> those are removed there should be minimal connection between these two
> series.

Yes. I replied here earlier than the discussion with you in
that thread. I think I should just drop this line of commit
messages.

Thanks
Nic
Nicolin Chen Feb. 11, 2023, 12:10 a.m. UTC | #10
On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:

> > > > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > > > +             if (list_empty(&hwpt->devices)) {
> > > > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > > > +                                              hwpt->domain);
> > > > +                     list_del(&hwpt->hwpt_item);
> > > > +             }
> > >
> > > I'm not sure how this can be fully shared between detach and replace.
> > > Here some work e.g. above needs to be done before calling
> > > iommu_group_replace_domain() while others can be done afterwards.
> >
> > This iopt_table_remove_domain/list_del is supposed to be done in
> > the hwpt's destroy() actually. We couldn't move it because it'd
> > need the new domain_alloc_user op and its implementation in ARM
> > driver. Overall, I think it should be safe to put it behind the
> > iommu_group_replace_domain().
> >
> 
> My confusion is that we have different flows between detach/attach
> and replace.
> 
> today with separate detach+attach we have following flow:
> 
>         Remove device from current hwpt;
>         if (last_device in hwpt) {
>                 Remove hwpt domain from current iopt;
>                 if (last_device in group)
>                         detach group from hwpt domain;
>         }
> 
>         if (first device in group) {
>                 attach group to new hwpt domain;
>                 if (first_device in hwpt)
>                         Add hwpt domain to new iopt;
>         Add device to new hwpt;
> 
> but replace flow is different on the detach part:
> 
>         if (first device in group) {
>                 replace group's domain from current hwpt to new hwpt;
>                 if (first_device in hwpt)
>                         Add hwpt domain to new iopt;
>         }
> 
>         Remove device from old hwpt;
>         if (last_device in old hwpt)
>                 Remove hwpt domain from old iopt;
> 
>         Add device to new hwpt;

Oh... thinking it carefully, I see the flow does look a bit off.
Perhaps it's better to have a similar flow for replace.

However, I think something would be still different due to its
tricky nature, especially for a multi-device iommu_group.

An iommu_group_detach happens only when a device is the last one
in its group to go through the routine via a DETACH ioctl, while
an iommu_group_replace_domain() happens only when the device is
the first one in its group to go through the routine via another
ATTACH ioctl. However, when the first device does a replace, the
cleanup routine of the old hwpt is a NOP, since there are still
other devices (same group) in the old hwpt. And two implications
here:
1) Any other device in the same group has to forcibly switch to
   the new domain, when the first device does a replace.
2) The actual hwpt cleanup can only happen at the last device's
   replace call.

This also means that the kernel has to rely on user space to
faithfully replace all active devices in the group:

For a three-device iommu_group,
[scenario 1]
	a. ATTACH dev1 to hwpt1;
	b. ATTACH dev2 to hwpt1;
	c. ATTACH dev3 to hwpt1;
	d. ATTACH (REPLACE) dev1 to hwpt2;
	   (no hwpt1 cleanup; replace dev2&3 too; do hwpt2 init)
	e. ATTACH (REPLACE) dev2 to hwpt2; // user space must do
	   (no hwpt1 cleanup; no dev2 replace; no hwpt2 init)
	f. ATTACH (REPLACE) dev3 to hwpt2; // user space must do
	   (do hwpt1 cleanup; no dev3 replace; no hwpt2 init)

[scenario 2]
	a. ATTACH dev1 to hwpt1;
	b. ATTACH dev2 to hwpt1;
	c. ATTACH dev3 to hwpt1;
	d. DETACH dev3 from hwpt1;
	   (detach dev3; no hwpt1 cleanup)
	f. ATTACH (REPLACE) dev1 to hwpt2;
	   (no hwpt1 cleanup; replace dev2&3 too; do hwpt2 init)
	g. ATTACH (REPLACE) dev2 to hwpt2;	// user space must do
	   (do hwpt1 cleanup; no dev2 replace; no hwpt2 init)
	h. (optional) ATTACH dev3 to hwpt2;	// clean ATTACH, not a REPLACE
	   (no hwpt1 cleanup; no dev3 replace; no hwpt2 init)

[scenario 3]
	a. ATTACH dev1 to hwpt1;
	b. ATTACH dev2 to hwpt1;
	c. ATTACH dev3 to hwpt1;
	d. DETACH dev3 from hwpt1;
	   (detach dev3; no hwpt1 cleanup)
	e. DETACH dev2 from hwpt1;
	   (detach dev2; no hwpt1 cleanup)
	f. ATTACH (REPLACE) dev1 to hwpt2;
	   (do hwpt1 cleanup; replace dev2&3 too; do hwpt2 init)
	g. (optional) ATTACH dev2 to hwpt2;	// clean ATTACH, not a REPLACE
	   (no hwpt1 cleanup; no dev2 replace; no hwpt2 init)
	h. (optional) ATTACH dev3 to hwpt2;	// clean ATTACH, not a REPLACE
	   (no hwpt1 cleanup; no dev3 replace; no hwpt2 init)

Following the narratives above,

[current detach+attach flow]
	// DETACH dev1 from hwpt1;
	Log dev1 out of the hwpt1's device list;
	NOP; // hwpt1 has its group;
	iopt_remove_reserved_iova(hwpt1->iopt, dev1);
	idev1->hwpt = NULL;
	refcount_dec();
	// DETACH dev2 from hwpt1;
	Log dev2 out of the hwpt1's device list;
	if (hwpt1 does not have its group) { // last device to detach
		if (hwpt1's device list is empty)
			iopt_table_remove_domain/list_del(hwpt1);
		iommu_detach_group();
	}
	iopt_remove_reserved_iova(hwpt1->iopt, dev2);
	idev2->hwpt = NULL;
	refcount_dec();
	...
	// ATTACH dev1 to hwpt2;
	iopt_table_enforce_group_resv_regions(hwpt2->iopt, dev1);
	if (hwpt2 does not have its group) { // first device to attach
		iommu_attach_group();
		if (hwpt2's device list is empty)
			iopt_table_add_domain/list_add(hwpt2);
	}
	idev1->hwpt = hwpt2;
	refcount_inc();
	Log dev1 in the hwpt2's device list;
	// ATTACH dev2 to hwpt2;
	iopt_table_enforce_group_resv_regions(hwpt2->iopt, dev2);
	NOP; // hwpt2 has its group;
	idev2->hwpt = hwpt2;
	refcount_inc();
	Log dev2 in to the hwpt2's device list;


[correct (?) replace flow - scenario 1 above]

	// 1.d Switch (REPLACE) dev1 from hwpt1 to hwpt2;
	partial detach (dev1) {
		Log dev1 out of the hwpt1's device list;
		NOP // hwpt1 has its group, and hwpt1's device list isn't empty
		iopt_remove_reserved_iova(hwpt1->iopt, dev1);
		refcount_dec();
	}
	iopt_table_enforce_group_resv_regions(hwpt2->iopt, dev1);
	if (hwpt2 does not have its group) { // first device to replace
		iommu_group_replace_domain();
		if (hwpt2's device list is empty)
			iopt_table_add_domain/list_add(hwpt2);
	}
	idev1->hwpt = hwpt2;
	refcount_inc();
	Log dev1 in the hwpt2's device list;

	// 1.e Switch (REPLACE) dev2 from hwpt1 to hwpt2;
	partial detach (dev2) {
		Log dev2 out of the hwpt1's device list;
		NOP // hwpt1 has its group, and hwpt1's device list isn't empty
		iopt_remove_reserved_iova(hwpt1->iopt, dev2);
		refcount_dec();
	}
	iopt_table_enforce_group_resv_regions(hwpt2->iopt, dev2);
	NOP; // hwpt2 has its group, and hwpt2's device list isn't empty
	idev2->hwpt = hwpt2;
	refcount_inc();
	Log dev2 in the hwpt2's device list;

	// 1.f Switch (REPLACE) dev3 from hwpt1 to hwpt2;
	partial detach (dev3) {
		Log dev3 out of the hwpt1's device list;
		if (hwpt1 does not have its group) { // last device to detach
			if (hwpt1's device list is empty)
				iopt_table_remove_domain/list_del(hwpt1);
		}
		iopt_remove_reserved_iova(hwpt1->iopt, dev3);
		refcount_dec();
	}
	iopt_table_enforce_group_resv_regions(hwpt2->iopt, dev3);
	NOP; // hwpt2 has its group, and hwpt2's device list isn't empty
	idev3->hwpt = hwpt2;
	refcount_inc();
	Log dev3 in the hwpt2's device list;

And, this would also complicate the error-out routines too...

> I'm yet to figure out whether we have sufficient lock protection to
> prevent other paths from using old iopt/hwpt to find the device
> which is already attached to a different domain.

With the correct (?) flow, I think it'd be safer for one-device
group. But it's gets tricker for the multi-device case above:
the dev2 and dev3 are already in the new domain, but all their
iommufd objects (idev->hwpt and iopt) are lagging. Do we need a
lock across the three IOCTLs?

One way to avoid it is to force-update idev2 and idev3 too when
idev1 does a replace -- by iterating all same-group devices out
of the old hwpt. This, however, feels like a violation of the
device-centric model...

i.e. scenario 1:
	a. ATTACH dev1 to hwpt1;
	b. ATTACH dev2 to hwpt1;
	c. ATTACH dev3 to hwpt1;
	d. ATTACH (REPLACE) dev1 to hwpt2;
	   (do hwpt1 cleanup; replace dev2&3 and update idev2&3 too; do hwpt2 init)
	e. ATTACH (REPLACE) dev2 to hwpt2; // user space must do, or be aware of 1.d
	   (kernel does dummy)
	f. ATTACH (REPLACE) dev3 to hwpt2; // user space must do, or be aware of 1.d
	   (kernel does dummy)

Thanks
Nic
Tian, Kevin Feb. 13, 2023, 2:34 a.m. UTC | #11
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, February 11, 2023 8:10 AM
> 
> On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:
> 
> > > > > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > > > > +             if (list_empty(&hwpt->devices)) {
> > > > > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > > > > +                                              hwpt->domain);
> > > > > +                     list_del(&hwpt->hwpt_item);
> > > > > +             }
> > > >
> > > > I'm not sure how this can be fully shared between detach and replace.
> > > > Here some work e.g. above needs to be done before calling
> > > > iommu_group_replace_domain() while others can be done afterwards.
> > >
> > > This iopt_table_remove_domain/list_del is supposed to be done in
> > > the hwpt's destroy() actually. We couldn't move it because it'd
> > > need the new domain_alloc_user op and its implementation in ARM
> > > driver. Overall, I think it should be safe to put it behind the
> > > iommu_group_replace_domain().
> > >
> >
> > My confusion is that we have different flows between detach/attach
> > and replace.
> >
> > today with separate detach+attach we have following flow:
> >
> >         Remove device from current hwpt;
> >         if (last_device in hwpt) {
> >                 Remove hwpt domain from current iopt;
> >                 if (last_device in group)
> >                         detach group from hwpt domain;
> >         }
> >
> >         if (first device in group) {
> >                 attach group to new hwpt domain;
> >                 if (first_device in hwpt)
> >                         Add hwpt domain to new iopt;
> >         Add device to new hwpt;
> >
> > but replace flow is different on the detach part:
> >
> >         if (first device in group) {
> >                 replace group's domain from current hwpt to new hwpt;
> >                 if (first_device in hwpt)
> >                         Add hwpt domain to new iopt;
> >         }
> >
> >         Remove device from old hwpt;
> >         if (last_device in old hwpt)
> >                 Remove hwpt domain from old iopt;
> >
> >         Add device to new hwpt;
> 
> Oh... thinking it carefully, I see the flow does look a bit off.
> Perhaps it's better to have a similar flow for replace.
> 
> However, I think something would be still different due to its
> tricky nature, especially for a multi-device iommu_group.
> 
> An iommu_group_detach happens only when a device is the last one
> in its group to go through the routine via a DETACH ioctl, while
> an iommu_group_replace_domain() happens only when the device is
> the first one in its group to go through the routine via another
> ATTACH ioctl. However, when the first device does a replace, the
> cleanup routine of the old hwpt is a NOP, since there are still
> other devices (same group) in the old hwpt. And two implications
> here:
> 1) Any other device in the same group has to forcibly switch to
>    the new domain, when the first device does a replace.
> 2) The actual hwpt cleanup can only happen at the last device's
>    replace call.
> 
> This also means that kernel has to rely on the integrity of the
> user space that it must replace all active devices in the group:
> 

Jason suggested to move hwpt cleanup out of the detach path
in his reply to patch7. Presumably with that fix the major tricky
point about hwpt in following scenarios would disappear. Let's
see how it will work out then. 
Nicolin Chen Feb. 13, 2023, 7:48 a.m. UTC | #12
On Mon, Feb 13, 2023 at 02:34:18AM +0000, Tian, Kevin wrote:
> External email: Use caution opening links or attachments
> 
> 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, February 11, 2023 8:10 AM
> >
> > On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:
> >
> > > > > > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > > > > > +             if (list_empty(&hwpt->devices)) {
> > > > > > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > > > > > +                                              hwpt->domain);
> > > > > > +                     list_del(&hwpt->hwpt_item);
> > > > > > +             }
> > > > >
> > > > > I'm not sure how this can be fully shared between detach and replace.
> > > > > Here some work e.g. above needs to be done before calling
> > > > > iommu_group_replace_domain() while others can be done afterwards.
> > > >
> > > > This iopt_table_remove_domain/list_del is supposed to be done in
> > > > the hwpt's destroy() actually. We couldn't move it because it'd
> > > > need the new domain_alloc_user op and its implementation in ARM
> > > > driver. Overall, I think it should be safe to put it behind the
> > > > iommu_group_replace_domain().
> > > >
> > >
> > > My confusion is that we have different flows between detach/attach
> > > and replace.
> > >
> > > today with separate detach+attach we have following flow:
> > >
> > >         Remove device from current hwpt;
> > >         if (last_device in hwpt) {
> > >                 Remove hwpt domain from current iopt;
> > >                 if (last_device in group)
> > >                         detach group from hwpt domain;
> > >         }
> > >
> > >         if (first device in group) {
> > >                 attach group to new hwpt domain;
> > >                 if (first_device in hwpt)
> > >                         Add hwpt domain to new iopt;
> > >         Add device to new hwpt;
> > >
> > > but replace flow is different on the detach part:
> > >
> > >         if (first device in group) {
> > >                 replace group's domain from current hwpt to new hwpt;
> > >                 if (first_device in hwpt)
> > >                         Add hwpt domain to new iopt;
> > >         }
> > >
> > >         Remove device from old hwpt;
> > >         if (last_device in old hwpt)
> > >                 Remove hwpt domain from old iopt;
> > >
> > >         Add device to new hwpt;
> >
> > Oh... thinking it carefully, I see the flow does look a bit off.
> > Perhaps it's better to have a similar flow for replace.
> >
> > However, I think something would be still different due to its
> > tricky nature, especially for a multi-device iommu_group.
> >
> > An iommu_group_detach happens only when a device is the last one
> > in its group to go through the routine via a DETACH ioctl, while
> > an iommu_group_replace_domain() happens only when the device is
> > the first one in its group to go through the routine via another
> > ATTACH ioctl. However, when the first device does a replace, the
> > cleanup routine of the old hwpt is a NOP, since there are still
> > other devices (same group) in the old hwpt. And two implications
> > here:
> > 1) Any other device in the same group has to forcibly switch to
> >    the new domain, when the first device does a replace.
> > 2) The actual hwpt cleanup can only happen at the last device's
> >    replace call.
> >
> > This also means that kernel has to rely on the integrity of the
> > user space that it must replace all active devices in the group:
> >
> 
> Jason suggested to move hwpt cleanup out of the detach path
> in his reply to patch7. Presumably with that fix the major tricky
> point about hwpt in following scenarios would disappear. Let's
> see how it will work out then. 
Tian, Kevin Feb. 13, 2023, 8:27 a.m. UTC | #13
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Monday, February 13, 2023 3:49 PM
> 
> On Mon, Feb 13, 2023 at 02:34:18AM +0000, Tian, Kevin wrote:
> > External email: Use caution opening links or attachments
> >
> >
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Saturday, February 11, 2023 8:10 AM
> > >
> > > On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:
> > >
> > > > > > > +     if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> > > > > > > +             if (list_empty(&hwpt->devices)) {
> > > > > > > +                     iopt_table_remove_domain(&hwpt->ioas->iopt,
> > > > > > > +                                              hwpt->domain);
> > > > > > > +                     list_del(&hwpt->hwpt_item);
> > > > > > > +             }
> > > > > >
> > > > > > I'm not sure how this can be fully shared between detach and
> replace.
> > > > > > Here some work e.g. above needs to be done before calling
> > > > > > iommu_group_replace_domain() while others can be done
> afterwards.
> > > > >
> > > > > This iopt_table_remove_domain/list_del is supposed to be done in
> > > > > the hwpt's destroy() actually. We couldn't move it because it'd
> > > > > need the new domain_alloc_user op and its implementation in ARM
> > > > > driver. Overall, I think it should be safe to put it behind the
> > > > > iommu_group_replace_domain().
> > > > >
> > > >
> > > > My confusion is that we have different flows between detach/attach
> > > > and replace.
> > > >
> > > > today with separate detach+attach we have following flow:
> > > >
> > > >         Remove device from current hwpt;
> > > >         if (last_device in hwpt) {
> > > >                 Remove hwpt domain from current iopt;
> > > >                 if (last_device in group)
> > > >                         detach group from hwpt domain;
> > > >         }
> > > >
> > > >         if (first device in group) {
> > > >                 attach group to new hwpt domain;
> > > >                 if (first_device in hwpt)
> > > >                         Add hwpt domain to new iopt;
> > > >         Add device to new hwpt;
> > > >
> > > > but replace flow is different on the detach part:
> > > >
> > > >         if (first device in group) {
> > > >                 replace group's domain from current hwpt to new hwpt;
> > > >                 if (first_device in hwpt)
> > > >                         Add hwpt domain to new iopt;
> > > >         }
> > > >
> > > >         Remove device from old hwpt;
> > > >         if (last_device in old hwpt)
> > > >                 Remove hwpt domain from old iopt;
> > > >
> > > >         Add device to new hwpt;
> > >
> > > Oh... thinking it carefully, I see the flow does look a bit off.
> > > Perhaps it's better to have a similar flow for replace.
> > >
> > > However, I think something would be still different due to its
> > > tricky nature, especially for a multi-device iommu_group.
> > >
> > > An iommu_group_detach happens only when a device is the last one
> > > in its group to go through the routine via a DETACH ioctl, while
> > > an iommu_group_replace_domain() happens only when the device is
> > > the first one in its group to go through the routine via another
> > > ATTACH ioctl. However, when the first device does a replace, the
> > > cleanup routine of the old hwpt is a NOP, since there are still
> > > other devices (same group) in the old hwpt. And two implications
> > > here:
> > > 1) Any other device in the same group has to forcibly switch to
> > >    the new domain, when the first device does a replace.
> > > 2) The actual hwpt cleanup can only happen at the last device's
> > >    replace call.
> > >
> > > This also means that kernel has to rely on the integrity of the
> > > user space that it must replace all active devices in the group:
> > >
> >
> > Jason suggested to move hwpt cleanup out of the detach path
> > in his reply to patch7. Presumably with that fix the major tricky
> > point about hwpt in following scenarios would disappear. Let's
> > see how it will work out then. 
Jason Gunthorpe Feb. 13, 2023, 2:49 p.m. UTC | #14
On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:

> What about point 1? If dev2 and dev3 are already replaced when
> doing iommu_group_replace_domain() on dev1, their idev objects
> still have the old hwpt/iopt until user space does another two
> IOCTLs on them, right?

We have a complicated model for multi-device groups...

The first device in the group to change domains must move all the
devices in the group

But userspace is still expected to run through and change all the
other devices

So replace should be a NOP if the group is already linked to the right
domain.

This patch needs to make sure that inconsistency in the view between the
iommu_group and the iommufd model doesn't cause a functional
problem.

Jason
Nicolin Chen Feb. 14, 2023, 10:54 a.m. UTC | #15
On Mon, Feb 13, 2023 at 10:49:34AM -0400, Jason Gunthorpe wrote:
> On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:
> 
> > What about point 1? If dev2 and dev3 are already replaced when
> > doing iommu_group_replace_domain() on dev1, their idev objects
> > still have the old hwpt/iopt until user space does another two
> > IOCTLs on them, right?
> 
> We have a complicated model for multi-device groups...
> 
> The first device in the group to change domains must move all the
> devices in the group
> 
> But userspace is still expected to run through and change all the
> other devices
> 
> So replace should be a NOP if the group is already linked to the right
> domain.
> 
> This patch needs to make sure that incosistency in the view betwen the
> iommu_group and the iommufd model doesn't cause a functional
> problem.

Yea, I was thinking that we'd need to block any access to a
pending device's idev->hwpt, before the kernel finishes
the "NOP" IOCTL from userspace, maybe with a helper:
	(iommu_get_domain_for_dev(idev->dev) != idev->hwpt->domain)

Thanks
Nic
Nicolin Chen Feb. 14, 2023, 10:59 a.m. UTC | #16
On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:

> My confusion is that we have different flows between detach/attach
> and replace.
> 
> today with separate detach+attach we have following flow:
> 
>         Remove device from current hwpt;
>         if (last_device in hwpt) {
>                 Remove hwpt domain from current iopt;
>                 if (last_device in group)
>                         detach group from hwpt domain;
>         }
> 
>         if (first device in group) {
>                 attach group to new hwpt domain;
>                 if (first_device in hwpt)
>                         Add hwpt domain to new iopt;
>         Add device to new hwpt;
> 
> but replace flow is different on the detach part:
> 
>         if (first device in group) {
>                 replace group's domain from current hwpt to new hwpt;
>                 if (first_device in hwpt)
>                         Add hwpt domain to new iopt;
>         }
> 
>         Remove device from old hwpt;
>         if (last_device in old hwpt)
>                 Remove hwpt domain from old iopt;
> 
>         Add device to new hwpt;
> 
> I'm yet to figure out whether we have sufficient lock protection to
> prevent other paths from using old iopt/hwpt to find the device
> which is already attached to a different domain.

With Jason's new series, the detach() routine is lighter now.

I wonder if it'd be safer now to do the detach() call after
iommu_group_replace_domain()?

Thanks
Nic

@@ -196,10 +198,41 @@ static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
 	return false;
 }
 
+/**
+ * __iommufd_device_detach - Detach a device from idev->hwpt
+ * @idev: device to detach
+ * @detach_group: flag to call iommu_detach_group
+ *
+ * This is a cleanup helper shared by the replace and detach routines. Comparing
+ * to a detach routine, a replace call does not need the iommu_detach_group().
+ */
+static void __iommufd_device_detach(struct iommufd_device *idev,
+				     bool detach_group)
+{
+	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
+
+	mutex_lock(&hwpt->devices_lock);
+	list_del(&idev->devices_item);
+	if (detach_group && !iommufd_hw_pagetable_has_group(hwpt, idev->group))
+		iommu_detach_group(hwpt->domain, idev->group);
+	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+	mutex_unlock(&hwpt->devices_lock);
+
+	if (hwpt->auto_domain)
+		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
+	else
+		refcount_dec(&hwpt->obj.users);
+
+	idev->hwpt = NULL;
+
+	refcount_dec(&idev->obj.users);
+}
+
 /* On success this consumes a hwpt reference from the caller */
 static int iommufd_device_do_attach(struct iommufd_device *idev,
 				    struct iommufd_hw_pagetable *hwpt)
 {
+	struct iommufd_hw_pagetable *cur_hwpt = idev->hwpt;
 	phys_addr_t sw_msi_start = PHYS_ADDR_MAX;
 	int rc;
 
@@ -237,7 +270,7 @@ static int iommufd_device_do_attach(struct iommufd_device *idev,
 	 * the group once for the first device that is in the group.
 	 */
 	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
-		rc = iommu_attach_group(hwpt->domain, idev->group);
+		rc = iommu_group_replace_domain(idev->group, hwpt->domain);
 		if (rc)
 			goto out_iova;
 
@@ -249,6 +282,10 @@ static int iommufd_device_do_attach(struct iommufd_device *idev,
 		}
 	}
 
+	/* Detach from the cur_hwpt without iommu_detach_group() */
+	if (cur_hwpt)
+		__iommufd_device_detach(idev, false);
+
 	idev->hwpt = hwpt;
 	/* The HWPT reference from the caller is moved to this list */
 	list_add(&idev->devices_item, &hwpt->devices);
Tian, Kevin Feb. 15, 2023, 1:37 a.m. UTC | #17
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, February 14, 2023 6:54 PM
> 
> On Mon, Feb 13, 2023 at 10:49:34AM -0400, Jason Gunthorpe wrote:
> > On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:
> >
> > > What about point 1? If dev2 and dev3 are already replaced when
> > > doing iommu_group_replace_domain() on dev1, their idev objects
> > > still have the old hwpt/iopt until user space does another two
> > > IOCTLs on them, right?
> >
> > We have a complicated model for multi-device groups...
> >
> > The first device in the group to change domains must move all the
> > devices in the group
> >
> > But userspace is still expected to run through and change all the
> > other devices
> >
> > So replace should be a NOP if the group is already linked to the right
> > domain.
> >
> > This patch needs to make sure that incosistency in the view betwen the
> > iommu_group and the iommufd model doesn't cause a functional
> > problem.
> 
> Yea, I was thinking that we'd need to block any access to the
> idev->hwpt of a pending device's, before the kernel finishes
> the "NOP" IOCTL from userspace, maybe with a helper:
> 	(iommu_get_domain_for_dev(idev->dev) != idev->hwpt->domain)
> 

I didn't see what would be broken w/o such a blocking measure.

Can you elaborate?
Tian, Kevin Feb. 15, 2023, 1:38 a.m. UTC | #18
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, February 14, 2023 7:00 PM
> 
> On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:
> 
> > My confusion is that we have different flows between detach/attach
> > and replace.
> >
> > today with separate detach+attach we have following flow:
> >
> >         Remove device from current hwpt;
> >         if (last_device in hwpt) {
> >                 Remove hwpt domain from current iopt;
> >                 if (last_device in group)
> >                         detach group from hwpt domain;
> >         }
> >
> >         if (first device in group) {
> >                 attach group to new hwpt domain;
> >                 if (first_device in hwpt)
> >                         Add hwpt domain to new iopt;
> >         Add device to new hwpt;
> >
> > but replace flow is different on the detach part:
> >
> >         if (first device in group) {
> >                 replace group's domain from current hwpt to new hwpt;
> >                 if (first_device in hwpt)
> >                         Add hwpt domain to new iopt;
> >         }
> >
> >         Remove device from old hwpt;
> >         if (last_device in old hwpt)
> >                 Remove hwpt domain from old iopt;
> >
> >         Add device to new hwpt;
> >
> > I'm yet to figure out whether we have sufficient lock protection to
> > prevent other paths from using old iopt/hwpt to find the device
> > which is already attached to a different domain.
> 
> With Jason's new series, the detach() routine is lighter now.
> 
> I wonder if it'd be safer now to do the detach() call after
> iommu_group_replace_domain()?
> 

yes, looks so.
Nicolin Chen Feb. 15, 2023, 1:58 a.m. UTC | #19
On Wed, Feb 15, 2023 at 01:37:19AM +0000, Tian, Kevin wrote:
 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Tuesday, February 14, 2023 6:54 PM
> >
> > On Mon, Feb 13, 2023 at 10:49:34AM -0400, Jason Gunthorpe wrote:
> > > On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:
> > >
> > > > What about point 1? If dev2 and dev3 are already replaced when
> > > > doing iommu_group_replace_domain() on dev1, their idev objects
> > > > still have the old hwpt/iopt until user space does another two
> > > > IOCTLs on them, right?
> > >
> > > We have a complicated model for multi-device groups...
> > >
> > > The first device in the group to change domains must move all the
> > > devices in the group
> > >
> > > But userspace is still expected to run through and change all the
> > > other devices
> > >
> > > So replace should be a NOP if the group is already linked to the right
> > > domain.
> > >
> > > This patch needs to make sure that incosistency in the view betwen the
> > > iommu_group and the iommufd model doesn't cause a functional
> > > problem.
> >
> > Yea, I was thinking that we'd need to block any access to the
> > idev->hwpt of a pending device's, before the kernel finishes
> > the "NOP" IOCTL from userspace, maybe with a helper:
> >       (iommu_get_domain_for_dev(idev->dev) != idev->hwpt->domain)
> >
> 
> I didn't see what would be broken w/o such blocking measure.
> 
> Can you elaborate?

iommu_group { idev1, idev2 }

(1) Attach all devices to domain1 {
	group->domain = domain1;
	idev1->hwpt = hwpt1; // domain1
	idev2->hwpt = hwpt1; // domain1
}

(2) Attach (replace) idev1 only to domain2 {
	group->domain = domain2
	idev1->hwpt = hwpt2; // domain2
	idev2->hwpt == domain1 // != iommu_get_domain_for_dev()
}

Then user space, unaware of this, may continue to do
IOMMU_IOAS_MAP for idev2. Though IOVA mappings may be added
onto domain2 correctly, domain2's iopt itree won't
reflect that until idev2->hwpt is updated too, right?

And the tricky thing is that, though we advocate a device-
centric uAPI, we'd still have to ask user space to align the
devices in the same iommu_group, which is also not present
in QEMU's RFC v3 series.

The traditional detach+attach flow doesn't seem to have this
issue, since there's no re-entry so the work flow is always
that detaching all devices first before attaching to another
domain.

Thanks
Nic
Tian, Kevin Feb. 15, 2023, 2:15 a.m. UTC | #20
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, February 15, 2023 9:59 AM
> 
> On Wed, Feb 15, 2023 at 01:37:19AM +0000, Tian, Kevin wrote:
> 
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Tuesday, February 14, 2023 6:54 PM
> > >
> > > On Mon, Feb 13, 2023 at 10:49:34AM -0400, Jason Gunthorpe wrote:
> > > > On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:
> > > >
> > > > > What about point 1? If dev2 and dev3 are already replaced when
> > > > > doing iommu_group_replace_domain() on dev1, their idev objects
> > > > > still have the old hwpt/iopt until user space does another two
> > > > > IOCTLs on them, right?
> > > >
> > > > We have a complicated model for multi-device groups...
> > > >
> > > > The first device in the group to change domains must move all the
> > > > devices in the group
> > > >
> > > > But userspace is still expected to run through and change all the
> > > > other devices
> > > >
> > > > So replace should be a NOP if the group is already linked to the right
> > > > domain.
> > > >
> > > > This patch needs to make sure that inconsistency in the view between the
> > > > iommu_group and the iommufd model doesn't cause a functional
> > > > problem.
> > >
> > > Yea, I was thinking that we'd need to block any access to the
> > > idev->hwpt of a pending device's, before the kernel finishes
> > > the "NOP" IOCTL from userspace, maybe with a helper:
> > >       (iommu_get_domain_for_dev(idev->dev) != idev->hwpt->domain)
> > >
> >
> > I didn't see what would be broken w/o such blocking measure.
> >
> > Can you elaborate?
> 
> iommu_group { idev1, idev2 }
> 
> (1) Attach all devices to domain1 {
> 	group->domain = domain1;
> 	idev1->hwpt = hwpt1; // domain1
> 	idev2->hwpt = hwpt1; // domain1
> }
> 
> (2) Attach (replace) idev1 only to domain2 {
> 	group->domain = domain2
> 	idev1->hwpt = hwpt2; // domain2
> 	idev2->hwpt == domain1 // != iommu_get_domain_for_dev()
> }
> 
> Then if user space isn't aware of these and continues to do
> IOMMU_IOAS_MAP for idev2. Though IOVA mappings may be added
> onto the domain2 correctly, yet domain2's iopt itree won't
> reflect that, until idev2->hwpt is updated too, right?

IOMMU_IOAS_MAP is for ioas instead of for device.

even w/o any device attached we still allow ioas map.

also note in case of domain1 still actively attached to idev3 in
another group, banning IOAS_MAP due to idev2 in transition
is also incorrect for idev3.

> 
> And the tricky thing is that, though we advocate a device-
> centric uAPI, we'd still have to ask user space to align the
> devices in the same iommu_group, which is also not present
> in QEMU's RFC v3 series.

IMHO this requirement has been stated clearly from the start
when designing this device centric interface. QEMU should be
improved to take care of it.
Nicolin Chen Feb. 15, 2023, 7:15 a.m. UTC | #21
On Wed, Feb 15, 2023 at 02:15:59AM +0000, Tian, Kevin wrote:
> External email: Use caution opening links or attachments
> 
> 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, February 15, 2023 9:59 AM
> >
> > On Wed, Feb 15, 2023 at 01:37:19AM +0000, Tian, Kevin wrote:
> >
> > > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > > Sent: Tuesday, February 14, 2023 6:54 PM
> > > >
> > > > On Mon, Feb 13, 2023 at 10:49:34AM -0400, Jason Gunthorpe wrote:
> > > > > On Sun, Feb 12, 2023 at 11:48:30PM -0800, Nicolin Chen wrote:
> > > > >
> > > > > > What about point 1? If dev2 and dev3 are already replaced when
> > > > > > doing iommu_group_replace_domain() on dev1, their idev objects
> > > > > > still have the old hwpt/iopt until user space does another two
> > > > > > IOCTLs on them, right?
> > > > >
> > > > > We have a complicated model for multi-device groups...
> > > > >
> > > > > The first device in the group to change domains must move all the
> > > > > devices in the group
> > > > >
> > > > > But userspace is still expected to run through and change all the
> > > > > other devices
> > > > >
> > > > > So replace should be a NOP if the group is already linked to the right
> > > > > domain.
> > > > >
> > > > > This patch needs to make sure that inconsistency in the view between the
> > > > > iommu_group and the iommufd model doesn't cause a functional
> > > > > problem.
> > > >
> > > > Yea, I was thinking that we'd need to block any access to the
> > > > idev->hwpt of a pending device's, before the kernel finishes
> > > > the "NOP" IOCTL from userspace, maybe with a helper:
> > > >       (iommu_get_domain_for_dev(idev->dev) != idev->hwpt->domain)
> > > >
> > >
> > > I didn't see what would be broken w/o such blocking measure.
> > >
> > > Can you elaborate?
> >
> > iommu_group { idev1, idev2 }
> >
> > (1) Attach all devices to domain1 {
> >       group->domain = domain1;
> >       idev1->hwpt = hwpt1; // domain1
> >       idev2->hwpt = hwpt1; // domain1
> > }
> >
> > (2) Attach (replace) idev1 only to domain2 {
> >       group->domain = domain2
> >       idev1->hwpt = hwpt2; // domain2
> >       idev2->hwpt == domain1 // != iommu_get_domain_for_dev()
> > }
> >
> > Then if user space isn't aware of these and continues to do
> > IOMMU_IOAS_MAP for idev2. Though IOVA mappings may be added
> > onto the domain2 correctly, yet domain2's iopt itree won't
> > reflect that, until idev2->hwpt is updated too, right?
> 
> IOMMU_IOAS_MAP is for ioas instead of for device.
> 
> even w/o any device attached we still allow ioas map.
> 
> also note in case of domain1 still actively attached to idev3 in
> another group, banning IOAS_MAP due to idev2 in transition
> is also incorrect for idev3.

OK. That's likely true. It doesn't seem to be correct to block an
IOMMU_IOAS_MAP.

But things will be out of control, if user space continues mapping
something onto domain1's iopt for idev2, even after it is attached
covertly to domain2's iopt by the replace routine. I wonder how
kernel should handle this and keep the consistency between IOMMUFD
objects and iommu_group.

> > And the tricky thing is that, though we advocate a device-
> > centric uAPI, we'd still have to ask user space to align the
> > devices in the same iommu_group, which is also not present
> > in QEMU's RFC v3 series.
> 
> IMHO this requirement has been stated clearly from the start
> when designing this device centric interface. QEMU should be
> improved to take care of it.

OK. It has to be so...

Thanks
Nic
Nicolin Chen Feb. 15, 2023, 7:16 a.m. UTC | #22
On Wed, Feb 15, 2023 at 01:38:32AM +0000, Tian, Kevin wrote:
> External email: Use caution opening links or attachments
> 
> 
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Tuesday, February 14, 2023 7:00 PM
> >
> > On Fri, Feb 10, 2023 at 02:11:23AM +0000, Tian, Kevin wrote:
> >
> > > My confusion is that we have different flows between detach/attach
> > > and replace.
> > >
> > > today with separate detach+attach we have following flow:
> > >
> > >         Remove device from current hwpt;
> > >         if (last_device in hwpt) {
> > >                 Remove hwpt domain from current iopt;
> > >                 if (last_device in group)
> > >                         detach group from hwpt domain;
> > >         }
> > >
> > >         if (first device in group) {
> > >                 attach group to new hwpt domain;
> > >                 if (first_device in hwpt)
> > >                         Add hwpt domain to new iopt;
> > >         Add device to new hwpt;
> > >
> > > but replace flow is different on the detach part:
> > >
> > >         if (first device in group) {
> > >                 replace group's domain from current hwpt to new hwpt;
> > >                 if (first_device in hwpt)
> > >                         Add hwpt domain to new iopt;
> > >         }
> > >
> > >         Remove device from old hwpt;
> > >         if (last_device in old hwpt)
> > >                 Remove hwpt domain from old iopt;
> > >
> > >         Add device to new hwpt;
> > >
> > > I'm yet to figure out whether we have sufficient lock protection to
> > > prevent other paths from using old iopt/hwpt to find the device
> > > which is already attached to a different domain.
> >
> > With Jason's new series, the detach() routine is lighter now.
> >
> > I wonder if it'd be safer now to do the detach() call after
> > iommu_group_replace_domain()?
> >
> 
> yes, looks so.

Will rebase a v3 once Jason updates his series. Thanks!
Tian, Kevin Feb. 15, 2023, 7:24 a.m. UTC | #23
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, February 15, 2023 3:15 PM
> 
> But things will be out of control, if user space continues mapping
> something onto domain1's iopt for idev2, even after it is attached
> covertly to domain2's iopt by the replace routine. I wonder how
> kernel should handle this and keep the consistency between IOMMUFD
> objects and iommu_group.
> 

this is where I don't understand.

domain mapping just reflects what an address space has.

Take Qemu for example. w/o vIOMMU domain mappings is added/
removed according to the change in the GPA address space.

w/ vIOMMU then it is synced with guest page table.

why would userspace construct mappings for a specific device?

when device is moved from one domain to another domain, it just
bears with whatever the new domain allows it to access.
Jason Gunthorpe Feb. 15, 2023, 12:51 p.m. UTC | #24
On Tue, Feb 14, 2023 at 11:15:09PM -0800, Nicolin Chen wrote:

> But things will be out of control, if user space continues mapping
> something onto domain1's iopt for idev2, even after it is attached
> covertly to domain2's iopt by the replace routine. I wonder how
> kernel should handle this and keep the consistency between IOMMUFD
> objects and iommu_group.

I've been looking at this, the reason the locking is such a PITA is
because we are still trying to use the device list short cut.

We need to have a iommu group object instead then everything will work
smoothly.

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index b8c3e3baccb5..8a9834fc129a 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -9,6 +9,8 @@ 
 #include "io_pagetable.h"
 #include "iommufd_private.h"
 
+MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+
 static bool allow_unsafe_interrupts;
 module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(
@@ -194,9 +196,61 @@  static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
 	return false;
 }
 
+/**
+ * __iommmufd_device_detach - Detach a device from idev->hwpt to new_hwpt
+ * @idev: device to detach
+ * @new_hwpt: new hw_pagetable to attach (pass in NULL for a simple detach)
+ * @detach_group: flag to call iommu_detach_group
+ *
+ * This is a cleanup helper shared by the replace and detach routines. Comparing
+ * to a detach routine, a replace routine only needs a partial detach procedure:
+ * it does not need the iommu_detach_group(); it will attach the device to a new
+ * hw_pagetable after a partial detach from the currently attached hw_pagetable,
+ * so certain steps can be skipped if two hw_pagetables have the same IOAS.
+ */
+static void __iommmufd_device_detach(struct iommufd_device *idev,
+				     struct iommufd_hw_pagetable *new_hwpt,
+				     bool detach_group)
+{
+	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
+	struct iommufd_ioas *new_ioas = NULL;
+
+	if (new_hwpt)
+		new_ioas = new_hwpt->ioas;
+
+	mutex_lock(&hwpt->devices_lock);
+	list_del(&idev->devices_item);
+	if (hwpt->ioas != new_ioas)
+		mutex_lock(&hwpt->ioas->mutex);
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+		if (list_empty(&hwpt->devices)) {
+			iopt_table_remove_domain(&hwpt->ioas->iopt,
+						 hwpt->domain);
+			list_del(&hwpt->hwpt_item);
+		}
+		if (detach_group)
+			iommu_detach_group(hwpt->domain, idev->group);
+	}
+	if (hwpt->ioas != new_ioas) {
+		iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+		mutex_unlock(&hwpt->ioas->mutex);
+	}
+	mutex_unlock(&hwpt->devices_lock);
+
+	if (hwpt->auto_domain)
+		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
+	else
+		refcount_dec(&hwpt->obj.users);
+
+	idev->hwpt = NULL;
+
+	refcount_dec(&idev->obj.users);
+}
+
 static int iommufd_device_do_attach(struct iommufd_device *idev,
 				    struct iommufd_hw_pagetable *hwpt)
 {
+	struct iommufd_hw_pagetable *cur_hwpt = idev->hwpt;
 	phys_addr_t sw_msi_start = PHYS_ADDR_MAX;
 	int rc;
 
@@ -236,7 +290,7 @@  static int iommufd_device_do_attach(struct iommufd_device *idev,
 	 * the group once for the first device that is in the group.
 	 */
 	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
-		rc = iommu_attach_group(hwpt->domain, idev->group);
+		rc = iommu_group_replace_domain(idev->group, hwpt->domain);
 		if (rc)
 			goto out_iova;
 
@@ -249,6 +303,10 @@  static int iommufd_device_do_attach(struct iommufd_device *idev,
 		}
 	}
 
+	/* Replace the cur_hwpt without iommu_detach_group() */
+	if (cur_hwpt)
+		__iommmufd_device_detach(idev, hwpt, false);
+
 	idev->hwpt = hwpt;
 	refcount_inc(&hwpt->obj.users);
 	list_add(&idev->devices_item, &hwpt->devices);
@@ -256,7 +314,10 @@  static int iommufd_device_do_attach(struct iommufd_device *idev,
 	return 0;
 
 out_detach:
-	iommu_detach_group(hwpt->domain, idev->group);
+	if (cur_hwpt)
+		iommu_group_replace_domain(idev->group, cur_hwpt->domain);
+	else
+		iommu_detach_group(hwpt->domain, idev->group);
 out_iova:
 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
 out_unlock:
@@ -345,6 +406,13 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 		struct iommufd_hw_pagetable *hwpt =
 			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
 
+		if (idev->hwpt == hwpt)
+			goto out_done;
+		if (idev->hwpt && idev->hwpt->auto_domain) {
+			rc = -EBUSY;
+			goto out_put_pt_obj;
+		}
+
 		mutex_lock(&hwpt->ioas->mutex);
 		rc = iommufd_device_do_attach(idev, hwpt);
 		mutex_unlock(&hwpt->ioas->mutex);
@@ -356,6 +424,8 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 		struct iommufd_ioas *ioas =
 			container_of(pt_obj, struct iommufd_ioas, obj);
 
+		if (idev->hwpt)
+			return -EBUSY;
 		rc = iommufd_device_auto_get_domain(idev, ioas);
 		if (rc)
 			goto out_put_pt_obj;
@@ -367,6 +437,7 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 	}
 
 	refcount_inc(&idev->obj.users);
+out_done:
 	*pt_id = idev->hwpt->obj.id;
 	rc = 0;
 
@@ -385,31 +456,7 @@  EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
  */
 void iommufd_device_detach(struct iommufd_device *idev)
 {
-	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
-
-	mutex_lock(&hwpt->ioas->mutex);
-	mutex_lock(&hwpt->devices_lock);
-	list_del(&idev->devices_item);
-	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
-		if (list_empty(&hwpt->devices)) {
-			iopt_table_remove_domain(&hwpt->ioas->iopt,
-						 hwpt->domain);
-			list_del(&hwpt->hwpt_item);
-		}
-		iommu_detach_group(hwpt->domain, idev->group);
-	}
-	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
-	mutex_unlock(&hwpt->devices_lock);
-	mutex_unlock(&hwpt->ioas->mutex);
-
-	if (hwpt->auto_domain)
-		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
-	else
-		refcount_dec(&hwpt->obj.users);
-
-	idev->hwpt = NULL;
-
-	refcount_dec(&idev->obj.users);
+	__iommmufd_device_detach(idev, NULL, true);
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 593138bb37b8..200c783800ad 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -9,6 +9,8 @@ 
 #include <linux/refcount.h>
 #include <linux/uaccess.h>
 
+#include "../iommu-priv.h"
+
 struct iommu_domain;
 struct iommu_group;
 struct iommu_option;