diff mbox series

[v2,02/12] iommu: Introduce a replace API for device pasid

Message ID 20240412081516.31168-3-yi.l.liu@intel.com (mailing list archive)
State New
Headers show
Series iommufd support pasid attach/replace | expand

Commit Message

Yi Liu April 12, 2024, 8:15 a.m. UTC
Provide a high-level API to allow replacements of one domain with
another for specific pasid of a device. This is similar to
iommu_group_replace_domain() and it is expected to be used only by
IOMMUFD.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/iommu/iommu-priv.h |  3 ++
 drivers/iommu/iommu.c      | 92 +++++++++++++++++++++++++++++++++++---
 2 files changed, 89 insertions(+), 6 deletions(-)

Comments

Duan, Zhenzhong April 16, 2024, 3:01 a.m. UTC | #1
>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: [PATCH v2 02/12] iommu: Introduce a replace API for device pasid
>
>Provide a high-level API to allow replacements of one domain with
>another for specific pasid of a device. This is similar to
>iommu_group_replace_domain() and it is expected to be used only by
>IOMMUFD.
>
>Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>---
> drivers/iommu/iommu-priv.h |  3 ++
> drivers/iommu/iommu.c      | 92
>+++++++++++++++++++++++++++++++++++---
> 2 files changed, 89 insertions(+), 6 deletions(-)
>
>diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
>index 5f731d994803..0949c02cee93 100644
>--- a/drivers/iommu/iommu-priv.h
>+++ b/drivers/iommu/iommu-priv.h
>@@ -20,6 +20,9 @@ static inline const struct iommu_ops
>*dev_iommu_ops(struct device *dev)
> int iommu_group_replace_domain(struct iommu_group *group,
> 			       struct iommu_domain *new_domain);
>
>+int iommu_replace_device_pasid(struct iommu_domain *domain,
>+			       struct device *dev, ioasid_t pasid);
>+
> int iommu_device_register_bus(struct iommu_device *iommu,
> 			      const struct iommu_ops *ops,
> 			      const struct bus_type *bus,
>diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>index 701b02a118db..343683e646e0 100644
>--- a/drivers/iommu/iommu.c
>+++ b/drivers/iommu/iommu.c
>@@ -3315,14 +3315,15 @@ bool
>iommu_group_dma_owner_claimed(struct iommu_group *group)
> EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
>
> static int __iommu_set_group_pasid(struct iommu_domain *domain,
>-				   struct iommu_group *group, ioasid_t pasid)
>+				   struct iommu_group *group, ioasid_t pasid,
>+				   struct iommu_domain *old)
> {
> 	struct group_device *device, *last_gdev;
> 	int ret;
>
> 	for_each_group_device(group, device) {
> 		ret = domain->ops->set_dev_pasid(domain, device->dev,
>-						 pasid, NULL);
>+						 pasid, old);
> 		if (ret)
> 			goto err_revert;
> 	}
>@@ -3332,11 +3333,34 @@ static int __iommu_set_group_pasid(struct
>iommu_domain *domain,
> err_revert:
> 	last_gdev = device;
> 	for_each_group_device(group, device) {
>-		const struct iommu_ops *ops = dev_iommu_ops(device-
>>dev);
>+		/*
>+		 * If no old domain, just undo all the devices/pasid that
>+		 * have attached to the new domain.
>+		 */
>+		if (!old) {
>+			const struct iommu_ops *ops =
>+						dev_iommu_ops(device->dev);
>+
>+			if (device == last_gdev)

Maybe this check can be moved to beginning of the for loop,

>+				break;
>+			ops = dev_iommu_ops(device->dev);
>+			ops->remove_dev_pasid(device->dev, pasid, domain);
>+			continue;
>+		}
>
>-		if (device == last_gdev)
>+		/*
>+		 * Rollback the devices/pasid that have attached to the new
>+		 * domain. And it is a driver bug to fail attaching with a
>+		 * previously good domain.
>+		 */
>+		if (device == last_gdev) {

then this check can be removed.

>+			WARN_ON(old->ops->set_dev_pasid(old, device-
>>dev,
>+							pasid, NULL));

Is this call necessary? last_gdev is the first device failed.

Thanks
Zhenzhong

> 			break;
>-		ops->remove_dev_pasid(device->dev, pasid, domain);
>+		}
>+
>+		WARN_ON(old->ops->set_dev_pasid(old, device->dev,
>+						pasid, domain));
> 	}
> 	return ret;
> }
>@@ -3395,7 +3419,7 @@ int iommu_attach_device_pasid(struct
>iommu_domain *domain,
> 		goto out_unlock;
> 	}
>
>-	ret = __iommu_set_group_pasid(domain, group, pasid);
>+	ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
> 	if (ret)
> 		xa_erase(&group->pasid_array, pasid);
> out_unlock:
>@@ -3404,6 +3428,62 @@ int iommu_attach_device_pasid(struct
>iommu_domain *domain,
> }
> EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
>
>+/**
>+ * iommu_replace_device_pasid - replace the domain that a pasid is
>attached to
>+ * @domain: new IOMMU domain to replace with
>+ * @dev: the physical device
>+ * @pasid: pasid that will be attached to the new domain
>+ *
>+ * This API allows the pasid to switch domains. Return 0 on success, or an
>+ * error. The pasid will roll back to use the old domain if failure. The
>+ * caller could call iommu_detach_device_pasid() before free the old
>domain
>+ * in order to avoid use-after-free case.
>+ */
>+int iommu_replace_device_pasid(struct iommu_domain *domain,
>+			       struct device *dev, ioasid_t pasid)
>+{
>+	/* Caller must be a probed driver on dev */
>+	struct iommu_group *group = dev->iommu_group;
>+	void *curr;
>+	int ret;
>+
>+	if (!domain)
>+		return -EINVAL;
>+
>+	if (!domain->ops->set_dev_pasid)
>+		return -EOPNOTSUPP;
>+
>+	if (!group)
>+		return -ENODEV;
>+
>+	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain-
>>owner)
>+		return -EINVAL;
>+
>+	mutex_lock(&group->mutex);
>+	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
>+	if (!curr) {
>+		xa_erase(&group->pasid_array, pasid);
>+		ret = -EINVAL;
>+		goto out_unlock;
>+	}
>+
>+	ret = xa_err(curr);
>+	if (ret)
>+		goto out_unlock;
>+
>+	if (curr == domain)
>+		goto out_unlock;
>+
>+	ret = __iommu_set_group_pasid(domain, group, pasid, curr);
>+	if (ret)
>+		WARN_ON(xa_err(xa_store(&group->pasid_array, pasid,
>+					curr, GFP_KERNEL)));
>+out_unlock:
>+	mutex_unlock(&group->mutex);
>+	return ret;
>+}
>+EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid,
>IOMMUFD_INTERNAL);
>+
> /*
>  * iommu_detach_device_pasid() - Detach the domain from pasid of device
>  * @domain: the iommu domain.
>--
>2.34.1
Yi Liu April 16, 2024, 9:18 a.m. UTC | #2
On 2024/4/16 11:01, Duan, Zhenzhong wrote:
> 
> 
>> -----Original Message-----
>> From: Liu, Yi L <yi.l.liu@intel.com>
>> Subject: [PATCH v2 02/12] iommu: Introduce a replace API for device pasid
>>
>> Provide a high-level API to allow replacements of one domain with
>> another for specific pasid of a device. This is similar to
>> iommu_group_replace_domain() and it is expected to be used only by
>> IOMMUFD.
>>
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>> drivers/iommu/iommu-priv.h |  3 ++
>> drivers/iommu/iommu.c      | 92
>> +++++++++++++++++++++++++++++++++++---
>> 2 files changed, 89 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
>> index 5f731d994803..0949c02cee93 100644
>> --- a/drivers/iommu/iommu-priv.h
>> +++ b/drivers/iommu/iommu-priv.h
>> @@ -20,6 +20,9 @@ static inline const struct iommu_ops
>> *dev_iommu_ops(struct device *dev)
>> int iommu_group_replace_domain(struct iommu_group *group,
>> 			       struct iommu_domain *new_domain);
>>
>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>> +			       struct device *dev, ioasid_t pasid);
>> +
>> int iommu_device_register_bus(struct iommu_device *iommu,
>> 			      const struct iommu_ops *ops,
>> 			      const struct bus_type *bus,
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 701b02a118db..343683e646e0 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -3315,14 +3315,15 @@ bool
>> iommu_group_dma_owner_claimed(struct iommu_group *group)
>> EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
>>
>> static int __iommu_set_group_pasid(struct iommu_domain *domain,
>> -				   struct iommu_group *group, ioasid_t pasid)
>> +				   struct iommu_group *group, ioasid_t pasid,
>> +				   struct iommu_domain *old)
>> {
>> 	struct group_device *device, *last_gdev;
>> 	int ret;
>>
>> 	for_each_group_device(group, device) {
>> 		ret = domain->ops->set_dev_pasid(domain, device->dev,
>> -						 pasid, NULL);
>> +						 pasid, old);
>> 		if (ret)
>> 			goto err_revert;
>> 	}
>> @@ -3332,11 +3333,34 @@ static int __iommu_set_group_pasid(struct
>> iommu_domain *domain,
>> err_revert:
>> 	last_gdev = device;
>> 	for_each_group_device(group, device) {
>> -		const struct iommu_ops *ops = dev_iommu_ops(device-
>>> dev);
>> +		/*
>> +		 * If no old domain, just undo all the devices/pasid that
>> +		 * have attached to the new domain.
>> +		 */
>> +		if (!old) {
>> +			const struct iommu_ops *ops =
>> +						dev_iommu_ops(device->dev);
>> +
>> +			if (device == last_gdev)
> 
> Maybe this check can be moved to beginning of the for loop,
> 
>> +				break;
>> +			ops = dev_iommu_ops(device->dev);
>> +			ops->remove_dev_pasid(device->dev, pasid, domain);
>> +			continue;
>> +		}
>>
>> -		if (device == last_gdev)
>> +		/*
>> +		 * Rollback the devices/pasid that have attached to the new
>> +		 * domain. And it is a driver bug to fail attaching with a
>> +		 * previously good domain.
>> +		 */
>> +		if (device == last_gdev) {
> 
> then this check can be removed.
> 
>> +			WARN_ON(old->ops->set_dev_pasid(old, device-
>>> dev,
>> +							pasid, NULL));
> 
> Is this call necessary? last_gdev is the first device failed.


It is since we claim replacement failure would leave the pasid to be
attached with the prior attached domain. That's why old is checked
in the above. If no old domain, it would exit the loop if it is the
first failed device. Nothing need to be done for the failed device.

> Thanks
> Zhenzhong
> 
>> 			break;
>> -		ops->remove_dev_pasid(device->dev, pasid, domain);
>> +		}
>> +
>> +		WARN_ON(old->ops->set_dev_pasid(old, device->dev,
>> +						pasid, domain));
>> 	}
>> 	return ret;
>> }
>> @@ -3395,7 +3419,7 @@ int iommu_attach_device_pasid(struct
>> iommu_domain *domain,
>> 		goto out_unlock;
>> 	}
>>
>> -	ret = __iommu_set_group_pasid(domain, group, pasid);
>> +	ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
>> 	if (ret)
>> 		xa_erase(&group->pasid_array, pasid);
>> out_unlock:
>> @@ -3404,6 +3428,62 @@ int iommu_attach_device_pasid(struct
>> iommu_domain *domain,
>> }
>> EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
>>
>> +/**
>> + * iommu_replace_device_pasid - replace the domain that a pasid is
>> attached to
>> + * @domain: new IOMMU domain to replace with
>> + * @dev: the physical device
>> + * @pasid: pasid that will be attached to the new domain
>> + *
>> + * This API allows the pasid to switch domains. Return 0 on success, or an
>> + * error. The pasid will roll back to use the old domain if failure. The
>> + * caller could call iommu_detach_device_pasid() before free the old
>> domain
>> + * in order to avoid use-after-free case.
>> + */
>> +int iommu_replace_device_pasid(struct iommu_domain *domain,
>> +			       struct device *dev, ioasid_t pasid)
>> +{
>> +	/* Caller must be a probed driver on dev */
>> +	struct iommu_group *group = dev->iommu_group;
>> +	void *curr;
>> +	int ret;
>> +
>> +	if (!domain)
>> +		return -EINVAL;
>> +
>> +	if (!domain->ops->set_dev_pasid)
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!group)
>> +		return -ENODEV;
>> +
>> +	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain-
>>> owner)
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&group->mutex);
>> +	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
>> +	if (!curr) {
>> +		xa_erase(&group->pasid_array, pasid);
>> +		ret = -EINVAL;
>> +		goto out_unlock;
>> +	}
>> +
>> +	ret = xa_err(curr);
>> +	if (ret)
>> +		goto out_unlock;
>> +
>> +	if (curr == domain)
>> +		goto out_unlock;
>> +
>> +	ret = __iommu_set_group_pasid(domain, group, pasid, curr);
>> +	if (ret)
>> +		WARN_ON(xa_err(xa_store(&group->pasid_array, pasid,
>> +					curr, GFP_KERNEL)));
>> +out_unlock:
>> +	mutex_unlock(&group->mutex);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid,
>> IOMMUFD_INTERNAL);
>> +
>> /*
>>   * iommu_detach_device_pasid() - Detach the domain from pasid of device
>>   * @domain: the iommu domain.
>> --
>> 2.34.1
>
Tian, Kevin April 17, 2024, 8:44 a.m. UTC | #3
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, April 12, 2024 4:15 PM
>
> @@ -3332,11 +3333,34 @@ static int __iommu_set_group_pasid(struct
> iommu_domain *domain,
>  err_revert:
>  	last_gdev = device;
>  	for_each_group_device(group, device) {
> -		const struct iommu_ops *ops = dev_iommu_ops(device-
> >dev);
> +		/*
> +		 * If no old domain, just undo all the devices/pasid that
> +		 * have attached to the new domain.
> +		 */
> +		if (!old) {
> +			const struct iommu_ops *ops =
> +						dev_iommu_ops(device-
> >dev);
> +
> +			if (device == last_gdev)
> +				break;
> +			ops = dev_iommu_ops(device->dev);

'ops' is already assigned

> +			ops->remove_dev_pasid(device->dev, pasid, domain);
> +			continue;
> +		}
> 
> -		if (device == last_gdev)
> +		/*
> +		 * Rollback the devices/pasid that have attached to the new
> +		 * domain. And it is a driver bug to fail attaching with a
> +		 * previously good domain.
> +		 */
> +		if (device == last_gdev) {
> +			WARN_ON(old->ops->set_dev_pasid(old, device-
> >dev,
> +							pasid, NULL));

do we have a clear definition that @set_dev_pasid callback should
leave the device detached (as 'NULL' indicates) or we just don't 
care the currently-attached domain at this point?

> 
> +/**
> + * iommu_replace_device_pasid - replace the domain that a pasid is
> attached to
> + * @domain: new IOMMU domain to replace with
> + * @dev: the physical device
> + * @pasid: pasid that will be attached to the new domain
> + *
> + * This API allows the pasid to switch domains. Return 0 on success, or an
> + * error. The pasid will roll back to use the old domain if failure. The
> + * caller could call iommu_detach_device_pasid() before free the old
> domain
> + * in order to avoid use-after-free case.

I didn't get what the last sentence tries to convey. Do you mean that
the old domain cannot be freed even after the replace operation has
been completed successfully? why does it require a detach before
the free?

> + */
> +int iommu_replace_device_pasid(struct iommu_domain *domain,
> +			       struct device *dev, ioasid_t pasid)
> +{
> +	/* Caller must be a probed driver on dev */
> +	struct iommu_group *group = dev->iommu_group;
> +	void *curr;
> +	int ret;
> +
> +	if (!domain)
> +		return -EINVAL;

this check can be skipped. Accessing a null pointer will hit
a call trace already.

> +
> +	if (!domain->ops->set_dev_pasid)
> +		return -EOPNOTSUPP;
> +
> +	if (!group)
> +		return -ENODEV;
> +
> +	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain-
> >owner)
> +		return -EINVAL;

and check it's not IOMMU_NO_PASID

> +
> +	mutex_lock(&group->mutex);
> +	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
> +	if (!curr) {
> +		xa_erase(&group->pasid_array, pasid);
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = xa_err(curr);
> +	if (ret)
> +		goto out_unlock;
> +
> +	if (curr == domain)
> +		goto out_unlock;

emmm then 'ret' is used uninitialized here.

> +
> +	ret = __iommu_set_group_pasid(domain, group, pasid, curr);
> +	if (ret)
> +		WARN_ON(xa_err(xa_store(&group->pasid_array, pasid,
> +					curr, GFP_KERNEL)));

split the line. WARN_ON() as long as the return value doesn't match 
'domain'.

> +out_unlock:
> +	mutex_unlock(&group->mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid,
> IOMMUFD_INTERNAL);
> +
>  /*
>   * iommu_detach_device_pasid() - Detach the domain from pasid of device
>   * @domain: the iommu domain.
> --
> 2.34.1
Jason Gunthorpe April 17, 2024, 12:17 p.m. UTC | #4
On Wed, Apr 17, 2024 at 08:44:11AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Friday, April 12, 2024 4:15 PM
> >
> > @@ -3332,11 +3333,34 @@ static int __iommu_set_group_pasid(struct
> > iommu_domain *domain,
> >  err_revert:
> >  	last_gdev = device;
> >  	for_each_group_device(group, device) {
> > -		const struct iommu_ops *ops = dev_iommu_ops(device-
> > >dev);
> > +		/*
> > +		 * If no old domain, just undo all the devices/pasid that
> > +		 * have attached to the new domain.
> > +		 */
> > +		if (!old) {
> > +			const struct iommu_ops *ops =
> > +						dev_iommu_ops(device-
> > >dev);
> > +
> > +			if (device == last_gdev)
> > +				break;
> > +			ops = dev_iommu_ops(device->dev);
> 
> 'ops' is already assigned
> 
> > +			ops->remove_dev_pasid(device->dev, pasid, domain);
> > +			continue;
> > +		}
> > 
> > -		if (device == last_gdev)
> > +		/*
> > +		 * Rollback the devices/pasid that have attached to the new
> > +		 * domain. And it is a driver bug to fail attaching with a
> > +		 * previously good domain.
> > +		 */
> > +		if (device == last_gdev) {
> > +			WARN_ON(old->ops->set_dev_pasid(old, device-
> > >dev,
> > +							pasid, NULL));
> 
> do we have a clear definition that @set_dev_pasid callback should
> leave the device detached (as 'NULL' indicates) or we just don't 
> care the currently-attached domain at this point?

If set_dev_pasid fails I would expect to to have done nothing so the
failing device should be left in the old config and we should just not
call it at all.

The RID path is wonky here because so many drivers don't do that, so
we poke them again to hopefully get it right. I think for PASID we
should try to make the drivers work properly from the start. Failure
means no change.

I would summarize this in a comment..

Jason
Tian, Kevin April 18, 2024, 12:08 a.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, April 17, 2024 8:17 PM
> 
> On Wed, Apr 17, 2024 at 08:44:11AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Friday, April 12, 2024 4:15 PM
> > >
> > > -		if (device == last_gdev)
> > > +		/*
> > > +		 * Rollback the devices/pasid that have attached to the new
> > > +		 * domain. And it is a driver bug to fail attaching with a
> > > +		 * previously good domain.
> > > +		 */
> > > +		if (device == last_gdev) {
> > > +			WARN_ON(old->ops->set_dev_pasid(old, device-
> > > >dev,
> > > +							pasid, NULL));
> >
> > do we have a clear definition that @set_dev_pasid callback should
> > leave the device detached (as 'NULL' indicates) or we just don't
> > care the currently-attached domain at this point?
> 
> If set_dev_pasid fails I would expect to to have done nothing so the
> failing device should be left in the old config and we should just not
> call it at all.
> 
> The RID path is wonky here because so many drivers don't do that, so
> we poke them again to hopefully get it right. I think for PASID we
> should try to make the drivers work properly from the start. Failure
> means no change.
> 
> I would summarize this in a comment..

Ah yes, that's the simple sane way to do. 
Jason Gunthorpe April 29, 2024, 1:55 p.m. UTC | #6
On Fri, Apr 12, 2024 at 01:15:06AM -0700, Yi Liu wrote:

> -		if (device == last_gdev)
> +		/*
> +		 * Rollback the devices/pasid that have attached to the new
> +		 * domain. And it is a driver bug to fail attaching with a
> +		 * previously good domain.
> +		 */
> +		if (device == last_gdev) {
> +			WARN_ON(old->ops->set_dev_pasid(old, device->dev,
> +							pasid, NULL));
>  			break;
> -		ops->remove_dev_pasid(device->dev, pasid, domain);

Suggest writing this as 

if (WARN_ON(old->ops->set_dev_pasid(old, device->dev, pasid, curr)))
    ops->remove_dev_pasid(device->dev, pasid, domain);

As we may as well try to bring the system back to some kind of safe
state before we continue on.

Also NULL doesn't seem right, if we here then the new domain was
attached successfully and we are put it back to old.

> +	mutex_lock(&group->mutex);
> +	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
> +	if (!curr) {
> +		xa_erase(&group->pasid_array, pasid);

A comment here about order is likely a good idea..

At this point the pasid_array and the translation are not matched. If
we get a PRI at this instant it will deliver to the new domain until
the translation is updated.

There is nothing to do about this race, but lets note it and say the
concurrent PRI path will eventually become consistent and there is no
harm in directing PRI to the wrong domain.

Let's also check that receiving a PRI on a domain that is not PRI
capable doesn't explode in case someone uses replace to change from a
PRI to non PRI domain.

Jason
Yi Liu April 30, 2024, 5 a.m. UTC | #7
On 2024/4/29 21:55, Jason Gunthorpe wrote:
> On Fri, Apr 12, 2024 at 01:15:06AM -0700, Yi Liu wrote:
> 
>> -		if (device == last_gdev)
>> +		/*
>> +		 * Rollback the devices/pasid that have attached to the new
>> +		 * domain. And it is a driver bug to fail attaching with a
>> +		 * previously good domain.
>> +		 */
>> +		if (device == last_gdev) {
>> +			WARN_ON(old->ops->set_dev_pasid(old, device->dev,
>> +							pasid, NULL));
>>   			break;
>> -		ops->remove_dev_pasid(device->dev, pasid, domain);
> 
> Suggest writing this as
> 
> if (WARN_ON(old->ops->set_dev_pasid(old, device->dev, pasid, curr)))
>      ops->remove_dev_pasid(device->dev, pasid, domain);
> 
> As we may as well try to bring the system back to some kind of safe
> state before we continue on.
> 
> Also NULL doesn't seem right, if we here then the new domain was
> attached successfully and we are put it back to old.

ok, and given your another remark [1], there is no more need to do rollback
for the last_gdev, just need to break the loop when comes to the last_gdev.

[1] https://lore.kernel.org/linux-iommu/20240417121700.GL3637727@nvidia.com/

>> +	mutex_lock(&group->mutex);
>> +	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
>> +	if (!curr) {
>> +		xa_erase(&group->pasid_array, pasid);
> 
> A comment here about order is likely a good idea..
> 
> At this point the pasid_array and the translation are not matched. If
> we get a PRI at this instant it will deliver to the new domain until
> the translation is updated.

yes.

> There is nothing to do about this race, but lets note it and say the
> concurrent PRI path will eventually become consistent and there is no
> harm in directing PRI to the wrong domain.

If the old and new domain points to the same address space, it is fine.
How about they point to different address spaces? Delivering the PRI to
new domain seems problematic. Or, do we allow such domain replacement
when there is still ongoing DMA?

> Let's also check that receiving a PRI on a domain that is not PRI
> capable doesn't explode in case someone uses replace to change from a
> PRI to non PRI domain.

Just need to refuse the receiving PRI, is it? BTW. Should the PRI cap
be disabled in the devices side and the translation structure (e.g.
PRI enable bit in pasid entry) when the replacement is done?
Jason Gunthorpe April 30, 2024, 12:26 p.m. UTC | #8
On Tue, Apr 30, 2024 at 01:00:57PM +0800, Yi Liu wrote:

> > There is nothing to do about this race, but lets note it and say the
> > concurrent PRI path will eventually become consistent and there is no
> > harm in directing PRI to the wrong domain.
> 
> If the old and new domain points to the same address space, it is fine.
> How about they point to different address spaces? Delivering the PRI to
> new domain seems problematic. Or, do we allow such domain replacement
> when there is still ongoing DMA?

New PRI could happen an instant later and hit the new domain, or an
instant before and hit the old domain. It is fine

> > Let's also check that receiving a PRI on a domain that is not PRI
> > capable doesn't explode in case someone uses replace to change from a
> > PRI to non PRI domain.
> 
> Just need to refuse the receiving PRI, is it?

Yes

> BTW. Should the PRI cap
> be disabled in the devices side and the translation structure (e.g.
> PRI enable bit in pasid entry) when the replacement is done?

Yes, after domain attachment completes

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index 5f731d994803..0949c02cee93 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -20,6 +20,9 @@  static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
 int iommu_group_replace_domain(struct iommu_group *group,
 			       struct iommu_domain *new_domain);
 
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+			       struct device *dev, ioasid_t pasid);
+
 int iommu_device_register_bus(struct iommu_device *iommu,
 			      const struct iommu_ops *ops,
 			      const struct bus_type *bus,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 701b02a118db..343683e646e0 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3315,14 +3315,15 @@  bool iommu_group_dma_owner_claimed(struct iommu_group *group)
 EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
 
 static int __iommu_set_group_pasid(struct iommu_domain *domain,
-				   struct iommu_group *group, ioasid_t pasid)
+				   struct iommu_group *group, ioasid_t pasid,
+				   struct iommu_domain *old)
 {
 	struct group_device *device, *last_gdev;
 	int ret;
 
 	for_each_group_device(group, device) {
 		ret = domain->ops->set_dev_pasid(domain, device->dev,
-						 pasid, NULL);
+						 pasid, old);
 		if (ret)
 			goto err_revert;
 	}
@@ -3332,11 +3333,34 @@  static int __iommu_set_group_pasid(struct iommu_domain *domain,
 err_revert:
 	last_gdev = device;
 	for_each_group_device(group, device) {
-		const struct iommu_ops *ops = dev_iommu_ops(device->dev);
+		/*
+		 * If no old domain, just undo all the devices/pasid that
+		 * have attached to the new domain.
+		 */
+		if (!old) {
+			const struct iommu_ops *ops =
+						dev_iommu_ops(device->dev);
+
+			if (device == last_gdev)
+				break;
+			ops = dev_iommu_ops(device->dev);
+			ops->remove_dev_pasid(device->dev, pasid, domain);
+			continue;
+		}
 
-		if (device == last_gdev)
+		/*
+		 * Rollback the devices/pasid that have attached to the new
+		 * domain. And it is a driver bug to fail attaching with a
+		 * previously good domain.
+		 */
+		if (device == last_gdev) {
+			WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+							pasid, NULL));
 			break;
-		ops->remove_dev_pasid(device->dev, pasid, domain);
+		}
+
+		WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+						pasid, domain));
 	}
 	return ret;
 }
@@ -3395,7 +3419,7 @@  int iommu_attach_device_pasid(struct iommu_domain *domain,
 		goto out_unlock;
 	}
 
-	ret = __iommu_set_group_pasid(domain, group, pasid);
+	ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
 	if (ret)
 		xa_erase(&group->pasid_array, pasid);
 out_unlock:
@@ -3404,6 +3428,62 @@  int iommu_attach_device_pasid(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
 
+/**
+ * iommu_replace_device_pasid - replace the domain that a pasid is attached to
+ * @domain: new IOMMU domain to replace with
+ * @dev: the physical device
+ * @pasid: pasid that will be attached to the new domain
+ *
+ * This API allows the pasid to switch domains. Return 0 on success, or an
+ * error. The pasid will roll back to use the old domain if failure. The
+ * caller could call iommu_detach_device_pasid() before free the old domain
+ * in order to avoid use-after-free case.
+ */
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+			       struct device *dev, ioasid_t pasid)
+{
+	/* Caller must be a probed driver on dev */
+	struct iommu_group *group = dev->iommu_group;
+	void *curr;
+	int ret;
+
+	if (!domain)
+		return -EINVAL;
+
+	if (!domain->ops->set_dev_pasid)
+		return -EOPNOTSUPP;
+
+	if (!group)
+		return -ENODEV;
+
+	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner)
+		return -EINVAL;
+
+	mutex_lock(&group->mutex);
+	curr = xa_store(&group->pasid_array, pasid, domain, GFP_KERNEL);
+	if (!curr) {
+		xa_erase(&group->pasid_array, pasid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = xa_err(curr);
+	if (ret)
+		goto out_unlock;
+
+	if (curr == domain)
+		goto out_unlock;
+
+	ret = __iommu_set_group_pasid(domain, group, pasid, curr);
+	if (ret)
+		WARN_ON(xa_err(xa_store(&group->pasid_array, pasid,
+					curr, GFP_KERNEL)));
+out_unlock:
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, IOMMUFD_INTERNAL);
+
 /*
  * iommu_detach_device_pasid() - Detach the domain from pasid of device
  * @domain: the iommu domain.