diff mbox series

[RFC,13/20] iommu: Extend iommu_at[de]tach_device() for multiple devices group

Message ID 20210919063848.1476776-14-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce /dev/iommu for userspace I/O address space management | expand

Commit Message

Yi Liu Sept. 19, 2021, 6:38 a.m. UTC
From: Lu Baolu <baolu.lu@linux.intel.com>

These two helpers could be used when 1) the iommu group is singleton,
or 2) the upper layer has put the iommu group into the secure state by
calling iommu_device_init_user_dma().

As we want the iommufd design to be a device-centric model, we want to
remove any group knowledge in iommufd. Given that we already have
iommu_at[de]tach_device() interface, we could extend it for iommufd
simply by doing below:

 - first device in a group does group attach;
 - last device in a group does group detach.

as long as the group has been put into the secure context.

The commit <426a273834eae> ("iommu: Limit iommu_attach/detach_device to
device with their own group") deliberately restricts the two interfaces
to single-device group. To avoid the conflict with existing usages, we
keep this policy and put the new extension only when the group has been
marked for user_dma.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/iommu.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

Comments

David Gibson Oct. 14, 2021, 5:24 a.m. UTC | #1
On Sun, Sep 19, 2021 at 02:38:41PM +0800, Liu Yi L wrote:
> From: Lu Baolu <baolu.lu@linux.intel.com>
> 
> These two helpers could be used when 1) the iommu group is singleton,
> or 2) the upper layer has put the iommu group into the secure state by
> calling iommu_device_init_user_dma().
> 
> As we want the iommufd design to be a device-centric model, we want to
> remove any group knowledge in iommufd. Given that we already have
> iommu_at[de]tach_device() interface, we could extend it for iommufd
> simply by doing below:
> 
>  - first device in a group does group attach;
>  - last device in a group does group detach.
> 
> as long as the group has been put into the secure context.
> 
> The commit <426a273834eae> ("iommu: Limit iommu_attach/detach_device to
> device with their own group") deliberately restricts the two interfaces
> to single-device group. To avoid the conflict with existing usages, we
> keep this policy and put the new extension only when the group has been
> marked for user_dma.

I still kind of hate this interface because it means an operation that
appears to be explicitly on a single device has an implicit effect on
other devices.

> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  drivers/iommu/iommu.c | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index bffd84e978fb..b6178997aef1 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -47,6 +47,7 @@ struct iommu_group {
>  	struct list_head entry;
>  	unsigned long user_dma_owner_id;
>  	refcount_t owner_cnt;
> +	refcount_t attach_cnt;
>  };
>  
>  struct group_device {
> @@ -1994,7 +1995,7 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>  int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
>  {
>  	struct iommu_group *group;
> -	int ret;
> +	int ret = 0;
>  
>  	group = iommu_group_get(dev);
>  	if (!group)
> @@ -2005,11 +2006,23 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
>  	 * change while we are attaching
>  	 */
>  	mutex_lock(&group->mutex);
> -	ret = -EINVAL;
> -	if (iommu_group_device_count(group) != 1)
> +	if (group->user_dma_owner_id) {
> +		if (group->domain) {
> +			if (group->domain != domain)
> +				ret = -EBUSY;
> +			else
> +				refcount_inc(&group->attach_cnt);
> +
> +			goto out_unlock;
> +		}
> +	} else if (iommu_group_device_count(group) != 1) {

With this condition in the else, how can you ever attach the first
device of a multi-device group?

> +		ret = -EINVAL;
>  		goto out_unlock;
> +	}
>  
>  	ret = __iommu_attach_group(domain, group);
> +	if (!ret && group->user_dma_owner_id)
> +		refcount_set(&group->attach_cnt, 1);
>  
>  out_unlock:
>  	mutex_unlock(&group->mutex);
> @@ -2261,7 +2274,10 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>  		return;
>  
>  	mutex_lock(&group->mutex);
> -	if (iommu_group_device_count(group) != 1) {
> +	if (group->user_dma_owner_id) {
> +		if (!refcount_dec_and_test(&group->attach_cnt))
> +			goto out_unlock;
> +	} else if (iommu_group_device_count(group) != 1) {

Shouldn't this path (detach a thing that's not attached), be a no-op
regardless of whether it's a singleton group or not?  Why does one
deserve a warning and one not?

>  		WARN_ON(1);
>  		goto out_unlock;
>  	}
> @@ -3368,6 +3384,7 @@ static int iommu_group_init_user_dma(struct iommu_group *group,
>  
>  	group->user_dma_owner_id = owner;
>  	refcount_set(&group->owner_cnt, 1);
> +	refcount_set(&group->attach_cnt, 0);
>  
>  	/* default domain is unsafe for user-initiated dma */
>  	if (group->domain == group->default_domain)
Tian, Kevin Oct. 14, 2021, 7:06 a.m. UTC | #2
> From: David Gibson <david@gibson.dropbear.id.au>
> Sent: Thursday, October 14, 2021 1:24 PM
> 
> On Sun, Sep 19, 2021 at 02:38:41PM +0800, Liu Yi L wrote:
> > From: Lu Baolu <baolu.lu@linux.intel.com>
> >
> > These two helpers could be used when 1) the iommu group is singleton,
> > or 2) the upper layer has put the iommu group into the secure state by
> > calling iommu_device_init_user_dma().
> >
> > As we want the iommufd design to be a device-centric model, we want to
> > remove any group knowledge in iommufd. Given that we already have
> > iommu_at[de]tach_device() interface, we could extend it for iommufd
> > simply by doing below:
> >
> >  - first device in a group does group attach;
> >  - last device in a group does group detach.
> >
> > as long as the group has been put into the secure context.
> >
> > The commit <426a273834eae> ("iommu: Limit
> iommu_attach/detach_device to
> > device with their own group") deliberately restricts the two interfaces
> > to single-device group. To avoid the conflict with existing usages, we
> > keep this policy and put the new extension only when the group has been
> > marked for user_dma.
> 
> I still kind of hate this interface because it means an operation that
> appears to be explicitly on a single device has an implicit effect on
> other devices.
> 

I still didn't get your concern why it's such a big deal. With this proposal
the group restriction will be 'explicitly' documented in the attach uAPI
comment and sample flow in iommufd.rst. A sane user should read all
those information to understand how this new sub-system works and
follow whatever constraints claimed there. In the end the user should
maintain the same group knowledge regardless of whether to use an
explicit group uAPI or a device uAPI which has group constraint...

Thanks,
Kevin
David Gibson Oct. 18, 2021, 3:57 a.m. UTC | #3
On Thu, Oct 14, 2021 at 07:06:07AM +0000, Tian, Kevin wrote:
> > From: David Gibson <david@gibson.dropbear.id.au>
> > Sent: Thursday, October 14, 2021 1:24 PM
> > 
> > On Sun, Sep 19, 2021 at 02:38:41PM +0800, Liu Yi L wrote:
> > > From: Lu Baolu <baolu.lu@linux.intel.com>
> > >
> > > These two helpers could be used when 1) the iommu group is singleton,
> > > or 2) the upper layer has put the iommu group into the secure state by
> > > calling iommu_device_init_user_dma().
> > >
> > > As we want the iommufd design to be a device-centric model, we want to
> > > remove any group knowledge in iommufd. Given that we already have
> > > iommu_at[de]tach_device() interface, we could extend it for iommufd
> > > simply by doing below:
> > >
> > >  - first device in a group does group attach;
> > >  - last device in a group does group detach.
> > >
> > > as long as the group has been put into the secure context.
> > >
> > > The commit <426a273834eae> ("iommu: Limit
> > iommu_attach/detach_device to
> > > device with their own group") deliberately restricts the two interfaces
> > > to single-device group. To avoid the conflict with existing usages, we
> > > keep this policy and put the new extension only when the group has been
> > > marked for user_dma.
> > 
> > I still kind of hate this interface because it means an operation that
> > appears to be explicitly on a single device has an implicit effect on
> > other devices.
> > 
> 
> I still didn't get your concern why it's such a big deal. With this proposal
> the group restriction will be 'explicitly' documented in the attach uAPI
> comment and sample flow in iommufd.rst. A sane user should read all
> those information to understand how this new sub-system works and
> follow whatever constraints claimed there. In the end the user should
> maintain the same group knowledge regardless of whether to use an
> explicit group uAPI or a device uAPI which has group constraint...

The first user might read this.  Subsequent users are likely to just
copy paste examples from earlier things without fully understanding
them.  In general documenting restrictions somewhere is never as
effective as making those restrictions part of the interface signature
itself.
Jason Gunthorpe Oct. 18, 2021, 4:32 p.m. UTC | #4
On Mon, Oct 18, 2021 at 02:57:12PM +1100, David Gibson wrote:

> The first user might read this.  Subsequent users are likely to just
> copy paste examples from earlier things without fully understanding
> them.  In general documenting restrictions somewhere is never as
> effective as making those restrictions part of the interface signature
> itself.

I'd think this argument would hold more water if you could point to
someplace in existing userspace that cares about the VFIO grouping.

From what I see the applications do what the admin tells them to do -
and if the admin says to use a certain VFIO device then that is
excatly what they do. I don't know of any applications that ask the
admin to tell them group information.

What I see is aligning what the kernel provides to the APIs the
applications have already built.

Jason
David Gibson Oct. 25, 2021, 5:14 a.m. UTC | #5
On Mon, Oct 18, 2021 at 01:32:38PM -0300, Jason Gunthorpe wrote:
> On Mon, Oct 18, 2021 at 02:57:12PM +1100, David Gibson wrote:
> 
> > The first user might read this.  Subsequent users are likely to just
> > copy paste examples from earlier things without fully understanding
> > them.  In general documenting restrictions somewhere is never as
> > effective as making those restrictions part of the interface signature
> > itself.
> 
> I'd think this argument would hold more water if you could point to
> someplace in existing userspace that cares about the VFIO grouping.

My whole point here is that the proposed semantics mean that we have
weird side effects even if the app doesn't think it cares about
groups.

e.g. App's input is a bunch of PCI addresses for NICs.  It attaches
each one to a separate IOAS and bridges packets between them all.  As
far as the app is concerned, it doesn't care about groups, as you say.

Except that it breaks if any two of the devices are in the same group.
Worse, it has a completely horrible failure mode: no syscall returns
an, it just starts trying to do dma with device A, and the packets get
written into the IOAS that belongs to device B instead.  Sounds like a
complete nightmare to debug if you don't know about groups, because
you never thought you cared.


And yes, for a simple bridge like this app, attaching all the devices
to the same IOAS is a more likely setup.  But using an IOAS per device
is a perfectly valid configuration as well, and with the current draft
nothing will warn the app that this is a bad idea.

> From what I see the applications do what the admin tells them to do -
> and if the admin says to use a certain VFIO device then that is
> excatly what they do. I don't know of any applications that ask the
> admin to tell them group information.
> 
> What I see is aligning what the kernel provides to the APIs the
> applications have already built.
> 
> Jason
>
Jason Gunthorpe Oct. 25, 2021, 12:14 p.m. UTC | #6
On Mon, Oct 25, 2021 at 04:14:56PM +1100, David Gibson wrote:
> On Mon, Oct 18, 2021 at 01:32:38PM -0300, Jason Gunthorpe wrote:
> > On Mon, Oct 18, 2021 at 02:57:12PM +1100, David Gibson wrote:
> > 
> > > The first user might read this.  Subsequent users are likely to just
> > > copy paste examples from earlier things without fully understanding
> > > them.  In general documenting restrictions somewhere is never as
> > > effective as making those restrictions part of the interface signature
> > > itself.
> > 
> > I'd think this argument would hold more water if you could point to
> > someplace in existing userspace that cares about the VFIO grouping.
> 
> My whole point here is that the proposed semantics mean that we have
> weird side effects even if the app doesn't think it cares about
> groups.
> 
> e.g. App's input is a bunch of PCI addresses for NICs.  It attaches
> each one to a separate IOAS and bridges packets between them all.  As
> far as the app is concerned, it doesn't care about groups, as you say.
> 
> Except that it breaks if any two of the devices are in the same group.
> Worse, it has a completely horrible failure mode: no syscall returns

Huh? If an app requests an IOAS attach that is not possible then the
attachment IOCTL will fail.

The kernel must track groups and know that group A is on IOAS A and
any further attach of a group A device must specify IOAS A or receive
a failure.

The kernel should never blindly acknowledge a failed attachment.

Jason
David Gibson Oct. 25, 2021, 1:16 p.m. UTC | #7
On Mon, Oct 25, 2021 at 09:14:10AM -0300, Jason Gunthorpe wrote:
> On Mon, Oct 25, 2021 at 04:14:56PM +1100, David Gibson wrote:
> > On Mon, Oct 18, 2021 at 01:32:38PM -0300, Jason Gunthorpe wrote:
> > > On Mon, Oct 18, 2021 at 02:57:12PM +1100, David Gibson wrote:
> > > 
> > > > The first user might read this.  Subsequent users are likely to just
> > > > copy paste examples from earlier things without fully understanding
> > > > them.  In general documenting restrictions somewhere is never as
> > > > effective as making those restrictions part of the interface signature
> > > > itself.
> > > 
> > > I'd think this argument would hold more water if you could point to
> > > someplace in existing userspace that cares about the VFIO grouping.
> > 
> > My whole point here is that the proposed semantics mean that we have
> > weird side effects even if the app doesn't think it cares about
> > groups.
> > 
> > e.g. App's input is a bunch of PCI addresses for NICs.  It attaches
> > each one to a separate IOAS and bridges packets between them all.  As
> > far as the app is concerned, it doesn't care about groups, as you say.
> > 
> > Except that it breaks if any two of the devices are in the same group.
> > Worse, it has a completely horrible failure mode: no syscall returns
> 
> Huh? If an app requests an IOAS attach that is not possible then the
> attachment IOCTL will fail.
> 
> The kernel must track groups and know that group A is on IOAS A and
> any further attach of a group A device must specify IOAS A or receive
> a failure.

Ok, I misunderstood the semantics that were suggested.

So, IIUC what you're suggested is that if group X is attached to IOAS
1, then attaching the group to IOAS 1 again should succeed (as a
no-op), but attaching to any other IOAS should fail?

That's certainly an improvement, but there's still some questions.

If you attach devices A and B (both in group X) to IOAS 1, then detach
device A, what happens?  Do you detach both devices?  Or do you have a
counter so you have to detach as many time as you attached?

> The kernel should never blindly acknowledge a failed attachment.
> 
> Jason
>
Jason Gunthorpe Oct. 25, 2021, 11:36 p.m. UTC | #8
On Tue, Oct 26, 2021 at 12:16:43AM +1100, David Gibson wrote:
> If you attach devices A and B (both in group X) to IOAS 1, then detach
> device A, what happens?  Do you detach both devices?  Or do you have a
> counter so you have to detach as many time as you attached?

I would refcount it since that is the only thing that makes semantic
sense with the device centric model.

Jason
David Gibson Oct. 26, 2021, 9:23 a.m. UTC | #9
On Mon, Oct 25, 2021 at 08:36:02PM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 26, 2021 at 12:16:43AM +1100, David Gibson wrote:
> > If you attach devices A and B (both in group X) to IOAS 1, then detach
> > device A, what happens?  Do you detach both devices?  Or do you have a
> > counter so you have to detach as many time as you attached?
> 
> I would refcount it since that is the only thing that makes semantic
> sense with the device centric model.

Yes, I definitely think that's the better option here.  This does
still leave (at least) one weird edge case where the group structure
can "leak" into the awareness of code that otherwise doesn't care,
though it's definitely less nasty that the ones I mentioned before:

If an app wants to move a bunch of devices from one IOAS to another,
it can do it either:

A)
	for each dev:
		detach dev from IOAS
	for each dev:
		attach dev to new IOAS

or B)

	for each dev:
		detach dev from IOAS
		attach dev to new IOAS

With only singleton groups they're pretty much equivalent, but with
multiple devices in a group, (B) will fail.
diff mbox series

Patch

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index bffd84e978fb..b6178997aef1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -47,6 +47,7 @@  struct iommu_group {
 	struct list_head entry;
 	unsigned long user_dma_owner_id;
 	refcount_t owner_cnt;
+	refcount_t attach_cnt;
 };
 
 struct group_device {
@@ -1994,7 +1995,7 @@  static int __iommu_attach_device(struct iommu_domain *domain,
 int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 {
 	struct iommu_group *group;
-	int ret;
+	int ret = 0;
 
 	group = iommu_group_get(dev);
 	if (!group)
@@ -2005,11 +2006,23 @@  int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 	 * change while we are attaching
 	 */
 	mutex_lock(&group->mutex);
-	ret = -EINVAL;
-	if (iommu_group_device_count(group) != 1)
+	if (group->user_dma_owner_id) {
+		if (group->domain) {
+			if (group->domain != domain)
+				ret = -EBUSY;
+			else
+				refcount_inc(&group->attach_cnt);
+
+			goto out_unlock;
+		}
+	} else if (iommu_group_device_count(group) != 1) {
+		ret = -EINVAL;
 		goto out_unlock;
+	}
 
 	ret = __iommu_attach_group(domain, group);
+	if (!ret && group->user_dma_owner_id)
+		refcount_set(&group->attach_cnt, 1);
 
 out_unlock:
 	mutex_unlock(&group->mutex);
@@ -2261,7 +2274,10 @@  void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 		return;
 
 	mutex_lock(&group->mutex);
-	if (iommu_group_device_count(group) != 1) {
+	if (group->user_dma_owner_id) {
+		if (!refcount_dec_and_test(&group->attach_cnt))
+			goto out_unlock;
+	} else if (iommu_group_device_count(group) != 1) {
 		WARN_ON(1);
 		goto out_unlock;
 	}
@@ -3368,6 +3384,7 @@  static int iommu_group_init_user_dma(struct iommu_group *group,
 
 	group->user_dma_owner_id = owner;
 	refcount_set(&group->owner_cnt, 1);
+	refcount_set(&group->attach_cnt, 0);
 
 	/* default domain is unsafe for user-initiated dma */
 	if (group->domain == group->default_domain)