
[v4,07/13] iommu: Add iommu_at[de]tach_device_shared() for multi-device groups

Message ID 20211217063708.1740334-8-baolu.lu@linux.intel.com (mailing list archive)
State New, archived
Series Fix BUG_ON in vfio_iommu_group_notifier()

Commit Message

Baolu Lu Dec. 17, 2021, 6:37 a.m. UTC
The iommu_attach/detach_device() interfaces were exposed for device
drivers to attach/detach their own domains. Commit 426a273834ea
("iommu: Limit iommu_attach/detach_device to device with their own group")
restricted them to singleton groups to avoid different devices in a group
attaching different domains.

Now that device DMA ownership has been introduced into the iommu core, we
can add interfaces for multiple-device groups while still guaranteeing
that "all devices are in the same address space".

iommu_attach/detach_device_shared() can be used when multiple drivers
sharing the group claim DMA_OWNER_PRIVATE_DOMAIN ownership. The first
call of iommu_attach_device_shared() attaches the domain to the group,
and other drivers can join it later. The domain will be detached from the
group after all drivers have left it.
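
A typical calling sequence, as a sketch (error handling trimmed; the
ownership helpers are the per-device ones introduced earlier in this
series):

	/* Each driver in the group claims private-domain ownership first. */
	ret = iommu_device_set_dma_owner(dev, DMA_OWNER_PRIVATE_DOMAIN, NULL);
	if (ret)
		return ret;

	/* The first caller attaches the domain; later callers just join. */
	ret = iommu_attach_device_shared(domain, dev);

	/* ... and on the teardown path ... */
	iommu_detach_device_shared(domain, dev);
	iommu_device_release_dma_owner(dev, DMA_OWNER_PRIVATE_DOMAIN);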

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
---
 include/linux/iommu.h | 13 +++++++
 drivers/iommu/iommu.c | 79 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

Comments

Robin Murphy Dec. 21, 2021, 4:50 p.m. UTC | #1
On 2021-12-17 06:37, Lu Baolu wrote:
> The iommu_attach/detach_device() interfaces were exposed for device
> drivers to attach/detach their own domains. Commit 426a273834ea
> ("iommu: Limit iommu_attach/detach_device to device with their own group")
> restricted them to singleton groups to avoid different devices in a group
> attaching different domains.
> 
> Now that device DMA ownership has been introduced into the iommu core, we
> can add interfaces for multiple-device groups while still guaranteeing
> that "all devices are in the same address space".
> 
> iommu_attach/detach_device_shared() can be used when multiple drivers
> sharing the group claim DMA_OWNER_PRIVATE_DOMAIN ownership. The first
> call of iommu_attach_device_shared() attaches the domain to the group,
> and other drivers can join it later. The domain will be detached from the
> group after all drivers have left it.

I don't see the point of this at all - if you really want to hide the 
concept of IOMMU groups away from drivers then just make 
iommu_{attach,detach}_device() do the right thing. At least the 
iommu_group_get_for_dev() plus iommu_{attach,detach}_group() API is 
clear - this proposal is the worst of both worlds, in that drivers still 
have to be just as aware of groups in order to know whether to call the 
_shared interface or not, except it's now entirely implicit and non-obvious.

Otherwise just add the housekeeping stuff to 
iommu_{attach,detach}_group() - there's no way we want *three* 
attach/detach interfaces all with different semantics.

It's worth taking a step back and realising that overall, this is really 
just a more generalised and finer-grained extension of what 426a273834ea 
already did for non-group-aware code, so it makes little sense *not* to 
integrate it into the existing interfaces.

Robin.

> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> Tested-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>   include/linux/iommu.h | 13 +++++++
>   drivers/iommu/iommu.c | 79 +++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 92 insertions(+)
> 
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 5ad4cf13370d..1bc03118dfb3 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -703,6 +703,8 @@ int iommu_group_set_dma_owner(struct iommu_group *group, enum iommu_dma_owner ow
>   			      void *owner_cookie);
>   void iommu_group_release_dma_owner(struct iommu_group *group, enum iommu_dma_owner owner);
>   bool iommu_group_dma_owner_unclaimed(struct iommu_group *group);
> +int iommu_attach_device_shared(struct iommu_domain *domain, struct device *dev);
> +void iommu_detach_device_shared(struct iommu_domain *domain, struct device *dev);
>   
>   #else /* CONFIG_IOMMU_API */
>   
> @@ -743,11 +745,22 @@ static inline int iommu_attach_device(struct iommu_domain *domain,
>   	return -ENODEV;
>   }
>   
> +static inline int iommu_attach_device_shared(struct iommu_domain *domain,
> +					     struct device *dev)
> +{
> +	return -ENODEV;
> +}
> +
>   static inline void iommu_detach_device(struct iommu_domain *domain,
>   				       struct device *dev)
>   {
>   }
>   
> +static inline void iommu_detach_device_shared(struct iommu_domain *domain,
> +					      struct device *dev)
> +{
> +}
> +
>   static inline struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
>   {
>   	return NULL;
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 8bec71b1cc18..3ad66cb9bedc 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -50,6 +50,7 @@ struct iommu_group {
>   	struct list_head entry;
>   	enum iommu_dma_owner dma_owner;
>   	unsigned int owner_cnt;
> +	unsigned int attach_cnt;
>   	void *owner_cookie;
>   };
>   
> @@ -3512,3 +3513,81 @@ void iommu_device_release_dma_owner(struct device *dev, enum iommu_dma_owner own
>   	iommu_group_put(group);
>   }
>   EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner);
> +
> +/**
> + * iommu_attach_device_shared() - Attach shared domain to a device
> + * @domain: The shared domain.
> + * @dev: The device.
> + *
> + * Similar to iommu_attach_device(), but allowed for shared-group devices
> + * and guarantees that all devices in an iommu group can only be attached
> + * to the same iommu domain. The caller should explicitly set the dma ownership
> + * of DMA_OWNER_PRIVATE_DOMAIN or DMA_OWNER_PRIVATE_DOMAIN_USER type before
> + * calling it and use the paired helper iommu_detach_device_shared() for
> + * cleanup.
> + */
> +int iommu_attach_device_shared(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct iommu_group *group;
> +	int ret = 0;
> +
> +	group = iommu_group_get(dev);
> +	if (!group)
> +		return -ENODEV;
> +
> +	mutex_lock(&group->mutex);
> +	if (group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN &&
> +	    group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER) {
> +		ret = -EPERM;
> +		goto unlock_out;
> +	}
> +
> +	if (group->attach_cnt) {
> +		if (group->domain != domain) {
> +			ret = -EBUSY;
> +			goto unlock_out;
> +		}
> +	} else {
> +		ret = __iommu_attach_group(domain, group);
> +		if (ret)
> +			goto unlock_out;
> +	}
> +
> +	group->attach_cnt++;
> +unlock_out:
> +	mutex_unlock(&group->mutex);
> +	iommu_group_put(group);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_attach_device_shared);
> +
> +/**
> + * iommu_detach_device_shared() - Detach a domain from device
> + * @domain: The domain.
> + * @dev: The device.
> + *
> + * The detach helper paired with iommu_attach_device_shared().
> + */
> +void iommu_detach_device_shared(struct iommu_domain *domain, struct device *dev)
> +{
> +	struct iommu_group *group;
> +
> +	group = iommu_group_get(dev);
> +	if (!group)
> +		return;
> +
> +	mutex_lock(&group->mutex);
> +	if (WARN_ON(!group->attach_cnt || group->domain != domain ||
> +		    (group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN &&
> +		     group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER)))
> +		goto unlock_out;
> +
> +	if (--group->attach_cnt == 0)
> +		__iommu_detach_group(domain, group);
> +
> +unlock_out:
> +	mutex_unlock(&group->mutex);
> +	iommu_group_put(group);
> +}
> +EXPORT_SYMBOL_GPL(iommu_detach_device_shared);
Jason Gunthorpe Dec. 21, 2021, 6:46 p.m. UTC | #2
On Tue, Dec 21, 2021 at 04:50:56PM +0000, Robin Murphy wrote:

> this proposal is the worst of both worlds, in that drivers still have to be
> just as aware of groups in order to know whether to call the _shared
> interface or not, except it's now entirely implicit and non-obvious.

Drivers are not aware of groups, where did you see that?

Drivers have to indicate their intention, based entirely on their own
internal design. If groups are present, or not is irrelevant to the
driver.

If the driver uses a single struct device (which is most) then it uses
iommu_attach_device().

If the driver uses multiple struct devices and intends to connect them
all to the same domain then it uses the _shared variant. The only
difference between the two is that the _shared variant lacks some of the
protections against driver abuse of the API.
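
In code, the distinction reads like this (hypothetical drivers, sketch
only):

   /* Driver managing a single struct device: */
   iommu_attach_device(domain, dev);

   /* Driver spanning several struct devices on one shared domain: */
   iommu_attach_device_shared(domain, dev_a);
   iommu_attach_device_shared(domain, dev_b);   /* joins, refcounted */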

Nothing uses the group interface except for VFIO and stuff inside
drivers/iommu. VFIO has a uAPI tied to the group interface and it
is stuck with it.

> Otherwise just add the housekeeping stuff to iommu_{attach,detach}_group() -
> there's no way we want *three* attach/detach interfaces all with different
> semantics.

I'm not sure why you think 3 APIs is a bad thing. Three APIs, with
clearly intended purposes, are a lot better than one giant API with a
bunch of parameters that tries to do everything.

In this case, it is not simple to 'add the housekeeping' to
iommu_attach_group() in a way that is useful to both tegra and
VFIO. What tegra wants is what the _shared API implements, and that
logic should not be open coded in drivers.

VFIO does not want exactly that, it has its own logic to deal directly
with groups tied to its uAPI. Due to the uAPI it doesn't even have a
struct device, unfortunately.

The reason there are three APIs is because there are three different
use-cases. It is not a bad thing to have APIs designed for the use cases
they serve.

> It's worth taking a step back and realising that overall, this is really
> just a more generalised and finer-grained extension of what 426a273834ea
> already did for non-group-aware code, so it makes little sense *not* to
> integrate it into the existing interfaces.

This is taking 426a to its logical conclusion and *removing* the
group API from the drivers entirely. This is desirable because drivers
cannot do anything sane with the group.

The drivers have struct devices, and so we provide APIs that work in
terms of struct devices to cover both driver use cases today, and do
so more safely than what is already implemented.

Do not mix up VFIO with the driver interface, these are different
things. It is better VFIO stay on its own and not complicate the
driver world.

Jason
Baolu Lu Dec. 22, 2021, 4:22 a.m. UTC | #3
On 12/22/21 2:46 AM, Jason Gunthorpe wrote:
>> It's worth taking a step back and realising that overall, this is really
>> just a more generalised and finer-grained extension of what 426a273834ea
>> already did for non-group-aware code, so it makes little sense *not* to
>> integrate it into the existing interfaces.
> This is taking 426a to its logical conclusion and *removing* the
> group API from the drivers entirely. This is desirable because drivers
> cannot do anything sane with the group.
> 
> The drivers have struct devices, and so we provide APIs that work in
> terms of struct devices to cover both driver use cases today, and do
> so more safely than what is already implemented.
> 
> Do not mix up VFIO with the driver interface, these are different
> things. It is better VFIO stay on its own and not complicate the
> driver world.

Per Joerg's previous comments:

https://lore.kernel.org/linux-iommu/20211119150612.jhsvsbzisvux2lga@8bytes.org/

The commit 426a273834ea came only in order to disallow attaching a
single device within a group to a different iommu_domain. So it's
reasonable to improve the existing iommu_attach/detach_device() to cover
all cases. How about the code below? Did I miss anything?

int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
{
         struct iommu_group *group;
         int ret = 0;

         group = iommu_group_get(dev);
         if (!group)
                 return -ENODEV;

         mutex_lock(&group->mutex);
         if (group->attach_cnt) {
                 if (group->domain != domain) {
                         ret = -EBUSY;
                         goto unlock_out;
                 }
         } else {
                 ret = __iommu_attach_group(domain, group);
                 if (ret)
                         goto unlock_out;
         }

         group->attach_cnt++;
unlock_out:
         mutex_unlock(&group->mutex);
         iommu_group_put(group);

         return ret;
}
EXPORT_SYMBOL_GPL(iommu_attach_device);

void iommu_detach_device_shared(struct iommu_domain *domain, struct device *dev)
{
         struct iommu_group *group;

         group = iommu_group_get(dev);
         if (WARN_ON(!group))
                 return;

         mutex_lock(&group->mutex);
          if (WARN_ON(!group->attach_cnt || group->domain != domain))
                 goto unlock_out;

         if (--group->attach_cnt == 0)
                 __iommu_detach_group(domain, group);

unlock_out:
         mutex_unlock(&group->mutex);
         iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(iommu_detach_device);

Best regards,
baolu
Baolu Lu Dec. 22, 2021, 4:25 a.m. UTC | #4
On 12/22/21 12:22 PM, Lu Baolu wrote:
> void iommu_detach_device_shared(struct iommu_domain *domain, struct 
> device *dev)

Sorry for the typo. Please ignore the _shared postfix.

Best regards,
baolu
Robin Murphy Dec. 22, 2021, 8:26 p.m. UTC | #5
On 21/12/2021 6:46 pm, Jason Gunthorpe wrote:
> On Tue, Dec 21, 2021 at 04:50:56PM +0000, Robin Murphy wrote:
> 
>> this proposal is the worst of both worlds, in that drivers still have to be
>> just as aware of groups in order to know whether to call the _shared
>> interface or not, except it's now entirely implicit and non-obvious.
> 
> Drivers are not aware of groups, where did you see that?

`git grep iommu_attach_group -- :^drivers/iommu :^include`

Did I really have to explain that?

The drivers other than vfio_iommu_type1, however, do have a complete 
failure to handle, or even consider, any group that does not fit the 
particular set of assumptions they are making, but at least they only 
work in a context where that should not occur.

> Drivers have to indicate their intention, based entirely on their own
> internal design. If groups are present, or not is irrelevant to the
> driver.
> 
> If the driver uses a single struct device (which is most) then it uses
> iommu_attach_device().
> 
> If the driver uses multiple struct devices and intends to connect them
> all to the same domain then it uses the _shared variant. The only
> difference between the two is that the _shared variant lacks some of the
> protections against driver abuse of the API.

You've lost me again; how are those intentions any different? Attaching 
one device to a private domain is a literal subset of attaching more 
than one device to a private domain. There is no "abuse" of any API 
anywhere; the singleton group restriction exists as a protective measure 
because iommu_attach_device() was already in use before groups were 
really a thing, in contexts where groups happened to be singleton 
already, but anyone adding *new* uses in contexts where that assumption 
might *not* hold would be in trouble. Thus it enforces DMA ownership by 
the most trivial and heavy-handed means of simply preventing it ever 
becoming shared in the first place.

Yes, I'm using the term "DMA ownership" in a slightly different context 
to the one in which you originally proposed it. Please step out of the 
userspace-device-assignment-focused bubble for a moment and stay with me...

So then we have the iommu_attach_group() interface for new code (and 
still nobody has got round to updating the old code to it yet), for 
which the basic use-case is still fundamentally "I want to attach my 
thing to my domain", but at least now forcing explicit awareness that 
"my thing" could possibly be inextricably intertwined with more than 
just the one device they expect, so potential callers should have a good 
think about that. Unfortunately this leaves the matter of who "owns" the 
group entirely in the hands of those callers, which as we've now 
concluded is not great.

One of the main reasons for non-singleton groups to occur is due to ID 
aliasing or lack of isolation well beyond the scope and control of 
endpoint devices themselves, so it's not really fair to expect every 
IOMMU-aware driver to also be aware of that, have any idea of how to 
actually handle it, or especially try to negotiate with random other 
drivers as to whether it might be OK to take control of their DMA 
address space too. The whole point is that *every* domain attach really 
*has* to be considered "shared" because in general drivers can't know 
otherwise. Hence the easy, if crude, fix for the original API.

> Nothing uses the group interface except for VFIO and stuff inside
> drivers/iommu. VFIO has a uAPI tied to the group interface and it
> is stuck with it.

Self-contradiction is getting stronger, careful...
>> Otherwise just add the housekeeping stuff to iommu_{attach,detach}_group() -
>> there's no way we want *three* attach/detach interfaces all with different
>> semantics.
> 
> I'm not sure why you think 3 APIs is a bad thing. Three APIs, with
> clearly intended purposes, are a lot better than one giant API with a
> bunch of parameters that tries to do everything.

Because there's only one problem to solve! We have the original API 
which does happen to safely enforce ownership, but in an implicit way 
that doesn't scale; then we have the second API which got past the 
topology constraint but unfortunately turns out to just be unsafe in a 
slightly different way, and was supposed to replace the first one but 
hasn't, and is a bit clunky to boot; now you're proposing a third one 
which can correctly enforce safe ownership for any group topology, which 
is simply combining the good bits of the first two. It makes no sense to 
maintain two bad versions of a thing alongside one which works better.

I don't see why anything would be a giant API with a bunch of parameters 
- depending on how you look at it, this new proposal is basically either 
iommu_attach_device() with the ability to scale up to non-trivial groups 
properly, or iommu_attach_group() with a potentially better interface 
and actual safety. The former is still more prevalent (and the interface 
argument compelling), so if we put the new implementation behind that, 
with the one tweak of having it set DMA_OWNER_PRIVATE_DOMAIN 
automatically, kill off iommu_attach_group() by converting its couple of 
users, and not only have we solved the VFIO problem but we've also 
finally updated all the legacy code for free! Of course you can have a 
separate version for VFIO to attach with DMA_OWNER_PRIVATE_DOMAIN_USER 
if you like, although I still fail to understand the necessity of the 
distinction.

> In this case, it is not simple to 'add the housekeeping' to
> iommu_attach_group() in a way that is useful to both tegra and
> VFIO. What tegra wants is what the _shared API implements, and that
> logic should not be open coded in drivers.
> 
> VFIO does not want exactly that, it has its own logic to deal directly
> with groups tied to its uAPI. Due to the uAPI it doesn't even have a
> struct device, unfortunately.

Nope. VFIO has its own logic to deal with groups because it's the only 
thing that's ever actually tried dealing with groups correctly 
(unsurprisingly, given that it's where they came from), and every other 
private IOMMU domain user is just crippled or broken to some degree. All 
that proves is that we really should be policing groups better in the 
IOMMU core, per this series, because actually fixing all the other users 
to properly validate their device's group would be a ridiculous mess.

What VFIO wants is (conceptually[1]) "attach this device to my domain, 
provided it and any other devices in its group are managed by a driver I 
approve of." Surprise surprise, that's what any other driver wants as 
well! For iommu_attach_device() it was originally implicit, and is now 
further enforced by the singleton group restriction. For Tegra/host1x 
it's implicit in the complete obliviousness to the possibility of that 
not being the case.

Of course VFIO has a struct device if it needs one; it's trivial to 
resolve the member(s) of a group (and even more so once we can assume 
that a group may only ever contain mutually-compatible devices in the 
first place). How do you think vfio_bus_type() works?

VFIO will also need a struct device anyway, because once I get back from 
my holiday in the new year I need to start working with Simon on 
evolving the rest of the API away from bus->iommu_ops to dev->iommu so 
we can finally support IOMMU drivers coexisting[2].

> The reason there are three APIs is because there are three different
> use-cases. It is not bad thing to have APIs designed for the use cases
> they serve.

Indeed I agree with that second point, I'm just increasingly baffled how 
it's not clear to you that there is only one fundamental use-case here. 
Perhaps I'm too familiar with the history to objectively see how unclear 
the current state of things might be :/

>> It's worth taking a step back and realising that overall, this is really
>> just a more generalised and finer-grained extension of what 426a273834ea
>> already did for non-group-aware code, so it makes little sense *not* to
>> integrate it into the existing interfaces.
> 
> This is taking 426a to its logical conclusion and *removing* the
> group API from the drivers entirely. This is desirable because drivers
> cannot do anything sane with the group.

I am in complete agreement with that (to the point of also not liking 
patch #6).

> The drivers have struct devices, and so we provide APIs that work in
> terms of struct devices to cover both driver use cases today, and do
> so more safely than what is already implemented.

I am in complete agreement with that (given "both" of the supposed 3 
use-cases all being the same).

> Do not mix up VFIO with the driver interface, these are different
> things. It is better VFIO stay on its own and not complicate the
> driver world.

Nope, vfio_iommu_type1 is just a driver, calling the IOMMU API just like 
any other driver. I like the little bit where it passes itself to 
vfio_register_iommu_driver(), which I feel gets this across far more 
poetically than I can manage.

Thanks,
Robin.

[1] Yes, due to the UAPI it actually starts with the whole group rather 
than any particular device within it. Don't nitpick.
[2] 
https://lore.kernel.org/linux-iommu/2021052710373173260118@rock-chips.com/
Jason Gunthorpe Dec. 23, 2021, 12:57 a.m. UTC | #6
On Wed, Dec 22, 2021 at 08:26:34PM +0000, Robin Murphy wrote:
> On 21/12/2021 6:46 pm, Jason Gunthorpe wrote:
> > On Tue, Dec 21, 2021 at 04:50:56PM +0000, Robin Murphy wrote:
> > 
> > > this proposal is the worst of both worlds, in that drivers still have to be
> > > just as aware of groups in order to know whether to call the _shared
> > > interface or not, except it's now entirely implicit and non-obvious.
> > 
> > Drivers are not aware of groups, where did you see that?
> 
> `git grep iommu_attach_group -- :^drivers/iommu :^include`
> 
> Did I really have to explain that?

Well, yes you did, because it shows you haven't understood my
question. After this series we deleted all those calls (though Lu, we
missed one of the tegra ones in staging, let's get it for the next
posting)

So, after this series, where do you see drivers being aware of groups?
If things are missed let's expect to fix them.

> > If the driver uses multiple struct devices and intends to connect them
> > all to the same domain then it uses the _shared variant. The only
> > difference between the two is that the _shared variant lacks some of the
> > protections against driver abuse of the API.
> 
> You've lost me again; how are those intentions any different? Attaching one
> device to a private domain is a literal subset of attaching more than one
> device to a private domain. 

Yes it is a subset, but drivers will malfunction if they are not
designed to have multi-attachment and wrongly get it, and there is
only one driver that does actually need this.

I maintain a big driver subsystem and have learned that grepability of
the driver mess for special cases is quite a good thing to
have. Forcing drivers to mark in code when they do something weird is
an advantage, even if it causes some small API redundancy.

However, if you really feel strongly this should really be one API
with the _shared implementation I won't argue it any further.

> So then we have the iommu_attach_group() interface for new code (and still
> nobody has got round to updating the old code to it yet), for which
> the

This series is going in the direction of eliminating
iommu_attach_group() as part of the driver
interface. iommu_attach_group() is repurposed to only be useful for
VFIO.

> properly, or iommu_attach_group() with a potentially better interface and
> actual safety. The former is still more prevalent (and the interface
> argument compelling), so if we put the new implementation behind that, with
> the one tweak of having it set DMA_OWNER_PRIVATE_DOMAIN automatically, and
> kill off iommu_attach_group() by converting its couple of users,

This is what we did, iommu_attach_device() & _shared() are to be the
only interface for the drivers, and we killed off the
iommu_attach_group() couple of users except VFIO (the miss of
drivers/staging excepted)

> then not only have we solved the VFIO problem but we've also finally
> updated all the legacy code for free! Of course you can have a
> separate version for VFIO to attach with
> DMA_OWNER_PRIVATE_DOMAIN_USER if you like, although I still fail to
> understand the necessity of the distinction.

And the separate version for VFIO is called 'iommu_attach_group()'.

Lu, it is probably a good idea to add an assertion here that the group
is in DMA_OWNER_PRIVATE_DOMAIN_USER to make it clear that
iommu_attach_group() is only for VFIO.
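
Something like this at the top of iommu_attach_group(), perhaps (sketch
only, using the dma_owner field this series adds to the group):

	if (WARN_ON(group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER))
		return -EPERM;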

VFIO has a special requirement that it be able to do:

+       ret = iommu_group_set_dma_owner(group->iommu_group,
+                                       DMA_OWNER_PRIVATE_DOMAIN_USER, f.file);

Without having a iommu_domain to attach.

This is because of the giant special case that PPC made of VFIO's
IOMMU code. PPC (aka vfio_iommu_spapr_tce.c) requires the group
isolation that iommu_group_set_dma_owner() provides, but does not
actually have an iommu_domain and cannot/does not call
iommu_attach_group().

Fixing this is a whole other giant adventure I'm hoping David will
help me unwind next year.. 

This series solves this problem by using the two step sequence of
iommu_group_set_dma_owner()/iommu_attach_group() and conceptually
redefining how iommu_attach_group() works to require the external
caller to have done the iommu_group_set_dma_owner() for it. This is
why the series has three APIs, because the VFIO special one assumes
external iommu_group_set_dma_owner(). It just happens that is exactly
the same code as iommu_attach_group() today.

As for why does DMA_OWNER_PRIVATE_DOMAIN_USER exist? VFIO doesn't have
an iommu_domain at this point but it still needs the iommu core to
detach the default domain. This is what the _USER does.

Soo..

There is another way to organize this and perhaps it does make more
sense. I will try to sketch briefly in email, try to imagine the
gaps..

API family (== compares to this series):

   iommu_device_use_dma_api(dev);
     == iommu_device_set_dma_owner(dev, DMA_OWNER_DMA_API, NULL);

   iommu_group_set_dma_owner(group, file);
     == iommu_device_set_dma_owner(dev, DMA_OWNER_PRIVATE_DOMAIN_USER,
                                   file);
     Always detaches all domains from the group

   iommu_attach_device(domain, dev)
     == as is in this patch
     dev and domain are 1:1

   iommu_attach_device_shared(domain, dev)
     == as is in this patch
     dev and domain are N:1
     * could just be the same as iommu_attach_device

   iommu_replace_group_domain(group, old_domain, new_domain)
     Makes group point at new_domain. new_domain can be NULL.

   iommu_device_unuse_dma_api(dev)
    == iommu_device_release_dma_owner() in this patch

   iommu_group_release_dma_owner(group)
    == iommu_detach_group() && iommu_group_release_dma_owner()

VFIO would use the sequence:

   iommu_group_set_dma_owner(group, file);
   iommu_replace_group_domain(group, NULL, domain_1);
   iommu_replace_group_domain(group, domain_1, domain_2);
   iommu_group_release_dma_owner(group);

Simple devices would use

   iommu_attach_device(domain, dev);
   iommu_detach_device(domain, dev);

Tegra would use:

   iommu_attach_device_shared(domain, dev);
   iommu_detach_device_shared(domain, dev);
   // Or not, if people agree we should not mark this

DMA API would have the driver core dma_configure do:
   iommu_device_use_dma_api(dev);
   dev->driver->probe()
   iommu_device_unuse_dma_api(dev);

It is more APIs overall, but perhaps they have a much clearer
purpose. 
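
For illustration, iommu_replace_group_domain() from the sketch above
might look roughly like this (assuming group->mutex and the existing
__iommu_{attach,detach}_group() helpers; not code from the series):

int iommu_replace_group_domain(struct iommu_group *group,
			       struct iommu_domain *old_domain,
			       struct iommu_domain *new_domain)
{
	int ret = 0;

	mutex_lock(&group->mutex);
	/*
	 * Only an external owner may replace the domain, and the caller
	 * must name the domain it believes is currently attached.
	 */
	if (!group->owner_cnt || group->domain != old_domain) {
		ret = -EINVAL;
		goto unlock_out;
	}

	if (old_domain)
		__iommu_detach_group(old_domain, group);
	if (new_domain)
		ret = __iommu_attach_group(new_domain, group);

unlock_out:
	mutex_unlock(&group->mutex);
	return ret;
}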

I think it would be clear why iommu_group_set_dma_owner(), which
actually does detach, is not the same thing as iommu_attach_device().

I'm not sure if this entirely eliminates
DMA_OWNER_PRIVATE_DOMAIN_USER, or not, but at least it isn't in the
API.

Is it better?

> What VFIO wants is (conceptually[1]) "attach this device to my domain,
> provided it and any other devices in its group are managed by a driver I
> approve of." 

Yes, sure, "conceptually". But, there are troublesome details.

> VFIO will also need a struct device anyway, because once I get back from my
> holiday in the new year I need to start working with Simon on evolving the
> rest of the API away from bus->iommu_ops to dev->iommu so we can finally
> support IOMMU drivers coexisting[2].

For VFIO it would be much easier to get the ops from the struct
iommu_group (eg via iommu_group->default_domain->ops, or whatever).

> Indeed I agree with that second point, I'm just increasingly baffled how
> it's not clear to you that there is only one fundamental use-case here.
> Perhaps I'm too familiar with the history to objectively see how unclear the
> current state of things might be :/

I think it is because you are just not familiar with the dark corners
of VFIO. 

VFIO has a special case, I outlined above.

> > This is taking 426a to its logical conclusion and *removing* the
> > group API from the drivers entirely. This is desirable because drivers
> > cannot do anything sane with the group.
> 
> I am in complete agreement with that (to the point of also not liking patch
> #6).

Unfortunately patch #6 is only because of VFIO needing to use the
group as a handle.

Jason
Baolu Lu Dec. 23, 2021, 5:53 a.m. UTC | #7
Hi Robin and Jason,

On 12/23/21 8:57 AM, Jason Gunthorpe wrote:
> On Wed, Dec 22, 2021 at 08:26:34PM +0000, Robin Murphy wrote:
>> On 21/12/2021 6:46 pm, Jason Gunthorpe wrote:
>>> On Tue, Dec 21, 2021 at 04:50:56PM +0000, Robin Murphy wrote:
>>>
>>>> this proposal is the worst of both worlds, in that drivers still have to be
>>>> just as aware of groups in order to know whether to call the _shared
>>>> interface or not, except it's now entirely implicit and non-obvious.
>>>
>>> Drivers are not aware of groups, where did you see that?
>>
>> `git grep iommu_attach_group -- :^drivers/iommu :^include`
>>
>> Did I really have to explain that?
> 
> Well, yes you did, because it shows you haven't understood my
> question. After this series we deleted all those calls (though Lu, we
> missed one of the tegra ones in staging, let's get it for the next
> posting)

Yes, I will.

> 
> So, after this series, where do you see drivers being aware of groups?
> If things are missed let's expect to fix them.
> 
>>> If the driver uses multiple struct devices and intends to connect them
>>> all to the same domain then it uses the _shared variant. The only
>>> difference between the two is the _shared varient lacks some of the
>>> protections against driver abuse of the API.
>>
>> You've lost me again; how are those intentions any different? Attaching one
>> device to a private domain is a literal subset of attaching more than one
>> device to a private domain.
> 
> Yes it is a subset, but drivers will malfunction if they are not
> designed to have multi-attachment and wrongly get it, and there is
> only one driver that does actually need this.
> 
> I maintain a big driver subsystem and have learned that grepability of
> the driver mess for special cases is quite a good thing to
> have. Forcing drivers to mark in code when they do something weird is
> an advantage, even if it causes some small API redundancy.
> 
> However, if you really feel strongly this should really be one API
> with the _shared implementation I won't argue it any further.
> 
>> So then we have the iommu_attach_group() interface for new code (and still
>> nobody has got round to updating the old code to it yet), for which
>> the
> 
> This series is going in the direction of eliminating
> iommu_attach_group() as part of the driver
> interface. iommu_attach_group() is repurposed to only be useful for
> VFIO.

We can also remove iommu_attach_group() in VFIO because it is
essentially equivalent to

	iommu_group_for_each_dev(group, iommu_attach_device(dev))

> 
>> properly, or iommu_attach_group() with a potentially better interface and
>> actual safety. The former is still more prevalent (and the interface
>> argument compelling), so if we put the new implementation behind that, with
>> the one tweak of having it set DMA_OWNER_PRIVATE_DOMAIN automatically, and
>> kill off iommu_attach_group() by converting its couple of users,
> 
> This is what we did, iommu_attach_device() & _shared() are to be the
> only interface for the drivers, and we killed off the
> iommu_attach_group() couple of users except VFIO (the miss of
> drivers/staging excepted)
> 
>> then not only have we solved the VFIO problem but we've also finally
>> updated all the legacy code for free! Of course you can have a
>> separate version for VFIO to attach with
>> DMA_OWNER_PRIVATE_DOMAIN_USER if you like, although I still fail to
>> understand the necessity of the distinction.
> 
> And the separate version for VFIO is called 'iommu_attach_group()'.
> 
> Lu, it is probably a good idea to add an assertion here that the group
> is in DMA_OWNER_PRIVATE_DOMAIN_USER to make it clear that
> iommu_attach_group() is only for VFIO.
> 
> VFIO has a special requirement that it be able to do:
> 
> +       ret = iommu_group_set_dma_owner(group->iommu_group,
> +                                       DMA_OWNER_PRIVATE_DOMAIN_USER, f.file);
> 
> Without having a iommu_domain to attach.
> 
> This is because of the giant special case that PPC made of VFIO's
> IOMMU code. PPC (aka vfio_iommu_spapr_tce.c) requires the group
> isolation that iommu_group_set_dma_owner() provides, but does not
> actually have an iommu_domain and cannot/does not call
> iommu_attach_group().
> 
> Fixing this is a whole other giant adventure I'm hoping David will
> help me unwind next year..
> 
> This series solves this problem by using the two step sequence of
> iommu_group_set_dma_owner()/iommu_attach_group() and conceptually
> redefining how iommu_attach_group() works to require the external
> caller to have done the iommu_group_set_dma_owner() for it. This is
> why the series has three APIs, because the VFIO special one assumes
> external iommu_group_set_dma_owner(). It just happens that is exactly
> the same code as iommu_attach_group() today.
> 
> As for why does DMA_OWNER_PRIVATE_DOMAIN_USER exist? VFIO doesn't have
> an iommu_domain at this point but it still needs the iommu core to
> detach the default domain. This is what the _USER does.

There is also a contract that after the USER ownership is claimed the
device could be accessed by userspace through the MMIO registers. So,
a device could be accessible by userspace before a user-space I/O
address space is attached.

> 
> Soo..
> 
> There is another way to organize this and perhaps it does make more
> sense. I will try to sketch briefly in email, try to imagine the
> gaps..
> 
> API family (== compares to this series):
> 
>     iommu_device_use_dma_api(dev);
>       == iommu_device_set_dma_owner(dev, DMA_OWNER_DMA_API, NULL);
> 
>     iommu_group_set_dma_owner(group, file);
>       == iommu_device_set_dma_owner(dev, DMA_OWNER_PRIVATE_DOMAIN_USER,
>                                     file);
>       Always detaches all domains from the group

I hope we can drop all group variant APIs as we already have the per-
device interfaces: just iterate over all devices in the group and call the
device API.

> 
>     iommu_attach_device(domain, dev)
>       == as is in this patch
>       dev and domain are 1:1
> 
>     iommu_attach_device_shared(domain, dev)
>       == as is in this patch
>       dev and domain are N:1
>       * could just be the same as iommu_attach_device
> 
>     iommu_replace_group_domain(group, old_domain, new_domain)
>       Makes group point at new_domain. new_domain can be NULL.
> 
>     iommu_device_unuse_dma_api(dev)
>      == iommu_device_release_dma_owner() in this patch
> 
>     iommu_group_release_dma_owner(group)
>      == iommu_detach_group() && iommu_group_release_dma_owner()
> 
> VFIO would use the sequence:
> 
>     iommu_group_set_dma_owner(group, file);
>     iommu_replace_group_domain(group, NULL, domain_1);
>     iommu_replace_group_domain(group, domain_1, domain_2);
>     iommu_group_release_dma_owner(group);
> 
> Simple devices would use
> 
>     iommu_attach_device(domain, dev);
>     iommu_detach_device(domain, dev);
> 
> Tegra would use:
> 
>     iommu_attach_device_shared(domain, dev);
>     iommu_detach_device_shared(domain, dev);
>     // Or not, if people agree we should not mark this
> 
> DMA API would have the driver core dma_configure do:
>     iommu_device_use_dma_api(dev);
>     dev->driver->probe()
>     iommu_device_unuse_dma_api(dev);
> 
> It is more APIs overall, but perhaps they have a much clearer
> purpose.
> 
> I think it would be clear why iommu_group_set_dma_owner(), which
> actually does detach, is not the same thing as iommu_attach_device().

iommu_device_set_dma_owner() will eventually call
iommu_group_set_dma_owner(). I didn't get why
iommu_group_set_dma_owner() is special and needs to be kept.

> 
> I'm not sure if this entirely eliminates
> DMA_OWNER_PRIVATE_DOMAIN_USER, or not, but at least it isn't in the
> API.
> 
> Is it better?

Perhaps I've missed something, but I have a simpler idea. We only need
the interfaces below:

	iommu_device_set_dma_owner(dev, owner);
	iommu_device_release_dma_owner(dev, owner);
	iommu_attach_device(domain, dev, owner);
	iommu_detach_device(domain, dev);

All existing drivers calling iommu_attach_device() remain unchanged
since we already have singleton group enforcement. We only need to add
a default owner type.

For multiple-device groups, like drm/tegra, the drivers should claim the
PRIVATE_DOMAIN ownership and call iommu_attach_device(domain, dev,
PRIVATE_DOMAIN) explicitly.

The new iommu_attach_device(domain, dev, owner) is a mix of the existing
iommu_attach_device() and the new iommu_attach_device_shared(). That
means,
	if (group_is_singleton(group))
		__iommu_attach_device(domain, dev)
	else
		__iommu_attach_device_shared(domain, dev, owner)

The group variant interfaces will be deprecated and replaced with the
device ones.

Sorry if I missed anything.

> 
>> What VFIO wants is (conceptually[1]) "attach this device to my domain,
>> provided it and any other devices in its group are managed by a driver I
>> approve of."
> 
> Yes, sure, "conceptually". But, there are troublesome details.
> 
>> VFIO will also need a struct device anyway, because once I get back from my
>> holiday in the new year I need to start working with Simon on evolving the
>> rest of the API away from bus->iommu_ops to dev->iommu so we can finally
>> support IOMMU drivers coexisting[2].
> 
> For VFIO it would be much easier to get the ops from the struct
> iommu_group (eg via iommu_group->default_domain->ops, or whatever).
> 
>> Indeed I agree with that second point, I'm just increasingly baffled how
>> it's not clear to you that there is only one fundamental use-case here.
>> Perhaps I'm too familiar with the history to objectively see how unclear the
>> current state of things might be :/
> 
> I think it is because you are just not familiar with the dark corners
> of VFIO.
> 
> VFIO has a special case, I outlined above.
> 
>>> This is taking 426a to its logical conclusion and *removing* the
>>> group API from the drivers entirely. This is desirable because drivers
>>> cannot do anything sane with the group.
>>
>> I am in complete agreement with that (to the point of also not liking patch
>> #6).
> 
> Unfortunately patch #6 is only because of VFIO needing to use the
> group as a handle.
> 
> Jason
> 

Best regards,
baolu
Jason Gunthorpe Dec. 23, 2021, 2:03 p.m. UTC | #8
On Thu, Dec 23, 2021 at 01:53:24PM +0800, Lu Baolu wrote:

> > This series is going in the direction of eliminating
> > iommu_attach_group() as part of the driver
> > interface. iommu_attach_group() is repurposed to only be useful for
> > VFIO.
> 
> We can also remove iommu_attach_group() in VFIO because it is
> essentially equivalent to
> 
> 	iommu_group_for_each_dev(group, iommu_attach_device(dev))

Trying to do this would be subtly buggy; remember the group list is
dynamic, so when it is time to detach this won't reliably balance.

It is the same problem with randomly picking a device inside the group
as the group's 'handle'. There is no guarantee that will work. Only
devices from a driver should be used with the device API.

> > As for why does DMA_OWNER_PRIVATE_DOMAIN_USER exist? VFIO doesn't have
> > an iommu_domain at this point but it still needs the iommu core to
> > detach the default domain. This is what the _USER does.
> 
> There is also a contract that after the USER ownership is claimed the
> device could be accessed by userspace through the MMIO registers. So,
> a device could be accessible by userspace before a user-space I/O
> address space is attached.

If we had an IOMMU domain we could solve this by just assigning the
correct domain. The core issue that motivates USER is the lack of an
iommu_domain.


> > I think it would be clear why iommu_group_set_dma_owner(), which
> > actually does detach, is not the same thing as iommu_attach_device().
> 
> iommu_device_set_dma_owner() will eventually call
> iommu_group_set_dma_owner(). I didn't get why
> iommu_group_set_dma_owner() is special and needs to be kept.

Not quite, they would not call each other, they have different
implementations:

int iommu_device_use_dma_api(struct device *device)
{
	struct iommu_group *group = device->iommu_group;

	if (!group)
		return 0;

	mutex_lock(&group->mutex);
	if (group->owner_cnt != 0 ||
	    group->domain != group->default_domain) {
		mutex_unlock(&group->mutex);
		return -EBUSY;
	}
	group->owner_cnt = 1;
	group->owner = NULL;
	mutex_unlock(&group->mutex);
	return 0;
}

int iommu_group_set_dma_owner(struct iommu_group *group, struct file *owner)
{
	mutex_lock(&group->mutex);
	if (group->owner_cnt != 0) {
		if (group->owner != owner)
			goto err_unlock;
		group->owner_cnt++;
		mutex_unlock(&group->mutex);
		return 0;
	}
	if (group->domain && group->domain != group->default_domain)
		goto err_unlock;

	__iommu_detach_group(group->domain, group);
	group->owner_cnt = 1;
	group->owner = owner;
	mutex_unlock(&group->mutex);
	return 0;

err_unlock:
	mutex_unlock(&group->mutex);
	return -EBUSY;
}

It is the same as how we ended up putting the refcounting logic
directly into the iommu_attach_device().

See, we get rid of the enum as a multiplexor parameter; each API does
only what it needs, and they don't call each other.

We don't need _USER anymore because iommu_group_set_dma_owner() always
does detach, and iommu_replace_group_domain() avoids ever reassigning
default_domain. The special USER behavior falls out automatically.

Jason
Baolu Lu Dec. 24, 2021, 1:30 a.m. UTC | #9
Hi Jason,

On 12/23/21 10:03 PM, Jason Gunthorpe wrote:
>>> I think it would be clear why iommu_group_set_dma_owner(), which
>>> actually does detach, is not the same thing as iommu_attach_device().
>> iommu_device_set_dma_owner() will eventually call
>> iommu_group_set_dma_owner(). I didn't get why
>> iommu_group_set_dma_owner() is special and needs to be kept.
> Not quite, they would not call each other, they have different
> implementations:
> 
> int iommu_device_use_dma_api(struct device *device)
> {
> 	struct iommu_group *group = device->iommu_group;
> 
> 	if (!group)
> 		return 0;
> 
> 	mutex_lock(&group->mutex);
> 	if (group->owner_cnt != 0 ||
> 	    group->domain != group->default_domain) {
> 		mutex_unlock(&group->mutex);
> 		return -EBUSY;
> 	}
> 	group->owner_cnt = 1;
> 	group->owner = NULL;
> 	mutex_unlock(&group->mutex);
> 	return 0;
> }

It seems that this function doesn't work for multi-device groups. When
the user unbinds all native drivers from devices in the group and starts
to bind them to vfio-pci and assign them to userspace, how could the iommu
core know whether the group is viable for the user?

> 
> int iommu_group_set_dma_owner(struct iommu_group *group, struct file *owner)
> {
> 	mutex_lock(&group->mutex);
> 	if (group->owner_cnt != 0) {
> 		if (group->owner != owner)
> 			goto err_unlock;
> 		group->owner_cnt++;
> 		mutex_unlock(&group->mutex);
> 		return 0;
> 	}
> 	if (group->domain && group->domain != group->default_domain)
> 		goto err_unlock;
> 
> 	__iommu_detach_group(group->domain, group);
> 	group->owner_cnt = 1;
> 	group->owner = owner;
> 	mutex_unlock(&group->mutex);
> 	return 0;
> 
> err_unlock:
> 	mutex_unlock(&group->mutex);
> 	return -EBUSY;
> }
> 
> It is the same as how we ended up putting the refcounting logic
> directly into the iommu_attach_device().
> 
> See, we get rid of the enum as a multiplexor parameter; each API does
> only what it needs, and they don't call each other.

I like the idea of removing the enum parameter and making the API names
specific. But I didn't get why they can't call each other even though the
data in the group is the same.

> 
> We don't need _USER anymore because iommu_group_set_dma_owner() always
> does detach, and iommu_replace_group_domain() avoids ever reassigning
> default_domain. The special USER behavior falls out automatically.

This means we will grow more group-centric interfaces. My understanding
is the opposite: we should hide the concept of groups in the IOMMU
subsystem, and device drivers should only face device-specific interfaces.

The iommu groups are created by the iommu subsystem. The device drivers
don't play any role in determining which device belongs to which group.
So the iommu interfaces for device drivers shouldn't rely on the group.

Best regards,
baolu
Jason Gunthorpe Dec. 24, 2021, 2:50 a.m. UTC | #10
On Fri, Dec 24, 2021 at 09:30:17AM +0800, Lu Baolu wrote:
> Hi Jason,
> 
> On 12/23/21 10:03 PM, Jason Gunthorpe wrote:
> > > > I think it would be clear why iommu_group_set_dma_owner(), which
> > > > actually does detach, is not the same thing as iommu_attach_device().
> > > iommu_device_set_dma_owner() will eventually call
> > > iommu_group_set_dma_owner(). I didn't get why
> > > iommu_group_set_dma_owner() is special and needs to be kept.
> > Not quite, they would not call each other, they have different
> > implementations:
> > 
> > int iommu_device_use_dma_api(struct device *device)
> > {
> > 	struct iommu_group *group = device->iommu_group;
> > 
> > 	if (!group)
> > 		return 0;
> > 
> > 	mutex_lock(&group->mutex);
> > 	if (group->owner_cnt != 0 ||
> > 	    group->domain != group->default_domain) {
> > 		mutex_unlock(&group->mutex);
> > 		return -EBUSY;
> > 	}
> > 	group->owner_cnt = 1;
> > 	group->owner = NULL;
> > 	mutex_unlock(&group->mutex);
> > 	return 0;
> > }
> 
> It seems that this function doesn't work for multi-device groups. When
> the user unbinds all native drivers from devices in the group and starts
> to bind them to vfio-pci and assign them to userspace, how could the iommu
> core know whether the group is viable for the user?

It is just a mistake, I made this very fast. It should work as your
patch had it with a ++. More like this:

int iommu_device_use_dma_api(struct device *device)
{
	struct iommu_group *group = device->iommu_group;

	if (!group)
		return 0;

	mutex_lock(&group->mutex);
	if (group->owner_cnt != 0) {
		if (group->domain != group->default_domain ||
		    group->owner != NULL) {
			mutex_unlock(&group->mutex);
			return -EBUSY;
		}
	}
	group->owner_cnt++;
	mutex_unlock(&group->mutex);
	return 0;
}

> > See, we get rid of the enum as a multiplexor parameter; each API does
> > only what it needs, and they don't call each other.
> 
> I like the idea of removing the enum parameter and making the API names
> specific. But I didn't get why they can't call each other even though the
> data in the group is the same.

Well, I think when you type them out you'll find they don't work the
same. Ie the iommu_group_set_dma_owner() does __iommu_detach_group()
which iommu_device_use_dma_api() definately doesn't want to
do. iommu_device_use_dma_api() checks the domain while
iommu_group_set_dma_owner() must not.

This is basically the issue: all the places touching the owner count are
superficially the same, but each uses different predicates. Given the
predicate is more than half the code I wouldn't try to share the rest
of it. But maybe when it is all typed in something will become
obvious?

> > We don't need _USER anymore because iommu_group_set_dma_owner() always
> > does detach, and iommu_replace_group_domain() avoids ever reassigning
> > default_domain. The special USER behavior falls out automatically.
> 
> This means we will grow more group-centric interfaces. My understanding
> is the opposite: we should hide the concept of groups in the IOMMU
> subsystem, and device drivers should only face device-specific interfaces.

Ideally group interfaces would be reduced, but in this case VFIO needs
the group. It has sort of a fundamental problem with its uAPI that
expects the container is fully set up with a domain at the moment the
group is attached. So deferring domain setup to when the device is
available becomes a user visible artifact - and if this is important
or not is a whole research question that isn't really that important
for this series.

We also can't just pull a device out of thin air, a device that hasn't
been probed() hasn't even had dma_configure called! Let alone the
lifetime and locking problems with that kind of idea.

So.. leaving it as a group interface makes the most sense,
particularly for this series which is really about fixing the sharing
model in the iommu core and deleting the BUG_ONs. 

Also, I'm sitting here looking at Robin's idea that
iommu_attach_device() and iommu_attach_device_shared() should be the
same - and that does seem conceptually appealing, but not so simple.

The difference is that iommu_attach_device_shared() requires the
device_driver to have set suppress_auto_claim_dma_owner while
iommu_attach_device() does not (Lu, please do add a kdoc comment
documenting this, and maybe a WARN_ON check to enforce it).

Changing all 11 drivers using iommu_attach_device() to also set
suppress_auto_claim_dma_owner is something to do in another series,
merged properly through the driver trees, if it is done at all. So
this series needs to keep both APIs.

However, what we should be doing is fixing iommu_attach_device() to
rely on the owner_cnt, and not iommu_group_device_count().

Basically its logic should instead check for the owner_cnt == 1 and
then transform the group from a DMA_OWNER_DMA_API to a
DMA_OWNER_PRIVATE_DOMAIN. If we get rid of the enum then this happens
naturally by making group->domain != group->default_domain. All that
is missing is the owner_cnt == 1 check and some commentary.  Again
also with a WARN_ON and documentation that
suppress_auto_claim_dma_owner is not set. (TBH, I thought this was
discussed already, I haven't yet carefully checked v4..)

Then, we rely on iommu_device_use_dma_api() to block further users of
the group and remove the iommu_group_device_count() hack.
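
Concretely, the reworked check might look something like this (a sketch
of the logic described above, not code from the posted series):

	mutex_lock(&group->mutex);
	/*
	 * The driver core claimed DMA_OWNER_DMA_API ownership at probe
	 * time, so exactly one owner with the default domain still
	 * attached means this device's driver may convert the group to
	 * a private domain.
	 */
	if (group->owner_cnt != 1 ||
	    group->domain != group->default_domain) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = __iommu_attach_group(domain, group);
out_unlock:
	mutex_unlock(&group->mutex);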

Jason
Baolu Lu Dec. 24, 2021, 3:19 a.m. UTC | #11
On 12/23/21 4:26 AM, Robin Murphy wrote:
> On 21/12/2021 6:46 pm, Jason Gunthorpe wrote:
>> On Tue, Dec 21, 2021 at 04:50:56PM +0000, Robin Murphy wrote:
>>
>>> this proposal is the worst of both worlds, in that drivers still have 
>>> to be
>>> just as aware of groups in order to know whether to call the _shared
>>> interface or not, except it's now entirely implicit and non-obvious.
>>
>> Drivers are not aware of groups, where did you see that?
> 
> `git grep iommu_attach_group -- :^drivers/iommu :^include`
> 
> Did I really have to explain that?
> 
> The drivers other than vfio_iommu_type1, however, do have a complete 
> failure to handle, or even consider, any group that does not fit the 
> particular set of assumptions they are making, but at least they only 
> work in a context where that should not occur.
> 
>> Drivers have to indicate their intention, based entirely on their own
>> internal design. If groups are present, or not is irrelevant to the
>> driver.
>>
>> If the driver uses a single struct device (which is most) then it uses
>> iommu_attach_device().
>>
>> If the driver uses multiple struct devices and intends to connect them
>> all to the same domain then it uses the _shared variant. The only
>> difference between the two is the _shared varient lacks some of the
>> protections against driver abuse of the API.
> 
> You've lost me again; how are those intentions any different? Attaching 
> one device to a private domain is a literal subset of attaching more 
> than one device to a private domain. There is no "abuse" of any API 
> anywhere; the singleton group restriction exists as a protective measure 
> because iommu_attach_device() was already in use before groups were 
> really a thing, in contexts where groups happened to be singleton 
> already, but anyone adding *new* uses in contexts where that assumption 
> might *not* hold would be in trouble. Thus it enforces DMA ownership by 
> the most trivial and heavy-handed means of simply preventing it ever 
> becoming shared in the first place.
> 
> Yes, I'm using the term "DMA ownership" in a slightly different context 
> to the one in which you originally proposed it. Please step out of the 
> userspace-device-assignment-focused bubble for a moment and stay with me...
> 
> So then we have the iommu_attach_group() interface for new code (and 
> still nobody has got round to updating the old code to it yet), for 
> which the basic use-case is still fundamentally "I want to attach my 
> thing to my domain", but at least now forcing explicit awareness that 
> "my thing" could possibly be inextricably intertwined with more than 
> just the one device they expect, so potential callers should have a good 
> think about that. Unfortunately this leaves the matter of who "owns" the 
> group entirely in the hands of those callers, which as we've now 
> concluded is not great.
> 
> One of the main reasons for non-singleton groups to occur is due to ID 
> aliasing or lack of isolation well beyond the scope and control of 
> endpoint devices themselves, so it's not really fair to expect every 
> IOMMU-aware driver to also be aware of that, have any idea of how to 
> actually handle it, or especially try to negotiate with random other 
> drivers as to whether it might be OK to take control of their DMA 
> address space too. The whole point is that *every* domain attach really 
> *has* to be considered "shared" because in general drivers can't know 
> otherwise. Hence the easy, if crude, fix for the original API.
> 
>> Nothing uses the group interface except for VFIO and stuff inside
>> drivers/iommu. VFIO has a uAPI tied to the group interface and it
>> is stuck with it.
> 
> Self-contradiction is getting stronger, careful...
>>> Otherwise just add the housekeeping stuff to 
>>> iommu_{attach,detach}_group() -
>>> there's no way we want *three* attach/detach interfaces all with 
>>> different
>>> semantics.
>>
>> I'm not sure why you think 3 APIs is bad thing. Threes APIs, with
>> clearly intended purposes is a lot better than one giant API with a
>> bunch of parameters that tries to do everything.
> 
> Because there's only one problem to solve! We have the original API 
> which does happen to safely enforce ownership, but in an implicit way 
> that doesn't scale; then we have the second API which got past the 
> topology constraint but unfortunately turns out to just be unsafe in a 
> slightly different way, and was supposed to replace the first one but 
> hasn't, and is a bit clunky to boot; now you're proposing a third one 
> which can correctly enforce safe ownership for any group topology, which 
> is simply combining the good bits of the first two. It makes no sense to 
> maintain two bad versions of a thing alongside one which works better.
> 
> I don't see why anything would be a giant API with a bunch of parameters 
> - depending on how you look at it, this new proposal is basically either 
> iommu_attach_device() with the ability to scale up to non-trivial groups 
> properly, or iommu_attach_group() with a potentially better interface 
> and actual safety. The former is still more prevalent (and the interface 
> argument compelling), so if we put the new implementation behind that, 
> with the one tweak of having it set DMA_OWNER_PRIVATE_DOMAIN 
> automatically, and kill off iommu_attach_group() by converting its couple 
> of users, then not only have we solved the VFIO problem but we've also 
> finally updated all the legacy code for free! Of course you can have a 
> separate version for VFIO to attach with DMA_OWNER_PRIVATE_DOMAIN_USER 
> if you like, although I still fail to understand the necessity of the 
> distinction.
> 
>> In this case, it is not simple to 'add the housekeeping' to
>> iommu_attach_group() in a way that is useful to both tegra and
>> VFIO. What tegra wants is what the _shared API implements, and that
>> logic should not be open coded in drivers.
>>
>> VFIO does not want exactly that, it has its own logic to deal directly
>> with groups tied to its uAPI. Due to the uAPI it doesn't even have a
>> struct device, unfortunately.
> 
> Nope. VFIO has its own logic to deal with groups because it's the only 
> thing that's ever actually tried dealing with groups correctly 
> (unsurprisingly, given that it's where they came from), and every other 
> private IOMMU domain user is just crippled or broken to some degree. All 
> that proves is that we really should be policing groups better in the 
> IOMMU core, per this series, because actually fixing all the other users 
> to properly validate their device's group would be a ridiculous mess.
> 
> What VFIO wants is (conceptually[1]) "attach this device to my domain, 
> provided it and any other devices in its group are managed by a driver I 
> approve of." Surprise surprise, that's what any other driver wants as 
> well! For iommu_attach_device() it was originally implicit, and is now 
> further enforced by the singleton group restriction. For Tegra/host1x 
> it's implicit in the complete obliviousness to the possibility of that 
> not being the case.
> 
> Of course VFIO has a struct device if it needs one; it's trivial to 
> resolve the member(s) of a group (and even more so once we can assume 
> that a group may only ever contain mutually-compatible devices in the 
> first place). How do you think vfio_bus_type() works?
> 
> VFIO will also need a struct device anyway, because once I get back from 
> my holiday in the new year I need to start working with Simon on 
> evolving the rest of the API away from bus->iommu_ops to dev->iommu so 
> we can finally support IOMMU drivers coexisting[2].
> 
>> The reason there are three APIs is that there are three different
>> use-cases. It is not a bad thing to have APIs designed for the use cases
>> they serve.
> 
> Indeed I agree with that second point, I'm just increasingly baffled how 
> it's not clear to you that there is only one fundamental use-case here. 
> Perhaps I'm too familiar with the history to objectively see how unclear 
> the current state of things might be :/
> 
>>> It's worth taking a step back and realising that overall, this is really
>>> just a more generalised and finer-grained extension of what 426a273834ea
>>> already did for non-group-aware code, so it makes little sense *not* to
>>> integrate it into the existing interfaces.
>>
>> This is taking 426a to its logical conclusion and *removing* the
>> group API from the drivers entirely. This is desirable because drivers
>> cannot do anything sane with the group.
> 
> I am in complete agreement with that (to the point of also not liking 
> patch #6).
> 
>> The drivers have struct devices, and so we provide APIs that work in
>> terms of struct devices to cover both driver use cases today, and do
>> so more safely than what is already implemented.
> 
> I am in complete agreement with that (given "both" of the supposed 3 
> use-cases all being the same).
> 
>> Do not mix up VFIO with the driver interface, these are different
>> things. It is better VFIO stay on its own and not complicate the
>> driver world.
> 
> Nope, vfio_iommu_type1 is just a driver, calling the IOMMU API just like 
> any other driver. I like the little bit where it passes itself to 
> vfio_register_iommu_driver(), which I feel gets this across far more 
> poetically than I can manage.
> 
> Thanks,
> Robin.
> 
> [1] Yes, due to the UAPI it actually starts with the whole group rather 
> than any particular device within it. Don't nitpick.
> [2] 
> https://lore.kernel.org/linux-iommu/2021052710373173260118@rock-chips.com/

Let me summarize what I've got from the above comments.

1. Essentially we only need the interfaces below for device drivers to
    manage I/O address conflicts in the iommu layer:

int iommu_device_set/release/query_kernel_dma(struct device *dev)

- Device driver lets the iommu layer know that the driver's DMAs go
   through the kernel DMA APIs. The iommu layer should use the default
   domain for DMA remapping. No other domain can be attached.
- Device driver lets the iommu layer know that the driver doesn't do
   DMA anymore and other domains are allowed to be attached.
- Device driver queries "must I do all DMA through the kernel DMA API,
   or am I allowed to attach my own domain?"


int iommu_device_set/release_private_dma(struct device *dev)

- Device driver lets the iommu layer know that it wants to use its own
   iommu domain. The iommu layer should detach the default domain and
   allow the driver to attach or detach its own domain through
   iommu_attach/detach_device() interfaces.
- Device driver lets the iommu layer know that it no longer needs a
   private domain.
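
Spelled out, the prototypes implied above would look roughly like the
sketch below (a sketch only; all the names are illustrative, nothing is
final):

   int iommu_device_set_kernel_dma(struct device *dev);      /* kernel DMA API only */
   int iommu_device_release_kernel_dma(struct device *dev);  /* drop that claim */
   int iommu_device_query_kernel_dma(struct device *dev);    /* may I attach my own domain? */

   int iommu_device_set_private_dma(struct device *dev);     /* detach the default domain */
   int iommu_device_release_private_dma(struct device *dev); /* restore the default domain */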

2. iommu_attach_group() vs. iommu_attach_device()

   [HISTORY]
   The iommu_attach_device() interface was first added by commit <fc2100eb4d096> ("add
   frontend implementation for the IOMMU API") in 2008. At that time,
   there was no concept of iommu group yet.

   The iommu group was added by commit <d72e31c937462> ("iommu: IOMMU
   Groups") four years later in 2012. The iommu_attach_group() was added
   at the same time.

   Then, people realized that iommu_attach_device() allowed different
   devices in the same group to attach different domains. This was not
   in line with the concept of the iommu group. The commit <426a273834eae>
   ("iommu: Limit iommu_attach/detach_device to device with their own
   group") fixed this problem in 2015.

   [REALITY]
   We have two coexisting interfaces for device drivers to do the same
   thing. But neither is perfect:

   - iommu_attach_device() only works for singleton groups.
   - iommu_attach_group() asks the device drivers to handle iommu-group-
     related stuff, which is beyond the role of a device driver.

   [FUTURE]
   From the perspective of a device driver, its motivation is
   very simple: "I want to manage my own I/O address space. The kernel
   DMA API is not suitable for me because it hides the I/O address space
   details in the lower layer which is transparent to me."

   We consider heading in this direction:

   Make iommu_attach_device() the single, generic interface for device
   drivers to use their own private domain (I/O address space), replace
   all iommu_attach_group() uses with iommu_attach_device(), and
   deprecate iommu_attach_group().
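
   To illustrate, the end state for a driver would be something like the
   sketch below (mydrv_setup_iova() is a made-up helper, error handling
   is trimmed, and nothing beyond today's iommu_domain_alloc()/iommu_map()
   interfaces is assumed):

   #include <linux/iommu.h>

   static int mydrv_setup_iova(struct device *dev, unsigned long iova,
                               phys_addr_t paddr, size_t size)
   {
           struct iommu_domain *domain;
           int ret;

           /* A private I/O address space, fully managed by this driver. */
           domain = iommu_domain_alloc(dev->bus);
           if (!domain)
                   return -ENOMEM;

           /* The one generic attach interface proposed above. */
           ret = iommu_attach_device(domain, dev);
           if (ret)
                   goto err_free;

           /* The driver, not the kernel DMA API, decides the IOVA layout. */
           ret = iommu_map(domain, iova, paddr, size,
                           IOMMU_READ | IOMMU_WRITE);
           if (ret)
                   goto err_detach;

           return 0;

   err_detach:
           iommu_detach_device(domain, dev);
   err_free:
           iommu_domain_free(domain);
           return ret;
   }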

That's all. Did I miss or misunderstand anything?

Best regards,
baolu
Baolu Lu Dec. 24, 2021, 6:44 a.m. UTC | #12
Hi Jason,

On 2021/12/24 10:50, Jason Gunthorpe wrote:
> On Fri, Dec 24, 2021 at 09:30:17AM +0800, Lu Baolu wrote:
>> Hi Jason,
>>
>> On 12/23/21 10:03 PM, Jason Gunthorpe wrote:
>>>>> I think it would be clear why iommu_group_set_dma_owner(), which
>>>>> actually does detach, is not the same thing as iommu_attach_device().
>>>> iommu_device_set_dma_owner() will eventually call
>>>> iommu_group_set_dma_owner(). I didn't get why
>>>> iommu_group_set_dma_owner() is special and needs to be kept.
>>> Not quite, they would not call each other, they have different
>>> implementations:
>>>
>>> int iommu_device_use_dma_api(struct device *device)
>>> {
>>> 	struct iommu_group *group = device->iommu_group;
>>>
>>> 	if (!group)
>>> 		return 0;
>>>
>>> 	mutex_lock(&group->mutex);
>>> 	if (group->owner_cnt != 0 ||
>>> 	    group->domain != group->default_domain) {
>>> 		mutex_unlock(&group->mutex);
>>> 		return -EBUSY;
>>> 	}
>>> 	group->owner_cnt = 1;
>>> 	group->owner = NULL;
>>> 	mutex_unlock(&group->mutex);
>>> 	return 0;
>>> }
>> It seems that this function doesn't work for multi-device groups. When
>> the user unbinds all native drivers from devices in the group and starts
>> to bind them to vfio-pci and assign them to userspace, how could the
>> iommu layer know whether the group is viable for userspace?
> It is just a mistake; I made this very fast. It should work as your
> patch had it with a ++. More like this:
> 
> int iommu_device_use_dma_api(struct device *device)
> {
> 	struct iommu_group *group = device->iommu_group;
> 
> 	if (!group)
> 		return 0;
> 
> 	mutex_lock(&group->mutex);
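> 	/* Tolerate existing claims only for kernel DMA on the default domain */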
> 	if (group->owner_cnt != 0) {
> 		if (group->domain != group->default_domain ||
> 		    group->owner != NULL) {
> 			mutex_unlock(&group->mutex);
> 			return -EBUSY;
> 		}
> 	}
> 	group->owner_cnt++;
> 	mutex_unlock(&group->mutex);
> 	return 0;
> }
> 
>>> See, we get rid of the enum as a multiplexor parameter; each API does
>>> only what it needs, and they don't call each other.
>> I like the idea of removing enum parameter and make the API name
>> specific. But I didn't get why they can't call each other even though
>> the data in the group is the same.
> Well, I think when you type them out you'll find they don't work the
> same. Ie the iommu_group_set_dma_owner() does __iommu_detach_group()
> which iommu_device_use_dma_api() definitely doesn't want to
> do. iommu_device_use_dma_api() checks the domain while
> iommu_group_set_dma_owner() must not.
> 
> This is basically the issue: all the places touching the owner count are
> superficially the same, but each uses different predicates. Given the
> predicate is more than half the code I wouldn't try to share the rest
> of it. But maybe when it is all typed in something will become
> obvious?
> 

Got it, and I agree with you. For the remaining comments, let me wait
and listen to what Robin comments.

Best regards,
baolu
Jason Gunthorpe Dec. 24, 2021, 2:24 p.m. UTC | #13
On Fri, Dec 24, 2021 at 11:19:44AM +0800, Lu Baolu wrote:

> Let me summarize what I've got from the above comments.
> 
> 1. Essentially we only need the interfaces below for device drivers to
>    manage I/O address conflicts in the iommu layer:
> 
> int iommu_device_set/release/query_kernel_dma(struct device *dev)
> 
> - Device driver lets the iommu layer know that the driver's DMAs go
>   through the kernel DMA APIs. The iommu layer should use the default
>   domain for DMA remapping. No other domain can be attached.
> - Device driver lets the iommu layer know that the driver doesn't do
>   DMA anymore and other domains are allowed to be attached.
> - Device driver queries "must I do all DMA through the kernel DMA API,
>   or am I allowed to attach my own domain?"

I'm not sure I see the utility of a query, but OK - this is the API
family v4 has added to really_probe, basically.
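
Roughly, and only as a sketch reusing the names from this thread (the
actual v4 patches are authoritative), that hook amounts to:

static int really_probe(struct device *dev, struct device_driver *drv)
{
	int ret;

	/* By default, assume a bound driver does DMA via the kernel DMA API */
	if (!drv->suppress_auto_claim_dma_owner) {
		ret = iommu_device_use_dma_api(dev);
		if (ret)
			return ret;
	}

	/* ... the normal probe path continues here ... */
	return 0;
}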

> int iommu_device_set/release_private_dma(struct device *dev)
> 
> - Device driver lets the iommu layer know that it wants to use its own
>   iommu domain. The iommu layer should detach the default domain and
>   allow the driver to attach or detach its own domain through
>   iommu_attach/detach_device() interfaces.
> - Device driver lets the iommu layer know that it no longer needs a
>   private domain.

Drivers don't actually need an interface like this; they all have
domains, so they can all present their domain when they want to change
the ownership mode.

The advantage of presenting the domain in the API is that it allows
the core code to support sharing. Present the same domain and your
device gets to join the group. Present a different domain and it is
rejected. Simple.
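
In code form the rule is tiny - condensed from iommu_attach_device_shared()
in the patch below:

	if (group->attach_cnt) {
		if (group->domain != domain)	/* different domain: rejected */
			return -EBUSY;
	} else {
		ret = __iommu_attach_group(domain, group);	/* first attach wins */
	}
	group->attach_cnt++;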

Since there is no domain, the above APIs cannot support tegra, for
instance.

>   Make iommu_attach_device() the single, generic interface for device
>   drivers to use their own private domain (I/O address space), replace
>   all iommu_attach_group() uses with iommu_attach_device(), and
>   deprecate iommu_attach_group().

Certainly in the device drivers, yes; VFIO should stay with the group
interface as I've explained.

Ideals aside, we still need this series to have a scope that
is achievable at a reasonable size. So, we still end up with three
interfaces:

 1) iommu_attach_device() as used by the 11 current drivers that do
    not set suppress_auto_claim_dma_owner.
    Its key property is that it is API compatible with what we have
    today and doesn't require changing the 11 drivers.

 2) iommu_attach_device_shared() which is used by tegra and requires
    that drivers set suppress_auto_claim_dma_owner.

    A followup series could replace all calls of iommu_attach_device()
    with iommu_attach_device_shared() with one patch per driver that
    also sets suppress_auto_claim_dma_owner.

 3) Unless a better idea arises, the
    iommu_group_set_dma_owner()/iommu_replace_group_domain()
    API that I suggested, used only by VFIO. This API is designed to
    work without a domain and uses the 'struct file *owner' instead
    of the domain to permit sharing. It swaps the obviously confusing
    concept of _USER for the more general concept of 'replace domain'.
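
    Going by that description, the signatures for (3) would be along
    the lines of (speculative, to be settled when the code is written):

	int iommu_group_set_dma_owner(struct iommu_group *group,
				      struct file *owner);
	void iommu_group_release_dma_owner(struct iommu_group *group);
	int iommu_replace_group_domain(struct iommu_group *group,
				       struct iommu_domain *new_domain);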

All three need to consistently use the owner_cnt and related fields to
implement their internal logic.

That is a pretty clear explanation of why there are three interfaces.

Jason
Baolu Lu Jan. 4, 2022, 1:53 a.m. UTC | #14
On 12/24/21 10:50 AM, Jason Gunthorpe wrote:
>>> We don't need _USER anymore because iommu_group_set_dma_owner() always
>>> does detach, and iommu_replace_group_domain() avoids ever reassigning
>>> default_domain. The special USER behavior falls out automatically.
>> This means we will grow more group-centric interfaces. My understanding
>> is the opposite: we should hide the concept of the group in the IOMMU
>> subsystem, and the device drivers should only face device-specific
>> interfaces.
> Ideally group interfaces would be reduced, but in this case VFIO needs
> the group. It has sort of a fundamental problem with its uAPI that
> expects the container to be fully set up with a domain at the moment the
> group is attached. So deferring domain setup to when the device is
> available becomes a user-visible artifact - and whether this is important
> or not is a whole research question that isn't really that important
> for this series.
> 
> We also can't just pull a device out of thin air; a device that hasn't
> been probed() hasn't even had dma_configure called! Let alone the
> lifetime and locking problems with that kind of idea.
> 
> So.. leaving it as a group interface makes the most sense,
> particularly for this series which is really about fixing the sharing
> model in the iommu core and deleting the BUG_ONs.

I feel it makes more sense if we move the attach_device/group
refactoring patches into a separate series. I will come up with this
new series so that people can review and comment on the real code.

Best regards,
baolu
diff mbox series

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5ad4cf13370d..1bc03118dfb3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -703,6 +703,8 @@  int iommu_group_set_dma_owner(struct iommu_group *group, enum iommu_dma_owner ow
 			      void *owner_cookie);
 void iommu_group_release_dma_owner(struct iommu_group *group, enum iommu_dma_owner owner);
 bool iommu_group_dma_owner_unclaimed(struct iommu_group *group);
+int iommu_attach_device_shared(struct iommu_domain *domain, struct device *dev);
+void iommu_detach_device_shared(struct iommu_domain *domain, struct device *dev);
 
 #else /* CONFIG_IOMMU_API */
 
@@ -743,11 +745,22 @@  static inline int iommu_attach_device(struct iommu_domain *domain,
 	return -ENODEV;
 }
 
+static inline int iommu_attach_device_shared(struct iommu_domain *domain,
+					     struct device *dev)
+{
+	return -ENODEV;
+}
+
 static inline void iommu_detach_device(struct iommu_domain *domain,
 				       struct device *dev)
 {
 }
 
+static inline void iommu_detach_device_shared(struct iommu_domain *domain,
+					      struct device *dev)
+{
+}
+
 static inline struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
 {
 	return NULL;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 8bec71b1cc18..3ad66cb9bedc 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -50,6 +50,7 @@  struct iommu_group {
 	struct list_head entry;
 	enum iommu_dma_owner dma_owner;
 	unsigned int owner_cnt;
+	unsigned int attach_cnt;
 	void *owner_cookie;
 };
 
@@ -3512,3 +3513,81 @@  void iommu_device_release_dma_owner(struct device *dev, enum iommu_dma_owner own
 	iommu_group_put(group);
 }
 EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner);
+
+/**
+ * iommu_attach_device_shared() - Attach shared domain to a device
+ * @domain: The shared domain.
+ * @dev: The device.
+ *
+ * Similar to iommu_attach_device(), but allowed for shared-group devices
+ * and guarantees that all devices in an iommu group can only be attached
+ * by the same iommu domain. The caller should explicitly claim DMA ownership
+ * of type DMA_OWNER_PRIVATE_DOMAIN or DMA_OWNER_PRIVATE_DOMAIN_USER before
+ * calling it and use the paired helper iommu_detach_device_shared() for
+ * cleanup.
+ */
+int iommu_attach_device_shared(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+	int ret = 0;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return -ENODEV;
+
+	mutex_lock(&group->mutex);
+	if (group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN &&
+	    group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER) {
+		ret = -EPERM;
+		goto unlock_out;
+	}
+
+	if (group->attach_cnt) {
+		if (group->domain != domain) {
+			ret = -EBUSY;
+			goto unlock_out;
+		}
+	} else {
+		ret = __iommu_attach_group(domain, group);
+		if (ret)
+			goto unlock_out;
+	}
+
+	group->attach_cnt++;
+unlock_out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_attach_device_shared);
+
+/**
+ * iommu_detach_device_shared() - Detach a domain from device
+ * @domain: The domain.
+ * @dev: The device.
+ *
+ * The detach helper paired with iommu_attach_device_shared().
+ */
+void iommu_detach_device_shared(struct iommu_domain *domain, struct device *dev)
+{
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return;
+
+	mutex_lock(&group->mutex);
+	if (WARN_ON(!group->attach_cnt || group->domain != domain ||
+		    (group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN &&
+		     group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER)))
+		goto unlock_out;
+
+	if (--group->attach_cnt == 0)
+		__iommu_detach_group(domain, group);
+
+unlock_out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_device_shared);
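
For illustration, a minimal usage sketch for a driver spanning two devices
in one group (dev_a/dev_b are hypothetical, and the DMA_OWNER_PRIVATE_DOMAIN
claim required by the kernel-doc above is assumed to be held already):

	/* The first caller attaches the whole group to the domain. */
	ret = iommu_attach_device_shared(domain, dev_a);
	if (ret)
		return ret;

	/* Later callers presenting the same domain just join (attach_cnt++). */
	ret = iommu_attach_device_shared(domain, dev_b);
	if (ret) {
		iommu_detach_device_shared(domain, dev_a);
		return ret;
	}

	/* ... DMA ... */

	/* The domain is detached from the group when the last user leaves. */
	iommu_detach_device_shared(domain, dev_b);
	iommu_detach_device_shared(domain, dev_a);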