diff mbox series

[v7,03/19] iommufd: Replace the hwpt->devices list with iommufd_group

Message ID 3-v7-6c0fd698eda2+5e3-iommufd_alloc_jgg@nvidia.com (mailing list archive)
State New
Headers show
Series Add iommufd physical device operations for replace and alloc hwpt | expand

Commit Message

Jason Gunthorpe May 15, 2023, 2 p.m. UTC
The devices list was used as a simple way to avoid having per-group
information. Now that this seems to be unavoidable, just commit to
per-group information fully and remove the devices list from the HWPT.

The iommufd_group stores the currently assigned HWPT for the entire group
and we can manage the per-device attach/detach with a list in the
iommufd_group.

For destruction the flow is organized to make the following patches
easier, the actual call to iommufd_object_destroy_user() is done at the
top of the call chain without holding any locks. The HWPT to be destroyed
is returned out from the locked region to make this possible. Later
patches create locking that requires this.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c          | 100 +++++++++++-------------
 drivers/iommu/iommufd/hw_pagetable.c    |  22 +-----
 drivers/iommu/iommufd/iommufd_private.h |  13 ++-
 3 files changed, 54 insertions(+), 81 deletions(-)

Comments

Baolu Lu May 16, 2023, 3 a.m. UTC | #1
On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
> The devices list was used as a simple way to avoid having per-group
> information. Now that this seems to be unavoidable, just commit to
> per-group information fully and remove the devices list from the HWPT.
> 
> The iommufd_group stores the currently assigned HWPT for the entire group
> and we can manage the per-device attach/detach with a list in the
> iommufd_group.

I am preparing the patches to route I/O page faults to user space
through iommufd. The iommufd page fault handler knows the hwpt and the
device pointer, but it needs to convert the device pointer into its
iommufd object id and pass the id to user space.

It's fine that we remove the hwpt->devices here, but perhaps I need to
add the context pointer in ioas later,

struct iommufd_ioas {
         struct io_pagetable iopt;
         struct mutex mutex;
         struct list_head hwpt_list;
+       struct iommufd_ctx *ictx;
  };

and, use below helper to look up the device id.

+u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
+{
+       struct iommu_group *group = iommu_group_get(dev);
+       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
+       struct iommufd_group *igroup;
+       struct iommufd_device *cur;
+       unsigned int id;
+
+       if (!group)
+               return IOMMUFD_INVALID_OBJ_ID;
+
+       id = iommu_group_id(group);
+       xa_lock(&ictx->groups);
+       igroup = xa_load(&ictx->groups, id);
+       if (!iommufd_group_try_get(igroup, group)) {
+               xa_unlock(&ictx->groups);
+               iommu_group_put(group);
+               return IOMMUFD_INVALID_OBJ_ID;
+        }
+        xa_unlock(&ictx->groups);
+
+       mutex_lock(&igroup->lock);
+       list_for_each_entry(cur, &igroup->device_list, group_item) {
+               if (cur->dev == dev) {
+                       dev_id = cur->obj.id;
+                       break;
+               }
+       }
+       mutex_unlock(&igroup->lock);
+
+       iommufd_put_group(igroup);
+       iommu_group_put(group);
+
+       return dev_id;
+}

and, use it like this in the fault handler:

        dev_id = iommufd_get_device_id(hwpt->ioas->ictx, dev);
+       if (dev_id == IOMMUFD_INVALID_OBJ_ID)
+               return IOMMU_PAGE_RESP_FAILURE;

Will this look good to you?

> For destruction the flow is organized to make the following patches
> easier, the actual call to iommufd_object_destroy_user() is done at the
> top of the call chain without holding any locks. The HWPT to be destroyed
> is returned out from the locked region to make this possible. Later
> patches create locking that requires this.
> 
> Reviewed-by: Lu Baolu<baolu.lu@linux.intel.com>
> Reviewed-by: Kevin Tian<kevin.tian@intel.com>
> Signed-off-by: Jason Gunthorpe<jgg@nvidia.com>
> ---
>   drivers/iommu/iommufd/device.c          | 100 +++++++++++-------------
>   drivers/iommu/iommufd/hw_pagetable.c    |  22 +-----
>   drivers/iommu/iommufd/iommufd_private.h |  13 ++-
>   3 files changed, 54 insertions(+), 81 deletions(-)

Best regards,
baolu
Jason Gunthorpe May 16, 2023, 12:27 p.m. UTC | #2
On Tue, May 16, 2023 at 11:00:16AM +0800, Baolu Lu wrote:
> On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
> > The devices list was used as a simple way to avoid having per-group
> > information. Now that this seems to be unavoidable, just commit to
> > per-group information fully and remove the devices list from the HWPT.
> > 
> > The iommufd_group stores the currently assigned HWPT for the entire group
> > and we can manage the per-device attach/detach with a list in the
> > iommufd_group.
> 
> I am preparing the patches to route I/O page faults to user space
> through iommufd. The iommufd page fault handler knows the hwpt and the
> device pointer, but it needs to convert the device pointer into its
> iommufd object id and pass the id to user space.
> 
> It's fine that we remove the hwpt->devices here, but perhaps I need to
> add the context pointer in ioas later,
> 
> struct iommufd_ioas {
>         struct io_pagetable iopt;
>         struct mutex mutex;
>         struct list_head hwpt_list;
> +       struct iommufd_ctx *ictx;
>  };
> 
> and, use below helper to look up the device id.
> 
> +u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
> +{
> +       struct iommu_group *group = iommu_group_get(dev);
> +       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
> +       struct iommufd_group *igroup;
> +       struct iommufd_device *cur;
> +       unsigned int id;
> +
> +       if (!group)
> +               return IOMMUFD_INVALID_OBJ_ID;
> +
> +       id = iommu_group_id(group);
> +       xa_lock(&ictx->groups);
> +       igroup = xa_load(&ictx->groups, id);
> +       if (!iommufd_group_try_get(igroup, group)) {
> +               xa_unlock(&ictx->groups);
> +               iommu_group_put(group);
> +               return IOMMUFD_INVALID_OBJ_ID;
> +        }
> +        xa_unlock(&ictx->groups);
> +
> +       mutex_lock(&igroup->lock);
> +       list_for_each_entry(cur, &igroup->device_list, group_item) {
> +               if (cur->dev == dev) {
> +                       dev_id = cur->obj.id;
> +                       break;
> +               }
> +       }

I dislike how slow this is on something resembling a fastish path :\

Maybe we should stash something in the dev_iommu instead?

Or can the PRI stuff provide a cookie per-device?

But it will work like this

>        dev_id = iommufd_get_device_id(hwpt->ioas->ictx, dev);

Where did the hwpt come from?

Jason
Baolu Lu May 17, 2023, 4:15 a.m. UTC | #3
On 5/16/23 8:27 PM, Jason Gunthorpe wrote:
> On Tue, May 16, 2023 at 11:00:16AM +0800, Baolu Lu wrote:
>> On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
>>> The devices list was used as a simple way to avoid having per-group
>>> information. Now that this seems to be unavoidable, just commit to
>>> per-group information fully and remove the devices list from the HWPT.
>>>
>>> The iommufd_group stores the currently assigned HWPT for the entire group
>>> and we can manage the per-device attach/detach with a list in the
>>> iommufd_group.
>>
>> I am preparing the patches to route I/O page faults to user space
>> through iommufd. The iommufd page fault handler knows the hwpt and the
>> device pointer, but it needs to convert the device pointer into its
>> iommufd object id and pass the id to user space.
>>
>> It's fine that we remove the hwpt->devices here, but perhaps I need to
>> add the context pointer in ioas later,
>>
>> struct iommufd_ioas {
>>          struct io_pagetable iopt;
>>          struct mutex mutex;
>>          struct list_head hwpt_list;
>> +       struct iommufd_ctx *ictx;
>>   };
>>
>> and, use below helper to look up the device id.
>>
>> +u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
>> +{
>> +       struct iommu_group *group = iommu_group_get(dev);
>> +       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
>> +       struct iommufd_group *igroup;
>> +       struct iommufd_device *cur;
>> +       unsigned int id;
>> +
>> +       if (!group)
>> +               return IOMMUFD_INVALID_OBJ_ID;
>> +
>> +       id = iommu_group_id(group);
>> +       xa_lock(&ictx->groups);
>> +       igroup = xa_load(&ictx->groups, id);
>> +       if (!iommufd_group_try_get(igroup, group)) {
>> +               xa_unlock(&ictx->groups);
>> +               iommu_group_put(group);
>> +               return IOMMUFD_INVALID_OBJ_ID;
>> +        }
>> +        xa_unlock(&ictx->groups);
>> +
>> +       mutex_lock(&igroup->lock);
>> +       list_for_each_entry(cur, &igroup->device_list, group_item) {
>> +               if (cur->dev == dev) {
>> +                       dev_id = cur->obj.id;
>> +                       break;
>> +               }
>> +       }
> 
> I dislike how slow this is on something resembling a fastish path :\

Yes, agreed.

> Maybe we should stash something in the dev_iommu instead?
> 
> Or can the PRI stuff provide a cookie per-device?

We already have a per-device fault cookie:

/**
  * struct iommu_fault_param - per-device IOMMU fault data
  * @handler: Callback function to handle IOMMU faults at device level
  * @data: handler private data
  * @faults: holds the pending faults which needs response
  * @lock: protect pending faults list
  */
struct iommu_fault_param {
         iommu_dev_fault_handler_t handler;
         void *data;
         struct list_head faults;
         struct mutex lock;
};

Perhaps we can add a @dev_id member here?

> 
> But it will work like this
> 
>>         dev_id = iommufd_get_device_id(hwpt->ioas->ictx, dev);
> 
> Where did the hwpt come from?

It is installed when setting up the iopf handler for the hwpt.

+	iommu_domain_set_iopf_handler(hwpt->domain,
+                                     iommufd_hw_pagetable_iopf_handler,
+                                     hwpt);

Best regards,
baolu
Tian, Kevin May 17, 2023, 6:33 a.m. UTC | #4
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Wednesday, May 17, 2023 12:15 PM
> 
> On 5/16/23 8:27 PM, Jason Gunthorpe wrote:
> > On Tue, May 16, 2023 at 11:00:16AM +0800, Baolu Lu wrote:
> >> On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
> >>> The devices list was used as a simple way to avoid having per-group
> >>> information. Now that this seems to be unavoidable, just commit to
> >>> per-group information fully and remove the devices list from the HWPT.
> >>>
> >>> The iommufd_group stores the currently assigned HWPT for the entire
> group
> >>> and we can manage the per-device attach/detach with a list in the
> >>> iommufd_group.
> >>
> >> I am preparing the patches to route I/O page faults to user space
> >> through iommufd. The iommufd page fault handler knows the hwpt and
> the
> >> device pointer, but it needs to convert the device pointer into its
> >> iommufd object id and pass the id to user space.
> >>
> >> It's fine that we remove the hwpt->devices here, but perhaps I need to
> >> add the context pointer in ioas later,
> >>
> >> struct iommufd_ioas {
> >>          struct io_pagetable iopt;
> >>          struct mutex mutex;
> >>          struct list_head hwpt_list;
> >> +       struct iommufd_ctx *ictx;
> >>   };
> >>
> >> and, use below helper to look up the device id.
> >>
> >> +u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
> >> +{
> >> +       struct iommu_group *group = iommu_group_get(dev);
> >> +       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
> >> +       struct iommufd_group *igroup;
> >> +       struct iommufd_device *cur;
> >> +       unsigned int id;
> >> +
> >> +       if (!group)
> >> +               return IOMMUFD_INVALID_OBJ_ID;
> >> +
> >> +       id = iommu_group_id(group);
> >> +       xa_lock(&ictx->groups);
> >> +       igroup = xa_load(&ictx->groups, id);
> >> +       if (!iommufd_group_try_get(igroup, group)) {
> >> +               xa_unlock(&ictx->groups);
> >> +               iommu_group_put(group);
> >> +               return IOMMUFD_INVALID_OBJ_ID;
> >> +        }
> >> +        xa_unlock(&ictx->groups);
> >> +
> >> +       mutex_lock(&igroup->lock);
> >> +       list_for_each_entry(cur, &igroup->device_list, group_item) {
> >> +               if (cur->dev == dev) {
> >> +                       dev_id = cur->obj.id;
> >> +                       break;
> >> +               }
> >> +       }
> >
> > I dislike how slow this is on something resembling a fastish path :\
> 
> Yes, agreed.
> 
> > Maybe we should stash something in the dev_iommu instead?
> >
> > Or can the PRI stuff provide a cookie per-device?
> 
> We already have a per-device fault cookie:
> 
> /**
>   * struct iommu_fault_param - per-device IOMMU fault data
>   * @handler: Callback function to handle IOMMU faults at device level
>   * @data: handler private data
>   * @faults: holds the pending faults which needs response
>   * @lock: protect pending faults list
>   */
> struct iommu_fault_param {
>          iommu_dev_fault_handler_t handler;
>          void *data;
>          struct list_head faults;
>          struct mutex lock;
> };
> 
> Perhaps we can add a @dev_id member here?
> 

what about SIOV? There is only one cookie per parent device.
Jason Gunthorpe May 17, 2023, 12:43 p.m. UTC | #5
On Wed, May 17, 2023 at 06:33:30AM +0000, Tian, Kevin wrote:
> > From: Baolu Lu <baolu.lu@linux.intel.com>
> > Sent: Wednesday, May 17, 2023 12:15 PM
> > 
> > On 5/16/23 8:27 PM, Jason Gunthorpe wrote:
> > > On Tue, May 16, 2023 at 11:00:16AM +0800, Baolu Lu wrote:
> > >> On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
> > >>> The devices list was used as a simple way to avoid having per-group
> > >>> information. Now that this seems to be unavoidable, just commit to
> > >>> per-group information fully and remove the devices list from the HWPT.
> > >>>
> > >>> The iommufd_group stores the currently assigned HWPT for the entire
> > group
> > >>> and we can manage the per-device attach/detach with a list in the
> > >>> iommufd_group.
> > >>
> > >> I am preparing the patches to route I/O page faults to user space
> > >> through iommufd. The iommufd page fault handler knows the hwpt and
> > the
> > >> device pointer, but it needs to convert the device pointer into its
> > >> iommufd object id and pass the id to user space.
> > >>
> > >> It's fine that we remove the hwpt->devices here, but perhaps I need to
> > >> add the context pointer in ioas later,
> > >>
> > >> struct iommufd_ioas {
> > >>          struct io_pagetable iopt;
> > >>          struct mutex mutex;
> > >>          struct list_head hwpt_list;
> > >> +       struct iommufd_ctx *ictx;
> > >>   };
> > >>
> > >> and, use below helper to look up the device id.
> > >>
> > >> +u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
> > >> +{
> > >> +       struct iommu_group *group = iommu_group_get(dev);
> > >> +       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
> > >> +       struct iommufd_group *igroup;
> > >> +       struct iommufd_device *cur;
> > >> +       unsigned int id;
> > >> +
> > >> +       if (!group)
> > >> +               return IOMMUFD_INVALID_OBJ_ID;
> > >> +
> > >> +       id = iommu_group_id(group);
> > >> +       xa_lock(&ictx->groups);
> > >> +       igroup = xa_load(&ictx->groups, id);
> > >> +       if (!iommufd_group_try_get(igroup, group)) {
> > >> +               xa_unlock(&ictx->groups);
> > >> +               iommu_group_put(group);
> > >> +               return IOMMUFD_INVALID_OBJ_ID;
> > >> +        }
> > >> +        xa_unlock(&ictx->groups);
> > >> +
> > >> +       mutex_lock(&igroup->lock);
> > >> +       list_for_each_entry(cur, &igroup->device_list, group_item) {
> > >> +               if (cur->dev == dev) {
> > >> +                       dev_id = cur->obj.id;
> > >> +                       break;
> > >> +               }
> > >> +       }
> > >
> > > I dislike how slow this is on something resembling a fastish path :\
> > 
> > Yes, agreed.
> > 
> > > Maybe we should stash something in the dev_iommu instead?
> > >
> > > Or can the PRI stuff provide a cookie per-device?
> > 
> > We already have a per-device fault cookie:
> > 
> > /**
> >   * struct iommu_fault_param - per-device IOMMU fault data
> >   * @handler: Callback function to handle IOMMU faults at device level
> >   * @data: handler private data
> >   * @faults: holds the pending faults which needs response
> >   * @lock: protect pending faults list
> >   */
> > struct iommu_fault_param {
> >          iommu_dev_fault_handler_t handler;
> >          void *data;
> >          struct list_head faults;
> >          struct mutex lock;
> > };
> > 
> > Perhaps we can add a @dev_id member here?
> > 
> 
> what about SIOV? There is only one cookie per parent device.

It doesn't make any sense to store a struct like that in dev_iommu.

The fault handler should come from the domain and we should be able to
have a unique 'void *data' cookie linked to the (dev,PASID) to go
along with the fault handler.

This is all going to need some revising before we can expose it to
iommufd

Jason
Baolu Lu May 18, 2023, 7:05 a.m. UTC | #6
On 2023/5/17 20:43, Jason Gunthorpe wrote:
> On Wed, May 17, 2023 at 06:33:30AM +0000, Tian, Kevin wrote:
>>> From: Baolu Lu <baolu.lu@linux.intel.com>
>>> Sent: Wednesday, May 17, 2023 12:15 PM
>>>
>>> On 5/16/23 8:27 PM, Jason Gunthorpe wrote:
>>>> On Tue, May 16, 2023 at 11:00:16AM +0800, Baolu Lu wrote:
>>>>> On 5/15/23 10:00 PM, Jason Gunthorpe wrote:
>>>>>> The devices list was used as a simple way to avoid having per-group
>>>>>> information. Now that this seems to be unavoidable, just commit to
>>>>>> per-group information fully and remove the devices list from the HWPT.
>>>>>>
>>>>>> The iommufd_group stores the currently assigned HWPT for the entire
>>> group
>>>>>> and we can manage the per-device attach/detach with a list in the
>>>>>> iommufd_group.
>>>>>
>>>>> I am preparing the patches to route I/O page faults to user space
>>>>> through iommufd. The iommufd page fault handler knows the hwpt and
>>> the
>>>>> device pointer, but it needs to convert the device pointer into its
>>>>> iommufd object id and pass the id to user space.
>>>>>
>>>>> It's fine that we remove the hwpt->devices here, but perhaps I need to
>>>>> add the context pointer in ioas later,
>>>>>
>>>>> struct iommufd_ioas {
>>>>>           struct io_pagetable iopt;
>>>>>           struct mutex mutex;
>>>>>           struct list_head hwpt_list;
>>>>> +       struct iommufd_ctx *ictx;
>>>>>    };
>>>>>
>>>>> and, use below helper to look up the device id.
>>>>>
>>>>> +u32 iommufd_get_device_id(struct iommufd_ctx *ictx, struct device *dev)
>>>>> +{
>>>>> +       struct iommu_group *group = iommu_group_get(dev);
>>>>> +       u32 dev_id = IOMMUFD_INVALID_OBJ_ID;
>>>>> +       struct iommufd_group *igroup;
>>>>> +       struct iommufd_device *cur;
>>>>> +       unsigned int id;
>>>>> +
>>>>> +       if (!group)
>>>>> +               return IOMMUFD_INVALID_OBJ_ID;
>>>>> +
>>>>> +       id = iommu_group_id(group);
>>>>> +       xa_lock(&ictx->groups);
>>>>> +       igroup = xa_load(&ictx->groups, id);
>>>>> +       if (!iommufd_group_try_get(igroup, group)) {
>>>>> +               xa_unlock(&ictx->groups);
>>>>> +               iommu_group_put(group);
>>>>> +               return IOMMUFD_INVALID_OBJ_ID;
>>>>> +        }
>>>>> +        xa_unlock(&ictx->groups);
>>>>> +
>>>>> +       mutex_lock(&igroup->lock);
>>>>> +       list_for_each_entry(cur, &igroup->device_list, group_item) {
>>>>> +               if (cur->dev == dev) {
>>>>> +                       dev_id = cur->obj.id;
>>>>> +                       break;
>>>>> +               }
>>>>> +       }
>>>>
>>>> I dislike how slow this is on something resembling a fastish path :\
>>>
>>> Yes, agreed.
>>>
>>>> Maybe we should stash something in the dev_iommu instead?
>>>>
>>>> Or can the PRI stuff provide a cookie per-device?
>>>
>>> We already have a per-device fault cookie:
>>>
>>> /**
>>>    * struct iommu_fault_param - per-device IOMMU fault data
>>>    * @handler: Callback function to handle IOMMU faults at device level
>>>    * @data: handler private data
>>>    * @faults: holds the pending faults which needs response
>>>    * @lock: protect pending faults list
>>>    */
>>> struct iommu_fault_param {
>>>           iommu_dev_fault_handler_t handler;
>>>           void *data;
>>>           struct list_head faults;
>>>           struct mutex lock;
>>> };
>>>
>>> Perhaps we can add a @dev_id member here?
>>>
>>
>> what about SIOV? There is only one cookie per parent device.
> 
> It doesn't make any sense to store a struct like that in dev_iommu.
> 
> The fault handler should come from the domain and we should be able to
> have a unique 'void *data' cookie linked to the (dev,PASID) to go
> along with the fault handler.

If I get your point correctly, the iommu core should provide some places
for the iommufd to put a cookie for each pair of {device, pasid}, and
provide interfaces to manage it. For example,

void iommu_set_device_fault_cookie(struct device *dev,
				   ioasid_t pasid,
				   void *fault_cookie);

void *iommu_get_device_fault_cookie(struct device *dev,
				    ioasid_t pasid)

If so, perhaps we need some special treatment for ARM as a user hwpt
actually presents the PASID table of the device and the guest setting
pasid table entry will not be propagated to the host. Then, the @pasid in
the above interfaces is meaningless.

> This is all going to need some revising before we can expose it to
> iommufd

Yes, agreed. I will post a preparation series to do this. Besides the
fault cookie, at least, I want to do the following preparation.

1) Move iommu faults uapi from uapi/linux/iommu.h to uapi/linux
   /iommufd.h and remove the former.

2) Add a device id in the iommu_fault structure.
  struct iommu_fault {
         __u32   type;
-       __u32   padding;
+       __u32   dev_id;
         union {
                 struct iommu_fault_unrecoverable event;
                 struct iommu_fault_page_request prm;

3) Add the device pointer to the parameters of domain fault handler.

4) Decouple I/O page fault handling from IOMMU_SVA in the iommu core and
    the drivers.

Best regards,
baolu
Jason Gunthorpe May 18, 2023, 12:02 p.m. UTC | #7
On Thu, May 18, 2023 at 03:05:23PM +0800, Baolu Lu wrote:

> > It doesn't make any sense to store a struct like that in dev_iommu.
> > 
> > The fault handler should come from the domain and we should be able to
> > have a unique 'void *data' cookie linked to the (dev,PASID) to go
> > along with the fault handler.
> 
> If I get your point correctly, the iommu core should provide some places
> for the iommufd to put a cookie for each pair of {device, pasid}, and
> provide interfaces to manage it. For example,

I'd say when you attach a PRI capable domain (eg to a PASID) then provide
a 'void * data' during the attach.

> If so, perhaps we need some special treatment for ARM as a user hwpt
> actually presents the PASID table of the device and the guest setting
> pasid table entry will not be propagated to host. Then, the @pasid in
> above interfaces is meaningless.

As above, when attaching to a RID you'd still pass in the *data

> 1) Move iommu faults uapi from uapi/linux/iommu.h to uapi/linux
>   /iommufd.h and remove the former.

Please no, delete all the dead code from here and move whatever is
still in use into include/linux/

Then we can talk about what parts of it become uAPI and how best to
manage that on a patch by patch basis.

> 2) Add a device id in the iommu_fault structure.
>  struct iommu_fault {
>         __u32   type;
> -       __u32   padding;
> +       __u32   dev_id;

Why? This is iommufd internal, the void * above should cover it.

> 3) Add the device pointer to the parameters of domain fault handler.

That makes some sense
 
> 4) Decouple I/O page fault handling from IOMMU_SVA in the iommu core and
>    the drivers.

Definitely, this SVA stuff needs to be scrubbed out.

SVA is only a special domain type that takes the page table top from a
mmu_stuct and a shared fault handler in the core code to do handle_mm_fault()

It should not be in drivers any more deeply than that. We still have a
ways to go.

Jason
Baolu Lu May 19, 2023, 2:03 a.m. UTC | #8
On 2023/5/18 20:02, Jason Gunthorpe wrote:
> On Thu, May 18, 2023 at 03:05:23PM +0800, Baolu Lu wrote:
> 
>>> It doesn't make any sense to store a struct like that in dev_iommu.
>>>
>>> The fault handler should come from the domain and we should be able to
>>> have a unique 'void *data' cookie linked to the (dev,PASID) to go
>>> along with the fault handler.
>>
>> If I get your point correctly, the iommu core should provide some places
>> for the iommufd to put a cookie for each pair of {device, pasid}, and
>> provide interfaces to manage it. For example,
> 
> I'd say when you attach a PRI capable domain (eg to a PASID) then provide
> a 'void * data' during the attach.
> 
>> If so, perhaps we need some special treatment for ARM as a user hwpt
>> actually presents the PASID table of the device and the guest setting
>> pasid table entry will not be propagated to host. Then, the @pasid in
>> above interfaces is meaningless.
> 
> As above, when attaching to a RID you'd still pass in the *data

Yes! Merging these with hwpt attach/detach would be more logical.

> 
>> 1) Move iommu faults uapi from uapi/linux/iommu.h to uapi/linux
>>    /iommufd.h and remove the former.
> 
> Please no, delete all the dead code from here and move whatever is
> still in use into include/linux/
> 
> Then we can talk about what parts of it become uAPI and how best to
> manage that on a patch by patch basis.

Okay, let's rebuild it from the ground up.

> 
>> 2) Add a device id in the iommu_fault structure.
>>   struct iommu_fault {
>>          __u32   type;
>> -       __u32   padding;
>> +       __u32   dev_id;
> 
> Why? This is iommufd internal, the void * above should cover it.

This is actually part of 1). :-)

> 
>> 3) Add the device pointer to the parameters of domain fault handler.
> 
> That makes some sense
>   
>> 4) Decouple I/O page fault handling from IOMMU_SVA in the iommu core and
>>     the drivers.
> 
> Definitely, this SVA stuff needs to be scrubbed out.
> 
> SVA is only a special domain type that takes the page table top from an
> mm_struct and a shared fault handler in the core code to do handle_mm_fault()
> 
> It should not be in drivers any more deeply than that. We still have a
> ways to go.

Agreed. In addition to fault handler, SVA usually needs to register a
callback to mm_notifier to keep the IO or device TLB cache consistent.
This part of code could also be consolidated to the core.

There are still things to do here, but the priority (from my point of
view) is to make the iopf handling framework in the iommu core more
generic, rather than just serving SVA.

Best regards,
baolu
Tian, Kevin May 19, 2023, 7:51 a.m. UTC | #9
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Friday, May 19, 2023 10:04 AM
> 
> On 2023/5/18 20:02, Jason Gunthorpe wrote:
> > On Thu, May 18, 2023 at 03:05:23PM +0800, Baolu Lu wrote:
> >
> >> If so, perhaps we need some special treatment for ARM as a user hwpt
> >> actually presents the PASID table of the device and the guest setting
> >> pasid table entry will not be propagated to host. Then, the @pasid in
> >> above interfaces is meaningless.
> >
> > As above, when attaching to a RID you'd still pass in the *data
> 
> Yes! Merging these with hwpt attach/detach would be more logical.

Probably we still need a way in iommu core to differentiate whether
to look at *data according to {RID} alone or {RID, PASID} when receiving
a fault request tagged by {struct device, pasid} from the underlying
iommu driver.

That might be just implementation detail, though.

> 
> >
> >> 1) Move iommu faults uapi from uapi/linux/iommu.h to uapi/linux
> >>    /iommufd.h and remove the former.
> >
> > Please no, delete all the dead code from here and move whatever is
> > still in use into include/linux/
> >
> > Then we can talk about what parts of it become uAPI and how best to
> > manage that on a patch by patch basis.
> 
> Okay, let's rebuild it from the ground up.
> 

yes. Actually that uapi is incomplete. It contains only definitions of
data structures but no actual command. It's hard to say that definition
is sufficient w/o actually involving the command.
Jason Gunthorpe May 19, 2023, 11:42 a.m. UTC | #10
On Fri, May 19, 2023 at 10:03:57AM +0800, Baolu Lu wrote:

> Agreed. In addition to fault handler, SVA usually needs to register a
> callback to mm_notifier to keep the IO or device TLB cache consistent.
> This part of code could also be consolidated to the core.

Yes, that should not be in drivers at all..

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 8c27f6901446e8..23ebf2065e20de 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -20,9 +20,12 @@  static void iommufd_group_release(struct kref *kref)
 	struct iommufd_group *igroup =
 		container_of(kref, struct iommufd_group, ref);
 
+	WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));
+
 	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
 		   NULL, GFP_KERNEL);
 	iommu_group_put(igroup->group);
+	mutex_destroy(&igroup->lock);
 	kfree(igroup);
 }
 
@@ -83,6 +86,8 @@  static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	}
 
 	kref_init(&new_igroup->ref);
+	mutex_init(&new_igroup->lock);
+	INIT_LIST_HEAD(&new_igroup->device_list);
 	/* group reference moves into new_igroup */
 	new_igroup->group = group;
 
@@ -277,29 +282,18 @@  static int iommufd_device_setup_msi(struct iommufd_device *idev,
 	return 0;
 }
 
-static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
-					   struct iommufd_group *igroup)
-{
-	struct iommufd_device *cur_dev;
-
-	lockdep_assert_held(&hwpt->devices_lock);
-
-	list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
-		if (cur_dev->igroup->group == igroup->group)
-			return true;
-	return false;
-}
-
 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 				struct iommufd_device *idev)
 {
 	phys_addr_t sw_msi_start = PHYS_ADDR_MAX;
 	int rc;
 
-	lockdep_assert_held(&hwpt->devices_lock);
+	mutex_lock(&idev->igroup->lock);
 
-	if (WARN_ON(idev->hwpt))
-		return -EINVAL;
+	if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) {
+		rc = -EINVAL;
+		goto err_unlock;
+	}
 
 	/*
 	 * Try to upgrade the domain we have, it is an iommu driver bug to
@@ -313,8 +307,9 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 				hwpt->domain->ops->enforce_cache_coherency(
 					hwpt->domain);
 		if (!hwpt->enforce_cache_coherency) {
-			WARN_ON(list_empty(&hwpt->devices));
-			return -EINVAL;
+			WARN_ON(list_empty(&idev->igroup->device_list));
+			rc = -EINVAL;
+			goto err_unlock;
 		}
 	}
 
@@ -322,51 +317,52 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 						   idev->igroup->group,
 						   &sw_msi_start);
 	if (rc)
-		return rc;
+		goto err_unlock;
 
 	rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start);
 	if (rc)
 		goto err_unresv;
 
 	/*
-	 * FIXME: Hack around missing a device-centric iommu api, only attach to
-	 * the group once for the first device that is in the group.
+	 * Only attach to the group once for the first device that is in the
+	 * group. All the other devices will follow this attachment. The user
+	 * should attach every device individually to the hwpt as the per-device
+	 * reserved regions are only updated during individual device
+	 * attachment.
 	 */
-	if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup)) {
+	if (list_empty(&idev->igroup->device_list)) {
 		rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
 		if (rc)
 			goto err_unresv;
+		idev->igroup->hwpt = hwpt;
 	}
+	refcount_inc(&hwpt->obj.users);
+	list_add_tail(&idev->group_item, &idev->igroup->device_list);
+	mutex_unlock(&idev->igroup->lock);
 	return 0;
 err_unresv:
 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+err_unlock:
+	mutex_unlock(&idev->igroup->lock);
 	return rc;
 }
 
-void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt,
-				 struct iommufd_device *idev)
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_detach(struct iommufd_device *idev)
 {
-	if (!iommufd_hw_pagetable_has_group(hwpt, idev->igroup))
+	struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt;
+
+	mutex_lock(&idev->igroup->lock);
+	list_del(&idev->group_item);
+	if (list_empty(&idev->igroup->device_list)) {
 		iommu_detach_group(hwpt->domain, idev->igroup->group);
+		idev->igroup->hwpt = NULL;
+	}
 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
-}
+	mutex_unlock(&idev->igroup->lock);
 
-static int iommufd_device_do_attach(struct iommufd_device *idev,
-				    struct iommufd_hw_pagetable *hwpt)
-{
-	int rc;
-
-	mutex_lock(&hwpt->devices_lock);
-	rc = iommufd_hw_pagetable_attach(hwpt, idev);
-	if (rc)
-		goto out_unlock;
-
-	idev->hwpt = hwpt;
-	refcount_inc(&hwpt->obj.users);
-	list_add(&idev->devices_item, &hwpt->devices);
-out_unlock:
-	mutex_unlock(&hwpt->devices_lock);
-	return rc;
+	/* Caller must destroy hwpt */
+	return hwpt;
 }
 
 /*
@@ -375,7 +371,7 @@  static int iommufd_device_do_attach(struct iommufd_device *idev,
  * Automatic domain selection will never pick a manually created domain.
  */
 static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
-					  struct iommufd_ioas *ioas)
+					  struct iommufd_ioas *ioas, u32 *pt_id)
 {
 	struct iommufd_hw_pagetable *hwpt;
 	int rc;
@@ -392,7 +388,7 @@  static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
 
 		if (!iommufd_lock_obj(&hwpt->obj))
 			continue;
-		rc = iommufd_device_do_attach(idev, hwpt);
+		rc = iommufd_hw_pagetable_attach(hwpt, idev);
 		iommufd_put_object(&hwpt->obj);
 
 		/*
@@ -402,6 +398,7 @@  static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
 		 */
 		if (rc == -EINVAL)
 			continue;
+		*pt_id = hwpt->obj.id;
 		goto out_unlock;
 	}
 
@@ -411,6 +408,7 @@  static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
 		goto out_unlock;
 	}
 	hwpt->auto_domain = true;
+	*pt_id = hwpt->obj.id;
 
 	mutex_unlock(&ioas->mutex);
 	iommufd_object_finalize(idev->ictx, &hwpt->obj);
@@ -446,7 +444,7 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 		struct iommufd_hw_pagetable *hwpt =
 			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
 
-		rc = iommufd_device_do_attach(idev, hwpt);
+		rc = iommufd_hw_pagetable_attach(hwpt, idev);
 		if (rc)
 			goto out_put_pt_obj;
 		break;
@@ -455,7 +453,7 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 		struct iommufd_ioas *ioas =
 			container_of(pt_obj, struct iommufd_ioas, obj);
 
-		rc = iommufd_device_auto_get_domain(idev, ioas);
+		rc = iommufd_device_auto_get_domain(idev, ioas, pt_id);
 		if (rc)
 			goto out_put_pt_obj;
 		break;
@@ -466,7 +464,6 @@  int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
 	}
 
 	refcount_inc(&idev->obj.users);
-	*pt_id = idev->hwpt->obj.id;
 	rc = 0;
 
 out_put_pt_obj:
@@ -484,14 +481,9 @@  EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
  */
 void iommufd_device_detach(struct iommufd_device *idev)
 {
-	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
-
-	mutex_lock(&hwpt->devices_lock);
-	list_del(&idev->devices_item);
-	idev->hwpt = NULL;
-	iommufd_hw_pagetable_detach(hwpt, idev);
-	mutex_unlock(&hwpt->devices_lock);
+	struct iommufd_hw_pagetable *hwpt;
 
+	hwpt = iommufd_hw_pagetable_detach(idev);
 	if (hwpt->auto_domain)
 		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
 	else
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 6cdb6749d359f3..bdb76cdb1dc347 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -11,8 +11,6 @@  void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
 	struct iommufd_hw_pagetable *hwpt =
 		container_of(obj, struct iommufd_hw_pagetable, obj);
 
-	WARN_ON(!list_empty(&hwpt->devices));
-
 	if (!list_empty(&hwpt->hwpt_item)) {
 		mutex_lock(&hwpt->ioas->mutex);
 		list_del(&hwpt->hwpt_item);
@@ -25,7 +23,6 @@  void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
 		iommu_domain_free(hwpt->domain);
 
 	refcount_dec(&hwpt->ioas->obj.users);
-	mutex_destroy(&hwpt->devices_lock);
 }
 
 /**
@@ -52,9 +49,7 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 	if (IS_ERR(hwpt))
 		return hwpt;
 
-	INIT_LIST_HEAD(&hwpt->devices);
 	INIT_LIST_HEAD(&hwpt->hwpt_item);
-	mutex_init(&hwpt->devices_lock);
 	/* Pairs with iommufd_hw_pagetable_destroy() */
 	refcount_inc(&ioas->obj.users);
 	hwpt->ioas = ioas;
@@ -65,8 +60,6 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 		goto out_abort;
 	}
 
-	mutex_lock(&hwpt->devices_lock);
-
 	/*
 	 * immediate_attach exists only to accommodate iommu drivers that cannot
 	 * directly allocate a domain. These drivers do not finish creating the
@@ -76,29 +69,18 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 	if (immediate_attach) {
 		rc = iommufd_hw_pagetable_attach(hwpt, idev);
 		if (rc)
-			goto out_unlock;
+			goto out_abort;
 	}
 
 	rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
 	if (rc)
 		goto out_detach;
 	list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
-
-	if (immediate_attach) {
-		/* See iommufd_device_do_attach() */
-		refcount_inc(&hwpt->obj.users);
-		idev->hwpt = hwpt;
-		list_add(&idev->devices_item, &hwpt->devices);
-	}
-
-	mutex_unlock(&hwpt->devices_lock);
 	return hwpt;
 
 out_detach:
 	if (immediate_attach)
-		iommufd_hw_pagetable_detach(hwpt, idev);
-out_unlock:
-	mutex_unlock(&hwpt->devices_lock);
+		iommufd_hw_pagetable_detach(idev);
 out_abort:
 	iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
 	return ERR_PTR(rc);
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f45615f19798e6..e2eb1db5f8f8ce 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -250,8 +250,6 @@  struct iommufd_hw_pagetable {
 	bool msi_cookie : 1;
 	/* Head at iommufd_ioas::hwpt_list */
 	struct list_head hwpt_item;
-	struct mutex devices_lock;
-	struct list_head devices;
 };
 
 struct iommufd_hw_pagetable *
@@ -259,14 +257,17 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			   struct iommufd_device *idev, bool immediate_attach);
 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 				struct iommufd_device *idev);
-void iommufd_hw_pagetable_detach(struct iommufd_hw_pagetable *hwpt,
-				 struct iommufd_device *idev);
+struct iommufd_hw_pagetable *
+iommufd_hw_pagetable_detach(struct iommufd_device *idev);
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 
 struct iommufd_group {
 	struct kref ref;
+	struct mutex lock;
 	struct iommufd_ctx *ictx;
 	struct iommu_group *group;
+	struct iommufd_hw_pagetable *hwpt;
+	struct list_head device_list;
 };
 
 /*
@@ -278,9 +279,7 @@  struct iommufd_device {
 	struct iommufd_object obj;
 	struct iommufd_ctx *ictx;
 	struct iommufd_group *igroup;
-	struct iommufd_hw_pagetable *hwpt;
-	/* Head at iommufd_hw_pagetable::devices */
-	struct list_head devices_item;
+	struct list_head group_item;
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;