diff mbox series

[v2,4/6] iommufd: Deliver fault messages to user space

Message ID 20231026024930.382898-5-baolu.lu@linux.intel.com (mailing list archive)
State New
Headers show
Series IOMMUFD: Deliver IO page faults to user space | expand

Commit Message

Baolu Lu Oct. 26, 2023, 2:49 a.m. UTC
Add the file interface that provides a simple and efficient way for
userspace to handle page faults. The file interface allows userspace
to read fault messages sequentially, and to respond to the handling
result by writing to the same file.

Userspace applications are recommended to use io_uring to speed up read
and write efficiency.

With this done, allow userspace applications to allocate a hw page table
with the IOMMU_HWPT_ALLOC_IOPF_CAPABLE flag set.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/iommufd/iommufd_private.h |   2 +
 drivers/iommu/iommufd/hw_pagetable.c    | 204 +++++++++++++++++++++++-
 2 files changed, 205 insertions(+), 1 deletion(-)

Comments

Jason Gunthorpe Dec. 1, 2023, 3:24 p.m. UTC | #1
On Thu, Oct 26, 2023 at 10:49:28AM +0800, Lu Baolu wrote:

> +static ssize_t hwpt_fault_fops_write(struct file *filep,
> +				     const char __user *buf,
> +				     size_t count, loff_t *ppos)
> +{
> +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_page_response response;
> +	struct iommufd_hw_pagetable *hwpt;
> +	struct iopf_group *iter, *group;
> +	struct iommufd_device *idev;
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos || count % response_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->response) && count > done) {
> +		rc = copy_from_user(&response, buf + done, response_size);
> +		if (rc)
> +			break;
> +
> +		/* Get the device that this response targets at. */
> +		idev = container_of(iommufd_get_object(fault->ictx,
> +						       response.dev_id,
> +						       IOMMUFD_OBJ_DEVICE),
> +				    struct iommufd_device, obj);
> +		if (IS_ERR(idev)) {
> +			rc = PTR_ERR(idev);
> +			break;
> +		}

Here it might be better to have a per-fd list of outstanding
faults, and then the cookie would just index that list. Then you
get everything in one shot instead of having to do an xarray lookup
and then a linear list search.

> +static const struct file_operations hwpt_fault_fops = {
> +	.owner		= THIS_MODULE,
> +	.read		= hwpt_fault_fops_read,
> +	.write		= hwpt_fault_fops_write,
> +};

nonseekable_open() behavior should be integrated into this

> +static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault)
> +{
> +	struct file *filep;
> +	int fdno;
> +
> +	fdno = get_unused_fd_flags(O_CLOEXEC);
> +	if (fdno < 0)
> +		return fdno;
> +
> +	filep = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops,
> +				   fault, O_RDWR);
> +	if (IS_ERR(filep)) {
> +		put_unused_fd(fdno);
> +		return PTR_ERR(filep);
> +	}
> +
> +	fd_install(fdno, filep);
> +	fault->fault_file = filep;
> +	fault->fault_fd = fdno;

fd_install must be the very last thing before returning success from a
system call because we cannot undo it.

There are other failure paths before here and the final return

Jason
Joel Granados Dec. 7, 2023, 4:34 p.m. UTC | #2
On Thu, Oct 26, 2023 at 10:49:28AM +0800, Lu Baolu wrote:
> Add the file interface that provides a simple and efficient way for
> userspace to handle page faults. The file interface allows userspace
> to read fault messages sequentially, and to respond to the handling
> result by writing to the same file.
> 
> Userspace applications are recommended to use io_uring to speed up read
> and write efficiency.
> 
> With this done, allow userspace application to allocate a hw page table
> with IOMMU_HWPT_ALLOC_IOPF_CAPABLE flag set.
> 
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  drivers/iommu/iommufd/iommufd_private.h |   2 +
>  drivers/iommu/iommufd/hw_pagetable.c    | 204 +++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 0dbaa2dc5b22..ff063bc48150 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -237,6 +237,8 @@ struct hw_pgtable_fault {
>  	struct mutex mutex;
>  	struct list_head deliver;
>  	struct list_head response;
> +	struct file *fault_file;
> +	int fault_fd;
>  };
>  
>  /*
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 9f94c824cf86..f0aac1bb2d2d 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -3,6 +3,8 @@
>   * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
>   */
>  #include <linux/iommu.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
>  #include <uapi/linux/iommufd.h>
>  
>  #include "../iommu-priv.h"
> @@ -38,9 +40,198 @@ static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj)
>  	refcount_dec(&hwpt->ioas->obj.users);
>  }
>  
> +static int iommufd_compose_fault_message(struct iommu_fault *fault,
> +					 struct iommu_hwpt_pgfault *hwpt_fault,
> +					 struct device *dev)
> +{
> +	struct iommufd_device *idev = iopf_pasid_cookie_get(dev, IOMMU_NO_PASID);
> +
> +	if (!idev)
> +		return -ENODEV;
> +
> +	if (IS_ERR(idev))
> +		return PTR_ERR(idev);
> +
> +	hwpt_fault->size = sizeof(*hwpt_fault);
> +	hwpt_fault->flags = fault->prm.flags;
> +	hwpt_fault->dev_id = idev->obj.id;
> +	hwpt_fault->pasid = fault->prm.pasid;
> +	hwpt_fault->grpid = fault->prm.grpid;
> +	hwpt_fault->perm = fault->prm.perm;
> +	hwpt_fault->addr = fault->prm.addr;
> +	hwpt_fault->private_data[0] = fault->prm.private_data[0];
> +	hwpt_fault->private_data[1] = fault->prm.private_data[1];
> +
> +	return 0;
> +}
> +
> +static ssize_t hwpt_fault_fops_read(struct file *filep, char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_pgfault data;
> +	struct iopf_group *group;
> +	struct iopf_fault *iopf;
> +	size_t done = 0;
> +	int rc;
> +
> +	if (*ppos || count % fault_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->deliver) && count > done) {
> +		group = list_first_entry(&fault->deliver,
> +					 struct iopf_group, node);
> +
> +		if (list_count_nodes(&group->faults) * fault_size > count - done)
> +			break;
> +
> +		list_for_each_entry(iopf, &group->faults, list) {
> +			rc = iommufd_compose_fault_message(&iopf->fault,
> +							   &data, group->dev);
> +			if (rc)
> +				goto err_unlock;
> +			rc = copy_to_user(buf + done, &data, fault_size);
> +			if (rc)
> +				goto err_unlock;
> +			done += fault_size;
> +		}
> +
> +		list_move_tail(&group->node, &fault->response);
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	return done;
> +err_unlock:
> +	mutex_unlock(&fault->mutex);
> +	return rc;
> +}
> +
> +static ssize_t hwpt_fault_fops_write(struct file *filep,
> +				     const char __user *buf,
> +				     size_t count, loff_t *ppos)
> +{
> +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_page_response response;
> +	struct iommufd_hw_pagetable *hwpt;
> +	struct iopf_group *iter, *group;
> +	struct iommufd_device *idev;
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos || count % response_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->response) && count > done) {
> +		rc = copy_from_user(&response, buf + done, response_size);
> +		if (rc)
> +			break;
> +
> +		/* Get the device that this response targets at. */
> +		idev = container_of(iommufd_get_object(fault->ictx,
> +						       response.dev_id,
> +						       IOMMUFD_OBJ_DEVICE),
> +				    struct iommufd_device, obj);
> +		if (IS_ERR(idev)) {
> +			rc = PTR_ERR(idev);
> +			break;
> +		}
> +
> +		/*
> +		 * Get the hw page table that this response was generated for.
> +		 * It must match the one stored in the fault data.
> +		 */
> +		hwpt = container_of(iommufd_get_object(fault->ictx,
> +						       response.hwpt_id,
> +						       IOMMUFD_OBJ_HW_PAGETABLE),
> +				    struct iommufd_hw_pagetable, obj);
> +		if (IS_ERR(hwpt)) {
> +			iommufd_put_object(&idev->obj);
> +			rc = PTR_ERR(hwpt);
> +			break;
> +		}
> +
> +		if (hwpt != fault->hwpt) {
> +			rc = -EINVAL;
> +			goto put_obj;
> +		}
> +
> +		group = NULL;
> +		list_for_each_entry(iter, &fault->response, node) {
> +			if (response.grpid != iter->last_fault.fault.prm.grpid)
> +				continue;
> +
> +			if (idev->dev != iter->dev)
> +				continue;
> +
> +			if ((iter->last_fault.fault.prm.flags &
> +			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
> +			    response.pasid != iter->last_fault.fault.prm.pasid)
> +				continue;
> +
> +			group = iter;
> +			break;
> +		}
> +
> +		if (!group) {
> +			rc = -ENODEV;
> +			goto put_obj;
> +		}
> +
> +		rc = iopf_group_response(group, response.code);
> +		if (rc)
> +			goto put_obj;
> +
> +		list_del(&group->node);
> +		iopf_free_group(group);
> +		done += response_size;
> +put_obj:
> +		iommufd_put_object(&hwpt->obj);
> +		iommufd_put_object(&idev->obj);
> +		if (rc)
> +			break;
> +	}
> +	mutex_unlock(&fault->mutex);
> +
> +	return (rc < 0) ? rc : done;
> +}
> +
> +static const struct file_operations hwpt_fault_fops = {
> +	.owner		= THIS_MODULE,
> +	.read		= hwpt_fault_fops_read,
> +	.write		= hwpt_fault_fops_write,
> +};
> +
> +static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault)
> +{
> +	struct file *filep;
> +	int fdno;
> +
> +	fdno = get_unused_fd_flags(O_CLOEXEC);
> +	if (fdno < 0)
> +		return fdno;
> +
> +	filep = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops,
> +				   fault, O_RDWR);
> +	if (IS_ERR(filep)) {
> +		put_unused_fd(fdno);
> +		return PTR_ERR(filep);
> +	}
> +
> +	fd_install(fdno, filep);
> +	fault->fault_file = filep;
> +	fault->fault_fd = fdno;
> +
> +	return 0;
> +}
> +
>  static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
>  {
>  	struct hw_pgtable_fault *fault;
> +	int rc;
>  
>  	fault = kzalloc(sizeof(*fault), GFP_KERNEL);
>  	if (!fault)
> @@ -50,6 +241,12 @@ static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
>  	INIT_LIST_HEAD(&fault->response);
>  	mutex_init(&fault->mutex);
>  
> +	rc = hw_pagetable_get_fault_fd(fault);
> +	if (rc) {
> +		kfree(fault);
> +		return ERR_PTR(rc);
> +	}
> +
>  	return fault;
>  }
>  
> @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
>  	WARN_ON(!list_empty(&fault->deliver));
>  	WARN_ON(!list_empty(&fault->response));
>  
> +	fput(fault->fault_file);
> +	put_unused_fd(fault->fault_fd);
I have been running your code and have run into an invalid memory
access at this line. When `put_unused_fd` is called, the files of the
current task are accessed through `current->files`. In my case this is 0x0.

The reason for it being 0x0 is that `do_exit` calls `exit_files` where
the task files get set to NULL; this call is made in `do_exit` before we
execute `exit_task_work`.

`exit_task_work` is the call that eventually arrives here at `hw_pagetable_fault_free`.

The way I have arrived at this state is the following:
1. Version of the Linux kernel that I'm using: commit 357b5abcba0477f7f1391dd0fa3a919a6f06bdf0 (HEAD, lubaolu/iommufd-io-pgfault-delivery-v2)
2. Version of qemu that I'm using: commit 577ef478780597d3f449feb01e853b93fa5c5530 (HEAD, yiliu/zhenzhong/wip/iommufd_nesting_rfcv1)
3. This error happens when my user space app is exiting (hence the call
   to `do_exit`).
4. I call the IOMMU_HWPT_ALLOC ioctl with
  .flags = IOMMU_HWPT_ALLOC_IOPF_CAPABLE and 
  .hwpt_type = IOMMU_HWPT_TYPE_DEFAULT
  .pt_id = the default ioas id.

I have resolved this in a naive way by just not calling the
put_unused_fd function.

Have you run into this? Is this a path that you were expecting?
Also, please get back to me if you need more information about how I got
to this place. I have provided what I think is enough info, but I might
be missing something obvious.

Best

>  	kfree(fault);
>  }
>  
> @@ -347,7 +546,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  	struct mutex *mutex;
>  	int rc;
>  
> -	if (cmd->flags & ~IOMMU_HWPT_ALLOC_NEST_PARENT || cmd->__reserved)
> +	if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT |
> +			    IOMMU_HWPT_ALLOC_IOPF_CAPABLE)) ||
> +	    cmd->__reserved)
>  		return -EOPNOTSUPP;
>  	if (!cmd->data_len && cmd->hwpt_type != IOMMU_HWPT_TYPE_DEFAULT)
>  		return -EINVAL;
> @@ -416,6 +617,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  		hwpt->fault->hwpt = hwpt;
>  		hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler;
>  		hwpt->domain->fault_data = hwpt;
> +		cmd->out_fault_fd = hwpt->fault->fault_fd;
>  	}
>  
>  	cmd->out_hwpt_id = hwpt->obj.id;
> -- 
> 2.34.1
>
Jason Gunthorpe Dec. 7, 2023, 5:17 p.m. UTC | #3
On Thu, Dec 07, 2023 at 05:34:10PM +0100, Joel Granados wrote:
> > @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
> >  	WARN_ON(!list_empty(&fault->deliver));
> >  	WARN_ON(!list_empty(&fault->response));
> >  
> > +	fput(fault->fault_file);
> > +	put_unused_fd(fault->fault_fd);

> I have resolved this in a naive way by just not calling the
> put_unused_fd function.

That is correct.

put_unused_fd() should only be called on error paths prior to the
syscall return.

The design of a FD must follow this pattern

 syscall():
   fdno = get_unused_fd_flags(O_CLOEXEC);
   filep = [..]
 
   // syscall MUST succeed after this statement:
   fd_install(fdno, filep);
   return 0;

  err:
    put_unused_fd(fdno)
    return -ERRNO

Also the refcounting looks a little strange, the filep reference is
consumed by fd_install, so what is that fput pairing with in fault_free?

Jason
Baolu Lu Dec. 8, 2023, 5:47 a.m. UTC | #4
On 12/8/23 1:17 AM, Jason Gunthorpe wrote:
> On Thu, Dec 07, 2023 at 05:34:10PM +0100, Joel Granados wrote:
>>> @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
>>>   	WARN_ON(!list_empty(&fault->deliver));
>>>   	WARN_ON(!list_empty(&fault->response));
>>>   
>>> +	fput(fault->fault_file);
>>> +	put_unused_fd(fault->fault_fd);
>> I have resolved this in a naive way by just not calling the
>> put_unused_fd function.
> That is correct.
> 
> put_unused_fd() should only be called on error paths prior to the
> syscall return.
> 
> The design of a FD must follow this pattern
> 
>   syscall():
>     fdno = get_unused_fd_flags(O_CLOEXEC);
>     filep = [..]
>   
>     // syscall MUST succeed after this statement:
>     fd_install(fdno, filep);
>     return 0;
> 
>    err:
>      put_unused_fd(fdno)
>      return -ERRNO

Yes. Agreed.

> 
> Also the refcounting looks a little strange, the filep reference is
> consumed by fd_install, so what is that fput pairing with in fault_free?

fput() pairs with get_unused_fd_flags()? fd_install() does not seem to
increase any reference.

Best regards,
baolu
Baolu Lu Dec. 8, 2023, 11:43 a.m. UTC | #5
On 2023/12/1 23:24, Jason Gunthorpe wrote:
> On Thu, Oct 26, 2023 at 10:49:28AM +0800, Lu Baolu wrote:
> 
>> +static ssize_t hwpt_fault_fops_write(struct file *filep,
>> +				     const char __user *buf,
>> +				     size_t count, loff_t *ppos)
>> +{
>> +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
>> +	struct hw_pgtable_fault *fault = filep->private_data;
>> +	struct iommu_hwpt_page_response response;
>> +	struct iommufd_hw_pagetable *hwpt;
>> +	struct iopf_group *iter, *group;
>> +	struct iommufd_device *idev;
>> +	size_t done = 0;
>> +	int rc = 0;
>> +
>> +	if (*ppos || count % response_size)
>> +		return -ESPIPE;
>> +
>> +	mutex_lock(&fault->mutex);
>> +	while (!list_empty(&fault->response) && count > done) {
>> +		rc = copy_from_user(&response, buf + done, response_size);
>> +		if (rc)
>> +			break;
>> +
>> +		/* Get the device that this response targets at. */
>> +		idev = container_of(iommufd_get_object(fault->ictx,
>> +						       response.dev_id,
>> +						       IOMMUFD_OBJ_DEVICE),
>> +				    struct iommufd_device, obj);
>> +		if (IS_ERR(idev)) {
>> +			rc = PTR_ERR(idev);
>> +			break;
>> +		}
> 
> See here it might be better to have a per-fd list of outstanding
> faults per-fd and then the cookie would just index that list, then you
> get everything in one shot instead of having to do a xarray looking
> and then a linear list search

Yours is more efficient. I will do it that way in the next version.

> 
>> +static const struct file_operations hwpt_fault_fops = {
>> +	.owner		= THIS_MODULE,
>> +	.read		= hwpt_fault_fops_read,
>> +	.write		= hwpt_fault_fops_write,
>> +};
> 
> nonseekable_open() behavior should be integrated into this

Sure.

> 
>> +static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault)
>> +{
>> +	struct file *filep;
>> +	int fdno;
>> +
>> +	fdno = get_unused_fd_flags(O_CLOEXEC);
>> +	if (fdno < 0)
>> +		return fdno;
>> +
>> +	filep = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops,
>> +				   fault, O_RDWR);
>> +	if (IS_ERR(filep)) {
>> +		put_unused_fd(fdno);
>> +		return PTR_ERR(filep);
>> +	}
>> +
>> +	fd_install(fdno, filep);
>> +	fault->fault_file = filep;
>> +	fault->fault_fd = fdno;
> 
> fd_install must be the very last thing before returning success from a
> system call because we cannot undo it.

Yes.

> 
> There are other failure paths before here and the final return
> 
> Jason

Best regards,
baolu
Jason Gunthorpe Dec. 8, 2023, 1:41 p.m. UTC | #6
On Fri, Dec 08, 2023 at 01:47:35PM +0800, Baolu Lu wrote:
> On 12/8/23 1:17 AM, Jason Gunthorpe wrote:
> > On Thu, Dec 07, 2023 at 05:34:10PM +0100, Joel Granados wrote:
> > > > @@ -58,6 +255,8 @@ static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
> > > >   	WARN_ON(!list_empty(&fault->deliver));
> > > >   	WARN_ON(!list_empty(&fault->response));
> > > > +	fput(fault->fault_file);
> > > > +	put_unused_fd(fault->fault_fd);
> > > I have resolved this in a naive way by just not calling the
> > > put_unused_fd function.
> > That is correct.
> > 
> > put_unused_fd() should only be called on error paths prior to the
> > syscall return.
> > 
> > The design of a FD must follow this pattern
> > 
> >   syscall():
> >     fdno = get_unused_fd_flags(O_CLOEXEC);
> >     filep = [..]
> >     // syscall MUST succeed after this statement:
> >     fd_install(fdno, filep);
> >     return 0;
> > 
> >    err:
> >      put_unused_fd(fdno)
> >      return -ERRNO
> 
> Yes. Agreed.
> 
> > 
> > Also the refcounting looks a little strange, the filep reference is
> > consumed by fd_install, so what is that fput pairing with in fault_free?
> 
> fput() pairs with get_unused_fd_flags()? fd_install() does not seem to
> increase any reference.

fd_install() transfers the reference to the fd table and that
reference is put back by the close() system call.

Notice that instantly after fd_install() a concurrent user can free
the filep.

Jason
Shameerali Kolothum Thodi Jan. 12, 2024, 5:46 p.m. UTC | #7
> -----Original Message-----
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Thursday, October 26, 2023 3:49 AM
> To: Jason Gunthorpe <jgg@ziepe.ca>; Kevin Tian <kevin.tian@intel.com>; Joerg
> Roedel <joro@8bytes.org>; Will Deacon <will@kernel.org>; Robin Murphy
> <robin.murphy@arm.com>; Jean-Philippe Brucker <jean-philippe@linaro.org>;
> Nicolin Chen <nicolinc@nvidia.com>; Yi Liu <yi.l.liu@intel.com>; Jacob Pan
> <jacob.jun.pan@linux.intel.com>
> Cc: iommu@lists.linux.dev; linux-kselftest@vger.kernel.org;
> virtualization@lists.linux-foundation.org; linux-kernel@vger.kernel.org; Lu
> Baolu <baolu.lu@linux.intel.com>
> Subject: [PATCH v2 4/6] iommufd: Deliver fault messages to user space
> 
[...]

Hi,

> +static ssize_t hwpt_fault_fops_write(struct file *filep,
> +				     const char __user *buf,
> +				     size_t count, loff_t *ppos)
> +{
> +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> +	struct hw_pgtable_fault *fault = filep->private_data;
> +	struct iommu_hwpt_page_response response;
> +	struct iommufd_hw_pagetable *hwpt;
> +	struct iopf_group *iter, *group;
> +	struct iommufd_device *idev;
> +	size_t done = 0;
> +	int rc = 0;
> +
> +	if (*ppos || count % response_size)
> +		return -ESPIPE;
> +
> +	mutex_lock(&fault->mutex);
> +	while (!list_empty(&fault->response) && count > done) {
> +		rc = copy_from_user(&response, buf + done, response_size);
> +		if (rc)
> +			break;
> +
> +		/* Get the device that this response targets at. */
> +		idev = container_of(iommufd_get_object(fault->ictx,
> +						       response.dev_id,
> +						       IOMMUFD_OBJ_DEVICE),
> +				    struct iommufd_device, obj);
> +		if (IS_ERR(idev)) {
> +			rc = PTR_ERR(idev);
> +			break;
> +		}
> +
> +		/*
> +		 * Get the hw page table that this response was generated for.
> +		 * It must match the one stored in the fault data.
> +		 */
> +		hwpt = container_of(iommufd_get_object(fault->ictx,
> +						       response.hwpt_id,
> +
> IOMMUFD_OBJ_HW_PAGETABLE),
> +				    struct iommufd_hw_pagetable, obj);
> +		if (IS_ERR(hwpt)) {
> +			iommufd_put_object(&idev->obj);
> +			rc = PTR_ERR(hwpt);
> +			break;
> +		}
> +
> +		if (hwpt != fault->hwpt) {
> +			rc = -EINVAL;
> +			goto put_obj;
> +		}
> +
> +		group = NULL;
> +		list_for_each_entry(iter, &fault->response, node) {
> +			if (response.grpid != iter->last_fault.fault.prm.grpid)
> +				continue;
> +
> +			if (idev->dev != iter->dev)
> +				continue;
> +
> +			if ((iter->last_fault.fault.prm.flags &
> +			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
> +			    response.pasid != iter->last_fault.fault.prm.pasid)
> +				continue;

I am trying to get vSVA working with this series and got hit by the above check.
On ARM platforms, page responses to stall events (CMD_RESUME) do not have
an associated pasid. I think we either need to check here using
IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID or remove the check,
as it will eventually be done in iommu_page_response().

Thanks,
Shameer
Jason Gunthorpe Jan. 15, 2024, 4:47 p.m. UTC | #8
On Fri, Jan 12, 2024 at 05:46:13PM +0000, Shameerali Kolothum Thodi wrote:
> 
> 
> > -----Original Message-----
> > From: Lu Baolu <baolu.lu@linux.intel.com>
> > Sent: Thursday, October 26, 2023 3:49 AM
> > To: Jason Gunthorpe <jgg@ziepe.ca>; Kevin Tian <kevin.tian@intel.com>; Joerg
> > Roedel <joro@8bytes.org>; Will Deacon <will@kernel.org>; Robin Murphy
> > <robin.murphy@arm.com>; Jean-Philippe Brucker <jean-philippe@linaro.org>;
> > Nicolin Chen <nicolinc@nvidia.com>; Yi Liu <yi.l.liu@intel.com>; Jacob Pan
> > <jacob.jun.pan@linux.intel.com>
> > Cc: iommu@lists.linux.dev; linux-kselftest@vger.kernel.org;
> > virtualization@lists.linux-foundation.org; linux-kernel@vger.kernel.org; Lu
> > Baolu <baolu.lu@linux.intel.com>
> > Subject: [PATCH v2 4/6] iommufd: Deliver fault messages to user space
> > 
> [...]
> 
> Hi,
> 
> > +static ssize_t hwpt_fault_fops_write(struct file *filep,
> > +				     const char __user *buf,
> > +				     size_t count, loff_t *ppos)
> > +{
> > +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> > +	struct hw_pgtable_fault *fault = filep->private_data;
> > +	struct iommu_hwpt_page_response response;
> > +	struct iommufd_hw_pagetable *hwpt;
> > +	struct iopf_group *iter, *group;
> > +	struct iommufd_device *idev;
> > +	size_t done = 0;
> > +	int rc = 0;
> > +
> > +	if (*ppos || count % response_size)
> > +		return -ESPIPE;
> > +
> > +	mutex_lock(&fault->mutex);
> > +	while (!list_empty(&fault->response) && count > done) {
> > +		rc = copy_from_user(&response, buf + done, response_size);
> > +		if (rc)
> > +			break;
> > +
> > +		/* Get the device that this response targets at. */
> > +		idev = container_of(iommufd_get_object(fault->ictx,
> > +						       response.dev_id,
> > +						       IOMMUFD_OBJ_DEVICE),
> > +				    struct iommufd_device, obj);
> > +		if (IS_ERR(idev)) {
> > +			rc = PTR_ERR(idev);
> > +			break;
> > +		}
> > +
> > +		/*
> > +		 * Get the hw page table that this response was generated for.
> > +		 * It must match the one stored in the fault data.
> > +		 */
> > +		hwpt = container_of(iommufd_get_object(fault->ictx,
> > +						       response.hwpt_id,
> > +
> > IOMMUFD_OBJ_HW_PAGETABLE),
> > +				    struct iommufd_hw_pagetable, obj);
> > +		if (IS_ERR(hwpt)) {
> > +			iommufd_put_object(&idev->obj);
> > +			rc = PTR_ERR(hwpt);
> > +			break;
> > +		}
> > +
> > +		if (hwpt != fault->hwpt) {
> > +			rc = -EINVAL;
> > +			goto put_obj;
> > +		}
> > +
> > +		group = NULL;
> > +		list_for_each_entry(iter, &fault->response, node) {
> > +			if (response.grpid != iter->last_fault.fault.prm.grpid)
> > +				continue;
> > +
> > +			if (idev->dev != iter->dev)
> > +				continue;
> > +
> > +			if ((iter->last_fault.fault.prm.flags &
> > +			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
> > +			    response.pasid != iter->last_fault.fault.prm.pasid)
> > +				continue;
> 
> I am trying to get vSVA working with this series and got hit by the above check.
> On ARM platforms, page responses to stall events(CMD_RESUME) do not have
> an associated pasid.  I think, either we need to check here using
> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID or remove the check 
> as it will be eventually done in iommu_page_response(). 

That doesn't sound right..

The PASID is the only information we have for userspace to identify
the domain that is being faulted. It cannot be optional on the request
side.

If it is valid when userspace does read() then it should be valid when
userspace does write() too.

It is the only way the kernel can actually match request and response
here.

So, I think you have a userspace issue to not provide the right
pasid??

Jason
Shameerali Kolothum Thodi Jan. 15, 2024, 5:44 p.m. UTC | #9
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Monday, January 15, 2024 4:47 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>; Kevin Tian <kevin.tian@intel.com>;
> Joerg Roedel <joro@8bytes.org>; Will Deacon <will@kernel.org>; Robin
> Murphy <robin.murphy@arm.com>; Jean-Philippe Brucker <jean-
> philippe@linaro.org>; Nicolin Chen <nicolinc@nvidia.com>; Yi Liu
> <yi.l.liu@intel.com>; Jacob Pan <jacob.jun.pan@linux.intel.com>;
> iommu@lists.linux.dev; linux-kselftest@vger.kernel.org;
> virtualization@lists.linux-foundation.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 4/6] iommufd: Deliver fault messages to user space
> 
> On Fri, Jan 12, 2024 at 05:46:13PM +0000, Shameerali Kolothum Thodi wrote:
> >
> >
> > > -----Original Message-----
> > > From: Lu Baolu <baolu.lu@linux.intel.com>
> > > Sent: Thursday, October 26, 2023 3:49 AM
> > > To: Jason Gunthorpe <jgg@ziepe.ca>; Kevin Tian <kevin.tian@intel.com>;
> Joerg
> > > Roedel <joro@8bytes.org>; Will Deacon <will@kernel.org>; Robin
> Murphy
> > > <robin.murphy@arm.com>; Jean-Philippe Brucker <jean-
> philippe@linaro.org>;
> > > Nicolin Chen <nicolinc@nvidia.com>; Yi Liu <yi.l.liu@intel.com>; Jacob
> Pan
> > > <jacob.jun.pan@linux.intel.com>
> > > Cc: iommu@lists.linux.dev; linux-kselftest@vger.kernel.org;
> > > virtualization@lists.linux-foundation.org; linux-kernel@vger.kernel.org;
> Lu
> > > Baolu <baolu.lu@linux.intel.com>
> > > Subject: [PATCH v2 4/6] iommufd: Deliver fault messages to user space
> > >
> > [...]
> >
> > Hi,
> >
> > > +static ssize_t hwpt_fault_fops_write(struct file *filep,
> > > +				     const char __user *buf,
> > > +				     size_t count, loff_t *ppos)
> > > +{
> > > +	size_t response_size = sizeof(struct iommu_hwpt_page_response);
> > > +	struct hw_pgtable_fault *fault = filep->private_data;
> > > +	struct iommu_hwpt_page_response response;
> > > +	struct iommufd_hw_pagetable *hwpt;
> > > +	struct iopf_group *iter, *group;
> > > +	struct iommufd_device *idev;
> > > +	size_t done = 0;
> > > +	int rc = 0;
> > > +
> > > +	if (*ppos || count % response_size)
> > > +		return -ESPIPE;
> > > +
> > > +	mutex_lock(&fault->mutex);
> > > +	while (!list_empty(&fault->response) && count > done) {
> > > +		rc = copy_from_user(&response, buf + done, response_size);
> > > +		if (rc)
> > > +			break;
> > > +
> > > +		/* Get the device that this response targets at. */
> > > +		idev = container_of(iommufd_get_object(fault->ictx,
> > > +						       response.dev_id,
> > > +						       IOMMUFD_OBJ_DEVICE),
> > > +				    struct iommufd_device, obj);
> > > +		if (IS_ERR(idev)) {
> > > +			rc = PTR_ERR(idev);
> > > +			break;
> > > +		}
> > > +
> > > +		/*
> > > +		 * Get the hw page table that this response was generated
> for.
> > > +		 * It must match the one stored in the fault data.
> > > +		 */
> > > +		hwpt = container_of(iommufd_get_object(fault->ictx,
> > > +						       response.hwpt_id,
> > > +
> > > IOMMUFD_OBJ_HW_PAGETABLE),
> > > +				    struct iommufd_hw_pagetable, obj);
> > > +		if (IS_ERR(hwpt)) {
> > > +			iommufd_put_object(&idev->obj);
> > > +			rc = PTR_ERR(hwpt);
> > > +			break;
> > > +		}
> > > +
> > > +		if (hwpt != fault->hwpt) {
> > > +			rc = -EINVAL;
> > > +			goto put_obj;
> > > +		}
> > > +
> > > +		group = NULL;
> > > +		list_for_each_entry(iter, &fault->response, node) {
> > > +			if (response.grpid != iter->last_fault.fault.prm.grpid)
> > > +				continue;
> > > +
> > > +			if (idev->dev != iter->dev)
> > > +				continue;
> > > +
> > > +			if ((iter->last_fault.fault.prm.flags &
> > > +			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
> &&
> > > +			    response.pasid != iter->last_fault.fault.prm.pasid)
> > > +				continue;
> >
> > I am trying to get vSVA working with this series and got hit by the above
> check.
> > On ARM platforms, page responses to stall events(CMD_RESUME) do not
> have
> > an associated pasid.  I think, either we need to check here using
> > IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID or remove the check
> > as it will be eventually done in iommu_page_response().
> 
> That doesn't sound right..
> 
> The PASID is the only information we have for userspace to identify
> the domain that is being faulted. It cannot be optional on the request
> side.
> 

Yes, it is valid on the request side. But this is on the response side.

> If it is valid when userspace does read() then it should be valid when
> userspace does write() too.
> 
> It is the only way the kernel can actually match request and response
> here.

The kernel currently checks the pasid only if IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID
is set.

https://lore.kernel.org/linux-iommu/20200616144712.748818-1-jean-philippe@linaro.org/

> So, I think you have a userspace issue to not provide the right
> pasid??

This applies not just to the ARM stall resume case, but to some PCI devices as well, as per
the above commit log. So do we really need to track this in userspace?

Thanks,
Shameer
Jason Gunthorpe Jan. 15, 2024, 5:58 p.m. UTC | #10
On Mon, Jan 15, 2024 at 05:44:13PM +0000, Shameerali Kolothum Thodi wrote:

> > If it is valid when userspace does read() then it should be valid when
> > userspace does write() too.
> > 
> > It is the only way the kernel can actually match request and response
> > here.
> 
> The kernel currently checks the pasid only if IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID
> is set.
> 
> https://lore.kernel.org/linux-iommu/20200616144712.748818-1-jean-philippe@linaro.org/
> 
> > So, I think you have a userspace issue to not provide the right
> > pasid??
> 
> This is not just ARM stall resume case, but for some PCI devices as well as per
> the above commit log. So do we really need to track this in userspace ?

Yes, these weird HW details should not leak into userspace.

The PASID is required on the read() side, userspace should provide it
on the write() side. It is trivial for it to do, there is no reason to
accommodate anything else.

Alternatively I'm wondering if we should supply a serial number to
userspace so it can match the request/response instead of relying on
guessing based on pasid/grpid?

Jason
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 0dbaa2dc5b22..ff063bc48150 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -237,6 +237,8 @@  struct hw_pgtable_fault {
 	struct mutex mutex;
 	struct list_head deliver;
 	struct list_head response;
+	struct file *fault_file;
+	int fault_fd;
 };
 
 /*
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 9f94c824cf86..f0aac1bb2d2d 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -3,6 +3,8 @@ 
  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
  */
 #include <linux/iommu.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
 #include <uapi/linux/iommufd.h>
 
 #include "../iommu-priv.h"
@@ -38,9 +40,198 @@  static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj)
 	refcount_dec(&hwpt->ioas->obj.users);
 }
 
+/* Build the uAPI fault record; members not set explicitly end up zeroed. */
+static int iommufd_compose_fault_message(struct iommu_fault *fault,
+					 struct iommu_hwpt_pgfault *hwpt_fault,
+					 struct device *dev)
+{
+	struct iommufd_device *idev = iopf_pasid_cookie_get(dev, IOMMU_NO_PASID);
+
+	if (!idev)
+		return -ENODEV;
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	*hwpt_fault = (struct iommu_hwpt_pgfault) {
+		.size = sizeof(*hwpt_fault),
+		.flags = fault->prm.flags,
+		.dev_id = idev->obj.id,
+		.pasid = fault->prm.pasid,
+		.grpid = fault->prm.grpid,
+		.perm = fault->prm.perm,
+		.addr = fault->prm.addr,
+		.private_data = { fault->prm.private_data[0], fault->prm.private_data[1] },
+	};
+	return 0;
+}
+
+/*
+ * read(): hand pending fault groups to userspace, whole groups only, and
+ * park each delivered group on the response list until it is answered.
+ */
+static ssize_t hwpt_fault_fops_read(struct file *filep, char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+	struct hw_pgtable_fault *fault = filep->private_data;
+	struct iommu_hwpt_pgfault data;
+	struct iopf_group *group;
+	struct iopf_fault *iopf;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % fault_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->deliver) && count > done) {
+		group = list_first_entry(&fault->deliver,
+					 struct iopf_group, node);
+		if (list_count_nodes(&group->faults) * fault_size > count - done)
+			break;
+
+		list_for_each_entry(iopf, &group->faults, list) {
+			rc = iommufd_compose_fault_message(&iopf->fault,
+							   &data, group->dev);
+			if (rc)
+				goto out_unlock;
+			if (copy_to_user(buf + done, &data, fault_size)) {
+				rc = -EFAULT;	/* it returns bytes left, not an errno */
+				goto out_unlock;
+			}
+			done += fault_size;
+		}
+		list_move_tail(&group->node, &fault->response);
+	}
+out_unlock:
+	mutex_unlock(&fault->mutex);
+	return rc ? rc : done;
+}
+
+/*
+ * write(): consume an array of page responses. Each entry is matched to a
+ * delivered fault group by device, grpid and (if valid) pasid, then completed.
+ */
+static ssize_t hwpt_fault_fops_write(struct file *filep,
+				     const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	size_t response_size = sizeof(struct iommu_hwpt_page_response);
+	struct hw_pgtable_fault *fault = filep->private_data;
+	struct iommu_hwpt_page_response response;
+	struct iommufd_hw_pagetable *hwpt;
+	struct iopf_group *iter, *group;
+	struct iommufd_device *idev;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos || count % response_size)
+		return -ESPIPE;
+
+	mutex_lock(&fault->mutex);
+	while (!list_empty(&fault->response) && count > done) {
+		/* copy_from_user() returns bytes left over, not an errno. */
+		if (copy_from_user(&response, buf + done, response_size)) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* Get the device that this response targets at. */
+		idev = container_of(iommufd_get_object(fault->ictx,
+						       response.dev_id,
+						       IOMMUFD_OBJ_DEVICE),
+				    struct iommufd_device, obj);
+		if (IS_ERR(idev)) {
+			rc = PTR_ERR(idev);
+			break;
+		}
+
+		/*
+		 * Get the hw page table that this response was generated for.
+		 * It must match the one stored in the fault data.
+		 */
+		hwpt = container_of(iommufd_get_object(fault->ictx,
+						       response.hwpt_id,
+						       IOMMUFD_OBJ_HW_PAGETABLE),
+				    struct iommufd_hw_pagetable, obj);
+		if (IS_ERR(hwpt)) {
+			iommufd_put_object(&idev->obj);
+			rc = PTR_ERR(hwpt);
+			break;
+		}
+		if (hwpt != fault->hwpt) {
+			rc = -EINVAL;
+			goto put_obj;
+		}
+
+		group = NULL;
+		list_for_each_entry(iter, &fault->response, node) {
+			if (response.grpid != iter->last_fault.fault.prm.grpid)
+				continue;
+			if (idev->dev != iter->dev)
+				continue;
+			if ((iter->last_fault.fault.prm.flags &
+			     IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+			    response.pasid != iter->last_fault.fault.prm.pasid)
+				continue;
+			group = iter;
+			break;
+		}
+
+		if (!group) {
+			rc = -ENODEV;
+			goto put_obj;
+		}
+		rc = iopf_group_response(group, response.code);
+		if (rc)
+			goto put_obj;
+		list_del(&group->node);
+		iopf_free_group(group);
+		done += response_size;
+put_obj:
+		iommufd_put_object(&hwpt->obj);
+		iommufd_put_object(&idev->obj);
+		if (rc)
+			break;
+	}
+	mutex_unlock(&fault->mutex);
+
+	return (rc < 0) ? rc : done;
+}
+
+static const struct file_operations hwpt_fault_fops = {
+	.owner		= THIS_MODULE,
+	.read		= hwpt_fault_fops_read,		/* delivers queued faults */
+	.write		= hwpt_fault_fops_write,	/* takes page responses */
+};
+
+/* Create the anonymous fault file for @fault and install it on a fresh fd. */
+static int hw_pagetable_get_fault_fd(struct hw_pgtable_fault *fault)
+{
+	int fd = get_unused_fd_flags(O_CLOEXEC);
+	struct file *file;
+
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("[iommufd-pgfault]", &hwpt_fault_fops,
+				  fault, O_RDWR);
+	if (IS_ERR(file))
+		goto err_put_fd;
+
+	fd_install(fd, file);
+	fault->fault_file = file;
+	fault->fault_fd = fd;
+	return 0;
+err_put_fd:
+	put_unused_fd(fd);
+	return PTR_ERR(file);
+}
+
 static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
 {
 	struct hw_pgtable_fault *fault;
+	int rc;
 
 	fault = kzalloc(sizeof(*fault), GFP_KERNEL);
 	if (!fault)
@@ -50,6 +241,12 @@  static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
 	INIT_LIST_HEAD(&fault->response);
 	mutex_init(&fault->mutex);
 
+	rc = hw_pagetable_get_fault_fd(fault);
+	if (rc) {
+		kfree(fault);
+		return ERR_PTR(rc);
+	}
+
 	return fault;
 }
 
@@ -58,6 +255,8 @@  static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
 	WARN_ON(!list_empty(&fault->deliver));
 	WARN_ON(!list_empty(&fault->response));
 
+	fput(fault->fault_file);
+	put_unused_fd(fault->fault_fd);
 	kfree(fault);
 }
 
@@ -347,7 +546,9 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 	struct mutex *mutex;
 	int rc;
 
-	if (cmd->flags & ~IOMMU_HWPT_ALLOC_NEST_PARENT || cmd->__reserved)
+	if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+			    IOMMU_HWPT_ALLOC_IOPF_CAPABLE)) ||
+	    cmd->__reserved)
 		return -EOPNOTSUPP;
 	if (!cmd->data_len && cmd->hwpt_type != IOMMU_HWPT_TYPE_DEFAULT)
 		return -EINVAL;
@@ -416,6 +617,7 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 		hwpt->fault->hwpt = hwpt;
 		hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler;
 		hwpt->domain->fault_data = hwpt;
+		cmd->out_fault_fd = hwpt->fault->fault_fd;
 	}
 
 	cmd->out_hwpt_id = hwpt->obj.id;