diff mbox series

[v8,21/24] vfio: Add VFIO_DEVICE_BIND_IOMMUFD

Message ID 20230327094047.47215-22-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Add vfio_device cdev for iommufd support | expand

Commit Message

Yi Liu March 27, 2023, 9:40 a.m. UTC
This adds ioctl for userspace to bind device cdev fd to iommufd.

    VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
			      control provided by the iommufd. open_device
			      op is called after bind_iommufd op.
			      VFIO no iommu mode is indicated by passing
			      a negative iommufd value.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/device_cdev.c | 153 +++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio.h        |  13 ++++
 drivers/vfio/vfio_main.c   |   5 ++
 include/uapi/linux/vfio.h  |  37 +++++++++
 4 files changed, 208 insertions(+)

Comments

Alex Williamson March 29, 2023, 9 p.m. UTC | #1
On Mon, 27 Mar 2023 02:40:44 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> This adds ioctl for userspace to bind device cdev fd to iommufd.
> 
>     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> 			      control provided by the iommufd. open_device
> 			      op is called after bind_iommufd op.
> 			      VFIO no iommu mode is indicated by passing
> 			      a negative iommufd value.
> 
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> Tested-by: Terrence Xu <terrence.xu@intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/device_cdev.c | 153 +++++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio.h        |  13 ++++
>  drivers/vfio/vfio_main.c   |   5 ++
>  include/uapi/linux/vfio.h  |  37 +++++++++
>  4 files changed, 208 insertions(+)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index 1c640016a824..2b563bac50b9 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -3,6 +3,7 @@
>   * Copyright (c) 2023 Intel Corporation.
>   */
>  #include <linux/vfio.h>
> +#include <linux/iommufd.h>
>  
>  #include "vfio.h"
>  
> @@ -44,6 +45,158 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>  	return ret;
>  }
>  
> +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> +{
> +	spin_lock(&df->kvm_ref_lock);
> +	if (df->kvm)
> +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> +	spin_unlock(&df->kvm_ref_lock);
> +}
> +
> +void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +	struct vfio_device *device = df->device;
> +
> +	/*
> +	 * As df->access_granted writer is under dev_set->lock as well,
> +	 * so this read no need to use smp_load_acquire() to pair with

Nit, "no need to use" -> "does not need to use"

> +	 * smp_store_release() in the caller of vfio_device_open().
> +	 */
> +	if (!df->access_granted)
> +		return;
> +

Isn't the lock we're acquiring below the one that we claim to have in
the comment above to make the non-smp_load_acquire() test safe?

> +	mutex_lock(&device->dev_set->lock);
> +	vfio_device_close(df);
> +	vfio_device_put_kvm(device);
> +	if (df->iommufd)
> +		iommufd_ctx_put(df->iommufd);
> +	mutex_unlock(&device->dev_set->lock);
> +	vfio_device_unblock_group(device);
> +}
> +
> +static int vfio_device_cdev_enable_noiommu(struct vfio_device *device)
> +{
> +	if (!capable(CAP_SYS_RAWIO))
> +		return -EPERM;
> +
> +	if (!device->noiommu)
> +		return -EINVAL;
> +
> +	return 0;
> +}

This is testing, not enabling. ie. naming nit.

> +
> +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> +{
> +	struct fd f;
> +	struct iommufd_ctx *iommufd;
> +
> +	f = fdget(fd);
> +	if (!f.file)
> +		return ERR_PTR(-EBADF);
> +
> +	iommufd = iommufd_ctx_from_file(f.file);
> +
> +	fdput(f);
> +	return iommufd;
> +}
> +
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    struct vfio_device_bind_iommufd __user *arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_bind_iommufd bind;
> +	struct iommufd_ctx *iommufd = NULL;
> +	unsigned long minsz;
> +	int ret;
> +
> +	static_assert(__same_type(arg->out_devid, bind.out_devid));

They're the same field in the same structure, how could they be
otherwise?

> +
> +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> +
> +	if (copy_from_user(&bind, arg, minsz))
> +		return -EFAULT;
> +
> +	if (bind.argsz < minsz || bind.flags)
> +		return -EINVAL;
> +
> +	if (!device->ops->bind_iommufd)
> +		return -ENODEV;

This test seems beyond normal paranoia since we test in
__vfio_register_dev()

> +
> +	/* BIND_IOMMUFD only allowed for cdev fds */
> +	if (df->group)
> +		return -EINVAL;
> +
> +	ret = vfio_device_block_group(device);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/* one device cannot be bound twice */
> +	if (df->access_granted) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/* iommufd < 0 means noiommu mode */
> +	if (bind.iommufd < 0) {
> +		ret = vfio_device_cdev_enable_noiommu(device);
> +		if (ret)
> +			goto out_unlock;
> +	} else {
> +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> +		if (IS_ERR(iommufd)) {
> +			ret = PTR_ERR(iommufd);
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * Before the device open, get the KVM pointer currently
> +	 * associated with the device file (if there is) and obtain
> +	 * a reference.  This reference is held until device closed.
> +	 * Save the pointer in the device for use by drivers.
> +	 */
> +	vfio_device_get_kvm_safe(df);
> +
> +	df->iommufd = iommufd;
> +	ret = vfio_device_open(df);
> +	if (ret)
> +		goto out_put_kvm;
> +
> +	if (df->iommufd)
> +		bind.out_devid = df->devid;

How about only setting df->iommufd in the non-noiommu case above so
it's not confusing that it was just set 4 lines previous.  That also
allows the iommufd pointer to be scoped within that branch and not
require initialization.  It might make sense to declare:

	bool is_noiommu = (bind.iommufd < 0);

and use it consistently rather than alternating testing between
bind.iommufd and df->iommufd.

> +
> +	ret = copy_to_user(&arg->out_devid, &bind.out_devid,
> +			   sizeof(bind.out_devid)) ? -EFAULT : 0;

In the noiommu case, this copies back the input value, shouldn't it be
some known invalid value?  Seems confusing.

> +	if (ret)
> +		goto out_close_device;
> +
> +	if (bind.iommufd < 0)
> +		dev_warn(device->dev, "device is bound to vfio-noiommu by user "
> +			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> +
> +	/*
> +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> +	 * read/write/mmap
> +	 */
> +	smp_store_release(&df->access_granted, true);
> +	mutex_unlock(&device->dev_set->lock);
> +
> +	return 0;
> +
> +out_close_device:
> +	vfio_device_close(df);
> +out_put_kvm:
> +	df->iommufd = NULL;
> +	vfio_device_put_kvm(device);
> +	if (iommufd)
> +		iommufd_ctx_put(iommufd);
> +out_unlock:
> +	mutex_unlock(&device->dev_set->lock);
> +	vfio_device_unblock_group(device);
> +	return ret;
> +}
> +
>  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
>  {
>  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 3a8fd0e32f59..ace3d52b0928 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -281,6 +281,9 @@ static inline void vfio_device_del(struct vfio_device *device)
>  
>  void vfio_init_device_cdev(struct vfio_device *device);
>  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
> +void vfio_device_cdev_close(struct vfio_device_file *df);
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    struct vfio_device_bind_iommufd __user *arg);
>  int vfio_cdev_init(struct class *device_class);
>  void vfio_cdev_cleanup(void);
>  #else
> @@ -304,6 +307,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode,
>  	return 0;
>  }
>  
> +static inline void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +}
> +
> +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +						  struct vfio_device_bind_iommufd __user *arg)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>  static inline int vfio_cdev_init(struct class *device_class)
>  {
>  	return 0;
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 58fc3bb768f2..375086c8803f 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -559,6 +559,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
>  
>  	if (df->group)
>  		vfio_device_group_close(df);
> +	else
> +		vfio_device_cdev_close(df);
>  
>  	vfio_device_put_registration(device);
>  
> @@ -1132,6 +1134,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
>  	struct vfio_device *device = df->device;
>  	int ret;
>  
> +	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
> +		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
> +
>  	/* Paired with smp_store_release() following vfio_device_open() */
>  	if (!smp_load_acquire(&df->access_granted))
>  		return -EINVAL;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 61b801dfd40b..62b2f2497525 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -194,6 +194,43 @@ struct vfio_group_status {
>  
>  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
>  
> +/*
> + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> + *				   struct vfio_device_bind_iommufd)
> + *
> + * Bind a vfio_device to the specified iommufd.
> + *
> + * The user should provide a device cookie when calling this ioctl. The
> + * cookie is carried only in event e.g. I/O fault reported to userspace
> + * via iommufd. The user should use devid returned by this ioctl to mark
> + * the target device in other ioctls (e.g. iommu hardware infomration query
> + * via iommufd, and etc.).

AFAICT, the whole concept of this dev_cookie is a fantasy.  It only
exists in this series in these comments and the structure below.  It's
not even defined whether it needs to be unique within an iommufd
context, and clearly nothing here validates that.  There's not enough
implementation for it to exist in this series.  Maybe dev_cookie is
appended to the end of the structure and indicated with a flag when it
has some meaning.

> + *
> + * User is not allowed to access the device before the binding operation
> + * is completed.

s/not allowed to access/restricted from accessing/

> + *
> + * Unbind is automatically conducted when device fd is closed.
> + *
> + * @argsz:	 user filled size of this data.
> + * @flags:	 reserved for future extension.
> + * @dev_cookie:	 a per device cookie provided by userspace.
> + * @iommufd:	 iommufd to bind. a negative value means noiommu.

"Use a negative value for no-iommu, where supported", or better, should
we define this explicitly as -1, or why not use a flag bit to specify
no-iommu?  Maybe minsz is only through flags for the noiommu use case.
Thanks,

Alex

> + * @out_devid:	 the device id generated by this bind. This field is valid
> + *		as long as the input @iommufd is valid. Otherwise, it is
> + *		meaningless.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_bind_iommufd {
> +	__u32		argsz;
> +	__u32		flags;
> +	__aligned_u64	dev_cookie;
> +	__s32		iommufd;
> +	__u32		out_devid;
> +};
> +
> +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
> +
>  /**
>   * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
>   *						struct vfio_device_info)
Jason Gunthorpe March 29, 2023, 11:22 p.m. UTC | #2
On Wed, Mar 29, 2023 at 03:00:55PM -0600, Alex Williamson wrote:

> > + * The user should provide a device cookie when calling this ioctl. The
> > + * cookie is carried only in event e.g. I/O fault reported to userspace
> > + * via iommufd. The user should use devid returned by this ioctl to mark
> > + * the target device in other ioctls (e.g. iommu hardware infomration query
> > + * via iommufd, and etc.).
> 
> AFAICT, the whole concept of this dev_cookie is a fantasy.  It only
> exists in this series in these comments and the structure below.  It's
> not even defined whether it needs to be unique within an iommufd
> context, and clearly nothing here validates that.  There's not enough
> implementation for it to exist in this series.  Maybe dev_cookie is
> appended to the end of the structure and indicated with a flag when it
> has some meaning.

Yes, I've asked for this to be punted to the PRI series enough times
already, why does it keep coming back ??

> > + * @argsz:	 user filled size of this data.
> > + * @flags:	 reserved for future extension.
> > + * @dev_cookie:	 a per device cookie provided by userspace.
> > + * @iommufd:	 iommufd to bind. a negative value means noiommu.
> 
> "Use a negative value for no-iommu, where supported", or better, should
> we define this explicitly as -1, or why not use a flag bit to specify
> no-iommu?  Maybe minsz is only through flags for the noiommu use case.

I was happy enough for this to be defined as -1. We could give it a
formal sounding constant too

Jason
Yi Liu March 30, 2023, 7:09 a.m. UTC | #3
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, March 30, 2023 5:01 AM
> 
> On Mon, 27 Mar 2023 02:40:44 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This adds ioctl for userspace to bind device cdev fd to iommufd.
> >
> >     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> > 			      control provided by the iommufd. open_device
> > 			      op is called after bind_iommufd op.
> > 			      VFIO no iommu mode is indicated by passing
> > 			      a negative iommufd value.
> >
> > Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> > Tested-by: Terrence Xu <terrence.xu@intel.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/device_cdev.c | 153 +++++++++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio.h        |  13 ++++
> >  drivers/vfio/vfio_main.c   |   5 ++
> >  include/uapi/linux/vfio.h  |  37 +++++++++
> >  4 files changed, 208 insertions(+)
> >
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index 1c640016a824..2b563bac50b9 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -3,6 +3,7 @@
> >   * Copyright (c) 2023 Intel Corporation.
> >   */
> >  #include <linux/vfio.h>
> > +#include <linux/iommufd.h>
> >
> >  #include "vfio.h"
> >
> > @@ -44,6 +45,158 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct
> file *filep)
> >  	return ret;
> >  }
> >
> > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > +{
> > +	spin_lock(&df->kvm_ref_lock);
> > +	if (df->kvm)
> > +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> > +	spin_unlock(&df->kvm_ref_lock);
> > +}
> > +
> > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +	struct vfio_device *device = df->device;
> > +
> > +	/*
> > +	 * As df->access_granted writer is under dev_set->lock as well,
> > +	 * so this read no need to use smp_load_acquire() to pair with
> 
> Nit, "no need to use" -> "does not need to use"

got it.

> 
> > +	 * smp_store_release() in the caller of vfio_device_open().
> > +	 */
> > +	if (!df->access_granted)
> > +		return;
> > +
> 
> Isn't the lock we're acquiring below the one that we claim to have in
> the comment above to make the non-smp_load_acquire() test safe?

the comment may be not accurate enough. The the non-smp_load_acquire()
and no lock test were according to the below two remarks in v4 and v5.

https://lore.kernel.org/kvm/Y%2FYRx7jLuyEoLxZg@nvidia.com/
https://lore.kernel.org/kvm/Y%2F0CV1K0YNHA+olf@nvidia.com/

Perhaps the comment should be:

"In the time of close, there is no contention with another one
  changing this flag. So test df->access_granted without lock
  nor smp_load_acquire() is ok."

> > +	mutex_lock(&device->dev_set->lock);
> > +	vfio_device_close(df);
> > +	vfio_device_put_kvm(device);
> > +	if (df->iommufd)
> > +		iommufd_ctx_put(df->iommufd);
> > +	mutex_unlock(&device->dev_set->lock);
> > +	vfio_device_unblock_group(device);
> > +}
> > +
> > +static int vfio_device_cdev_enable_noiommu(struct vfio_device *device)
> > +{
> > +	if (!capable(CAP_SYS_RAWIO))
> > +		return -EPERM;
> > +
> > +	if (!device->noiommu)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> 
> This is testing, not enabling. ie. naming nit.

how about probe_noiommu or test_noiommu?

> 
> > +
> > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> > +{
> > +	struct fd f;
> > +	struct iommufd_ctx *iommufd;
> > +
> > +	f = fdget(fd);
> > +	if (!f.file)
> > +		return ERR_PTR(-EBADF);
> > +
> > +	iommufd = iommufd_ctx_from_file(f.file);
> > +
> > +	fdput(f);
> > +	return iommufd;
> > +}
> > +
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    struct vfio_device_bind_iommufd __user *arg)
> > +{
> > +	struct vfio_device *device = df->device;
> > +	struct vfio_device_bind_iommufd bind;
> > +	struct iommufd_ctx *iommufd = NULL;
> > +	unsigned long minsz;
> > +	int ret;
> > +
> > +	static_assert(__same_type(arg->out_devid, bind.out_devid));
> 
> They're the same field in the same structure, how could they be
> otherwise?

@Jason, should I remove this check?

> > +
> > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > +
> > +	if (copy_from_user(&bind, arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (bind.argsz < minsz || bind.flags)
> > +		return -EINVAL;
> > +
> > +	if (!device->ops->bind_iommufd)
> > +		return -ENODEV;
> 
> This test seems beyond normal paranoia since we test in
> __vfio_register_dev()

yes. The whole c file depends on VFIO_DEVICE_CDEV which
depends on IOMMUFD, and if IOMMUFD is enabled,
__vfio_register_dev() already checks this callback.

> 
> > +
> > +	/* BIND_IOMMUFD only allowed for cdev fds */
> > +	if (df->group)
> > +		return -EINVAL;
> > +
> > +	ret = vfio_device_block_group(device);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/* one device cannot be bound twice */
> > +	if (df->access_granted) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/* iommufd < 0 means noiommu mode */
> > +	if (bind.iommufd < 0) {
> > +		ret = vfio_device_cdev_enable_noiommu(device);
> > +		if (ret)
> > +			goto out_unlock;
> > +	} else {
> > +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > +		if (IS_ERR(iommufd)) {
> > +			ret = PTR_ERR(iommufd);
> > +			goto out_unlock;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Before the device open, get the KVM pointer currently
> > +	 * associated with the device file (if there is) and obtain
> > +	 * a reference.  This reference is held until device closed.
> > +	 * Save the pointer in the device for use by drivers.
> > +	 */
> > +	vfio_device_get_kvm_safe(df);
> > +
> > +	df->iommufd = iommufd;
> > +	ret = vfio_device_open(df);
> > +	if (ret)
> > +		goto out_put_kvm;
> > +
> > +	if (df->iommufd)
> > +		bind.out_devid = df->devid;
> 
> How about only setting df->iommufd in the non-noiommu case above so
> it's not confusing that it was just set 4 lines previous.  That also
> allows the iommufd pointer to be scoped within that branch and not
> require initialization.  It might make sense to declare:
> 
> 	bool is_noiommu = (bind.iommufd < 0);
> 
> and use it consistently rather than alternating testing between
> bind.iommufd and df->iommufd.

sure.

> > +
> > +	ret = copy_to_user(&arg->out_devid, &bind.out_devid,
> > +			   sizeof(bind.out_devid)) ? -EFAULT : 0;
> 
> In the noiommu case, this copies back the input value, shouldn't it be
> some known invalid value?  Seems confusing.

maybe just do copy for the non-noiommu case?

> > +	if (ret)
> > +		goto out_close_device;
> > +
> > +	if (bind.iommufd < 0)
> > +		dev_warn(device->dev, "device is bound to vfio-noiommu by user "
> > +			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> > +
> > +	/*
> > +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> > +	 * read/write/mmap
> > +	 */
> > +	smp_store_release(&df->access_granted, true);
> > +	mutex_unlock(&device->dev_set->lock);
> > +
> > +	return 0;
> > +
> > +out_close_device:
> > +	vfio_device_close(df);
> > +out_put_kvm:
> > +	df->iommufd = NULL;
> > +	vfio_device_put_kvm(device);
> > +	if (iommufd)
> > +		iommufd_ctx_put(iommufd);
> > +out_unlock:
> > +	mutex_unlock(&device->dev_set->lock);
> > +	vfio_device_unblock_group(device);
> > +	return ret;
> > +}
> > +
> >  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
> >  {
> >  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > index 3a8fd0e32f59..ace3d52b0928 100644
> > --- a/drivers/vfio/vfio.h
> > +++ b/drivers/vfio/vfio.h
> > @@ -281,6 +281,9 @@ static inline void vfio_device_del(struct vfio_device *device)
> >
> >  void vfio_init_device_cdev(struct vfio_device *device);
> >  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
> > +void vfio_device_cdev_close(struct vfio_device_file *df);
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    struct vfio_device_bind_iommufd __user *arg);
> >  int vfio_cdev_init(struct class *device_class);
> >  void vfio_cdev_cleanup(void);
> >  #else
> > @@ -304,6 +307,16 @@ static inline int vfio_device_fops_cdev_open(struct inode
> *inode,
> >  	return 0;
> >  }
> >
> > +static inline void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +}
> > +
> > +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +						  struct vfio_device_bind_iommufd
> __user *arg)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> > +
> >  static inline int vfio_cdev_init(struct class *device_class)
> >  {
> >  	return 0;
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index 58fc3bb768f2..375086c8803f 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -559,6 +559,8 @@ static int vfio_device_fops_release(struct inode *inode,
> struct file *filep)
> >
> >  	if (df->group)
> >  		vfio_device_group_close(df);
> > +	else
> > +		vfio_device_cdev_close(df);
> >
> >  	vfio_device_put_registration(device);
> >
> > @@ -1132,6 +1134,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
> >  	struct vfio_device *device = df->device;
> >  	int ret;
> >
> > +	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
> > +		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
> > +
> >  	/* Paired with smp_store_release() following vfio_device_open() */
> >  	if (!smp_load_acquire(&df->access_granted))
> >  		return -EINVAL;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 61b801dfd40b..62b2f2497525 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -194,6 +194,43 @@ struct vfio_group_status {
> >
> >  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
> >
> > +/*
> > + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> > + *				   struct vfio_device_bind_iommufd)
> > + *
> > + * Bind a vfio_device to the specified iommufd.
> > + *
> > + * The user should provide a device cookie when calling this ioctl. The
> > + * cookie is carried only in event e.g. I/O fault reported to userspace
> > + * via iommufd. The user should use devid returned by this ioctl to mark
> > + * the target device in other ioctls (e.g. iommu hardware infomration query
> > + * via iommufd, and etc.).
> 
> AFAICT, the whole concept of this dev_cookie is a fantasy.  It only
> exists in this series in these comments and the structure below.  It's
> not even defined whether it needs to be unique within an iommufd
> context, and clearly nothing here validates that.  There's not enough
> implementation for it to exist in this series.  Maybe dev_cookie is
> appended to the end of the structure and indicated with a flag when it
> has some meaning.

sorry, I should have deleted it. ☹

> 
> > + *
> > + * User is not allowed to access the device before the binding operation
> > + * is completed.
> 
> s/not allowed to access/restricted from accessing/

got it.

> 
> > + *
> > + * Unbind is automatically conducted when device fd is closed.
> > + *
> > + * @argsz:	 user filled size of this data.
> > + * @flags:	 reserved for future extension.
> > + * @dev_cookie:	 a per device cookie provided by userspace.
> > + * @iommufd:	 iommufd to bind. a negative value means noiommu.
> 
> "Use a negative value for no-iommu, where supported", or better, should
> we define this explicitly as -1, or why not use a flag bit to specify
> no-iommu?  Maybe minsz is only through flags for the noiommu use case.
> Thanks,

I don’t have preference here. maybe using -1 can save a flag bit for future
extension. 

> 
> > + * @out_devid:	 the device id generated by this bind. This field is valid
> > + *		as long as the input @iommufd is valid. Otherwise, it is
> > + *		meaningless.
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +struct vfio_device_bind_iommufd {
> > +	__u32		argsz;
> > +	__u32		flags;
> > +	__aligned_u64	dev_cookie;
> > +	__s32		iommufd;
> > +	__u32		out_devid;
> > +};
> > +
> > +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
> > +
> >  /**
> >   * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
> >   *						struct vfio_device_info)

Regards,
Yi Liu
Jason Gunthorpe March 30, 2023, 11:52 a.m. UTC | #4
On Thu, Mar 30, 2023 at 07:09:31AM +0000, Liu, Yi L wrote:

> > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +				    struct vfio_device_bind_iommufd __user *arg)
> > > +{
> > > +	struct vfio_device *device = df->device;
> > > +	struct vfio_device_bind_iommufd bind;
> > > +	struct iommufd_ctx *iommufd = NULL;
> > > +	unsigned long minsz;
> > > +	int ret;
> > > +
> > > +	static_assert(__same_type(arg->out_devid, bind.out_devid));
> > 
> > They're the same field in the same structure, how could they be
> > otherwise?
> 
> @Jason, should I remove this check?

Yes, it was from something that looked very different from this

Jason
Yi Liu March 30, 2023, 12:52 p.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 30, 2023 7:22 AM
> 
> On Wed, Mar 29, 2023 at 03:00:55PM -0600, Alex Williamson wrote:
> 
> > > + * The user should provide a device cookie when calling this ioctl. The
> > > + * cookie is carried only in event e.g. I/O fault reported to userspace
> > > + * via iommufd. The user should use devid returned by this ioctl to mark
> > > + * the target device in other ioctls (e.g. iommu hardware infomration query
> > > + * via iommufd, and etc.).
> >
> > AFAICT, the whole concept of this dev_cookie is a fantasy.  It only
> > exists in this series in these comments and the structure below.  It's
> > not even defined whether it needs to be unique within an iommufd
> > context, and clearly nothing here validates that.  There's not enough
> > implementation for it to exist in this series.  Maybe dev_cookie is
> > appended to the end of the structure and indicated with a flag when it
> > has some meaning.
> 
> Yes, I've asked for this to be punted to the PRI series enough times
> already, why does it keep coming back ??

yes, I promise to remove it in next version.

> > > + * @argsz:	 user filled size of this data.
> > > + * @flags:	 reserved for future extension.
> > > + * @dev_cookie:	 a per device cookie provided by userspace.
> > > + * @iommufd:	 iommufd to bind. a negative value means noiommu.
> >
> > "Use a negative value for no-iommu, where supported", or better, should
> > we define this explicitly as -1, or why not use a flag bit to specify
> > no-iommu?  Maybe minsz is only through flags for the noiommu use case.
> 
> I was happy enough for this to be defined as -1. We could give it a
> formal sounding constant too

are you suggesting having something like "#define VFIO_NOIOMMU_FD	-1"?

Regards,
Yi Liu
Yi Liu March 30, 2023, 12:53 p.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 30, 2023 7:52 PM
> 
> On Thu, Mar 30, 2023 at 07:09:31AM +0000, Liu, Yi L wrote:
> 
> > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > > +				    struct vfio_device_bind_iommufd __user *arg)
> > > > +{
> > > > +	struct vfio_device *device = df->device;
> > > > +	struct vfio_device_bind_iommufd bind;
> > > > +	struct iommufd_ctx *iommufd = NULL;
> > > > +	unsigned long minsz;
> > > > +	int ret;
> > > > +
> > > > +	static_assert(__same_type(arg->out_devid, bind.out_devid));
> > >
> > > They're the same field in the same structure, how could they be
> > > otherwise?
> >
> > @Jason, should I remove this check?
> 
> Yes, it was from something that looked very different from this

ok, I'll remove it here and next patch. 
Jason Gunthorpe March 30, 2023, 12:59 p.m. UTC | #7
On Thu, Mar 30, 2023 at 12:52:39PM +0000, Liu, Yi L wrote:

> > > "Use a negative value for no-iommu, where supported", or better, should
> > > we define this explicitly as -1, or why not use a flag bit to specify
> > > no-iommu?  Maybe minsz is only through flags for the noiommu use case.
> > 
> > I was happy enough for this to be defined as -1. We could give it a
> > formal sounding constant too
> 
> are you suggesting having something like "#define VFIO_NOIOMMU_FD	-1"?

Yeah something like that

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 1c640016a824..2b563bac50b9 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2023 Intel Corporation.
  */
 #include <linux/vfio.h>
+#include <linux/iommufd.h>
 
 #include "vfio.h"
 
@@ -44,6 +45,158 @@  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
 	return ret;
 }
 
+static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
+{
+	spin_lock(&df->kvm_ref_lock);
+	if (df->kvm)
+		_vfio_device_get_kvm_safe(df->device, df->kvm);
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	/*
+	 * As df->access_granted writer is under dev_set->lock as well,
+	 * so this read no need to use smp_load_acquire() to pair with
+	 * smp_store_release() in the caller of vfio_device_open().
+	 */
+	if (!df->access_granted)
+		return;
+
+	mutex_lock(&device->dev_set->lock);
+	vfio_device_close(df);
+	vfio_device_put_kvm(device);
+	if (df->iommufd)
+		iommufd_ctx_put(df->iommufd);
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+}
+
+static int vfio_device_cdev_enable_noiommu(struct vfio_device *device)
+{
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	if (!device->noiommu)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
+{
+	struct fd f;
+	struct iommufd_ctx *iommufd;
+
+	f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	iommufd = iommufd_ctx_from_file(f.file);
+
+	fdput(f);
+	return iommufd;
+}
+
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    struct vfio_device_bind_iommufd __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_bind_iommufd bind;
+	struct iommufd_ctx *iommufd = NULL;
+	unsigned long minsz;
+	int ret;
+
+	static_assert(__same_type(arg->out_devid, bind.out_devid));
+
+	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
+
+	if (copy_from_user(&bind, arg, minsz))
+		return -EFAULT;
+
+	if (bind.argsz < minsz || bind.flags)
+		return -EINVAL;
+
+	if (!device->ops->bind_iommufd)
+		return -ENODEV;
+
+	/* BIND_IOMMUFD only allowed for cdev fds */
+	if (df->group)
+		return -EINVAL;
+
+	ret = vfio_device_block_group(device);
+	if (ret)
+		return ret;
+
+	mutex_lock(&device->dev_set->lock);
+	/* one device cannot be bound twice */
+	if (df->access_granted) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* iommufd < 0 means noiommu mode */
+	if (bind.iommufd < 0) {
+		ret = vfio_device_cdev_enable_noiommu(device);
+		if (ret)
+			goto out_unlock;
+	} else {
+		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
+		if (IS_ERR(iommufd)) {
+			ret = PTR_ERR(iommufd);
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * Before the device open, get the KVM pointer currently
+	 * associated with the device file (if there is) and obtain
+	 * a reference.  This reference is held until device closed.
+	 * Save the pointer in the device for use by drivers.
+	 */
+	vfio_device_get_kvm_safe(df);
+
+	df->iommufd = iommufd;
+	ret = vfio_device_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	if (df->iommufd)
+		bind.out_devid = df->devid;
+
+	ret = copy_to_user(&arg->out_devid, &bind.out_devid,
+			   sizeof(bind.out_devid)) ? -EFAULT : 0;
+	if (ret)
+		goto out_close_device;
+
+	if (bind.iommufd < 0)
+		dev_warn(device->dev, "device is bound to vfio-noiommu by user "
+			 "(%s:%d)\n", current->comm, task_pid_nr(current));
+
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap
+	 */
+	smp_store_release(&df->access_granted, true);
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+
+out_close_device:
+	vfio_device_close(df);
+out_put_kvm:
+	df->iommufd = NULL;
+	vfio_device_put_kvm(device);
+	if (iommufd)
+		iommufd_ctx_put(iommufd);
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+	return ret;
+}
+
 static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 3a8fd0e32f59..ace3d52b0928 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -281,6 +281,9 @@  static inline void vfio_device_del(struct vfio_device *device)
 
 void vfio_init_device_cdev(struct vfio_device *device);
 int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
+void vfio_device_cdev_close(struct vfio_device_file *df);
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    struct vfio_device_bind_iommufd __user *arg);
 int vfio_cdev_init(struct class *device_class);
 void vfio_cdev_cleanup(void);
 #else
@@ -304,6 +307,16 @@  static inline int vfio_device_fops_cdev_open(struct inode *inode,
 	return 0;
 }
 
+static inline void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+}
+
+static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+						  struct vfio_device_bind_iommufd __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int vfio_cdev_init(struct class *device_class)
 {
 	return 0;
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 58fc3bb768f2..375086c8803f 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -559,6 +559,8 @@  static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 
 	if (df->group)
 		vfio_device_group_close(df);
+	else
+		vfio_device_cdev_close(df);
 
 	vfio_device_put_registration(device);
 
@@ -1132,6 +1134,9 @@  static long vfio_device_fops_unl_ioctl(struct file *filep,
 	struct vfio_device *device = df->device;
 	int ret;
 
+	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
+		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
+
 	/* Paired with smp_store_release() following vfio_device_open() */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 61b801dfd40b..62b2f2497525 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -194,6 +194,43 @@  struct vfio_group_status {
 
 /* --------------- IOCTLs for DEVICE file descriptors --------------- */
 
+/*
+ * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
+ *				   struct vfio_device_bind_iommufd)
+ *
+ * Bind a vfio_device to the specified iommufd.
+ *
+ * The user should provide a device cookie when calling this ioctl. The
+ * cookie is carried only in event e.g. I/O fault reported to userspace
+ * via iommufd. The user should use devid returned by this ioctl to mark
+ * the target device in other ioctls (e.g. iommu hardware infomration query
+ * via iommufd, and etc.).
+ *
+ * User is not allowed to access the device before the binding operation
+ * is completed.
+ *
+ * Unbind is automatically conducted when device fd is closed.
+ *
+ * @argsz:	 user filled size of this data.
+ * @flags:	 reserved for future extension.
+ * @dev_cookie:	 a per device cookie provided by userspace.
+ * @iommufd:	 iommufd to bind. a negative value means noiommu.
+ * @out_devid:	 the device id generated by this bind. This field is valid
+ *		as long as the input @iommufd is valid. Otherwise, it is
+ *		meaningless.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_bind_iommufd {
+	__u32		argsz;
+	__u32		flags;
+	__aligned_u64	dev_cookie;
+	__s32		iommufd;
+	__u32		out_devid;
+};
+
+#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
 /**
  * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
  *						struct vfio_device_info)