diff mbox series

[v11,19/23] vfio: Add VFIO_DEVICE_BIND_IOMMUFD

Message ID 20230513132827.39066-20-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Add vfio_device cdev for iommufd support | expand

Commit Message

Yi Liu May 13, 2023, 1:28 p.m. UTC
This adds ioctl for userspace to bind device cdev fd to iommufd.

    VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
			      control provided by the iommufd. open_device
			      op is called after bind_iommufd op.

Tested-by: Yanting Jiang <yanting.jiang@intel.com>
Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/device_cdev.c | 130 +++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio.h        |  13 ++++
 drivers/vfio/vfio_main.c   |   5 ++
 include/linux/vfio.h       |   3 +-
 include/uapi/linux/vfio.h  |  28 ++++++++
 5 files changed, 178 insertions(+), 1 deletion(-)

Comments

Alex Williamson May 22, 2023, 10:01 p.m. UTC | #1
On Sat, 13 May 2023 06:28:23 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> This adds ioctl for userspace to bind device cdev fd to iommufd.
> 
>     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> 			      control provided by the iommufd. open_device
> 			      op is called after bind_iommufd op.
> 
> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/device_cdev.c | 130 +++++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio.h        |  13 ++++
>  drivers/vfio/vfio_main.c   |   5 ++
>  include/linux/vfio.h       |   3 +-
>  include/uapi/linux/vfio.h  |  28 ++++++++
>  5 files changed, 178 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index 1c640016a824..291cc678a18b 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -3,6 +3,7 @@
>   * Copyright (c) 2023 Intel Corporation.
>   */
>  #include <linux/vfio.h>
> +#include <linux/iommufd.h>
>  
>  #include "vfio.h"
>  
> @@ -44,6 +45,135 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>  	return ret;
>  }
>  
> +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> +{
> +	spin_lock(&df->kvm_ref_lock);
> +	if (df->kvm)
> +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> +	spin_unlock(&df->kvm_ref_lock);
> +}
> +
> +void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +	struct vfio_device *device = df->device;
> +
> +	/*
> +	 * In the time of close, there is no contention with another one
> +	 * changing this flag.  So read df->access_granted without lock
> +	 * and no smp_load_acquire() is ok.
> +	 */
> +	if (!df->access_granted)
> +		return;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	vfio_device_close(df);
> +	vfio_device_put_kvm(device);
> +	iommufd_ctx_put(df->iommufd);
> +	device->cdev_opened = false;
> +	mutex_unlock(&device->dev_set->lock);
> +	vfio_device_unblock_group(device);
> +}
> +
> +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> +{
> +	struct iommufd_ctx *iommufd;
> +	struct fd f;
> +
> +	f = fdget(fd);
> +	if (!f.file)
> +		return ERR_PTR(-EBADF);
> +
> +	iommufd = iommufd_ctx_from_file(f.file);
> +
> +	fdput(f);
> +	return iommufd;
> +}
> +
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    struct vfio_device_bind_iommufd __user *arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_bind_iommufd bind;
> +	unsigned long minsz;
> +	int ret;
> +
> +	static_assert(__same_type(arg->out_devid, df->devid));
> +
> +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> +
> +	if (copy_from_user(&bind, arg, minsz))
> +		return -EFAULT;
> +
> +	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
> +		return -EINVAL;
> +
> +	/* BIND_IOMMUFD only allowed for cdev fds */
> +	if (df->group)
> +		return -EINVAL;
> +
> +	if (vfio_device_is_noiommu(device) && !capable(CAP_SYS_RAWIO))
> +		return -EPERM;
> +
> +	ret = vfio_device_block_group(device);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/* one device cannot be bound twice */
> +	if (df->access_granted) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	df->iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> +	if (IS_ERR(df->iommufd)) {
> +		ret = PTR_ERR(df->iommufd);
> +		df->iommufd = NULL;
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Before the device open, get the KVM pointer currently
> +	 * associated with the device file (if there is) and obtain
> +	 * a reference.  This reference is held until device closed.
> +	 * Save the pointer in the device for use by drivers.
> +	 */
> +	vfio_device_get_kvm_safe(df);
> +
> +	ret = vfio_device_open(df);
> +	if (ret)
> +		goto out_put_kvm;
> +
> +	ret = copy_to_user(&arg->out_devid, &df->devid,
> +			   sizeof(df->devid)) ? -EFAULT : 0;
> +	if (ret)
> +		goto out_close_device;
> +
> +	/*
> +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> +	 * read/write/mmap
> +	 */
> +	smp_store_release(&df->access_granted, true);
> +	device->cdev_opened = true;
> +	mutex_unlock(&device->dev_set->lock);
> +
> +	if (vfio_device_is_noiommu(device))
> +		dev_warn(device->dev, "noiommu device is bound to iommufd by user "
> +			 "(%s:%d)\n", current->comm, task_pid_nr(current));

The noiommu kernel taint only happens in vfio_group_find_or_alloc(), so
how does noiommu taint the kernel when !CONFIG_VFIO_GROUP?


> +	return 0;
> +
> +out_close_device:
> +	vfio_device_close(df);
> +out_put_kvm:
> +	vfio_device_put_kvm(device);
> +	iommufd_ctx_put(df->iommufd);
> +	df->iommufd = NULL;
> +out_unlock:
> +	mutex_unlock(&device->dev_set->lock);
> +	vfio_device_unblock_group(device);
> +	return ret;
> +}
> +
>  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
>  {
>  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 6861f8ebb64d..8b359a7794be 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -279,6 +279,9 @@ static inline void vfio_device_del(struct vfio_device *device)
>  
>  void vfio_init_device_cdev(struct vfio_device *device);
>  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
> +void vfio_device_cdev_close(struct vfio_device_file *df);
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    struct vfio_device_bind_iommufd __user *arg);
>  int vfio_cdev_init(struct class *device_class);
>  void vfio_cdev_cleanup(void);
>  #else
> @@ -302,6 +305,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode,
>  	return 0;
>  }
>  
> +static inline void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +}
> +
> +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +						  struct vfio_device_bind_iommufd __user *arg)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>  static inline int vfio_cdev_init(struct class *device_class)
>  {
>  	return 0;
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index c87cc7afe92c..c9fa39ac4b02 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -574,6 +574,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
>  
>  	if (df->group)
>  		vfio_device_group_close(df);
> +	else
> +		vfio_device_cdev_close(df);
>  
>  	vfio_device_put_registration(device);
>  
> @@ -1147,6 +1149,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
>  	struct vfio_device *device = df->device;
>  	int ret;
>  
> +	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
> +		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
> +
>  	/* Paired with smp_store_release() following vfio_device_open() */
>  	if (!smp_load_acquire(&df->access_granted))
>  		return -EINVAL;
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 873275419f13..cf9d082a623c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -67,6 +67,7 @@ struct vfio_device {
>  	struct iommufd_device *iommufd_device;
>  	bool iommufd_attached;
>  #endif
> +	bool cdev_opened:1;
>  };
>  
>  /**
> @@ -169,7 +170,7 @@ vfio_iommufd_physical_devid(struct vfio_device *vdev)
>  
>  static inline bool vfio_device_cdev_opened(struct vfio_device *device)
>  {
> -	return false;
> +	return device->cdev_opened;
>  }
>  
>  /**
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 24858b650562..07c917de31e9 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -194,6 +194,34 @@ struct vfio_group_status {
>  
>  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
>  
> +/*
> + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> + *				   struct vfio_device_bind_iommufd)
> + *
> + * Bind a vfio_device to the specified iommufd.
> + *
> + * User is restricted from accessing the device before the binding operation
> + * is completed.
> + *
> + * Unbind is automatically conducted when device fd is closed.
> + *
> + * @argsz:	 User filled size of this data.
> + * @flags:	 Must be 0.
> + * @iommufd:	 iommufd to bind.
> + * @out_devid:	 The device id generated by this bind. devid is a handle for
> + *		 this device/iommufd bond and can be used in IOMMUFD commands.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_bind_iommufd {
> +	__u32		argsz;
> +	__u32		flags;
> +	__s32		iommufd;
> +	__u32		out_devid;
> +};
> +
> +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
> +

Why is this preempting the first device ioctl below rather than being
added in sequential order?  I'm also not sure what's at device ioctl 18
that we started at 19.  VFIO_DEVICE_FEATURE is at 17.  Yes, they're
hard to keep track of.  Thanks,

Alex

>  /**
>   * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
>   *						struct vfio_device_info)
Yi Liu May 23, 2023, 1:41 a.m. UTC | #2
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, May 23, 2023 6:01 AM
> 
> On Sat, 13 May 2023 06:28:23 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This adds ioctl for userspace to bind device cdev fd to iommufd.
> >
> >     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> > 			      control provided by the iommufd. open_device
> > 			      op is called after bind_iommufd op.
> >
> > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/device_cdev.c | 130 +++++++++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio.h        |  13 ++++
> >  drivers/vfio/vfio_main.c   |   5 ++
> >  include/linux/vfio.h       |   3 +-
> >  include/uapi/linux/vfio.h  |  28 ++++++++
> >  5 files changed, 178 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index 1c640016a824..291cc678a18b 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -3,6 +3,7 @@
> >   * Copyright (c) 2023 Intel Corporation.
> >   */
> >  #include <linux/vfio.h>
> > +#include <linux/iommufd.h>
> >
> >  #include "vfio.h"
> >
> > @@ -44,6 +45,135 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
> >  	return ret;
> >  }
> >
> > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > +{
> > +	spin_lock(&df->kvm_ref_lock);
> > +	if (df->kvm)
> > +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> > +	spin_unlock(&df->kvm_ref_lock);
> > +}
> > +
> > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +	struct vfio_device *device = df->device;
> > +
> > +	/*
> > +	 * In the time of close, there is no contention with another one
> > +	 * changing this flag.  So read df->access_granted without lock
> > +	 * and no smp_load_acquire() is ok.
> > +	 */
> > +	if (!df->access_granted)
> > +		return;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	vfio_device_close(df);
> > +	vfio_device_put_kvm(device);
> > +	iommufd_ctx_put(df->iommufd);
> > +	device->cdev_opened = false;
> > +	mutex_unlock(&device->dev_set->lock);
> > +	vfio_device_unblock_group(device);
> > +}
> > +
> > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> > +{
> > +	struct iommufd_ctx *iommufd;
> > +	struct fd f;
> > +
> > +	f = fdget(fd);
> > +	if (!f.file)
> > +		return ERR_PTR(-EBADF);
> > +
> > +	iommufd = iommufd_ctx_from_file(f.file);
> > +
> > +	fdput(f);
> > +	return iommufd;
> > +}
> > +
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    struct vfio_device_bind_iommufd __user *arg)
> > +{
> > +	struct vfio_device *device = df->device;
> > +	struct vfio_device_bind_iommufd bind;
> > +	unsigned long minsz;
> > +	int ret;
> > +
> > +	static_assert(__same_type(arg->out_devid, df->devid));
> > +
> > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > +
> > +	if (copy_from_user(&bind, arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
> > +		return -EINVAL;
> > +
> > +	/* BIND_IOMMUFD only allowed for cdev fds */
> > +	if (df->group)
> > +		return -EINVAL;
> > +
> > +	if (vfio_device_is_noiommu(device) && !capable(CAP_SYS_RAWIO))
> > +		return -EPERM;
> > +
> > +	ret = vfio_device_block_group(device);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/* one device cannot be bound twice */
> > +	if (df->access_granted) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	df->iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > +	if (IS_ERR(df->iommufd)) {
> > +		ret = PTR_ERR(df->iommufd);
> > +		df->iommufd = NULL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/*
> > +	 * Before the device open, get the KVM pointer currently
> > +	 * associated with the device file (if there is) and obtain
> > +	 * a reference.  This reference is held until device closed.
> > +	 * Save the pointer in the device for use by drivers.
> > +	 */
> > +	vfio_device_get_kvm_safe(df);
> > +
> > +	ret = vfio_device_open(df);
> > +	if (ret)
> > +		goto out_put_kvm;
> > +
> > +	ret = copy_to_user(&arg->out_devid, &df->devid,
> > +			   sizeof(df->devid)) ? -EFAULT : 0;
> > +	if (ret)
> > +		goto out_close_device;
> > +
> > +	/*
> > +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> > +	 * read/write/mmap
> > +	 */
> > +	smp_store_release(&df->access_granted, true);
> > +	device->cdev_opened = true;
> > +	mutex_unlock(&device->dev_set->lock);
> > +
> > +	if (vfio_device_is_noiommu(device))
> > +		dev_warn(device->dev, "noiommu device is bound to iommufd by user "
> > +			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> 
> The noiommu kernel taint only happens in vfio_group_find_or_alloc(), so
> how does noiommu taint the kernel when !CONFIG_VFIO_GROUP?

Yeah, there is no taint in the cdev path. I added this warning only to be on par
with the message below in the group path.

vfio_device_open_file()
{
	dev_warn(device->dev, "vfio-noiommu device opened by user "
		   "(%s:%d)\n", current->comm, task_pid_nr(current));
}

> > +	return 0;
> > +
> > +out_close_device:
> > +	vfio_device_close(df);
> > +out_put_kvm:
> > +	vfio_device_put_kvm(device);
> > +	iommufd_ctx_put(df->iommufd);
> > +	df->iommufd = NULL;
> > +out_unlock:
> > +	mutex_unlock(&device->dev_set->lock);
> > +	vfio_device_unblock_group(device);
> > +	return ret;
> > +}
> > +
> >  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
> >  {
> >  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > index 6861f8ebb64d..8b359a7794be 100644
> > --- a/drivers/vfio/vfio.h
> > +++ b/drivers/vfio/vfio.h
> > @@ -279,6 +279,9 @@ static inline void vfio_device_del(struct vfio_device *device)
> >
> >  void vfio_init_device_cdev(struct vfio_device *device);
> >  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
> > +void vfio_device_cdev_close(struct vfio_device_file *df);
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    struct vfio_device_bind_iommufd __user *arg);
> >  int vfio_cdev_init(struct class *device_class);
> >  void vfio_cdev_cleanup(void);
> >  #else
> > @@ -302,6 +305,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode,
> >  	return 0;
> >  }
> >
> > +static inline void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +}
> > +
> > +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +						  struct vfio_device_bind_iommufd __user *arg)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> > +
> >  static inline int vfio_cdev_init(struct class *device_class)
> >  {
> >  	return 0;
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index c87cc7afe92c..c9fa39ac4b02 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -574,6 +574,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> >
> >  	if (df->group)
> >  		vfio_device_group_close(df);
> > +	else
> > +		vfio_device_cdev_close(df);
> >
> >  	vfio_device_put_registration(device);
> >
> > @@ -1147,6 +1149,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
> >  	struct vfio_device *device = df->device;
> >  	int ret;
> >
> > +	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
> > +		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
> > +
> >  	/* Paired with smp_store_release() following vfio_device_open() */
> >  	if (!smp_load_acquire(&df->access_granted))
> >  		return -EINVAL;
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index 873275419f13..cf9d082a623c 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -67,6 +67,7 @@ struct vfio_device {
> >  	struct iommufd_device *iommufd_device;
> >  	bool iommufd_attached;
> >  #endif
> > +	bool cdev_opened:1;
> >  };
> >
> >  /**
> > @@ -169,7 +170,7 @@ vfio_iommufd_physical_devid(struct vfio_device *vdev)
> >
> >  static inline bool vfio_device_cdev_opened(struct vfio_device *device)
> >  {
> > -	return false;
> > +	return device->cdev_opened;
> >  }
> >
> >  /**
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 24858b650562..07c917de31e9 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -194,6 +194,34 @@ struct vfio_group_status {
> >
> >  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
> >
> > +/*
> > + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> > + *				   struct vfio_device_bind_iommufd)
> > + *
> > + * Bind a vfio_device to the specified iommufd.
> > + *
> > + * User is restricted from accessing the device before the binding operation
> > + * is completed.
> > + *
> > + * Unbind is automatically conducted when device fd is closed.
> > + *
> > + * @argsz:	 User filled size of this data.
> > + * @flags:	 Must be 0.
> > + * @iommufd:	 iommufd to bind.
> > + * @out_devid:	 The device id generated by this bind. devid is a handle for
> > + *		 this device/iommufd bond and can be used in IOMMUFD commands.
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +struct vfio_device_bind_iommufd {
> > +	__u32		argsz;
> > +	__u32		flags;
> > +	__s32		iommufd;
> > +	__u32		out_devid;
> > +};
> > +
> > +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
> > +
> 
> Why is this preempting the first device ioctl below rather than being
> added in sequential order?  I'm also not sure what's at device ioctl 18
> that we started at 19.  VFIO_DEVICE_FEATURE is at 17.  Yes, they're
> hard to keep track of.  Thanks,

yes, 17 is the last occupied ioctl offset on device fd. Will correct
it.

Regards,
Yi Liu
Alex Williamson May 23, 2023, 3:51 p.m. UTC | #3
On Tue, 23 May 2023 01:41:36 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Tuesday, May 23, 2023 6:01 AM
> > 
> > On Sat, 13 May 2023 06:28:23 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:
> >   
> > > This adds ioctl for userspace to bind device cdev fd to iommufd.
> > >
> > >     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> > > 			      control provided by the iommufd. open_device
> > > 			      op is called after bind_iommufd op.
> > >
> > > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > > Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > > ---
> > >  drivers/vfio/device_cdev.c | 130 +++++++++++++++++++++++++++++++++++++
> > >  drivers/vfio/vfio.h        |  13 ++++
> > >  drivers/vfio/vfio_main.c   |   5 ++
> > >  include/linux/vfio.h       |   3 +-
> > >  include/uapi/linux/vfio.h  |  28 ++++++++
> > >  5 files changed, 178 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > > index 1c640016a824..291cc678a18b 100644
> > > --- a/drivers/vfio/device_cdev.c
> > > +++ b/drivers/vfio/device_cdev.c
> > > @@ -3,6 +3,7 @@
> > >   * Copyright (c) 2023 Intel Corporation.
> > >   */
> > >  #include <linux/vfio.h>
> > > +#include <linux/iommufd.h>
> > >
> > >  #include "vfio.h"
> > >
> > > @@ -44,6 +45,135 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
> > >  	return ret;
> > >  }
> > >
> > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > > +{
> > > +	spin_lock(&df->kvm_ref_lock);
> > > +	if (df->kvm)
> > > +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> > > +	spin_unlock(&df->kvm_ref_lock);
> > > +}
> > > +
> > > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > > +{
> > > +	struct vfio_device *device = df->device;
> > > +
> > > +	/*
> > > +	 * In the time of close, there is no contention with another one
> > > +	 * changing this flag.  So read df->access_granted without lock
> > > +	 * and no smp_load_acquire() is ok.
> > > +	 */
> > > +	if (!df->access_granted)
> > > +		return;
> > > +
> > > +	mutex_lock(&device->dev_set->lock);
> > > +	vfio_device_close(df);
> > > +	vfio_device_put_kvm(device);
> > > +	iommufd_ctx_put(df->iommufd);
> > > +	device->cdev_opened = false;
> > > +	mutex_unlock(&device->dev_set->lock);
> > > +	vfio_device_unblock_group(device);
> > > +}
> > > +
> > > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> > > +{
> > > +	struct iommufd_ctx *iommufd;
> > > +	struct fd f;
> > > +
> > > +	f = fdget(fd);
> > > +	if (!f.file)
> > > +		return ERR_PTR(-EBADF);
> > > +
> > > +	iommufd = iommufd_ctx_from_file(f.file);
> > > +
> > > +	fdput(f);
> > > +	return iommufd;
> > > +}
> > > +
> > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +				    struct vfio_device_bind_iommufd __user *arg)
> > > +{
> > > +	struct vfio_device *device = df->device;
> > > +	struct vfio_device_bind_iommufd bind;
> > > +	unsigned long minsz;
> > > +	int ret;
> > > +
> > > +	static_assert(__same_type(arg->out_devid, df->devid));
> > > +
> > > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > > +
> > > +	if (copy_from_user(&bind, arg, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
> > > +		return -EINVAL;
> > > +
> > > +	/* BIND_IOMMUFD only allowed for cdev fds */
> > > +	if (df->group)
> > > +		return -EINVAL;
> > > +
> > > +	if (vfio_device_is_noiommu(device) && !capable(CAP_SYS_RAWIO))
> > > +		return -EPERM;
> > > +
> > > +	ret = vfio_device_block_group(device);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	mutex_lock(&device->dev_set->lock);
> > > +	/* one device cannot be bound twice */
> > > +	if (df->access_granted) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	df->iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > > +	if (IS_ERR(df->iommufd)) {
> > > +		ret = PTR_ERR(df->iommufd);
> > > +		df->iommufd = NULL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Before the device open, get the KVM pointer currently
> > > +	 * associated with the device file (if there is) and obtain
> > > +	 * a reference.  This reference is held until device closed.
> > > +	 * Save the pointer in the device for use by drivers.
> > > +	 */
> > > +	vfio_device_get_kvm_safe(df);
> > > +
> > > +	ret = vfio_device_open(df);
> > > +	if (ret)
> > > +		goto out_put_kvm;
> > > +
> > > +	ret = copy_to_user(&arg->out_devid, &df->devid,
> > > +			   sizeof(df->devid)) ? -EFAULT : 0;
> > > +	if (ret)
> > > +		goto out_close_device;
> > > +
> > > +	/*
> > > +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> > > +	 * read/write/mmap
> > > +	 */
> > > +	smp_store_release(&df->access_granted, true);
> > > +	device->cdev_opened = true;
> > > +	mutex_unlock(&device->dev_set->lock);
> > > +
> > > +	if (vfio_device_is_noiommu(device))
> > > +		dev_warn(device->dev, "noiommu device is bound to iommufd by user "
> > > +			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> > 
> > The noiommu kernel taint only happens in vfio_group_find_or_alloc(), so
> > how does noiommu taint the kernel when !CONFIG_VFIO_GROUP?  
> 
> > Yeah, there is no taint in the cdev path. I added this warning only to be on par
> > with the message below in the group path.
> 
> vfio_device_open_file()
> {
> 	dev_warn(device->dev, "vfio-noiommu device opened by user "
> 		   "(%s:%d)\n", current->comm, task_pid_nr(current));
> }

There needs to be a taint when VFIO_GROUP is disabled.  Thanks,

Alex
 
> > > +	return 0;
> > > +
> > > +out_close_device:
> > > +	vfio_device_close(df);
> > > +out_put_kvm:
> > > +	vfio_device_put_kvm(device);
> > > +	iommufd_ctx_put(df->iommufd);
> > > +	df->iommufd = NULL;
> > > +out_unlock:
> > > +	mutex_unlock(&device->dev_set->lock);
> > > +	vfio_device_unblock_group(device);
> > > +	return ret;
> > > +}
> > > +
> > >  static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
> > >  {
> > >  	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> > > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> > > index 6861f8ebb64d..8b359a7794be 100644
> > > --- a/drivers/vfio/vfio.h
> > > +++ b/drivers/vfio/vfio.h
> > > @@ -279,6 +279,9 @@ static inline void vfio_device_del(struct vfio_device *device)
> > >
> > >  void vfio_init_device_cdev(struct vfio_device *device);
> > >  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
> > > +void vfio_device_cdev_close(struct vfio_device_file *df);
> > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +				    struct vfio_device_bind_iommufd __user *arg);
> > >  int vfio_cdev_init(struct class *device_class);
> > >  void vfio_cdev_cleanup(void);
> > >  #else
> > > @@ -302,6 +305,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode,
> > >  	return 0;
> > >  }
> > >
> > > +static inline void vfio_device_cdev_close(struct vfio_device_file *df)
> > > +{
> > > +}
> > > +
> > > +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +						  struct vfio_device_bind_iommufd __user *arg)
> > > +{
> > > +	return -EOPNOTSUPP;
> > > +}
> > > +
> > >  static inline int vfio_cdev_init(struct class *device_class)
> > >  {
> > >  	return 0;
> > > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > > index c87cc7afe92c..c9fa39ac4b02 100644
> > > --- a/drivers/vfio/vfio_main.c
> > > +++ b/drivers/vfio/vfio_main.c
> > > @@ -574,6 +574,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> > >
> > >  	if (df->group)
> > >  		vfio_device_group_close(df);
> > > +	else
> > > +		vfio_device_cdev_close(df);
> > >
> > >  	vfio_device_put_registration(device);
> > >
> > > @@ -1147,6 +1149,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
> > >  	struct vfio_device *device = df->device;
> > >  	int ret;
> > >
> > > +	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
> > > +		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
> > > +
> > >  	/* Paired with smp_store_release() following vfio_device_open() */
> > >  	if (!smp_load_acquire(&df->access_granted))
> > >  		return -EINVAL;
> > > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > > index 873275419f13..cf9d082a623c 100644
> > > --- a/include/linux/vfio.h
> > > +++ b/include/linux/vfio.h
> > > @@ -67,6 +67,7 @@ struct vfio_device {
> > >  	struct iommufd_device *iommufd_device;
> > >  	bool iommufd_attached;
> > >  #endif
> > > +	bool cdev_opened:1;
> > >  };
> > >
> > >  /**
> > > @@ -169,7 +170,7 @@ vfio_iommufd_physical_devid(struct vfio_device *vdev)
> > >
> > >  static inline bool vfio_device_cdev_opened(struct vfio_device *device)
> > >  {
> > > -	return false;
> > > +	return device->cdev_opened;
> > >  }
> > >
> > >  /**
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index 24858b650562..07c917de31e9 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -194,6 +194,34 @@ struct vfio_group_status {
> > >
> > >  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
> > >
> > > +/*
> > > + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> > > + *				   struct vfio_device_bind_iommufd)
> > > + *
> > > + * Bind a vfio_device to the specified iommufd.
> > > + *
> > > + * User is restricted from accessing the device before the binding operation
> > > + * is completed.
> > > + *
> > > + * Unbind is automatically conducted when device fd is closed.
> > > + *
> > > + * @argsz:	 User filled size of this data.
> > > + * @flags:	 Must be 0.
> > > + * @iommufd:	 iommufd to bind.
> > > + * @out_devid:	 The device id generated by this bind. devid is a handle for
> > > + *		 this device/iommufd bond and can be used in IOMMUFD commands.
> > > + *
> > > + * Return: 0 on success, -errno on failure.
> > > + */
> > > +struct vfio_device_bind_iommufd {
> > > +	__u32		argsz;
> > > +	__u32		flags;
> > > +	__s32		iommufd;
> > > +	__u32		out_devid;
> > > +};
> > > +
> > > +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
> > > +  
> > 
> > Why is this preempting the first device ioctl below rather than being
> > added in sequential order?  I'm also not sure what's at device ioctl 18
> > that we started at 19.  VFIO_DEVICE_FEATURE is at 17.  Yes, they're
> > hard to keep track of.  Thanks,  
> 
> yes, 17 is the last occupied ioctl offset on device fd. Will correct
> it.
> 
> Regards,
> Yi Liu
>
Yi Liu May 24, 2023, 2:20 a.m. UTC | #4
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, May 23, 2023 11:51 PM
> 
> On Tue, 23 May 2023 01:41:36 +0000
> "Liu, Yi L" <yi.l.liu@intel.com> wrote:
> 
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Tuesday, May 23, 2023 6:01 AM
> > >
> > > On Sat, 13 May 2023 06:28:23 -0700
> > > Yi Liu <yi.l.liu@intel.com> wrote:
> > >
> > > > This adds ioctl for userspace to bind device cdev fd to iommufd.
> > > >
> > > >     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> > > > 			      control provided by the iommufd. open_device
> > > > 			      op is called after bind_iommufd op.
> > > >
> > > > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > > > Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > > > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > > > ---
> > > >  drivers/vfio/device_cdev.c | 130 +++++++++++++++++++++++++++++++++++++
> > > >  drivers/vfio/vfio.h        |  13 ++++
> > > >  drivers/vfio/vfio_main.c   |   5 ++
> > > >  include/linux/vfio.h       |   3 +-
> > > >  include/uapi/linux/vfio.h  |  28 ++++++++
> > > >  5 files changed, 178 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > > > index 1c640016a824..291cc678a18b 100644
> > > > --- a/drivers/vfio/device_cdev.c
> > > > +++ b/drivers/vfio/device_cdev.c
> > > > @@ -3,6 +3,7 @@
> > > >   * Copyright (c) 2023 Intel Corporation.
> > > >   */
> > > >  #include <linux/vfio.h>
> > > > +#include <linux/iommufd.h>
> > > >
> > > >  #include "vfio.h"
> > > >
> > > > @@ -44,6 +45,135 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct
> > > file *filep)
> > > >  	return ret;
> > > >  }
> > > >
> > > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > > > +{
> > > > +	spin_lock(&df->kvm_ref_lock);
> > > > +	if (df->kvm)
> > > > +		_vfio_device_get_kvm_safe(df->device, df->kvm);
> > > > +	spin_unlock(&df->kvm_ref_lock);
> > > > +}
> > > > +
> > > > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > > > +{
> > > > +	struct vfio_device *device = df->device;
> > > > +
> > > > +	/*
> > > > +	 * In the time of close, there is no contention with another one
> > > > +	 * changing this flag.  So read df->access_granted without lock
> > > > +	 * and no smp_load_acquire() is ok.
> > > > +	 */
> > > > +	if (!df->access_granted)
> > > > +		return;
> > > > +
> > > > +	mutex_lock(&device->dev_set->lock);
> > > > +	vfio_device_close(df);
> > > > +	vfio_device_put_kvm(device);
> > > > +	iommufd_ctx_put(df->iommufd);
> > > > +	device->cdev_opened = false;
> > > > +	mutex_unlock(&device->dev_set->lock);
> > > > +	vfio_device_unblock_group(device);
> > > > +}
> > > > +
> > > > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> > > > +{
> > > > +	struct iommufd_ctx *iommufd;
> > > > +	struct fd f;
> > > > +
> > > > +	f = fdget(fd);
> > > > +	if (!f.file)
> > > > +		return ERR_PTR(-EBADF);
> > > > +
> > > > +	iommufd = iommufd_ctx_from_file(f.file);
> > > > +
> > > > +	fdput(f);
> > > > +	return iommufd;
> > > > +}
> > > > +
> > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > > +				    struct vfio_device_bind_iommufd __user *arg)
> > > > +{
> > > > +	struct vfio_device *device = df->device;
> > > > +	struct vfio_device_bind_iommufd bind;
> > > > +	unsigned long minsz;
> > > > +	int ret;
> > > > +
> > > > +	static_assert(__same_type(arg->out_devid, df->devid));
> > > > +
> > > > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > > > +
> > > > +	if (copy_from_user(&bind, arg, minsz))
> > > > +		return -EFAULT;
> > > > +
> > > > +	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
> > > > +		return -EINVAL;
> > > > +
> > > > +	/* BIND_IOMMUFD only allowed for cdev fds */
> > > > +	if (df->group)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (vfio_device_is_noiommu(device) && !capable(CAP_SYS_RAWIO))
> > > > +		return -EPERM;
> > > > +
> > > > +	ret = vfio_device_block_group(device);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	mutex_lock(&device->dev_set->lock);
> > > > +	/* one device cannot be bound twice */
> > > > +	if (df->access_granted) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	df->iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > > > +	if (IS_ERR(df->iommufd)) {
> > > > +		ret = PTR_ERR(df->iommufd);
> > > > +		df->iommufd = NULL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Before the device open, get the KVM pointer currently
> > > > +	 * associated with the device file (if there is) and obtain
> > > > +	 * a reference.  This reference is held until device closed.
> > > > +	 * Save the pointer in the device for use by drivers.
> > > > +	 */
> > > > +	vfio_device_get_kvm_safe(df);
> > > > +
> > > > +	ret = vfio_device_open(df);
> > > > +	if (ret)
> > > > +		goto out_put_kvm;
> > > > +
> > > > +	ret = copy_to_user(&arg->out_devid, &df->devid,
> > > > +			   sizeof(df->devid)) ? -EFAULT : 0;
> > > > +	if (ret)
> > > > +		goto out_close_device;
> > > > +
> > > > +	/*
> > > > +	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
> > > > +	 * read/write/mmap
> > > > +	 */
> > > > +	smp_store_release(&df->access_granted, true);
> > > > +	device->cdev_opened = true;
> > > > +	mutex_unlock(&device->dev_set->lock);
> > > > +
> > > > +	if (vfio_device_is_noiommu(device))
> > > > +		dev_warn(device->dev, "noiommu device is bound to iommufd by user
> > > "
> > > > +			 "(%s:%d)\n", current->comm, task_pid_nr(current));
> > >
> > > The noiommu kernel taint only happens in vfio_group_find_or_alloc(), so
> > > how does noiommu taint the kernel when !CONFIG_VFIO_GROUP?
> >
> > Yeah, in the cdev path, no taint. I add this just in order to par with the below
> > message in the group path.
> >
> > vfio_device_open_file()
> > {
> > 	dev_warn(device->dev, "vfio-noiommu device opened by user "
> > 		   "(%s:%d)\n", current->comm, task_pid_nr(current));
> > }
> 
> There needs to be a taint when VFIO_GROUP is disabled.  Thanks,
I see. I misunderstood you. You are asking for a taint. 
Tian, Kevin May 24, 2023, 2:39 a.m. UTC | #5
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, May 24, 2023 10:21 AM
> 
> > >
> > > vfio_device_open_file()
> > > {
> > > 	dev_warn(device->dev, "vfio-noiommu device opened by user "
> > > 		   "(%s:%d)\n", current->comm, task_pid_nr(current));
> > > }
> >
> > There needs to be a taint when VFIO_GROUP is disabled.  Thanks,
> I see. I misunderstood you. You are asking for a taint. 
Yi Liu May 24, 2023, 2:40 a.m. UTC | #6
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, May 24, 2023 10:39 AM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Wednesday, May 24, 2023 10:21 AM
> >
> > > >
> > > > vfio_device_open_file()
> > > > {
> > > > 	dev_warn(device->dev, "vfio-noiommu device opened by user "
> > > > 		   "(%s:%d)\n", current->comm, task_pid_nr(current));
> > > > }
> > >
> > > There needs to be a taint when VFIO_GROUP is disabled.  Thanks,
> > I see. I misunderstood you. You are asking for a taint. 
Yi Liu May 24, 2023, 8:31 a.m. UTC | #7
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, May 24, 2023 10:41 AM
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Wednesday, May 24, 2023 10:39 AM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Wednesday, May 24, 2023 10:21 AM
> > >
> > > > >
> > > > > vfio_device_open_file()
> > > > > {
> > > > > 	dev_warn(device->dev, "vfio-noiommu device opened by user "
> > > > > 		   "(%s:%d)\n", current->comm, task_pid_nr(current));
> > > > > }
> > > >
> > > > There needs to be a taint when VFIO_GROUP is disabled.  Thanks,
> > > I see. I misunderstood you. You are asking for a taint. 
diff mbox series

Patch

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 1c640016a824..291cc678a18b 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2023 Intel Corporation.
  */
 #include <linux/vfio.h>
+#include <linux/iommufd.h>
 
 #include "vfio.h"
 
@@ -44,6 +45,135 @@  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
 	return ret;
 }
 
+static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
+{
+	spin_lock(&df->kvm_ref_lock);
+	if (df->kvm)
+		_vfio_device_get_kvm_safe(df->device, df->kvm);
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	/*
+	 * At close time nothing else can be changing this flag, so it
+	 * is fine to read df->access_granted without holding the lock
+	 * and without smp_load_acquire().
+	 */
+	if (!df->access_granted)
+		return;
+
+	mutex_lock(&device->dev_set->lock);
+	vfio_device_close(df);
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	device->cdev_opened = false;
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+}
+
+static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
+{
+	struct iommufd_ctx *iommufd;
+	struct fd f;
+
+	f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	iommufd = iommufd_ctx_from_file(f.file);
+
+	fdput(f);
+	return iommufd;
+}
+
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    struct vfio_device_bind_iommufd __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_bind_iommufd bind;
+	unsigned long minsz;
+	int ret;
+
+	static_assert(__same_type(arg->out_devid, df->devid));
+
+	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
+
+	if (copy_from_user(&bind, arg, minsz))
+		return -EFAULT;
+
+	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
+		return -EINVAL;
+
+	/* BIND_IOMMUFD only allowed for cdev fds */
+	if (df->group)
+		return -EINVAL;
+
+	if (vfio_device_is_noiommu(device) && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	ret = vfio_device_block_group(device);
+	if (ret)
+		return ret;
+
+	mutex_lock(&device->dev_set->lock);
+	/* one device cannot be bound twice */
+	if (df->access_granted) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	df->iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
+	if (IS_ERR(df->iommufd)) {
+		ret = PTR_ERR(df->iommufd);
+		df->iommufd = NULL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Before opening the device, get the KVM pointer currently
+	 * associated with the device file (if there is one) and take
+	 * a reference.  The reference is held until the device is closed.
+	 * Save the pointer in the device for use by drivers.
+	 */
+	vfio_device_get_kvm_safe(df);
+
+	ret = vfio_device_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	ret = copy_to_user(&arg->out_devid, &df->devid,
+			   sizeof(df->devid)) ? -EFAULT : 0;
+	if (ret)
+		goto out_close_device;
+
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap
+	 */
+	smp_store_release(&df->access_granted, true);
+	device->cdev_opened = true;
+	mutex_unlock(&device->dev_set->lock);
+
+	if (vfio_device_is_noiommu(device))
+		dev_warn(device->dev, "noiommu device is bound to iommufd by user "
+			 "(%s:%d)\n", current->comm, task_pid_nr(current));
+	return 0;
+
+out_close_device:
+	vfio_device_close(df);
+out_put_kvm:
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	df->iommufd = NULL;
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+	return ret;
+}
+
 static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 6861f8ebb64d..8b359a7794be 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -279,6 +279,9 @@  static inline void vfio_device_del(struct vfio_device *device)
 
 void vfio_init_device_cdev(struct vfio_device *device);
 int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
+void vfio_device_cdev_close(struct vfio_device_file *df);
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    struct vfio_device_bind_iommufd __user *arg);
 int vfio_cdev_init(struct class *device_class);
 void vfio_cdev_cleanup(void);
 #else
@@ -302,6 +305,16 @@  static inline int vfio_device_fops_cdev_open(struct inode *inode,
 	return 0;
 }
 
+static inline void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+}
+
+static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+						  struct vfio_device_bind_iommufd __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int vfio_cdev_init(struct class *device_class)
 {
 	return 0;
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index c87cc7afe92c..c9fa39ac4b02 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -574,6 +574,8 @@  static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 
 	if (df->group)
 		vfio_device_group_close(df);
+	else
+		vfio_device_cdev_close(df);
 
 	vfio_device_put_registration(device);
 
@@ -1147,6 +1149,9 @@  static long vfio_device_fops_unl_ioctl(struct file *filep,
 	struct vfio_device *device = df->device;
 	int ret;
 
+	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
+		return vfio_device_ioctl_bind_iommufd(df, (void __user *)arg);
+
 	/* Paired with smp_store_release() following vfio_device_open() */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 873275419f13..cf9d082a623c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -67,6 +67,7 @@  struct vfio_device {
 	struct iommufd_device *iommufd_device;
 	bool iommufd_attached;
 #endif
+	bool cdev_opened:1;
 };
 
 /**
@@ -169,7 +170,7 @@  vfio_iommufd_physical_devid(struct vfio_device *vdev)
 
 static inline bool vfio_device_cdev_opened(struct vfio_device *device)
 {
-	return false;
+	return device->cdev_opened;
 }
 
 /**
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 24858b650562..07c917de31e9 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -194,6 +194,34 @@  struct vfio_group_status {
 
 /* --------------- IOCTLs for DEVICE file descriptors --------------- */
 
+/*
+ * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
+ *				   struct vfio_device_bind_iommufd)
+ *
+ * Bind a vfio_device to the specified iommufd.
+ *
+ * User is restricted from accessing the device before the binding operation
+ * is completed.
+ *
+ * Unbind is performed automatically when the device fd is closed.
+ *
+ * @argsz:	 User filled size of this data.
+ * @flags:	 Must be 0.
+ * @iommufd:	 iommufd to bind.
+ * @out_devid:	 The device id generated by this bind. devid is a handle for
+ *		 this device/iommufd bond and can be used in IOMMUFD commands.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_bind_iommufd {
+	__u32		argsz;
+	__u32		flags;
+	__s32		iommufd;
+	__u32		out_devid;
+};
+
+#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
 /**
  * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
  *						struct vfio_device_info)