diff mbox series

[11/13] vfio: Add cdev for vfio_device

Message ID 20230117134942.101112-12-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Add vfio_device cdev for iommufd support | expand

Commit Message

Liu, Yi L Jan. 17, 2023, 1:49 p.m. UTC
This allows user to directly open a vfio device w/o using the legacy
container/group interface, as a prerequisite for supporting new iommu
features like nested translation.

The device fd opened in this manner doesn't have the capability to access
the device as the fops open() doesn't open the device until the successful
BIND_IOMMUFD which be added in next patch.

With this patch, devices registered to vfio core have both group and device
interface created.

- group interface : /dev/vfio/$groupID
- device interface: /dev/vfio/devices/vfioX  (X is the minor number and
					      is unique across devices)

Given a vfio device the user can identify the matching vfioX by checking
the sysfs path of the device. Take PCI device (0000:6a:01.0) for example,
/sys/bus/pci/devices/0000\:6a\:01.0/vfio-dev/vfio0/dev contains the
major:minor of the matching vfioX.

Userspace then opens the /dev/vfio/devices/vfioX and checks with fstat
that the major:minor matches.

The vfio_device cdev logic in this patch:
*) __vfio_register_dev() path ends up doing cdev_device_add() for each
   vfio_device;
*) vfio_unregister_group_dev() path does cdev_device_del();

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/vfio/vfio_main.c | 103 ++++++++++++++++++++++++++++++++++++---
 include/linux/vfio.h     |   7 ++-
 2 files changed, 102 insertions(+), 8 deletions(-)

Comments

Tian, Kevin Jan. 20, 2023, 7:26 a.m. UTC | #1
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Tuesday, January 17, 2023 9:50 PM
> 
> @@ -156,7 +159,11 @@ static void vfio_device_release(struct device *dev)
>  			container_of(dev, struct vfio_device, device);
> 
>  	vfio_release_device_set(device);
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	ida_free(&vfio.device_ida, MINOR(device->device.devt));
> +#else
>  	ida_free(&vfio.device_ida, device->index);
> +#endif

There are many #if in this patch, leading to bad readability.

for this what about letting device->index always storing the minor
value? then here it could just be:

	ida_free(&vfio.device_ida, device->index);

> @@ -232,17 +240,25 @@ static int vfio_init_device(struct vfio_device
> *device, struct device *dev,
>  	device->device.release = vfio_device_release;
>  	device->device.class = vfio.device_class;
>  	device->device.parent = device->dev;
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	device->device.devt = MKDEV(MAJOR(vfio.device_devt), minor);
> +	cdev_init(&device->cdev, &vfio_device_fops);
> +	device->cdev.owner = THIS_MODULE;
> +#else
> +	device->index = minor;
> +#endif

Probably we can have a vfio_init_device_cdev() in iommufd.c and let
it be empty if !CONFIG_IOMMUFD. Then here could be:

	device->index = minor;
	vfio_init_device_cdev(device, vfio.device_devt, minor);

> @@ -257,7 +273,12 @@ static int __vfio_register_dev(struct vfio_device
> *device,
>  	if (!device->dev_set)
>  		vfio_assign_device_set(device, device);
> 
> -	ret = dev_set_name(&device->device, "vfio%d", device->index);
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	minor = MINOR(device->device.devt);
> +#else
> +	minor = device->index;
> +#endif

then just "minor = device->index"

> 
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	ret = cdev_device_add(&device->cdev, &device->device);
> +#else
>  	ret = device_add(&device->device);
> +#endif

also add a wrapper vfio_register_device_cdev() which does
cdev_device_add() if CONFIG_IOMMUFD and device_add() otherwise.


> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	/*
> +	 * Balances device_add in register path. Putting it as the first
> +	 * operation in unregister to prevent registration refcount from
> +	 * incrementing per cdev open.
> +	 */
> +	cdev_device_del(&device->cdev, &device->device);
> +#else
> +	device_del(&device->device);
> +#endif

ditto

> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +static int vfio_device_fops_open(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_device *device = container_of(inode->i_cdev,
> +						  struct vfio_device, cdev);
> +	struct vfio_device_file *df;
> +	int ret;
> +
> +	if (!vfio_device_try_get_registration(device))
> +		return -ENODEV;
> +
> +	/*
> +	 * device access is blocked until .open_device() is called
> +	 * in BIND_IOMMUFD.
> +	 */
> +	df = vfio_allocate_device_file(device, true);
> +	if (IS_ERR(df)) {
> +		ret = PTR_ERR(df);
> +		goto err_put_registration;
> +	}
> +
> +	filep->private_data = df;
> +
> +	return 0;
> +
> +err_put_registration:
> +	vfio_device_put_registration(device);
> +	return ret;
> +}
> +#endif

move to iommufd.c

> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
> +{
> +	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> +}
> +#endif

ditto

> @@ -1543,9 +1617,21 @@ static int __init vfio_init(void)
>  		goto err_dev_class;
>  	}
> 
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	vfio.device_class->devnode = vfio_device_devnode;
> +	ret = alloc_chrdev_region(&vfio.device_devt, 0,
> +				  MINORMASK + 1, "vfio-dev");
> +	if (ret)
> +		goto err_alloc_dev_chrdev;
> +#endif

vfio_cdev_init()

>  static void __exit vfio_cleanup(void)
>  {
>  	ida_destroy(&vfio.device_ida);
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	unregister_chrdev_region(vfio.device_devt, MINORMASK + 1);
> +#endif

vfio_cdev_cleanup()
Jason Gunthorpe Jan. 24, 2023, 8:44 p.m. UTC | #2
On Tue, Jan 17, 2023 at 05:49:40AM -0800, Yi Liu wrote:
> This allows user to directly open a vfio device w/o using the legacy
> container/group interface, as a prerequisite for supporting new iommu
> features like nested translation.
> 
> The device fd opened in this manner doesn't have the capability to access
> the device as the fops open() doesn't open the device until the successful
> BIND_IOMMUFD which be added in next patch.
> 
> With this patch, devices registered to vfio core have both group and device
> interface created.
> 
> - group interface : /dev/vfio/$groupID
> - device interface: /dev/vfio/devices/vfioX  (X is the minor number and
> 					      is unique across devices)
> 
> Given a vfio device the user can identify the matching vfioX by checking
> the sysfs path of the device. Take PCI device (0000:6a:01.0) for example,
> /sys/bus/pci/devices/0000\:6a\:01.0/vfio-dev/vfio0/dev contains the
> major:minor of the matching vfioX.
> 
> Userspace then opens the /dev/vfio/devices/vfioX and checks with fstat
> that the major:minor matches.
> 
> The vfio_device cdev logic in this patch:
> *) __vfio_register_dev() path ends up doing cdev_device_add() for each
>    vfio_device;
> *) vfio_unregister_group_dev() path does cdev_device_del();
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  drivers/vfio/vfio_main.c | 103 ++++++++++++++++++++++++++++++++++++---
>  include/linux/vfio.h     |   7 ++-
>  2 files changed, 102 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 78725c28b933..6068ffb7c6b7 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -43,6 +43,9 @@
>  static struct vfio {
>  	struct class			*device_class;
>  	struct ida			device_ida;
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	dev_t                           device_devt;
> +#endif

It is a bit strange to see this called CONFIG_IOMMUFD, maybe we should
have a CONFIG_VFIO_DEVICE_FILE that depends on IOMMUFD?

Please try to use a plain 'if (IS_ENABLED())' for more of these

It probably isn't worth saving a few bytes in memory to complicate all
the code, so maybe just always include things like this.

> @@ -156,7 +159,11 @@ static void vfio_device_release(struct device *dev)
>  			container_of(dev, struct vfio_device, device);
>  
>  	vfio_release_device_set(device);
> +#if IS_ENABLED(CONFIG_IOMMUFD)
> +	ida_free(&vfio.device_ida, MINOR(device->device.devt));
> +#else
>  	ida_free(&vfio.device_ida, device->index);
> +#endif

A vfio_device_get_index() would help this

Jason
Liu, Yi L Jan. 31, 2023, 6:17 a.m. UTC | #3
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Friday, January 20, 2023 3:27 PM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Tuesday, January 17, 2023 9:50 PM
> >
> > @@ -156,7 +159,11 @@ static void vfio_device_release(struct device *dev)
> >  			container_of(dev, struct vfio_device, device);
> >
> >  	vfio_release_device_set(device);
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	ida_free(&vfio.device_ida, MINOR(device->device.devt));
> > +#else
> >  	ida_free(&vfio.device_ida, device->index);
> > +#endif
> 
> There are many #if in this patch, leading to bad readability.
> 
> for this what about letting device->index always storing the minor
> value? then here it could just be:
> 
> 	ida_free(&vfio.device_ida, device->index);

Yes.

> > @@ -232,17 +240,25 @@ static int vfio_init_device(struct vfio_device
> > *device, struct device *dev,
> >  	device->device.release = vfio_device_release;
> >  	device->device.class = vfio.device_class;
> >  	device->device.parent = device->dev;
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	device->device.devt = MKDEV(MAJOR(vfio.device_devt), minor);
> > +	cdev_init(&device->cdev, &vfio_device_fops);
> > +	device->cdev.owner = THIS_MODULE;
> > +#else
> > +	device->index = minor;
> > +#endif
> 
> Probably we can have a vfio_init_device_cdev() in iommufd.c and let
> it be empty if !CONFIG_IOMMUFD. Then here could be:

Yes. Btw. Will adding another device_cdev.c better than reusing iommufd.c?

> 
> 	device->index = minor;
> 	vfio_init_device_cdev(device, vfio.device_devt, minor);
>
> > @@ -257,7 +273,12 @@ static int __vfio_register_dev(struct vfio_device
> > *device,
> >  	if (!device->dev_set)
> >  		vfio_assign_device_set(device, device);
> >
> > -	ret = dev_set_name(&device->device, "vfio%d", device->index);
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	minor = MINOR(device->device.devt);
> > +#else
> > +	minor = device->index;
> > +#endif
> 
> then just "minor = device->index"

Yes.

> >
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	ret = cdev_device_add(&device->cdev, &device->device);
> > +#else
> >  	ret = device_add(&device->device);
> > +#endif
> 
> also add a wrapper vfio_register_device_cdev() which does
> cdev_device_add() if CONFIG_IOMMUFD and device_add() otherwise.

Got it.
> 
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	/*
> > +	 * Balances device_add in register path. Putting it as the first
> > +	 * operation in unregister to prevent registration refcount from
> > +	 * incrementing per cdev open.
> > +	 */
> > +	cdev_device_del(&device->cdev, &device->device);
> > +#else
> > +	device_del(&device->device);
> > +#endif
> 
> ditto
> 
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +static int vfio_device_fops_open(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_device *device = container_of(inode->i_cdev,
> > +						  struct vfio_device, cdev);
> > +	struct vfio_device_file *df;
> > +	int ret;
> > +
> > +	if (!vfio_device_try_get_registration(device))
> > +		return -ENODEV;
> > +
> > +	/*
> > +	 * device access is blocked until .open_device() is called
> > +	 * in BIND_IOMMUFD.
> > +	 */
> > +	df = vfio_allocate_device_file(device, true);
> > +	if (IS_ERR(df)) {
> > +		ret = PTR_ERR(df);
> > +		goto err_put_registration;
> > +	}
> > +
> > +	filep->private_data = df;
> > +
> > +	return 0;
> > +
> > +err_put_registration:
> > +	vfio_device_put_registration(device);
> > +	return ret;
> > +}
> > +#endif
> 
> move to iommufd.c
> 
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +static char *vfio_device_devnode(const struct device *dev, umode_t
> *mode)
> > +{
> > +	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> > +}
> > +#endif
> 
> ditto
> 
> > @@ -1543,9 +1617,21 @@ static int __init vfio_init(void)
> >  		goto err_dev_class;
> >  	}
> >
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	vfio.device_class->devnode = vfio_device_devnode;
> > +	ret = alloc_chrdev_region(&vfio.device_devt, 0,
> > +				  MINORMASK + 1, "vfio-dev");
> > +	if (ret)
> > +		goto err_alloc_dev_chrdev;
> > +#endif
> 
> vfio_cdev_init()
> 
> >  static void __exit vfio_cleanup(void)
> >  {
> >  	ida_destroy(&vfio.device_ida);
> > +#if IS_ENABLED(CONFIG_IOMMUFD)
> > +	unregister_chrdev_region(vfio.device_devt, MINORMASK + 1);
> > +#endif
> 
> vfio_cdev_cleanup()

All above comments got.

Regards,
Yi Liu
diff mbox series

Patch

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 78725c28b933..6068ffb7c6b7 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -43,6 +43,9 @@ 
 static struct vfio {
 	struct class			*device_class;
 	struct ida			device_ida;
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	dev_t                           device_devt;
+#endif
 } vfio;
 
 static DEFINE_XARRAY(vfio_device_set_xa);
@@ -156,7 +159,11 @@  static void vfio_device_release(struct device *dev)
 			container_of(dev, struct vfio_device, device);
 
 	vfio_release_device_set(device);
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	ida_free(&vfio.device_ida, MINOR(device->device.devt));
+#else
 	ida_free(&vfio.device_ida, device->index);
+#endif
 
 	if (device->ops->release)
 		device->ops->release(device);
@@ -209,15 +216,16 @@  EXPORT_SYMBOL_GPL(_vfio_alloc_device);
 static int vfio_init_device(struct vfio_device *device, struct device *dev,
 			    const struct vfio_device_ops *ops)
 {
+	unsigned int minor;
 	int ret;
 
 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
 	if (ret < 0) {
-		dev_dbg(dev, "Error to alloc index\n");
+		dev_dbg(dev, "Error to alloc minor\n");
 		return ret;
 	}
 
-	device->index = ret;
+	minor = ret;
 	init_completion(&device->comp);
 	device->dev = dev;
 	device->ops = ops;
@@ -232,17 +240,25 @@  static int vfio_init_device(struct vfio_device *device, struct device *dev,
 	device->device.release = vfio_device_release;
 	device->device.class = vfio.device_class;
 	device->device.parent = device->dev;
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	device->device.devt = MKDEV(MAJOR(vfio.device_devt), minor);
+	cdev_init(&device->cdev, &vfio_device_fops);
+	device->cdev.owner = THIS_MODULE;
+#else
+	device->index = minor;
+#endif
 	return 0;
 
 out_uninit:
 	vfio_release_device_set(device);
-	ida_free(&vfio.device_ida, device->index);
+	ida_free(&vfio.device_ida, minor);
 	return ret;
 }
 
 static int __vfio_register_dev(struct vfio_device *device,
 			       enum vfio_group_type type)
 {
+	unsigned int minor;
 	int ret;
 
 	if (WARN_ON(device->ops->bind_iommufd &&
@@ -257,7 +273,12 @@  static int __vfio_register_dev(struct vfio_device *device,
 	if (!device->dev_set)
 		vfio_assign_device_set(device, device);
 
-	ret = dev_set_name(&device->device, "vfio%d", device->index);
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	minor = MINOR(device->device.devt);
+#else
+	minor = device->index;
+#endif
+	ret = dev_set_name(&device->device, "vfio%d", minor);
 	if (ret)
 		return ret;
 
@@ -265,7 +286,11 @@  static int __vfio_register_dev(struct vfio_device *device,
 	if (ret)
 		return ret;
 
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	ret = cdev_device_add(&device->cdev, &device->device);
+#else
 	ret = device_add(&device->device);
+#endif
 	if (ret)
 		goto err_out;
 
@@ -305,6 +330,17 @@  void vfio_unregister_group_dev(struct vfio_device *device)
 	bool interrupted = false;
 	long rc;
 
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	/*
+	 * Balances device_add in register path. Putting it as the first
+	 * operation in unregister to prevent registration refcount from
+	 * incrementing per cdev open.
+	 */
+	cdev_device_del(&device->cdev, &device->device);
+#else
+	device_del(&device->device);
+#endif
+
 	vfio_device_put_registration(device);
 	rc = try_wait_for_completion(&device->comp);
 	while (rc <= 0) {
@@ -330,9 +366,6 @@  void vfio_unregister_group_dev(struct vfio_device *device)
 
 	vfio_device_group_unregister(device);
 
-	/* Balances device_add in register path */
-	device_del(&device->device);
-
 	/* Balances vfio_device_set_group in register path */
 	vfio_device_remove_group(device);
 }
@@ -502,6 +535,37 @@  static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
 /*
  * VFIO Device fd
  */
+#if IS_ENABLED(CONFIG_IOMMUFD)
+static int vfio_device_fops_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_device *device = container_of(inode->i_cdev,
+						  struct vfio_device, cdev);
+	struct vfio_device_file *df;
+	int ret;
+
+	if (!vfio_device_try_get_registration(device))
+		return -ENODEV;
+
+	/*
+	 * device access is blocked until .open_device() is called
+	 * in BIND_IOMMUFD.
+	 */
+	df = vfio_allocate_device_file(device, true);
+	if (IS_ERR(df)) {
+		ret = PTR_ERR(df);
+		goto err_put_registration;
+	}
+
+	filep->private_data = df;
+
+	return 0;
+
+err_put_registration:
+	vfio_device_put_registration(device);
+	return ret;
+}
+#endif
+
 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_device_file *df = filep->private_data;
@@ -1169,6 +1233,9 @@  static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 
 const struct file_operations vfio_device_fops = {
 	.owner		= THIS_MODULE,
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	.open		= vfio_device_fops_open,
+#endif
 	.release	= vfio_device_fops_release,
 	.read		= vfio_device_fops_read,
 	.write		= vfio_device_fops_write,
@@ -1522,6 +1589,13 @@  EXPORT_SYMBOL(vfio_dma_rw);
 /*
  * Module/class support
  */
+#if IS_ENABLED(CONFIG_IOMMUFD)
+static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
+}
+#endif
+
 static int __init vfio_init(void)
 {
 	int ret;
@@ -1543,9 +1617,21 @@  static int __init vfio_init(void)
 		goto err_dev_class;
 	}
 
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	vfio.device_class->devnode = vfio_device_devnode;
+	ret = alloc_chrdev_region(&vfio.device_devt, 0,
+				  MINORMASK + 1, "vfio-dev");
+	if (ret)
+		goto err_alloc_dev_chrdev;
+#endif
 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 	return 0;
 
+#if IS_ENABLED(CONFIG_IOMMUFD)
+err_alloc_dev_chrdev:
+	class_destroy(vfio.device_class);
+	vfio.device_class = NULL;
+#endif
 err_dev_class:
 	vfio_virqfd_exit();
 err_virqfd:
@@ -1556,6 +1642,9 @@  static int __init vfio_init(void)
 static void __exit vfio_cleanup(void)
 {
 	ida_destroy(&vfio.device_ida);
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	unregister_chrdev_region(vfio.device_devt, MINORMASK + 1);
+#endif
 	class_destroy(vfio.device_class);
 	vfio.device_class = NULL;
 	vfio_virqfd_exit();
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 300318f0d448..4a31842ebe0b 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -13,6 +13,7 @@ 
 #include <linux/mm.h>
 #include <linux/workqueue.h>
 #include <linux/poll.h>
+#include <linux/cdev.h>
 #include <uapi/linux/vfio.h>
 #include <linux/iova_bitmap.h>
 
@@ -50,8 +51,12 @@  struct vfio_device {
 	struct kvm *kvm;
 
 	/* Members below here are private, not for driver use */
-	unsigned int index;
 	struct device device;	/* device.kref covers object life circle */
+#if IS_ENABLED(CONFIG_IOMMUFD)
+	struct cdev cdev;
+#else
+	unsigned int index;
+#endif
 	refcount_t refcount;	/* user count on registered device*/
 	unsigned int open_count;
 	struct completion comp;