diff mbox series

[v5,09/10] vfio/pci: Extend VFIO_DEVICE_GET_PCI_HOT_RESET_INFO for vfio device cdev

Message ID 20230513132136.15021-10-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Enhance vfio PCI hot reset for vfio cdev device | expand

Commit Message

Yi Liu May 13, 2023, 1:21 p.m. UTC
This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx
of the cdev device to check the ownership of the other affected devices.

This returns devid for each of the affected devices. If it is bound to the
iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
not opened by the calling user, but it belongs to the same iommu_group of
a device that is bound to the iommufd_ctx of the cdev device, reports devid
value of 0; If the device is un-owned device, configured within a different
iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
report devid value of -1.

devid >=0 doesn't block hot-reset as the affected devices are considered to
be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
outside of proof-of-ownership calling conventions (ie. via legacy group
accessed devices).

This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
returned in case of calling user get device fd from other software stack
and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
the affected devices are owned, so user can know it without looping all
the returned devids.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
 include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 3 deletions(-)

Comments

Cédric Le Goater May 15, 2023, 7:29 a.m. UTC | #1
On 5/13/23 15:21, Yi Liu wrote:
> This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx
> of the cdev device to check the ownership of the other affected devices.
> 
> This returns devid for each of the affected devices. If it is bound to the
> iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
> not opened by the calling user, but it belongs to the same iommu_group of
> a device that is bound to the iommufd_ctx of the cdev device, reports devid
> value of 0; If the device is un-owned device, configured within a different
> iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
> report devid value of -1.
> 
> devid >=0 doesn't block hot-reset as the affected devices are considered to
> be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
> outside of proof-of-ownership calling conventions (ie. via legacy group
> accessed devices).
> 
> This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
> returned in case of calling user get device fd from other software stack
> and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
> the affected devices are owned, so user can know it without looping all
> the returned devids.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>   drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
>   include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
>   2 files changed, 95 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 4df2def35bdd..57586be770af 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -27,6 +27,7 @@
>   #include <linux/vgaarb.h>
>   #include <linux/nospec.h>
>   #include <linux/sched/mm.h>
> +#include <linux/iommufd.h>
>   #if IS_ENABLED(CONFIG_EEH)
>   #include <asm/eeh.h>
>   #endif
> @@ -36,6 +37,10 @@
>   #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>   #define DRIVER_DESC "core driver for VFIO based PCI devices"
>   
> +#ifdef CONFIG_IOMMUFD

To import the IOMMUFD namespace, I had to use :

#if IS_ENABLED(CONFIG_IOMMUFD)

Thanks,

C.


> +MODULE_IMPORT_NS(IOMMUFD);
> +#endif
> +
>   static bool nointxmask;
>   static bool disable_vga;
>   static bool disable_idle_d3;
> @@ -776,6 +781,9 @@ struct vfio_pci_fill_info {
>   	int max;
>   	int cur;
>   	struct vfio_pci_dependent_device *devices;
> +	struct vfio_device *vdev;
> +	bool devid:1;
> +	bool dev_owned:1;
>   };
>   
>   static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> @@ -790,7 +798,37 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
>   	if (!iommu_group)
>   		return -EPERM; /* Cannot reset non-isolated devices */
>   
> -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +	if (fill->devid) {
> +		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
> +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> +		struct vfio_device *vdev;
> +
> +		/*
> +		 * Report devid for the affected devices:
> +		 * - valid devid > 0 for the devices that are bound with
> +		 *   the iommufd of the calling device.
> +		 * - devid == 0 for the devices that have not been opened
> +		 *   but have same group with one of the devices bound to
> +		 *   the iommufd of the calling device.
> +		 * - devid == -1 for others, and clear dev_owned flag.
> +		 */
> +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> +		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
> +			int ret;
> +
> +			ret = vfio_iommufd_physical_devid(vdev);
> +			if (WARN_ON(ret < 0))
> +				return ret;
> +			fill->devices[fill->cur].devid = ret;
> +		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
> +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
> +		} else {
> +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
> +			fill->dev_owned = false;
> +		}
> +	} else {
> +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +	}
>   	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
>   	fill->devices[fill->cur].bus = pdev->bus->number;
>   	fill->devices[fill->cur].devfn = pdev->devfn;
> @@ -1229,17 +1267,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
>   		return -ENOMEM;
>   
>   	fill.devices = devices;
> +	fill.vdev = &vdev->vdev;
>   
> +	mutex_lock(&vdev->vdev.dev_set->lock);
> +	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
>   	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
>   					    &fill, slot);
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
>   
>   	/*
>   	 * If a device was removed between counting and filling, we may come up
>   	 * short of fill.max.  If a device was added, we'll have a return of
>   	 * -EAGAIN above.
>   	 */
> -	if (!ret)
> +	if (!ret) {
>   		hdr.count = fill.cur;
> +		if (fill.devid) {
> +			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
> +			if (fill.dev_owned)
> +				hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> +		}
> +	}
>   
>   reset_info_exit:
>   	if (copy_to_user(arg, &hdr, minsz))
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 0552e8dcf0cb..01203215251a 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -650,11 +650,53 @@ enum {
>    * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
>    *					      struct vfio_pci_hot_reset_info)
>    *
> + * This command is used to query the affected devices in the hot reset for
> + * a given device.
> + *
> + * This command always reports the segment, bus, and devfn information for
> + * each affected device, and selectively reports the group_id or devid per
> + * the way how the calling device is opened.
> + *
> + *	- If the calling device is opened via the traditional group/container
> + *	  API, group_id is reported.  User should check if it has owned all
> + *	  the affected devices and provides a set of group fds to prove the
> + *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
> + *
> + *	- If the calling device is opened as a cdev, devid is reported.
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
> + *	  data type.  For a given affected device, it is considered owned by
> + *	  this interface if it meets the following conditions:
> + *	  1) Has a valid devid within the iommufd_ctx of the calling device.
> + *	     Ownership cannot be determined across separate iommufd_ctx and the
> + *	     cdev calling conventions do not support a proof-of-ownership model
> + *	     as provided in the legacy group interface.  In this case a valid
> + *	     devid with value greater than zero is provided in the return
> + *	     structure.
> + *	  2) Does not have a valid devid within the iommufd_ctx of the calling
> + *	     device, but belongs to the same IOMMU group as the calling device
> + *	     or another opened device that has a valid devid within the
> + *	     iommufd_ctx of the calling device.  This provides implicit ownership
> + *	     for devices within the same DMA isolation context.  In this case
> + *	     the invalid devid value of zero is provided in the return structure.
> + *
> + *	  A devid value of -1 is provided in the return structure for devices
> + *	  where ownership is not available.  Such devices prevent the use of
> + *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
> + *	  conventions (ie. via legacy group accessed devices).
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
> + *	  affected devices are owned by the user.  This flag is available only
> + *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> + *
>    * Return: 0 on success, -errno on failure:
>    *	-enospc = insufficient buffer, -enodev = unsupported for device.
>    */
>   struct vfio_pci_dependent_device {
> -	__u32	group_id;
> +	union {
> +		__u32   group_id;
> +		__u32	devid;
> +#define VFIO_PCI_DEVID_OWNED		0
> +#define VFIO_PCI_DEVID_NOT_OWNED	-1
> +	};
>   	__u16	segment;
>   	__u8	bus;
>   	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
> @@ -663,6 +705,8 @@ struct vfio_pci_dependent_device {
>   struct vfio_pci_hot_reset_info {
>   	__u32	argsz;
>   	__u32	flags;
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
>   	__u32	count;
>   	struct vfio_pci_dependent_device	devices[];
>   };
Yi Liu May 15, 2023, 7:47 a.m. UTC | #2
> From: Cédric Le Goater <clegoate@redhat.com>
> Sent: Monday, May 15, 2023 3:30 PM
> 
> On 5/13/23 15:21, Yi Liu wrote:
> > This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx
> > of the cdev device to check the ownership of the other affected devices.
> >
> > This returns devid for each of the affected devices. If it is bound to the
> > iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
> > not opened by the calling user, but it belongs to the same iommu_group of
> > a device that is bound to the iommufd_ctx of the cdev device, reports devid
> > value of 0; If the device is un-owned device, configured within a different
> > iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
> > report devid value of -1.
> >
> > devid >=0 doesn't block hot-reset as the affected devices are considered to
> > be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
> > outside of proof-of-ownership calling conventions (ie. via legacy group
> > accessed devices).
> >
> > This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
> > returned in case of calling user get device fd from other software stack
> > and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
> > the affected devices are owned, so user can know it without looping all
> > the returned devids.
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >   drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
> >   include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
> >   2 files changed, 95 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 4df2def35bdd..57586be770af 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -27,6 +27,7 @@
> >   #include <linux/vgaarb.h>
> >   #include <linux/nospec.h>
> >   #include <linux/sched/mm.h>
> > +#include <linux/iommufd.h>
> >   #if IS_ENABLED(CONFIG_EEH)
> >   #include <asm/eeh.h>
> >   #endif
> > @@ -36,6 +37,10 @@
> >   #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> >   #define DRIVER_DESC "core driver for VFIO based PCI devices"
> >
> > +#ifdef CONFIG_IOMMUFD
> 
> To import the IOMMUFD namespace, I had to use :
> 
> #if IS_ENABLED(CONFIG_IOMMUFD)

Thanks. Yes, IOMMUFD is tristate now, so needs to test CONFIG_IOMMUFD=m.
and "#if IS_ENABLED(CONFIG_IOMMUFD)" fixes the compiling failure.

Regards,
Yi Liu
> 
> 
> > +MODULE_IMPORT_NS(IOMMUFD);
> > +#endif
> > +
> >   static bool nointxmask;
> >   static bool disable_vga;
> >   static bool disable_idle_d3;
> > @@ -776,6 +781,9 @@ struct vfio_pci_fill_info {
> >   	int max;
> >   	int cur;
> >   	struct vfio_pci_dependent_device *devices;
> > +	struct vfio_device *vdev;
> > +	bool devid:1;
> > +	bool dev_owned:1;
> >   };
> >
> >   static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > @@ -790,7 +798,37 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> >   	if (!iommu_group)
> >   		return -EPERM; /* Cannot reset non-isolated devices */
> >
> > -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > +	if (fill->devid) {
> > +		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
> > +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> > +		struct vfio_device *vdev;
> > +
> > +		/*
> > +		 * Report devid for the affected devices:
> > +		 * - valid devid > 0 for the devices that are bound with
> > +		 *   the iommufd of the calling device.
> > +		 * - devid == 0 for the devices that have not been opened
> > +		 *   but have same group with one of the devices bound to
> > +		 *   the iommufd of the calling device.
> > +		 * - devid == -1 for others, and clear dev_owned flag.
> > +		 */
> > +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> > +		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
> > +			int ret;
> > +
> > +			ret = vfio_iommufd_physical_devid(vdev);
> > +			if (WARN_ON(ret < 0))
> > +				return ret;
> > +			fill->devices[fill->cur].devid = ret;
> > +		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
> > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
> > +		} else {
> > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
> > +			fill->dev_owned = false;
> > +		}
> > +	} else {
> > +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > +	}
> >   	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
> >   	fill->devices[fill->cur].bus = pdev->bus->number;
> >   	fill->devices[fill->cur].devfn = pdev->devfn;
> > @@ -1229,17 +1267,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
> >   		return -ENOMEM;
> >
> >   	fill.devices = devices;
> > +	fill.vdev = &vdev->vdev;
> >
> > +	mutex_lock(&vdev->vdev.dev_set->lock);
> > +	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
> >   	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
> >   					    &fill, slot);
> > +	mutex_unlock(&vdev->vdev.dev_set->lock);
> >
> >   	/*
> >   	 * If a device was removed between counting and filling, we may come up
> >   	 * short of fill.max.  If a device was added, we'll have a return of
> >   	 * -EAGAIN above.
> >   	 */
> > -	if (!ret)
> > +	if (!ret) {
> >   		hdr.count = fill.cur;
> > +		if (fill.devid) {
> > +			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
> > +			if (fill.dev_owned)
> > +				hdr.flags |=
> VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> > +		}
> > +	}
> >
> >   reset_info_exit:
> >   	if (copy_to_user(arg, &hdr, minsz))
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 0552e8dcf0cb..01203215251a 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -650,11 +650,53 @@ enum {
> >    * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
> >    *					      struct vfio_pci_hot_reset_info)
> >    *
> > + * This command is used to query the affected devices in the hot reset for
> > + * a given device.
> > + *
> > + * This command always reports the segment, bus, and devfn information for
> > + * each affected device, and selectively reports the group_id or devid per
> > + * the way how the calling device is opened.
> > + *
> > + *	- If the calling device is opened via the traditional group/container
> > + *	  API, group_id is reported.  User should check if it has owned all
> > + *	  the affected devices and provides a set of group fds to prove the
> > + *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
> > + *
> > + *	- If the calling device is opened as a cdev, devid is reported.
> > + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
> > + *	  data type.  For a given affected device, it is considered owned by
> > + *	  this interface if it meets the following conditions:
> > + *	  1) Has a valid devid within the iommufd_ctx of the calling device.
> > + *	     Ownership cannot be determined across separate iommufd_ctx and the
> > + *	     cdev calling conventions do not support a proof-of-ownership model
> > + *	     as provided in the legacy group interface.  In this case a valid
> > + *	     devid with value greater than zero is provided in the return
> > + *	     structure.
> > + *	  2) Does not have a valid devid within the iommufd_ctx of the calling
> > + *	     device, but belongs to the same IOMMU group as the calling device
> > + *	     or another opened device that has a valid devid within the
> > + *	     iommufd_ctx of the calling device.  This provides implicit ownership
> > + *	     for devices within the same DMA isolation context.  In this case
> > + *	     the invalid devid value of zero is provided in the return structure.
> > + *
> > + *	  A devid value of -1 is provided in the return structure for devices
> > + *	  where ownership is not available.  Such devices prevent the use of
> > + *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
> > + *	  conventions (ie. via legacy group accessed devices).
> > + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
> > + *	  affected devices are owned by the user.  This flag is available only
> > + *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> > + *
> >    * Return: 0 on success, -errno on failure:
> >    *	-enospc = insufficient buffer, -enodev = unsupported for device.
> >    */
> >   struct vfio_pci_dependent_device {
> > -	__u32	group_id;
> > +	union {
> > +		__u32   group_id;
> > +		__u32	devid;
> > +#define VFIO_PCI_DEVID_OWNED		0
> > +#define VFIO_PCI_DEVID_NOT_OWNED	-1
> > +	};
> >   	__u16	segment;
> >   	__u8	bus;
> >   	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
> > @@ -663,6 +705,8 @@ struct vfio_pci_dependent_device {
> >   struct vfio_pci_hot_reset_info {
> >   	__u32	argsz;
> >   	__u32	flags;
> > +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
> > +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
> >   	__u32	count;
> >   	struct vfio_pci_dependent_device	devices[];
> >   };
Alex Williamson May 17, 2023, 10:01 p.m. UTC | #3
On Sat, 13 May 2023 06:21:35 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx

s/makes/allows/?

s/to//

> of the cdev device to check the ownership of the other affected devices.
> 
> This returns devid for each of the affected devices. If it is bound to the
> iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
> not opened by the calling user, but it belongs to the same iommu_group of
> a device that is bound to the iommufd_ctx of the cdev device, reports devid
> value of 0; If the device is un-owned device, configured within a different
> iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
> report devid value of -1.
> 
> devid >=0 doesn't block hot-reset as the affected devices are considered to
> be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
> outside of proof-of-ownership calling conventions (ie. via legacy group
> accessed devices).
> 
> This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
> returned in case of calling user get device fd from other software stack

"other software stack"?  I think this is trying to say something like:

  When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD
  managed device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is
  reported to indicate the values returned are IOMMUFD devids rather
  than group IDs as used when accessing vfio devices through the
  conventional vfio group interface.  Additionally the flag
  VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported in this mode if
  all of the devices affected by the hot-reset are owned by either
  virtue of being directly bound to the same iommufd context as the
  calling device, or implicitly owned via a shared IOMMU group.

> and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
> the affected devices are owned, so user can know it without looping all
> the returned devids.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
>  include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
>  2 files changed, 95 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 4df2def35bdd..57586be770af 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -27,6 +27,7 @@
>  #include <linux/vgaarb.h>
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
> +#include <linux/iommufd.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -36,6 +37,10 @@
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>  #define DRIVER_DESC "core driver for VFIO based PCI devices"
>  
> +#ifdef CONFIG_IOMMUFD
> +MODULE_IMPORT_NS(IOMMUFD);
> +#endif
> +
>  static bool nointxmask;
>  static bool disable_vga;
>  static bool disable_idle_d3;
> @@ -776,6 +781,9 @@ struct vfio_pci_fill_info {
>  	int max;
>  	int cur;
>  	struct vfio_pci_dependent_device *devices;
> +	struct vfio_device *vdev;
> +	bool devid:1;
> +	bool dev_owned:1;
>  };
>  
>  static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> @@ -790,7 +798,37 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
>  	if (!iommu_group)
>  		return -EPERM; /* Cannot reset non-isolated devices */
>  
> -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +	if (fill->devid) {
> +		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
> +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> +		struct vfio_device *vdev;
> +
> +		/*
> +		 * Report devid for the affected devices:
> +		 * - valid devid > 0 for the devices that are bound with
> +		 *   the iommufd of the calling device.
> +		 * - devid == 0 for the devices that have not been opened
> +		 *   but have same group with one of the devices bound to
> +		 *   the iommufd of the calling device.
> +		 * - devid == -1 for others, and clear dev_owned flag.
> +		 */
> +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> +		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
> +			int ret;
> +
> +			ret = vfio_iommufd_physical_devid(vdev);
> +			if (WARN_ON(ret < 0))
> +				return ret;
> +			fill->devices[fill->cur].devid = ret;

Nit, @devid seems like a better variable name here rather than @ret.

> +		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
> +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
> +		} else {
> +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
> +			fill->dev_owned = false;
> +		}

I think we're not describing the requirements for this middle test
correctly.  We're essentially only stating the iommufd_ctx_has_group()
part of the requirement, but we're also enforcing a
vfio_find_device_in_devset() requirement, which means the device is not
just unopened within a group shared by the iommufd context, but it must
also still be a device registered as a member of the devset, ie. it
must be bound to a vfio driver.

It's not a new requirement, it's imposed in the hot-reset ioctl itself,
but it's new for the info ioctl given that it's now trying to report
that the user can perform the reset for cdev callers.

This also shares too much logic with vfio_device_owned() added in the
next patch.  I think it might be cleaner to move the iommu_group_get() to
the group path below and change vfio_device_owned() to something that
can be used here and in the reset path.  For example, if we had a
function like:

static int vfio_hot_reset_devid(struct vfio_device *vdev,
                                struct iommufd_ctx *iommufd_ctx)
{
        struct iommu_group *group;
        int devid;

        if (!vdev)
                return VFIO_PCI_DEVID_NOT_OWNED;

        if (vfio_iommufd_physical_ictx(vdev) == iommufd_ctx) 
                return vfio_iommufd_physical_devid(vdev);

        group = iommu_group_get(vdev->dev);
        if (!group)
                return VFIO_PCI_DEVID_NOT_OWNED;
                        
        if (iommufd_ctx_has_group(iommufd_ctx, group))
                devid = VFIO_PCI_DEVID_OWNED;

        iommu_group_put(group);
                                
        return devid;
} 

It could be called above as:

	vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
	fill->devices[fill->cur].devid =
			vfio_hot_reset_devid(vdev, iommufd);


And from vfio_pci_dev_set_hot_reset() as:

	bool owned;

	if (iommufd_ctx) {
		int devid = vfio_hot_reset_devid(&cur_vma->vdev,
						 iommufd_ctx);

		owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
	} else
		owned = vfio_dev_in_groups(&cur_vma->vdev, groups);

Any better?

> +	} else {
> +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +	}
>  	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
>  	fill->devices[fill->cur].bus = pdev->bus->number;
>  	fill->devices[fill->cur].devfn = pdev->devfn;
> @@ -1229,17 +1267,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
>  		return -ENOMEM;
>  
>  	fill.devices = devices;
> +	fill.vdev = &vdev->vdev;
>  
> +	mutex_lock(&vdev->vdev.dev_set->lock);
> +	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
>  	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
>  					    &fill, slot);
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
>  
>  	/*
>  	 * If a device was removed between counting and filling, we may come up
>  	 * short of fill.max.  If a device was added, we'll have a return of
>  	 * -EAGAIN above.
>  	 */
> -	if (!ret)
> +	if (!ret) {
>  		hdr.count = fill.cur;
> +		if (fill.devid) {
> +			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
> +			if (fill.dev_owned)
> +				hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> +		}
> +	}

Does this clean up the flag and branching a bit?

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 737115d16a79..6a2a079e452d 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -786,8 +786,7 @@ struct vfio_pci_fill_info {
 	int cur;
 	struct vfio_pci_dependent_device *devices;
 	struct vfio_device *vdev;
-	bool devid:1;
-	bool dev_owned:1;
+	u32 flags;
 };
 
 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
@@ -802,7 +801,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 	if (!iommu_group)
 		return -EPERM; /* Cannot reset non-isolated devices */
 
-	if (fill->devid) {
+	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
 		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
 		struct vfio_device_set *dev_set = fill->vdev->dev_set;
 		struct vfio_device *vdev;
@@ -814,7 +813,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 		 * - devid == 0 for the devices that have not been opened
 		 *   but have same group with one of the devices bound to
 		 *   the iommufd of the calling device.
-		 * - devid == -1 for others, and clear dev_owned flag.
+		 * - devid == -1 for others, and clear owned flag.
 		 */
 		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
 		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
@@ -828,7 +827,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
 		} else {
 			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
-			fill->dev_owned = false;
+			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
 		}
 	} else {
 		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
@@ -1273,8 +1272,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
 	fill.devices = devices;
 	fill.vdev = &vdev->vdev;
 
+	if (vfio_device_cdev_opened(&vdev->vdev))
+		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+			     VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+
 	mutex_lock(&vdev->vdev.dev_set->lock);
-	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
 					    &fill, slot);
 	mutex_unlock(&vdev->vdev.dev_set->lock);
@@ -1286,11 +1288,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
 	 */
 	if (!ret) {
 		hdr.count = fill.cur;
-		if (fill.devid) {
-			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
-			if (fill.dev_owned)
-				hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
-		}
+		hdr.flags = fill.flags;
 	}
 
 reset_info_exit:

Thanks,
Alex

>  
>  reset_info_exit:
>  	if (copy_to_user(arg, &hdr, minsz))
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 0552e8dcf0cb..01203215251a 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -650,11 +650,53 @@ enum {
>   * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
>   *					      struct vfio_pci_hot_reset_info)
>   *
> + * This command is used to query the affected devices in the hot reset for
> + * a given device.
> + *
> + * This command always reports the segment, bus, and devfn information for
> + * each affected device, and selectively reports the group_id or devid per
> + * the way how the calling device is opened.
> + *
> + *	- If the calling device is opened via the traditional group/container
> + *	  API, group_id is reported.  User should check if it has owned all
> + *	  the affected devices and provides a set of group fds to prove the
> + *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
> + *
> + *	- If the calling device is opened as a cdev, devid is reported.
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
> + *	  data type.  For a given affected device, it is considered owned by
> + *	  this interface if it meets the following conditions:
> + *	  1) Has a valid devid within the iommufd_ctx of the calling device.
> + *	     Ownership cannot be determined across separate iommufd_ctx and the
> + *	     cdev calling conventions do not support a proof-of-ownership model
> + *	     as provided in the legacy group interface.  In this case a valid
> + *	     devid with value greater than zero is provided in the return
> + *	     structure.
> + *	  2) Does not have a valid devid within the iommufd_ctx of the calling
> + *	     device, but belongs to the same IOMMU group as the calling device
> + *	     or another opened device that has a valid devid within the
> + *	     iommufd_ctx of the calling device.  This provides implicit ownership
> + *	     for devices within the same DMA isolation context.  In this case
> + *	     the invalid devid value of zero is provided in the return structure.
> + *
> + *	  A devid value of -1 is provided in the return structure for devices
> + *	  where ownership is not available.  Such devices prevent the use of
> + *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
> + *	  conventions (ie. via legacy group accessed devices).
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
> + *	  affected devices are owned by the user.  This flag is available only
> + *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> + *
>   * Return: 0 on success, -errno on failure:
>   *	-enospc = insufficient buffer, -enodev = unsupported for device.
>   */
>  struct vfio_pci_dependent_device {
> -	__u32	group_id;
> +	union {
> +		__u32   group_id;
> +		__u32	devid;
> +#define VFIO_PCI_DEVID_OWNED		0
> +#define VFIO_PCI_DEVID_NOT_OWNED	-1
> +	};
>  	__u16	segment;
>  	__u8	bus;
>  	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
> @@ -663,6 +705,8 @@ struct vfio_pci_dependent_device {
>  struct vfio_pci_hot_reset_info {
>  	__u32	argsz;
>  	__u32	flags;
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
>  	__u32	count;
>  	struct vfio_pci_dependent_device	devices[];
>  };
Yi Liu May 18, 2023, 1:21 p.m. UTC | #4
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, May 18, 2023 6:02 AM
> 
> On Sat, 13 May 2023 06:21:35 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx
> 
> s/makes/allows/?
> 
> s/to//
> 
> > of the cdev device to check the ownership of the other affected devices.
> >
> > This returns devid for each of the affected devices. If it is bound to the
> > iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
> > not opened by the calling user, but it belongs to the same iommu_group of
> > a device that is bound to the iommufd_ctx of the cdev device, reports devid
> > value of 0; If the device is un-owned device, configured within a different
> > iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
> > report devid value of -1.
> >
> > devid >=0 doesn't block hot-reset as the affected devices are considered to
> > be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
> > outside of proof-of-ownership calling conventions (ie. via legacy group
> > accessed devices).
> >
> > This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
> > returned in case of calling user get device fd from other software stack
> 
> "other software stack"?  I think this is trying to say something like:
> 
>   When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD
>   managed device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is
>   reported to indicate the values returned are IOMMUFD devids rather
>   than group IDs as used when accessing vfio devices through the
>   conventional vfio group interface.  Additionally the flag
>   VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported in this mode if
>   all of the devices affected by the hot-reset are owned by either
>   virtue of being directly bound to the same iommufd context as the
>   calling device, or implicitly owned via a shared IOMMU group.

Yes. it is.

> > and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
> > the affected devices are owned, so user can know it without looping all
> > the returned devids.
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
> >  include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
> >  2 files changed, 95 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 4df2def35bdd..57586be770af 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/vgaarb.h>
> >  #include <linux/nospec.h>
> >  #include <linux/sched/mm.h>
> > +#include <linux/iommufd.h>
> >  #if IS_ENABLED(CONFIG_EEH)
> >  #include <asm/eeh.h>
> >  #endif
> > @@ -36,6 +37,10 @@
> >  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> >  #define DRIVER_DESC "core driver for VFIO based PCI devices"
> >
> > +#ifdef CONFIG_IOMMUFD
> > +MODULE_IMPORT_NS(IOMMUFD);
> > +#endif
> > +
> >  static bool nointxmask;
> >  static bool disable_vga;
> >  static bool disable_idle_d3;
> > @@ -776,6 +781,9 @@ struct vfio_pci_fill_info {
> >  	int max;
> >  	int cur;
> >  	struct vfio_pci_dependent_device *devices;
> > +	struct vfio_device *vdev;
> > +	bool devid:1;
> > +	bool dev_owned:1;
> >  };
> >
> >  static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > @@ -790,7 +798,37 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> >  	if (!iommu_group)
> >  		return -EPERM; /* Cannot reset non-isolated devices */
> >
> > -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > +	if (fill->devid) {
> > +		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
> > +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> > +		struct vfio_device *vdev;
> > +
> > +		/*
> > +		 * Report devid for the affected devices:
> > +		 * - valid devid > 0 for the devices that are bound with
> > +		 *   the iommufd of the calling device.
> > +		 * - devid == 0 for the devices that have not been opened
> > +		 *   but have same group with one of the devices bound to
> > +		 *   the iommufd of the calling device.
> > +		 * - devid == -1 for others, and clear dev_owned flag.
> > +		 */
> > +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> > +		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
> > +			int ret;
> > +
> > +			ret = vfio_iommufd_physical_devid(vdev);
> > +			if (WARN_ON(ret < 0))
> > +				return ret;
> > +			fill->devices[fill->cur].devid = ret;
> 
> Nit, @devid seems like a better variable name here rather than @ret.
> 
> > +		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
> > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
> > +		} else {
> > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
> > +			fill->dev_owned = false;
> > +		}
> 
> I think we're not describing the requirements for this middle test
> correctly.  We're essentially only stating the iommufd_ctx_has_group()
> part of the requirement, but we're also enforcing a
> vfio_find_device_in_devset() requirement, which means the device is not
> just unopened within a group shared by the iommufd context, but it must
> also still be a device registered as a member of the devset, ie. it
> must be bound to a vfio driver.

Yes. This is to make it be par with hot-reset path. This is needed. Is it?

> 
> It's not a new requirement, it's imposed in the hot-reset ioctl itself,
> but it's new for the info ioctl given that it's now trying to report
> that the user can perform the reset for cdev callers.
>
> This also shares too much logic with vfio_device_owned() added in the
> next patch.  I think it might be cleaner to move the iommu_group_get() to
> the group path below and change vfio_device_owned() to something that
> can be used here and in the reset path.  For example, if we had a
> function like:
> 
> static int vfio_hot_reset_devid(struct vfio_device *vdev,
>                                 struct iommufd_ctx *iommufd_ctx)
> {
>         struct iommu_group *group;
>         int devid;
> 
>         if (!vdev)
>                 return VFIO_PCI_DEVID_NOT_OWNED;
> 
>         if (vfio_iommufd_physical_ictx(vdev) == iommufd_ctx)
>                 return vfio_iommufd_physical_devid(vdev);
> 
>         group = iommu_group_get(vdev->dev);
>         if (!group)
>                 return VFIO_PCI_DEVID_NOT_OWNED;
> 
>         if (iommufd_ctx_has_group(iommufd_ctx, group))
>                 devid = VFIO_PCI_DEVID_OWNED;
> 
>         iommu_group_put(group);
> 
>         return devid;
> }

Thanks. This can be placed in vfio/iommufd.c, hence no need to
Import the IOMMUFD namespace in vfio_pci_core.c.
vfio_iommufd_physical_devid() can ebe static within vfio/iommufd.c

> It could be called above as:
> 
> 	vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> 	fill->devices[fill->cur].devid =
> 			vfio_hot_reset_devid(vdev, iommufd);
> 
> 
> And from vfio_pci_dev_set_hot_reset() as:
> 
> 	bool owned;
> 
> 	if (iommufd_ctx) {
> 		int devid = vfio_hot_reset_devid(&cur_vma->vdev,
> 						 iommufd_ctx);
> 
> 		owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
> 	} else
> 		owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
> 
> Any better?

Above looks good enough.

> 
> > +	} else {
> > +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > +	}
> >  	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
> >  	fill->devices[fill->cur].bus = pdev->bus->number;
> >  	fill->devices[fill->cur].devfn = pdev->devfn;
> > @@ -1229,17 +1267,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
> >  		return -ENOMEM;
> >
> >  	fill.devices = devices;
> > +	fill.vdev = &vdev->vdev;
> >
> > +	mutex_lock(&vdev->vdev.dev_set->lock);
> > +	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
> >  	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
> >  					    &fill, slot);
> > +	mutex_unlock(&vdev->vdev.dev_set->lock);
> >
> >  	/*
> >  	 * If a device was removed between counting and filling, we may come up
> >  	 * short of fill.max.  If a device was added, we'll have a return of
> >  	 * -EAGAIN above.
> >  	 */
> > -	if (!ret)
> > +	if (!ret) {
> >  		hdr.count = fill.cur;
> > +		if (fill.devid) {
> > +			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
> > +			if (fill.dev_owned)
> > +				hdr.flags |=
> VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> > +		}
> > +	}
> 
> Does this clean up the flag and branching a bit?

Yes. 
Yi Liu May 18, 2023, 1:31 p.m. UTC | #5
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, May 18, 2023 9:22 PM
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, May 18, 2023 6:02 AM
> >
> > On Sat, 13 May 2023 06:21:35 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:

> >
> > static int vfio_hot_reset_devid(struct vfio_device *vdev,
> >                                 struct iommufd_ctx *iommufd_ctx)
> > {
> >         struct iommu_group *group;
> >         int devid;
> >
> >         if (!vdev)
> >                 return VFIO_PCI_DEVID_NOT_OWNED;
> >
> >         if (vfio_iommufd_physical_ictx(vdev) == iommufd_ctx)
> >                 return vfio_iommufd_physical_devid(vdev);

Do we need to check the return of this helper? It returns -EINVAL
when iommufd_access and iommufd_device are both null. Though
not possible in this path. Is a WARN_ON needed or not?

Regards,
Yi Liu

> >
> >         group = iommu_group_get(vdev->dev);
> >         if (!group)
> >                 return VFIO_PCI_DEVID_NOT_OWNED;
> >
> >         if (iommufd_ctx_has_group(iommufd_ctx, group))
> >                 devid = VFIO_PCI_DEVID_OWNED;
> >
> >         iommu_group_put(group);
> >
> >         return devid;
> > }
Alex Williamson May 18, 2023, 7:53 p.m. UTC | #6
On Thu, 18 May 2023 13:31:47 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, May 18, 2023 9:22 PM
> >   
> > > From: Alex Williamson <alex.williamson@redhat.com>
> > > Sent: Thursday, May 18, 2023 6:02 AM
> > >
> > > On Sat, 13 May 2023 06:21:35 -0700
> > > Yi Liu <yi.l.liu@intel.com> wrote:  
> 
> > >
> > > static int vfio_hot_reset_devid(struct vfio_device *vdev,
> > >                                 struct iommufd_ctx *iommufd_ctx)
> > > {
> > >         struct iommu_group *group;
> > >         int devid;
> > >
> > >         if (!vdev)
> > >                 return VFIO_PCI_DEVID_NOT_OWNED;
> > >
> > >         if (vfio_iommufd_physical_ictx(vdev) == iommufd_ctx)
> > >                 return vfio_iommufd_physical_devid(vdev);  
> 
> Do we need to check the return of this helper? It returns -EINVAL
> when iommufd_access and iommufd_device are both null. Though
> not possible in this path. Is a WARN_ON needed or not?

I also came to the conclusion that it wasn't possible while reviewing
the code.  I wouldn't got to extreme measures to introduce paranoia
checks for impossible conditions.  Thanks,

Alex

> > >
> > >         group = iommu_group_get(vdev->dev);
> > >         if (!group)
> > >                 return VFIO_PCI_DEVID_NOT_OWNED;
> > >
> > >         if (iommufd_ctx_has_group(iommufd_ctx, group))
> > >                 devid = VFIO_PCI_DEVID_OWNED;
> > >
> > >         iommu_group_put(group);
> > >
> > >         return devid;
> > > }
Alex Williamson May 18, 2023, 8:02 p.m. UTC | #7
On Thu, 18 May 2023 13:21:57 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, May 18, 2023 6:02 AM
> > 
> > On Sat, 13 May 2023 06:21:35 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:
> >   
> > > This makes VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl to use the iommufd_ctx  
> > 
> > s/makes/allows/?
> > 
> > s/to//
> >   
> > > of the cdev device to check the ownership of the other affected devices.
> > >
> > > This returns devid for each of the affected devices. If it is bound to the
> > > iommufd_ctx of the cdev device, _INFO reports a valid devid > 0; If it is
> > > not opened by the calling user, but it belongs to the same iommu_group of
> > > a device that is bound to the iommufd_ctx of the cdev device, reports devid
> > > value of 0; If the device is un-owned device, configured within a different
> > > iommufd, or opened outside of the vfio device cdev API, the _INFO ioctl shall
> > > report devid value of -1.
> > >
> > > devid >=0 doesn't block hot-reset as the affected devices are considered to
> > > be owned, while devid == -1 will block the use of VFIO_DEVICE_PCI_HOT_RESET
> > > outside of proof-of-ownership calling conventions (ie. via legacy group
> > > accessed devices).
> > >
> > > This adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID to tell the user devid is
> > > returned in case of calling user get device fd from other software stack  
> > 
> > "other software stack"?  I think this is trying to say something like:
> > 
> >   When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD
> >   managed device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is
> >   reported to indicate the values returned are IOMMUFD devids rather
> >   than group IDs as used when accessing vfio devices through the
> >   conventional vfio group interface.  Additionally the flag
> >   VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported in this mode if
> >   all of the devices affected by the hot-reset are owned by either
> >   virtue of being directly bound to the same iommufd context as the
> >   calling device, or implicitly owned via a shared IOMMU group.  
> 
> Yes. it is.
> 
> > > and adds flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED to tell user if all
> > > the affected devices are owned, so user can know it without looping all
> > > the returned devids.
> > >
> > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > > ---
> > >  drivers/vfio/pci/vfio_pci_core.c | 52 ++++++++++++++++++++++++++++++--
> > >  include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++++-
> > >  2 files changed, 95 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > > index 4df2def35bdd..57586be770af 100644
> > > --- a/drivers/vfio/pci/vfio_pci_core.c
> > > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > > @@ -27,6 +27,7 @@
> > >  #include <linux/vgaarb.h>
> > >  #include <linux/nospec.h>
> > >  #include <linux/sched/mm.h>
> > > +#include <linux/iommufd.h>
> > >  #if IS_ENABLED(CONFIG_EEH)
> > >  #include <asm/eeh.h>
> > >  #endif
> > > @@ -36,6 +37,10 @@
> > >  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> > >  #define DRIVER_DESC "core driver for VFIO based PCI devices"
> > >
> > > +#ifdef CONFIG_IOMMUFD
> > > +MODULE_IMPORT_NS(IOMMUFD);
> > > +#endif
> > > +
> > >  static bool nointxmask;
> > >  static bool disable_vga;
> > >  static bool disable_idle_d3;
> > > @@ -776,6 +781,9 @@ struct vfio_pci_fill_info {
> > >  	int max;
> > >  	int cur;
> > >  	struct vfio_pci_dependent_device *devices;
> > > +	struct vfio_device *vdev;
> > > +	bool devid:1;
> > > +	bool dev_owned:1;
> > >  };
> > >
> > >  static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > > @@ -790,7 +798,37 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > >  	if (!iommu_group)
> > >  		return -EPERM; /* Cannot reset non-isolated devices */
> > >
> > > -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > > +	if (fill->devid) {
> > > +		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
> > > +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> > > +		struct vfio_device *vdev;
> > > +
> > > +		/*
> > > +		 * Report devid for the affected devices:
> > > +		 * - valid devid > 0 for the devices that are bound with
> > > +		 *   the iommufd of the calling device.
> > > +		 * - devid == 0 for the devices that have not been opened
> > > +		 *   but have same group with one of the devices bound to
> > > +		 *   the iommufd of the calling device.
> > > +		 * - devid == -1 for others, and clear dev_owned flag.
> > > +		 */
> > > +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> > > +		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
> > > +			int ret;
> > > +
> > > +			ret = vfio_iommufd_physical_devid(vdev);
> > > +			if (WARN_ON(ret < 0))
> > > +				return ret;
> > > +			fill->devices[fill->cur].devid = ret;  
> > 
> > Nit, @devid seems like a better variable name here rather than @ret.
> >   
> > > +		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
> > > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
> > > +		} else {
> > > +			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
> > > +			fill->dev_owned = false;
> > > +		}  
> > 
> > I think we're not describing the requirements for this middle test
> > correctly.  We're essentially only stating the iommufd_ctx_has_group()
> > part of the requirement, but we're also enforcing a
> > vfio_find_device_in_devset() requirement, which means the device is not
> > just unopened within a group shared by the iommufd context, but it must
> > also still be a device registered as a member of the devset, ie. it
> > must be bound to a vfio driver.  
> 
> Yes. This is to make it be par with hot-reset path. This is needed. Is it?

Yes.  In supporting this null-array model, the kernel is taking on the
responsibility to indicate that the hot-reset is available, which
necessarily includes the devset test.  I think the logic we have is
correct, but the documentation and comments should match the code as
well.

> > It's not a new requirement, it's imposed in the hot-reset ioctl itself,
> > but it's new for the info ioctl given that it's now trying to report
> > that the user can perform the reset for cdev callers.
> >
> > This also shares too much logic with vfio_device_owned() added in the
> > next patch.  I think it might be cleaner to move the iommu_group_get() to
> > the group path below and change vfio_device_owned() to something that
> > can be used here and in the reset path.  For example, if we had a
> > function like:
> > 
> > static int vfio_hot_reset_devid(struct vfio_device *vdev,
> >                                 struct iommufd_ctx *iommufd_ctx)
> > {
> >         struct iommu_group *group;
> >         int devid;
> > 
> >         if (!vdev)
> >                 return VFIO_PCI_DEVID_NOT_OWNED;
> > 
> >         if (vfio_iommufd_physical_ictx(vdev) == iommufd_ctx)
> >                 return vfio_iommufd_physical_devid(vdev);
> > 
> >         group = iommu_group_get(vdev->dev);
> >         if (!group)
> >                 return VFIO_PCI_DEVID_NOT_OWNED;
> > 
> >         if (iommufd_ctx_has_group(iommufd_ctx, group))
> >                 devid = VFIO_PCI_DEVID_OWNED;
> > 
> >         iommu_group_put(group);
> > 
> >         return devid;
> > }  
> 
> Thanks. This can be placed in vfio/iommufd.c, hence no need to
> Import the IOMMUFD namespace in vfio_pci_core.c.
> vfio_iommufd_physical_devid() can ebe static within vfio/iommufd.c

Good, that's an improvement too.  Thanks,

Alex
 
> > It could be called above as:
> > 
> > 	vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> > 	fill->devices[fill->cur].devid =
> > 			vfio_hot_reset_devid(vdev, iommufd);
> > 
> > 
> > And from vfio_pci_dev_set_hot_reset() as:
> > 
> > 	bool owned;
> > 
> > 	if (iommufd_ctx) {
> > 		int devid = vfio_hot_reset_devid(&cur_vma->vdev,
> > 						 iommufd_ctx);
> > 
> > 		owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
> > 	} else
> > 		owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
> > 
> > Any better?  
> 
> Above looks good enough.
> 
> >   
> > > +	} else {
> > > +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> > > +	}
> > >  	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
> > >  	fill->devices[fill->cur].bus = pdev->bus->number;
> > >  	fill->devices[fill->cur].devfn = pdev->devfn;
> > > @@ -1229,17 +1267,27 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
> > >  		return -ENOMEM;
> > >
> > >  	fill.devices = devices;
> > > +	fill.vdev = &vdev->vdev;
> > >
> > > +	mutex_lock(&vdev->vdev.dev_set->lock);
> > > +	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
> > >  	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
> > >  					    &fill, slot);
> > > +	mutex_unlock(&vdev->vdev.dev_set->lock);
> > >
> > >  	/*
> > >  	 * If a device was removed between counting and filling, we may come up
> > >  	 * short of fill.max.  If a device was added, we'll have a return of
> > >  	 * -EAGAIN above.
> > >  	 */
> > > -	if (!ret)
> > > +	if (!ret) {
> > >  		hdr.count = fill.cur;
> > > +		if (fill.devid) {
> > > +			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
> > > +			if (fill.dev_owned)
> > > +				hdr.flags |=  
> > VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;  
> > > +		}
> > > +	}  
> > 
> > Does this clean up the flag and branching a bit?  
> 
> Yes. 
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 4df2def35bdd..57586be770af 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,6 +27,7 @@ 
 #include <linux/vgaarb.h>
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
+#include <linux/iommufd.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -36,6 +37,10 @@ 
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
 #define DRIVER_DESC "core driver for VFIO based PCI devices"
 
+#ifdef CONFIG_IOMMUFD
+MODULE_IMPORT_NS(IOMMUFD);
+#endif
+
 static bool nointxmask;
 static bool disable_vga;
 static bool disable_idle_d3;
@@ -776,6 +781,9 @@  struct vfio_pci_fill_info {
 	int max;
 	int cur;
 	struct vfio_pci_dependent_device *devices;
+	struct vfio_device *vdev;
+	bool devid:1;
+	bool dev_owned:1;
 };
 
 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
@@ -790,7 +798,37 @@  static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 	if (!iommu_group)
 		return -EPERM; /* Cannot reset non-isolated devices */
 
-	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
+	if (fill->devid) {
+		struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(fill->vdev);
+		struct vfio_device_set *dev_set = fill->vdev->dev_set;
+		struct vfio_device *vdev;
+
+		/*
+		 * Report devid for the affected devices:
+		 * - valid devid > 0 for the devices that are bound with
+		 *   the iommufd of the calling device.
+		 * - devid == 0 for the devices that have not been opened
+		 *   but have same group with one of the devices bound to
+		 *   the iommufd of the calling device.
+		 * - devid == -1 for others, and clear dev_owned flag.
+		 */
+		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+		if (vdev && iommufd == vfio_iommufd_physical_ictx(vdev)) {
+			int ret;
+
+			ret = vfio_iommufd_physical_devid(vdev);
+			if (WARN_ON(ret < 0))
+				return ret;
+			fill->devices[fill->cur].devid = ret;
+		} else if (vdev && iommufd_ctx_has_group(iommufd, iommu_group)) {
+			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_OWNED;
+		} else {
+			fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED;
+			fill->dev_owned = false;
+		}
+	} else {
+		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
+	}
 	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
 	fill->devices[fill->cur].bus = pdev->bus->number;
 	fill->devices[fill->cur].devfn = pdev->devfn;
@@ -1229,17 +1267,27 @@  static int vfio_pci_ioctl_get_pci_hot_reset_info(
 		return -ENOMEM;
 
 	fill.devices = devices;
+	fill.vdev = &vdev->vdev;
 
+	mutex_lock(&vdev->vdev.dev_set->lock);
+	fill.devid = fill.dev_owned = vfio_device_cdev_opened(&vdev->vdev);
 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
 					    &fill, slot);
+	mutex_unlock(&vdev->vdev.dev_set->lock);
 
 	/*
 	 * If a device was removed between counting and filling, we may come up
 	 * short of fill.max.  If a device was added, we'll have a return of
 	 * -EAGAIN above.
 	 */
-	if (!ret)
+	if (!ret) {
 		hdr.count = fill.cur;
+		if (fill.devid) {
+			hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID;
+			if (fill.dev_owned)
+				hdr.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+		}
+	}
 
 reset_info_exit:
 	if (copy_to_user(arg, &hdr, minsz))
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0552e8dcf0cb..01203215251a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -650,11 +650,53 @@  enum {
  * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
  *					      struct vfio_pci_hot_reset_info)
  *
+ * This command is used to query the affected devices in the hot reset for
+ * a given device.
+ *
+ * This command always reports the segment, bus, and devfn information for
+ * each affected device, and selectively reports the group_id or devid per
+ * the way how the calling device is opened.
+ *
+ *	- If the calling device is opened via the traditional group/container
+ *	  API, group_id is reported.  User should check if it has owned all
+ *	  the affected devices and provides a set of group fds to prove the
+ *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
+ *
+ *	- If the calling device is opened as a cdev, devid is reported.
+ *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
+ *	  data type.  For a given affected device, it is considered owned by
+ *	  this interface if it meets the following conditions:
+ *	  1) Has a valid devid within the iommufd_ctx of the calling device.
+ *	     Ownership cannot be determined across separate iommufd_ctx and the
+ *	     cdev calling conventions do not support a proof-of-ownership model
+ *	     as provided in the legacy group interface.  In this case a valid
+ *	     devid with value greater than zero is provided in the return
+ *	     structure.
+ *	  2) Does not have a valid devid within the iommufd_ctx of the calling
+ *	     device, but belongs to the same IOMMU group as the calling device
+ *	     or another opened device that has a valid devid within the
+ *	     iommufd_ctx of the calling device.  This provides implicit ownership
+ *	     for devices within the same DMA isolation context.  In this case
+ *	     the invalid devid value of zero is provided in the return structure.
+ *
+ *	  A devid value of -1 is provided in the return structure for devices
+ *	  where ownership is not available.  Such devices prevent the use of
+ *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
+ *	  conventions (ie. via legacy group accessed devices).
+ *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
+ *	  affected devices are owned by the user.  This flag is available only
+ *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
+ *
  * Return: 0 on success, -errno on failure:
  *	-enospc = insufficient buffer, -enodev = unsupported for device.
  */
 struct vfio_pci_dependent_device {
-	__u32	group_id;
+	union {
+		__u32   group_id;
+		__u32	devid;
+#define VFIO_PCI_DEVID_OWNED		0
+#define VFIO_PCI_DEVID_NOT_OWNED	-1
+	};
 	__u16	segment;
 	__u8	bus;
 	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
@@ -663,6 +705,8 @@  struct vfio_pci_dependent_device {
 struct vfio_pci_hot_reset_info {
 	__u32	argsz;
 	__u32	flags;
+#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
+#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
 	__u32	count;
 	struct vfio_pci_dependent_device	devices[];
 };