diff mbox series

[v6,09/10] vfio/pci: Extend VFIO_DEVICE_GET_PCI_HOT_RESET_INFO for vfio device cdev

Message ID 20230522115751.326947-10-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Enhance vfio PCI hot reset for vfio cdev device | expand

Commit Message

Yi Liu May 22, 2023, 11:57 a.m. UTC
This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx
of the cdev device to check the ownership of the other affected devices.

When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed
device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate
the values returned are IOMMUFD devids rather than group IDs as used when
accessing vfio devices through the conventional vfio group interface.
Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported
in this mode if all of the devices affected by the hot-reset are owned by
either virtue of being directly bound to the same iommufd context as the
calling device, or implicitly owned via a shared IOMMU group.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/iommufd.c           | 57 ++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_core.c | 40 ++++++++++++++++++----
 include/linux/vfio.h             | 16 +++++++++
 include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++-
 4 files changed, 151 insertions(+), 8 deletions(-)

Comments

Alex Williamson May 24, 2023, 7:56 p.m. UTC | #1
On Mon, 22 May 2023 04:57:50 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx
> of the cdev device to check the ownership of the other affected devices.
> 
> When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed
> device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate
> the values returned are IOMMUFD devids rather than group IDs as used when
> accessing vfio devices through the conventional vfio group interface.
> Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported
> in this mode if all of the devices affected by the hot-reset are owned by
> either virtue of being directly bound to the same iommufd context as the
> calling device, or implicitly owned via a shared IOMMU group.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/iommufd.c           | 57 ++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_core.c | 40 ++++++++++++++++++----
>  include/linux/vfio.h             | 16 +++++++++
>  include/uapi/linux/vfio.h        | 46 +++++++++++++++++++++++++-
>  4 files changed, 151 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 356dd215a8d5..4dae9ab94eed 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -106,6 +106,63 @@ void vfio_iommufd_unbind(struct vfio_device *vdev)
>  		vdev->ops->unbind_iommufd(vdev);
>  }
>  
> +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev)
> +{
> +	if (vdev->iommufd_device)
> +		return iommufd_device_to_ictx(vdev->iommufd_device);
> +	if (vdev->iommufd_access)
> +		return iommufd_access_to_ictx(vdev->iommufd_access);
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx);
> +
> +static int vfio_iommufd_device_id(struct vfio_device *vdev)
> +{
> +	if (vdev->iommufd_device)
> +		return iommufd_device_to_id(vdev->iommufd_device);
> +	if (vdev->iommufd_access)
> +		return iommufd_access_to_id(vdev->iommufd_access);
> +	return -EINVAL;
> +}
> +
> +/*
> + * Return devid for vfio_device if the device is owned by the input
> + * ictx.
> + * - valid devid > 0 for the device that are bound to the input
> + *   iommufd_ctx.
> + * - devid == VFIO_PCI_DEVID_OWNED for the devices that have not
> + *   been opened but but other device within its group has been

"but but"

> + *   bound to the input iommufd_ctx.
> + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. vdev is
> + *   NULL.
> + */
> +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
> +					struct iommufd_ctx *ictx)
> +{
> +	struct iommu_group *group;
> +	int devid;
> +
> +	if (!vdev)
> +		return VFIO_PCI_DEVID_NOT_OWNED;
> +
> +	if (vfio_iommufd_device_ictx(vdev) == ictx)
> +		return vfio_iommufd_device_id(vdev);
> +
> +	group = iommu_group_get(vdev->dev);
> +	if (!group)
> +		return VFIO_PCI_DEVID_NOT_OWNED;
> +
> +	if (iommufd_ctx_has_group(ictx, group))
> +		devid = VFIO_PCI_DEVID_OWNED;
> +	else
> +		devid = VFIO_PCI_DEVID_NOT_OWNED;
> +
> +	iommu_group_put(group);
> +
> +	return devid;
> +}
> +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid);
> +
>  /*
>   * The physical standard ops mean that the iommufd_device is bound to the
>   * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 3a2f67675036..890065f846e4 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -27,6 +27,7 @@
>  #include <linux/vgaarb.h>
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
> +#include <linux/iommufd.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -776,26 +777,42 @@ struct vfio_pci_fill_info {
>  	int max;
>  	int cur;
>  	struct vfio_pci_dependent_device *devices;
> +	struct vfio_device *vdev;
> +	u32 flags;
>  };
>  
>  static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
>  {
>  	struct vfio_pci_fill_info *fill = data;
> -	struct iommu_group *iommu_group;
>  
>  	if (fill->cur == fill->max)
>  		return -EAGAIN; /* Something changed, try again */
>  
> -	iommu_group = iommu_group_get(&pdev->dev);
> -	if (!iommu_group)
> -		return -EPERM; /* Cannot reset non-isolated devices */
> +	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
> +		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
> +		struct vfio_device_set *dev_set = fill->vdev->dev_set;
> +		struct vfio_device *vdev;
> +
> +		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
> +		fill->devices[fill->cur].devid =
> +			vfio_iommufd_device_hot_reset_devid(vdev, iommufd);
> +		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
> +		if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED)
> +			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> +	} else {
> +		struct iommu_group *iommu_group;
> +
> +		iommu_group = iommu_group_get(&pdev->dev);
> +		if (!iommu_group)
> +			return -EPERM; /* Cannot reset non-isolated devices */
>  
> -	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
> +		iommu_group_put(iommu_group);
> +	}
>  	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
>  	fill->devices[fill->cur].bus = pdev->bus->number;
>  	fill->devices[fill->cur].devfn = pdev->devfn;
>  	fill->cur++;
> -	iommu_group_put(iommu_group);
>  	return 0;
>  }
>  
> @@ -1229,17 +1246,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
>  		return -ENOMEM;
>  
>  	fill.devices = devices;
> +	fill.vdev = &vdev->vdev;
>  
> +	if (vfio_device_cdev_opened(&vdev->vdev))
> +		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
> +			     VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
> +
> +	mutex_lock(&vdev->vdev.dev_set->lock);
>  	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
>  					    &fill, slot);
> +	mutex_unlock(&vdev->vdev.dev_set->lock);
>  
>  	/*
>  	 * If a device was removed between counting and filling, we may come up
>  	 * short of fill.max.  If a device was added, we'll have a return of
>  	 * -EAGAIN above.
>  	 */
> -	if (!ret)
> +	if (!ret) {
>  		hdr.count = fill.cur;
> +		hdr.flags = fill.flags;
> +	}
>  
>  reset_info_exit:
>  	if (copy_to_user(arg, &hdr, minsz))
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index ee120d2d530b..382a7b119c7c 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -114,6 +114,9 @@ struct vfio_device_ops {
>  };
>  
>  #if IS_ENABLED(CONFIG_IOMMUFD)
> +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev);
> +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
> +					struct iommufd_ctx *ictx);
>  int vfio_iommufd_physical_bind(struct vfio_device *vdev,
>  			       struct iommufd_ctx *ictx, u32 *out_device_id);
>  void vfio_iommufd_physical_unbind(struct vfio_device *vdev);
> @@ -123,6 +126,19 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
>  void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
>  int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
>  #else
> +static inline struct iommufd_ctx *
> +vfio_iommufd_device_ictx(struct vfio_device *vdev)
> +{
> +	return NULL;
> +}
> +
> +static inline int
> +vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
> +				    struct iommufd_ctx *ictx)
> +{
> +	return VFIO_PCI_DEVID_NOT_OWNED;
> +}
> +
>  #define vfio_iommufd_physical_bind                                      \
>  	((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx,   \
>  		  u32 *out_device_id)) NULL)
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 0552e8dcf0cb..01203215251a 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -650,11 +650,53 @@ enum {
>   * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
>   *					      struct vfio_pci_hot_reset_info)
>   *
> + * This command is used to query the affected devices in the hot reset for
> + * a given device.
> + *
> + * This command always reports the segment, bus, and devfn information for
> + * each affected device, and selectively reports the group_id or devid per
> + * the way how the calling device is opened.
> + *
> + *	- If the calling device is opened via the traditional group/container
> + *	  API, group_id is reported.  User should check if it has owned all
> + *	  the affected devices and provides a set of group fds to prove the
> + *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
> + *
> + *	- If the calling device is opened as a cdev, devid is reported.
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
> + *	  data type.  For a given affected device, it is considered owned by
> + *	  this interface if it meets the following conditions:
> + *	  1) Has a valid devid within the iommufd_ctx of the calling device.
> + *	     Ownership cannot be determined across separate iommufd_ctx and the
> + *	     cdev calling conventions do not support a proof-of-ownership model
> + *	     as provided in the legacy group interface.  In this case a valid
> + *	     devid with value greater than zero is provided in the return
> + *	     structure.
> + *	  2) Does not have a valid devid within the iommufd_ctx of the calling
> + *	     device, but belongs to the same IOMMU group as the calling device
> + *	     or another opened device that has a valid devid within the
> + *	     iommufd_ctx of the calling device.  This provides implicit ownership
> + *	     for devices within the same DMA isolation context.  In this case
> + *	     the invalid devid value of zero is provided in the return structure.
> + *
> + *	  A devid value of -1 is provided in the return structure for devices

s/zero/VFIO_PCI_DEVID_OWNED/

s/-1/VFIO_PCI_DEVID_NOT_OWNED/

2) above and previously in the code comment where I noted the repeated
"but" still doesn't actually describe the requirement as I noted in the
last review.  The user implicitly owns a device if they own another
device within the IOMMU group, but we also impose a dev_set requirement
in the hot reset path.  All affected devices need to be represented in
the dev_set, ex. bound to a vfio driver.  It's possible that requirement
might be relaxed in the new DMA ownership model, but as it is right
now, the code enforces that requirement and any new discussion about
what makes hot-reset available should note both the ownership and
dev_set requirement.  Thanks,

Alex


> + *	  where ownership is not available.  Such devices prevent the use of
> + *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
> + *	  conventions (ie. via legacy group accessed devices).
> + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
> + *	  affected devices are owned by the user.  This flag is available only
> + *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> + *
>   * Return: 0 on success, -errno on failure:
>   *	-enospc = insufficient buffer, -enodev = unsupported for device.
>   */
>  struct vfio_pci_dependent_device {
> -	__u32	group_id;
> +	union {
> +		__u32   group_id;
> +		__u32	devid;
> +#define VFIO_PCI_DEVID_OWNED		0
> +#define VFIO_PCI_DEVID_NOT_OWNED	-1
> +	};
>  	__u16	segment;
>  	__u8	bus;
>  	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
> @@ -663,6 +705,8 @@ struct vfio_pci_dependent_device {
>  struct vfio_pci_hot_reset_info {
>  	__u32	argsz;
>  	__u32	flags;
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
> +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
>  	__u32	count;
>  	struct vfio_pci_dependent_device	devices[];
>  };
Yi Liu May 25, 2023, 1:02 p.m. UTC | #2
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, May 25, 2023 3:56 AM
> On Mon, 22 May 2023 04:57:50 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > +
> > +/*
> > + * Return devid for vfio_device if the device is owned by the input
> > + * ictx.
> > + * - valid devid > 0 for the device that are bound to the input
> > + *   iommufd_ctx.
> > + * - devid == VFIO_PCI_DEVID_OWNED for the devices that have not
> > + *   been opened but but other device within its group has been
> 
> "but but"

Thanks for catching it.

> 
> > + *   bound to the input iommufd_ctx.
> > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. vdev is
> > + *   NULL.
> > + */
> > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
> > +					struct iommufd_ctx *ictx)
> > +{
> > +	struct iommu_group *group;
> > +	int devid;
> > +
> > +	if (!vdev)
> > +		return VFIO_PCI_DEVID_NOT_OWNED;
> > +
> > +	if (vfio_iommufd_device_ictx(vdev) == ictx)
> > +		return vfio_iommufd_device_id(vdev);
> > +
> > +	group = iommu_group_get(vdev->dev);
> > +	if (!group)
> > +		return VFIO_PCI_DEVID_NOT_OWNED;
> > +
> > +	if (iommufd_ctx_has_group(ictx, group))
> > +		devid = VFIO_PCI_DEVID_OWNED;
> > +	else
> > +		devid = VFIO_PCI_DEVID_NOT_OWNED;
> > +
> > +	iommu_group_put(group);
> > +
> > +	return devid;
> > +}

> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -650,11 +650,53 @@ enum {
> >   * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
> >   *					      struct vfio_pci_hot_reset_info)
> >   *
> > + * This command is used to query the affected devices in the hot reset for
> > + * a given device.
> > + *
> > + * This command always reports the segment, bus, and devfn information for
> > + * each affected device, and selectively reports the group_id or devid per
> > + * the way how the calling device is opened.
> > + *
> > + *	- If the calling device is opened via the traditional group/container
> > + *	  API, group_id is reported.  User should check if it has owned all
> > + *	  the affected devices and provides a set of group fds to prove the
> > + *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
> > + *
> > + *	- If the calling device is opened as a cdev, devid is reported.
> > + *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
> > + *	  data type.  For a given affected device, it is considered owned by
> > + *	  this interface if it meets the following conditions:
> > + *	  1) Has a valid devid within the iommufd_ctx of the calling device.
> > + *	     Ownership cannot be determined across separate iommufd_ctx and the
> > + *	     cdev calling conventions do not support a proof-of-ownership model
> > + *	     as provided in the legacy group interface.  In this case a valid
> > + *	     devid with value greater than zero is provided in the return
> > + *	     structure.
> > + *	  2) Does not have a valid devid within the iommufd_ctx of the calling
> > + *	     device, but belongs to the same IOMMU group as the calling device
> > + *	     or another opened device that has a valid devid within the
> > + *	     iommufd_ctx of the calling device.  This provides implicit ownership
> > + *	     for devices within the same DMA isolation context.  In this case
> > + *	     the invalid devid value of zero is provided in the return structure.
> > + *
> > + *	  A devid value of -1 is provided in the return structure for devices
> 
> s/zero/VFIO_PCI_DEVID_OWNED/
> 
> s/-1/VFIO_PCI_DEVID_NOT_OWNED/

Will do.

> 2) above and previously in the code comment where I noted the repeated
> "but" still doesn't actually describe the requirement as I noted in the
> last review.  The user implicitly owns a device if they own another
> device within the IOMMU group, but we also impose a dev_set requirement
> in the hot reset path.  All affected devices need to be represented in
> the dev_set, ex. bound to a vfio driver.

Yes. it is. Btw. dev_set is not visible to user. Is it good to mention it
in uapi header especially w.r.t. the below potential relaxing of this
requirement?

>  It's possible that requirement
> might be relaxed in the new DMA ownership model, but as it is right
> now, the code enforces that requirement and any new discussion about
> what makes hot-reset available should note both the ownership and
> dev_set requirement.  Thanks,

I think your point is that if an iommufd_ctx has acquired DMA ownerhisp
of an iommu_group, it means the device is owned. And it should not
matter whether all the devices in the iommu_group is present in the
dev_set. It is allowed that some devices are bound to pci-stub or
pcieport driver. Is it?

Actually I have a doubt on it. IIUC, the above requirement on dev_set
is to ensure the reset to the devices are protected by the dev_set->lock.
So that either the reset issued by driver itself or a hot reset request
from user, there is no race. But if a device is not in the dev_set, then
hot reset request from user might race with the bound driver. DMA ownership
only guarantees the drivers won't handle DMA via DMA API which would have
conflict with DMA mappings from user. I'm not sure if it is able to
guarantee reset is exclusive as well. I see pci-stub and pcieport driver
are the only two drivers that set the driver_managed_dma flag besides the
vfio drivers. pci-stub may be fine. not sure about pcieport driver.

   #   line  filename / context / line
   1     39  drivers/pci/pci-stub.c <<GLOBAL>>
             .driver_managed_dma = true,
   2    796  drivers/pci/pcie/portdrv.c <<GLOBAL>>
             .driver_managed_dma = true,
   3    607  drivers/vfio/fsl-mc/vfio_fsl_mc.c <<GLOBAL>>
             .driver_managed_dma = true,
   4   1459  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c <<GLOBAL>>
             .driver_managed_dma = true,
   5   1374  drivers/vfio/pci/mlx5/main.c <<GLOBAL>>
             .driver_managed_dma = true,
   6    203  drivers/vfio/pci/vfio_pci.c <<GLOBAL>>
             .driver_managed_dma = true,
   7    139  drivers/vfio/platform/vfio_amba.c <<GLOBAL>>
             .driver_managed_dma = true,
   8    120  drivers/vfio/platform/vfio_platform.c <<GLOBAL>>
             .driver_managed_dma = true,

Anyhow, I think this is not a must so far. is it? Even doable, it shall
be done in the future. :-)

Regards,
Yi Liu
Baolu Lu May 26, 2023, 2:04 a.m. UTC | #3
On 5/25/23 9:02 PM, Liu, Yi L wrote:
>>   It's possible that requirement
>> might be relaxed in the new DMA ownership model, but as it is right
>> now, the code enforces that requirement and any new discussion about
>> what makes hot-reset available should note both the ownership and
>> dev_set requirement.  Thanks,
> I think your point is that if an iommufd_ctx has acquired DMA ownerhisp
> of an iommu_group, it means the device is owned. And it should not
> matter whether all the devices in the iommu_group is present in the
> dev_set. It is allowed that some devices are bound to pci-stub or
> pcieport driver. Is it?
> 
> Actually I have a doubt on it. IIUC, the above requirement on dev_set
> is to ensure the reset to the devices are protected by the dev_set->lock.
> So that either the reset issued by driver itself or a hot reset request
> from user, there is no race. But if a device is not in the dev_set, then
> hot reset request from user might race with the bound driver. DMA ownership
> only guarantees the drivers won't handle DMA via DMA API which would have
> conflict with DMA mappings from user. I'm not sure if it is able to
> guarantee reset is exclusive as well. I see pci-stub and pcieport driver
> are the only two drivers that set the driver_managed_dma flag besides the
> vfio drivers. pci-stub may be fine. not sure about pcieport driver.

commit c7d469849747 ("PCI: portdrv: Set driver_managed_dma") described
the criteria of adding driver_managed_dma to the pcieport driver.

"
We achieve this by setting ".driver_managed_dma = true" in pci_driver
structure. It is safe because the portdrv driver meets below criteria:

- This driver doesn't use DMA, as you can't find any related calls like
   pci_set_master() or any kernel DMA API (dma_map_*() and etc.).
- It doesn't use MMIO as you can't find ioremap() or similar calls. It's
   tolerant to userspace possibly also touching the same MMIO registers
   via P2P DMA access.
"

pci_rest_device() definitely shouldn't be done by the kernel drivers
that have driver_managed_dma set.

> 
>     #   line  filename / context / line
>     1     39  drivers/pci/pci-stub.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     2    796  drivers/pci/pcie/portdrv.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     3    607  drivers/vfio/fsl-mc/vfio_fsl_mc.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     4   1459  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     5   1374  drivers/vfio/pci/mlx5/main.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     6    203  drivers/vfio/pci/vfio_pci.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     7    139  drivers/vfio/platform/vfio_amba.c <<GLOBAL>>
>               .driver_managed_dma = true,
>     8    120  drivers/vfio/platform/vfio_platform.c <<GLOBAL>>
>               .driver_managed_dma = true,
> 
> Anyhow, I think this is not a must so far. is it? Even doable, it shall
> be done in the future. 
Yi Liu June 1, 2023, 6:06 a.m. UTC | #4
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, June 1, 2023 3:00 AM
> 
> On Fri, May 26, 2023 at 10:04:27AM +0800, Baolu Lu wrote:
> > On 5/25/23 9:02 PM, Liu, Yi L wrote:
> > > >   It's possible that requirement
> > > > might be relaxed in the new DMA ownership model, but as it is right
> > > > now, the code enforces that requirement and any new discussion about
> > > > what makes hot-reset available should note both the ownership and
> > > > dev_set requirement.  Thanks,
> > > I think your point is that if an iommufd_ctx has acquired DMA ownerhisp
> > > of an iommu_group, it means the device is owned. And it should not
> > > matter whether all the devices in the iommu_group is present in the
> > > dev_set. It is allowed that some devices are bound to pci-stub or
> > > pcieport driver. Is it?
> > >
> > > Actually I have a doubt on it. IIUC, the above requirement on dev_set
> > > is to ensure the reset to the devices are protected by the dev_set->lock.
> > > So that either the reset issued by driver itself or a hot reset request
> > > from user, there is no race. But if a device is not in the dev_set, then
> > > hot reset request from user might race with the bound driver. DMA ownership
> > > only guarantees the drivers won't handle DMA via DMA API which would have
> > > conflict with DMA mappings from user. I'm not sure if it is able to
> > > guarantee reset is exclusive as well. I see pci-stub and pcieport driver
> > > are the only two drivers that set the driver_managed_dma flag besides the
> > > vfio drivers. pci-stub may be fine. not sure about pcieport driver.
> >
> > commit c7d469849747 ("PCI: portdrv: Set driver_managed_dma") described
> > the criteria of adding driver_managed_dma to the pcieport driver.
> >
> > "
> > We achieve this by setting ".driver_managed_dma = true" in pci_driver
> > structure. It is safe because the portdrv driver meets below criteria:
> >
> > - This driver doesn't use DMA, as you can't find any related calls like
> >   pci_set_master() or any kernel DMA API (dma_map_*() and etc.).
> > - It doesn't use MMIO as you can't find ioremap() or similar calls. It's
> >   tolerant to userspace possibly also touching the same MMIO registers
> >   via P2P DMA access.
> > "
> >
> > pci_rest_device() definitely shouldn't be done by the kernel drivers
> > that have driver_managed_dma set.
> 
> Right
> 
> The only time it is safe to reset is if you know there is no attached
> driver or you know VFIO is the attached driver and the caller owns the
> VFIO too.
> 
> We haven't done a no attached driver test due to races.

Ok. @Alex, should we relax the above dev_set requirement now or should
be in a separate series?

Regards,
Yi Liu
Alex Williamson June 1, 2023, 2:24 p.m. UTC | #5
On Thu, 1 Jun 2023 06:06:17 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, June 1, 2023 3:00 AM
> > 
> > On Fri, May 26, 2023 at 10:04:27AM +0800, Baolu Lu wrote:  
> > > On 5/25/23 9:02 PM, Liu, Yi L wrote:  
> > > > >   It's possible that requirement
> > > > > might be relaxed in the new DMA ownership model, but as it is right
> > > > > now, the code enforces that requirement and any new discussion about
> > > > > what makes hot-reset available should note both the ownership and
> > > > > dev_set requirement.  Thanks,  
> > > > I think your point is that if an iommufd_ctx has acquired DMA ownerhisp
> > > > of an iommu_group, it means the device is owned. And it should not
> > > > matter whether all the devices in the iommu_group is present in the
> > > > dev_set. It is allowed that some devices are bound to pci-stub or
> > > > pcieport driver. Is it?
> > > >
> > > > Actually I have a doubt on it. IIUC, the above requirement on dev_set
> > > > is to ensure the reset to the devices are protected by the dev_set->lock.
> > > > So that either the reset issued by driver itself or a hot reset request
> > > > from user, there is no race. But if a device is not in the dev_set, then
> > > > hot reset request from user might race with the bound driver. DMA ownership
> > > > only guarantees the drivers won't handle DMA via DMA API which would have
> > > > conflict with DMA mappings from user. I'm not sure if it is able to
> > > > guarantee reset is exclusive as well. I see pci-stub and pcieport driver
> > > > are the only two drivers that set the driver_managed_dma flag besides the
> > > > vfio drivers. pci-stub may be fine. not sure about pcieport driver.  
> > >
> > > commit c7d469849747 ("PCI: portdrv: Set driver_managed_dma") described
> > > the criteria of adding driver_managed_dma to the pcieport driver.
> > >
> > > "
> > > We achieve this by setting ".driver_managed_dma = true" in pci_driver
> > > structure. It is safe because the portdrv driver meets below criteria:
> > >
> > > - This driver doesn't use DMA, as you can't find any related calls like
> > >   pci_set_master() or any kernel DMA API (dma_map_*() and etc.).
> > > - It doesn't use MMIO as you can't find ioremap() or similar calls. It's
> > >   tolerant to userspace possibly also touching the same MMIO registers
> > >   via P2P DMA access.
> > > "
> > >
> > > pci_rest_device() definitely shouldn't be done by the kernel drivers
> > > that have driver_managed_dma set.  
> > 
> > Right
> > 
> > The only time it is safe to reset is if you know there is no attached
> > driver or you know VFIO is the attached driver and the caller owns the
> > VFIO too.
> > 
> > We haven't done a no attached driver test due to races.  
> 
> Ok. @Alex, should we relax the above dev_set requirement now or should
> be in a separate series?


Sounds like no, you should be rejecting enhancements that increase
scope at this point and I don't see consensus here.  My concern was
that we're not correctly describing the dev_set restriction which is
already in place but needs to be more explicitly described in an
implied ownership model vs proof of ownership model.  Thanks,

Alex
diff mbox series

Patch

diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 356dd215a8d5..4dae9ab94eed 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -106,6 +106,63 @@  void vfio_iommufd_unbind(struct vfio_device *vdev)
 		vdev->ops->unbind_iommufd(vdev);
 }
 
+struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_ictx(vdev->iommufd_device);
+	if (vdev->iommufd_access)
+		return iommufd_access_to_ictx(vdev->iommufd_access);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx);
+
+static int vfio_iommufd_device_id(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_id(vdev->iommufd_device);
+	if (vdev->iommufd_access)
+		return iommufd_access_to_id(vdev->iommufd_access);
+	return -EINVAL;
+}
+
+/*
+ * Return devid for vfio_device if the device is owned by the input
+ * ictx.
+ * - valid devid > 0 for the device that are bound to the input
+ *   iommufd_ctx.
+ * - devid == VFIO_PCI_DEVID_OWNED for the devices that have not
+ *   been opened but but other device within its group has been
+ *   bound to the input iommufd_ctx.
+ * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. vdev is
+ *   NULL.
+ */
+int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
+					struct iommufd_ctx *ictx)
+{
+	struct iommu_group *group;
+	int devid;
+
+	if (!vdev)
+		return VFIO_PCI_DEVID_NOT_OWNED;
+
+	if (vfio_iommufd_device_ictx(vdev) == ictx)
+		return vfio_iommufd_device_id(vdev);
+
+	group = iommu_group_get(vdev->dev);
+	if (!group)
+		return VFIO_PCI_DEVID_NOT_OWNED;
+
+	if (iommufd_ctx_has_group(ictx, group))
+		devid = VFIO_PCI_DEVID_OWNED;
+	else
+		devid = VFIO_PCI_DEVID_NOT_OWNED;
+
+	iommu_group_put(group);
+
+	return devid;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid);
+
 /*
  * The physical standard ops mean that the iommufd_device is bound to the
  * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3a2f67675036..890065f846e4 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,6 +27,7 @@ 
 #include <linux/vgaarb.h>
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
+#include <linux/iommufd.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -776,26 +777,42 @@  struct vfio_pci_fill_info {
 	int max;
 	int cur;
 	struct vfio_pci_dependent_device *devices;
+	struct vfio_device *vdev;
+	u32 flags;
 };
 
 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 {
 	struct vfio_pci_fill_info *fill = data;
-	struct iommu_group *iommu_group;
 
 	if (fill->cur == fill->max)
 		return -EAGAIN; /* Something changed, try again */
 
-	iommu_group = iommu_group_get(&pdev->dev);
-	if (!iommu_group)
-		return -EPERM; /* Cannot reset non-isolated devices */
+	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
+		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
+		struct vfio_device_set *dev_set = fill->vdev->dev_set;
+		struct vfio_device *vdev;
+
+		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+		fill->devices[fill->cur].devid =
+			vfio_iommufd_device_hot_reset_devid(vdev, iommufd);
+		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
+		if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED)
+			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+	} else {
+		struct iommu_group *iommu_group;
+
+		iommu_group = iommu_group_get(&pdev->dev);
+		if (!iommu_group)
+			return -EPERM; /* Cannot reset non-isolated devices */
 
-	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
+		fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
+		iommu_group_put(iommu_group);
+	}
 	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
 	fill->devices[fill->cur].bus = pdev->bus->number;
 	fill->devices[fill->cur].devfn = pdev->devfn;
 	fill->cur++;
-	iommu_group_put(iommu_group);
 	return 0;
 }
 
@@ -1229,17 +1246,26 @@  static int vfio_pci_ioctl_get_pci_hot_reset_info(
 		return -ENOMEM;
 
 	fill.devices = devices;
+	fill.vdev = &vdev->vdev;
 
+	if (vfio_device_cdev_opened(&vdev->vdev))
+		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+			     VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
 					    &fill, slot);
+	mutex_unlock(&vdev->vdev.dev_set->lock);
 
 	/*
 	 * If a device was removed between counting and filling, we may come up
 	 * short of fill.max.  If a device was added, we'll have a return of
 	 * -EAGAIN above.
 	 */
-	if (!ret)
+	if (!ret) {
 		hdr.count = fill.cur;
+		hdr.flags = fill.flags;
+	}
 
 reset_info_exit:
 	if (copy_to_user(arg, &hdr, minsz))
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ee120d2d530b..382a7b119c7c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -114,6 +114,9 @@  struct vfio_device_ops {
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
+struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev);
+int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
+					struct iommufd_ctx *ictx);
 int vfio_iommufd_physical_bind(struct vfio_device *vdev,
 			       struct iommufd_ctx *ictx, u32 *out_device_id);
 void vfio_iommufd_physical_unbind(struct vfio_device *vdev);
@@ -123,6 +126,19 @@  int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
 void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
 int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 #else
+static inline struct iommufd_ctx *
+vfio_iommufd_device_ictx(struct vfio_device *vdev)
+{
+	return NULL;
+}
+
+static inline int
+vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev,
+				    struct iommufd_ctx *ictx)
+{
+	return VFIO_PCI_DEVID_NOT_OWNED;
+}
+
 #define vfio_iommufd_physical_bind                                      \
 	((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx,   \
 		  u32 *out_device_id)) NULL)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0552e8dcf0cb..01203215251a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -650,11 +650,53 @@  enum {
  * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12,
  *					      struct vfio_pci_hot_reset_info)
  *
+ * This command is used to query the affected devices in the hot reset for
+ * a given device.
+ *
+ * This command always reports the segment, bus, and devfn information for
+ * each affected device, and selectively reports the group_id or devid per
+ * the way how the calling device is opened.
+ *
+ *	- If the calling device is opened via the traditional group/container
+ *	  API, group_id is reported.  User should check if it has owned all
+ *	  the affected devices and provides a set of group fds to prove the
+ *	  ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl.
+ *
+ *	- If the calling device is opened as a cdev, devid is reported.
+ *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this
+ *	  data type.  For a given affected device, it is considered owned by
+ *	  this interface if it meets the following conditions:
+ *	  1) Has a valid devid within the iommufd_ctx of the calling device.
+ *	     Ownership cannot be determined across separate iommufd_ctx and the
+ *	     cdev calling conventions do not support a proof-of-ownership model
+ *	     as provided in the legacy group interface.  In this case a valid
+ *	     devid with value greater than zero is provided in the return
+ *	     structure.
+ *	  2) Does not have a valid devid within the iommufd_ctx of the calling
+ *	     device, but belongs to the same IOMMU group as the calling device
+ *	     or another opened device that has a valid devid within the
+ *	     iommufd_ctx of the calling device.  This provides implicit ownership
+ *	     for devices within the same DMA isolation context.  In this case
+ *	     the invalid devid value of zero is provided in the return structure.
+ *
+ *	  A devid value of -1 is provided in the return structure for devices
+ *	  where ownership is not available.  Such devices prevent the use of
+ *	  VFIO_DEVICE_PCI_HOT_RESET outside of proof-of-ownership calling
+ *	  conventions (ie. via legacy group accessed devices).
+ *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
+ *	  affected devices are owned by the user.  This flag is available only
+ *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
+ *
  * Return: 0 on success, -errno on failure:
  *	-enospc = insufficient buffer, -enodev = unsupported for device.
  */
 struct vfio_pci_dependent_device {
-	__u32	group_id;
+	union {
+		__u32   group_id;
+		__u32	devid;
+#define VFIO_PCI_DEVID_OWNED		0
+#define VFIO_PCI_DEVID_NOT_OWNED	-1
+	};
 	__u16	segment;
 	__u8	bus;
 	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
@@ -663,6 +705,8 @@  struct vfio_pci_dependent_device {
 struct vfio_pci_hot_reset_info {
 	__u32	argsz;
 	__u32	flags;
+#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID		(1 << 0)
+#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED	(1 << 1)
 	__u32	count;
 	struct vfio_pci_dependent_device	devices[];
 };