diff mbox series

[v3,09/14] vfio/pci: Change vfio_pci_try_bus_reset() to use the dev_set

Message ID 9-v3-6c9e19cc7d44+15613-vfio_reflck_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Provide core infrastructure for managing open/release | expand

Commit Message

Jason Gunthorpe July 29, 2021, 12:49 a.m. UTC
Keep track of all the vfio_devices that have been added to the device set
and use this list in vfio_pci_try_bus_reset() instead of trying to work
backwards from the pci_device.

The dev_set->lock directly prevents devices from joining/leaving the set,
which further implies that the pci_device cannot change drivers and that the
vfio_device cannot be freed, eliminating the need for get/put's.

Completeness of the device set can be directly measured by checking if
every PCI device in the reset group is also in the device set - which
proves that VFIO drivers are attached to everything.

This restructuring corrects a call to pci_dev_driver() without holding the
device_lock() and removes a hard wiring to &vfio_pci_driver.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/pci/vfio_pci.c | 148 +++++++++++++++---------------------
 1 file changed, 62 insertions(+), 86 deletions(-)

Comments

Christoph Hellwig July 29, 2021, 7:30 a.m. UTC | #1
> +/*
> + * vfio-core considers a group to be viable and will create a vfio_device even
> + * if some devices are bound to drivers like pci-stub or pcieport.  Here we
> + * require all PCI devices to be inside our dev_set since that ensures they stay
> + * put and that every driver controlling the device can co-ordinate with the
> + * device reset.
> + */
> +static struct pci_dev *vfio_pci_find_reset_target(struct vfio_pci_device *vdev)
> +{
> +	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
> +	struct vfio_pci_device *cur;
> +	bool needs_reset = false;
> +
> +	/* No VFIO device has an open device FD */

s/has an/can have/ ?

Or maybe:

	/* No device in the set can have an open device FD */

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Alex Williamson Aug. 3, 2021, 4:34 p.m. UTC | #2
On Wed, 28 Jul 2021 21:49:18 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> Keep track of all the vfio_devices that have been added to the device set
> and use this list in vfio_pci_try_bus_reset() instead of trying to work
> backwards from the pci_device.
> 
> The dev_set->lock directly prevents devices from joining/leaving the set,
> which further implies the pci_device cannot change drivers or that the
> vfio_device be freed, eliminating the need for get/put's.
> 
> Completeness of the device set can be directly measured by checking if
> every PCI device in the reset group is also in the device set - which
> proves that VFIO drivers are attached to everything.
> 
> This restructuring corrects a call to pci_dev_driver() without holding the
> device_lock() and removes a hard wiring to &vfio_pci_driver.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/vfio/pci/vfio_pci.c | 148 +++++++++++++++---------------------
>  1 file changed, 62 insertions(+), 86 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 5d6db93d6c680f..a1ae9a83a38621 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -404,6 +404,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
>  	int i, bar;
>  
> +	/* For needs_reset */
> +	lockdep_assert_held(&vdev->vdev.dev_set->lock);
> +
>  	/* Stop the device from further DMA */
>  	pci_clear_master(pdev);
>  
> @@ -2145,7 +2148,7 @@ static struct pci_driver vfio_pci_driver = {
>  	.err_handler		= &vfio_err_handlers,
>  };
>  
> -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> +static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
>  {
>  	struct vfio_devices *devs = data;
>  	struct vfio_device *device;
> @@ -2165,8 +2168,11 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
>  
>  	vdev = container_of(device, struct vfio_pci_device, vdev);
>  
> -	/* Fault if the device is not unused */
> -	if (device->open_count) {
> +	/*
> +	 * Locking multiple devices is prone to deadlock, runaway and
> +	 * unwind if we hit contention.
> +	 */
> +	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
>  		vfio_device_put(device);
>  		return -EBUSY;
>  	}
> @@ -2175,112 +2181,82 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
>  	return 0;
>  }
>  
> -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
> +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
>  {
> -	struct vfio_devices *devs = data;
> -	struct vfio_device *device;
> -	struct vfio_pci_device *vdev;
> +	struct vfio_device_set *dev_set = data;
> +	struct vfio_device *cur;
>  
> -	if (devs->cur_index == devs->max_index)
> -		return -ENOSPC;
> +	lockdep_assert_held(&dev_set->lock);
>  
> -	device = vfio_device_get_from_dev(&pdev->dev);
> -	if (!device)
> -		return -EINVAL;
> -
> -	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> -		vfio_device_put(device);
> -		return -EBUSY;
> -	}
> -
> -	vdev = container_of(device, struct vfio_pci_device, vdev);
> +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> +		if (cur->dev == &pdev->dev)
> +			return 0;
> +	return -EBUSY;
> +}
>  
> -	/*
> -	 * Locking multiple devices is prone to deadlock, runaway and
> -	 * unwind if we hit contention.
> -	 */
> -	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
> -		vfio_device_put(device);
> -		return -EBUSY;
> +/*
> + * vfio-core considers a group to be viable and will create a vfio_device even
> + * if some devices are bound to drivers like pci-stub or pcieport.  Here we
> + * require all PCI devices to be inside our dev_set since that ensures they stay
> + * put and that every driver controlling the device can co-ordinate with the
> + * device reset.
> + */
> +static struct pci_dev *vfio_pci_find_reset_target(struct vfio_pci_device *vdev)
> +{
> +	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
> +	struct vfio_pci_device *cur;
> +	bool needs_reset = false;
> +
> +	/* No VFIO device has an open device FD */
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
> +		if (cur->vdev.open_count)
> +			return NULL;
> +		needs_reset |= cur->needs_reset;
>  	}
> +	if (!needs_reset)
> +		return NULL;
>  
> -	devs->devices[devs->cur_index++] = vdev;
> -	return 0;
> +	/* All PCI devices in the group to be reset need to be in our dev_set */
> +	if (vfio_pci_for_each_slot_or_bus(
> +		    vdev->pdev, vfio_pci_is_device_in_set, dev_set,
> +		    !pci_probe_reset_slot(vdev->pdev->slot)))
> +		return NULL;
> +	return cur->pdev;


I don't understand the "reset target" aspect of this, cur->pdev is
simply the last entry in the dev_set->device_list...

>  }
>  
>  /*
>   * If a bus or slot reset is available for the provided device and:
>   *  - All of the devices affected by that bus or slot reset are unused
> - *    (!refcnt)
>   *  - At least one of the affected devices is marked dirty via
>   *    needs_reset (such as by lack of FLR support)
> - * Then attempt to perform that bus or slot reset.  Callers are required
> - * to hold vdev->dev_set->lock, protecting the bus/slot reset group from
> - * concurrent opens.  A vfio_device reference is acquired for each device
> - * to prevent unbinds during the reset operation.
> - *
> - * NB: vfio-core considers a group to be viable even if some devices are
> - * bound to drivers like pci-stub or pcieport.  Here we require all devices
> - * to be bound to vfio_pci since that's the only way we can be sure they
> - * stay put.
> + * Then attempt to perform that bus or slot reset.
>   */
>  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>  {
> -	struct vfio_devices devs = { .cur_index = 0 };
> -	int i = 0, ret = -EINVAL;
> -	bool slot = false;
> -	struct vfio_pci_device *tmp;
> -
> -	if (!pci_probe_reset_slot(vdev->pdev->slot))
> -		slot = true;
> -	else if (pci_probe_reset_bus(vdev->pdev->bus))
> -		return;
> +	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
> +	struct pci_dev *to_reset;
> +	struct vfio_pci_device *cur;
> +	int ret;
>  
> -	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
> -					  &i, slot) || !i)
> -		return;
> +	lockdep_assert_held(&vdev->vdev.dev_set->lock);
>  
> -	devs.max_index = i;
> -	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
> -	if (!devs.devices)
> +	if (pci_probe_reset_slot(vdev->pdev->slot) &&
> +	    pci_probe_reset_bus(vdev->pdev->bus))
>  		return;
>  
> -	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> -					  vfio_pci_get_unused_devs,
> -					  &devs, slot))
> -		goto put_devs;
> -
> -	/* Does at least one need a reset? */
> -	for (i = 0; i < devs.cur_index; i++) {
> -		tmp = devs.devices[i];
> -		if (tmp->needs_reset) {
> -			ret = pci_reset_bus(vdev->pdev);
> -			break;
> -		}
> -	}
> -
> -put_devs:
> -	for (i = 0; i < devs.cur_index; i++) {
> -		tmp = devs.devices[i];
> -
> -		/*
> -		 * If reset was successful, affected devices no longer need
> -		 * a reset and we should return all the collateral devices
> -		 * to low power.  If not successful, we either didn't reset
> -		 * the bus or timed out waiting for it, so let's not touch
> -		 * the power state.
> -		 */
> -		if (!ret) {
> -			tmp->needs_reset = false;
> +	to_reset = vfio_pci_find_reset_target(vdev);
> +	if (!to_reset)
> +		return;
>  
> -			if (tmp != vdev && !disable_idle_d3)
> -				vfio_pci_set_power_state(tmp, PCI_D3hot);
> -		}
> +	ret = pci_reset_bus(to_reset);
> +	if (ret)
> +		return;
>  
> -		vfio_device_put(&tmp->vdev);
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
> +		cur->needs_reset = false;
> +		if (cur->pdev != to_reset && !disable_idle_d3)
> +			vfio_pci_set_power_state(cur, PCI_D3hot);
>  	}

...which means that here, I think we're putting all but whichever
random device was last in the list into D3.  The intention was that all
the devices except for the one we're operating on should already be in
D3, the bus reset will put them back in D0, so we want to force them
back to D3.

I think the vfio_pci_find_reset_target() function needs to be re-worked
to just tell us true/false that it's ok to reset the provided device,
not to anoint an arbitrary target device.  Thanks,

Alex

> -
> -	kfree(devs.devices);
>  }
>  
>  static void __exit vfio_pci_cleanup(void)
Jason Gunthorpe Aug. 3, 2021, 4:41 p.m. UTC | #3
On Tue, Aug 03, 2021 at 10:34:06AM -0600, Alex Williamson wrote:
> On Wed, 28 Jul 2021 21:49:18 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > Keep track of all the vfio_devices that have been added to the device set
> > and use this list in vfio_pci_try_bus_reset() instead of trying to work
> > backwards from the pci_device.
> > 
> > The dev_set->lock directly prevents devices from joining/leaving the set,
> > which further implies the pci_device cannot change drivers or that the
> > vfio_device be freed, eliminating the need for get/put's.
> > 
> > Completeness of the device set can be directly measured by checking if
> > every PCI device in the reset group is also in the device set - which
> > proves that VFIO drivers are attached to everything.
> > 
> > This restructuring corrects a call to pci_dev_driver() without holding the
> > device_lock() and removes a hard wiring to &vfio_pci_driver.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> >  drivers/vfio/pci/vfio_pci.c | 148 +++++++++++++++---------------------
> >  1 file changed, 62 insertions(+), 86 deletions(-)
> > 
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 5d6db93d6c680f..a1ae9a83a38621 100644
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -404,6 +404,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
> >  	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
> >  	int i, bar;
> >  
> > +	/* For needs_reset */
> > +	lockdep_assert_held(&vdev->vdev.dev_set->lock);
> > +
> >  	/* Stop the device from further DMA */
> >  	pci_clear_master(pdev);
> >  
> > @@ -2145,7 +2148,7 @@ static struct pci_driver vfio_pci_driver = {
> >  	.err_handler		= &vfio_err_handlers,
> >  };
> >  
> > -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> > +static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
> >  {
> >  	struct vfio_devices *devs = data;
> >  	struct vfio_device *device;
> > @@ -2165,8 +2168,11 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> >  
> >  	vdev = container_of(device, struct vfio_pci_device, vdev);
> >  
> > -	/* Fault if the device is not unused */
> > -	if (device->open_count) {
> > +	/*
> > +	 * Locking multiple devices is prone to deadlock, runaway and
> > +	 * unwind if we hit contention.
> > +	 */
> > +	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
> >  		vfio_device_put(device);
> >  		return -EBUSY;
> >  	}
> > @@ -2175,112 +2181,82 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> >  	return 0;
> >  }
> >  
> > -static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
> > +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
> >  {
> > -	struct vfio_devices *devs = data;
> > -	struct vfio_device *device;
> > -	struct vfio_pci_device *vdev;
> > +	struct vfio_device_set *dev_set = data;
> > +	struct vfio_device *cur;
> >  
> > -	if (devs->cur_index == devs->max_index)
> > -		return -ENOSPC;
> > +	lockdep_assert_held(&dev_set->lock);
> >  
> > -	device = vfio_device_get_from_dev(&pdev->dev);
> > -	if (!device)
> > -		return -EINVAL;
> > -
> > -	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> > -		vfio_device_put(device);
> > -		return -EBUSY;
> > -	}
> > -
> > -	vdev = container_of(device, struct vfio_pci_device, vdev);
> > +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> > +		if (cur->dev == &pdev->dev)
> > +			return 0;
> > +	return -EBUSY;
> > +}
> >  
> > -	/*
> > -	 * Locking multiple devices is prone to deadlock, runaway and
> > -	 * unwind if we hit contention.
> > -	 */
> > -	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
> > -		vfio_device_put(device);
> > -		return -EBUSY;
> > +/*
> > + * vfio-core considers a group to be viable and will create a vfio_device even
> > + * if some devices are bound to drivers like pci-stub or pcieport.  Here we
> > + * require all PCI devices to be inside our dev_set since that ensures they stay
> > + * put and that every driver controlling the device can co-ordinate with the
> > + * device reset.
> > + */
> > +static struct pci_dev *vfio_pci_find_reset_target(struct vfio_pci_device *vdev)
> > +{
> > +	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
> > +	struct vfio_pci_device *cur;
> > +	bool needs_reset = false;
> > +
> > +	/* No VFIO device has an open device FD */
> > +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
> > +		if (cur->vdev.open_count)
> > +			return NULL;
> > +		needs_reset |= cur->needs_reset;
> >  	}
> > +	if (!needs_reset)
> > +		return NULL;
> >  
> > -	devs->devices[devs->cur_index++] = vdev;
> > -	return 0;
> > +	/* All PCI devices in the group to be reset need to be in our dev_set */
> > +	if (vfio_pci_for_each_slot_or_bus(
> > +		    vdev->pdev, vfio_pci_is_device_in_set, dev_set,
> > +		    !pci_probe_reset_slot(vdev->pdev->slot)))
> > +		return NULL;
> > +	return cur->pdev;
> 
> 
> I don't understand the "reset target" aspect of this, cur->pdev is
> simply the last entry in the dev_set->devices_list...

Oh, hum, this got messed up someplace along the way, the original code
was just:

        /* Does at least one need a reset? */
        for (i = 0; i < devs.cur_index; i++) {
                tmp = devs.devices[i];
                if (tmp->needs_reset) {
                        ret = pci_reset_bus(vdev->pdev);
                        break;

So should this, I'll fix it up, thanks

> I think the vfio_pci_find_reset_target() function needs to be re-worked
> to just tell us true/false that it's ok to reset the provided device,
> not to anoint an arbitrary target device.  Thanks,

Yes, though this logic is confusing: why do we need to check if any
device needs a reset at this point? If we are being asked to reset
vdev, shouldn't vdev->needs_reset already be set?

Or is the function more of a 'synchronize pending reset' kind of
thing?

Jason
Alex Williamson Aug. 3, 2021, 4:52 p.m. UTC | #4
On Tue, 3 Aug 2021 13:41:52 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:
> On Tue, Aug 03, 2021 at 10:34:06AM -0600, Alex Williamson wrote:
> > I think the vfio_pci_find_reset_target() function needs to be re-worked
> > to just tell us true/false that it's ok to reset the provided device,
> > not to anoint an arbitrary target device.  Thanks,  
> 
> Yes, though this logic is confusing, why do we need to check if any
> device needs a reset at this point? If we are being asked to reset
> vdev shouldn't vdev needs_reset?
> 
> Or is the function more of a 'synchronize pending reset' kind of
> thing?

Yes, the latter.  For instance think about a multi-function PCI device
such as a GPU.  The functions have dramatically different capabilities,
some might have function level reset abilities and others not.  We want
to be able to trigger a bus reset as the last device of the set is
released, no matter the order they're released and no matter the
capabilities of the device we're currently processing.  Thanks,

Alex
Jason Gunthorpe Aug. 5, 2021, 11:47 a.m. UTC | #5
On Tue, Aug 03, 2021 at 10:52:25AM -0600, Alex Williamson wrote:
> On Tue, 3 Aug 2021 13:41:52 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> > On Tue, Aug 03, 2021 at 10:34:06AM -0600, Alex Williamson wrote:
> > > I think the vfio_pci_find_reset_target() function needs to be re-worked
> > > to just tell us true/false that it's ok to reset the provided device,
> > > not to anoint an arbitrary target device.  Thanks,  
> > 
> > Yes, though this logic is confusing, why do we need to check if any
> > device needs a reset at this point? If we are being asked to reset
> > vdev shouldn't vdev needs_reset?
> > 
> > Or is the function more of a 'synchronize pending reset' kind of
> > thing?
> 
> Yes, the latter.  For instance think about a multi-function PCI device
> such as a GPU.  The functions have dramatically different capabilities,
> some might have function level reset abilities and others not.  We want
> to be able to trigger a bus reset as the last device of the set is
> released, no matter the order they're released and no matter the
> capabilities of the device we're currently processing.  Thanks,

I worked on this for awhile, I think this is much clearer about what
this algorithm is trying to do:

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 5d6db93d6c680f..e418bcbb68facc 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -223,7 +223,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 	}
 }
 
-static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
+static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
 
@@ -404,6 +404,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
 	int i, bar;
 
+	/* For needs_reset */
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
+
 	/* Stop the device from further DMA */
 	pci_clear_master(pdev);
 
@@ -487,9 +490,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 out:
 	pci_disable_device(pdev);
 
-	vfio_pci_try_bus_reset(vdev);
-
-	if (!disable_idle_d3)
+	if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3)
 		vfio_pci_set_power_state(vdev, PCI_D3hot);
 }
 
@@ -2145,36 +2146,6 @@ static struct pci_driver vfio_pci_driver = {
 	.err_handler		= &vfio_err_handlers,
 };
 
-static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
-{
-	struct vfio_devices *devs = data;
-	struct vfio_device *device;
-	struct vfio_pci_device *vdev;
-
-	if (devs->cur_index == devs->max_index)
-		return -ENOSPC;
-
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (!device)
-		return -EINVAL;
-
-	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	vdev = container_of(device, struct vfio_pci_device, vdev);
-
-	/* Fault if the device is not unused */
-	if (device->open_count) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	devs->devices[devs->cur_index++] = vdev;
-	return 0;
-}
-
 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
@@ -2208,79 +2179,86 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 	return 0;
 }
 
+static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
+{
+	struct vfio_device_set *dev_set = data;
+	struct vfio_device *cur;
+
+	lockdep_assert_held(&dev_set->lock);
+
+	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
+		if (cur->dev == &pdev->dev)
+			return 0;
+	return -EBUSY;
+}
+
+static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
+{
+	struct vfio_pci_device *cur;
+	bool needs_reset = false;
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		/* No VFIO device in the set can have an open device FD */
+		if (cur->vdev.open_count)
+			return false;
+		needs_reset |= cur->needs_reset;
+	}
+	return needs_reset;
+}
+
 /*
- * If a bus or slot reset is available for the provided device and:
+ * If a bus or slot reset is available for the provided dev_set and:
  *  - All of the devices affected by that bus or slot reset are unused
- *    (!refcnt)
  *  - At least one of the affected devices is marked dirty via
  *    needs_reset (such as by lack of FLR support)
- * Then attempt to perform that bus or slot reset.  Callers are required
- * to hold vdev->dev_set->lock, protecting the bus/slot reset group from
- * concurrent opens.  A vfio_device reference is acquired for each device
- * to prevent unbinds during the reset operation.
- *
- * NB: vfio-core considers a group to be viable even if some devices are
- * bound to drivers like pci-stub or pcieport.  Here we require all devices
- * to be bound to vfio_pci since that's the only way we can be sure they
- * stay put.
+ * Then attempt to perform that bus or slot reset.
+ * Returns true if the dev_set was reset.
  */
-static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
+static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
 {
-	struct vfio_devices devs = { .cur_index = 0 };
-	int i = 0, ret = -EINVAL;
-	bool slot = false;
-	struct vfio_pci_device *tmp;
+	struct vfio_pci_device *cur;
+	struct pci_dev *pdev;
+	int ret;
 
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return;
+	lockdep_assert_held(&dev_set->lock);
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
-					  &i, slot) || !i)
-		return;
+	/*
+	 * By definition all PCI devices in the dev_set share the same PCI
+	 * reset, so any pci_dev will have the same outcomes for
+	 * pci_probe_reset_*() and pci_reset_bus().
+	 */
+	pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device,
+				vdev.dev_set_list)->pdev;
 
-	devs.max_index = i;
-	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
-	if (!devs.devices)
-		return;
+	/* Reset of the dev_set is possible */
+	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
+		return false;
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_unused_devs,
-					  &devs, slot))
-		goto put_devs;
+	if (!vfio_pci_dev_set_needs_reset(dev_set))
+		return false;
 
-	/* Does at least one need a reset? */
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-		if (tmp->needs_reset) {
-			ret = pci_reset_bus(vdev->pdev);
-			break;
-		}
+	/*
+	 * vfio-core considers a group to be viable and will create a
+	 * vfio_device even if some devices are bound to drivers like pci-stub
+	 * or pcieport. Here we require all PCI devices to be inside our dev_set
+	 * since that ensures they stay put and that every driver controlling
+	 * the device can co-ordinate with the device reset.
+	 */
+	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
+					  dev_set,
+					  !pci_probe_reset_slot(pdev->slot)))
+		return false;
+
+	ret = pci_reset_bus(pdev);
+	if (ret)
+		return false;
+
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		cur->needs_reset = false;
+		if (!disable_idle_d3)
+			vfio_pci_set_power_state(cur, PCI_D3hot);
 	}
-
-put_devs:
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-
-		/*
-		 * If reset was successful, affected devices no longer need
-		 * a reset and we should return all the collateral devices
-		 * to low power.  If not successful, we either didn't reset
-		 * the bus or timed out waiting for it, so let's not touch
-		 * the power state.
-		 */
-		if (!ret) {
-			tmp->needs_reset = false;
-
-			if (tmp != vdev && !disable_idle_d3)
-				vfio_pci_set_power_state(tmp, PCI_D3hot);
-		}
-
-		vfio_device_put(&tmp->vdev);
-	}
-
-	kfree(devs.devices);
+	return true;
 }
 
 static void __exit vfio_pci_cleanup(void)
Alex Williamson Aug. 5, 2021, 5:33 p.m. UTC | #6
On Thu, 5 Aug 2021 08:47:01 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Aug 03, 2021 at 10:52:25AM -0600, Alex Williamson wrote:
> > On Tue, 3 Aug 2021 13:41:52 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:  
> > > On Tue, Aug 03, 2021 at 10:34:06AM -0600, Alex Williamson wrote:  
> > > > I think the vfio_pci_find_reset_target() function needs to be re-worked
> > > > to just tell us true/false that it's ok to reset the provided device,
> > > > not to anoint an arbitrary target device.  Thanks,    
> > > 
> > > Yes, though this logic is confusing, why do we need to check if any
> > > device needs a reset at this point? If we are being asked to reset
> > > vdev shouldn't vdev needs_reset?
> > > 
> > > Or is the function more of a 'synchronize pending reset' kind of
> > > thing?  
> > 
> > Yes, the latter.  For instance think about a multi-function PCI device
> > such as a GPU.  The functions have dramatically different capabilities,
> > some might have function level reset abilities and others not.  We want
> > to be able to trigger a bus reset as the last device of the set is
> > released, no matter the order they're released and no matter the
> > capabilities of the device we're currently processing.  Thanks,  
> 
> I worked on this for awhile, I think this is much clearer about what
> this algorithm is trying to do:
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 5d6db93d6c680f..e418bcbb68facc 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -223,7 +223,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>  	}
>  }
>  
> -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
> +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
>  static void vfio_pci_disable(struct vfio_pci_device *vdev);
>  static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
>  
> @@ -404,6 +404,9 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
>  	int i, bar;
>  
> +	/* For needs_reset */
> +	lockdep_assert_held(&vdev->vdev.dev_set->lock);
> +
>  	/* Stop the device from further DMA */
>  	pci_clear_master(pdev);
>  
> @@ -487,9 +490,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  out:
>  	pci_disable_device(pdev);
>  
> -	vfio_pci_try_bus_reset(vdev);
> -
> -	if (!disable_idle_d3)
> +	if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3)
>  		vfio_pci_set_power_state(vdev, PCI_D3hot);
>  }
>  
> @@ -2145,36 +2146,6 @@ static struct pci_driver vfio_pci_driver = {
>  	.err_handler		= &vfio_err_handlers,
>  };
>  
> -static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> -{
> -	struct vfio_devices *devs = data;
> -	struct vfio_device *device;
> -	struct vfio_pci_device *vdev;
> -
> -	if (devs->cur_index == devs->max_index)
> -		return -ENOSPC;
> -
> -	device = vfio_device_get_from_dev(&pdev->dev);
> -	if (!device)
> -		return -EINVAL;
> -
> -	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> -		vfio_device_put(device);
> -		return -EBUSY;
> -	}
> -
> -	vdev = container_of(device, struct vfio_pci_device, vdev);
> -
> -	/* Fault if the device is not unused */
> -	if (device->open_count) {
> -		vfio_device_put(device);
> -		return -EBUSY;
> -	}
> -
> -	devs->devices[devs->cur_index++] = vdev;
> -	return 0;
> -}
> -
>  static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
>  {
>  	struct vfio_devices *devs = data;
> @@ -2208,79 +2179,86 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
>  	return 0;
>  }
>  
> +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
> +{
> +	struct vfio_device_set *dev_set = data;
> +	struct vfio_device *cur;
> +
> +	lockdep_assert_held(&dev_set->lock);
> +
> +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> +		if (cur->dev == &pdev->dev)
> +			return 0;
> +	return -EBUSY;
> +}
> +
> +static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)

Slight nit on the name here since we're essentially combining
needs_reset along with the notion of the device being unused.  I'm not
sure, maybe "should_reset"?  Otherwise it looks ok.  Thanks,

Alex

> +{
> +	struct vfio_pci_device *cur;
> +	bool needs_reset = false;
> +
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
> +		/* No VFIO device in the set can have an open device FD */
> +		if (cur->vdev.open_count)
> +			return false;
> +		needs_reset |= cur->needs_reset;
> +	}
> +	return needs_reset;
> +}
> +
>  /*
> - * If a bus or slot reset is available for the provided device and:
> + * If a bus or slot reset is available for the provided dev_set and:
>   *  - All of the devices affected by that bus or slot reset are unused
> - *    (!refcnt)
>   *  - At least one of the affected devices is marked dirty via
>   *    needs_reset (such as by lack of FLR support)
> - * Then attempt to perform that bus or slot reset.  Callers are required
> - * to hold vdev->dev_set->lock, protecting the bus/slot reset group from
> - * concurrent opens.  A vfio_device reference is acquired for each device
> - * to prevent unbinds during the reset operation.
> - *
> - * NB: vfio-core considers a group to be viable even if some devices are
> - * bound to drivers like pci-stub or pcieport.  Here we require all devices
> - * to be bound to vfio_pci since that's the only way we can be sure they
> - * stay put.
> + * Then attempt to perform that bus or slot reset.
> + * Returns true if the dev_set was reset.
>   */
> -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
> +static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
>  {
> -	struct vfio_devices devs = { .cur_index = 0 };
> -	int i = 0, ret = -EINVAL;
> -	bool slot = false;
> -	struct vfio_pci_device *tmp;
> +	struct vfio_pci_device *cur;
> +	struct pci_dev *pdev;
> +	int ret;
>  
> -	if (!pci_probe_reset_slot(vdev->pdev->slot))
> -		slot = true;
> -	else if (pci_probe_reset_bus(vdev->pdev->bus))
> -		return;
> +	lockdep_assert_held(&dev_set->lock);
>  
> -	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
> -					  &i, slot) || !i)
> -		return;
> +	/*
> +	 * By definition all PCI devices in the dev_set share the same PCI
> +	 * reset, so any pci_dev will have the same outcomes for
> +	 * pci_probe_reset_*() and pci_reset_bus().
> +	 */
> +	pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device,
> +				vdev.dev_set_list)->pdev;
>  
> -	devs.max_index = i;
> -	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
> -	if (!devs.devices)
> -		return;
> +	/* Reset of the dev_set is possible */
> +	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
> +		return false;
>  
> -	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> -					  vfio_pci_get_unused_devs,
> -					  &devs, slot))
> -		goto put_devs;
> +	if (!vfio_pci_dev_set_needs_reset(dev_set))
> +		return false;
>  
> -	/* Does at least one need a reset? */
> -	for (i = 0; i < devs.cur_index; i++) {
> -		tmp = devs.devices[i];
> -		if (tmp->needs_reset) {
> -			ret = pci_reset_bus(vdev->pdev);
> -			break;
> -		}
> +	/*
> +	 * vfio-core considers a group to be viable and will create a
> +	 * vfio_device even if some devices are bound to drivers like pci-stub
> +	 * or pcieport. Here we require all PCI devices to be inside our dev_set
> +	 * since that ensures they stay put and that every driver controlling
> +	 * the device can co-ordinate with the device reset.
> +	 */
> +	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
> +					  dev_set,
> +					  !pci_probe_reset_slot(pdev->slot)))
> +		return false;
> +
> +	ret = pci_reset_bus(pdev);
> +	if (ret)
> +		return false;
> +
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
> +		cur->needs_reset = false;
> +		if (!disable_idle_d3)
> +			vfio_pci_set_power_state(cur, PCI_D3hot);
>  	}
> -
> -put_devs:
> -	for (i = 0; i < devs.cur_index; i++) {
> -		tmp = devs.devices[i];
> -
> -		/*
> -		 * If reset was successful, affected devices no longer need
> -		 * a reset and we should return all the collateral devices
> -		 * to low power.  If not successful, we either didn't reset
> -		 * the bus or timed out waiting for it, so let's not touch
> -		 * the power state.
> -		 */
> -		if (!ret) {
> -			tmp->needs_reset = false;
> -
> -			if (tmp != vdev && !disable_idle_d3)
> -				vfio_pci_set_power_state(tmp, PCI_D3hot);
> -		}
> -
> -		vfio_device_put(&tmp->vdev);
> -	}
> -
> -	kfree(devs.devices);
> +	return true;
>  }
>  
>  static void __exit vfio_pci_cleanup(void)
>
Jason Gunthorpe Aug. 5, 2021, 11:05 p.m. UTC | #7
On Thu, Aug 05, 2021 at 11:33:11AM -0600, Alex Williamson wrote:
> > +static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
> > +{
> > +	struct vfio_device_set *dev_set = data;
> > +	struct vfio_device *cur;
> > +
> > +	lockdep_assert_held(&dev_set->lock);
> > +
> > +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> > +		if (cur->dev == &pdev->dev)
> > +			return 0;
> > +	return -EBUSY;
> > +}
> > +
> > +static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
> 
> Slight nit on the name here since we're essentially combining
> needs_reset along with the notion of the device being unused.  I'm not
> sure, maybe "should_reset"?  Otherwise it looks ok.  Thanks,

What I did is add a new function vfio_pci_dev_set_resettable() which
pulls in three parts of logic that can be shared with the
VFIO_DEVICE_PCI_HOT_RESET change in the next patch. That leaves this
function as purely needs_reset.

In turn the VFIO_DEVICE_PCI_HOT_RESET patch gets the same treatment
where it becomes a dev_set centric API just like this.

I'll send it as a v4.

Thanks,
Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 5d6db93d6c680f..a1ae9a83a38621 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -404,6 +404,9 @@  static void vfio_pci_disable(struct vfio_pci_device *vdev)
 	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
 	int i, bar;
 
+	/* For needs_reset */
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
+
 	/* Stop the device from further DMA */
 	pci_clear_master(pdev);
 
@@ -2145,7 +2148,7 @@  static struct pci_driver vfio_pci_driver = {
 	.err_handler		= &vfio_err_handlers,
 };
 
-static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
 	struct vfio_device *device;
@@ -2165,8 +2168,11 @@  static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 
 	vdev = container_of(device, struct vfio_pci_device, vdev);
 
-	/* Fault if the device is not unused */
-	if (device->open_count) {
+	/*
+	 * Locking multiple devices is prone to deadlock, runaway and
+	 * unwind if we hit contention.
+	 */
+	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
 		vfio_device_put(device);
 		return -EBUSY;
 	}
@@ -2175,112 +2181,82 @@  static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 	return 0;
 }
 
-static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
+static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
 {
-	struct vfio_devices *devs = data;
-	struct vfio_device *device;
-	struct vfio_pci_device *vdev;
+	struct vfio_device_set *dev_set = data;
+	struct vfio_device *cur;
 
-	if (devs->cur_index == devs->max_index)
-		return -ENOSPC;
+	lockdep_assert_held(&dev_set->lock);
 
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (!device)
-		return -EINVAL;
-
-	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	vdev = container_of(device, struct vfio_pci_device, vdev);
+	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
+		if (cur->dev == &pdev->dev)
+			return 0;
+	return -EBUSY;
+}
 
-	/*
-	 * Locking multiple devices is prone to deadlock, runaway and
-	 * unwind if we hit contention.
-	 */
-	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
-		vfio_device_put(device);
-		return -EBUSY;
+/*
+ * vfio-core considers a group to be viable and will create a vfio_device even
+ * if some devices are bound to drivers like pci-stub or pcieport.  Here we
+ * require all PCI devices to be inside our dev_set since that ensures they stay
+ * put and that every driver controlling the device can co-ordinate with the
+ * device reset.
+ */
+static struct pci_dev *vfio_pci_find_reset_target(struct vfio_pci_device *vdev)
+{
+	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
+	struct vfio_pci_device *cur;
+	bool needs_reset = false;
+
+	/* No VFIO device has an open device FD */
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		if (cur->vdev.open_count)
+			return NULL;
+		needs_reset |= cur->needs_reset;
 	}
+	if (!needs_reset)
+		return NULL;
 
-	devs->devices[devs->cur_index++] = vdev;
-	return 0;
+	/* All PCI devices in the group to be reset need to be in our dev_set */
+	if (vfio_pci_for_each_slot_or_bus(
+		    vdev->pdev, vfio_pci_is_device_in_set, dev_set,
+		    !pci_probe_reset_slot(vdev->pdev->slot)))
+		return NULL;
+	return vdev->pdev;
 }
 
 /*
  * If a bus or slot reset is available for the provided device and:
  *  - All of the devices affected by that bus or slot reset are unused
- *    (!refcnt)
  *  - At least one of the affected devices is marked dirty via
  *    needs_reset (such as by lack of FLR support)
- * Then attempt to perform that bus or slot reset.  Callers are required
- * to hold vdev->dev_set->lock, protecting the bus/slot reset group from
- * concurrent opens.  A vfio_device reference is acquired for each device
- * to prevent unbinds during the reset operation.
- *
- * NB: vfio-core considers a group to be viable even if some devices are
- * bound to drivers like pci-stub or pcieport.  Here we require all devices
- * to be bound to vfio_pci since that's the only way we can be sure they
- * stay put.
+ * Then attempt to perform that bus or slot reset.
  */
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
-	struct vfio_devices devs = { .cur_index = 0 };
-	int i = 0, ret = -EINVAL;
-	bool slot = false;
-	struct vfio_pci_device *tmp;
-
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return;
+	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
+	struct pci_dev *to_reset;
+	struct vfio_pci_device *cur;
+	int ret;
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
-					  &i, slot) || !i)
-		return;
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
 
-	devs.max_index = i;
-	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
-	if (!devs.devices)
+	if (pci_probe_reset_slot(vdev->pdev->slot) &&
+	    pci_probe_reset_bus(vdev->pdev->bus))
 		return;
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_unused_devs,
-					  &devs, slot))
-		goto put_devs;
-
-	/* Does at least one need a reset? */
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-		if (tmp->needs_reset) {
-			ret = pci_reset_bus(vdev->pdev);
-			break;
-		}
-	}
-
-put_devs:
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-
-		/*
-		 * If reset was successful, affected devices no longer need
-		 * a reset and we should return all the collateral devices
-		 * to low power.  If not successful, we either didn't reset
-		 * the bus or timed out waiting for it, so let's not touch
-		 * the power state.
-		 */
-		if (!ret) {
-			tmp->needs_reset = false;
+	to_reset = vfio_pci_find_reset_target(vdev);
+	if (!to_reset)
+		return;
 
-			if (tmp != vdev && !disable_idle_d3)
-				vfio_pci_set_power_state(tmp, PCI_D3hot);
-		}
+	ret = pci_reset_bus(to_reset);
+	if (ret)
+		return;
 
-		vfio_device_put(&tmp->vdev);
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		cur->needs_reset = false;
+		if (cur->pdev != to_reset && !disable_idle_d3)
+			vfio_pci_set_power_state(cur, PCI_D3hot);
 	}
-
-	kfree(devs.devices);
 }
 
 static void __exit vfio_pci_cleanup(void)