diff mbox series

[v2,09/14] vfio/pci: Change vfio_pci_try_bus_reset() to use the dev_set

Message ID 9-v2-b6a5582525c9+ff96-vfio_reflck_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Provide core infrastructure for managing open/release | expand

Commit Message

Jason Gunthorpe July 20, 2021, 5:42 p.m. UTC
Keep track of all the vfio_devices that have been added to the device set
and use this list in vfio_pci_try_bus_reset() instead of trying to work
backwards from the pci_device.

The dev_set->lock directly prevents devices from joining/leaving the set,
which further implies the pci_device cannot change drivers or that the
vfio_device be freed, eliminating the need for get/put's.

Completeness of the device set can be directly measured by checking if
every PCI device in the reset group is also in the device set - which
proves that VFIO drivers are attached to everything.

This restructuring corrects a call to pci_dev_driver() without holding the
device_lock() and removes a hard wiring to &vfio_pci_driver.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/vfio/pci/vfio_pci.c | 110 ++++++++++++++----------------------
 drivers/vfio/vfio.c         |  10 ++++
 include/linux/vfio.h        |   2 +
 3 files changed, 53 insertions(+), 69 deletions(-)

Comments

Christoph Hellwig July 23, 2021, 8:05 a.m. UTC | #1
On Tue, Jul 20, 2021 at 02:42:55PM -0300, Jason Gunthorpe wrote:
> Keep track of all the vfio_devices that have been added to the device set
> and use this list in vfio_pci_try_bus_reset() instead of trying to work
> backwards from the pci_device.
> 
> The dev_set->lock directly prevents devices from joining/leaving the set,
> which further implies the pci_device cannot change drivers or that the
> vfio_device be freed, eliminating the need for get/put's.
> 
> Completeness of the device set can be directly measured by checking if
> every PCI device in the reset group is also in the device set - which
> proves that VFIO drivers are attached to everything.
> 
> This restructuring corrects a call to pci_dev_driver() without holding the
> device_lock() and removes a hard wiring to &vfio_pci_driver.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

I think the addition of the list to the dev_set should be a different
patch.  Or maybe even go into the one adding the dev_set concept.

> +static int vfio_pci_check_all_devices_bound(struct pci_dev *pdev, void *data)
>  {
> +	struct vfio_device_set *dev_set = data;
> +	struct vfio_device *cur;
>  
> +	lockdep_assert_held(&dev_set->lock);
>  
> +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> +		if (cur->dev == &pdev->dev)
> +			return 0;
> +	return -EBUSY;

I don't understand this logic.  If there is any device in the set that
does not have the same struct device we're in trouble?  Please clearly
document what this is trying to do.  If the bound in the name makes sense
you probably want to check the driver instead.

>  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>  {
> +	/* All VFIO devices have a closed FD */
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
> +		if (cur->vdev.open_count)
> +			return;
> +
> +	/* All devices in the group to be reset need VFIO devices */
> +	if (vfio_pci_for_each_slot_or_bus(
> +		    vdev->pdev, vfio_pci_check_all_devices_bound, dev_set,
> +		    !pci_probe_reset_slot(vdev->pdev->slot)))
> +		return;
>  
>  	/* Does at least one need a reset? */

These checks look a little strange, and the comments don't make much
sense.  What about an incremental patch like this?

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index fbc20f6d2dd412..d8375a5e77e07c 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -2188,10 +2188,34 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 	return 0;
 }
 
+static struct pci_dev *vfio_pci_reset_target(struct vfio_pci_device *vdev)
+{
+	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
+	struct vfio_pci_device *cur;
+
+	/* none of the device is allowed to be currently open: */
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		if (cur->vdev.open_count)
+			return NULL;
+
+	/* all devices in the group to be reset need to be VFIO devices: */
+	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
+			vfio_pci_check_all_devices_bound, dev_set,
+			!pci_probe_reset_slot(vdev->pdev->slot)))
+		return NULL;
+
+	/* Does at least one need a reset? */
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		if (cur->needs_reset)
+			return cur->pdev;
+
+	return NULL;
+}
+
 /*
  * If a bus or slot reset is available for the provided device and:
  *  - All of the devices affected by that bus or slot reset are unused
- *    (!refcnt)
+ *    (!open_count)
  *  - At least one of the affected devices is marked dirty via
  *    needs_reset (such as by lack of FLR support)
  * Then attempt to perform that bus or slot reset.  Callers are required
@@ -2206,8 +2230,8 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
 	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
-	struct vfio_pci_device *to_reset = NULL;
 	struct vfio_pci_device *cur;
+	struct pci_dev *to_reset;
 	int ret;
 
 	if (pci_probe_reset_slot(vdev->pdev->slot) &&
@@ -2216,35 +2240,18 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 
 	lockdep_assert_held(&vdev->vdev.dev_set->lock);
 
-	/* All VFIO devices have a closed FD */
-	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-		if (cur->vdev.open_count)
-			return;
-
-	/* All devices in the group to be reset need VFIO devices */
-	if (vfio_pci_for_each_slot_or_bus(
-		    vdev->pdev, vfio_pci_check_all_devices_bound, dev_set,
-		    !pci_probe_reset_slot(vdev->pdev->slot)))
-		return;
-
-	/* Does at least one need a reset? */
-	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
-		if (cur->needs_reset) {
-			to_reset = cur;
-			break;
-		}
-	}
+	to_reset = vfio_pci_reset_target(vdev);
 	if (!to_reset)
 		return;
 
-	ret = pci_reset_bus(to_reset->pdev);
+	ret = pci_reset_bus(to_reset);
 	if (ret)
 		return;
 
 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
 		cur->needs_reset = false;
 
-		if (cur != to_reset && !disable_idle_d3)
+		if (cur->pdev != to_reset && !disable_idle_d3)
 			vfio_pci_set_power_state(cur, PCI_D3hot);
 	}
 }
Jason Gunthorpe July 23, 2021, 1:30 p.m. UTC | #2
On Fri, Jul 23, 2021 at 10:05:43AM +0200, Christoph Hellwig wrote:
> On Tue, Jul 20, 2021 at 02:42:55PM -0300, Jason Gunthorpe wrote:
> > Keep track of all the vfio_devices that have been added to the device set
> > and use this list in vfio_pci_try_bus_reset() instead of trying to work
> > backwards from the pci_device.
> > 
> > The dev_set->lock directly prevents devices from joining/leaving the set,
> > which further implies the pci_device cannot change drivers or that the
> > vfio_device be freed, eliminating the need for get/put's.
> > 
> > Completeness of the device set can be directly measured by checking if
> > every PCI device in the reset group is also in the device set - which
> > proves that VFIO drivers are attached to everything.
> > 
> > This restructuring corrects a call to pci_dev_driver() without holding the
> > device_lock() and removes a hard wiring to &vfio_pci_driver.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> 
> I think the addition of the list to the dev_set should be a different
> patch.  Or maybe even go into the one adding the dev_set concept.

OK

> > +static int vfio_pci_check_all_devices_bound(struct pci_dev *pdev, void *data)
> >  {
> > +	struct vfio_device_set *dev_set = data;
> > +	struct vfio_device *cur;
> >  
> > +	lockdep_assert_held(&dev_set->lock);
> >  
> > +	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
> > +		if (cur->dev == &pdev->dev)
> > +			return 0;
> > +	return -EBUSY;
> 
> I don't understand this logic.  If there is any device in the set that
> does now have the same struct device we're in trouble?  Please clearly
> document what this is trying to do.  If the bound in the name makes sense
> you probably want to check the driver instead.

The PCI reset this code is trying to do affects a set of PCI devices,
due to how the HW works.

Along with the vfio_pci_for_each_slot_or_bus() this is computing a set
wise 'is superset' between the list of pci_dev's the reset will affect
(the reset group) and the list of vfio_devices that we have locking
control over to sequence the reset (the dev_set).

If every PCI device we will reset is under the dev_set then we
directly know it is safe to trigger the reset. If any PCI device is
not in this dev_set then we cannot use the reset as we can't know what
will happen to the device that we don't control.

Let's use a different word than bound? vfio_pci_check_device_in_set()?

> >  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
> >  {
> > +	/* All VFIO devices have a closed FD */
> > +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
> > +		if (cur->vdev.open_count)
> > +			return;
> > +
> > +	/* All devices in the group to be reset need VFIO devices */
> > +	if (vfio_pci_for_each_slot_or_bus(
> > +		    vdev->pdev, vfio_pci_check_all_devices_bound, dev_set,
> > +		    !pci_probe_reset_slot(vdev->pdev->slot)))
> > +		return;
> >  
> >  	/* Does at least one need a reset? */
> 
> These checks look a little strange, and the comments don't make much
> sense.  What about an incremental patch like this?

Sure, it can go in a function

> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index fbc20f6d2dd412..d8375a5e77e07c 100644
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -2188,10 +2188,34 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
>  	return 0;
>  }
>  
> +static struct pci_dev *vfio_pci_reset_target(struct vfio_pci_device *vdev)
> +{
> +	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
> +	struct vfio_pci_device *cur;
> +
> +	/* none of the device is allowed to be currently open: */
> +	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
> +		if (cur->vdev.open_count)
> +			return NULL;
> +
> +	/* all devices in the group to be reset need to be VFIO devices: */

It is not "need to be VFIO devices" it is "need to be in our
dev_set". Having the PCI dev bound to, say, an mdev VFIO device isn't
good enough.

Thanks,
Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 22774e447b5f4a..fbc20f6d2dd412 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -404,6 +404,9 @@  static void vfio_pci_disable(struct vfio_pci_device *vdev)
 	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
 	int i, bar;
 
+	/* For needs_reset */
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
+
 	/* Stop the device from further DMA */
 	pci_clear_master(pdev);
 
@@ -2139,34 +2142,17 @@  static struct pci_driver vfio_pci_driver = {
 	.err_handler		= &vfio_err_handlers,
 };
 
-static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_check_all_devices_bound(struct pci_dev *pdev, void *data)
 {
-	struct vfio_devices *devs = data;
-	struct vfio_device *device;
-	struct vfio_pci_device *vdev;
-
-	if (devs->cur_index == devs->max_index)
-		return -ENOSPC;
+	struct vfio_device_set *dev_set = data;
+	struct vfio_device *cur;
 
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (!device)
-		return -EINVAL;
-
-	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	vdev = container_of(device, struct vfio_pci_device, vdev);
-
-	/* Fault if the device is not unused */
-	if (device->open_count) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
+	lockdep_assert_held(&dev_set->lock);
 
-	devs->devices[devs->cur_index++] = vdev;
-	return 0;
+	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
+		if (cur->dev == &pdev->dev)
+			return 0;
+	return -EBUSY;
 }
 
 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
@@ -2210,8 +2196,7 @@  static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
  *    needs_reset (such as by lack of FLR support)
  * Then attempt to perform that bus or slot reset.  Callers are required
  * to hold vdev->dev_set->lock, protecting the bus/slot reset group from
- * concurrent opens.  A vfio_device reference is acquired for each device
- * to prevent unbinds during the reset operation.
+ * concurrent opens.
  *
  * NB: vfio-core considers a group to be viable even if some devices are
  * bound to drivers like pci-stub or pcieport.  Here we require all devices
@@ -2220,61 +2205,48 @@  static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
  */
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
-	struct vfio_devices devs = { .cur_index = 0 };
-	int i = 0, ret = -EINVAL;
-	bool slot = false;
-	struct vfio_pci_device *tmp;
-
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return;
+	struct vfio_device_set *dev_set = vdev->vdev.dev_set;
+	struct vfio_pci_device *to_reset = NULL;
+	struct vfio_pci_device *cur;
+	int ret;
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
-					  &i, slot) || !i)
+	if (pci_probe_reset_slot(vdev->pdev->slot) &&
+	    pci_probe_reset_bus(vdev->pdev->bus))
 		return;
 
-	devs.max_index = i;
-	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
-	if (!devs.devices)
-		return;
+	lockdep_assert_held(&vdev->vdev.dev_set->lock);
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_unused_devs,
-					  &devs, slot))
-		goto put_devs;
+	/* All VFIO devices have a closed FD */
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+		if (cur->vdev.open_count)
+			return;
+
+	/* All devices in the group to be reset need VFIO devices */
+	if (vfio_pci_for_each_slot_or_bus(
+		    vdev->pdev, vfio_pci_check_all_devices_bound, dev_set,
+		    !pci_probe_reset_slot(vdev->pdev->slot)))
+		return;
 
 	/* Does at least one need a reset? */
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-		if (tmp->needs_reset) {
-			ret = pci_reset_bus(vdev->pdev);
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		if (cur->needs_reset) {
+			to_reset = cur;
 			break;
 		}
 	}
+	if (!to_reset)
+		return;
 
-put_devs:
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = devs.devices[i];
-
-		/*
-		 * If reset was successful, affected devices no longer need
-		 * a reset and we should return all the collateral devices
-		 * to low power.  If not successful, we either didn't reset
-		 * the bus or timed out waiting for it, so let's not touch
-		 * the power state.
-		 */
-		if (!ret) {
-			tmp->needs_reset = false;
+	ret = pci_reset_bus(to_reset->pdev);
+	if (ret)
+		return;
 
-			if (tmp != vdev && !disable_idle_d3)
-				vfio_pci_set_power_state(tmp, PCI_D3hot);
-		}
+	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+		cur->needs_reset = false;
 
-		vfio_device_put(&tmp->vdev);
+		if (cur != to_reset && !disable_idle_d3)
+			vfio_pci_set_power_state(cur, PCI_D3hot);
 	}
-
-	kfree(devs.devices);
 }
 
 static void __exit vfio_pci_cleanup(void)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 8572d943320214..7e352d68b1b01d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -129,7 +129,12 @@  int vfio_assign_device_set(struct vfio_device *device, void *set_id)
 	if (dev_set) {
 		dev_set->device_count++;
 		xa_unlock(&vfio_device_set_xa);
+
+		mutex_lock(&dev_set->lock);
 		device->dev_set = dev_set;
+		list_add_tail(&device->dev_set_list, &dev_set->device_list);
+		mutex_unlock(&dev_set->lock);
+
 		if (dev_set != alloc_dev_set)
 			kfree(alloc_dev_set);
 		return 0;
@@ -143,6 +148,7 @@  int vfio_assign_device_set(struct vfio_device *device, void *set_id)
 	if (!alloc_dev_set)
 		return -ENOMEM;
 	mutex_init(&alloc_dev_set->lock);
+	INIT_LIST_HEAD(&alloc_dev_set->device_list);
 	alloc_dev_set->set_id = set_id;
 	goto again;
 }
@@ -155,6 +161,10 @@  static void vfio_release_device_set(struct vfio_device *device)
 	if (!dev_set)
 		return;
 
+	mutex_lock(&dev_set->lock);
+	list_del(&device->dev_set_list);
+	mutex_unlock(&dev_set->lock);
+
 	xa_lock(&vfio_device_set_xa);
 	dev_set->device_count--;
 	if (!dev_set->device_count) {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 128b4db00adc57..f0e6a72875e471 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -23,6 +23,7 @@ 
 struct vfio_device_set {
 	void *set_id;
 	struct mutex lock;
+	struct list_head device_list;
 	unsigned int device_count;
 };
 
@@ -31,6 +32,7 @@  struct vfio_device {
 	const struct vfio_device_ops *ops;
 	struct vfio_group *group;
 	struct vfio_device_set *dev_set;
+	struct list_head dev_set_list;
 
 	/* Members below here are private, not for driver use */
 	refcount_t refcount;