diff mbox series

[v2] vfio/pci: Parallelize device open and release

Message ID 154238501981.17682.10484940408205923021.stgit@gimli.home (mailing list archive)
State New, archived
Headers show
Series [v2] vfio/pci: Parallelize device open and release | expand

Commit Message

Alex Williamson Nov. 16, 2018, 4:18 p.m. UTC
In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
remove") a mutex was added to freeze the refcnt for a device so that
we can handle errors and perform bus resets on final close.  However,
bus resets can be rather slow and a global mutex here is undesirable.
Evaluating the potential locking granularity, a per-device mutex
provides the best resolution but with multiple devices on a bus all
released concurrently, they'll race to acquire each other's mutex,
likely resulting in no reset at all if we use trylock.  We therefore
lock at the granularity of the bus/slot reset as we're only attempting
a single reset for this group of devices anyway.  This allows much
greater scaling as we're bounded in the number of devices protected by
a single reflck object.

Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2:
 - Rolled in PTR_ERR_OR_ZERO suggestion from kbuild bot
 - Updated commit log and comments per Eric's feedback

 drivers/vfio/pci/vfio_pci.c         |  160 ++++++++++++++++++++++++++++++-----
 drivers/vfio/pci/vfio_pci_private.h |    6 +
 2 files changed, 142 insertions(+), 24 deletions(-)

Comments

Cornelia Huck Nov. 19, 2018, 10:02 a.m. UTC | #1
On Fri, 16 Nov 2018 09:18:41 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
> remove") a mutex was added to freeze the refcnt for a device so that
> we can handle errors and perform bus resets on final close.  However,
> bus resets can be rather slow and a global mutex here is undesirable.
> Evaluating the potential locking granularity, a per-device mutex
> provides the best resolution but with multiple devices on a bus all
> released concurrently, they'll race to acquire each other's mutex,
> likely resulting in no reset at all if we use trylock.  We therefore
> lock at the granularity of the bus/slot reset as we're only attempting
> a single reset for this group of devices anyway.  This allows much
> greater scaling as we're bounded in the number of devices protected by
> a single reflck object.
> 
> Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> v2:
>  - Rolled in PTR_ERR_OR_ZERO suggestion from kbuild bot
>  - Updated commit log and comments per Eric's feedback
> 
>  drivers/vfio/pci/vfio_pci.c         |  160 ++++++++++++++++++++++++++++++-----
>  drivers/vfio/pci/vfio_pci_private.h |    6 +
>  2 files changed, 142 insertions(+), 24 deletions(-)

Reviewed-by: Cornelia Huck <cohuck@redhat.com>
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 50cdedfca9fe..ea0670c60c80 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -56,8 +56,6 @@  module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(disable_idle_d3,
 		 "Disable using the PCI D3 low power state for idle, unused devices");
 
-static DEFINE_MUTEX(driver_lock);
-
 static inline bool vfio_vga_disabled(void)
 {
 #ifdef CONFIG_VFIO_PCI_VGA
@@ -393,14 +391,14 @@  static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
 
-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);
 
 	if (!(--vdev->refcnt)) {
 		vfio_spapr_pci_eeh_release(vdev->pdev);
 		vfio_pci_disable(vdev);
 	}
 
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
 
 	module_put(THIS_MODULE);
 }
@@ -413,7 +411,7 @@  static int vfio_pci_open(void *device_data)
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);
 
 	if (!vdev->refcnt) {
 		ret = vfio_pci_enable(vdev);
@@ -424,7 +422,7 @@  static int vfio_pci_open(void *device_data)
 	}
 	vdev->refcnt++;
 error:
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
 	if (ret)
 		module_put(THIS_MODULE);
 	return ret;
@@ -1187,6 +1185,9 @@  static const struct vfio_device_ops vfio_pci_ops = {
 	.request	= vfio_pci_request,
 };
 
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_device *vdev;
@@ -1233,6 +1234,14 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return ret;
 	}
 
+	ret = vfio_pci_reflck_attach(vdev);
+	if (ret) {
+		vfio_del_group_dev(&pdev->dev);
+		vfio_iommu_group_put(group, &pdev->dev);
+		kfree(vdev);
+		return ret;
+	}
+
 	if (vfio_pci_is_vga(pdev)) {
 		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
 		vga_set_legacy_decoding(pdev,
@@ -1264,6 +1273,8 @@  static void vfio_pci_remove(struct pci_dev *pdev)
 	if (!vdev)
 		return;
 
+	vfio_pci_reflck_put(vdev->reflck);
+
 	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
 	kfree(vdev->region);
 	mutex_destroy(&vdev->ioeventfds_lock);
@@ -1320,16 +1331,97 @@  static struct pci_driver vfio_pci_driver = {
 	.err_handler	= &vfio_err_handlers,
 };
 
+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+	struct vfio_pci_reflck *reflck;
+
+	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+	if (!reflck)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&reflck->kref);
+	mutex_init(&reflck->lock);
+
+	return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+	kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_reflck **preflck = data;
+	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return 0;
+
+	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return 0;
+	}
+
+	vdev = vfio_device_data(device);
+
+	if (vdev->reflck) {
+		vfio_pci_reflck_get(vdev->reflck);
+		*preflck = vdev->reflck;
+		vfio_device_put(device);
+		return 1;
+	}
+
+	vfio_device_put(device);
+	return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+	mutex_lock(&reflck_lock);
+
+	if (pci_is_root_bus(vdev->pdev->bus) ||
+	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+					  &vdev->reflck, slot) <= 0)
+		vdev->reflck = vfio_pci_reflck_alloc();
+
+	mutex_unlock(&reflck_lock);
+
+	return PTR_ERR_OR_ZERO(vdev->reflck);
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+	struct vfio_pci_reflck *reflck = container_of(kref,
+						      struct vfio_pci_reflck,
+						      kref);
+
+	kfree(reflck);
+	mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
+
 struct vfio_devices {
 	struct vfio_device **devices;
 	int cur_index;
 	int max_index;
 };
 
-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
 	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
 
 	if (devs->cur_index == devs->max_index)
 		return -ENOSPC;
@@ -1343,16 +1435,28 @@  static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
 		return -EBUSY;
 	}
 
+	vdev = vfio_device_data(device);
+
+	/* Fault if the device is not unused */
+	if (vdev->refcnt) {
+		vfio_device_put(device);
+		return -EBUSY;
+	}
+
 	devs->devices[devs->cur_index++] = device;
 	return 0;
 }
 
 /*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt).  Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts.  We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * If a bus or slot reset is available for the provided device and:
+ *  - All of the devices affected by that bus or slot reset are unused
+ *    (!refcnt)
+ *  - At least one of the affected devices is marked dirty via
+ *    needs_reset (such as by lack of FLR support)
+ * Then attempt to perform that bus or slot reset.  Callers are required
+ * to hold vdev->reflck->lock, protecting the bus/slot reset group from
+ * concurrent opens.  A vfio_device reference is acquired for each device
+ * to prevent unbinds during the reset operation.
  *
  * NB: vfio-core considers a group to be viable even if some devices are
  * bound to drivers like pci-stub or pcieport.  Here we require all devices
@@ -1363,7 +1467,7 @@  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
 	struct vfio_devices devs = { .cur_index = 0 };
 	int i = 0, ret = -EINVAL;
-	bool needs_reset = false, slot = false;
+	bool slot = false;
 	struct vfio_pci_device *tmp;
 
 	if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ -1381,28 +1485,36 @@  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 		return;
 
 	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_devs, &devs, slot))
+					  vfio_pci_get_unused_devs,
+					  &devs, slot))
 		goto put_devs;
 
+	/* Does at least one need a reset? */
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (tmp->needs_reset)
-			needs_reset = true;
-		if (tmp->refcnt)
-			goto put_devs;
+		if (tmp->needs_reset) {
+			ret = pci_reset_bus(vdev->pdev);
+			break;
+		}
 	}
 
-	if (needs_reset)
-		ret = pci_reset_bus(vdev->pdev);
-
 put_devs:
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (!ret)
+
+		/*
+		 * If reset was successful, affected devices no longer need
+		 * a reset and we should return all the collateral devices
+		 * to low power.  If not successful, we either didn't reset
+		 * the bus or timed out waiting for it, so let's not touch
+		 * the power state.
+		 */
+		if (!ret) {
 			tmp->needs_reset = false;
 
-		if (!tmp->refcnt && !disable_idle_d3)
-			pci_set_power_state(tmp->pdev, PCI_D3hot);
+			if (tmp != vdev && !disable_idle_d3)
+				pci_set_power_state(tmp->pdev, PCI_D3hot);
+		}
 
 		vfio_device_put(devs.devices[i]);
 	}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d3441a..aa2355e67340 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -76,6 +76,11 @@  struct vfio_pci_dummy_resource {
 	struct list_head	res_next;
 };
 
+struct vfio_pci_reflck {
+	struct kref		kref;
+	struct mutex		lock;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
@@ -104,6 +109,7 @@  struct vfio_pci_device {
 	bool			needs_reset;
 	bool			nointx;
 	struct pci_saved_state	*pci_saved_state;
+	struct vfio_pci_reflck	*reflck;
 	int			refcnt;
 	int			ioeventfds_nr;
 	struct eventfd_ctx	*err_trigger;