diff mbox series

vfio/pci: Parallelize device open and release

Message ID 154180134334.31154.16289123769826098135.stgit@gimli.home (mailing list archive)
State New, archived
Headers show
Series vfio/pci: Parallelize device open and release | expand

Commit Message

Alex Williamson Nov. 9, 2018, 10:09 p.m. UTC
In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
remove") a mutex was added to freeze the refcnt for a device so that
we can handle errors and perform bus resets on final close.  However,
bus resets can be rather slow and a global mutex here is undesirable.
A per-device mutex provides the best granularity, but then our chances
of triggering a bus/slot reset with multiple affected devices is slim
when devices are released in parallel.  Instead create a reflck object
shared among all devices under the same bus or slot, allowing devices
on independent buses to be released in parallel while serializing per
bus/slot.

Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |  157 ++++++++++++++++++++++++++++++-----
 drivers/vfio/pci/vfio_pci_private.h |    6 +
 2 files changed, 139 insertions(+), 24 deletions(-)

Comments

kernel test robot Nov. 10, 2018, 10:44 p.m. UTC | #1
Hi Alex,

I love your patch! Perhaps something to improve:

[auto build test WARNING on vfio/next]
[also build test WARNING on v4.20-rc1 next-20181109]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Alex-Williamson/vfio-pci-Parallelize-device-open-and-release/20181111-025016
base:   https://github.com/awilliam/linux-vfio.git next


coccinelle warnings: (new ones prefixed by >>)

>> drivers/vfio/pci/vfio_pci.c:1396:8-14: WARNING: PTR_ERR_OR_ZERO can be used

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Eric Auger Nov. 13, 2018, 2:42 p.m. UTC | #2
Hi Alex,

On 11/9/18 11:09 PM, Alex Williamson wrote:
> In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
> remove") a mutex was added to freeze the refcnt for a device so that
> we can handle errors and perform bus resets on final close.  However,
> bus resets can be rather slow and a global mutex here is undesirable.
> A per-device mutex provides the best granularity, but then our chances
> of triggering a bus/slot reset with multiple affected devices is slim
> when devices are released in parallel.
Sorry I don't get the above sentence.
  Instead create a reflck object
> shared among all devices under the same bus or slot, allowing devices
> on independent buses to be released in parallel while serializing per
> bus/slot.>
> Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |  157 ++++++++++++++++++++++++++++++-----
>  drivers/vfio/pci/vfio_pci_private.h |    6 +
>  2 files changed, 139 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 50cdedfca9fe..d443fb7a4e75 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(disable_idle_d3,
>  		 "Disable using the PCI D3 low power state for idle, unused devices");
>  
> -static DEFINE_MUTEX(driver_lock);
> -
>  static inline bool vfio_vga_disabled(void)
>  {
>  #ifdef CONFIG_VFIO_PCI_VGA
> @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
>  {
>  	struct vfio_pci_device *vdev = device_data;
>  
> -	mutex_lock(&driver_lock);
> +	mutex_lock(&vdev->reflck->lock);
>  
>  	if (!(--vdev->refcnt)) {
>  		vfio_spapr_pci_eeh_release(vdev->pdev);
>  		vfio_pci_disable(vdev);
>  	}
>  
> -	mutex_unlock(&driver_lock);
> +	mutex_unlock(&vdev->reflck->lock);
>  
>  	module_put(THIS_MODULE);
>  }
> @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
>  	if (!try_module_get(THIS_MODULE))
>  		return -ENODEV;
>  
> -	mutex_lock(&driver_lock);
> +	mutex_lock(&vdev->reflck->lock);
>  
>  	if (!vdev->refcnt) {
>  		ret = vfio_pci_enable(vdev);
> @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
>  	}
>  	vdev->refcnt++;
>  error:
> -	mutex_unlock(&driver_lock);
> +	mutex_unlock(&vdev->reflck->lock);
>  	if (ret)
>  		module_put(THIS_MODULE);
>  	return ret;
> @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
>  	.request	= vfio_pci_request,
>  };
>  
> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> +
>  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct vfio_pci_device *vdev;
> @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  		return ret;
>  	}
>  
> +	ret = vfio_pci_reflck_attach(vdev);
> +	if (ret) {
> +		vfio_del_group_dev(&pdev->dev);
> +		vfio_iommu_group_put(group, &pdev->dev);
> +		kfree(vdev);
> +		return ret;
> +	}
> +
>  	if (vfio_pci_is_vga(pdev)) {
>  		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
>  		vga_set_legacy_decoding(pdev,
> @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>  	if (!vdev)
>  		return;
>  
> +	vfio_pci_reflck_put(vdev->reflck);
> +
>  	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
>  	kfree(vdev->region);
>  	mutex_destroy(&vdev->ioeventfds_lock);
> @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
>  	.err_handler	= &vfio_err_handlers,
>  };
>  
> +static DEFINE_MUTEX(reflck_lock);
> +
> +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
> +{
> +	struct vfio_pci_reflck *reflck;
> +
> +	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
> +	if (!reflck)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&reflck->kref);
> +	mutex_init(&reflck->lock);
> +
> +	return reflck;
> +}
> +
> +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
> +{
> +	kref_get(&reflck->kref);
> +}
> +
> +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
> +{
> +	struct vfio_pci_reflck **preflck = data;
> +	struct vfio_device *device;
> +	struct vfio_pci_device *vdev;
> +
> +	device = vfio_device_get_from_dev(&pdev->dev);
> +	if (!device)
> +		return 0;
> +
> +	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> +		vfio_device_put(device);
> +		return 0;
> +	}
> +
> +	vdev = vfio_device_data(device);
> +
> +	if (vdev->reflck) {
> +		vfio_pci_reflck_get(vdev->reflck);
> +		*preflck = vdev->reflck;
> +		vfio_device_put(device);
> +		return 1;
> +	}
> +
> +	vfio_device_put(device);
> +	return 0;
> +}
> +
> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
> +{
> +	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
> +
> +	mutex_lock(&reflck_lock);
> +
> +	if (pci_is_root_bus(vdev->pdev->bus) ||
> +	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
> +					  &vdev->reflck, slot) <= 0)
!vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot)?
I don't think we can have negative returned value.
> +		vdev->reflck = vfio_pci_reflck_alloc();
> +
> +	mutex_unlock(&reflck_lock);
> +
> +	return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
> +}
> +
> +static void vfio_pci_reflck_release(struct kref *kref)
> +{
> +	struct vfio_pci_reflck *reflck = container_of(kref,
> +						      struct vfio_pci_reflck,
> +						      kref);
> +
> +	kfree(reflck);
> +	mutex_unlock(&reflck_lock);
> +}
> +
> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
> +{
> +	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
> +}
> +
>  struct vfio_devices {
>  	struct vfio_device **devices;
>  	int cur_index;
>  	int max_index;
>  };
>  
> -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
> +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
>  {
>  	struct vfio_devices *devs = data;
>  	struct vfio_device *device;
> +	struct vfio_pci_device *vdev;
>  
>  	if (devs->cur_index == devs->max_index)
>  		return -ENOSPC;
> @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
>  		return -EBUSY;
>  	}
>  
> +	vdev = vfio_device_data(device);
> +
> +	/* Only collect unused devices */
abort if the slot/bus features a used device?
> +	if (vdev->refcnt) {
> +		vfio_device_put(device);
> +		return -EBUSY;
> +	}
> +
>  	devs->devices[devs->cur_index++] = device;
>  	return 0;
>  }
>  
>  /*
> - * Attempt to do a bus/slot reset if there are devices affected by a reset for
> - * this device that are needs_reset and all of the affected devices are unused
> - * (!refcnt). Callers are required to hold driver_lock when calling this to
> - * prevent device opens and concurrent bus reset attempts.  We prevent device
> - * unbinds by acquiring and holding a reference to the vfio_device.
> + * Attempt to do a bus/slot reset if there are devices affected by a reset
> + * for this device that are "needs_reset" and all of the affected devices
> + * are unused (!refcnt).

This comment still sounds a bit cryptic to me. Assuming I got the point,
may I suggest something like:

If one of the device of the slot/bus needs a reset (but cannot be reset
at function level) and all the devices of the slot/bus are unused
(!refcount), we attempt to do a bus/slot reset.

  Callers are required to hold vdev->reflck->lock
> + * to prevent concurrent device opens, which is expected to protect all
> + * affected devices.  A vfio_device reference is also acquired for each
> + * affected device to prevent unbinds.
>   *
>   * NB: vfio-core considers a group to be viable even if some devices are
>   * bound to drivers like pci-stub or pcieport.  Here we require all devices
> @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>  {
>  	struct vfio_devices devs = { .cur_index = 0 };
>  	int i = 0, ret = -EINVAL;
> -	bool needs_reset = false, slot = false;
> +	bool slot = false;
>  	struct vfio_pci_device *tmp;
>  
>  	if (!pci_probe_reset_slot(vdev->pdev->slot))
> @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>  		return;
>  
>  	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> -					  vfio_pci_get_devs, &devs, slot))
> +					  vfio_pci_get_unused_devs,
> +					  &devs, slot))
>  		goto put_devs;
>  
> +	/* Does at least one need a reset? */
>  	for (i = 0; i < devs.cur_index; i++) {
>  		tmp = vfio_device_data(devs.devices[i]);
> -		if (tmp->needs_reset)
> -			needs_reset = true;
> -		if (tmp->refcnt)
> -			goto put_devs;
> +		if (tmp->needs_reset) {
> +			ret = pci_reset_bus(vdev->pdev);
> +			break;
> +		}
>  	}
>  
> -	if (needs_reset)
> -		ret = pci_reset_bus(vdev->pdev);
> -
>  put_devs:
>  	for (i = 0; i < devs.cur_index; i++) {
>  		tmp = vfio_device_data(devs.devices[i]);
> -		if (!ret)
> +
> +		/*
> +		 * If reset was successful, affected devices no longer need
> +		 * a reset and we should return all the collateral devices
> +		 * to low power.  If not successful, we either didn't reset
> +		 * the bus or timed out waiting for it, so let's not touch
> +		 * the power state.
> +		 */
> +		if (!ret) {
>  			tmp->needs_reset = false;
>  
> -		if (!tmp->refcnt && !disable_idle_d3)
> -			pci_set_power_state(tmp->pdev, PCI_D3hot);
> +			if (tmp != vdev && !disable_idle_d3)
> +				pci_set_power_state(tmp->pdev, PCI_D3hot);
> +		}
>  
>  		vfio_device_put(devs.devices[i]);
>  	}
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index cde3b5d3441a..aa2355e67340 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
>  	struct list_head	res_next;
>  };
>  
> +struct vfio_pci_reflck {
> +	struct kref		kref;
> +	struct mutex		lock;
> +};
> +
>  struct vfio_pci_device {
>  	struct pci_dev		*pdev;
>  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> @@ -104,6 +109,7 @@ struct vfio_pci_device {
>  	bool			needs_reset;
>  	bool			nointx;
>  	struct pci_saved_state	*pci_saved_state;
> +	struct vfio_pci_reflck	*reflck;
>  	int			refcnt;
>  	int			ioeventfds_nr;
>  	struct eventfd_ctx	*err_trigger;
> 
Reviewed-by: Eric Auger <eric.auger@redhat.com>

Thanks

Eric
Alex Williamson Nov. 13, 2018, 4:34 p.m. UTC | #3
On Tue, 13 Nov 2018 15:42:49 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> 
> On 11/9/18 11:09 PM, Alex Williamson wrote:
> > In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
> > remove") a mutex was added to freeze the refcnt for a device so that
> > we can handle errors and perform bus resets on final close.  However,
> > bus resets can be rather slow and a global mutex here is undesirable.
> > A per-device mutex provides the best granularity, but then our chances
> > of triggering a bus/slot reset with multiple affected devices is slim
> > when devices are released in parallel.  
> Sorry I don't get the above sentence.

There's a locking granularity question here, where currently we're
locking at a global level.  If I want to reduce that granularity, it
seems the obvious question is what minimum granularity can we achieve.
A per-device lock it technically that minimum for the purposes of
serializing the device around open/release, but then concurrent
releases don't necessarily provide us the opportunity to perform a bus
reset affecting multiple devices since all the devices are racing each
other.  Therefore I conclude that a bus/slot locking granularity
provides us the best compromise of granularity vs functionality.

>   Instead create a reflck object
> > shared among all devices under the same bus or slot, allowing devices
> > on independent buses to be released in parallel while serializing per  
> > bus/slot.>  
> > Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  drivers/vfio/pci/vfio_pci.c         |  157 ++++++++++++++++++++++++++++++-----
> >  drivers/vfio/pci/vfio_pci_private.h |    6 +
> >  2 files changed, 139 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 50cdedfca9fe..d443fb7a4e75 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
> >  MODULE_PARM_DESC(disable_idle_d3,
> >  		 "Disable using the PCI D3 low power state for idle, unused devices");
> >  
> > -static DEFINE_MUTEX(driver_lock);
> > -
> >  static inline bool vfio_vga_disabled(void)
> >  {
> >  #ifdef CONFIG_VFIO_PCI_VGA
> > @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
> >  {
> >  	struct vfio_pci_device *vdev = device_data;
> >  
> > -	mutex_lock(&driver_lock);
> > +	mutex_lock(&vdev->reflck->lock);
> >  
> >  	if (!(--vdev->refcnt)) {
> >  		vfio_spapr_pci_eeh_release(vdev->pdev);
> >  		vfio_pci_disable(vdev);
> >  	}
> >  
> > -	mutex_unlock(&driver_lock);
> > +	mutex_unlock(&vdev->reflck->lock);
> >  
> >  	module_put(THIS_MODULE);
> >  }
> > @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
> >  	if (!try_module_get(THIS_MODULE))
> >  		return -ENODEV;
> >  
> > -	mutex_lock(&driver_lock);
> > +	mutex_lock(&vdev->reflck->lock);
> >  
> >  	if (!vdev->refcnt) {
> >  		ret = vfio_pci_enable(vdev);
> > @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
> >  	}
> >  	vdev->refcnt++;
> >  error:
> > -	mutex_unlock(&driver_lock);
> > +	mutex_unlock(&vdev->reflck->lock);
> >  	if (ret)
> >  		module_put(THIS_MODULE);
> >  	return ret;
> > @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
> >  	.request	= vfio_pci_request,
> >  };
> >  
> > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
> > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> > +
> >  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct vfio_pci_device *vdev;
> > @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  		return ret;
> >  	}
> >  
> > +	ret = vfio_pci_reflck_attach(vdev);
> > +	if (ret) {
> > +		vfio_del_group_dev(&pdev->dev);
> > +		vfio_iommu_group_put(group, &pdev->dev);
> > +		kfree(vdev);
> > +		return ret;
> > +	}
> > +
> >  	if (vfio_pci_is_vga(pdev)) {
> >  		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
> >  		vga_set_legacy_decoding(pdev,
> > @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
> >  	if (!vdev)
> >  		return;
> >  
> > +	vfio_pci_reflck_put(vdev->reflck);
> > +
> >  	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
> >  	kfree(vdev->region);
> >  	mutex_destroy(&vdev->ioeventfds_lock);
> > @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
> >  	.err_handler	= &vfio_err_handlers,
> >  };
> >  
> > +static DEFINE_MUTEX(reflck_lock);
> > +
> > +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
> > +{
> > +	struct vfio_pci_reflck *reflck;
> > +
> > +	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
> > +	if (!reflck)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&reflck->kref);
> > +	mutex_init(&reflck->lock);
> > +
> > +	return reflck;
> > +}
> > +
> > +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
> > +{
> > +	kref_get(&reflck->kref);
> > +}
> > +
> > +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
> > +{
> > +	struct vfio_pci_reflck **preflck = data;
> > +	struct vfio_device *device;
> > +	struct vfio_pci_device *vdev;
> > +
> > +	device = vfio_device_get_from_dev(&pdev->dev);
> > +	if (!device)
> > +		return 0;
> > +
> > +	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> > +		vfio_device_put(device);
> > +		return 0;
> > +	}
> > +
> > +	vdev = vfio_device_data(device);
> > +
> > +	if (vdev->reflck) {
> > +		vfio_pci_reflck_get(vdev->reflck);
> > +		*preflck = vdev->reflck;
> > +		vfio_device_put(device);
> > +		return 1;
> > +	}
> > +
> > +	vfio_device_put(device);
> > +	return 0;
> > +}
> > +
> > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
> > +{
> > +	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
> > +
> > +	mutex_lock(&reflck_lock);
> > +
> > +	if (pci_is_root_bus(vdev->pdev->bus) ||
> > +	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
> > +					  &vdev->reflck, slot) <= 0)  
> !vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
> 					  &vdev->reflck, slot)?
> I don't think we can have negative returned value.

Correct, our callback function does not return a negative, but it's
certainly within the framework of this callback to do so.  I thought it
was more consistent with the framework to handle that case so we don't
overlook it in the future should the callback function change.

> > +		vdev->reflck = vfio_pci_reflck_alloc();
> > +
> > +	mutex_unlock(&reflck_lock);
> > +
> > +	return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
> > +}
> > +
> > +static void vfio_pci_reflck_release(struct kref *kref)
> > +{
> > +	struct vfio_pci_reflck *reflck = container_of(kref,
> > +						      struct vfio_pci_reflck,
> > +						      kref);
> > +
> > +	kfree(reflck);
> > +	mutex_unlock(&reflck_lock);
> > +}
> > +
> > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
> > +{
> > +	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
> > +}
> > +
> >  struct vfio_devices {
> >  	struct vfio_device **devices;
> >  	int cur_index;
> >  	int max_index;
> >  };
> >  
> > -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
> > +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> >  {
> >  	struct vfio_devices *devs = data;
> >  	struct vfio_device *device;
> > +	struct vfio_pci_device *vdev;
> >  
> >  	if (devs->cur_index == devs->max_index)
> >  		return -ENOSPC;
> > @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
> >  		return -EBUSY;
> >  	}
> >  
> > +	vdev = vfio_device_data(device);
> > +
> > +	/* Only collect unused devices */  
> abort if the slot/bus features a used device?

Correct.  Is this a suggestion for a wording change?

> > +	if (vdev->refcnt) {
> > +		vfio_device_put(device);
> > +		return -EBUSY;
> > +	}
> > +
> >  	devs->devices[devs->cur_index++] = device;
> >  	return 0;
> >  }
> >  
> >  /*
> > - * Attempt to do a bus/slot reset if there are devices affected by a reset for
> > - * this device that are needs_reset and all of the affected devices are unused
> > - * (!refcnt). Callers are required to hold driver_lock when calling this to
> > - * prevent device opens and concurrent bus reset attempts.  We prevent device
> > - * unbinds by acquiring and holding a reference to the vfio_device.
> > + * Attempt to do a bus/slot reset if there are devices affected by a reset
> > + * for this device that are "needs_reset" and all of the affected devices
> > + * are unused (!refcnt).  
> 
> This comment still sounds a bit cryptic to me. Assuming I got the point,
> may I suggest something like:
> 
> If one of the device of the slot/bus needs a reset (but cannot be reset
> at function level) and all the devices of the slot/bus are unused
> (!refcount), we attempt to do a bus/slot reset.

I don't disagree that the original wording is somewhat convoluted, but
I don't see that this is a tremendous improvement.  How about:

  If a bus or slot reset is available for the provided device and:
    - All of the devices affected by that bus or slot reset are unused
      (!refcnt)
    - At least one of the affected devices is marked dirty via
      needs_reset (such as by lack of FLR support)
  Then attempt to perform that bus or slot reset.

>   Callers are required to hold vdev->reflck->lock
> > + * to prevent concurrent device opens, which is expected to protect all
> > + * affected devices.  A vfio_device reference is also acquired for each
> > + * affected device to prevent unbinds.
> >   *
> >   * NB: vfio-core considers a group to be viable even if some devices are
> >   * bound to drivers like pci-stub or pcieport.  Here we require all devices
> > @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
> >  {
> >  	struct vfio_devices devs = { .cur_index = 0 };
> >  	int i = 0, ret = -EINVAL;
> > -	bool needs_reset = false, slot = false;
> > +	bool slot = false;
> >  	struct vfio_pci_device *tmp;
> >  
> >  	if (!pci_probe_reset_slot(vdev->pdev->slot))
> > @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
> >  		return;
> >  
> >  	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> > -					  vfio_pci_get_devs, &devs, slot))
> > +					  vfio_pci_get_unused_devs,
> > +					  &devs, slot))
> >  		goto put_devs;
> >  
> > +	/* Does at least one need a reset? */
> >  	for (i = 0; i < devs.cur_index; i++) {
> >  		tmp = vfio_device_data(devs.devices[i]);
> > -		if (tmp->needs_reset)
> > -			needs_reset = true;
> > -		if (tmp->refcnt)
> > -			goto put_devs;
> > +		if (tmp->needs_reset) {
> > +			ret = pci_reset_bus(vdev->pdev);
> > +			break;
> > +		}
> >  	}
> >  
> > -	if (needs_reset)
> > -		ret = pci_reset_bus(vdev->pdev);
> > -
> >  put_devs:
> >  	for (i = 0; i < devs.cur_index; i++) {
> >  		tmp = vfio_device_data(devs.devices[i]);
> > -		if (!ret)
> > +
> > +		/*
> > +		 * If reset was successful, affected devices no longer need
> > +		 * a reset and we should return all the collateral devices
> > +		 * to low power.  If not successful, we either didn't reset
> > +		 * the bus or timed out waiting for it, so let's not touch
> > +		 * the power state.
> > +		 */
> > +		if (!ret) {
> >  			tmp->needs_reset = false;
> >  
> > -		if (!tmp->refcnt && !disable_idle_d3)
> > -			pci_set_power_state(tmp->pdev, PCI_D3hot);
> > +			if (tmp != vdev && !disable_idle_d3)
> > +				pci_set_power_state(tmp->pdev, PCI_D3hot);
> > +		}
> >  
> >  		vfio_device_put(devs.devices[i]);
> >  	}
> > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> > index cde3b5d3441a..aa2355e67340 100644
> > --- a/drivers/vfio/pci/vfio_pci_private.h
> > +++ b/drivers/vfio/pci/vfio_pci_private.h
> > @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
> >  	struct list_head	res_next;
> >  };
> >  
> > +struct vfio_pci_reflck {
> > +	struct kref		kref;
> > +	struct mutex		lock;
> > +};
> > +
> >  struct vfio_pci_device {
> >  	struct pci_dev		*pdev;
> >  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> > @@ -104,6 +109,7 @@ struct vfio_pci_device {
> >  	bool			needs_reset;
> >  	bool			nointx;
> >  	struct pci_saved_state	*pci_saved_state;
> > +	struct vfio_pci_reflck	*reflck;
> >  	int			refcnt;
> >  	int			ioeventfds_nr;
> >  	struct eventfd_ctx	*err_trigger;
> >   
> Reviewed-by: Eric Auger <eric.auger@redhat.com>

Thanks!

Alex
Eric Auger Nov. 13, 2018, 9:22 p.m. UTC | #4
Hi Alex,

On 11/13/18 5:34 PM, Alex Williamson wrote:
> On Tue, 13 Nov 2018 15:42:49 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Alex,
>>
>> On 11/9/18 11:09 PM, Alex Williamson wrote:
>>> In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
>>> remove") a mutex was added to freeze the refcnt for a device so that
>>> we can handle errors and perform bus resets on final close.  However,
>>> bus resets can be rather slow and a global mutex here is undesirable.
>>> A per-device mutex provides the best granularity, but then our chances
>>> of triggering a bus/slot reset with multiple affected devices is slim
>>> when devices are released in parallel.  
>> Sorry I don't get the above sentence.
> 
> There's a locking granularity question here, where currently we're
> locking at a global level.  If I want to reduce that granularity, it
> seems the obvious question is what minimum granularity can we achieve.
> A per-device lock it technically that minimum for the purposes of
> serializing the device around open/release, but then concurrent
> releases don't necessarily provide us the opportunity to perform a bus
> reset affecting multiple devices since all the devices are racing each
> other.  Therefore I conclude that a bus/slot locking granularity
> provides us the best compromise of granularity vs functionality.

Hum. My understanding was that you need to make sure the refcnt of the
set of devices attached to the bus/slot must be frozen when making the
decision to reset the bus/slot, hence the lock at bus/slot level. Need
to digest more.
> 
>>   Instead create a reflck object
>>> shared among all devices under the same bus or slot, allowing devices
>>> on independent buses to be released in parallel while serializing per  
>>> bus/slot.>  
>>> Reported-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
>>> Tested-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
>>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>>> ---
>>>  drivers/vfio/pci/vfio_pci.c         |  157 ++++++++++++++++++++++++++++++-----
>>>  drivers/vfio/pci/vfio_pci_private.h |    6 +
>>>  2 files changed, 139 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>> index 50cdedfca9fe..d443fb7a4e75 100644
>>> --- a/drivers/vfio/pci/vfio_pci.c
>>> +++ b/drivers/vfio/pci/vfio_pci.c
>>> @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>>>  MODULE_PARM_DESC(disable_idle_d3,
>>>  		 "Disable using the PCI D3 low power state for idle, unused devices");
>>>  
>>> -static DEFINE_MUTEX(driver_lock);
>>> -
>>>  static inline bool vfio_vga_disabled(void)
>>>  {
>>>  #ifdef CONFIG_VFIO_PCI_VGA
>>> @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
>>>  {
>>>  	struct vfio_pci_device *vdev = device_data;
>>>  
>>> -	mutex_lock(&driver_lock);
>>> +	mutex_lock(&vdev->reflck->lock);
>>>  
>>>  	if (!(--vdev->refcnt)) {
>>>  		vfio_spapr_pci_eeh_release(vdev->pdev);
>>>  		vfio_pci_disable(vdev);
>>>  	}
>>>  
>>> -	mutex_unlock(&driver_lock);
>>> +	mutex_unlock(&vdev->reflck->lock);
>>>  
>>>  	module_put(THIS_MODULE);
>>>  }
>>> @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
>>>  	if (!try_module_get(THIS_MODULE))
>>>  		return -ENODEV;
>>>  
>>> -	mutex_lock(&driver_lock);
>>> +	mutex_lock(&vdev->reflck->lock);
>>>  
>>>  	if (!vdev->refcnt) {
>>>  		ret = vfio_pci_enable(vdev);
>>> @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
>>>  	}
>>>  	vdev->refcnt++;
>>>  error:
>>> -	mutex_unlock(&driver_lock);
>>> +	mutex_unlock(&vdev->reflck->lock);
>>>  	if (ret)
>>>  		module_put(THIS_MODULE);
>>>  	return ret;
>>> @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
>>>  	.request	= vfio_pci_request,
>>>  };
>>>  
>>> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
>>> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
>>> +
>>>  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>>  {
>>>  	struct vfio_pci_device *vdev;
>>> @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>>  		return ret;
>>>  	}
>>>  
>>> +	ret = vfio_pci_reflck_attach(vdev);
>>> +	if (ret) {
>>> +		vfio_del_group_dev(&pdev->dev);
>>> +		vfio_iommu_group_put(group, &pdev->dev);
>>> +		kfree(vdev);
>>> +		return ret;
>>> +	}
>>> +
>>>  	if (vfio_pci_is_vga(pdev)) {
>>>  		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
>>>  		vga_set_legacy_decoding(pdev,
>>> @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>>>  	if (!vdev)
>>>  		return;
>>>  
>>> +	vfio_pci_reflck_put(vdev->reflck);
>>> +
>>>  	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
>>>  	kfree(vdev->region);
>>>  	mutex_destroy(&vdev->ioeventfds_lock);
>>> @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
>>>  	.err_handler	= &vfio_err_handlers,
>>>  };
>>>  
>>> +static DEFINE_MUTEX(reflck_lock);
>>> +
>>> +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
>>> +{
>>> +	struct vfio_pci_reflck *reflck;
>>> +
>>> +	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
>>> +	if (!reflck)
>>> +		return ERR_PTR(-ENOMEM);
>>> +
>>> +	kref_init(&reflck->kref);
>>> +	mutex_init(&reflck->lock);
>>> +
>>> +	return reflck;
>>> +}
>>> +
>>> +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
>>> +{
>>> +	kref_get(&reflck->kref);
>>> +}
>>> +
>>> +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
>>> +{
>>> +	struct vfio_pci_reflck **preflck = data;
>>> +	struct vfio_device *device;
>>> +	struct vfio_pci_device *vdev;
>>> +
>>> +	device = vfio_device_get_from_dev(&pdev->dev);
>>> +	if (!device)
>>> +		return 0;
>>> +
>>> +	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
>>> +		vfio_device_put(device);
>>> +		return 0;
>>> +	}
>>> +
>>> +	vdev = vfio_device_data(device);
>>> +
>>> +	if (vdev->reflck) {
>>> +		vfio_pci_reflck_get(vdev->reflck);
>>> +		*preflck = vdev->reflck;
>>> +		vfio_device_put(device);
>>> +		return 1;
>>> +	}
>>> +
>>> +	vfio_device_put(device);
>>> +	return 0;
>>> +}
>>> +
>>> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
>>> +{
>>> +	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
>>> +
>>> +	mutex_lock(&reflck_lock);
>>> +
>>> +	if (pci_is_root_bus(vdev->pdev->bus) ||
>>> +	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
>>> +					  &vdev->reflck, slot) <= 0)  
>> !vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
>> 					  &vdev->reflck, slot)?
>> I don't think we can have negative returned value.
> 
> Correct, our callback function does not return a negative, but it's
> certainly within the framework of this callback to do so.  I thought it
> was more consistent with the framework to handle that case so we don't
> overlook it in the future should the callback function change.
fair enough
> 
>>> +		vdev->reflck = vfio_pci_reflck_alloc();
>>> +
>>> +	mutex_unlock(&reflck_lock);
>>> +
>>> +	return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
>>> +}
>>> +
>>> +static void vfio_pci_reflck_release(struct kref *kref)
>>> +{
>>> +	struct vfio_pci_reflck *reflck = container_of(kref,
>>> +						      struct vfio_pci_reflck,
>>> +						      kref);
>>> +
>>> +	kfree(reflck);
>>> +	mutex_unlock(&reflck_lock);
>>> +}
>>> +
>>> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
>>> +{
>>> +	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
>>> +}
>>> +
>>>  struct vfio_devices {
>>>  	struct vfio_device **devices;
>>>  	int cur_index;
>>>  	int max_index;
>>>  };
>>>  
>>> -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
>>> +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
>>>  {
>>>  	struct vfio_devices *devs = data;
>>>  	struct vfio_device *device;
>>> +	struct vfio_pci_device *vdev;
>>>  
>>>  	if (devs->cur_index == devs->max_index)
>>>  		return -ENOSPC;
>>> @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
>>>  		return -EBUSY;
>>>  	}
>>>  
>>> +	vdev = vfio_device_data(device);
>>> +
>>> +	/* Only collect unused devices */  
>> abort if the slot/bus features a used device?
> 
> Correct.  Is this a suggestion for a wording change?
yes it was. My 1st impression when reading the code was that we
collected unused devices and we actually performed an actual action on
them but we simply vfio_device_put() them in that case. So it is more an
abort and cleanup thing right? But that's just a nit and a way for me to
make sure my understanding is correct ;-)
> 
>>> +	if (vdev->refcnt) {
>>> +		vfio_device_put(device);
>>> +		return -EBUSY;
>>> +	}
>>> +
>>>  	devs->devices[devs->cur_index++] = device;
>>>  	return 0;
>>>  }
>>>  
>>>  /*
>>> - * Attempt to do a bus/slot reset if there are devices affected by a reset for
>>> - * this device that are needs_reset and all of the affected devices are unused
>>> - * (!refcnt). Callers are required to hold driver_lock when calling this to
>>> - * prevent device opens and concurrent bus reset attempts.  We prevent device
>>> - * unbinds by acquiring and holding a reference to the vfio_device.
>>> + * Attempt to do a bus/slot reset if there are devices affected by a reset
>>> + * for this device that are "needs_reset" and all of the affected devices
>>> + * are unused (!refcnt).  
>>
>> This comment still sounds a bit cryptic to me. Assuming I got the point,
>> may I suggest something like:
>>
>> If one of the device of the slot/bus needs a reset (but cannot be reset
>> at function level) and all the devices of the slot/bus are unused
>> (!refcount), we attempt to do a bus/slot reset.
> 
> I don't disagree that the original wording is somewhat convoluted, but
> I don't see that this is a tremendous improvement.  How about:
> 
>   If a bus or slot reset is available for the provided device and:
>     - All of the devices affected by that bus or slot reset are unused
>       (!refcnt)
>     - At least one of the affected devices is marked dirty via
>       needs_reset (such as by lack of FLR support)
>   Then attempt to perform that bus or slot reset.
looks fine to me.

Thanks

Eric
> 
>>   Callers are required to hold vdev->reflck->lock
>>> + * to prevent concurrent device opens, which is expected to protect all
>>> + * affected devices.  A vfio_device reference is also acquired for each
>>> + * affected device to prevent unbinds.
>>>   *
>>>   * NB: vfio-core considers a group to be viable even if some devices are
>>>   * bound to drivers like pci-stub or pcieport.  Here we require all devices
>>> @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>>>  {
>>>  	struct vfio_devices devs = { .cur_index = 0 };
>>>  	int i = 0, ret = -EINVAL;
>>> -	bool needs_reset = false, slot = false;
>>> +	bool slot = false;
>>>  	struct vfio_pci_device *tmp;
>>>  
>>>  	if (!pci_probe_reset_slot(vdev->pdev->slot))
>>> @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
>>>  		return;
>>>  
>>>  	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
>>> -					  vfio_pci_get_devs, &devs, slot))
>>> +					  vfio_pci_get_unused_devs,
>>> +					  &devs, slot))
>>>  		goto put_devs;
>>>  
>>> +	/* Does at least one need a reset? */
>>>  	for (i = 0; i < devs.cur_index; i++) {
>>>  		tmp = vfio_device_data(devs.devices[i]);
>>> -		if (tmp->needs_reset)
>>> -			needs_reset = true;
>>> -		if (tmp->refcnt)
>>> -			goto put_devs;
>>> +		if (tmp->needs_reset) {
>>> +			ret = pci_reset_bus(vdev->pdev);
>>> +			break;
>>> +		}
>>>  	}
>>>  
>>> -	if (needs_reset)
>>> -		ret = pci_reset_bus(vdev->pdev);
>>> -
>>>  put_devs:
>>>  	for (i = 0; i < devs.cur_index; i++) {
>>>  		tmp = vfio_device_data(devs.devices[i]);
>>> -		if (!ret)
>>> +
>>> +		/*
>>> +		 * If reset was successful, affected devices no longer need
>>> +		 * a reset and we should return all the collateral devices
>>> +		 * to low power.  If not successful, we either didn't reset
>>> +		 * the bus or timed out waiting for it, so let's not touch
>>> +		 * the power state.
>>> +		 */
>>> +		if (!ret) {
>>>  			tmp->needs_reset = false;
>>>  
>>> -		if (!tmp->refcnt && !disable_idle_d3)
>>> -			pci_set_power_state(tmp->pdev, PCI_D3hot);
>>> +			if (tmp != vdev && !disable_idle_d3)
>>> +				pci_set_power_state(tmp->pdev, PCI_D3hot);
>>> +		}
>>>  
>>>  		vfio_device_put(devs.devices[i]);
>>>  	}
>>> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
>>> index cde3b5d3441a..aa2355e67340 100644
>>> --- a/drivers/vfio/pci/vfio_pci_private.h
>>> +++ b/drivers/vfio/pci/vfio_pci_private.h
>>> @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
>>>  	struct list_head	res_next;
>>>  };
>>>  
>>> +struct vfio_pci_reflck {
>>> +	struct kref		kref;
>>> +	struct mutex		lock;
>>> +};
>>> +
>>>  struct vfio_pci_device {
>>>  	struct pci_dev		*pdev;
>>>  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
>>> @@ -104,6 +109,7 @@ struct vfio_pci_device {
>>>  	bool			needs_reset;
>>>  	bool			nointx;
>>>  	struct pci_saved_state	*pci_saved_state;
>>> +	struct vfio_pci_reflck	*reflck;
>>>  	int			refcnt;
>>>  	int			ioeventfds_nr;
>>>  	struct eventfd_ctx	*err_trigger;
>>>   
>> Reviewed-by: Eric Auger <eric.auger@redhat.com>
> 
> Thanks!
> 
> Alex
>
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 50cdedfca9fe..d443fb7a4e75 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -56,8 +56,6 @@  module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(disable_idle_d3,
 		 "Disable using the PCI D3 low power state for idle, unused devices");
 
-static DEFINE_MUTEX(driver_lock);
-
 static inline bool vfio_vga_disabled(void)
 {
 #ifdef CONFIG_VFIO_PCI_VGA
@@ -393,14 +391,14 @@  static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
 
-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);
 
 	if (!(--vdev->refcnt)) {
 		vfio_spapr_pci_eeh_release(vdev->pdev);
 		vfio_pci_disable(vdev);
 	}
 
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
 
 	module_put(THIS_MODULE);
 }
@@ -413,7 +411,7 @@  static int vfio_pci_open(void *device_data)
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);
 
 	if (!vdev->refcnt) {
 		ret = vfio_pci_enable(vdev);
@@ -424,7 +422,7 @@  static int vfio_pci_open(void *device_data)
 	}
 	vdev->refcnt++;
 error:
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
 	if (ret)
 		module_put(THIS_MODULE);
 	return ret;
@@ -1187,6 +1185,9 @@  static const struct vfio_device_ops vfio_pci_ops = {
 	.request	= vfio_pci_request,
 };
 
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_device *vdev;
@@ -1233,6 +1234,14 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return ret;
 	}
 
+	ret = vfio_pci_reflck_attach(vdev);
+	if (ret) {
+		vfio_del_group_dev(&pdev->dev);
+		vfio_iommu_group_put(group, &pdev->dev);
+		kfree(vdev);
+		return ret;
+	}
+
 	if (vfio_pci_is_vga(pdev)) {
 		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
 		vga_set_legacy_decoding(pdev,
@@ -1264,6 +1273,8 @@  static void vfio_pci_remove(struct pci_dev *pdev)
 	if (!vdev)
 		return;
 
+	vfio_pci_reflck_put(vdev->reflck);
+
 	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
 	kfree(vdev->region);
 	mutex_destroy(&vdev->ioeventfds_lock);
@@ -1320,16 +1331,97 @@  static struct pci_driver vfio_pci_driver = {
 	.err_handler	= &vfio_err_handlers,
 };
 
+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+	struct vfio_pci_reflck *reflck;
+
+	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+	if (!reflck)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&reflck->kref);
+	mutex_init(&reflck->lock);
+
+	return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+	kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_reflck **preflck = data;
+	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return 0;
+
+	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return 0;
+	}
+
+	vdev = vfio_device_data(device);
+
+	if (vdev->reflck) {
+		vfio_pci_reflck_get(vdev->reflck);
+		*preflck = vdev->reflck;
+		vfio_device_put(device);
+		return 1;
+	}
+
+	vfio_device_put(device);
+	return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+	mutex_lock(&reflck_lock);
+
+	if (pci_is_root_bus(vdev->pdev->bus) ||
+	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+					  &vdev->reflck, slot) <= 0)
+		vdev->reflck = vfio_pci_reflck_alloc();
+
+	mutex_unlock(&reflck_lock);
+
+	return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+	struct vfio_pci_reflck *reflck = container_of(kref,
+						      struct vfio_pci_reflck,
+						      kref);
+
+	kfree(reflck);
+	mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
+
 struct vfio_devices {
 	struct vfio_device **devices;
 	int cur_index;
 	int max_index;
 };
 
-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
 	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
 
 	if (devs->cur_index == devs->max_index)
 		return -ENOSPC;
@@ -1343,16 +1435,25 @@  static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
 		return -EBUSY;
 	}
 
+	vdev = vfio_device_data(device);
+
+	/* Only collect unused devices */
+	if (vdev->refcnt) {
+		vfio_device_put(device);
+		return -EBUSY;
+	}
+
 	devs->devices[devs->cur_index++] = device;
 	return 0;
 }
 
 /*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt).  Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts.  We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * Attempt to do a bus/slot reset if there are devices affected by a reset
+ * for this device that are "needs_reset" and all of the affected devices
+ * are unused (!refcnt).  Callers are required to hold vdev->reflck->lock
+ * to prevent concurrent device opens, which is expected to protect all
+ * affected devices.  A vfio_device reference is also acquired for each
+ * affected device to prevent unbinds.
  *
  * NB: vfio-core considers a group to be viable even if some devices are
  * bound to drivers like pci-stub or pcieport.  Here we require all devices
@@ -1363,7 +1464,7 @@  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
 	struct vfio_devices devs = { .cur_index = 0 };
 	int i = 0, ret = -EINVAL;
-	bool needs_reset = false, slot = false;
+	bool slot = false;
 	struct vfio_pci_device *tmp;
 
 	if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ -1381,28 +1482,36 @@  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 		return;
 
 	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_devs, &devs, slot))
+					  vfio_pci_get_unused_devs,
+					  &devs, slot))
 		goto put_devs;
 
+	/* Does at least one need a reset? */
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (tmp->needs_reset)
-			needs_reset = true;
-		if (tmp->refcnt)
-			goto put_devs;
+		if (tmp->needs_reset) {
+			ret = pci_reset_bus(vdev->pdev);
+			break;
+		}
 	}
 
-	if (needs_reset)
-		ret = pci_reset_bus(vdev->pdev);
-
 put_devs:
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (!ret)
+
+		/*
+		 * If reset was successful, affected devices no longer need
+		 * a reset and we should return all the collateral devices
+		 * to low power.  If not successful, we either didn't reset
+		 * the bus or timed out waiting for it, so let's not touch
+		 * the power state.
+		 */
+		if (!ret) {
 			tmp->needs_reset = false;
 
-		if (!tmp->refcnt && !disable_idle_d3)
-			pci_set_power_state(tmp->pdev, PCI_D3hot);
+			if (tmp != vdev && !disable_idle_d3)
+				pci_set_power_state(tmp->pdev, PCI_D3hot);
+		}
 
 		vfio_device_put(devs.devices[i]);
 	}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d3441a..aa2355e67340 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -76,6 +76,11 @@  struct vfio_pci_dummy_resource {
 	struct list_head	res_next;
 };
 
+struct vfio_pci_reflck {
+	struct kref		kref;
+	struct mutex		lock;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
@@ -104,6 +109,7 @@  struct vfio_pci_device {
 	bool			needs_reset;
 	bool			nointx;
 	struct pci_saved_state	*pci_saved_state;
+	struct vfio_pci_reflck	*reflck;
 	int			refcnt;
 	int			ioeventfds_nr;
 	struct eventfd_ctx	*err_trigger;