diff mbox series

[RFC,04/10] vfio/pci: Use vfio_device_unmap_mapping_range()

Message ID 161401267316.16443.11184767955094847849.stgit@gimli.home (mailing list archive)
State New, archived
Headers show
Series vfio: Device memory DMA mapping improvements | expand

Commit Message

Alex Williamson Feb. 22, 2021, 4:51 p.m. UTC
With the vfio device fd tied to the address space of the pseudo fs
inode, we can use the mm to track all vmas that might be mmap'ing
device BARs, which removes our vma_list and all the complicated
lock ordering necessary to manually zap each related vma.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |  217 ++++-------------------------------
 drivers/vfio/pci/vfio_pci_private.h |    3 
 2 files changed, 28 insertions(+), 192 deletions(-)

Comments

Jason Gunthorpe Feb. 22, 2021, 5:22 p.m. UTC | #1
On Mon, Feb 22, 2021 at 09:51:13AM -0700, Alex Williamson wrote:

> +	vfio_device_unmap_mapping_range(vdev->device,
> +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX),
> +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX) -
> +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX));

Isn't this the same as invalidating everything? I see in
vfio_pci_mmap():

	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

> @@ -2273,15 +2112,13 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
>  
>  	vdev = vfio_device_data(device);
>  
> -	/*
> -	 * Locking multiple devices is prone to deadlock, runaway and
> -	 * unwind if we hit contention.
> -	 */
> -	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
> +	if (!down_write_trylock(&vdev->memory_lock)) {
>  		vfio_device_put(device);
>  		return -EBUSY;
>  	}

And this is only done as part of VFIO_DEVICE_PCI_HOT_RESET?

It looks like VFIO_DEVICE_PCI_HOT_RESET effects the entire slot?

How about putting the inode on the reflck structure, which is also
per-slot, and then a single unmap_mapping_range() will take care of
everything, no need to iterate over things in the driver core.

Note the vm->pg_off space doesn't have any special meaning, it is
fine that two struct vfio_pci_device's are sharing the same address
space and using an incompatible overlapping pg_offs

> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 9cd1882a05af..ba37f4eeefd0 100644
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -101,6 +101,7 @@ struct vfio_pci_mmap_vma {
>  
>  struct vfio_pci_device {
>  	struct pci_dev		*pdev;
> +	struct vfio_device	*device;

Ah, I did this too, but I didn't use a pointer :)

All the places trying to call vfio_device_put() when they really want
a vfio_pci_device * become simpler now. Eg struct vfio_devices wants
to have an array of vfio_pci_device, and get_pf_vdev() only needs to
return one pointer.

Jason
Alex Williamson Feb. 24, 2021, 9:55 p.m. UTC | #2
On Mon, 22 Feb 2021 13:22:30 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Feb 22, 2021 at 09:51:13AM -0700, Alex Williamson wrote:
> 
> > +	vfio_device_unmap_mapping_range(vdev->device,
> > +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX),
> > +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX) -
> > +			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX));  
> 
> Isn't this the same as invalidating everything? I see in
> vfio_pci_mmap():
> 
> 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> 		return -EINVAL;

No, immediately above that is:

        if (index >= VFIO_PCI_NUM_REGIONS) {
                int regnum = index - VFIO_PCI_NUM_REGIONS;
                struct vfio_pci_region *region = vdev->region + regnum;

                if (region && region->ops && region->ops->mmap &&
                    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
                        return region->ops->mmap(vdev, region, vma);
                return -EINVAL;
        }

We can have device specific regions that can support mmap, but those
regions aren't necessarily on-device memory so we can't assume they're
tied to the memory bit in the command register.
 
> > @@ -2273,15 +2112,13 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
> >  
> >  	vdev = vfio_device_data(device);
> >  
> > -	/*
> > -	 * Locking multiple devices is prone to deadlock, runaway and
> > -	 * unwind if we hit contention.
> > -	 */
> > -	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
> > +	if (!down_write_trylock(&vdev->memory_lock)) {
> >  		vfio_device_put(device);
> >  		return -EBUSY;
> >  	}  
> 
> And this is only done as part of VFIO_DEVICE_PCI_HOT_RESET?

Yes.

> It looks like VFIO_DEVICE_PCI_HOT_RESET effects the entire slot?

Yes.

> How about putting the inode on the reflck structure, which is also
> per-slot, and then a single unmap_mapping_range() will take care of
> everything, no need to iterate over things in the driver core.
>
> Note the vm->pg_off space doesn't have any special meaning, it is
> fine that two struct vfio_pci_device's are sharing the same address
> space and using an incompatible overlapping pg_offs

Ok, but how does this really help us, unless you're also proposing some
redesign of the memory_lock semaphore?  Even if we're zapping all the
affected devices for a bus reset that doesn't eliminate that we still
require device level granularity for other events.  Maybe there's some
layering of the inodes that you're implying that allows both, but it
still feels like a minor optimization if we need to traverse devices
for the memory_lock.

> > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> > index 9cd1882a05af..ba37f4eeefd0 100644
> > +++ b/drivers/vfio/pci/vfio_pci_private.h
> > @@ -101,6 +101,7 @@ struct vfio_pci_mmap_vma {
> >  
> >  struct vfio_pci_device {
> >  	struct pci_dev		*pdev;
> > +	struct vfio_device	*device;  
> 
> Ah, I did this too, but I didn't use a pointer :)

vfio_device is embedded in vfio.c, so that worries me.

> All the places trying to call vfio_device_put() when they really want
> a vfio_pci_device * become simpler now. Eg struct vfio_devices wants
> to have an array of vfio_pci_device, and get_pf_vdev() only needs to
> return one pointer.

Sure, that example would be a good simplification.  I'm not sure see
other cases where we're going out of our way to manage the vfio_device
versus vfio_pci_device objects though.  Thanks,

Alex
Jason Gunthorpe Feb. 25, 2021, 12:57 a.m. UTC | #3
On Wed, Feb 24, 2021 at 02:55:05PM -0700, Alex Williamson wrote:

> Ok, but how does this really help us, unless you're also proposing some
> redesign of the memory_lock semaphore?  Even if we're zapping all the
> affected devices for a bus reset that doesn't eliminate that we still
> require device level granularity for other events.

Ok, I missed the device level one, forget this remark about the reflk
then, per vfio_pci_device is the right granularity

> > >  struct vfio_pci_device {
> > >  	struct pci_dev		*pdev;
> > > +	struct vfio_device	*device;  
> > 
> > Ah, I did this too, but I didn't use a pointer :)
> 
> vfio_device is embedded in vfio.c, so that worries me.

I'm working on what we talked about in the other thread to show how
VFIO looks if it follows the normal Linux container_of idiom and to
then remove the vfio_mdev.c indirection as an illustration.

I want to directly make the case the VFIO is better off following the
standard kernel designs and driver core norms so we can find an
agreement to get Max's work on vfio_pci going forward.

You were concerned about hand waving, and maintainability so I'm
willing to put in some more work to make it concrete.

I hope you'll keep an open mind

> > All the places trying to call vfio_device_put() when they really want
> > a vfio_pci_device * become simpler now. Eg struct vfio_devices wants
> > to have an array of vfio_pci_device, and get_pf_vdev() only needs to
> > return one pointer.
> 
> Sure, that example would be a good simplification.  I'm not sure see
> other cases where we're going out of our way to manage the vfio_device
> versus vfio_pci_device objects though.

What I've found is there are lots of little costs all over the
place. The above was just easy to describe in this context.

In my mind the biggest negative cost is the type erasure. Lots of
places are using 'device *', 'void *', 'kobj *' as generic handles to
things that are actually already concretely typed. In the kernel I
would say concretely typing things is considered a virtue.

Having gone through alot of VFIO now, as it relates to the
vfio_device, I think the type erasure directly hurts readability and
maintainability. It just takes too much information away from the
reader and the compiler. The core code is managable, but when you
chuck mdev into it that follows the same, it really starts hurting.

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f0a1d05f0137..115f10f7b096 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -225,7 +225,7 @@  static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
-static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
+static int vfio_pci_mem_trylock_and_zap_cb(struct pci_dev *pdev, void *data);
 
 /*
  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -1168,7 +1168,7 @@  static long vfio_pci_ioctl(void *device_data,
 		struct vfio_pci_group_info info;
 		struct vfio_devices devs = { .cur_index = 0 };
 		bool slot = false;
-		int i, group_idx, mem_idx = 0, count = 0, ret = 0;
+		int i, group_idx, count = 0, ret = 0;
 
 		minsz = offsetofend(struct vfio_pci_hot_reset, count);
 
@@ -1268,32 +1268,16 @@  static long vfio_pci_ioctl(void *device_data,
 		}
 
 		/*
-		 * We need to get memory_lock for each device, but devices
-		 * can share mmap_lock, therefore we need to zap and hold
-		 * the vma_lock for each device, and only then get each
-		 * memory_lock.
+		 * Try to get the memory_lock write lock for all devices and
+		 * zap all BAR mmaps.
 		 */
 		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					    vfio_pci_try_zap_and_vma_lock_cb,
+					    vfio_pci_mem_trylock_and_zap_cb,
 					    &devs, slot);
-		if (ret)
-			goto hot_reset_release;
-
-		for (; mem_idx < devs.cur_index; mem_idx++) {
-			struct vfio_pci_device *tmp;
-
-			tmp = vfio_device_data(devs.devices[mem_idx]);
-
-			ret = down_write_trylock(&tmp->memory_lock);
-			if (!ret) {
-				ret = -EBUSY;
-				goto hot_reset_release;
-			}
-			mutex_unlock(&tmp->vma_lock);
-		}
 
 		/* User has access, do the reset */
-		ret = pci_reset_bus(vdev->pdev);
+		if (!ret)
+			ret = pci_reset_bus(vdev->pdev);
 
 hot_reset_release:
 		for (i = 0; i < devs.cur_index; i++) {
@@ -1303,10 +1287,7 @@  static long vfio_pci_ioctl(void *device_data,
 			device = devs.devices[i];
 			tmp = vfio_device_data(device);
 
-			if (i < mem_idx)
-				up_write(&tmp->memory_lock);
-			else
-				mutex_unlock(&tmp->vma_lock);
+			up_write(&tmp->memory_lock);
 			vfio_device_put(device);
 		}
 		kfree(devs.devices);
@@ -1452,100 +1433,18 @@  static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
 	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
 }
 
-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_device *vdev)
 {
-	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
-
-	/*
-	 * Lock ordering:
-	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
-	 * The memory_lock semaphore is used by both code paths calling
-	 * into this function to zap vmas and the vm_ops.fault callback
-	 * to protect the memory enable state of the device.
-	 *
-	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
-	 * ordering, which requires using vma_lock to walk vma_list to
-	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
-	 * reacquiring vma_lock.  This logic is derived from similar
-	 * requirements in uverbs_user_mmap_disassociate().
-	 *
-	 * mmap_lock must always be the top-level lock when it is taken.
-	 * Therefore we can only hold the memory_lock write lock when
-	 * vma_list is empty, as we'd need to take mmap_lock to clear
-	 * entries.  vma_list can only be guaranteed empty when holding
-	 * vma_lock, thus memory_lock is nested under vma_lock.
-	 *
-	 * This enables the vm_ops.fault callback to acquire vma_lock,
-	 * followed by memory_lock read lock, while already holding
-	 * mmap_lock without risk of deadlock.
-	 */
-	while (1) {
-		struct mm_struct *mm = NULL;
-
-		if (try) {
-			if (!mutex_trylock(&vdev->vma_lock))
-				return 0;
-		} else {
-			mutex_lock(&vdev->vma_lock);
-		}
-		while (!list_empty(&vdev->vma_list)) {
-			mmap_vma = list_first_entry(&vdev->vma_list,
-						    struct vfio_pci_mmap_vma,
-						    vma_next);
-			mm = mmap_vma->vma->vm_mm;
-			if (mmget_not_zero(mm))
-				break;
-
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-			mm = NULL;
-		}
-		if (!mm)
-			return 1;
-		mutex_unlock(&vdev->vma_lock);
-
-		if (try) {
-			if (!mmap_read_trylock(mm)) {
-				mmput(mm);
-				return 0;
-			}
-		} else {
-			mmap_read_lock(mm);
-		}
-		if (try) {
-			if (!mutex_trylock(&vdev->vma_lock)) {
-				mmap_read_unlock(mm);
-				mmput(mm);
-				return 0;
-			}
-		} else {
-			mutex_lock(&vdev->vma_lock);
-		}
-		list_for_each_entry_safe(mmap_vma, tmp,
-					 &vdev->vma_list, vma_next) {
-			struct vm_area_struct *vma = mmap_vma->vma;
-
-			if (vma->vm_mm != mm)
-				continue;
-
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-
-			zap_vma_ptes(vma, vma->vm_start,
-				     vma->vm_end - vma->vm_start);
-		}
-		mutex_unlock(&vdev->vma_lock);
-		mmap_read_unlock(mm);
-		mmput(mm);
-	}
+	vfio_device_unmap_mapping_range(vdev->device,
+			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX),
+			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX) -
+			VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX));
 }
 
 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev)
 {
-	vfio_pci_zap_and_vma_lock(vdev, false);
 	down_write(&vdev->memory_lock);
-	mutex_unlock(&vdev->vma_lock);
+	vfio_pci_zap_bars(vdev);
 }
 
 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev)
@@ -1567,82 +1466,25 @@  void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd)
 	up_write(&vdev->memory_lock);
 }
 
-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_device *vdev,
-			      struct vm_area_struct *vma)
-{
-	struct vfio_pci_mmap_vma *mmap_vma;
-
-	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
-	if (!mmap_vma)
-		return -ENOMEM;
-
-	mmap_vma->vma = vma;
-	list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
-	return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
-	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
-{
-	struct vfio_pci_device *vdev = vma->vm_private_data;
-	struct vfio_pci_mmap_vma *mmap_vma;
-
-	mutex_lock(&vdev->vma_lock);
-	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-		if (mmap_vma->vma == vma) {
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-			break;
-		}
-	}
-	mutex_unlock(&vdev->vma_lock);
-}
-
 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct vfio_pci_device *vdev = vma->vm_private_data;
-	vm_fault_t ret = VM_FAULT_NOPAGE;
+	vm_fault_t ret = VM_FAULT_SIGBUS;
 
-	mutex_lock(&vdev->vma_lock);
 	down_read(&vdev->memory_lock);
 
-	if (!__vfio_pci_memory_enabled(vdev)) {
-		ret = VM_FAULT_SIGBUS;
-		mutex_unlock(&vdev->vma_lock);
-		goto up_out;
-	}
-
-	if (__vfio_pci_add_vma(vdev, vma)) {
-		ret = VM_FAULT_OOM;
-		mutex_unlock(&vdev->vma_lock);
-		goto up_out;
-	}
-
-	mutex_unlock(&vdev->vma_lock);
+	if (__vfio_pci_memory_enabled(vdev) &&
+	    !io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+				vma->vm_end - vma->vm_start, vma->vm_page_prot))
+		ret = VM_FAULT_NOPAGE;
 
-	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start, vma->vm_page_prot))
-		ret = VM_FAULT_SIGBUS;
-
-up_out:
 	up_read(&vdev->memory_lock);
+
 	return ret;
 }
 
 static const struct vm_operations_struct vfio_pci_mmap_ops = {
-	.open = vfio_pci_mmap_open,
-	.close = vfio_pci_mmap_close,
 	.fault = vfio_pci_mmap_fault,
 };
 
@@ -1926,7 +1768,6 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_device *vdev;
 	struct iommu_group *group;
-	struct vfio_device *device;
 	int ret;
 
 	if (vfio_pci_is_denylisted(pdev))
@@ -1965,13 +1806,11 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mutex_init(&vdev->ioeventfds_lock);
 	INIT_LIST_HEAD(&vdev->dummy_resources_list);
 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
-	mutex_init(&vdev->vma_lock);
-	INIT_LIST_HEAD(&vdev->vma_list);
 	init_rwsem(&vdev->memory_lock);
 
-	device = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
-	if (IS_ERR(device)) {
-		ret = PTR_ERR(device);
+	vdev->device = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+	if (IS_ERR(vdev->device)) {
+		ret = PTR_ERR(vdev->device);
 		goto out_free;
 	}
 
@@ -2253,7 +2092,7 @@  static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 	return 0;
 }
 
-static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
+static int vfio_pci_mem_trylock_and_zap_cb(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
 	struct vfio_device *device;
@@ -2273,15 +2112,13 @@  static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
 
 	vdev = vfio_device_data(device);
 
-	/*
-	 * Locking multiple devices is prone to deadlock, runaway and
-	 * unwind if we hit contention.
-	 */
-	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
+	if (!down_write_trylock(&vdev->memory_lock)) {
 		vfio_device_put(device);
 		return -EBUSY;
 	}
 
+	vfio_pci_zap_bars(vdev);
+
 	devs->devices[devs->cur_index++] = device;
 	return 0;
 }
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 9cd1882a05af..ba37f4eeefd0 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -101,6 +101,7 @@  struct vfio_pci_mmap_vma {
 
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
+	struct vfio_device	*device;
 	void __iomem		*barmap[PCI_STD_NUM_BARS];
 	bool			bar_mmap_supported[PCI_STD_NUM_BARS];
 	u8			*pci_config_map;
@@ -139,8 +140,6 @@  struct vfio_pci_device {
 	struct list_head	ioeventfds_list;
 	struct vfio_pci_vf_token	*vf_token;
 	struct notifier_block	nb;
-	struct mutex		vma_lock;
-	struct list_head	vma_list;
 	struct rw_semaphore	memory_lock;
 };