
[2/3] vfio-pci: Fault mmaps to enable vma tracking

Message ID 158836915917.8433.8017639758883869710.stgit@gimli.home (mailing list archive)
State New, archived
Series vfio-pci: Block user access to disabled device MMIO

Commit Message

Alex Williamson May 1, 2020, 9:39 p.m. UTC
Rather than calling remap_pfn_range() when a region is mmap'd, set up
a vm_ops handler to support dynamic faulting of the range on access.
This allows us to manage a list of vmas actively mapping the area that
we can later use to invalidate those mappings.  The open callback
invalidates the vma range so that all tracking is inserted in the
fault handler and removed in the close handler.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |   76 ++++++++++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_private.h |    7 +++
 2 files changed, 81 insertions(+), 2 deletions(-)

Comments

Jason Gunthorpe May 1, 2020, 11:25 p.m. UTC | #1
On Fri, May 01, 2020 at 03:39:19PM -0600, Alex Williamson wrote:
> Rather than calling remap_pfn_range() when a region is mmap'd, set up
> a vm_ops handler to support dynamic faulting of the range on access.
> This allows us to manage a list of vmas actively mapping the area that
> we can later use to invalidate those mappings.  The open callback
> invalidates the vma range so that all tracking is inserted in the
> fault handler and removed in the close handler.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |   76 ++++++++++++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_private.h |    7 +++
>  2 files changed, 81 insertions(+), 2 deletions(-)

> +static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct vfio_pci_device *vdev = vma->vm_private_data;
> +
> +	if (vfio_pci_add_vma(vdev, vma))
> +		return VM_FAULT_OOM;
> +
> +	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> +			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
> +		return VM_FAULT_SIGBUS;
> +
> +	return VM_FAULT_NOPAGE;
> +}
> +
> +static const struct vm_operations_struct vfio_pci_mmap_ops = {
> +	.open = vfio_pci_mmap_open,
> +	.close = vfio_pci_mmap_close,
> +	.fault = vfio_pci_mmap_fault,
> +};
> +
>  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  {
>  	struct vfio_pci_device *vdev = device_data;
> @@ -1357,8 +1421,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>  	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
>  
> -	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> -			       req_len, vma->vm_page_prot);
> +	/*
> +	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
> +	 * change vm_flags within the fault handler.  Set them now.
> +	 */
> +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
> +	vma->vm_ops = &vfio_pci_mmap_ops;

Perhaps do the vfio_pci_add_vma & remap_pfn_range combo here if the
BAR is activated ? That way a fully populated BAR is presented in the
common case and avoids taking a fault path?

But it does seem OK as is

Jason
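
A minimal sketch of that eager-population alternative, as a helper
called from vfio_pci_mmap() after vm_flags and vm_ops are set.  The
PCI_COMMAND memory-enable read is an assumed stand-in for "the BAR is
activated", and the helper name and error handling are illustrative
only, not part of this series:

/*
 * Sketch: populate up front when the BAR is enabled, deferring to
 * on-demand faulting otherwise.
 */
static int vfio_pci_mmap_populate(struct vfio_pci_device *vdev,
				  struct vm_area_struct *vma)
{
	u16 cmd;

	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		return 0;	/* BAR disabled; leave it to the fault handler */

	/* remap first so a failed tracking allocation can't strand a
	 * populated but untracked vma */
	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
		return -EAGAIN;

	if (vfio_pci_add_vma(vdev, vma))
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);

	return 0;
}

Zapping on a tracking failure preserves the invariant that every
populated vma is on vma_list: the next access refaults and retries the
tracking.
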
Alex Williamson May 4, 2020, 2:20 p.m. UTC | #2
On Fri, 1 May 2020 20:25:50 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Fri, May 01, 2020 at 03:39:19PM -0600, Alex Williamson wrote:
> > Rather than calling remap_pfn_range() when a region is mmap'd, set up
> > a vm_ops handler to support dynamic faulting of the range on access.
> > This allows us to manage a list of vmas actively mapping the area that
> > we can later use to invalidate those mappings.  The open callback
> > invalidates the vma range so that all tracking is inserted in the
> > fault handler and removed in the close handler.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  drivers/vfio/pci/vfio_pci.c         |   76 ++++++++++++++++++++++++++++++++++-
> >  drivers/vfio/pci/vfio_pci_private.h |    7 +++
> >  2 files changed, 81 insertions(+), 2 deletions(-)  
> 
> > +static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct vfio_pci_device *vdev = vma->vm_private_data;
> > +
> > +	if (vfio_pci_add_vma(vdev, vma))
> > +		return VM_FAULT_OOM;
> > +
> > +	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > +			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	return VM_FAULT_NOPAGE;
> > +}
> > +
> > +static const struct vm_operations_struct vfio_pci_mmap_ops = {
> > +	.open = vfio_pci_mmap_open,
> > +	.close = vfio_pci_mmap_close,
> > +	.fault = vfio_pci_mmap_fault,
> > +};
> > +
> >  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> >  {
> >  	struct vfio_pci_device *vdev = device_data;
> > @@ -1357,8 +1421,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> >  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> >  	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
> >  
> > -	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > -			       req_len, vma->vm_page_prot);
> > +	/*
> > +	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
> > +	 * change vm_flags within the fault handler.  Set them now.
> > +	 */
> > +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
> > +	vma->vm_ops = &vfio_pci_mmap_ops;  
> 
> Perhaps do the vfio_pci_add_vma & remap_pfn_range combo here if the
> BAR is activated ? That way a fully populated BAR is presented in the
> common case and avoids taking a fault path?
> 
> But it does seem OK as is

Thanks for reviewing.  There's also an argument that we defer
remap_pfn_range() until the device is actually touched, which might
reduce the startup latency.  It's also a bit inconsistent with the
vm_ops.open() path where I can't return an error, so I can't call
vfio_pci_add_vma(); I can only zap the vma so that the fault handler
can return an error if necessary.  Therefore it felt more consistent,
with potential startup latency improvements, to defer all mappings to
the fault handler.  If there's a good reason to do otherwise, I can
make the change, but I doubt I'd have encountered the issue of DMA
mapping an unfaulted vma had I done it this way, so maybe there's a test
coverage argument as well.  Thanks,

Alex
Jason Gunthorpe May 4, 2020, 3:05 p.m. UTC | #3
On Mon, May 04, 2020 at 08:20:55AM -0600, Alex Williamson wrote:
> On Fri, 1 May 2020 20:25:50 -0300
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > On Fri, May 01, 2020 at 03:39:19PM -0600, Alex Williamson wrote:
> > > Rather than calling remap_pfn_range() when a region is mmap'd, set up
> > > a vm_ops handler to support dynamic faulting of the range on access.
> > > This allows us to manage a list of vmas actively mapping the area that
> > > we can later use to invalidate those mappings.  The open callback
> > > invalidates the vma range so that all tracking is inserted in the
> > > fault handler and removed in the close handler.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > >  drivers/vfio/pci/vfio_pci.c         |   76 ++++++++++++++++++++++++++++++++++-
> > >  drivers/vfio/pci/vfio_pci_private.h |    7 +++
> > >  2 files changed, 81 insertions(+), 2 deletions(-)  
> > 
> > > +static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> > > +{
> > > +	struct vm_area_struct *vma = vmf->vma;
> > > +	struct vfio_pci_device *vdev = vma->vm_private_data;
> > > +
> > > +	if (vfio_pci_add_vma(vdev, vma))
> > > +		return VM_FAULT_OOM;
> > > +
> > > +	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > > +			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
> > > +		return VM_FAULT_SIGBUS;
> > > +
> > > +	return VM_FAULT_NOPAGE;
> > > +}
> > > +
> > > +static const struct vm_operations_struct vfio_pci_mmap_ops = {
> > > +	.open = vfio_pci_mmap_open,
> > > +	.close = vfio_pci_mmap_close,
> > > +	.fault = vfio_pci_mmap_fault,
> > > +};
> > > +
> > >  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> > >  {
> > >  	struct vfio_pci_device *vdev = device_data;
> > > @@ -1357,8 +1421,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> > >  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > >  	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
> > >  
> > > -	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > > -			       req_len, vma->vm_page_prot);
> > > +	/*
> > > +	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
> > > +	 * change vm_flags within the fault handler.  Set them now.
> > > +	 */
> > > +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
> > > +	vma->vm_ops = &vfio_pci_mmap_ops;  
> > 
> > Perhaps do the vfio_pci_add_vma & remap_pfn_range combo here if the
> > BAR is activated ? That way a fully populated BAR is presented in the
> > common case and avoids taking a fault path?
> > 
> > But it does seem OK as is
> 
> Thanks for reviewing.  There's also an argument that we defer
> remap_pfn_range() until the device is actually touched, which might
> reduce the startup latency.

But not startup to a functional VM, as that will now have to take the
slower fault path.

> It's also a bit inconsistent with the vm_ops.open() path where I
> > can't return an error, so I can't call vfio_pci_add_vma(); I can only
> zap the vma so that the fault handler can return an error if
> necessary.

open() could allocate memory so the zap isn't needed. If allocation
fails, then do the zap and take the slow path.

> handler.  If there's a good reason to do otherwise, I can make the
> > change, but I doubt I'd have encountered the issue of DMA mapping an
> > unfaulted vma had I done it this way, so maybe there's a test
> coverage argument as well.  Thanks,

This test is best done by having one thread disable the other BAR
while another thread is trying to map it.

Jason
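
A minimal sketch of the open() variant suggested here, reusing
vfio_pci_add_vma() from the patch below; only an allocation failure
forces the zap-and-refault slow path:

static void vfio_pci_mmap_open(struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = vma->vm_private_data;

	/* Track the new vma (fork/split) up front; zap only if the
	 * allocation fails, so the fault handler can retry or error. */
	if (vfio_pci_add_vma(vdev, vma))
		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}

The trade-off, as Alex argues in the reply below, is that vma_list
then carries vmas that may never touch the mapping, lengthening each
zap pass.
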
Alex Williamson May 4, 2020, 3:53 p.m. UTC | #4
On Mon, 4 May 2020 12:05:56 -0300
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Mon, May 04, 2020 at 08:20:55AM -0600, Alex Williamson wrote:
> > On Fri, 1 May 2020 20:25:50 -0300
> > Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >   
> > > On Fri, May 01, 2020 at 03:39:19PM -0600, Alex Williamson wrote:  
> > > > Rather than calling remap_pfn_range() when a region is mmap'd, set up
> > > > a vm_ops handler to support dynamic faulting of the range on access.
> > > > This allows us to manage a list of vmas actively mapping the area that
> > > > we can later use to invalidate those mappings.  The open callback
> > > > invalidates the vma range so that all tracking is inserted in the
> > > > fault handler and removed in the close handler.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > >  drivers/vfio/pci/vfio_pci.c         |   76 ++++++++++++++++++++++++++++++++++-
> > > >  drivers/vfio/pci/vfio_pci_private.h |    7 +++
> > > >  2 files changed, 81 insertions(+), 2 deletions(-)    
> > >   
> > > > +static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> > > > +{
> > > > +	struct vm_area_struct *vma = vmf->vma;
> > > > +	struct vfio_pci_device *vdev = vma->vm_private_data;
> > > > +
> > > > +	if (vfio_pci_add_vma(vdev, vma))
> > > > +		return VM_FAULT_OOM;
> > > > +
> > > > +	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > > > +			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
> > > > +		return VM_FAULT_SIGBUS;
> > > > +
> > > > +	return VM_FAULT_NOPAGE;
> > > > +}
> > > > +
> > > > +static const struct vm_operations_struct vfio_pci_mmap_ops = {
> > > > +	.open = vfio_pci_mmap_open,
> > > > +	.close = vfio_pci_mmap_close,
> > > > +	.fault = vfio_pci_mmap_fault,
> > > > +};
> > > > +
> > > >  static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> > > >  {
> > > >  	struct vfio_pci_device *vdev = device_data;
> > > > @@ -1357,8 +1421,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> > > >  	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> > > >  	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
> > > >  
> > > > -	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
> > > > -			       req_len, vma->vm_page_prot);
> > > > +	/*
> > > > +	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
> > > > +	 * change vm_flags within the fault handler.  Set them now.
> > > > +	 */
> > > > +	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
> > > > +	vma->vm_ops = &vfio_pci_mmap_ops;    
> > > 
> > > Perhaps do the vfio_pci_add_vma & remap_pfn_range combo here if the
> > > BAR is activated ? That way a fully populated BAR is presented in the
> > > common case and avoids taking a fault path?
> > > 
> > > But it does seem OK as is  
> > 
> > Thanks for reviewing.  There's also an argument that we defer
> > remap_pfn_range() until the device is actually touched, which might
> > reduce the startup latency.  
> 
> But not startup to a functional VM as that will now have to take the
> slower fault path.

We need to take the fault path regardless because a VM will size and
(virtually) map the BARs, toggling the memory enable bit.  As provided
here, we don't trigger the fault unless the user attempts to access the
BAR or we DMA map the BAR.  That defers the fault until the VM is (to
some extent) up and running, and has a better chance for multi-threaded
faulting than does QEMU initialization. 
 
> > It's also a bit inconsistent with the vm_ops.open() path where I
> > can't return an error, so I can't call vfio_pci_add_vma(); I can only
> > zap the vma so that the fault handler can return an error if
> > necessary.  
> 
> open could allocate memory so the zap isn't needed. If allocation
> fails then do the zap and take the slow path.

That's a good idea, but it also gives us one more initialization
variation.  I thought it was a rather nice feature that our vma_list
includes only vmas that have actually faulted in the mapping since it
was last zapped.  This is our steady state, so why not get there
immediately rather than putting every mmap or open into the list,
regardless of whether that particular vma ever accesses the mapping?
If we have vmas in our vma_list that have never accessed the mapping,
we're increasing our latency on zap.

> > handler.  If there's a good reason to do otherwise, I can make the
> > change, but I doubt I'd have encountered the issue of DMA mapping an
> > unfaulted vma had I done it this way, so maybe there's a test
> > coverage argument as well.  Thanks,  
> 
> This test is best done by having one thread disable the other BAR
> while another thread is trying to map it.

We can split hairs on the best mechanism to trigger this, but the best
test is the one that actually gets run, and this one is triggered by
simply starting an assigned device VM.  Thanks,

Alex

Patch

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 6c6b37b5c04e..da2fef666d9c 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1299,6 +1299,70 @@ static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
 	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
 }
 
+static int vfio_pci_add_vma(struct vfio_pci_device *vdev,
+			    struct vm_area_struct *vma)
+{
+	struct vfio_pci_mmap_vma *mmap_vma;
+
+	mmap_vma = kzalloc(sizeof(*mmap_vma), GFP_KERNEL);
+	if (!mmap_vma)
+		return -ENOMEM;
+
+	mmap_vma->vma = vma;
+
+	mutex_lock(&vdev->vma_lock);
+	list_add(&mmap_vma->vma_next, &vdev->vma_list);
+	mutex_unlock(&vdev->vma_lock);
+
+	return 0;
+}
+
+/*
+ * Zap mmaps on open so that we can fault them in on access and therefore
+ * our vma_list only tracks mappings accessed since last zap.
+ */
+static void vfio_pci_mmap_open(struct vm_area_struct *vma)
+{
+	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+{
+	struct vfio_pci_device *vdev = vma->vm_private_data;
+	struct vfio_pci_mmap_vma *mmap_vma;
+
+	mutex_lock(&vdev->vma_lock);
+	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+		if (mmap_vma->vma == vma) {
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+			break;
+		}
+	}
+	mutex_unlock(&vdev->vma_lock);
+}
+
+static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_device *vdev = vma->vm_private_data;
+
+	if (vfio_pci_add_vma(vdev, vma))
+		return VM_FAULT_OOM;
+
+	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct vfio_pci_mmap_ops = {
+	.open = vfio_pci_mmap_open,
+	.close = vfio_pci_mmap_close,
+	.fault = vfio_pci_mmap_fault,
+};
+
 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 {
 	struct vfio_pci_device *vdev = device_data;
@@ -1357,8 +1421,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
 
-	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       req_len, vma->vm_page_prot);
+	/*
+	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
+	 * change vm_flags within the fault handler.  Set them now.
+	 */
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_ops = &vfio_pci_mmap_ops;
+
+	return 0;
 }
 
 static void vfio_pci_request(void *device_data, unsigned int count)
@@ -1608,6 +1678,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	spin_lock_init(&vdev->irqlock);
 	mutex_init(&vdev->ioeventfds_lock);
 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
+	mutex_init(&vdev->vma_lock);
+	INIT_LIST_HEAD(&vdev->vma_list);
 
 	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
 	if (ret)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 36ec69081ecd..9b25f9f6ce1d 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -92,6 +92,11 @@ struct vfio_pci_vf_token {
 	int			users;
 };
 
+struct vfio_pci_mmap_vma {
+	struct vm_area_struct	*vma;
+	struct list_head	vma_next;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_NUM_BARS];
@@ -132,6 +137,8 @@ struct vfio_pci_device {
 	struct list_head	ioeventfds_list;
 	struct vfio_pci_vf_token	*vf_token;
 	struct notifier_block	nb;
+	struct mutex		vma_lock;
+	struct list_head	vma_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
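
For reference, the tracking added by this patch is consumed when MMIO
access needs to be revoked.  A hypothetical consumer might look like
the following; the function name is made up, and the mmap_sem handling
that zap_vma_ptes() requires is deliberately elided (patch 3/3 in the
series implements the real logic):

static void vfio_pci_zap_mmap_vmas(struct vfio_pci_device *vdev)
{
	struct vfio_pci_mmap_vma *mmap_vma, *tmp;

	mutex_lock(&vdev->vma_lock);
	list_for_each_entry_safe(mmap_vma, tmp, &vdev->vma_list, vma_next) {
		struct vm_area_struct *vma = mmap_vma->vma;

		/* Drop the PTEs; any further access must come back
		 * through vfio_pci_mmap_fault(), which re-adds the vma. */
		zap_vma_ptes(vma, vma->vm_start,
			     vma->vm_end - vma->vm_start);
		list_del(&mmap_vma->vma_next);
		kfree(mmap_vma);
	}
	mutex_unlock(&vdev->vma_lock);
}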