
[19/19] vfio/pci: Implement huge_fault support

Message ID 20240809160909.1023470-20-peterx@redhat.com (mailing list archive)
State New, archived
Series mm: Support huge pfnmaps

Commit Message

Peter Xu Aug. 9, 2024, 4:09 p.m. UTC
From: Alex Williamson <alex.williamson@redhat.com>

With the addition of pfnmap support in vmf_insert_pfn_{pmd,pud}() we
can take advantage of PMD and PUD faults on PCI BAR mmaps and create
more efficient mappings.  PCI BARs are always a power of two in size
and will typically get at least PMD alignment without userspace even
trying.  Userspace alignment for PUD mappings is also not too difficult.

Consolidate faults through a single handler with a new wrapper for
standard single page faults.  The pre-faulting behavior of commit
d71a989cf5d9 ("vfio/pci: Insert full vma on mmap'd MMIO fault") is
removed in this refactoring since huge_fault will cover the bulk of
the faults and result in more efficient page table usage.  We also
want to avoid having pre-faulted single-page mappings preempt huge
page mappings.

Cc: kvm@vger.kernel.org
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 60 +++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 17 deletions(-)
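
The "not too difficult" userspace alignment mentioned in the commit
message could look like the sketch below: over-reserve with an
anonymous PROT_NONE mapping, then map the BAR at a 1G-aligned address
inside the reservation.  This is illustrative only; device_fd and
region_offset stand in for a vfio device fd and a BAR offset obtained
via VFIO_DEVICE_GET_REGION_INFO.

#include <stdint.h>
#include <sys/types.h>
#include <sys/mman.h>

#define ALIGN_1G	(1UL << 30)

static void *map_bar_pud_aligned(int device_fd, off_t region_offset,
				 size_t size)
{
	/* Reserve size + 1G so a 1G-aligned start is guaranteed to fit. */
	char *resv = mmap(NULL, size + ALIGN_1G, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	uintptr_t addr;

	if (resv == MAP_FAILED)
		return NULL;

	addr = ((uintptr_t)resv + ALIGN_1G - 1) & ~(ALIGN_1G - 1);

	/*
	 * MAP_FIXED replaces the reservation at the aligned address;
	 * the unused slack around it is left mapped for brevity.
	 */
	return mmap((void *)addr, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_FIXED, device_fd, region_offset);
}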

Comments

Jason Gunthorpe Aug. 14, 2024, 1:25 p.m. UTC | #1
On Fri, Aug 09, 2024 at 12:09:09PM -0400, Peter Xu wrote:
> @@ -1672,30 +1679,49 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
>  	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
>  		goto out_unlock;
>  
> -	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> -	if (ret & VM_FAULT_ERROR)
> -		goto out_unlock;
> -
> -	/*
> -	 * Pre-fault the remainder of the vma, abort further insertions and
> -	 * supress error if fault is encountered during pre-fault.
> -	 */
> -	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> -		if (addr == vmf->address)
> -			continue;
> -
> -		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> -			break;
> +	switch (order) {
> +	case 0:
> +		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> +		break;
> +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
> +	case PMD_ORDER:
> +		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
> +							     PFN_DEV), false);
> +		break;
> +#endif
> +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
> +	case PUD_ORDER:
> +		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
> +							     PFN_DEV), false);
> +		break;
> +#endif

I feel like this switch should be in some general function? 

vmf_insert_pfn_order(vmf, order, __pfn_to_pfn_t(pfn + pgoff, PFN_DEV), false);

No reason to expose every driver to this when you've already got a
nice contract to have the driver work on the passed in order.

What happens if the driver can't get a PFN that matches the requested
order?

Jason
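
For reference, the wrapper suggested above might look something like
the following; vmf_insert_pfn_order() does not exist in the kernel,
this is only a sketch of the proposed contract:

/*
 * Hypothetical generic helper: dispatch on the order the core mm asked
 * for, and let any unsupported order fall back to smaller mappings.
 */
static vm_fault_t vmf_insert_pfn_order(struct vm_fault *vmf,
				       unsigned int order, pfn_t pfn,
				       bool write)
{
	switch (order) {
	case 0:
		return vmf_insert_pfn(vmf->vma, vmf->address,
				      pfn_t_to_pfn(pfn));
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
	case PMD_ORDER:
		return vmf_insert_pfn_pmd(vmf, pfn, write);
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
	case PUD_ORDER:
		return vmf_insert_pfn_pud(vmf, pfn, write);
#endif
	default:
		return VM_FAULT_FALLBACK;
	}
}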
Alex Williamson Aug. 14, 2024, 4:08 p.m. UTC | #2
On Wed, 14 Aug 2024 10:25:08 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Aug 09, 2024 at 12:09:09PM -0400, Peter Xu wrote:
> > @@ -1672,30 +1679,49 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> >  	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
> >  		goto out_unlock;
> >  
> > -	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> > -	if (ret & VM_FAULT_ERROR)
> > -		goto out_unlock;
> > -
> > -	/*
> > -	 * Pre-fault the remainder of the vma, abort further insertions and
> > -	 * supress error if fault is encountered during pre-fault.
> > -	 */
> > -	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> > -		if (addr == vmf->address)
> > -			continue;
> > -
> > -		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> > -			break;
> > +	switch (order) {
> > +	case 0:
> > +		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> > +		break;
> > +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
> > +	case PMD_ORDER:
> > +		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
> > +							     PFN_DEV), false);
> > +		break;
> > +#endif
> > +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
> > +	case PUD_ORDER:
> > +		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
> > +							     PFN_DEV), false);
> > +		break;
> > +#endif  
> 
> I feel like this switch should be in some general function? 
> 
> vmf_insert_pfn_order(vmf, order, __pfn_to_pfn_t(pfn + pgoff, PFN_DEV), false);
> 
> No reason to expose every driver to this when you've already got a
> nice contract to have the driver work on the passed in order.
> 
> What happens if the driver can't get a PFN that matches the requested
> order?

There was some alignment and size checking, chopped from the quoted
context above, that triggers a fallback, but in general PCI BARs are a
power of two and naturally aligned, so there should always be an
order-aligned pfn.  Thanks,

Alex
Jason Gunthorpe Aug. 14, 2024, 4:24 p.m. UTC | #3
On Wed, Aug 14, 2024 at 10:08:49AM -0600, Alex Williamson wrote:

> There was some alignment and size checking, chopped from the quoted
> context above, that triggers a fallback, but in general PCI BARs are a
> power of two and naturally aligned, so there should always be an
> order-aligned pfn.

Sure, though I was mostly thinking about how to use this API in other
drivers.

Maybe the device only has 2M alignment but the VMA was aligned to 1G?
The driver will be called with an order higher than it can support,
but that is not an error that should fail the fault.

Jason
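
Put differently, a driver in that situation would want a check along
these lines (hypothetical, not part of this patch) rather than
returning an error:

	/*
	 * The backing pfn is only 2M-aligned but the core mm asked for
	 * a 1G mapping: decline, and the fault will be retried at PMD
	 * order and, failing that, PTE order.
	 */
	if (!IS_ALIGNED(PFN_PHYS(pfn + pgoff), PAGE_SIZE << order)) {
		ret = VM_FAULT_FALLBACK;	/* not a failure */
		goto out;
	}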

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index ba0ce0075b2f..2d7478e9a62d 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -20,6 +20,7 @@ 
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/pci.h>
+#include <linux/pfn_t.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -1657,14 +1658,20 @@  static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
-static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+					   unsigned int order)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
 	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
-	unsigned long addr = vma->vm_start;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 
+	if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
+		      vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
+		ret = VM_FAULT_FALLBACK;
+		goto out;
+	}
+
 	pfn = vma_to_pfn(vma);
 
 	down_read(&vdev->memory_lock);
@@ -1672,30 +1679,49 @@  static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
 		goto out_unlock;
 
-	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
-	if (ret & VM_FAULT_ERROR)
-		goto out_unlock;
-
-	/*
-	 * Pre-fault the remainder of the vma, abort further insertions and
-	 * supress error if fault is encountered during pre-fault.
-	 */
-	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
-		if (addr == vmf->address)
-			continue;
-
-		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
-			break;
+	switch (order) {
+	case 0:
+		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+		break;
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+	case PMD_ORDER:
+		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
+							     PFN_DEV), false);
+		break;
+#endif
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+	case PUD_ORDER:
+		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
+							     PFN_DEV), false);
+		break;
+#endif
+	default:
+		ret = VM_FAULT_FALLBACK;
 	}
 
 out_unlock:
 	up_read(&vdev->memory_lock);
+out:
+	dev_dbg_ratelimited(&vdev->pdev->dev,
+			   "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
+			    __func__, order,
+			    vma->vm_pgoff >>
+				(VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
+			    pgoff, (unsigned int)ret);
 
 	return ret;
 }
 
+static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
+{
+	return vfio_pci_mmap_huge_fault(vmf, 0);
+}
+
 static const struct vm_operations_struct vfio_pci_mmap_ops = {
-	.fault = vfio_pci_mmap_fault,
+	.fault = vfio_pci_mmap_page_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+	.huge_fault = vfio_pci_mmap_huge_fault,
+#endif
 };
 
 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
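
For context on why VM_FAULT_FALLBACK is the right answer for an
unsupported order: the fault core tries ->huge_fault first and only
falls back to ->fault afterwards, roughly as below (simplified from
mm/memory.c; details vary by kernel version):

static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PMD_ORDER);
	/* No handler, or the handler fell back: use smaller entries. */
	return VM_FAULT_FALLBACK;
}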