diff mbox series

[v1:,vfio:,avoid,unnecessary,pin,memory,when,dma,map,io,address,space,2/2] vfio: avoid unnecessary pin memory when dma map io address space

Message ID 15b38c90ef1eb0825b7492d633d46d901e428f7d.1729760996.git.qinyuntan@linux.alibaba.com (mailing list archive)
State New
Headers show
Series [v1:,vfio:,avoid,unnecessary,pin,memory,when,dma,map,io,address,space,1/2] mm: introduce vma flag VM_PGOFF_IS_PFN | expand

Commit Message

Qinyun Tan Oct. 24, 2024, 9:34 a.m. UTC
When user application call ioctl(VFIO_IOMMU_MAP_DMA) to map a dma address,
the general handler 'vfio_pin_map_dma' attempts to pin the memory and
then create the mapping in the iommu.

However, some mappings aren't backed by a struct page, for example an
mmap'd MMIO range for our own or another device. In this scenario, a vma
with flag VM_IO | VM_PFNMAP, the pin operation will fail. Moreover, the
pin operation incurs a large overhead which will result in a longer
startup time for the VM. We don't actually need a pin in this scenario.

To address this issue, we introduce a new DMA MAP flag
'VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN' to skip the 'vfio_pin_pages_remote'
operation in the DMA map process for mmio memory. Additionally, we add
the 'VM_PGOFF_IS_PFN' flag for vfio_pci_mmap address, ensuring that we can
directly obtain the pfn through vma->vm_pgoff.

This approach allows us to avoid unnecessary memory pinning operations,
which would otherwise introduce additional overhead during DMA mapping.

In my tests, using vfio to pass through an 8-card AMD GPU which with a
large bar size (128GB*8), the time mapping the 192GB*8 bar was reduced
from about 50.79s to 1.57s.

Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 drivers/vfio/pci/vfio_pci_core.c |  2 +-
 drivers/vfio/vfio_iommu_type1.c  | 64 +++++++++++++++++++++++++-------
 include/uapi/linux/vfio.h        | 11 ++++++
 3 files changed, 62 insertions(+), 15 deletions(-)
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1ab58da9f38a6..9e8743429e490 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1802,7 +1802,7 @@  int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
 	 * the VMA flags.
 	 */
 	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
-			VM_DONTEXPAND | VM_DONTDUMP);
+			VM_DONTEXPAND | VM_DONTDUMP | VM_PGOFF_IS_PFN);
 	vma->vm_ops = &vfio_pci_mmap_ops;
 
 	return 0;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576f..156e668de117d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1439,7 +1439,7 @@  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 }
 
 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
-			    size_t map_size)
+			    size_t map_size, unsigned int map_flags)
 {
 	dma_addr_t iova = dma->iova;
 	unsigned long vaddr = dma->vaddr;
@@ -1448,27 +1448,61 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	long npage;
 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret = 0;
+	struct mm_struct *mm = current->mm;
+	bool mmio_dont_pin = map_flags & VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN;
+
+	/* This code path is only user initiated */
+	if (!mm) {
+		ret = -ENODEV;
+		goto out;
+	}
 
 	vfio_batch_init(&batch);
 
 	while (size) {
-		/* Pin a contiguous chunk of memory */
-		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, &pfn, limit,
-					      &batch);
-		if (npage <= 0) {
-			WARN_ON(!npage);
-			ret = (int)npage;
-			break;
+		struct vm_area_struct *vma;
+		unsigned long start = vaddr + dma->size;
+		bool do_pin_pages = true;
+
+		if (mmio_dont_pin) {
+			mmap_read_lock(mm);
+
+			vma = find_vma_intersection(mm, start, start+1);
+
+			/*
+			 * If this dma address rang belongs to the IO address space with VMA flags
+			 * VM_IO | VM_PFNMAP | VM_PGOFF_IS_PFN, it doesn't need to be pinned.
+			 * Simply skip the pin operation to avoid unnecessary overhead.
+			 */
+			if (vma && (vma->vm_flags & VM_PFNMAP) && (vma->vm_flags & VM_IO)
+							&& (vma->vm_flags & VM_PGOFF_IS_PFN)) {
+				pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+				npage = min_t(long, (vma->vm_end - start), size) >> PAGE_SHIFT;
+				do_pin_pages = false;
+			}
+			mmap_read_unlock(mm);
+		}
+
+		if (do_pin_pages) {
+			/* Pin a contiguous chunk of memory */
+			npage = vfio_pin_pages_remote(dma, start, size >> PAGE_SHIFT, &pfn,
+							limit, &batch);
+			if (npage <= 0) {
+				WARN_ON(!npage);
+				ret = (int)npage;
+				break;
+			}
 		}
 
 		/* Map it! */
 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
 				     dma->prot);
 		if (ret) {
-			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
-						npage, true);
-			vfio_batch_unpin(&batch, dma);
+			if (do_pin_pages) {
+				vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+							npage, true);
+				vfio_batch_unpin(&batch, dma);
+			}
 			break;
 		}
 
@@ -1479,6 +1513,7 @@  static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	vfio_batch_fini(&batch);
 	dma->iommu_mapped = true;
 
+out:
 	if (ret)
 		vfio_remove_dma(iommu, dma);
 
@@ -1645,7 +1680,7 @@  static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (list_empty(&iommu->domain_list))
 		dma->size = size;
 	else
-		ret = vfio_pin_map_dma(iommu, dma, size);
+		ret = vfio_pin_map_dma(iommu, dma, size, map->flags);
 
 	if (!ret && iommu->dirty_page_tracking) {
 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
@@ -2639,6 +2674,7 @@  static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	case VFIO_TYPE1_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
 	case VFIO_TYPE1_NESTING_IOMMU:
+	case VFIO_DMA_MAP_MMIO_DONT_PIN:
 	case VFIO_UNMAP_ALL:
 		return 1;
 	case VFIO_UPDATE_VADDR:
@@ -2811,7 +2847,7 @@  static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
 	struct vfio_iommu_type1_dma_map map;
 	unsigned long minsz;
 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
-			VFIO_DMA_MAP_FLAG_VADDR;
+			VFIO_DMA_MAP_FLAG_VADDR | VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN;
 
 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf1902..ca391ec41b3c3 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -56,6 +56,16 @@ 
  */
 #define VFIO_UPDATE_VADDR		10
 
+/*
+ * Support VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN for DMA mapping. For MMIO addresses,
+ * we do not need to pin the pages or establish address mapping in the MMU
+ * at this stage. We only need to establish the address mapping in the IOMMU for
+ * the ioctl(VFIO_IOMMU_MAP_DMA). The page table mapping in the MMU will be
+ * dynamically established through the page fault mechanism when the page
+ * is accessed in the future.
+ */
+#define VFIO_DMA_MAP_MMIO_DONT_PIN	11
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1560,6 +1570,7 @@  struct vfio_iommu_type1_dma_map {
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
 #define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
+#define VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN (1 << 3)	/* MMIO doesn't need pin page */
 	__u64	vaddr;				/* Process virtual address */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */