
[1/2] mm/vmalloc: Introduce vmap_file()

Message ID 20250131001806.92349-2-vishal.moola@gmail.com
Series vmalloc: Introduce vmap_file()

Commit Message

Vishal Moola (Oracle) Jan. 31, 2025, 12:18 a.m. UTC
vmap_file() is effectively an in-kernel equivalent of calling mmap() on
a file: the caller passes a file's address_space and a byte range, and
vmap_file() maps that portion of the file's page cache directly into
kernel virtual space.

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
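A minimal usage sketch (illustration only, not part of the patch; the
inode and the 64KiB length are hypothetical, and the caller is assumed
to have already populated the page cache with uptodate folios for the
range, e.g. via read_mapping_folio(), and to keep them that way until
vunmap()):

	void *kaddr;

	/* Map the first 64KiB of the file's page cache, read-only. */
	kaddr = vmap_file(inode->i_mapping, 0, SZ_64K - 1, VM_MAP,
			  PAGE_KERNEL_RO);
	if (!kaddr)
		return -ENOMEM;

	/* ... access the file contents through kaddr ... */

	vunmap(kaddr);
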
 include/linux/vmalloc.h |  2 +
 mm/vmalloc.c            | 97 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 31e9ffd936e3..d5420985865f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -192,6 +192,8 @@  extern void vfree_atomic(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
+void *vmap_file(struct address_space *mapping, loff_t start, loff_t end,
+			unsigned long flags, pgprot_t prot);
 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
 extern void vunmap(const void *addr);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..4b1e31a8aad9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3475,6 +3475,103 @@  void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
+/**
+ * vmap_file - map a range of a file into virtually contiguous kernel space
+ * @mapping: The address_space backing the file.
+ * @start: The first byte of the file to map.
+ * @end: The last byte to map (inclusive).
+ * @flags: vm_area->flags.
+ * @prot: page protection for the mapping.
+ *
+ * Maps a range of a file into contiguous kernel virtual space. The caller
+ * must ensure that the folios caching that range are present and uptodate,
+ * and that they remain so until the range is unmapped again.
+ *
+ * If @start or @end are not PAGE_ALIGNED, vmap_file() will round
+ * @start down and @end up to encompass the entire range. The
+ * address returned is always PAGE_ALIGNED.
+ *
+ * Return: the address of the area or %NULL on failure.
+ */
+void *vmap_file(struct address_space *mapping, loff_t start, loff_t end,
+		unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	struct folio *folio;
+	unsigned long addr;
+	pgoff_t first = start >> PAGE_SHIFT;
+	pgoff_t last = end >> PAGE_SHIFT;
+	XA_STATE(xas, &mapping->i_pages, first);
+
+	unsigned long size = (last - first + 1) << PAGE_SHIFT;
+
+	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
+		return NULL;
+
+	/*
+	 * Your top guard is someone else's bottom guard. Not having a top
+	 * guard compromises someone else's mappings too.
+	 */
+	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+		flags &= ~VM_NO_GUARD;
+
+	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long) area->addr;
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, last) {
+		int err;
+		bool pmd_bound;
+
+		if (xas_retry(&xas, folio))
+			continue;
+		if (!folio || xa_is_value(folio) ||
+				!folio_test_uptodate(folio))
+			goto err_unmap;
+
+		/*
+		 * The first mapping, or a folio starting on or crossing a
+		 * PMD boundary, may need a new page table, and that
+		 * allocation can sleep, so drop the RCU lock around it.
+		 */
+		pmd_bound = (addr == (unsigned long)area->addr) ||
+			IS_ALIGNED(addr, PMD_SIZE) ||
+			((addr & PMD_MASK) !=
+			((addr + folio_size(folio)) & PMD_MASK));
+
+		if (pmd_bound) {
+			xas_pause(&xas);
+			rcu_read_unlock();
+		}
+
+		err = vmap_range_noflush(addr, addr + folio_size(folio),
+				folio_pfn(folio) << PAGE_SHIFT, prot,
+				PAGE_SHIFT);
+
+		if (pmd_bound)
+			rcu_read_lock();
+
+		if (err)
+			goto err_unmap;
+		addr += folio_size(folio);
+	}
+
+	rcu_read_unlock();
+	flush_cache_vmap((unsigned long)area->addr,
+			 (unsigned long)area->addr + size);
+	return area->addr;
+
+err_unmap:
+	/* vunmap() can sleep, so drop the RCU read lock before cleanup. */
+	rcu_read_unlock();
+	vunmap(area->addr);
+	return NULL;
+}
+EXPORT_SYMBOL(vmap_file);
+
 #ifdef CONFIG_VMAP_PFN
 struct vmap_pfn_data {
 	unsigned long	*pfns;