diff mbox series

[v5.1,14/19] mm/mmap: Avoid zeroing vma tree in mmap_region()

Message ID 20240816111405.11793-15-spasswolf@web.de (mailing list archive)
State New
Headers show
Series Rebase v5 patchset to next-20240816 | expand

Commit Message

Bert Karwatzki Aug. 16, 2024, 11:13 a.m. UTC
Instead of zeroing the vma tree and then overwriting the area, let the
area be overwritten and then clean up the gathered vmas using
vms_complete_munmap_vmas().

If a driver is mapping over an existing vma, then clear the ptes before
the call_mmap() invocation.  If the vma has a vm_ops->close(), then call
the close() function.  This is done using the vms_clear_ptes() and
vms_close_vmas() helpers.  This has the side effect of needing to call
open() on the vmas if the mmap_region() fails later on.

Temporarily keep track of the number of pages that will be removed and
reduce the charged amount.

This commit drops the validate_mm() call in the vma_expand() function.
It is necessary to drop the validate as it would fail since the mm
map_count would be incorrect during a vma expansion, prior to the
cleanup from vms_complete_munmap_vmas().

Clean up the error handling of vms_gather_munmap_vmas() by calling
the verification within the function.

Note that before this change, a MAP_FIXED could fail and leave a gap in
the vma tree.  With this change, a failed MAP_FIXED will not create a
gap; it will leave *a* vma in the area (which may have been split) and
attempt to restore the affected vmas to an operational state (re-attached
and vm_ops->open()'ed if they were vm_ops->close()'d).

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
 mm/mmap.c | 60 +++++++++++++++++++++++++------------------------------
 mm/vma.c  | 59 ++++++++++++++++++++++++++++++++++++++++++------------
 mm/vma.h  | 16 ++++++++++++---
 3 files changed, 86 insertions(+), 49 deletions(-)

--
2.45.2
diff mbox series

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index 1f0b8dc5a089..e152b6caaf9c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1373,24 +1373,19 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long merge_start = addr, merge_end = end;
 	bool writable_file_mapping = false;
 	pgoff_t vm_pgoff;
-	int error;
+	int error = -ENOMEM;
 	VMA_ITERATOR(vmi, mm, addr);
+	unsigned long nr_pages, nr_accounted;

-	/* Check against address space limit. */
-	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
-		unsigned long nr_pages;
-
-		/*
-		 * MAP_FIXED may remove pages of mappings that intersects with
-		 * requested mapping. Account for the pages it would unmap.
-		 */
-		nr_pages = count_vma_pages_range(mm, addr, end);
-
-		if (!may_expand_vm(mm, vm_flags,
-					(len >> PAGE_SHIFT) - nr_pages))
-			return -ENOMEM;
-	}
+	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);

+	/*
+	 * Check against address space limit.
+	 * MAP_FIXED may remove pages of mappings that intersects with requested
+	 * mapping. Account for the pages it would unmap.
+	 */
+	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
+		return -ENOMEM;

 	if (unlikely(!can_modify_mm(mm, addr, end)))
 		return -EPERM;
@@ -1404,14 +1399,8 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
 		/* Prepare to unmap any existing mapping in the area */
 		if (vms_gather_munmap_vmas(&vms, &mas_detach))
-			goto gather_failed;
-
-		/* Remove any existing mappings from the vma tree */
-		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
-			goto clear_tree_failed;
+			return -ENOMEM;

-		/* Unmap any existing mapping in the area */
-		vms_complete_munmap_vmas(&vms, &mas_detach);
 		next = vms.next;
 		prev = vms.prev;
 		vma = NULL;
@@ -1427,8 +1416,10 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	 */
 	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
+		charged -= nr_accounted;
 		if (security_vm_enough_memory_mm(mm, charged))
-			return -ENOMEM;
+			goto abort_munmap;
+		vms.nr_accounted = 0;
 		vm_flags |= VM_ACCOUNT;
 	}

@@ -1477,10 +1468,8 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * not unmapped, but the maps are removed from the list.
 	 */
 	vma = vm_area_alloc(mm);
-	if (!vma) {
-		error = -ENOMEM;
+	if (!vma)
 		goto unacct_error;
-	}

 	vma_iter_config(&vmi, addr, end);
 	vma_set_range(vma, addr, end, pgoff);
@@ -1489,6 +1478,9 @@  unsigned long mmap_region(struct file *file, unsigned long addr,

 	if (file) {
 		vma->vm_file = get_file(file);
+		/* call_mmap() may map PTE, so ensure there are no existing PTEs */
+		vms_clear_ptes(&vms, &mas_detach, /* mm_wr_locked = */ true);
+		vms_close_vmas(&vms, &mas_detach);
 		error = call_mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
@@ -1579,6 +1571,10 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 expanded:
 	perf_event_mmap(vma);

+	/* Unmap any existing mapping in the area */
+	if (vms.nr_pages)
+		vms_complete_munmap_vmas(&vms, &mas_detach);
+
 	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
@@ -1626,14 +1622,12 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
-	validate_mm(mm);
-	return error;

-clear_tree_failed:
-	abort_munmap_vmas(&mas_detach);
-gather_failed:
+abort_munmap:
+	if (vms.nr_pages)
+		abort_munmap_vmas(&mas_detach, vms.closed);
 	validate_mm(mm);
-	return -ENOMEM;
+	return error;
 }

 static int __vm_munmap(unsigned long start, size_t len, bool unlock)
@@ -1972,7 +1966,7 @@  void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
-		remove_vma(vma, true);
+		remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
 		count++;
 		cond_resched();
 		vma = vma_next(&vmi);
diff --git a/mm/vma.c b/mm/vma.c
index 0244320d76ab..86757443a7a2 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -136,10 +136,10 @@  can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 /*
  * Close a vm structure and free it.
  */
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
 {
 	might_sleep();
-	if (vma->vm_ops && vma->vm_ops->close)
+	if (!closed && vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
@@ -374,6 +374,8 @@  init_vma_munmap(struct vma_munmap_struct *vms,
 	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
 	vms->unmap_start = FIRST_USER_ADDRESS;
 	vms->unmap_end = USER_PGTABLES_CEILING;
+	vms->clear_ptes = false;	/* No PTEs to clear yet */
+	vms->closed = false;
 }

 /*
@@ -556,7 +558,6 @@  int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	vma_iter_store(vmi, vma);

 	vma_complete(&vp, vmi, vma->vm_mm);
-	validate_mm(vma->vm_mm);
 	return 0;

 nomem:
@@ -685,22 +686,30 @@  void vma_complete(struct vma_prepare *vp,
  *
  * Reattach any detached vmas and free up the maple tree used to track the vmas.
  */
-void abort_munmap_vmas(struct ma_state *mas_detach)
+void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
 {
 	struct vm_area_struct *vma;

 	mas_set(mas_detach, 0);
-	mas_for_each(mas_detach, vma, ULONG_MAX)
+	mas_for_each(mas_detach, vma, ULONG_MAX) {
+		if (closed && vma->vm_ops && vma->vm_ops->close &&
+		    vma->vm_ops->open)
+			vma->vm_ops->open(vma);
+
 		vma_mark_detached(vma, false);
+	}

 	__mt_destroy(mas_detach->tree);
 }

-static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
-		struct ma_state *mas_detach, bool mm_wr_locked)
+void vms_clear_ptes(struct vma_munmap_struct *vms,
+		    struct ma_state *mas_detach, bool mm_wr_locked)
 {
 	struct mmu_gather tlb;

+	if (!vms->clear_ptes)
+		return;
+
 	/*
 	 * We can free page tables without write-locking mmap_lock because VMAs
 	 * were isolated before we downgraded mmap_lock.
@@ -716,6 +725,22 @@  static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
 		      vms->unmap_end, mm_wr_locked);
 	tlb_finish_mmu(&tlb);
+	vms->clear_ptes = false;
+}
+
+void vms_close_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach)
+{
+	struct vm_area_struct *vma;
+
+	if (!vms->vma_count)
+		return;
+
+	mas_set(mas_detach, 0);
+	mas_for_each(mas_detach, vma, ULONG_MAX)
+		if (vma->vm_ops && vma->vm_ops->close)
+			vma->vm_ops->close(vma);
+
+	vms->closed = true;
 }

 /*
@@ -740,7 +765,7 @@  void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
 	if (vms->unlock)
 		mmap_write_downgrade(mm);

-	vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
+	vms_clear_ptes(vms, mas_detach, !vms->unlock);
 	/* Update high watermark before we lower total_vm */
 	update_hiwater_vm(mm);
 	/* Stat accounting */
@@ -751,7 +776,7 @@  void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
 	/* Remove and clean up vmas */
 	mas_set(mas_detach, 0);
 	mas_for_each(mas_detach, vma, ULONG_MAX)
-		remove_vma(vma, false);
+		remove_vma(vma, /* unreachable = */ false, vms->closed);

 	vm_unacct_memory(vms->nr_accounted);
 	validate_mm(mm);
@@ -891,14 +916,18 @@  int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
 	while (vma_iter_addr(vms->vmi) > vms->start)
 		vma_iter_prev_range(vms->vmi);

+	/* There are now PTEs that need to be cleared */
+	vms->clear_ptes = true;
+
 	return 0;

 userfaultfd_error:
 munmap_gather_failed:
 end_split_failed:
-	abort_munmap_vmas(mas_detach);
+	abort_munmap_vmas(mas_detach, /* closed = */ false);
 start_split_failed:
 map_count_exceeded:
+	validate_mm(vms->mm);
 	return error;
 }

@@ -942,9 +971,9 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	return 0;

 clear_tree_failed:
-	abort_munmap_vmas(&mas_detach);
-gather_failed:
+	abort_munmap_vmas(&mas_detach, /* closed = */ false);
 	validate_mm(mm);
+gather_failed:
 	return error;
 }

@@ -1667,17 +1696,21 @@  bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 }

 unsigned long count_vma_pages_range(struct mm_struct *mm,
-				    unsigned long addr, unsigned long end)
+				    unsigned long addr, unsigned long end,
+				    unsigned long *nr_accounted)
 {
 	VMA_ITERATOR(vmi, mm, addr);
 	struct vm_area_struct *vma;
 	unsigned long nr_pages = 0;

+	*nr_accounted = 0;
 	for_each_vma_range(vmi, vma, end) {
 		unsigned long vm_start = max(addr, vma->vm_start);
 		unsigned long vm_end = min(end, vma->vm_end);

 		nr_pages += PHYS_PFN(vm_end - vm_start);
+		if (vma->vm_flags & VM_ACCOUNT)
+			*nr_accounted += PHYS_PFN(vm_end - vm_start);
 	}

 	return nr_pages;
diff --git a/mm/vma.h b/mm/vma.h
index 2a2ca489e622..73297dc7fa28 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -43,6 +43,8 @@  struct vma_munmap_struct {
 	unsigned long stack_vm;
 	unsigned long data_vm;
 	bool unlock;			/* Unlock after the munmap */
+	bool clear_ptes;		/* If there are outstanding PTE to be cleared */
+	bool closed;			/* vma->vm_ops->close() called already */
 };

 struct unlink_vma_file_batch {
@@ -57,7 +59,7 @@  void validate_mm(struct mm_struct *mm);
 #endif

 /* Required for mmap_region() */
-void abort_munmap_vmas(struct ma_state *mas_detach);
+void abort_munmap_vmas(struct ma_state *mas_detach, bool closed);

 /* Required for expand_downwards(). */
 void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
@@ -97,6 +99,13 @@  void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
 int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
 			   struct ma_state *mas_detach);

+/* Required for mmap_region() */
+void vms_clear_ptes(struct vma_munmap_struct *vms,
+		    struct ma_state *mas_detach, bool mm_wr_locked);
+
+/* Required for mmap_region() */
+void vms_close_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach);
+
 int
 do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		    struct mm_struct *mm, unsigned long start,
@@ -106,7 +115,7 @@  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 		  unsigned long start, size_t len, struct list_head *uf,
 		  bool unlock);

-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);

 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
@@ -220,7 +229,8 @@  bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 int mm_take_all_locks(struct mm_struct *mm);
 void mm_drop_all_locks(struct mm_struct *mm);
 unsigned long count_vma_pages_range(struct mm_struct *mm,
-				    unsigned long addr, unsigned long end);
+				    unsigned long addr, unsigned long end,
+				    unsigned long *nr_accounted);

 static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
 {