[07/10] mm: avoid using vma_merge() for new VMAs

Message ID cf40652a2c3f6b987623f8f11a514618718546f7.1722849860.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Series mm: remove vma_merge()

Commit Message

Lorenzo Stoakes Aug. 5, 2024, 12:13 p.m. UTC
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.

Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs so that, rather than
ultimately invoking vma_merge(), we use vma_expand().

We implement this by replacing vma_merge_new_vma() with this newly
abstracted logic.

Doing so removes duplication and simplifies VMA merging in all such cases,
laying the groundwork for us to eliminate the merging of new VMAs in
vma_merge() altogether.

This makes it far easier to understand what is happening in these cases,
avoiding confusion and bugs, and allowing for future optimisation.

As a result of this change we are also able to make vma_prepare(),
init_vma_prep(), vma_complete(), can_vma_merge_before() and
can_vma_merge_after() static and internal to vma.c.
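
As an illustrative sketch only (field names mirror this series'
struct vma_merge_struct and the do_brk_flags() hunk below), a caller now
simply describes the proposed range and lets vma_merge_new_vma() decide
whether an adjacent VMA can be expanded:

	struct vma_merge_struct vmg = {
		.vmi = vmi,
		.prev = vma,		/* VMA we hope to expand. */
		.start = addr,
		.end = addr + len,
		.flags = flags,
		.pgoff = addr >> PAGE_SHIFT,
		.file = vma->vm_file,
		.anon_vma = vma->anon_vma,
		.policy = vma_policy(vma),
		.uffd_ctx = vma->vm_userfaultfd_ctx,
		.anon_name = anon_vma_name(vma),
	};

	/* Returns the expanded VMA, or NULL if no merge was possible. */
	if (vma_merge_new_vma(&vmg))
		goto out;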

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/mmap.c                        |  79 ++---
 mm/vma.c                         | 482 +++++++++++++++++++------------
 mm/vma.h                         |  51 +---
 tools/testing/vma/vma_internal.h |   6 +
 4 files changed, 324 insertions(+), 294 deletions(-)

Comments

Petr Tesařík Aug. 6, 2024, 1:04 p.m. UTC | #1
On Mon,  5 Aug 2024 13:13:54 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
> 
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs to, rather than ultimately
> use vma_merge(), rather use vma_expand().
> 
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
> 
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
> 
> This makes it far easier to understand what is happening in these cases
> avoiding confusion, bugs and allowing for future optimisation.
> 
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.

This patch truly rocks. Let me just say: Wow!

Petr T

> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/mmap.c                        |  79 ++---
>  mm/vma.c                         | 482 +++++++++++++++++++------------
>  mm/vma.h                         |  51 +---
>  tools/testing/vma/vma_internal.h |   6 +
>  4 files changed, 324 insertions(+), 294 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  {
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = NULL;
> -	struct vm_area_struct *next, *prev, *merge;
> -	pgoff_t pglen = len >> PAGE_SHIFT;
> +	struct vm_area_struct *merge;
>  	unsigned long charged = 0;
>  	unsigned long end = addr + len;
>  	bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		vm_flags |= VM_ACCOUNT;
>  	}
>  
> -	next = vmg.next = vma_next(&vmi);
> -	prev = vmg.prev = vma_prev(&vmi);
> -	if (vm_flags & VM_SPECIAL) {
> -		if (prev)
> -			vma_iter_next_range(&vmi);
> -		goto cannot_expand;
> -	}
> -
> -	/* Attempt to expand an old mapping */
> -	/* Check next */
> -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> -		vmg.end = next->vm_end;
> -		vma = vmg.vma = next;
> -		vmg.pgoff = next->vm_pgoff - pglen;
> -
> -		/* We may merge our NULL anon_vma with non-NULL in next. */
> -		vmg.anon_vma = vma->anon_vma;
> -	}
> -
> -	/* Check prev */
> -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> -		vmg.start = prev->vm_start;
> -		vma = vmg.vma = prev;
> -		vmg.pgoff = prev->vm_pgoff;
> -	} else if (prev) {
> -		vma_iter_next_range(&vmi);
> -	}
> -
> -	/* Actually expand, if possible */
> -	if (vma && !vma_expand(&vmg)) {
> -		khugepaged_enter_vma(vma, vm_flags);
> +	vma = vma_merge_new_vma(&vmg);
> +	if (vma)
>  		goto expanded;
> -	}
> -
> -	if (vma == prev)
> -		vma_iter_set(&vmi, addr);
> -cannot_expand:
>  
>  	/*
>  	 * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		 * If vm_flags changed after call_mmap(), we should try merge
>  		 * vma again as we may succeed this time.
>  		 */
> -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> -							  vma->vm_start, vma->vm_end,
> -							  vma->vm_pgoff);
> +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> +			merge = vma_merge_new_vma(&vmg);
> +
>  			if (merge) {
>  				/*
>  				 * ->mmap() can change vma->vm_file and fput
> @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  
>  		vma_iter_set(&vmi, vma->vm_end);
>  		/* Undo any partial mapping done by a device driver. */
> -		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> +		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
>  			     vma->vm_end, vma->vm_end, true);
>  	}
>  	if (writable_file_mapping)
> @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		unsigned long addr, unsigned long len, unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> -	struct vma_prepare vp;
>  
>  	/*
>  	 * Check against address space limits by the changed size
> @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	 */
>  	if (vma && vma->vm_end == addr) {
>  		struct vma_merge_struct vmg = {
> +			.vmi = vmi,
>  			.prev = vma,
> +			.next = NULL,
> +			.start = addr,
> +			.end = addr + len,
>  			.flags = flags,
>  			.pgoff = addr >> PAGE_SHIFT,
> +			.file = vma->vm_file,
> +			.anon_vma = vma->anon_vma,
> +			.policy = vma_policy(vma),
> +			.uffd_ctx = vma->vm_userfaultfd_ctx,
> +			.anon_name = anon_vma_name(vma),
>  		};
>  
> -		if (can_vma_merge_after(&vmg)) {
> -			vma_iter_config(vmi, vma->vm_start, addr + len);
> -			if (vma_iter_prealloc(vmi, vma))
> -				goto unacct_fail;
> -
> -			vma_start_write(vma);
> -
> -			init_vma_prep(&vp, vma);
> -			vma_prepare(&vp);
> -			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> -			vma->vm_end = addr + len;
> -			vm_flags_set(vma, VM_SOFTDIRTY);
> -			vma_iter_store(vmi, vma);
> -
> -			vma_complete(&vp, vmi, mm);
> -			khugepaged_enter_vma(vma, flags);
> +		if (vma_merge_new_vma(&vmg))
>  			goto out;
> -		}
>  	}
>  
>  	if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index 55615392e8d2..a404cf718f9e 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
>   *
>   * We assume the vma may be removed as part of the merge.
>   */
> -bool
> -can_vma_merge_before(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
>  {
>  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
>  
> @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
>   *
>   * We assume that vma is not removed as part of the merge.
>   */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
>  {
>  	if (is_mergeable_vma(vmg, false) &&
>  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
>  	return false;
>  }
>  
> +static void __vma_link_file(struct vm_area_struct *vma,
> +			    struct address_space *mapping)
> +{
> +	if (vma_is_shared_maywrite(vma))
> +		mapping_allow_writable(mapping);
> +
> +	flush_dcache_mmap_lock(mapping);
> +	vma_interval_tree_insert(vma, &mapping->i_mmap);
> +	flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * Requires inode->i_mapping->i_mmap_rwsem
> + */
> +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> +				      struct address_space *mapping)
> +{
> +	if (vma_is_shared_maywrite(vma))
> +		mapping_unmap_writable(mapping);
> +
> +	flush_dcache_mmap_lock(mapping);
> +	vma_interval_tree_remove(vma, &mapping->i_mmap);
> +	flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> + * @vp: The initialized vma_prepare struct
> + */
> +static void vma_prepare(struct vma_prepare *vp)
> +{
> +	if (vp->file) {
> +		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> +
> +		if (vp->adj_next)
> +			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> +				      vp->adj_next->vm_end);
> +
> +		i_mmap_lock_write(vp->mapping);
> +		if (vp->insert && vp->insert->vm_file) {
> +			/*
> +			 * Put into interval tree now, so instantiated pages
> +			 * are visible to arm/parisc __flush_dcache_page
> +			 * throughout; but we cannot insert into address
> +			 * space until vma start or end is updated.
> +			 */
> +			__vma_link_file(vp->insert,
> +					vp->insert->vm_file->f_mapping);
> +		}
> +	}
> +
> +	if (vp->anon_vma) {
> +		anon_vma_lock_write(vp->anon_vma);
> +		anon_vma_interval_tree_pre_update_vma(vp->vma);
> +		if (vp->adj_next)
> +			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> +	}
> +
> +	if (vp->file) {
> +		flush_dcache_mmap_lock(vp->mapping);
> +		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> +		if (vp->adj_next)
> +			vma_interval_tree_remove(vp->adj_next,
> +						 &vp->mapping->i_mmap);
> +	}
> +
> +}
> +
> +/*
> + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> + * or for inserting a VMA.
> + *
> + * @vp: The vma_prepare struct
> + * @vmi: The vma iterator
> + * @mm: The mm_struct
> + */
> +static void vma_complete(struct vma_prepare *vp,
> +			 struct vma_iterator *vmi, struct mm_struct *mm)
> +{
> +	if (vp->file) {
> +		if (vp->adj_next)
> +			vma_interval_tree_insert(vp->adj_next,
> +						 &vp->mapping->i_mmap);
> +		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> +		flush_dcache_mmap_unlock(vp->mapping);
> +	}
> +
> +	if (vp->remove && vp->file) {
> +		__remove_shared_vm_struct(vp->remove, vp->mapping);
> +		if (vp->remove2)
> +			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> +	} else if (vp->insert) {
> +		/*
> +		 * split_vma has split insert from vma, and needs
> +		 * us to insert it before dropping the locks
> +		 * (it may either follow vma or precede it).
> +		 */
> +		vma_iter_store(vmi, vp->insert);
> +		mm->map_count++;
> +	}
> +
> +	if (vp->anon_vma) {
> +		anon_vma_interval_tree_post_update_vma(vp->vma);
> +		if (vp->adj_next)
> +			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> +		anon_vma_unlock_write(vp->anon_vma);
> +	}
> +
> +	if (vp->file) {
> +		i_mmap_unlock_write(vp->mapping);
> +		uprobe_mmap(vp->vma);
> +
> +		if (vp->adj_next)
> +			uprobe_mmap(vp->adj_next);
> +	}
> +
> +	if (vp->remove) {
> +again:
> +		vma_mark_detached(vp->remove, true);
> +		if (vp->file) {
> +			uprobe_munmap(vp->remove, vp->remove->vm_start,
> +				      vp->remove->vm_end);
> +			fput(vp->file);
> +		}
> +		if (vp->remove->anon_vma)
> +			anon_vma_merge(vp->vma, vp->remove);
> +		mm->map_count--;
> +		mpol_put(vma_policy(vp->remove));
> +		if (!vp->remove2)
> +			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> +		vm_area_free(vp->remove);
> +
> +		/*
> +		 * In mprotect's case 6 (see comments on vma_merge),
> +		 * we are removing both mid and next vmas
> +		 */
> +		if (vp->remove2) {
> +			vp->remove = vp->remove2;
> +			vp->remove2 = NULL;
> +			goto again;
> +		}
> +	}
> +	if (vp->insert && vp->file)
> +		uprobe_mmap(vp->insert);
> +	validate_mm(mm);
> +}
> +
> +/*
> + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> + * @vp: The vma_prepare struct
> + * @vma: The vma that will be altered once locked
> + */
> +static void init_vma_prep(struct vma_prepare *vp,
> +			  struct vm_area_struct *vma)
> +{
> +	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> +}
> +
>  /*
>   * Close a vm structure and free it.
>   */
> @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
>  	vm_unacct_memory(nr_accounted);
>  }
>  
> -/*
> - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> - * @vp: The vma_prepare struct
> - * @vma: The vma that will be altered once locked
> - */
> -void init_vma_prep(struct vma_prepare *vp,
> -		   struct vm_area_struct *vma)
> -{
> -	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> -}
> -
> -/*
> - * Requires inode->i_mapping->i_mmap_rwsem
> - */
> -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> -				      struct address_space *mapping)
> -{
> -	if (vma_is_shared_maywrite(vma))
> -		mapping_unmap_writable(mapping);
> -
> -	flush_dcache_mmap_lock(mapping);
> -	vma_interval_tree_remove(vma, &mapping->i_mmap);
> -	flush_dcache_mmap_unlock(mapping);
> -}
> -
>  /*
>   * vma has some anon_vma assigned, and is already inserted on that
>   * anon_vma's interval trees.
> @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
>  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
>  }
>  
> -static void __vma_link_file(struct vm_area_struct *vma,
> -			    struct address_space *mapping)
> -{
> -	if (vma_is_shared_maywrite(vma))
> -		mapping_allow_writable(mapping);
> -
> -	flush_dcache_mmap_lock(mapping);
> -	vma_interval_tree_insert(vma, &mapping->i_mmap);
> -	flush_dcache_mmap_unlock(mapping);
> -}
> -
> -/*
> - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> - * @vp: The initialized vma_prepare struct
> - */
> -void vma_prepare(struct vma_prepare *vp)
> -{
> -	if (vp->file) {
> -		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> -
> -		if (vp->adj_next)
> -			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> -				      vp->adj_next->vm_end);
> -
> -		i_mmap_lock_write(vp->mapping);
> -		if (vp->insert && vp->insert->vm_file) {
> -			/*
> -			 * Put into interval tree now, so instantiated pages
> -			 * are visible to arm/parisc __flush_dcache_page
> -			 * throughout; but we cannot insert into address
> -			 * space until vma start or end is updated.
> -			 */
> -			__vma_link_file(vp->insert,
> -					vp->insert->vm_file->f_mapping);
> -		}
> -	}
> -
> -	if (vp->anon_vma) {
> -		anon_vma_lock_write(vp->anon_vma);
> -		anon_vma_interval_tree_pre_update_vma(vp->vma);
> -		if (vp->adj_next)
> -			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> -	}
> -
> -	if (vp->file) {
> -		flush_dcache_mmap_lock(vp->mapping);
> -		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> -		if (vp->adj_next)
> -			vma_interval_tree_remove(vp->adj_next,
> -						 &vp->mapping->i_mmap);
> -	}
> -
> -}
> -
>  /*
>   * dup_anon_vma() - Helper function to duplicate anon_vma
>   * @dst: The destination VMA
> @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
>  }
>  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>  
> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + *    EXPAND that VMA:
> + *
> + * Proposed:       |-----|  or  |-----|
> + * Existing:  |----|                  |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + *    EXPAND the former and REMOVE the latter:
> + *
> + * Proposed:       |-----|
> + * Existing:  |----|     |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + *          to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> +	bool is_special = vmg->flags & VM_SPECIAL;
> +	struct vm_area_struct *prev = vmg->prev;
> +	struct vm_area_struct *next = vmg->next;
> +	unsigned long start = vmg->start;
> +	unsigned long end = vmg->end;
> +	pgoff_t pgoff = vmg->pgoff;
> +	pgoff_t pglen = PHYS_PFN(end - start);
> +
> +	VM_WARN_ON(vmg->vma);
> +
> +	if (!prev && !next) {
> +		/*
> +		 * Since the caller must have determined that the requested
> +		 * range is empty, vmg->vmi will be left pointing at the VMA
> +		 * immediately prior.
> +		 */
> +		next = vmg->next = vma_next(vmg->vmi);
> +		prev = vmg->prev = vma_prev(vmg->vmi);
> +
> +		/* Avoid maple tree re-walk. */
> +		if (is_special && prev)
> +			vma_iter_next_range(vmg->vmi);
> +	}
> +
> +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> +	if (is_special || (!prev && !next))
> +		return NULL;
> +
> +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> +		/*
> +		 * We can adjust this here as can_vma_merge_after() doesn't
> +		 * touch vmg->end.
> +		 */
> +		vmg->end = next->vm_end;
> +		vmg->vma = next;
> +		vmg->pgoff = next->vm_pgoff - pglen;
> +
> +		vmg->anon_vma = next->anon_vma;
> +	}
> +
> +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> +		vmg->start = prev->vm_start;
> +		vmg->vma = prev;
> +		vmg->pgoff = prev->vm_pgoff;
> +	} else if (prev) {
> +		vma_iter_next_range(vmg->vmi);
> +	}
> +
> +	/*
> +	 * Now try to expand adjacent VMA(s). This takes care of removing the
> +	 * following VMA if we have VMAs on both sides.
> +	 */
> +	if (vmg->vma && !vma_expand(vmg)) {
> +		khugepaged_enter_vma(vmg->vma, vmg->flags);
> +		return vmg->vma;
> +	}
> +
> +	/* If expansion failed, reset state. Allows us to retry merge later. */
> +	vmg->vma = NULL;
> +	vmg->anon_vma = NULL;
> +	vmg->start = start;
> +	vmg->end = end;
> +	vmg->pgoff = pgoff;
> +	if (vmg->vma == prev)
> +		vma_iter_set(vmg->vmi, start);
> +
> +	return NULL;
> +}
> +
>  /*
>   * vma_expand - Expand an existing VMA
>   *
> @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
>   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
>   * vmg->next needs to be handled by the caller.
>   *
> - * Returns: 0 on success
> + * Returns: 0 on success.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> + * - The caller must have set @vmg->prev and @vmg->next.
>   */
>  int vma_expand(struct vma_merge_struct *vmg)
>  {
> @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
>  	return 0;
>  }
>  
> -/*
> - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> - * or for inserting a VMA.
> - *
> - * @vp: The vma_prepare struct
> - * @vmi: The vma iterator
> - * @mm: The mm_struct
> - */
> -void vma_complete(struct vma_prepare *vp,
> -		  struct vma_iterator *vmi, struct mm_struct *mm)
> -{
> -	if (vp->file) {
> -		if (vp->adj_next)
> -			vma_interval_tree_insert(vp->adj_next,
> -						 &vp->mapping->i_mmap);
> -		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> -		flush_dcache_mmap_unlock(vp->mapping);
> -	}
> -
> -	if (vp->remove && vp->file) {
> -		__remove_shared_vm_struct(vp->remove, vp->mapping);
> -		if (vp->remove2)
> -			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> -	} else if (vp->insert) {
> -		/*
> -		 * split_vma has split insert from vma, and needs
> -		 * us to insert it before dropping the locks
> -		 * (it may either follow vma or precede it).
> -		 */
> -		vma_iter_store(vmi, vp->insert);
> -		mm->map_count++;
> -	}
> -
> -	if (vp->anon_vma) {
> -		anon_vma_interval_tree_post_update_vma(vp->vma);
> -		if (vp->adj_next)
> -			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> -		anon_vma_unlock_write(vp->anon_vma);
> -	}
> -
> -	if (vp->file) {
> -		i_mmap_unlock_write(vp->mapping);
> -		uprobe_mmap(vp->vma);
> -
> -		if (vp->adj_next)
> -			uprobe_mmap(vp->adj_next);
> -	}
> -
> -	if (vp->remove) {
> -again:
> -		vma_mark_detached(vp->remove, true);
> -		if (vp->file) {
> -			uprobe_munmap(vp->remove, vp->remove->vm_start,
> -				      vp->remove->vm_end);
> -			fput(vp->file);
> -		}
> -		if (vp->remove->anon_vma)
> -			anon_vma_merge(vp->vma, vp->remove);
> -		mm->map_count--;
> -		mpol_put(vma_policy(vp->remove));
> -		if (!vp->remove2)
> -			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> -		vm_area_free(vp->remove);
> -
> -		/*
> -		 * In mprotect's case 6 (see comments on vma_merge),
> -		 * we are removing both mid and next vmas
> -		 */
> -		if (vp->remove2) {
> -			vp->remove = vp->remove2;
> -			vp->remove2 = NULL;
> -			goto again;
> -		}
> -	}
> -	if (vp->insert && vp->file)
> -		uprobe_mmap(vp->insert);
> -	validate_mm(mm);
> -}
> -
>  /*
>   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
>   * @vmi: The vma iterator
> @@ -1261,20 +1378,6 @@ struct vm_area_struct
>  	return vma_modify(&vmg);
>  }
>  
> -/*
> - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> - * must ensure that [start, end) does not overlap any existing VMA.
> - */
> -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> -{
> -	if (!vmg->prev) {
> -		vmg->prev = vma_prev(vmg->vmi);
> -		vma_iter_set(vmg->vmi, vmg->start);
> -	}
> -
> -	return vma_merge(vmg);
> -}
> -
>  /*
>   * Expand vma by delta bytes, potentially merging with an immediately adjacent
>   * VMA with identical properties.
> @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
>  		.anon_name = anon_vma_name(vma),
>  	};
>  
> -	/* vma is specified as prev, so case 1 or 2 will apply. */
> -	return vma_merge(&vmg);
> +	return vma_merge_new_vma(&vmg);
>  }
>  
>  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>  	struct vm_area_struct *vma = *vmap;
>  	unsigned long vma_start = vma->vm_start;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct vm_area_struct *new_vma, *prev;
> +	struct vm_area_struct *new_vma;
>  	bool faulted_in_anon_vma = true;
>  	VMA_ITERATOR(vmi, mm, addr);
> +	struct vma_merge_struct vmg = {
> +		.vmi = &vmi,
> +		.start = addr,
> +		.end = addr + len,
> +		.flags = vma->vm_flags,
> +		.pgoff = pgoff,
> +		.file = vma->vm_file,
> +		.anon_vma = vma->anon_vma,
> +		.policy = vma_policy(vma),
> +		.uffd_ctx = vma->vm_userfaultfd_ctx,
> +		.anon_name = anon_vma_name(vma),
> +	};
>  
>  	/*
>  	 * If anonymous vma has not yet been faulted, update new pgoff
>  	 * to match new location, to increase its chance of merging.
>  	 */
>  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> -		pgoff = addr >> PAGE_SHIFT;
> +		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
>  		faulted_in_anon_vma = false;
>  	}
>  
> -	new_vma = find_vma_prev(mm, addr, &prev);
> +	new_vma = find_vma_prev(mm, addr, &vmg.prev);
>  	if (new_vma && new_vma->vm_start < addr + len)
>  		return NULL;	/* should never get here */
>  
> -	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> +	vmg.next = vma_next(&vmi);
> +	vma_prev(&vmi);
> +
> +	new_vma = vma_merge_new_vma(&vmg);
> +
>  	if (new_vma) {
>  		/*
>  		 * Source vma may have been merged into new_vma
> diff --git a/mm/vma.h b/mm/vma.h
> index 50459f9e4c7f..bbb173053f34 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
>  /* Required for expand_downwards(). */
>  void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
>  
> -/* Required for do_brk_flags(). */
> -void vma_prepare(struct vma_prepare *vp);
> -
> -/* Required for do_brk_flags(). */
> -void init_vma_prep(struct vma_prepare *vp,
> -		   struct vm_area_struct *vma);
> -
> -/* Required for do_brk_flags(). */
> -void vma_complete(struct vma_prepare *vp,
> -		  struct vma_iterator *vmi, struct mm_struct *mm);
> -
>  int vma_expand(struct vma_merge_struct *vmg);
>  int vma_shrink(struct vma_merge_struct *vmg);
>  
> @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
>  		struct vm_area_struct *next, unsigned long start,
>  		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>  
> -/*
> - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> - *
> - * Required by mmap_region().
> - */
> -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> -
> -/*
> - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> - *
> - * Required by mmap_region() and do_brk_flags().
> - */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> -
>  /* We are about to modify the VMA's flags. */
>  struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
>  					struct vm_area_struct *prev,
> @@ -133,31 +108,7 @@ struct vm_area_struct
>  		       unsigned long new_flags,
>  		       struct vm_userfaultfd_ctx new_ctx);
>  
> -struct vm_area_struct
> -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> -
> -/* Temporary convenience wrapper. */
> -static inline struct vm_area_struct
> -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> -			   struct vm_area_struct *vma, unsigned long start,
> -			   unsigned long end, pgoff_t pgoff)
> -{
> -	struct vma_merge_struct vmg = {
> -		.vmi = vmi,
> -		.prev = prev,
> -		.start = start,
> -		.end = end,
> -		.flags = vma->vm_flags,
> -		.file = vma->vm_file,
> -		.anon_vma = vma->anon_vma,
> -		.pgoff = pgoff,
> -		.policy = vma_policy(vma),
> -		.uffd_ctx = vma->vm_userfaultfd_ctx,
> -		.anon_name = anon_vma_name(vma),
> -	};
> -
> -	return vma_merge_new_vma(&vmg);
> -}
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
>  
>  /*
>   * Temporary wrapper around vma_merge() so we can have a common interface for
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 40797a819d3d..a39a734282d0 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
>  	mas_destroy(&vmi->mas);
>  }
>  
> +static inline
> +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> +{
> +	return mas_next_range(&vmi->mas, ULONG_MAX);
> +}
> +
>  static inline void vm_acct_memory(long pages)
>  {
>  }
Lorenzo Stoakes Aug. 6, 2024, 1:44 p.m. UTC | #2
On Tue, Aug 06, 2024 at 03:04:22PM GMT, Petr Tesařík wrote:
> On Mon,  5 Aug 2024 13:13:54 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs to, rather than ultimately
> > use vma_merge(), rather use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases
> > avoiding confusion, bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
>
> This patch truly rocks. Let me just say: Wow!

Thanks!

>
> Petr T
>
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/mmap.c                        |  79 ++---
> >  mm/vma.c                         | 482 +++++++++++++++++++------------
> >  mm/vma.h                         |  51 +---
> >  tools/testing/vma/vma_internal.h |   6 +
> >  4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  {
> >  	struct mm_struct *mm = current->mm;
> >  	struct vm_area_struct *vma = NULL;
> > -	struct vm_area_struct *next, *prev, *merge;
> > -	pgoff_t pglen = len >> PAGE_SHIFT;
> > +	struct vm_area_struct *merge;
> >  	unsigned long charged = 0;
> >  	unsigned long end = addr + len;
> >  	bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		vm_flags |= VM_ACCOUNT;
> >  	}
> >
> > -	next = vmg.next = vma_next(&vmi);
> > -	prev = vmg.prev = vma_prev(&vmi);
> > -	if (vm_flags & VM_SPECIAL) {
> > -		if (prev)
> > -			vma_iter_next_range(&vmi);
> > -		goto cannot_expand;
> > -	}
> > -
> > -	/* Attempt to expand an old mapping */
> > -	/* Check next */
> > -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> > -		vmg.end = next->vm_end;
> > -		vma = vmg.vma = next;
> > -		vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > -		/* We may merge our NULL anon_vma with non-NULL in next. */
> > -		vmg.anon_vma = vma->anon_vma;
> > -	}
> > -
> > -	/* Check prev */
> > -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > -		vmg.start = prev->vm_start;
> > -		vma = vmg.vma = prev;
> > -		vmg.pgoff = prev->vm_pgoff;
> > -	} else if (prev) {
> > -		vma_iter_next_range(&vmi);
> > -	}
> > -
> > -	/* Actually expand, if possible */
> > -	if (vma && !vma_expand(&vmg)) {
> > -		khugepaged_enter_vma(vma, vm_flags);
> > +	vma = vma_merge_new_vma(&vmg);
> > +	if (vma)
> >  		goto expanded;
> > -	}
> > -
> > -	if (vma == prev)
> > -		vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> >  	/*
> >  	 * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		 * If vm_flags changed after call_mmap(), we should try merge
> >  		 * vma again as we may succeed this time.
> >  		 */
> > -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > -							  vma->vm_start, vma->vm_end,
> > -							  vma->vm_pgoff);
> > +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > +			merge = vma_merge_new_vma(&vmg);
> > +
> >  			if (merge) {
> >  				/*
> >  				 * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> >  		vma_iter_set(&vmi, vma->vm_end);
> >  		/* Undo any partial mapping done by a device driver. */
> > -		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > +		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> >  			     vma->vm_end, vma->vm_end, true);
> >  	}
> >  	if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  		unsigned long addr, unsigned long len, unsigned long flags)
> >  {
> >  	struct mm_struct *mm = current->mm;
> > -	struct vma_prepare vp;
> >
> >  	/*
> >  	 * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  	 */
> >  	if (vma && vma->vm_end == addr) {
> >  		struct vma_merge_struct vmg = {
> > +			.vmi = vmi,
> >  			.prev = vma,
> > +			.next = NULL,
> > +			.start = addr,
> > +			.end = addr + len,
> >  			.flags = flags,
> >  			.pgoff = addr >> PAGE_SHIFT,
> > +			.file = vma->vm_file,
> > +			.anon_vma = vma->anon_vma,
> > +			.policy = vma_policy(vma),
> > +			.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +			.anon_name = anon_vma_name(vma),
> >  		};
> >
> > -		if (can_vma_merge_after(&vmg)) {
> > -			vma_iter_config(vmi, vma->vm_start, addr + len);
> > -			if (vma_iter_prealloc(vmi, vma))
> > -				goto unacct_fail;
> > -
> > -			vma_start_write(vma);
> > -
> > -			init_vma_prep(&vp, vma);
> > -			vma_prepare(&vp);
> > -			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > -			vma->vm_end = addr + len;
> > -			vm_flags_set(vma, VM_SOFTDIRTY);
> > -			vma_iter_store(vmi, vma);
> > -
> > -			vma_complete(&vp, vmi, mm);
> > -			khugepaged_enter_vma(vma, flags);
> > +		if (vma_merge_new_vma(&vmg))
> >  			goto out;
> > -		}
> >  	}
> >
> >  	if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> >   *
> >   * We assume the vma may be removed as part of the merge.
> >   */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> >  {
> >  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> >   *
> >   * We assume that vma is not removed as part of the merge.
> >   */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  {
> >  	if (is_mergeable_vma(vmg, false) &&
> >  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  	return false;
> >  }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > +			    struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_allow_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > +				      struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_unmap_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > +	if (vp->file) {
> > +		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > +				      vp->adj_next->vm_end);
> > +
> > +		i_mmap_lock_write(vp->mapping);
> > +		if (vp->insert && vp->insert->vm_file) {
> > +			/*
> > +			 * Put into interval tree now, so instantiated pages
> > +			 * are visible to arm/parisc __flush_dcache_page
> > +			 * throughout; but we cannot insert into address
> > +			 * space until vma start or end is updated.
> > +			 */
> > +			__vma_link_file(vp->insert,
> > +					vp->insert->vm_file->f_mapping);
> > +		}
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_lock_write(vp->anon_vma);
> > +		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > +	}
> > +
> > +	if (vp->file) {
> > +		flush_dcache_mmap_lock(vp->mapping);
> > +		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > +		if (vp->adj_next)
> > +			vma_interval_tree_remove(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +	}
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > +			 struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > +	if (vp->file) {
> > +		if (vp->adj_next)
> > +			vma_interval_tree_insert(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > +		flush_dcache_mmap_unlock(vp->mapping);
> > +	}
> > +
> > +	if (vp->remove && vp->file) {
> > +		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > +		if (vp->remove2)
> > +			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > +	} else if (vp->insert) {
> > +		/*
> > +		 * split_vma has split insert from vma, and needs
> > +		 * us to insert it before dropping the locks
> > +		 * (it may either follow vma or precede it).
> > +		 */
> > +		vma_iter_store(vmi, vp->insert);
> > +		mm->map_count++;
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_interval_tree_post_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > +		anon_vma_unlock_write(vp->anon_vma);
> > +	}
> > +
> > +	if (vp->file) {
> > +		i_mmap_unlock_write(vp->mapping);
> > +		uprobe_mmap(vp->vma);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_mmap(vp->adj_next);
> > +	}
> > +
> > +	if (vp->remove) {
> > +again:
> > +		vma_mark_detached(vp->remove, true);
> > +		if (vp->file) {
> > +			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > +				      vp->remove->vm_end);
> > +			fput(vp->file);
> > +		}
> > +		if (vp->remove->anon_vma)
> > +			anon_vma_merge(vp->vma, vp->remove);
> > +		mm->map_count--;
> > +		mpol_put(vma_policy(vp->remove));
> > +		if (!vp->remove2)
> > +			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > +		vm_area_free(vp->remove);
> > +
> > +		/*
> > +		 * In mprotect's case 6 (see comments on vma_merge),
> > +		 * we are removing both mid and next vmas
> > +		 */
> > +		if (vp->remove2) {
> > +			vp->remove = vp->remove2;
> > +			vp->remove2 = NULL;
> > +			goto again;
> > +		}
> > +	}
> > +	if (vp->insert && vp->file)
> > +		uprobe_mmap(vp->insert);
> > +	validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > +			  struct vm_area_struct *vma)
> > +{
> > +	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> >  /*
> >   * Close a vm structure and free it.
> >   */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> >  	vm_unacct_memory(nr_accounted);
> >  }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma)
> > -{
> > -	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > -				      struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_unmap_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> >  /*
> >   * vma has some anon_vma assigned, and is already inserted on that
> >   * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> >  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> >  }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > -			    struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_allow_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > -	if (vp->file) {
> > -		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > -				      vp->adj_next->vm_end);
> > -
> > -		i_mmap_lock_write(vp->mapping);
> > -		if (vp->insert && vp->insert->vm_file) {
> > -			/*
> > -			 * Put into interval tree now, so instantiated pages
> > -			 * are visible to arm/parisc __flush_dcache_page
> > -			 * throughout; but we cannot insert into address
> > -			 * space until vma start or end is updated.
> > -			 */
> > -			__vma_link_file(vp->insert,
> > -					vp->insert->vm_file->f_mapping);
> > -		}
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_lock_write(vp->anon_vma);
> > -		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > -	}
> > -
> > -	if (vp->file) {
> > -		flush_dcache_mmap_lock(vp->mapping);
> > -		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > -		if (vp->adj_next)
> > -			vma_interval_tree_remove(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -	}
> > -
> > -}
> > -
> >  /*
> >   * dup_anon_vma() - Helper function to duplicate anon_vma
> >   * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> >  }
> >  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + *    EXPAND that VMA:
> > + *
> > + * Proposed:       |-----|  or  |-----|
> > + * Existing:  |----|                  |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + *    EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed:       |-----|
> > + * Existing:  |----|     |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + *          to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > +	bool is_special = vmg->flags & VM_SPECIAL;
> > +	struct vm_area_struct *prev = vmg->prev;
> > +	struct vm_area_struct *next = vmg->next;
> > +	unsigned long start = vmg->start;
> > +	unsigned long end = vmg->end;
> > +	pgoff_t pgoff = vmg->pgoff;
> > +	pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > +	VM_WARN_ON(vmg->vma);
> > +
> > +	if (!prev && !next) {
> > +		/*
> > +		 * Since the caller must have determined that the requested
> > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > +		 * immediately prior.
> > +		 */
> > +		next = vmg->next = vma_next(vmg->vmi);
> > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > +		/* Avoid maple tree re-walk. */
> > +		if (is_special && prev)
> > +			vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> > +	if (is_special || (!prev && !next))
> > +		return NULL;
> > +
> > +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> > +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > +		/*
> > +		 * We can adjust this here as can_vma_merge_after() doesn't
> > +		 * touch vmg->end.
> > +		 */
> > +		vmg->end = next->vm_end;
> > +		vmg->vma = next;
> > +		vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > +		vmg->anon_vma = next->anon_vma;
> > +	}
> > +
> > +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> > +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > +		vmg->start = prev->vm_start;
> > +		vmg->vma = prev;
> > +		vmg->pgoff = prev->vm_pgoff;
> > +	} else if (prev) {
> > +		vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/*
> > +	 * Now try to expand adjacent VMA(s). This takes care of removing the
> > +	 * following VMA if we have VMAs on both sides.
> > +	 */
> > +	if (vmg->vma && !vma_expand(vmg)) {
> > +		khugepaged_enter_vma(vmg->vma, vmg->flags);
> > +		return vmg->vma;
> > +	}
> > +
> > +	/* If expansion failed, reset state. Allows us to retry merge later. */
> > +	vmg->vma = NULL;
> > +	vmg->anon_vma = NULL;
> > +	vmg->start = start;
> > +	vmg->end = end;
> > +	vmg->pgoff = pgoff;
> > +	if (vmg->vma == prev)
> > +		vma_iter_set(vmg->vmi, start);
> > +
> > +	return NULL;
> > +}
> > +
> >  /*
> >   * vma_expand - Expand an existing VMA
> >   *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> >   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
> >   * vmg->next needs to be handled by the caller.
> >   *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> >   */
> >  int vma_expand(struct vma_merge_struct *vmg)
> >  {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> >  	return 0;
> >  }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > -	if (vp->file) {
> > -		if (vp->adj_next)
> > -			vma_interval_tree_insert(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > -		flush_dcache_mmap_unlock(vp->mapping);
> > -	}
> > -
> > -	if (vp->remove && vp->file) {
> > -		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > -		if (vp->remove2)
> > -			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > -	} else if (vp->insert) {
> > -		/*
> > -		 * split_vma has split insert from vma, and needs
> > -		 * us to insert it before dropping the locks
> > -		 * (it may either follow vma or precede it).
> > -		 */
> > -		vma_iter_store(vmi, vp->insert);
> > -		mm->map_count++;
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_interval_tree_post_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > -		anon_vma_unlock_write(vp->anon_vma);
> > -	}
> > -
> > -	if (vp->file) {
> > -		i_mmap_unlock_write(vp->mapping);
> > -		uprobe_mmap(vp->vma);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_mmap(vp->adj_next);
> > -	}
> > -
> > -	if (vp->remove) {
> > -again:
> > -		vma_mark_detached(vp->remove, true);
> > -		if (vp->file) {
> > -			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > -				      vp->remove->vm_end);
> > -			fput(vp->file);
> > -		}
> > -		if (vp->remove->anon_vma)
> > -			anon_vma_merge(vp->vma, vp->remove);
> > -		mm->map_count--;
> > -		mpol_put(vma_policy(vp->remove));
> > -		if (!vp->remove2)
> > -			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > -		vm_area_free(vp->remove);
> > -
> > -		/*
> > -		 * In mprotect's case 6 (see comments on vma_merge),
> > -		 * we are removing both mid and next vmas
> > -		 */
> > -		if (vp->remove2) {
> > -			vp->remove = vp->remove2;
> > -			vp->remove2 = NULL;
> > -			goto again;
> > -		}
> > -	}
> > -	if (vp->insert && vp->file)
> > -		uprobe_mmap(vp->insert);
> > -	validate_mm(mm);
> > -}
> > -
> >  /*
> >   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> >   * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> >  	return vma_modify(&vmg);
> >  }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > -	if (!vmg->prev) {
> > -		vmg->prev = vma_prev(vmg->vmi);
> > -		vma_iter_set(vmg->vmi, vmg->start);
> > -	}
> > -
> > -	return vma_merge(vmg);
> > -}
> > -
> >  /*
> >   * Expand vma by delta bytes, potentially merging with an immediately adjacent
> >   * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> >  		.anon_name = anon_vma_name(vma),
> >  	};
> >
> > -	/* vma is specified as prev, so case 1 or 2 will apply. */
> > -	return vma_merge(&vmg);
> > +	return vma_merge_new_vma(&vmg);
> >  }
> >
> >  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> >  	struct vm_area_struct *vma = *vmap;
> >  	unsigned long vma_start = vma->vm_start;
> >  	struct mm_struct *mm = vma->vm_mm;
> > -	struct vm_area_struct *new_vma, *prev;
> > +	struct vm_area_struct *new_vma;
> >  	bool faulted_in_anon_vma = true;
> >  	VMA_ITERATOR(vmi, mm, addr);
> > +	struct vma_merge_struct vmg = {
> > +		.vmi = &vmi,
> > +		.start = addr,
> > +		.end = addr + len,
> > +		.flags = vma->vm_flags,
> > +		.pgoff = pgoff,
> > +		.file = vma->vm_file,
> > +		.anon_vma = vma->anon_vma,
> > +		.policy = vma_policy(vma),
> > +		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +		.anon_name = anon_vma_name(vma),
> > +	};
> >
> >  	/*
> >  	 * If anonymous vma has not yet been faulted, update new pgoff
> >  	 * to match new location, to increase its chance of merging.
> >  	 */
> >  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > -		pgoff = addr >> PAGE_SHIFT;
> > +		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> >  		faulted_in_anon_vma = false;
> >  	}
> >
> > -	new_vma = find_vma_prev(mm, addr, &prev);
> > +	new_vma = find_vma_prev(mm, addr, &vmg.prev);
> >  	if (new_vma && new_vma->vm_start < addr + len)
> >  		return NULL;	/* should never get here */
> >
> > -	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > +	vmg.next = vma_next(&vmi);
> > +	vma_prev(&vmi);
> > +
> > +	new_vma = vma_merge_new_vma(&vmg);
> > +
> >  	if (new_vma) {
> >  		/*
> >  		 * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> >  /* Required for expand_downwards(). */
> >  void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> >  int vma_expand(struct vma_merge_struct *vmg);
> >  int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> >  		struct vm_area_struct *next, unsigned long start,
> >  		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> >  /* We are about to modify the VMA's flags. */
> >  struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> >  					struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> >  		       unsigned long new_flags,
> >  		       struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > -			   struct vm_area_struct *vma, unsigned long start,
> > -			   unsigned long end, pgoff_t pgoff)
> > -{
> > -	struct vma_merge_struct vmg = {
> > -		.vmi = vmi,
> > -		.prev = prev,
> > -		.start = start,
> > -		.end = end,
> > -		.flags = vma->vm_flags,
> > -		.file = vma->vm_file,
> > -		.anon_vma = vma->anon_vma,
> > -		.pgoff = pgoff,
> > -		.policy = vma_policy(vma),
> > -		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > -		.anon_name = anon_vma_name(vma),
> > -	};
> > -
> > -	return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> >  /*
> >   * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> >  	mas_destroy(&vmi->mas);
> >  }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > +	return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> >  static inline void vm_acct_memory(long pages)
> >  {
> >  }
>
Vlastimil Babka Aug. 8, 2024, 4:45 p.m. UTC | #3
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
> 
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs to, rather than ultimately
> use vma_merge(), rather use vma_expand().
> 
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
> 
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
> 
> This makes it far easier to understand what is happening in these cases
> avoiding confusion, bugs and allowing for future optimisation.
> 
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.

That's really great, but it would be even better if these code moves could
be a separate patch as it would make reviewing so much easier. But with git
diff's --color-moved to the rescue, let me try...

> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/mmap.c                        |  79 ++---
>  mm/vma.c                         | 482 +++++++++++++++++++------------
>  mm/vma.h                         |  51 +---
>  tools/testing/vma/vma_internal.h |   6 +
>  4 files changed, 324 insertions(+), 294 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  {
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = NULL;
> -	struct vm_area_struct *next, *prev, *merge;
> -	pgoff_t pglen = len >> PAGE_SHIFT;
> +	struct vm_area_struct *merge;
>  	unsigned long charged = 0;
>  	unsigned long end = addr + len;
>  	bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		vm_flags |= VM_ACCOUNT;
>  	}
>  
> -	next = vmg.next = vma_next(&vmi);
> -	prev = vmg.prev = vma_prev(&vmi);
> -	if (vm_flags & VM_SPECIAL) {
> -		if (prev)
> -			vma_iter_next_range(&vmi);
> -		goto cannot_expand;
> -	}
> -
> -	/* Attempt to expand an old mapping */
> -	/* Check next */
> -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> -		vmg.end = next->vm_end;
> -		vma = vmg.vma = next;
> -		vmg.pgoff = next->vm_pgoff - pglen;
> -
> -		/* We may merge our NULL anon_vma with non-NULL in next. */
> -		vmg.anon_vma = vma->anon_vma;
> -	}
> -
> -	/* Check prev */
> -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> -		vmg.start = prev->vm_start;
> -		vma = vmg.vma = prev;
> -		vmg.pgoff = prev->vm_pgoff;
> -	} else if (prev) {
> -		vma_iter_next_range(&vmi);
> -	}
> -
> -	/* Actually expand, if possible */
> -	if (vma && !vma_expand(&vmg)) {
> -		khugepaged_enter_vma(vma, vm_flags);
> +	vma = vma_merge_new_vma(&vmg);
> +	if (vma)
>  		goto expanded;
> -	}
> -
> -	if (vma == prev)
> -		vma_iter_set(&vmi, addr);
> -cannot_expand:
>  
>  	/*
>  	 * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		 * If vm_flags changed after call_mmap(), we should try merge
>  		 * vma again as we may succeed this time.
>  		 */
> -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> -							  vma->vm_start, vma->vm_end,
> -							  vma->vm_pgoff);
> +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> +			merge = vma_merge_new_vma(&vmg);

Can this even succeed if we don't update vmg->vm_flags? Previously the
wrapper would take them from vma.

> +
>  			if (merge) {
>  				/*
>  				 * ->mmap() can change vma->vm_file and fput

<snip>

> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + *    EXPAND that VMA:
> + *
> + * Proposed:       |-----|  or  |-----|
> + * Existing:  |----|                  |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + *    EXPAND the former and REMOVE the latter:
> + *
> + * Proposed:       |-----|
> + * Existing:  |----|     |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + *          to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.

Should we be paranoid and assert something?

> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> +	bool is_special = vmg->flags & VM_SPECIAL;
> +	struct vm_area_struct *prev = vmg->prev;
> +	struct vm_area_struct *next = vmg->next;
> +	unsigned long start = vmg->start;
> +	unsigned long end = vmg->end;
> +	pgoff_t pgoff = vmg->pgoff;
> +	pgoff_t pglen = PHYS_PFN(end - start);
> +
> +	VM_WARN_ON(vmg->vma);
> +
> +	if (!prev && !next) {
> +		/*
> +		 * Since the caller must have determined that the requested
> +		 * range is empty, vmg->vmi will be left pointing at the VMA
> +		 * immediately prior.
> +		 */

OK that's perhaps not that obvious, as it seems copy_vma() is doing some
special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
it, or is there a maple tree operation we can do to ensure it, ideally if
it's very cheap if the iterator is already set the way we want it to be?

> +		next = vmg->next = vma_next(vmg->vmi);
> +		prev = vmg->prev = vma_prev(vmg->vmi);
> +
> +		/* Avoid maple tree re-walk. */
> +		if (is_special && prev)
> +			vma_iter_next_range(vmg->vmi);

I wish I knew what this did but seems it's the same as the old code did so
hopefully that's fine.

> +	}
> +
> +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> +	if (is_special || (!prev && !next))
> +		return NULL;
> +
> +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> +		/*
> +		 * We can adjust this here as can_vma_merge_after() doesn't
> +		 * touch vmg->end.
> +		 */
> +		vmg->end = next->vm_end;
> +		vmg->vma = next;
> +		vmg->pgoff = next->vm_pgoff - pglen;
> +
> +		vmg->anon_vma = next->anon_vma;
> +	}
> +
> +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> +		vmg->start = prev->vm_start;
> +		vmg->vma = prev;
> +		vmg->pgoff = prev->vm_pgoff;
> +	} else if (prev) {
> +		vma_iter_next_range(vmg->vmi);
> +	}

Sigh... ditto.
Lorenzo Stoakes Aug. 8, 2024, 6:02 p.m. UTC | #4
On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs to, rather than ultimately
> > use vma_merge(), rather use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases
> > avoiding confusion, bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
>
> That's really great, but it would be even better if these code moves could
> be a separate patch as it would make reviewing so much easier. But with git
> diff's --color-moved to the rescue, let me try...

Will separate out on respin.

>
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/mmap.c                        |  79 ++---
> >  mm/vma.c                         | 482 +++++++++++++++++++------------
> >  mm/vma.h                         |  51 +---
> >  tools/testing/vma/vma_internal.h |   6 +
> >  4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  {
> >  	struct mm_struct *mm = current->mm;
> >  	struct vm_area_struct *vma = NULL;
> > -	struct vm_area_struct *next, *prev, *merge;
> > -	pgoff_t pglen = len >> PAGE_SHIFT;
> > +	struct vm_area_struct *merge;
> >  	unsigned long charged = 0;
> >  	unsigned long end = addr + len;
> >  	bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		vm_flags |= VM_ACCOUNT;
> >  	}
> >
> > -	next = vmg.next = vma_next(&vmi);
> > -	prev = vmg.prev = vma_prev(&vmi);
> > -	if (vm_flags & VM_SPECIAL) {
> > -		if (prev)
> > -			vma_iter_next_range(&vmi);
> > -		goto cannot_expand;
> > -	}
> > -
> > -	/* Attempt to expand an old mapping */
> > -	/* Check next */
> > -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> > -		vmg.end = next->vm_end;
> > -		vma = vmg.vma = next;
> > -		vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > -		/* We may merge our NULL anon_vma with non-NULL in next. */
> > -		vmg.anon_vma = vma->anon_vma;
> > -	}
> > -
> > -	/* Check prev */
> > -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > -		vmg.start = prev->vm_start;
> > -		vma = vmg.vma = prev;
> > -		vmg.pgoff = prev->vm_pgoff;
> > -	} else if (prev) {
> > -		vma_iter_next_range(&vmi);
> > -	}
> > -
> > -	/* Actually expand, if possible */
> > -	if (vma && !vma_expand(&vmg)) {
> > -		khugepaged_enter_vma(vma, vm_flags);
> > +	vma = vma_merge_new_vma(&vmg);
> > +	if (vma)
> >  		goto expanded;
> > -	}
> > -
> > -	if (vma == prev)
> > -		vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> >  	/*
> >  	 * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		 * If vm_flags changed after call_mmap(), we should try merge
> >  		 * vma again as we may succeed this time.
> >  		 */
> > -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > -							  vma->vm_start, vma->vm_end,
> > -							  vma->vm_pgoff);
> > +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > +			merge = vma_merge_new_vma(&vmg);
>
> Can this even succeed if we don't update vmg->vm_flags? Previously the
> wrapper would take them from vma.

You're right... ugh. Will fix.

This is yet another example of how having this _not_ be under test is
problematic, as that'd have picked this up.

I will try to move at least VMA merge invocation logic over in a later
series.
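
For concreteness, a minimal sketch of the kind of fix implied here (purely
illustrative - the actual respin may differ) is to refresh vmg.flags from
the VMA before retrying the merge in mmap_region():

	if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
		/* Sketch only: pick up flags ->mmap() may have changed. */
		vmg.flags = vma->vm_flags;

		merge = vma_merge_new_vma(&vmg);
		/* ... handling of 'merge' continues as in the patch ... */
	}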

>
> > +
> >  			if (merge) {
> >  				/*
> >  				 * ->mmap() can change vma->vm_file and fput
>
> <snip>
>
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + *    EXPAND that VMA:
> > + *
> > + * Proposed:       |-----|  or  |-----|
> > + * Existing:  |----|                  |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + *    EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed:       |-----|
> > + * Existing:  |----|     |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + *          to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
>
> Should we be paranoid and assert something?

This will have a performance impact; if we do that, we'll want something like
an #ifdef CONFIG_DEBUG_VM around it.
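
As a hedged aside: without CONFIG_DEBUG_VM, VM_WARN_ON() does not evaluate
its argument at runtime, so even a costly check wrapped in it compiles away;
an explicit #ifdef would mainly matter if extra setup code were needed. A
sketch, where range_is_empty() is a purely hypothetical stand-in for
whatever lookup verifies the precondition:

	/* Hypothetical helper - asserts the caller's precondition only. */
	VM_WARN_ON(!range_is_empty(vmg->vmi, vmg->start, vmg->end));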

>
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > +	bool is_special = vmg->flags & VM_SPECIAL;
> > +	struct vm_area_struct *prev = vmg->prev;
> > +	struct vm_area_struct *next = vmg->next;
> > +	unsigned long start = vmg->start;
> > +	unsigned long end = vmg->end;
> > +	pgoff_t pgoff = vmg->pgoff;
> > +	pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > +	VM_WARN_ON(vmg->vma);
> > +
> > +	if (!prev && !next) {
> > +		/*
> > +		 * Since the caller must have determined that the requested
> > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > +		 * immediately prior.
> > +		 */
>
> OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> it, or is there a maple tree operation we can do to ensure it, ideally if
> it's very cheap if the iterator is already set the way we want it to be?
>

To be fair this is something that was previously assumed, and I just added
a comment.

Will add to assumptions, and again I think any assert should be done in
such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
VM_WARN_ON()?

Will try to come up with something.

> > +		next = vmg->next = vma_next(vmg->vmi);
> > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > +		/* Avoid maple tree re-walk. */
> > +		if (is_special && prev)
> > +			vma_iter_next_range(vmg->vmi);
>
> I wish I knew what this did but seems it's the same as the old code did so
> hopefully that's fine.

I think point is that we are about to exit, so we'd be left pointing at
prev. But since we're exiting in just a second, we want to be pointing at
the next vma which will become the prev of the next merge attempt.

Liam can maybe elucidate further.

>
> > +	}
> > +
> > +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> > +	if (is_special || (!prev && !next))
> > +		return NULL;
> > +
> > +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> > +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > +		/*
> > +		 * We can adjust this here as can_vma_merge_after() doesn't
> > +		 * touch vmg->end.
> > +		 */
> > +		vmg->end = next->vm_end;
> > +		vmg->vma = next;
> > +		vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > +		vmg->anon_vma = next->anon_vma;
> > +	}
> > +
> > +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> > +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > +		vmg->start = prev->vm_start;
> > +		vmg->vma = prev;
> > +		vmg->pgoff = prev->vm_pgoff;
> > +	} else if (prev) {
> > +		vma_iter_next_range(vmg->vmi);
> > +	}
>
> Sigh... ditto.
>

(Liam can correct me) I think this is just setting up the vmi similar to
the other case such that if expansion fails we are positioned correctly for
the next merge attempt.

Yes it's fiddly, maybe needs a comment...
Liam R. Howlett Aug. 8, 2024, 6:34 p.m. UTC | #5
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > >
> > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > same logic for all cases where we add new VMAs to, rather than ultimately
> > > use vma_merge(), rather use vma_expand().
> > >
> > > We implement this by replacing vma_merge_new_vma() with this newly
> > > abstracted logic.
> > >
> > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > laying the ground for us to eliminate the merging of new VMAs in
> > > vma_merge() altogether.
> > >
> > > This makes it far easier to understand what is happening in these cases
> > > avoiding confusion, bugs and allowing for future optimisation.
> > >
> > > As a result of this change we are also able to make vma_prepare(),
> > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > can_vma_merge_after() static and internal to vma.c.
> >
> > That's really great, but it would be even better if these code moves could
> > be a separate patch as it would make reviewing so much easier. But with git
> > diff's --color-moved to the rescue, let me try...
> 
> Will separate out on respin.
> 
> >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > >  mm/mmap.c                        |  79 ++---
> > >  mm/vma.c                         | 482 +++++++++++++++++++------------
> > >  mm/vma.h                         |  51 +---
> > >  tools/testing/vma/vma_internal.h |   6 +
> > >  4 files changed, 324 insertions(+), 294 deletions(-)

...
> > > + */
> > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > +{
> > > +	bool is_special = vmg->flags & VM_SPECIAL;
> > > +	struct vm_area_struct *prev = vmg->prev;
> > > +	struct vm_area_struct *next = vmg->next;
> > > +	unsigned long start = vmg->start;
> > > +	unsigned long end = vmg->end;
> > > +	pgoff_t pgoff = vmg->pgoff;
> > > +	pgoff_t pglen = PHYS_PFN(end - start);
> > > +
> > > +	VM_WARN_ON(vmg->vma);
> > > +
> > > +	if (!prev && !next) {
> > > +		/*
> > > +		 * Since the caller must have determined that the requested
> > > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > > +		 * immediately prior.
> > > +		 */
> >
> > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > it, or is there a maple tree operation we can do to ensure it, ideally if
> > it's very cheap if the iterator is already set the way we want it to be?
> >
> 
> To be fair this is something that was previously assumed, and I just added
> a comment.
> 
> Will add to assumptions, and again I think any assert should be done in
> such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> VM_WARN_ON()?
> 
> Will try to come up with something.
> 
> > > +		next = vmg->next = vma_next(vmg->vmi);
> > > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > > +
> > > +		/* Avoid maple tree re-walk. */
> > > +		if (is_special && prev)
> > > +			vma_iter_next_range(vmg->vmi);
> >
> > I wish I knew what this did but seems it's the same as the old code did so
> > hopefully that's fine.
> 
> I think point is that we are about to exit, so we'd be left pointing at
> prev. But since we're exiting in just a second, we want to be pointing at
> the next vma which will become the prev of the next merge attempt.
> 
> Liam can maybe elucidate further.

What you have to remember is that the vma iterator (vmg->vmi above),
contains (or, basically is) a maple state (usually written as mas).  We
keep state of the maple tree walker so that we don't have to keep
re-walking to find the same thing.  We move around the tree with this
maple state because going prev/next is faster from leaves (almost always
just the next thing in the nodes array of pointers).

We use the maple state to write as well, so the maple state needs to
point to the correct location in the tree for a write.

The maple tree is a range-based tree, so each entry exists for a span of
values.  A write happens at the lowest index and can overwrite
subsequent values.  This means that the maple state needs to point to
the range containing the lowest index for the write (if it's pointing to
a node - it could walk from the top).

A side effect of writing to the lowest index is that we need to point to
the previous vma if we are going to 'expand' the vma.  The range is
essentially going to be from prev->start to "whatever we are expanding
over".

In the old code, the vm_flags & VM_SPECIAL code meant there was no way
an expansion was going to happen, but we've moved the maple state to the
wrong location for a write of a new vma - so this vma_iter_next_range()
just moves it back.  Then we "goto cannot_expand".
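
A minimal sketch of that positioning dance, purely for illustration (the
helpers are the real ones, the commentary is mine):

	next = vma_next(&vmi);	/* iterator now sits on the next VMA */
	prev = vma_prev(&vmi);	/* ...and now on the previous VMA */

	/*
	 * A write that expands prev has to start at prev, so for an
	 * expansion this is already the right spot.  If no expansion can
	 * happen (VM_SPECIAL), the new VMA will instead be written into
	 * the empty range at [addr, end), so step forward one range to
	 * point back there:
	 */
	if (prev)
		vma_iter_next_range(&vmi);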

> 
> >
> > > +	}
> > > +
> > > +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> > > +	if (is_special || (!prev && !next))
> > > +		return NULL;
> > > +
> > > +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> > > +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > > +		/*
> > > +		 * We can adjust this here as can_vma_merge_after() doesn't
> > > +		 * touch vmg->end.
> > > +		 */
> > > +		vmg->end = next->vm_end;
> > > +		vmg->vma = next;
> > > +		vmg->pgoff = next->vm_pgoff - pglen;
> > > +
> > > +		vmg->anon_vma = next->anon_vma;
> > > +	}
> > > +
> > > +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> > > +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > > +		vmg->start = prev->vm_start;
> > > +		vmg->vma = prev;
> > > +		vmg->pgoff = prev->vm_pgoff;
> > > +	} else if (prev) {
> > > +		vma_iter_next_range(vmg->vmi);
> > > +	}
> >
> > Sigh... ditto.
> >
> 
> (Liam can correct me) I think this is just setting up the vmi similar to
> > the other case such that if expansion fails we are positioned correctly for
> the next merge attempt.
> 
> Yes it's fiddly, maybe needs a comment...

Yes, ditto.
Liam R. Howlett Aug. 8, 2024, 7:06 p.m. UTC | #6
* Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > > >
> > > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > > same logic for all cases where we add new VMAs to, rather than ultimately
> > > > use vma_merge(), rather use vma_expand().
> > > >
> > > > We implement this by replacing vma_merge_new_vma() with this newly
> > > > abstracted logic.
> > > >
> > > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > > laying the ground for us to eliminate the merging of new VMAs in
> > > > vma_merge() altogether.
> > > >
> > > > This makes it far easier to understand what is happening in these cases
> > > > avoiding confusion, bugs and allowing for future optimisation.
> > > >
> > > > As a result of this change we are also able to make vma_prepare(),
> > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > > can_vma_merge_after() static and internal to vma.c.
> > >
> > > That's really great, but it would be even better if these code moves could
> > > be a separate patch as it would make reviewing so much easier. But with git
> > > diff's --color-moved to the rescue, let me try...
> > 
> > Will separate out on respin.
> > 
> > >
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > ---
> > > >  mm/mmap.c                        |  79 ++---
> > > >  mm/vma.c                         | 482 +++++++++++++++++++------------
> > > >  mm/vma.h                         |  51 +---
> > > >  tools/testing/vma/vma_internal.h |   6 +
> > > >  4 files changed, 324 insertions(+), 294 deletions(-)
> 
> ...
> > > > + */
> > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > +{
> > > > +	bool is_special = vmg->flags & VM_SPECIAL;
> > > > +	struct vm_area_struct *prev = vmg->prev;
> > > > +	struct vm_area_struct *next = vmg->next;
> > > > +	unsigned long start = vmg->start;
> > > > +	unsigned long end = vmg->end;
> > > > +	pgoff_t pgoff = vmg->pgoff;
> > > > +	pgoff_t pglen = PHYS_PFN(end - start);
> > > > +
> > > > +	VM_WARN_ON(vmg->vma);
> > > > +
> > > > +	if (!prev && !next) {
> > > > +		/*
> > > > +		 * Since the caller must have determined that the requested
> > > > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > > > +		 * immediately prior.
> > > > +		 */
> > >
> > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > > it, or is there a maple tree operation we can do to ensure it, ideally if
> > > it's very cheap if the iterator is already set the way we want it to be?
> > >
> > 
> > To be fair this is something that was previously assumed, and I just added
> > a comment.
> > 
> > Will add to assumptions, and again I think any assert should be done in
> > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> > VM_WARN_ON()?
> > 
> > Will try to come up with something.

Something like:

VM_BUG_ON(vma_iter_end(vmg->vmi) > start);

> > 
> > > > +		next = vmg->next = vma_next(vmg->vmi);

and:

VM_BUG_ON(vma_iter_addr(vmg->vmi) < end);

> > > > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > > > +
> > > > +		/* Avoid maple tree re-walk. */
> > > > +		if (is_special && prev)
> > > > +			vma_iter_next_range(vmg->vmi);
> > >
> > > I wish I knew what this did but seems it's the same as the old code did so
> > > hopefully that's fine.
> > 
> > I think point is that we are about to exit, so we'd be left pointing at
> > prev. But since we're exiting in just a second, we want to be pointing at
> > the next vma which will become the prev of the next merge attempt.
> > 
> > Liam can maybe elucidate further.
> 
> What you have to remember is that the vma iterator (vmg->vmi above),
> contains (or, basically is) a maple state (usually written as mas).  We
> keep state of the maple tree walker so that we don't have to keep
> re-walking to find the same thing.  We move around the tree with this
> maple state because going prev/next is faster from leaves (almost always
> just the next thing in the nodes array of pointers).
> 
> We use the maple state to write as well, so the maple state needs to
> point to the correct location in the tree for a write.
> 
> The maple tree is a range-based tree, so each entry exists for a span of
> values.  A write happens at the lowest index and can overwrite
> subsequent values.  This means that the maple state needs to point to
> the range containing the lowest index for the write (if it's pointing to
> a node - it could walk from the top).
> 
> A side effect of writing to the lowest index is that we need to point to
> the previous vma if we are going to 'expand' the vma.  The range is
> essentially going to be from prev->start to "whatever we are expanding
> over".
> 
> In the old code, the vm_flags & VM_SPECIAL code meant there was no way
> an expansion was going to happen, but we've moved the maple state to the
> wrong location for a write of a new vma - so this vma_iter_next_range()
> just moves it back.  Then we "goto cannot_expand".
>
Lorenzo Stoakes Aug. 9, 2024, 10:14 a.m. UTC | #7
On Thu, Aug 08, 2024 at 03:06:14PM GMT, Liam R. Howlett wrote:
> * Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]:
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> > > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > > > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > > > >
> > > > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > > > same logic for all cases where we add new VMAs to, rather than ultimately
> > > > > use vma_merge(), rather use vma_expand().
> > > > >
> > > > > We implement this by replacing vma_merge_new_vma() with this newly
> > > > > abstracted logic.
> > > > >
> > > > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > > > laying the ground for us to eliminate the merging of new VMAs in
> > > > > vma_merge() altogether.
> > > > >
> > > > > This makes it far easier to understand what is happening in these cases
> > > > > avoiding confusion, bugs and allowing for future optimisation.
> > > > >
> > > > > As a result of this change we are also able to make vma_prepare(),
> > > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > > > can_vma_merge_after() static and internal to vma.c.
> > > >
> > > > That's really great, but it would be even better if these code moves could
> > > > be a separate patch as it would make reviewing so much easier. But with git
> > > > diff's --color-moved to the rescue, let me try...
> > >
> > > Will separate out on respin.
> > >
> > > >
> > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > > ---
> > > > >  mm/mmap.c                        |  79 ++---
> > > > >  mm/vma.c                         | 482 +++++++++++++++++++------------
> > > > >  mm/vma.h                         |  51 +---
> > > > >  tools/testing/vma/vma_internal.h |   6 +
> > > > >  4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > ...
> > > > > + */
> > > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > > +{
> > > > > +	bool is_special = vmg->flags & VM_SPECIAL;
> > > > > +	struct vm_area_struct *prev = vmg->prev;
> > > > > +	struct vm_area_struct *next = vmg->next;
> > > > > +	unsigned long start = vmg->start;
> > > > > +	unsigned long end = vmg->end;
> > > > > +	pgoff_t pgoff = vmg->pgoff;
> > > > > +	pgoff_t pglen = PHYS_PFN(end - start);
> > > > > +
> > > > > +	VM_WARN_ON(vmg->vma);
> > > > > +
> > > > > +	if (!prev && !next) {
> > > > > +		/*
> > > > > +		 * Since the caller must have determined that the requested
> > > > > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > > > > +		 * immediately prior.
> > > > > +		 */
> > > >
> > > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > > > it, or is there a maple tree operation we can do to ensure it, ideally if
> > > > it's very cheap if the iterator is already set the way we want it to be?
> > > >
> > >
> > > To be fair this is something that was previously assumed, and I just added
> > > a comment.
> > >
> > > Will add to assumptions, and again I think any assert should be done in
> > > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> > > VM_WARN_ON()?
> > >
> > > Will try to come up with something.
>
> Something like:
>
> VM_BUG_ON(vma_iter_end(vmg->vmi) > start);
>
> > >
> > > > > +		next = vmg->next = vma_next(vmg->vmi);
>
> and:
>
> VM_BUG_ON(vma_iter_addr(vmg->vmi) < end);
>

Ack x2.

I thought VM_BUG_ON() was 'not done' these days though... but checkpatch.pl
has become rather hit or miss as to what should be given attention to or
not.

> > > > > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > > > > +
> > > > > +		/* Avoid maple tree re-walk. */
> > > > > +		if (is_special && prev)
> > > > > +			vma_iter_next_range(vmg->vmi);
> > > >
> > > > I wish I knew what this did but seems it's the same as the old code did so
> > > > hopefully that's fine.
> > >
> > > I think point is that we are about to exit, so we'd be left pointing at
> > > prev. But since we're exiting in just a second, we want to be pointing at
> > > the next vma which will become the prev of the next merge attempt.
> > >
> > > Liam can maybe elucidate further.
> >
> > What you have to remember is that the vma iterator (vmg->vmi above),
> > contains (or, basically is) a maple state (usually written as mas).  We
> > keep state of the maple tree walker so that we don't have to keep
> > re-walking to find the same thing.  We move around the tree with this
> > maple state because going prev/next is faster from leaves (almost always
> > just the next thing in the nodes array of pointers).
> >
> > We use the maple state to write as well, so the maple state needs to
> > point to the correct location in the tree for a write.
> >
> > The maple tree is a range-based tree, so each entry exists for a span of
> > values.  A write happens at the lowest index and can overwrite
> > subsequent values.  This means that the maple state needs to point to
> > the range containing the lowest index for the write (if it's pointing to
> > a node - it could walk from the top).
> >
> > A side effect of writing to the lowest index is that we need to point to
> > the previous vma if we are going to 'expand' the vma.  The range is
> > essentially going to be from prev->start to "whatever we are expanding
> > over".
> >
> > In the old code, the vm_flags & VM_SPECIAL code meant there was no way
> > an expansion was going to happen, but we've moved the maple state to the
> > wrong location for a write of a new vma - so this vma_iter_next_range()
> > just moves it back.  Then we "goto cannot_expand".
> >
Liam R. Howlett Aug. 9, 2024, 3:23 p.m. UTC | #8
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
> 
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs to, rather than ultimately
> use vma_merge(), rather use vma_expand().
> 
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
> 
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
> 
> This makes it far easier to understand what is happening in these cases
> avoiding confusion, bugs and allowing for future optimisation.
> 
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/mmap.c                        |  79 ++---
>  mm/vma.c                         | 482 +++++++++++++++++++------------
>  mm/vma.h                         |  51 +---
>  tools/testing/vma/vma_internal.h |   6 +
>  4 files changed, 324 insertions(+), 294 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  {
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = NULL;
> -	struct vm_area_struct *next, *prev, *merge;
> -	pgoff_t pglen = len >> PAGE_SHIFT;
> +	struct vm_area_struct *merge;
>  	unsigned long charged = 0;
>  	unsigned long end = addr + len;
>  	bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		vm_flags |= VM_ACCOUNT;
>  	}
>  
> -	next = vmg.next = vma_next(&vmi);
> -	prev = vmg.prev = vma_prev(&vmi);
> -	if (vm_flags & VM_SPECIAL) {
> -		if (prev)
> -			vma_iter_next_range(&vmi);
> -		goto cannot_expand;
> -	}
> -
> -	/* Attempt to expand an old mapping */
> -	/* Check next */
> -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> -		vmg.end = next->vm_end;
> -		vma = vmg.vma = next;
> -		vmg.pgoff = next->vm_pgoff - pglen;
> -
> -		/* We may merge our NULL anon_vma with non-NULL in next. */
> -		vmg.anon_vma = vma->anon_vma;
> -	}
> -
> -	/* Check prev */
> -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> -		vmg.start = prev->vm_start;
> -		vma = vmg.vma = prev;
> -		vmg.pgoff = prev->vm_pgoff;
> -	} else if (prev) {
> -		vma_iter_next_range(&vmi);
> -	}
> -
> -	/* Actually expand, if possible */
> -	if (vma && !vma_expand(&vmg)) {
> -		khugepaged_enter_vma(vma, vm_flags);
> +	vma = vma_merge_new_vma(&vmg);
> +	if (vma)
>  		goto expanded;
> -	}
> -
> -	if (vma == prev)
> -		vma_iter_set(&vmi, addr);
> -cannot_expand:
>  
>  	/*
>  	 * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		 * If vm_flags changed after call_mmap(), we should try merge
>  		 * vma again as we may succeed this time.
>  		 */
> -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> -							  vma->vm_start, vma->vm_end,
> -							  vma->vm_pgoff);
> +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> +			merge = vma_merge_new_vma(&vmg);
> +
>  			if (merge) {
>  				/*
>  				 * ->mmap() can change vma->vm_file and fput
> @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  
>  		vma_iter_set(&vmi, vma->vm_end);
>  		/* Undo any partial mapping done by a device driver. */
> -		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> +		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
>  			     vma->vm_end, vma->vm_end, true);
>  	}
>  	if (writable_file_mapping)
> @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		unsigned long addr, unsigned long len, unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> -	struct vma_prepare vp;
>  
>  	/*
>  	 * Check against address space limits by the changed size
> @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	 */
>  	if (vma && vma->vm_end == addr) {
>  		struct vma_merge_struct vmg = {
> +			.vmi = vmi,
>  			.prev = vma,
> +			.next = NULL,
> +			.start = addr,
> +			.end = addr + len,
>  			.flags = flags,
>  			.pgoff = addr >> PAGE_SHIFT,
> +			.file = vma->vm_file,
> +			.anon_vma = vma->anon_vma,
> +			.policy = vma_policy(vma),
> +			.uffd_ctx = vma->vm_userfaultfd_ctx,
> +			.anon_name = anon_vma_name(vma),
>  		};
>  
> -		if (can_vma_merge_after(&vmg)) {
> -			vma_iter_config(vmi, vma->vm_start, addr + len);
> -			if (vma_iter_prealloc(vmi, vma))
> -				goto unacct_fail;
> -
> -			vma_start_write(vma);
> -
> -			init_vma_prep(&vp, vma);
> -			vma_prepare(&vp);
> -			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> -			vma->vm_end = addr + len;
> -			vm_flags_set(vma, VM_SOFTDIRTY);
> -			vma_iter_store(vmi, vma);
> -
> -			vma_complete(&vp, vmi, mm);
> -			khugepaged_enter_vma(vma, flags);
> +		if (vma_merge_new_vma(&vmg))
>  			goto out;

This is very convoluted to follow.  It seems vma_merge_new_vma() will do
what is necessary by finding out that it can merge after, then call
vma_expand() which calls commit_merge(), which sets the iterator to
vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr
here... it's actually set to prev->vm_start in vma_merge_new_vma().

This is getting really hard to trace what happens.  I'm also concerned
that the overhead of following all these checks will cost performance on
the brk system call?

Maybe we can have a way to set up the vmg and call the right function to
just make the above happen?  We know with the can_vma_merge_after() that
it is going to work, so could we just call vma_start_write() and
commit_merge()?

Also, vma_merge_new_vma() could fail because it's out of memory, in which
case it should goto unacct_fail... but we now don't know if it's because
the merge wasn't allowed or because we are out of memory.

> -		}
>  	}
>  
>  	if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index 55615392e8d2..a404cf718f9e 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
>   *
>   * We assume the vma may be removed as part of the merge.
>   */
> -bool
> -can_vma_merge_before(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
>  {
>  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
>  
> @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
>   *
>   * We assume that vma is not removed as part of the merge.
>   */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
>  {
>  	if (is_mergeable_vma(vmg, false) &&
>  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
>  	return false;
>  }
>  
> +static void __vma_link_file(struct vm_area_struct *vma,
> +			    struct address_space *mapping)
> +{
> +	if (vma_is_shared_maywrite(vma))
> +		mapping_allow_writable(mapping);
> +
> +	flush_dcache_mmap_lock(mapping);
> +	vma_interval_tree_insert(vma, &mapping->i_mmap);
> +	flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * Requires inode->i_mapping->i_mmap_rwsem
> + */
> +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> +				      struct address_space *mapping)
> +{
> +	if (vma_is_shared_maywrite(vma))
> +		mapping_unmap_writable(mapping);
> +
> +	flush_dcache_mmap_lock(mapping);
> +	vma_interval_tree_remove(vma, &mapping->i_mmap);
> +	flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> + * @vp: The initialized vma_prepare struct
> + */
> +static void vma_prepare(struct vma_prepare *vp)
> +{
> +	if (vp->file) {
> +		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> +
> +		if (vp->adj_next)
> +			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> +				      vp->adj_next->vm_end);
> +
> +		i_mmap_lock_write(vp->mapping);
> +		if (vp->insert && vp->insert->vm_file) {
> +			/*
> +			 * Put into interval tree now, so instantiated pages
> +			 * are visible to arm/parisc __flush_dcache_page
> +			 * throughout; but we cannot insert into address
> +			 * space until vma start or end is updated.
> +			 */
> +			__vma_link_file(vp->insert,
> +					vp->insert->vm_file->f_mapping);
> +		}
> +	}
> +
> +	if (vp->anon_vma) {
> +		anon_vma_lock_write(vp->anon_vma);
> +		anon_vma_interval_tree_pre_update_vma(vp->vma);
> +		if (vp->adj_next)
> +			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> +	}
> +
> +	if (vp->file) {
> +		flush_dcache_mmap_lock(vp->mapping);
> +		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> +		if (vp->adj_next)
> +			vma_interval_tree_remove(vp->adj_next,
> +						 &vp->mapping->i_mmap);
> +	}
> +
> +}
> +
> +/*
> + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> + * or for inserting a VMA.
> + *
> + * @vp: The vma_prepare struct
> + * @vmi: The vma iterator
> + * @mm: The mm_struct
> + */
> +static void vma_complete(struct vma_prepare *vp,
> +			 struct vma_iterator *vmi, struct mm_struct *mm)
> +{
> +	if (vp->file) {
> +		if (vp->adj_next)
> +			vma_interval_tree_insert(vp->adj_next,
> +						 &vp->mapping->i_mmap);
> +		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> +		flush_dcache_mmap_unlock(vp->mapping);
> +	}
> +
> +	if (vp->remove && vp->file) {
> +		__remove_shared_vm_struct(vp->remove, vp->mapping);
> +		if (vp->remove2)
> +			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> +	} else if (vp->insert) {
> +		/*
> +		 * split_vma has split insert from vma, and needs
> +		 * us to insert it before dropping the locks
> +		 * (it may either follow vma or precede it).
> +		 */
> +		vma_iter_store(vmi, vp->insert);
> +		mm->map_count++;
> +	}
> +
> +	if (vp->anon_vma) {
> +		anon_vma_interval_tree_post_update_vma(vp->vma);
> +		if (vp->adj_next)
> +			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> +		anon_vma_unlock_write(vp->anon_vma);
> +	}
> +
> +	if (vp->file) {
> +		i_mmap_unlock_write(vp->mapping);
> +		uprobe_mmap(vp->vma);
> +
> +		if (vp->adj_next)
> +			uprobe_mmap(vp->adj_next);
> +	}
> +
> +	if (vp->remove) {
> +again:
> +		vma_mark_detached(vp->remove, true);
> +		if (vp->file) {
> +			uprobe_munmap(vp->remove, vp->remove->vm_start,
> +				      vp->remove->vm_end);
> +			fput(vp->file);
> +		}
> +		if (vp->remove->anon_vma)
> +			anon_vma_merge(vp->vma, vp->remove);
> +		mm->map_count--;
> +		mpol_put(vma_policy(vp->remove));
> +		if (!vp->remove2)
> +			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> +		vm_area_free(vp->remove);
> +
> +		/*
> +		 * In mprotect's case 6 (see comments on vma_merge),
> +		 * we are removing both mid and next vmas
> +		 */
> +		if (vp->remove2) {
> +			vp->remove = vp->remove2;
> +			vp->remove2 = NULL;
> +			goto again;
> +		}
> +	}
> +	if (vp->insert && vp->file)
> +		uprobe_mmap(vp->insert);
> +	validate_mm(mm);
> +}
> +
> +/*
> + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> + * @vp: The vma_prepare struct
> + * @vma: The vma that will be altered once locked
> + */
> +static void init_vma_prep(struct vma_prepare *vp,
> +			  struct vm_area_struct *vma)
> +{
> +	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> +}
> +
>  /*
>   * Close a vm structure and free it.
>   */
> @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
>  	vm_unacct_memory(nr_accounted);
>  }
>  
> -/*
> - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> - * @vp: The vma_prepare struct
> - * @vma: The vma that will be altered once locked
> - */
> -void init_vma_prep(struct vma_prepare *vp,
> -		   struct vm_area_struct *vma)
> -{
> -	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> -}
> -
> -/*
> - * Requires inode->i_mapping->i_mmap_rwsem
> - */
> -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> -				      struct address_space *mapping)
> -{
> -	if (vma_is_shared_maywrite(vma))
> -		mapping_unmap_writable(mapping);
> -
> -	flush_dcache_mmap_lock(mapping);
> -	vma_interval_tree_remove(vma, &mapping->i_mmap);
> -	flush_dcache_mmap_unlock(mapping);
> -}
> -
>  /*
>   * vma has some anon_vma assigned, and is already inserted on that
>   * anon_vma's interval trees.
> @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
>  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
>  }
>  
> -static void __vma_link_file(struct vm_area_struct *vma,
> -			    struct address_space *mapping)
> -{
> -	if (vma_is_shared_maywrite(vma))
> -		mapping_allow_writable(mapping);
> -
> -	flush_dcache_mmap_lock(mapping);
> -	vma_interval_tree_insert(vma, &mapping->i_mmap);
> -	flush_dcache_mmap_unlock(mapping);
> -}
> -
> -/*
> - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> - * @vp: The initialized vma_prepare struct
> - */
> -void vma_prepare(struct vma_prepare *vp)
> -{
> -	if (vp->file) {
> -		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> -
> -		if (vp->adj_next)
> -			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> -				      vp->adj_next->vm_end);
> -
> -		i_mmap_lock_write(vp->mapping);
> -		if (vp->insert && vp->insert->vm_file) {
> -			/*
> -			 * Put into interval tree now, so instantiated pages
> -			 * are visible to arm/parisc __flush_dcache_page
> -			 * throughout; but we cannot insert into address
> -			 * space until vma start or end is updated.
> -			 */
> -			__vma_link_file(vp->insert,
> -					vp->insert->vm_file->f_mapping);
> -		}
> -	}
> -
> -	if (vp->anon_vma) {
> -		anon_vma_lock_write(vp->anon_vma);
> -		anon_vma_interval_tree_pre_update_vma(vp->vma);
> -		if (vp->adj_next)
> -			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> -	}
> -
> -	if (vp->file) {
> -		flush_dcache_mmap_lock(vp->mapping);
> -		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> -		if (vp->adj_next)
> -			vma_interval_tree_remove(vp->adj_next,
> -						 &vp->mapping->i_mmap);
> -	}
> -
> -}
> -
>  /*
>   * dup_anon_vma() - Helper function to duplicate anon_vma
>   * @dst: The destination VMA
> @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
>  }
>  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>  
> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + *    EXPAND that VMA:
> + *
> + * Proposed:       |-----|  or  |-----|
> + * Existing:  |----|                  |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + *    EXPAND the former and REMOVE the latter:
> + *
> + * Proposed:       |-----|
> + * Existing:  |----|     |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.

We still have diagrams, that's too bad.

> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + *          to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> +	bool is_special = vmg->flags & VM_SPECIAL;
> +	struct vm_area_struct *prev = vmg->prev;
> +	struct vm_area_struct *next = vmg->next;
> +	unsigned long start = vmg->start;
> +	unsigned long end = vmg->end;
> +	pgoff_t pgoff = vmg->pgoff;
> +	pgoff_t pglen = PHYS_PFN(end - start);
> +
> +	VM_WARN_ON(vmg->vma);
> +
> +	if (!prev && !next) {
> +		/*
> +		 * Since the caller must have determined that the requested
> +		 * range is empty, vmg->vmi will be left pointing at the VMA
> +		 * immediately prior.
> +		 */
> +		next = vmg->next = vma_next(vmg->vmi);
> +		prev = vmg->prev = vma_prev(vmg->vmi);
> +
> +		/* Avoid maple tree re-walk. */
> +		if (is_special && prev)
> +			vma_iter_next_range(vmg->vmi);
> +	}
> +
> +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> +	if (is_special || (!prev && !next))
> +		return NULL;
> +
> +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> +		/*
> +		 * We can adjust this here as can_vma_merge_after() doesn't
> +		 * touch vmg->end.
> +		 */
> +		vmg->end = next->vm_end;
> +		vmg->vma = next;
> +		vmg->pgoff = next->vm_pgoff - pglen;
> +
> +		vmg->anon_vma = next->anon_vma;
> +	}
> +
> +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> +		vmg->start = prev->vm_start;
> +		vmg->vma = prev;
> +		vmg->pgoff = prev->vm_pgoff;
> +	} else if (prev) {
> +		vma_iter_next_range(vmg->vmi);
> +	}
> +
> +	/*
> +	 * Now try to expand adjacent VMA(s). This takes care of removing the
> +	 * following VMA if we have VMAs on both sides.
> +	 */
> +	if (vmg->vma && !vma_expand(vmg)) {
> +		khugepaged_enter_vma(vmg->vma, vmg->flags);
> +		return vmg->vma;
> +	}
> +
> +	/* If expansion failed, reset state. Allows us to retry merge later. */
> +	vmg->vma = NULL;
> +	vmg->anon_vma = NULL;
> +	vmg->start = start;
> +	vmg->end = end;
> +	vmg->pgoff = pgoff;
> +	if (vmg->vma == prev)
> +		vma_iter_set(vmg->vmi, start);
> +
> +	return NULL;
> +}

Can we split this up a bit?  I was thinking that, for the brk() case, we
need to know if we can merge prev and if that merge fails.  I was
thinking something like: you create a vmg with whatever, then call
can_merge_prev, and that'd do the block above and change the vmg as
required.  We could have a can_merge_next that does the same, then we
need to prepare the change (dup anon vma, preallocate for maple tree,
locking, whatever), then commit.

There could still be the function above, but with smaller widgets to do
what we need so we gain flexibility in what we decide to check - prev
only in brk().

I'm not sure if we'd need one for expanding vs existing or if we could
check !vmg->vma to figure that out..

This would also have the effect of self-documenting what is going on.
For brk, it would look like this:

if (vmg_expand_prev()) {
	if (vmg_prepare())
		goto no_mem;
	vmg_commit();
}

I think this would change your exposed interface, at least for brk() -
or a wrapper for this - but small widgets may gain us some
self-documenting code?

If you really don't like the exposure of the interface, the vmg could
have a return so we can see if we ran out of memory?
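
One hedged way to do that (names are hypothetical, for illustration only):
let the vmg record why the attempt stopped, so do_brk_flags() could goto
unacct_fail only on allocation failure:

	/* Hypothetical addition to struct vma_merge_struct. */
	enum vma_merge_state {
		VMA_MERGE_SUCCESS,
		VMA_MERGE_NOMERGE,	/* adjacent VMAs not compatible */
		VMA_MERGE_ERROR_NOMEM,	/* e.g. preallocation in vma_expand() failed */
	};

	/* Hypothetical caller side in do_brk_flags(): */
	if (vma_merge_new_vma(&vmg))
		goto out;
	if (vmg.state == VMA_MERGE_ERROR_NOMEM)
		goto unacct_fail;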

> +
>  /*
>   * vma_expand - Expand an existing VMA
>   *
> @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
>   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
>   * vmg->next needs to be handled by the caller.
>   *
> - * Returns: 0 on success
> + * Returns: 0 on success.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> + * - The caller must have set @vmg->prev and @vmg->next.
>   */
>  int vma_expand(struct vma_merge_struct *vmg)
>  {
> @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
>  	return 0;
>  }
>  
> -/*
> - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> - * or for inserting a VMA.
> - *
> - * @vp: The vma_prepare struct
> - * @vmi: The vma iterator
> - * @mm: The mm_struct
> - */
> -void vma_complete(struct vma_prepare *vp,
> -		  struct vma_iterator *vmi, struct mm_struct *mm)
> -{
> -	if (vp->file) {
> -		if (vp->adj_next)
> -			vma_interval_tree_insert(vp->adj_next,
> -						 &vp->mapping->i_mmap);
> -		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> -		flush_dcache_mmap_unlock(vp->mapping);
> -	}
> -
> -	if (vp->remove && vp->file) {
> -		__remove_shared_vm_struct(vp->remove, vp->mapping);
> -		if (vp->remove2)
> -			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> -	} else if (vp->insert) {
> -		/*
> -		 * split_vma has split insert from vma, and needs
> -		 * us to insert it before dropping the locks
> -		 * (it may either follow vma or precede it).
> -		 */
> -		vma_iter_store(vmi, vp->insert);
> -		mm->map_count++;
> -	}
> -
> -	if (vp->anon_vma) {
> -		anon_vma_interval_tree_post_update_vma(vp->vma);
> -		if (vp->adj_next)
> -			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> -		anon_vma_unlock_write(vp->anon_vma);
> -	}
> -
> -	if (vp->file) {
> -		i_mmap_unlock_write(vp->mapping);
> -		uprobe_mmap(vp->vma);
> -
> -		if (vp->adj_next)
> -			uprobe_mmap(vp->adj_next);
> -	}
> -
> -	if (vp->remove) {
> -again:
> -		vma_mark_detached(vp->remove, true);
> -		if (vp->file) {
> -			uprobe_munmap(vp->remove, vp->remove->vm_start,
> -				      vp->remove->vm_end);
> -			fput(vp->file);
> -		}
> -		if (vp->remove->anon_vma)
> -			anon_vma_merge(vp->vma, vp->remove);
> -		mm->map_count--;
> -		mpol_put(vma_policy(vp->remove));
> -		if (!vp->remove2)
> -			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> -		vm_area_free(vp->remove);
> -
> -		/*
> -		 * In mprotect's case 6 (see comments on vma_merge),
> -		 * we are removing both mid and next vmas
> -		 */
> -		if (vp->remove2) {
> -			vp->remove = vp->remove2;
> -			vp->remove2 = NULL;
> -			goto again;
> -		}
> -	}
> -	if (vp->insert && vp->file)
> -		uprobe_mmap(vp->insert);
> -	validate_mm(mm);
> -}
> -
>  /*
>   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
>   * @vmi: The vma iterator
> @@ -1261,20 +1378,6 @@ struct vm_area_struct
>  	return vma_modify(&vmg);
>  }
>  
> -/*
> - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> - * must ensure that [start, end) does not overlap any existing VMA.
> - */
> -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> -{
> -	if (!vmg->prev) {
> -		vmg->prev = vma_prev(vmg->vmi);
> -		vma_iter_set(vmg->vmi, vmg->start);
> -	}
> -
> -	return vma_merge(vmg);
> -}
> -
>  /*
>   * Expand vma by delta bytes, potentially merging with an immediately adjacent
>   * VMA with identical properties.
> @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
>  		.anon_name = anon_vma_name(vma),
>  	};
>  
> -	/* vma is specified as prev, so case 1 or 2 will apply. */
> -	return vma_merge(&vmg);
> +	return vma_merge_new_vma(&vmg);
>  }
>  
>  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>  	struct vm_area_struct *vma = *vmap;
>  	unsigned long vma_start = vma->vm_start;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct vm_area_struct *new_vma, *prev;
> +	struct vm_area_struct *new_vma;
>  	bool faulted_in_anon_vma = true;
>  	VMA_ITERATOR(vmi, mm, addr);
> +	struct vma_merge_struct vmg = {
> +		.vmi = &vmi,
> +		.start = addr,
> +		.end = addr + len,
> +		.flags = vma->vm_flags,
> +		.pgoff = pgoff,
> +		.file = vma->vm_file,
> +		.anon_vma = vma->anon_vma,
> +		.policy = vma_policy(vma),
> +		.uffd_ctx = vma->vm_userfaultfd_ctx,
> +		.anon_name = anon_vma_name(vma),
> +	};
>  
>  	/*
>  	 * If anonymous vma has not yet been faulted, update new pgoff
>  	 * to match new location, to increase its chance of merging.
>  	 */
>  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> -		pgoff = addr >> PAGE_SHIFT;
> +		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
>  		faulted_in_anon_vma = false;
>  	}
>  
> -	new_vma = find_vma_prev(mm, addr, &prev);
> +	new_vma = find_vma_prev(mm, addr, &vmg.prev);
>  	if (new_vma && new_vma->vm_start < addr + len)
>  		return NULL;	/* should never get here */
>  
> -	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> +	vmg.next = vma_next(&vmi);
> +	vma_prev(&vmi);
> +
> +	new_vma = vma_merge_new_vma(&vmg);
> +
>  	if (new_vma) {
>  		/*
>  		 * Source vma may have been merged into new_vma
> diff --git a/mm/vma.h b/mm/vma.h
> index 50459f9e4c7f..bbb173053f34 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
>  /* Required for expand_downwards(). */
>  void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
>  
> -/* Required for do_brk_flags(). */
> -void vma_prepare(struct vma_prepare *vp);
> -
> -/* Required for do_brk_flags(). */
> -void init_vma_prep(struct vma_prepare *vp,
> -		   struct vm_area_struct *vma);
> -
> -/* Required for do_brk_flags(). */
> -void vma_complete(struct vma_prepare *vp,
> -		  struct vma_iterator *vmi, struct mm_struct *mm);
> -
>  int vma_expand(struct vma_merge_struct *vmg);
>  int vma_shrink(struct vma_merge_struct *vmg);
>  
> @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
>  		struct vm_area_struct *next, unsigned long start,
>  		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>  
> -/*
> - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> - *
> - * Required by mmap_region().
> - */
> -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> -
> -/*
> - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> - *
> - * Required by mmap_region() and do_brk_flags().
> - */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> -
>  /* We are about to modify the VMA's flags. */
>  struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
>  					struct vm_area_struct *prev,
> @@ -133,31 +108,7 @@ struct vm_area_struct
>  		       unsigned long new_flags,
>  		       struct vm_userfaultfd_ctx new_ctx);
>  
> -struct vm_area_struct
> -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> -
> -/* Temporary convenience wrapper. */
> -static inline struct vm_area_struct
> -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> -			   struct vm_area_struct *vma, unsigned long start,
> -			   unsigned long end, pgoff_t pgoff)
> -{
> -	struct vma_merge_struct vmg = {
> -		.vmi = vmi,
> -		.prev = prev,
> -		.start = start,
> -		.end = end,
> -		.flags = vma->vm_flags,
> -		.file = vma->vm_file,
> -		.anon_vma = vma->anon_vma,
> -		.pgoff = pgoff,
> -		.policy = vma_policy(vma),
> -		.uffd_ctx = vma->vm_userfaultfd_ctx,
> -		.anon_name = anon_vma_name(vma),
> -	};
> -
> -	return vma_merge_new_vma(&vmg);
> -}
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
>  
>  /*
>   * Temporary wrapper around vma_merge() so we can have a common interface for
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 40797a819d3d..a39a734282d0 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
>  	mas_destroy(&vmi->mas);
>  }
>  
> +static inline
> +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> +{
> +	return mas_next_range(&vmi->mas, ULONG_MAX);
> +}
> +
>  static inline void vm_acct_memory(long pages)
>  {
>  }
> -- 
> 2.45.2
>
Lorenzo Stoakes Aug. 9, 2024, 5:20 p.m. UTC | #9
On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs to, rather than ultimately
> > use vma_merge(), rather use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases
> > avoiding confusion, bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/mmap.c                        |  79 ++---
> >  mm/vma.c                         | 482 +++++++++++++++++++------------
> >  mm/vma.h                         |  51 +---
> >  tools/testing/vma/vma_internal.h |   6 +
> >  4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  {
> >  	struct mm_struct *mm = current->mm;
> >  	struct vm_area_struct *vma = NULL;
> > -	struct vm_area_struct *next, *prev, *merge;
> > -	pgoff_t pglen = len >> PAGE_SHIFT;
> > +	struct vm_area_struct *merge;
> >  	unsigned long charged = 0;
> >  	unsigned long end = addr + len;
> >  	bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		vm_flags |= VM_ACCOUNT;
> >  	}
> >
> > -	next = vmg.next = vma_next(&vmi);
> > -	prev = vmg.prev = vma_prev(&vmi);
> > -	if (vm_flags & VM_SPECIAL) {
> > -		if (prev)
> > -			vma_iter_next_range(&vmi);
> > -		goto cannot_expand;
> > -	}
> > -
> > -	/* Attempt to expand an old mapping */
> > -	/* Check next */
> > -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> > -		vmg.end = next->vm_end;
> > -		vma = vmg.vma = next;
> > -		vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > -		/* We may merge our NULL anon_vma with non-NULL in next. */
> > -		vmg.anon_vma = vma->anon_vma;
> > -	}
> > -
> > -	/* Check prev */
> > -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > -		vmg.start = prev->vm_start;
> > -		vma = vmg.vma = prev;
> > -		vmg.pgoff = prev->vm_pgoff;
> > -	} else if (prev) {
> > -		vma_iter_next_range(&vmi);
> > -	}
> > -
> > -	/* Actually expand, if possible */
> > -	if (vma && !vma_expand(&vmg)) {
> > -		khugepaged_enter_vma(vma, vm_flags);
> > +	vma = vma_merge_new_vma(&vmg);
> > +	if (vma)
> >  		goto expanded;
> > -	}
> > -
> > -	if (vma == prev)
> > -		vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> >  	/*
> >  	 * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		 * If vm_flags changed after call_mmap(), we should try merge
> >  		 * vma again as we may succeed this time.
> >  		 */
> > -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > -							  vma->vm_start, vma->vm_end,
> > -							  vma->vm_pgoff);
> > +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > +			merge = vma_merge_new_vma(&vmg);
> > +
> >  			if (merge) {
> >  				/*
> >  				 * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> >  		vma_iter_set(&vmi, vma->vm_end);
> >  		/* Undo any partial mapping done by a device driver. */
> > -		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > +		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> >  			     vma->vm_end, vma->vm_end, true);
> >  	}
> >  	if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  		unsigned long addr, unsigned long len, unsigned long flags)
> >  {
> >  	struct mm_struct *mm = current->mm;
> > -	struct vma_prepare vp;
> >
> >  	/*
> >  	 * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  	 */
> >  	if (vma && vma->vm_end == addr) {
> >  		struct vma_merge_struct vmg = {
> > +			.vmi = vmi,
> >  			.prev = vma,
> > +			.next = NULL,
> > +			.start = addr,
> > +			.end = addr + len,
> >  			.flags = flags,
> >  			.pgoff = addr >> PAGE_SHIFT,
> > +			.file = vma->vm_file,
> > +			.anon_vma = vma->anon_vma,
> > +			.policy = vma_policy(vma),
> > +			.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +			.anon_name = anon_vma_name(vma),
> >  		};
> >
> > -		if (can_vma_merge_after(&vmg)) {
> > -			vma_iter_config(vmi, vma->vm_start, addr + len);
> > -			if (vma_iter_prealloc(vmi, vma))
> > -				goto unacct_fail;
> > -
> > -			vma_start_write(vma);
> > -
> > -			init_vma_prep(&vp, vma);
> > -			vma_prepare(&vp);
> > -			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > -			vma->vm_end = addr + len;
> > -			vm_flags_set(vma, VM_SOFTDIRTY);
> > -			vma_iter_store(vmi, vma);
> > -
> > -			vma_complete(&vp, vmi, mm);
> > -			khugepaged_enter_vma(vma, flags);
> > +		if (vma_merge_new_vma(&vmg))
> >  			goto out;
>
> This is very convoluted to follow.  It seems vma_merge_new_vma() will do
> what is necessary by finding out that it can merge after, then call
> vma_expand() which calls commit merge(), which sets the iterator to
> vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr
> here.. it's actually set to prev->vm_start in vma_merge_new_vma().

Sorry, it's kind of hard to make a change like this all that lovely to
follow.

The only extra checks before it checks mergeability are the prev check -
we set prev to vma, so it is not NULL (except in the case of the first
vma, which is wasteful, but a one-off) - and the is_special and next
checks.

So this isn't _hugely_ terrible.

As to the vmi positioning... I thought there might be some things we
could improve here :)

However, we set prev == vma here, so vmg->start = vma->vm_start and
vmg->end = addr + len, which is the same as before, right?

I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY)
though... will add that back in. Ugh.
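
Roughly (untested), the do_brk_flags() hunk would then become something
like:

		if (vma_merge_new_vma(&vmg)) {
			/* Reinstate what the open-coded path did. */
			vm_flags_set(vma, VM_SOFTDIRTY);
			goto out;
		}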

Again, it's so frustrating to not have these functions testable. I'd
like to find a way to move things around if possible at some point. But
if we're worried about the call stack, maybe that's not feasible.

>
> This is getting really hard to trace what happens.  I'm also concerned
> that the overhead of following all these checks will cost performance on
> the brk system call?

I'll take a look at mm-tests results.

>
> Maybe we can have a way to set up the vmg and call the right function to
> just make the above happen?  We know with the can_vma_merge_after() that
> it is going to work, so could we just call vma_start_write() and
> commit_merge()?

I'm happy to add an enum or something to set a specific mode if we want,
but maybe worth looking at scalability results first to see if there's
really a regression?

I mean, from our discussions on IRC, it sounds like this is very
possible, so we could figure something out.

>
> Also, vma_merge_new_vma() could fail because it's out of memory so it
> should goto unacct_fail.. but we now don't know if it's because the
> merge wasn't allowed or if we are out of memory..

Yes this is a mistake, damn it. Will fix. Grumble about untestability of
these functions x2.

As per your comment below, I think the simplest way may be to have an
error or outcome field or some such that we can check to see _why_
things failed.
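
E.g. (hypothetical - field name and exact semantics TBD):

struct vma_merge_struct {
	/* ... existing fields ... */

	/* Set by the merge helpers: 0, or e.g. -ENOMEM on allocation failure. */
	int error;
};

Then do_brk_flags() could tell the two failure modes apart:

	if (vma_merge_new_vma(&vmg))
		goto out;
	if (vmg.error == -ENOMEM)
		goto unacct_fail;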

>
> > -		}
> >  	}
> >
> >  	if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> >   *
> >   * We assume the vma may be removed as part of the merge.
> >   */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> >  {
> >  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> >   *
> >   * We assume that vma is not removed as part of the merge.
> >   */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  {
> >  	if (is_mergeable_vma(vmg, false) &&
> >  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  	return false;
> >  }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > +			    struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_allow_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > +				      struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_unmap_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > +	if (vp->file) {
> > +		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > +				      vp->adj_next->vm_end);
> > +
> > +		i_mmap_lock_write(vp->mapping);
> > +		if (vp->insert && vp->insert->vm_file) {
> > +			/*
> > +			 * Put into interval tree now, so instantiated pages
> > +			 * are visible to arm/parisc __flush_dcache_page
> > +			 * throughout; but we cannot insert into address
> > +			 * space until vma start or end is updated.
> > +			 */
> > +			__vma_link_file(vp->insert,
> > +					vp->insert->vm_file->f_mapping);
> > +		}
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_lock_write(vp->anon_vma);
> > +		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > +	}
> > +
> > +	if (vp->file) {
> > +		flush_dcache_mmap_lock(vp->mapping);
> > +		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > +		if (vp->adj_next)
> > +			vma_interval_tree_remove(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +	}
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > +			 struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > +	if (vp->file) {
> > +		if (vp->adj_next)
> > +			vma_interval_tree_insert(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > +		flush_dcache_mmap_unlock(vp->mapping);
> > +	}
> > +
> > +	if (vp->remove && vp->file) {
> > +		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > +		if (vp->remove2)
> > +			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > +	} else if (vp->insert) {
> > +		/*
> > +		 * split_vma has split insert from vma, and needs
> > +		 * us to insert it before dropping the locks
> > +		 * (it may either follow vma or precede it).
> > +		 */
> > +		vma_iter_store(vmi, vp->insert);
> > +		mm->map_count++;
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_interval_tree_post_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > +		anon_vma_unlock_write(vp->anon_vma);
> > +	}
> > +
> > +	if (vp->file) {
> > +		i_mmap_unlock_write(vp->mapping);
> > +		uprobe_mmap(vp->vma);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_mmap(vp->adj_next);
> > +	}
> > +
> > +	if (vp->remove) {
> > +again:
> > +		vma_mark_detached(vp->remove, true);
> > +		if (vp->file) {
> > +			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > +				      vp->remove->vm_end);
> > +			fput(vp->file);
> > +		}
> > +		if (vp->remove->anon_vma)
> > +			anon_vma_merge(vp->vma, vp->remove);
> > +		mm->map_count--;
> > +		mpol_put(vma_policy(vp->remove));
> > +		if (!vp->remove2)
> > +			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > +		vm_area_free(vp->remove);
> > +
> > +		/*
> > +		 * In mprotect's case 6 (see comments on vma_merge),
> > +		 * we are removing both mid and next vmas
> > +		 */
> > +		if (vp->remove2) {
> > +			vp->remove = vp->remove2;
> > +			vp->remove2 = NULL;
> > +			goto again;
> > +		}
> > +	}
> > +	if (vp->insert && vp->file)
> > +		uprobe_mmap(vp->insert);
> > +	validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > +			  struct vm_area_struct *vma)
> > +{
> > +	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> >  /*
> >   * Close a vm structure and free it.
> >   */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> >  	vm_unacct_memory(nr_accounted);
> >  }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma)
> > -{
> > -	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > -				      struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_unmap_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> >  /*
> >   * vma has some anon_vma assigned, and is already inserted on that
> >   * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> >  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> >  }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > -			    struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_allow_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > -	if (vp->file) {
> > -		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > -				      vp->adj_next->vm_end);
> > -
> > -		i_mmap_lock_write(vp->mapping);
> > -		if (vp->insert && vp->insert->vm_file) {
> > -			/*
> > -			 * Put into interval tree now, so instantiated pages
> > -			 * are visible to arm/parisc __flush_dcache_page
> > -			 * throughout; but we cannot insert into address
> > -			 * space until vma start or end is updated.
> > -			 */
> > -			__vma_link_file(vp->insert,
> > -					vp->insert->vm_file->f_mapping);
> > -		}
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_lock_write(vp->anon_vma);
> > -		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > -	}
> > -
> > -	if (vp->file) {
> > -		flush_dcache_mmap_lock(vp->mapping);
> > -		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > -		if (vp->adj_next)
> > -			vma_interval_tree_remove(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -	}
> > -
> > -}
> > -
> >  /*
> >   * dup_anon_vma() - Helper function to duplicate anon_vma
> >   * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> >  }
> >  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + *    EXPAND that VMA:
> > + *
> > + * Proposed:       |-----|  or  |-----|
> > + * Existing:  |----|                  |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + *    EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed:       |-----|
> > + * Existing:  |----|     |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
>
> We still have diagrams, that's too bad.

But they're cute ones! Upgrade right?

>
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + *          to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > +	bool is_special = vmg->flags & VM_SPECIAL;
> > +	struct vm_area_struct *prev = vmg->prev;
> > +	struct vm_area_struct *next = vmg->next;
> > +	unsigned long start = vmg->start;
> > +	unsigned long end = vmg->end;
> > +	pgoff_t pgoff = vmg->pgoff;
> > +	pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > +	VM_WARN_ON(vmg->vma);
> > +
> > +	if (!prev && !next) {
> > +		/*
> > +		 * Since the caller must have determined that the requested
> > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > +		 * immediately prior.
> > +		 */
> > +		next = vmg->next = vma_next(vmg->vmi);
> > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > +		/* Avoid maple tree re-walk. */
> > +		if (is_special && prev)
> > +			vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> > +	if (is_special || (!prev && !next))
> > +		return NULL;
> > +
> > +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> > +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > +		/*
> > +		 * We can adjust this here as can_vma_merge_after() doesn't
> > +		 * touch vmg->end.
> > +		 */
> > +		vmg->end = next->vm_end;
> > +		vmg->vma = next;
> > +		vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > +		vmg->anon_vma = next->anon_vma;
> > +	}
> > +
> > +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> > +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > +		vmg->start = prev->vm_start;
> > +		vmg->vma = prev;
> > +		vmg->pgoff = prev->vm_pgoff;
> > +	} else if (prev) {
> > +		vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/*
> > +	 * Now try to expand adjacent VMA(s). This takes care of removing the
> > +	 * following VMA if we have VMAs on both sides.
> > +	 */
> > +	if (vmg->vma && !vma_expand(vmg)) {
> > +		khugepaged_enter_vma(vmg->vma, vmg->flags);
> > +		return vmg->vma;
> > +	}
> > +
> > +	/* If expansion failed, reset state. Allows us to retry merge later. */
> > +	vmg->vma = NULL;
> > +	vmg->anon_vma = NULL;
> > +	vmg->start = start;
> > +	vmg->end = end;
> > +	vmg->pgoff = pgoff;
> > +	if (vmg->vma == prev)
> > +		vma_iter_set(vmg->vmi, start);
> > +
> > +	return NULL;
> > +}
>
> Can we split this up a bit?  I was thinking that, for the brk() case, we
> need to know if we can merge prev and if that merge fails.  I was
> thinking something that you create a vmg with whatever, then call
> can_merge_prev, and that'd do the block above and change the vmg as
> required.  We could have a can_merge_next that does the same, then we
> need to prepare the change (dup anon vma, preallocate for maple tree,
> locking, whatever), then commit.

Yeah, that's not a bad idea - that could actually really help clarify
what's going on.

Then we could have a sort of state machine that indicates that we've
already adjusted the vmg parameters for the merge.
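
E.g. something like (purely illustrative):

enum vma_merge_state {
	VMA_MERGE_START,	/* vmg still describes the proposed range */
	VMA_MERGE_PREV,		/* vmg adjusted to span prev */
	VMA_MERGE_NEXT,		/* vmg adjusted to span next */
	VMA_MERGE_BOTH,		/* vmg spans both, next is to be removed */
};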

I'm thinking though of a vma_merge_new_vma() / vma_merge_modified_vma()
that invokes different code to figure out how to expand.

I will have a fiddle around and see what I can figure out that makes sense.

>
> There could still be the function above, but with smaller widgets to do
> what we need so we gain flexibility in what we decide to check - prev
> only in brk().
>
> I'm not sure if we'd need one for expanding vs existing or if we could
> check !vmg->vma to figure that out..
>
> This would also have the effect of self-documenting what is going on.
> For brk, it would look like this:
>
> if (vmg_expand_prev()) {
> 	if (vmg_prepare())
> 		goto no_mem;
> 	vmg_commit();
> }
>
> I think this would change your exposed interface, at least for brk() -
> or a wrapper for this, but small widgets may gain us some
> self-documented code?
>
> If you really don't like the exposure of the interface, the vmg could
> have a return so we can see if we ran out of memory?

I really don't like can_vma_merge_xxx() being exposed; it's very
clearly an internal interface.

As mentioned above we can have some kind of way of passing back an error
code.

Obviously if testing indicates stack size/perf is a problem we can
begrudgingly accept the interface leak :'(. Will check that.

>
> > +
> >  /*
> >   * vma_expand - Expand an existing VMA
> >   *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> >   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
> >   * vmg->next needs to be handled by the caller.
> >   *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> >   */
> >  int vma_expand(struct vma_merge_struct *vmg)
> >  {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> >  	return 0;
> >  }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > -	if (vp->file) {
> > -		if (vp->adj_next)
> > -			vma_interval_tree_insert(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > -		flush_dcache_mmap_unlock(vp->mapping);
> > -	}
> > -
> > -	if (vp->remove && vp->file) {
> > -		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > -		if (vp->remove2)
> > -			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > -	} else if (vp->insert) {
> > -		/*
> > -		 * split_vma has split insert from vma, and needs
> > -		 * us to insert it before dropping the locks
> > -		 * (it may either follow vma or precede it).
> > -		 */
> > -		vma_iter_store(vmi, vp->insert);
> > -		mm->map_count++;
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_interval_tree_post_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > -		anon_vma_unlock_write(vp->anon_vma);
> > -	}
> > -
> > -	if (vp->file) {
> > -		i_mmap_unlock_write(vp->mapping);
> > -		uprobe_mmap(vp->vma);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_mmap(vp->adj_next);
> > -	}
> > -
> > -	if (vp->remove) {
> > -again:
> > -		vma_mark_detached(vp->remove, true);
> > -		if (vp->file) {
> > -			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > -				      vp->remove->vm_end);
> > -			fput(vp->file);
> > -		}
> > -		if (vp->remove->anon_vma)
> > -			anon_vma_merge(vp->vma, vp->remove);
> > -		mm->map_count--;
> > -		mpol_put(vma_policy(vp->remove));
> > -		if (!vp->remove2)
> > -			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > -		vm_area_free(vp->remove);
> > -
> > -		/*
> > -		 * In mprotect's case 6 (see comments on vma_merge),
> > -		 * we are removing both mid and next vmas
> > -		 */
> > -		if (vp->remove2) {
> > -			vp->remove = vp->remove2;
> > -			vp->remove2 = NULL;
> > -			goto again;
> > -		}
> > -	}
> > -	if (vp->insert && vp->file)
> > -		uprobe_mmap(vp->insert);
> > -	validate_mm(mm);
> > -}
> > -
> >  /*
> >   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> >   * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> >  	return vma_modify(&vmg);
> >  }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > -	if (!vmg->prev) {
> > -		vmg->prev = vma_prev(vmg->vmi);
> > -		vma_iter_set(vmg->vmi, vmg->start);
> > -	}
> > -
> > -	return vma_merge(vmg);
> > -}
> > -
> >  /*
> >   * Expand vma by delta bytes, potentially merging with an immediately adjacent
> >   * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> >  		.anon_name = anon_vma_name(vma),
> >  	};
> >
> > -	/* vma is specified as prev, so case 1 or 2 will apply. */
> > -	return vma_merge(&vmg);
> > +	return vma_merge_new_vma(&vmg);
> >  }
> >
> >  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> >  	struct vm_area_struct *vma = *vmap;
> >  	unsigned long vma_start = vma->vm_start;
> >  	struct mm_struct *mm = vma->vm_mm;
> > -	struct vm_area_struct *new_vma, *prev;
> > +	struct vm_area_struct *new_vma;
> >  	bool faulted_in_anon_vma = true;
> >  	VMA_ITERATOR(vmi, mm, addr);
> > +	struct vma_merge_struct vmg = {
> > +		.vmi = &vmi,
> > +		.start = addr,
> > +		.end = addr + len,
> > +		.flags = vma->vm_flags,
> > +		.pgoff = pgoff,
> > +		.file = vma->vm_file,
> > +		.anon_vma = vma->anon_vma,
> > +		.policy = vma_policy(vma),
> > +		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +		.anon_name = anon_vma_name(vma),
> > +	};
> >
> >  	/*
> >  	 * If anonymous vma has not yet been faulted, update new pgoff
> >  	 * to match new location, to increase its chance of merging.
> >  	 */
> >  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > -		pgoff = addr >> PAGE_SHIFT;
> > +		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> >  		faulted_in_anon_vma = false;
> >  	}
> >
> > -	new_vma = find_vma_prev(mm, addr, &prev);
> > +	new_vma = find_vma_prev(mm, addr, &vmg.prev);
> >  	if (new_vma && new_vma->vm_start < addr + len)
> >  		return NULL;	/* should never get here */
> >
> > -	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > +	vmg.next = vma_next(&vmi);
> > +	vma_prev(&vmi);
> > +
> > +	new_vma = vma_merge_new_vma(&vmg);
> > +
> >  	if (new_vma) {
> >  		/*
> >  		 * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> >  /* Required for expand_downwards(). */
> >  void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> >  int vma_expand(struct vma_merge_struct *vmg);
> >  int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> >  		struct vm_area_struct *next, unsigned long start,
> >  		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> >  /* We are about to modify the VMA's flags. */
> >  struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> >  					struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> >  		       unsigned long new_flags,
> >  		       struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > -			   struct vm_area_struct *vma, unsigned long start,
> > -			   unsigned long end, pgoff_t pgoff)
> > -{
> > -	struct vma_merge_struct vmg = {
> > -		.vmi = vmi,
> > -		.prev = prev,
> > -		.start = start,
> > -		.end = end,
> > -		.flags = vma->vm_flags,
> > -		.file = vma->vm_file,
> > -		.anon_vma = vma->anon_vma,
> > -		.pgoff = pgoff,
> > -		.policy = vma_policy(vma),
> > -		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > -		.anon_name = anon_vma_name(vma),
> > -	};
> > -
> > -	return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> >  /*
> >   * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> >  	mas_destroy(&vmi->mas);
> >  }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > +	return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> >  static inline void vm_acct_memory(long pages)
> >  {
> >  }
> > --
> > 2.45.2
> >
diff mbox series

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index f6593a81f73d..c03f50f46396 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1363,8 +1363,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct vm_area_struct *next, *prev, *merge;
-	pgoff_t pglen = len >> PAGE_SHIFT;
+	struct vm_area_struct *merge;
 	unsigned long charged = 0;
 	unsigned long end = addr + len;
 	bool writable_file_mapping = false;
@@ -1411,44 +1410,9 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		vm_flags |= VM_ACCOUNT;
 	}
 
-	next = vmg.next = vma_next(&vmi);
-	prev = vmg.prev = vma_prev(&vmi);
-	if (vm_flags & VM_SPECIAL) {
-		if (prev)
-			vma_iter_next_range(&vmi);
-		goto cannot_expand;
-	}
-
-	/* Attempt to expand an old mapping */
-	/* Check next */
-	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
-		/* We can adjust this as can_vma_merge_after() doesn't touch */
-		vmg.end = next->vm_end;
-		vma = vmg.vma = next;
-		vmg.pgoff = next->vm_pgoff - pglen;
-
-		/* We may merge our NULL anon_vma with non-NULL in next. */
-		vmg.anon_vma = vma->anon_vma;
-	}
-
-	/* Check prev */
-	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
-		vmg.start = prev->vm_start;
-		vma = vmg.vma = prev;
-		vmg.pgoff = prev->vm_pgoff;
-	} else if (prev) {
-		vma_iter_next_range(&vmi);
-	}
-
-	/* Actually expand, if possible */
-	if (vma && !vma_expand(&vmg)) {
-		khugepaged_enter_vma(vma, vm_flags);
+	vma = vma_merge_new_vma(&vmg);
+	if (vma)
 		goto expanded;
-	}
-
-	if (vma == prev)
-		vma_iter_set(&vmi, addr);
-cannot_expand:
 
 	/*
 	 * Determine the object being mapped and call the appropriate
@@ -1493,10 +1457,9 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		 * If vm_flags changed after call_mmap(), we should try merge
 		 * vma again as we may succeed this time.
 		 */
-		if (unlikely(vm_flags != vma->vm_flags && prev)) {
-			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
-							  vma->vm_start, vma->vm_end,
-							  vma->vm_pgoff);
+		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
+			merge = vma_merge_new_vma(&vmg);
+
 			if (merge) {
 				/*
 				 * ->mmap() can change vma->vm_file and fput
@@ -1596,7 +1559,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 
 		vma_iter_set(&vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
-		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
+		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
 			     vma->vm_end, vma->vm_end, true);
 	}
 	if (writable_file_mapping)
@@ -1773,7 +1736,6 @@  static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long len, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
-	struct vma_prepare vp;
 
 	/*
 	 * Check against address space limits by the changed size
@@ -1795,29 +1757,22 @@  static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 */
 	if (vma && vma->vm_end == addr) {
 		struct vma_merge_struct vmg = {
+			.vmi = vmi,
 			.prev = vma,
+			.next = NULL,
+			.start = addr,
+			.end = addr + len,
 			.flags = flags,
 			.pgoff = addr >> PAGE_SHIFT,
+			.file = vma->vm_file,
+			.anon_vma = vma->anon_vma,
+			.policy = vma_policy(vma),
+			.uffd_ctx = vma->vm_userfaultfd_ctx,
+			.anon_name = anon_vma_name(vma),
 		};
 
-		if (can_vma_merge_after(&vmg)) {
-			vma_iter_config(vmi, vma->vm_start, addr + len);
-			if (vma_iter_prealloc(vmi, vma))
-				goto unacct_fail;
-
-			vma_start_write(vma);
-
-			init_vma_prep(&vp, vma);
-			vma_prepare(&vp);
-			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
-			vma->vm_end = addr + len;
-			vm_flags_set(vma, VM_SOFTDIRTY);
-			vma_iter_store(vmi, vma);
-
-			vma_complete(&vp, vmi, mm);
-			khugepaged_enter_vma(vma, flags);
+		if (vma_merge_new_vma(&vmg))
 			goto out;
-		}
 	}
 
 	if (vma)
diff --git a/mm/vma.c b/mm/vma.c
index 55615392e8d2..a404cf718f9e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -97,8 +97,7 @@  static void init_multi_vma_prep(struct vma_prepare *vp,
  *
  * We assume the vma may be removed as part of the merge.
  */
-bool
-can_vma_merge_before(struct vma_merge_struct *vmg)
+static bool can_vma_merge_before(struct vma_merge_struct *vmg)
 {
 	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
 
@@ -120,7 +119,7 @@  can_vma_merge_before(struct vma_merge_struct *vmg)
  *
  * We assume that vma is not removed as part of the merge.
  */
-bool can_vma_merge_after(struct vma_merge_struct *vmg)
+static bool can_vma_merge_after(struct vma_merge_struct *vmg)
 {
 	if (is_mergeable_vma(vmg, false) &&
 	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
@@ -130,6 +129,164 @@  bool can_vma_merge_after(struct vma_merge_struct *vmg)
 	return false;
 }
 
+static void __vma_link_file(struct vm_area_struct *vma,
+			    struct address_space *mapping)
+{
+	if (vma_is_shared_maywrite(vma))
+		mapping_allow_writable(mapping);
+
+	flush_dcache_mmap_lock(mapping);
+	vma_interval_tree_insert(vma, &mapping->i_mmap);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Requires inode->i_mapping->i_mmap_rwsem
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+				      struct address_space *mapping)
+{
+	if (vma_is_shared_maywrite(vma))
+		mapping_unmap_writable(mapping);
+
+	flush_dcache_mmap_lock(mapping);
+	vma_interval_tree_remove(vma, &mapping->i_mmap);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * vma_prepare() - Helper function for handling locking VMAs prior to altering
+ * @vp: The initialized vma_prepare struct
+ */
+static void vma_prepare(struct vma_prepare *vp)
+{
+	if (vp->file) {
+		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+
+		if (vp->adj_next)
+			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
+				      vp->adj_next->vm_end);
+
+		i_mmap_lock_write(vp->mapping);
+		if (vp->insert && vp->insert->vm_file) {
+			/*
+			 * Put into interval tree now, so instantiated pages
+			 * are visible to arm/parisc __flush_dcache_page
+			 * throughout; but we cannot insert into address
+			 * space until vma start or end is updated.
+			 */
+			__vma_link_file(vp->insert,
+					vp->insert->vm_file->f_mapping);
+		}
+	}
+
+	if (vp->anon_vma) {
+		anon_vma_lock_write(vp->anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vp->vma);
+		if (vp->adj_next)
+			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
+	}
+
+	if (vp->file) {
+		flush_dcache_mmap_lock(vp->mapping);
+		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+		if (vp->adj_next)
+			vma_interval_tree_remove(vp->adj_next,
+						 &vp->mapping->i_mmap);
+	}
+
+}
+
+/*
+ * vma_complete- Helper function for handling the unlocking after altering VMAs,
+ * or for inserting a VMA.
+ *
+ * @vp: The vma_prepare struct
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ */
+static void vma_complete(struct vma_prepare *vp,
+			 struct vma_iterator *vmi, struct mm_struct *mm)
+{
+	if (vp->file) {
+		if (vp->adj_next)
+			vma_interval_tree_insert(vp->adj_next,
+						 &vp->mapping->i_mmap);
+		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+		flush_dcache_mmap_unlock(vp->mapping);
+	}
+
+	if (vp->remove && vp->file) {
+		__remove_shared_vm_struct(vp->remove, vp->mapping);
+		if (vp->remove2)
+			__remove_shared_vm_struct(vp->remove2, vp->mapping);
+	} else if (vp->insert) {
+		/*
+		 * split_vma has split insert from vma, and needs
+		 * us to insert it before dropping the locks
+		 * (it may either follow vma or precede it).
+		 */
+		vma_iter_store(vmi, vp->insert);
+		mm->map_count++;
+	}
+
+	if (vp->anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vp->vma);
+		if (vp->adj_next)
+			anon_vma_interval_tree_post_update_vma(vp->adj_next);
+		anon_vma_unlock_write(vp->anon_vma);
+	}
+
+	if (vp->file) {
+		i_mmap_unlock_write(vp->mapping);
+		uprobe_mmap(vp->vma);
+
+		if (vp->adj_next)
+			uprobe_mmap(vp->adj_next);
+	}
+
+	if (vp->remove) {
+again:
+		vma_mark_detached(vp->remove, true);
+		if (vp->file) {
+			uprobe_munmap(vp->remove, vp->remove->vm_start,
+				      vp->remove->vm_end);
+			fput(vp->file);
+		}
+		if (vp->remove->anon_vma)
+			anon_vma_merge(vp->vma, vp->remove);
+		mm->map_count--;
+		mpol_put(vma_policy(vp->remove));
+		if (!vp->remove2)
+			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
+		vm_area_free(vp->remove);
+
+		/*
+		 * In mprotect's case 6 (see comments on vma_merge),
+		 * we are removing both mid and next vmas
+		 */
+		if (vp->remove2) {
+			vp->remove = vp->remove2;
+			vp->remove2 = NULL;
+			goto again;
+		}
+	}
+	if (vp->insert && vp->file)
+		uprobe_mmap(vp->insert);
+	validate_mm(mm);
+}
+
+/*
+ * init_vma_prep() - Initializer wrapper for vma_prepare struct
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ */
+static void init_vma_prep(struct vma_prepare *vp,
+			  struct vm_area_struct *vma)
+{
+	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+}
+
 /*
  * Close a vm structure and free it.
  */
@@ -292,31 +449,6 @@  static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
 	vm_unacct_memory(nr_accounted);
 }
 
-/*
- * init_vma_prep() - Initializer wrapper for vma_prepare struct
- * @vp: The vma_prepare struct
- * @vma: The vma that will be altered once locked
- */
-void init_vma_prep(struct vma_prepare *vp,
-		   struct vm_area_struct *vma)
-{
-	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
-}
-
-/*
- * Requires inode->i_mapping->i_mmap_rwsem
- */
-static void __remove_shared_vm_struct(struct vm_area_struct *vma,
-				      struct address_space *mapping)
-{
-	if (vma_is_shared_maywrite(vma))
-		mapping_unmap_writable(mapping);
-
-	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_remove(vma, &mapping->i_mmap);
-	flush_dcache_mmap_unlock(mapping);
-}
-
 /*
  * vma has some anon_vma assigned, and is already inserted on that
  * anon_vma's interval trees.
@@ -349,60 +481,6 @@  anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
 		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
 }
 
-static void __vma_link_file(struct vm_area_struct *vma,
-			    struct address_space *mapping)
-{
-	if (vma_is_shared_maywrite(vma))
-		mapping_allow_writable(mapping);
-
-	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_insert(vma, &mapping->i_mmap);
-	flush_dcache_mmap_unlock(mapping);
-}
-
-/*
- * vma_prepare() - Helper function for handling locking VMAs prior to altering
- * @vp: The initialized vma_prepare struct
- */
-void vma_prepare(struct vma_prepare *vp)
-{
-	if (vp->file) {
-		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
-
-		if (vp->adj_next)
-			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
-				      vp->adj_next->vm_end);
-
-		i_mmap_lock_write(vp->mapping);
-		if (vp->insert && vp->insert->vm_file) {
-			/*
-			 * Put into interval tree now, so instantiated pages
-			 * are visible to arm/parisc __flush_dcache_page
-			 * throughout; but we cannot insert into address
-			 * space until vma start or end is updated.
-			 */
-			__vma_link_file(vp->insert,
-					vp->insert->vm_file->f_mapping);
-		}
-	}
-
-	if (vp->anon_vma) {
-		anon_vma_lock_write(vp->anon_vma);
-		anon_vma_interval_tree_pre_update_vma(vp->vma);
-		if (vp->adj_next)
-			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
-	}
-
-	if (vp->file) {
-		flush_dcache_mmap_lock(vp->mapping);
-		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
-		if (vp->adj_next)
-			vma_interval_tree_remove(vp->adj_next,
-						 &vp->mapping->i_mmap);
-	}
-
-}
-
 /*
  * dup_anon_vma() - Helper function to duplicate anon_vma
  * @dst: The destination VMA
@@ -486,6 +564,120 @@  void validate_mm(struct mm_struct *mm)
 }
 #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
 
+/*
+ * vma_merge_new_vma - Attempt to merge a new VMA into address space
+ *
+ * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
+ *       (exclusive), which we try to merge with any adjacent VMAs if possible.
+ *
+ * We are about to add a VMA to the address space starting at @vmg->start and
+ * ending at @vmg->end. There are three different possible scenarios:
+ *
+ * 1. There is a VMA with identical properties immediately adjacent to the
+ *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
+ *    EXPAND that VMA:
+ *
+ * Proposed:       |-----|  or  |-----|
+ * Existing:  |----|                  |----|
+ *
+ * 2. There are VMAs with identical properties immediately adjacent to the
+ *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
+ *    EXPAND the former and REMOVE the latter:
+ *
+ * Proposed:       |-----|
+ * Existing:  |----|     |----|
+ *
+ * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
+ *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
+ *
+ * In instances where we can merge, this function returns the expanded VMA which
+ * will have its range adjusted accordingly and the underlying maple tree also
+ * adjusted.
+ *
+ * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
+ *          to the VMA we expanded.
+ *
+ * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
+ * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
+ * expanded range.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
+ */
+struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
+{
+	bool is_special = vmg->flags & VM_SPECIAL;
+	struct vm_area_struct *prev = vmg->prev;
+	struct vm_area_struct *next = vmg->next;
+	unsigned long start = vmg->start;
+	unsigned long end = vmg->end;
+	pgoff_t pgoff = vmg->pgoff;
+	pgoff_t pglen = PHYS_PFN(end - start);
+
+	VM_WARN_ON(vmg->vma);
+
+	if (!prev && !next) {
+		/*
+		 * Since the caller must have determined that the requested
+		 * range is empty, vmg->vmi will be left pointing at the VMA
+		 * immediately prior.
+		 */
+		next = vmg->next = vma_next(vmg->vmi);
+		prev = vmg->prev = vma_prev(vmg->vmi);
+
+		/* Avoid maple tree re-walk. */
+		if (is_special && prev)
+			vma_iter_next_range(vmg->vmi);
+	}
+
+	/* If special mapping or no adjacent VMAs, nothing to merge. */
+	if (is_special || (!prev && !next))
+		return NULL;
+
+	/* If we can merge with the following VMA, adjust vmg accordingly. */
+	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
+		/*
+		 * We can adjust this here as can_vma_merge_after() doesn't
+		 * touch vmg->end.
+		 */
+		vmg->end = next->vm_end;
+		vmg->vma = next;
+		vmg->pgoff = next->vm_pgoff - pglen;
+
+		vmg->anon_vma = next->anon_vma;
+	}
+
+	/* If we can merge with the previous VMA, adjust vmg accordingly. */
+	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
+		vmg->start = prev->vm_start;
+		vmg->vma = prev;
+		vmg->pgoff = prev->vm_pgoff;
+	} else if (prev) {
+		vma_iter_next_range(vmg->vmi);
+	}
+
+	/*
+	 * Now try to expand adjacent VMA(s). This takes care of removing the
+	 * following VMA if we have VMAs on both sides.
+	 */
+	if (vmg->vma && !vma_expand(vmg)) {
+		khugepaged_enter_vma(vmg->vma, vmg->flags);
+		return vmg->vma;
+	}
+
+	/* If expansion failed, reset state. Allows us to retry merge later. */
+	vmg->vma = NULL;
+	vmg->anon_vma = NULL;
+	vmg->start = start;
+	vmg->end = end;
+	vmg->pgoff = pgoff;
+	if (vmg->vma == prev)
+		vma_iter_set(vmg->vmi, start);
+
+	return NULL;
+}
+
 /*
  * vma_expand - Expand an existing VMA
  *
@@ -496,7 +688,11 @@  void validate_mm(struct mm_struct *mm)
  * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
  * vmg->next needs to be handled by the caller.
  *
- * Returns: 0 on success
+ * Returns: 0 on success.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
+ * - The caller must have set @vmg->prev and @vmg->next.
  */
 int vma_expand(struct vma_merge_struct *vmg)
 {
@@ -576,85 +772,6 @@  int vma_shrink(struct vma_merge_struct *vmg)
 	return 0;
 }
 
-/*
- * vma_complete- Helper function for handling the unlocking after altering VMAs,
- * or for inserting a VMA.
- *
- * @vp: The vma_prepare struct
- * @vmi: The vma iterator
- * @mm: The mm_struct
- */
-void vma_complete(struct vma_prepare *vp,
-		  struct vma_iterator *vmi, struct mm_struct *mm)
-{
-	if (vp->file) {
-		if (vp->adj_next)
-			vma_interval_tree_insert(vp->adj_next,
-						 &vp->mapping->i_mmap);
-		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
-		flush_dcache_mmap_unlock(vp->mapping);
-	}
-
-	if (vp->remove && vp->file) {
-		__remove_shared_vm_struct(vp->remove, vp->mapping);
-		if (vp->remove2)
-			__remove_shared_vm_struct(vp->remove2, vp->mapping);
-	} else if (vp->insert) {
-		/*
-		 * split_vma has split insert from vma, and needs
-		 * us to insert it before dropping the locks
-		 * (it may either follow vma or precede it).
-		 */
-		vma_iter_store(vmi, vp->insert);
-		mm->map_count++;
-	}
-
-	if (vp->anon_vma) {
-		anon_vma_interval_tree_post_update_vma(vp->vma);
-		if (vp->adj_next)
-			anon_vma_interval_tree_post_update_vma(vp->adj_next);
-		anon_vma_unlock_write(vp->anon_vma);
-	}
-
-	if (vp->file) {
-		i_mmap_unlock_write(vp->mapping);
-		uprobe_mmap(vp->vma);
-
-		if (vp->adj_next)
-			uprobe_mmap(vp->adj_next);
-	}
-
-	if (vp->remove) {
-again:
-		vma_mark_detached(vp->remove, true);
-		if (vp->file) {
-			uprobe_munmap(vp->remove, vp->remove->vm_start,
-				      vp->remove->vm_end);
-			fput(vp->file);
-		}
-		if (vp->remove->anon_vma)
-			anon_vma_merge(vp->vma, vp->remove);
-		mm->map_count--;
-		mpol_put(vma_policy(vp->remove));
-		if (!vp->remove2)
-			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
-		vm_area_free(vp->remove);
-
-		/*
-		 * In mprotect's case 6 (see comments on vma_merge),
-		 * we are removing both mid and next vmas
-		 */
-		if (vp->remove2) {
-			vp->remove = vp->remove2;
-			vp->remove2 = NULL;
-			goto again;
-		}
-	}
-	if (vp->insert && vp->file)
-		uprobe_mmap(vp->insert);
-	validate_mm(mm);
-}
-
 /*
  * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
  * @vmi: The vma iterator
@@ -1261,20 +1378,6 @@  struct vm_area_struct
 	return vma_modify(&vmg);
 }
 
-/*
- * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
- * must ensure that [start, end) does not overlap any existing VMA.
- */
-struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
-{
-	if (!vmg->prev) {
-		vmg->prev = vma_prev(vmg->vmi);
-		vma_iter_set(vmg->vmi, vmg->start);
-	}
-
-	return vma_merge(vmg);
-}
-
 /*
  * Expand vma by delta bytes, potentially merging with an immediately adjacent
  * VMA with identical properties.
@@ -1297,8 +1400,7 @@  struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
 		.anon_name = anon_vma_name(vma),
 	};
 
-	/* vma is specified as prev, so case 1 or 2 will apply. */
-	return vma_merge(&vmg);
+	return vma_merge_new_vma(&vmg);
 }
 
 void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
@@ -1399,24 +1501,40 @@  struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *new_vma, *prev;
+	struct vm_area_struct *new_vma;
 	bool faulted_in_anon_vma = true;
 	VMA_ITERATOR(vmi, mm, addr);
+	struct vma_merge_struct vmg = {
+		.vmi = &vmi,
+		.start = addr,
+		.end = addr + len,
+		.flags = vma->vm_flags,
+		.pgoff = pgoff,
+		.file = vma->vm_file,
+		.anon_vma = vma->anon_vma,
+		.policy = vma_policy(vma),
+		.uffd_ctx = vma->vm_userfaultfd_ctx,
+		.anon_name = anon_vma_name(vma),
+	};
 
 	/*
 	 * If anonymous vma has not yet been faulted, update new pgoff
 	 * to match new location, to increase its chance of merging.
 	 */
 	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
-		pgoff = addr >> PAGE_SHIFT;
+		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
 		faulted_in_anon_vma = false;
 	}
 
-	new_vma = find_vma_prev(mm, addr, &prev);
+	new_vma = find_vma_prev(mm, addr, &vmg.prev);
 	if (new_vma && new_vma->vm_start < addr + len)
 		return NULL;	/* should never get here */
 
-	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
+	vmg.next = vma_next(&vmi);
+	vma_prev(&vmi);
+
+	new_vma = vma_merge_new_vma(&vmg);
+
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/vma.h b/mm/vma.h
index 50459f9e4c7f..bbb173053f34 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -55,17 +55,6 @@  void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
 /* Required for expand_downwards(). */
 void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
 
-/* Required for do_brk_flags(). */
-void vma_prepare(struct vma_prepare *vp);
-
-/* Required for do_brk_flags(). */
-void init_vma_prep(struct vma_prepare *vp,
-		   struct vm_area_struct *vma);
-
-/* Required for do_brk_flags(). */
-void vma_complete(struct vma_prepare *vp,
-		  struct vma_iterator *vmi, struct mm_struct *mm);
-
 int vma_expand(struct vma_merge_struct *vmg);
 int vma_shrink(struct vma_merge_struct *vmg);
 
@@ -85,20 +74,6 @@  void unmap_region(struct mm_struct *mm, struct ma_state *mas,
 		struct vm_area_struct *next, unsigned long start,
 		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
 
-/*
- * Can we merge the VMA described by vmg into the following VMA vmg->next?
- *
- * Required by mmap_region().
- */
-bool can_vma_merge_before(struct vma_merge_struct *vmg);
-
-/*
- * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
- *
- * Required by mmap_region() and do_brk_flags().
- */
-bool can_vma_merge_after(struct vma_merge_struct *vmg);
-
 /* We are about to modify the VMA's flags. */
 struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
 					struct vm_area_struct *prev,
@@ -133,31 +108,7 @@  struct vm_area_struct
 		       unsigned long new_flags,
 		       struct vm_userfaultfd_ctx new_ctx);
 
-struct vm_area_struct
-*vma_merge_new_vma(struct vma_merge_struct *vmg);
-
-/* Temporary convenience wrapper. */
-static inline struct vm_area_struct
-*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
-			   struct vm_area_struct *vma, unsigned long start,
-			   unsigned long end, pgoff_t pgoff)
-{
-	struct vma_merge_struct vmg = {
-		.vmi = vmi,
-		.prev = prev,
-		.start = start,
-		.end = end,
-		.flags = vma->vm_flags,
-		.file = vma->vm_file,
-		.anon_vma = vma->anon_vma,
-		.pgoff = pgoff,
-		.policy = vma_policy(vma),
-		.uffd_ctx = vma->vm_userfaultfd_ctx,
-		.anon_name = anon_vma_name(vma),
-	};
-
-	return vma_merge_new_vma(&vmg);
-}
+struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
 
 /*
  * Temporary wrapper around vma_merge() so we can have a common interface for
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 40797a819d3d..a39a734282d0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -709,6 +709,12 @@  static inline void vma_iter_free(struct vma_iterator *vmi)
 	mas_destroy(&vmi->mas);
 }
 
+static inline
+struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
+{
+	return mas_next_range(&vmi->mas, ULONG_MAX);
+}
+
 static inline void vm_acct_memory(long pages)
 {
 }
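
For reference, the calling convention this patch converges on can be seen in the copy_vma() hunk above: the caller describes the proposed range in a struct vma_merge_struct, positions the iterator, and lets vma_merge_new_vma() decide whether an adjacent VMA can simply be expanded. The sketch below is illustrative only and is not part of the patch; the wrapper try_merge_new_range() and its src parameter are hypothetical, while the struct fields and the helpers used (VMA_ITERATOR(), find_vma_prev(), vma_next(), vma_prev(), vma_merge_new_vma()) are taken from the diff.

/*
 * Illustrative only -- not part of the patch. A hypothetical caller,
 * modelled on the copy_vma() hunk above, offering the new, unmapped
 * range [addr, addr + len) for merging with its neighbours.
 */
static struct vm_area_struct *try_merge_new_range(struct mm_struct *mm,
		struct vm_area_struct *src, unsigned long addr,
		unsigned long len, pgoff_t pgoff)
{
	struct vm_area_struct *existing;
	VMA_ITERATOR(vmi, mm, addr);
	struct vma_merge_struct vmg = {
		.vmi = &vmi,
		.start = addr,
		.end = addr + len,
		.flags = src->vm_flags,
		.pgoff = pgoff,
		.file = src->vm_file,
		.anon_vma = src->anon_vma,
		.policy = vma_policy(src),
		.uffd_ctx = src->vm_userfaultfd_ctx,
		.anon_name = anon_vma_name(src),
	};

	/* The range offered for merging must not overlap an existing VMA. */
	existing = find_vma_prev(mm, addr, &vmg.prev);
	if (existing && existing->vm_start < addr + len)
		return NULL;

	/* Record the following VMA and step the iterator back, mirroring copy_vma(). */
	vmg.next = vma_next(&vmi);
	vma_prev(&vmi);

	/*
	 * Returns the expanded prev/next VMA if an adjacent VMA could absorb
	 * the range, or NULL if the caller must create a fresh VMA itself.
	 */
	return vma_merge_new_vma(&vmg);
}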