[v4,21/66] mm/mmap: Change do_brk_flags() to expand existing VMA and add do_brk_munmap()

Message ID: 20211201142918.921493-22-Liam.Howlett@oracle.com
State: New
Series: Introducing the Maple Tree

Commit Message

Liam R. Howlett Dec. 1, 2021, 2:29 p.m. UTC
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>

Avoid allocating a new VMA when a VMA modification can occur.  When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to
split or coalesce.  This avoids unnecessary allocations/frees of maple
tree nodes and VMAs.

Use the advanced API for the maple tree to avoid unnecessary walks of
the tree.

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
 mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 207 insertions(+), 51 deletions(-)
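
For readers following along, the "advanced API" pattern the commit message
refers to looks roughly like the sketch below. This is illustrative only, not
code from the patch; it assumes a maple tree indexed by address ranges the way
mm->mm_mt is used in this series, and hypothetical locals vma, mm and new_end.

	/* Expanding an existing VMA is a single range store: the slot that
	 * already holds vma simply grows to cover the new last address, so
	 * no node split/coalesce and no new VMA allocation is needed. */
	MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);

	mas.last = new_end - 1;	/* widen the range to be stored */
	/* A single store updates the entry in place; it may still have to
	 * allocate internal nodes, hence the gfp flags and error return. */
	if (mas_store_gfp(&mas, vma, GFP_KERNEL))
		return -ENOMEM;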

Comments

Vlastimil Babka Jan. 13, 2022, 12:59 p.m. UTC | #1
On 12/1/21 15:29, Liam Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> 
> Avoid allocating a new VMA when a VMA modification can occur.  When a
> brk() can expand or contract a VMA, then the single store operation will
> only modify one index of the maple tree instead of causing a node to
> split or coalesce.  This avoids unnecessary allocations/frees of maple
> tree nodes and VMAs.
> 
> Use the advanced API for the maple tree to avoid unnecessary walks of
> the tree.
> 
> Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> ---
>  mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 207 insertions(+), 51 deletions(-)
> 
> +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
> +			unsigned long addr, unsigned long len,
> +			unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> -	struct vm_area_struct *vma, *prev;
> -	pgoff_t pgoff = addr >> PAGE_SHIFT;
> +	struct vm_area_struct *prev = NULL;
>  	int error;
>  	unsigned long mapped_addr;
>  	validate_mm_mt(mm);
> @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	if (error)
>  		return error;
>  
> -	/* Clear old maps, set up prev and uf */
> -	if (munmap_vma_range(mm, addr, len, &prev, uf))
> -		return -ENOMEM;
> -
> -	/* Check against address space limits *after* clearing old maps... */
> +	/* Check against address space limits by the changed size */

Can that cause spurious ENOMEM because now the check assumes 'len' worth of
purely new pages and no reuse?

>  	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
>  		return -ENOMEM;
>  
> @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
>  		return -ENOMEM;
>  
> -	/* Can we just expand an old private anonymous mapping? */
> -	vma = vma_merge(mm, prev, addr, addr + len, flags,
> -			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
> -	if (vma)
> -		goto out;
> +	mas->last = addr + len - 1;
> +	if (vma) {
> +		/* Expand the existing vma if possible; almost never a singular
> +		 * list, so this will almost always fail. */
>  
> -	/*
> -	 * create a vma struct for an anonymous mapping
> -	 */
> -	vma = vm_area_alloc(mm);
> -	if (!vma) {
> -		vm_unacct_memory(len >> PAGE_SHIFT);
> -		return -ENOMEM;
> +		if ((!vma->anon_vma ||
> +		     list_is_singular(&vma->anon_vma_chain)) &&

Hmm, I feel uneasy about this part that mimics what vma_merge() does. It
looks like something we can easily forget to adjust when changing
vma_merge() itself.
Is this optimization worth the trouble given the comment above "so this will
almost always fail"?

> +		     ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
> +			mas->index = vma->vm_start;
> +
> +			vma_adjust_trans_huge(vma, addr, addr + len, 0);
> +			if (vma->anon_vma) {
> +				anon_vma_lock_write(vma->anon_vma);
> +				anon_vma_interval_tree_pre_update_vma(vma);
> +			}
> +			vma->vm_end = addr + len;
> +			vma->vm_flags |= VM_SOFTDIRTY;
> +			if (mas_store_gfp(mas, vma, GFP_KERNEL))
> +				goto mas_mod_fail;
> +
> +			if (vma->anon_vma) {
> +				anon_vma_interval_tree_post_update_vma(vma);
> +				anon_vma_unlock_write(vma->anon_vma);
> +			}
> +			khugepaged_enter_vma_merge(vma, flags);
> +			goto out;
> +		}
> +		prev = vma;
>  	}
> +	mas->index = addr;
> +	mas_walk(mas);
> +
> +	/* create a vma struct for an anonymous mapping */
> +	vma = vm_area_alloc(mm);
> +	if (!vma)
> +		goto vma_alloc_fail;
>  
>  	vma_set_anonymous(vma);
>  	vma->vm_start = addr;
>  	vma->vm_end = addr + len;
> -	vma->vm_pgoff = pgoff;
> +	vma->vm_pgoff = addr >> PAGE_SHIFT;
>  	vma->vm_flags = flags;
>  	vma->vm_page_prot = vm_get_page_prot(flags);
> -	vma_link(mm, vma, prev);
> +	if (vma_mas_store(vma, mas))
> +		goto mas_store_fail;
> +
> +	if (!prev)
> +		prev = mas_prev(mas, 0);
> +
> +	__vma_link_list(mm, vma, prev);
> +	mm->map_count++;
>  out:
>  	perf_event_mmap(vma);
>  	mm->total_vm += len >> PAGE_SHIFT;
> @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	vma->vm_flags |= VM_SOFTDIRTY;
>  	validate_mm_mt(mm);
>  	return 0;
> +
> +mas_store_fail:
> +	vm_area_free(vma);
> +vma_alloc_fail:
> +	vm_unacct_memory(len >> PAGE_SHIFT);
> +	return -ENOMEM;
> +
> +mas_mod_fail:
> +	vma->vm_end = addr;
> +	if (vma->anon_vma) {
> +		anon_vma_interval_tree_post_update_vma(vma);
> +		anon_vma_unlock_write(vma->anon_vma);
> +	}
> +	return -ENOMEM;
> +
>  }
>  
>  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma = NULL;
>  	unsigned long len;
>  	int ret;
>  	bool populate;
> -	LIST_HEAD(uf);
> +	MA_STATE(mas, &mm->mm_mt, addr, addr);
>  
>  	len = PAGE_ALIGN(request);
>  	if (len < request)
> @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>  	if (mmap_write_lock_killable(mm))
>  		return -EINTR;
>  
> -	ret = do_brk_flags(addr, len, flags, &uf);
> +	// This vma left intentionally blank.

This comment is using unintentionally bad syntax (// vs /* */)

Also if we leave it blank, it means this path won't ever expand an existing
vma, while previously the vma_merge could succeed, no? Or are all callers of
vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have
a more verbose comment...

> +	mas_walk(&mas);
> +	ret = do_brk_flags(&mas, vma, addr, len, flags);
>  	populate = ((mm->def_flags & VM_LOCKED) != 0);
>  	mmap_write_unlock(mm);
> -	userfaultfd_unmap_complete(mm, &uf);

Looks like this part is removed completely from vm_brk_flags() paths?

OK it seems the whole patch makes some assumption that vm_brk_flags() never
has to unmap a pre-existing area, and in the brk() syscall this is now
delegated to do_brk_munmap(), and do_brk_flags() loses the support. While it
might be safe, it should be discussed in the patch that vm_brk_flags()
didn't actually need to support the unmap part, because x y z. And best if
there are some DEBUG_VM based assertions supporting that.
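
For example, a minimal sketch of such an assertion (hypothetical, not part of
the posted patch) could sit at the top of vm_brk_flags():

	/* do_brk_flags() no longer unmaps old mappings, so assert that the
	 * requested range does not intersect any existing VMA. */
	VM_WARN_ON(find_vma_intersection(mm, addr, addr + len) != NULL);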

But then again, is the optimized scenario happening often enough to warrant it?

>  	if (populate && !ret)
>  		mm_populate(addr, len);
>  	return ret;
Vlastimil Babka Jan. 13, 2022, 3:28 p.m. UTC | #2
On 12/1/21 15:29, Liam Howlett wrote:
> @@ -1989,6 +2013,7 @@ EXPORT_SYMBOL(get_unmapped_area);
>  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  {
>  	struct vm_area_struct *vma;
> +	MA_STATE(mas, &mm->mm_mt, addr, addr);
>  
>  	mmap_assert_locked(mm);
>  	/* Check the cache first. */
> @@ -1996,7 +2021,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  	if (likely(vma))
>  		return vma;
>  
> -	vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX);
> +	vma = mas_find(&mas, -1);
>  	if (vma)
>  		vmacache_update(addr, vma);
>  	return vma;

Oh and this change to find_vma() was supposed to go to the next patch, no?
Liam R. Howlett Jan. 19, 2022, 3:03 a.m. UTC | #3
* Vlastimil Babka <vbabka@suse.cz> [220113 07:59]:
> On 12/1/21 15:29, Liam Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> > 
> > Avoid allocating a new VMA when a VMA modification can occur.  When a
> > brk() can expand or contract a VMA, then the single store operation will
> > only modify one index of the maple tree instead of causing a node to
> > split or coalesce.  This avoids unnecessary allocations/frees of maple
> > tree nodes and VMAs.
> > 
> > Use the advanced API for the maple tree to avoid unnecessary walks of
> > the tree.
> > 
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> > ---
> >  mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
> >  1 file changed, 207 insertions(+), 51 deletions(-)
> > 
> > +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
> > +			unsigned long addr, unsigned long len,
> > +			unsigned long flags)
> >  {
> >  	struct mm_struct *mm = current->mm;
> > -	struct vm_area_struct *vma, *prev;
> > -	pgoff_t pgoff = addr >> PAGE_SHIFT;
> > +	struct vm_area_struct *prev = NULL;
> >  	int error;
> >  	unsigned long mapped_addr;
> >  	validate_mm_mt(mm);
> > @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> >  	if (error)
> >  		return error;
> >  
> > -	/* Clear old maps, set up prev and uf */
> > -	if (munmap_vma_range(mm, addr, len, &prev, uf))
> > -		return -ENOMEM;
> > -
> > -	/* Check against address space limits *after* clearing old maps... */
> > +	/* Check against address space limits by the changed size */
> 
> Can that cause spurious ENOMEM because now the check assumes 'len' worth of
> purely new pages and no reuse?


I don't think so?  I must be missing how anything could exist in this
range to begin with?  The brk syscall checks to ensure there is enough
room and the other two users are the elf and a.out loaders - could
either of those two map over parts of themselves on load?  This seemed
to be there primarily to set up for an rb insert (set prev, rb_link,
rb_parent) in the current code.  Sort of like how get_unmapped_area()
with MAP_FIXED appears to be used to sanitize the addr and len.


> 
> >  	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
> >  		return -ENOMEM;
> >  
> > @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> >  	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
> >  		return -ENOMEM;
> >  
> > -	/* Can we just expand an old private anonymous mapping? */
> > -	vma = vma_merge(mm, prev, addr, addr + len, flags,
> > -			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
> > -	if (vma)
> > -		goto out;
> > +	mas->last = addr + len - 1;
> > +	if (vma) {
> > +		/* Expand the existing vma if possible; almost never a singular
> > +		 * list, so this will almost always fail. */
> >  
> > -	/*
> > -	 * create a vma struct for an anonymous mapping
> > -	 */
> > -	vma = vm_area_alloc(mm);
> > -	if (!vma) {
> > -		vm_unacct_memory(len >> PAGE_SHIFT);
> > -		return -ENOMEM;
> > +		if ((!vma->anon_vma ||
> > +		     list_is_singular(&vma->anon_vma_chain)) &&
> 
> Hmm, I feel uneasy about this part that mimics what vma_merge() does. It
> looks like something we can easily forget to adjust when changing
> vma_merge() itself.

vma_merge() is overly heavy for what brk() is doing.  I get what you are
saying about it potentially being missed and I think brk is already in
the 'potentially overlooked' category as it stands today.  Honestly, one
less user of vma_merge() (and thus fewer __vma_adjust() users) seems like
a win to me.

> Is this optimization worth the trouble given the comment above "so this will
> almost always fail"?

vma_merge() walks the tree for next and next->next and does a lot of
extra checks before arriving at the conclusion that this will fail.
Maybe 'almost always fail' is too strong wording; on boot of my VM, 63
expands happen out of 517, so 87.8% fail this test.

> 
> > +		     ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
> > +			mas->index = vma->vm_start;
> > +
> > +			vma_adjust_trans_huge(vma, addr, addr + len, 0);
> > +			if (vma->anon_vma) {
> > +				anon_vma_lock_write(vma->anon_vma);
> > +				anon_vma_interval_tree_pre_update_vma(vma);
> > +			}
> > +			vma->vm_end = addr + len;
> > +			vma->vm_flags |= VM_SOFTDIRTY;
> > +			if (mas_store_gfp(mas, vma, GFP_KERNEL))
> > +				goto mas_mod_fail;
> > +
> > +			if (vma->anon_vma) {
> > +				anon_vma_interval_tree_post_update_vma(vma);
> > +				anon_vma_unlock_write(vma->anon_vma);
> > +			}
> > +			khugepaged_enter_vma_merge(vma, flags);
> > +			goto out;
> > +		}
> > +		prev = vma;
> >  	}
> > +	mas->index = addr;
> > +	mas_walk(mas);
> > +
> > +	/* create a vma struct for an anonymous mapping */
> > +	vma = vm_area_alloc(mm);
> > +	if (!vma)
> > +		goto vma_alloc_fail;
> >  
> >  	vma_set_anonymous(vma);
> >  	vma->vm_start = addr;
> >  	vma->vm_end = addr + len;
> > -	vma->vm_pgoff = pgoff;
> > +	vma->vm_pgoff = addr >> PAGE_SHIFT;
> >  	vma->vm_flags = flags;
> >  	vma->vm_page_prot = vm_get_page_prot(flags);
> > -	vma_link(mm, vma, prev);
> > +	if (vma_mas_store(vma, mas))
> > +		goto mas_store_fail;
> > +
> > +	if (!prev)
> > +		prev = mas_prev(mas, 0);
> > +
> > +	__vma_link_list(mm, vma, prev);
> > +	mm->map_count++;
> >  out:
> >  	perf_event_mmap(vma);
> >  	mm->total_vm += len >> PAGE_SHIFT;
> > @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> >  	vma->vm_flags |= VM_SOFTDIRTY;
> >  	validate_mm_mt(mm);
> >  	return 0;
> > +
> > +mas_store_fail:
> > +	vm_area_free(vma);
> > +vma_alloc_fail:
> > +	vm_unacct_memory(len >> PAGE_SHIFT);
> > +	return -ENOMEM;
> > +
> > +mas_mod_fail:
> > +	vma->vm_end = addr;
> > +	if (vma->anon_vma) {
> > +		anon_vma_interval_tree_post_update_vma(vma);
> > +		anon_vma_unlock_write(vma->anon_vma);
> > +	}
> > +	return -ENOMEM;
> > +
> >  }
> >  
> >  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
> >  {
> >  	struct mm_struct *mm = current->mm;
> > +	struct vm_area_struct *vma = NULL;
> >  	unsigned long len;
> >  	int ret;
> >  	bool populate;
> > -	LIST_HEAD(uf);
> > +	MA_STATE(mas, &mm->mm_mt, addr, addr);
> >  
> >  	len = PAGE_ALIGN(request);
> >  	if (len < request)
> > @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
> >  	if (mmap_write_lock_killable(mm))
> >  		return -EINTR;
> >  
> > -	ret = do_brk_flags(addr, len, flags, &uf);
> > +	// This vma left intentionally blank.
> 
> This comment is using unintentionally bad syntax (// vs /* */)

Ha!  Thanks!

> 
> Also if we leave it blank, it means this path won't ever expand an existing
> vma, while previously the vma_merge could succeed, no? Or are all callers of
> vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have
> a more verbose comment...

Is it possible that other code paths lead to a vma merge?  From what I
can tell the other entry points are for the BSS or after loading a
binary anyways.  I guess the next vma could be anon and have matching
flags, but I think they will all have a vma->vm_file.  In fact, if I
change do_brk_flags() to check !vma->vm_file and pass the vma through
in the case of vma->vm_end == addr, then it works - but there is no
merging from this code path that I can see on boot.  If you think this
is necessary, I can add it in, but I don't think it's needed.
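
Concretely, the check described above would look something like this
hypothetical sketch (not what the posted patch does):

	/* Hypothetical expand test: additionally require an anonymous vma
	 * (no backing file), and only pass the vma through when it ends
	 * exactly at the new mapping's start address. */
	if (vma && vma->vm_end == addr && !vma->vm_file &&
	    (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
	    ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
		/* safe to expand the existing vma in place */
	}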


> 
> > +	mas_walk(&mas);
> > +	ret = do_brk_flags(&mas, vma, addr, len, flags);
> >  	populate = ((mm->def_flags & VM_LOCKED) != 0);
> >  	mmap_write_unlock(mm);
> > -	userfaultfd_unmap_complete(mm, &uf);
> 
> Looks like this part is removed completely from vm_brk_flags() paths?
> 

When I removed the call to munmap_vma_range(), I dropped the userfaultfd
here as there was no need.


> OK it seems the whole patch makes some assumption that vm_brk_flags() never
> has to unmap a pre-existing area, and in the brk() syscall this is now
> delegated to do_brk_munmap(), and do_brk_flags() loses the support. While it
> might be safe, it should be discussed in the patch that vm_brk_flags()
> didn't actually need to support the unmap part, because x y z. And best if
> there are some DEBUG_VM based assertions supporting that.

Yes, I do believe this to be true.  I don't think do_brk_flags() needs
to unmap.  If you look at the brk() syscall, it actually ensures there is
enough room for the expansion + vm_start_gap(next), so that one is
probably safe.  brk() already depends on do_brk_munmap() doing the
unmap.  The other callers are the elf and a.out loaders, which also
don't appear to use the functionality.  I thought the call was only for
setting up for insertion into the rbtree.

> 
> But then again, is the optimized scenario happening often enough to warrant it?

well, 12.2% use the optimization to the fullest, the rest fail faster.
I am really after the faster failure optimization here.  I especially do
not like the fact that vma_merge() gets the next vma and the next->next
vma prior to seeing if it can be merged.  I get why, but is there really
going to be an anon vma with the right flags, no file, etc, etc, etc
often enough to try this?  In fact, it's not possible at all to need
next->next when we unmap the area first.  Out of the 8 cases in
vma_merge, only 1, 2, and 3 are possible.  Of the 3 possibilities, I am
suggesting that 2 is really the only one we should check as 1 and 3 are
so unlikely.
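
For reference, paraphrasing the case diagram in the vma_merge() comment in
mm/mmap.c (AAAA is the new area, PPPP the predecessor, NNNN the successor):
case 1 merges AAAA with both PPPP and NNNN into one vma, case 2 extends PPPP
forward over AAAA, and case 3 extends NNNN backward over AAAA. The brk()
expansion handled here is case 2: the brk vma's vm_end simply grows.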


> 
> >  	if (populate && !ret)
> >  		mm_populate(addr, len);
> >  	return ret;
>
Liam R. Howlett Jan. 19, 2022, 3:51 p.m. UTC | #4
* Vlastimil Babka <vbabka@suse.cz> [220113 10:28]:
> On 12/1/21 15:29, Liam Howlett wrote:
> > @@ -1989,6 +2013,7 @@ EXPORT_SYMBOL(get_unmapped_area);
> >  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> >  {
> >  	struct vm_area_struct *vma;
> > +	MA_STATE(mas, &mm->mm_mt, addr, addr);
> >  
> >  	mmap_assert_locked(mm);
> >  	/* Check the cache first. */
> > @@ -1996,7 +2021,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> >  	if (likely(vma))
> >  		return vma;
> >  
> > -	vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX);
> > +	vma = mas_find(&mas, -1);
> >  	if (vma)
> >  		vmacache_update(addr, vma);
> >  	return vma;
> 
> Oh and this change to find_vma() was supposed to go to the next patch, no?


Yes, thanks.  I will relocate this change to the next patch.
Vlastimil Babka Jan. 21, 2022, 12:41 p.m. UTC | #5
On 1/19/22 04:03, Liam Howlett wrote:
> * Vlastimil Babka <vbabka@suse.cz> [220113 07:59]:
>> On 12/1/21 15:29, Liam Howlett wrote:
>> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
>> > 
>> > Avoid allocating a new VMA when a VMA modification can occur.  When a
>> > brk() can expand or contract a VMA, then the single store operation will
>> > only modify one index of the maple tree instead of causing a node to
>> > split or coalesce.  This avoids unnecessary allocations/frees of maple
>> > tree nodes and VMAs.
>> > 
>> > Use the advanced API for the maple tree to avoid unnecessary walks of
>> > the tree.
>> > 
>> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
>> > ---
>> >  mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
>> >  1 file changed, 207 insertions(+), 51 deletions(-)
>> > 
>> > +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
>> > +			unsigned long addr, unsigned long len,
>> > +			unsigned long flags)
>> >  {
>> >  	struct mm_struct *mm = current->mm;
>> > -	struct vm_area_struct *vma, *prev;
>> > -	pgoff_t pgoff = addr >> PAGE_SHIFT;
>> > +	struct vm_area_struct *prev = NULL;
>> >  	int error;
>> >  	unsigned long mapped_addr;
>> >  	validate_mm_mt(mm);
>> > @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>> >  	if (error)
>> >  		return error;
>> >  
>> > -	/* Clear old maps, set up prev and uf */
>> > -	if (munmap_vma_range(mm, addr, len, &prev, uf))
>> > -		return -ENOMEM;
>> > -
>> > -	/* Check against address space limits *after* clearing old maps... */
>> > +	/* Check against address space limits by the changed size */
>> 
>> Can that cause spurious ENOMEM because now the check assumes 'len' worth of
>> purely new pages and no reuse?
> 
> 
> I don't think so?  I must be missing how anything could exist in this
> range to begin with?

Well the comment including "*after*" made it look like somebody was careful
for a good reason. But it's possible that it's outdated, of course. So it's
generally good to explain in such changes how it was evaluated that it's now ok.

> The brk syscall checks to ensure there is enough
> room and the other two users are the elf and a.out loaders - could
> either of those two map over parts of themselves on load?

IIRC some past changes between MAP_FIXED/MAP_FIXED_NOREPLACE made us realize
that some loaders do, see e.g. 5f501d555653 ("binfmt_elf: reintroduce using
MAP_FIXED_NOREPLACE")

But I guess we can also assume that during the initial elf loading, we are
not even close to the limits and it's unlikely to manifest as a problem
somewhere...

> This seemed
> to be there primarily to set up for an rb insert (set prev, rb_link,
> rb_parent) in the current code.  Sort of like how get_unmapped_area()
> with MAP_FIXED appears to be used to sanitize the addr and len.
> 
> 
>> 
>> >  	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
>> >  		return -ENOMEM;
>> >  
>> > @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>> >  	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
>> >  		return -ENOMEM;
>> >  
>> > -	/* Can we just expand an old private anonymous mapping? */
>> > -	vma = vma_merge(mm, prev, addr, addr + len, flags,
>> > -			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
>> > -	if (vma)
>> > -		goto out;
>> > +	mas->last = addr + len - 1;
>> > +	if (vma) {
>> > +		/* Expand the existing vma if possible; almost never a singular
>> > +		 * list, so this will almost always fail. */
>> >  
>> > -	/*
>> > -	 * create a vma struct for an anonymous mapping
>> > -	 */
>> > -	vma = vm_area_alloc(mm);
>> > -	if (!vma) {
>> > -		vm_unacct_memory(len >> PAGE_SHIFT);
>> > -		return -ENOMEM;
>> > +		if ((!vma->anon_vma ||
>> > +		     list_is_singular(&vma->anon_vma_chain)) &&
>> 
>> Hmm, I feel uneasy about this part that mimics what vma_merge() does. It
>> looks like something we can easily forget to adjust when changing
>> vma_merge() itself.
> 
> vma_merge() is overly heavy for what brk() is doing.  I get what you are
> saying about it potentially being missed and I think brk is already in
> the 'potentially overlooked' category as it stands today.  Honestly, one
> less user of vma_merge() (and thus fewer __vma_adjust() users) seems like
> a win to me.
> 
>> Is this optimization worth the trouble given the comment above "so this will
>> almost always fail"?
> 
> vma_merge() walks the tree for next and next->next and does a lot of
> extra checks before arriving at the conclusion that this will fail.
> Maybe 'almost always fail' is too strong wording; on boot of my VM, 63
> expands happen out of 517, so 87.8% fail this test.

OK, guess we can live with it; I think you already mentioned there's a plan for
future cleanups in this area :)

>> 
>> > +		     ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
>> > +			mas->index = vma->vm_start;
>> > +
>> > +			vma_adjust_trans_huge(vma, addr, addr + len, 0);
>> > +			if (vma->anon_vma) {
>> > +				anon_vma_lock_write(vma->anon_vma);
>> > +				anon_vma_interval_tree_pre_update_vma(vma);
>> > +			}
>> > +			vma->vm_end = addr + len;
>> > +			vma->vm_flags |= VM_SOFTDIRTY;
>> > +			if (mas_store_gfp(mas, vma, GFP_KERNEL))
>> > +				goto mas_mod_fail;
>> > +
>> > +			if (vma->anon_vma) {
>> > +				anon_vma_interval_tree_post_update_vma(vma);
>> > +				anon_vma_unlock_write(vma->anon_vma);
>> > +			}
>> > +			khugepaged_enter_vma_merge(vma, flags);
>> > +			goto out;
>> > +		}
>> > +		prev = vma;
>> >  	}
>> > +	mas->index = addr;
>> > +	mas_walk(mas);
>> > +
>> > +	/* create a vma struct for an anonymous mapping */
>> > +	vma = vm_area_alloc(mm);
>> > +	if (!vma)
>> > +		goto vma_alloc_fail;
>> >  
>> >  	vma_set_anonymous(vma);
>> >  	vma->vm_start = addr;
>> >  	vma->vm_end = addr + len;
>> > -	vma->vm_pgoff = pgoff;
>> > +	vma->vm_pgoff = addr >> PAGE_SHIFT;
>> >  	vma->vm_flags = flags;
>> >  	vma->vm_page_prot = vm_get_page_prot(flags);
>> > -	vma_link(mm, vma, prev);
>> > +	if (vma_mas_store(vma, mas))
>> > +		goto mas_store_fail;
>> > +
>> > +	if (!prev)
>> > +		prev = mas_prev(mas, 0);
>> > +
>> > +	__vma_link_list(mm, vma, prev);
>> > +	mm->map_count++;
>> >  out:
>> >  	perf_event_mmap(vma);
>> >  	mm->total_vm += len >> PAGE_SHIFT;
>> > @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>> >  	vma->vm_flags |= VM_SOFTDIRTY;
>> >  	validate_mm_mt(mm);
>> >  	return 0;
>> > +
>> > +mas_store_fail:
>> > +	vm_area_free(vma);
>> > +vma_alloc_fail:
>> > +	vm_unacct_memory(len >> PAGE_SHIFT);
>> > +	return -ENOMEM;
>> > +
>> > +mas_mod_fail:
>> > +	vma->vm_end = addr;
>> > +	if (vma->anon_vma) {
>> > +		anon_vma_interval_tree_post_update_vma(vma);
>> > +		anon_vma_unlock_write(vma->anon_vma);
>> > +	}
>> > +	return -ENOMEM;
>> > +
>> >  }
>> >  
>> >  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>> >  {
>> >  	struct mm_struct *mm = current->mm;
>> > +	struct vm_area_struct *vma = NULL;
>> >  	unsigned long len;
>> >  	int ret;
>> >  	bool populate;
>> > -	LIST_HEAD(uf);
>> > +	MA_STATE(mas, &mm->mm_mt, addr, addr);
>> >  
>> >  	len = PAGE_ALIGN(request);
>> >  	if (len < request)
>> > @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>> >  	if (mmap_write_lock_killable(mm))
>> >  		return -EINTR;
>> >  
>> > -	ret = do_brk_flags(addr, len, flags, &uf);
>> > +	// This vma left intentionally blank.
>> 
>> This comment is using unintentionally bad syntax (// vs /* */)
> 
> Ha!  Thanks!
> 
>> 
>> Also if we leave it blank, it means this path won't ever expand an existing
>> vma, while previously the vma_merge could succeed, no? Or are all callers of
>> vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have
>> a more verbose comment...
> 
> Is it possible that other code paths lead to a vma merge?  From what I
> can tell the other entry points are for the BSS or after loading a
> binary anyways.  I guess the next vma could be anon and have matching
> flags, but I think they will all have a vma->vm_file.  In fact, if I
>> change do_brk_flags() to check !vma->vm_file and pass the vma through
>> in the case of vma->vm_end == addr, then it works - but there is no
>> merging from this code path that I can see on boot.  If you think this
> is necessary, I can add it in, but I don't think it's needed.

Looks like this is all changed again towards the end of the series anyway...

>> 
>> > +	mas_walk(&mas);
>> > +	ret = do_brk_flags(&mas, vma, addr, len, flags);
>> >  	populate = ((mm->def_flags & VM_LOCKED) != 0);
>> >  	mmap_write_unlock(mm);
>> > -	userfaultfd_unmap_complete(mm, &uf);
>> 
>> Looks like this part is removed completely from vm_brk_flags() paths?
>> 
> 
> When I removed the call to munmap_vma_range(), I dropped the userfaultfd
> here as there was no need.
> 
> 
>> OK it seems the whole patch makes some assumption that vm_brk_flags() never
>> has to unmap a pre-existing area, and in the brk() syscall this is now
>> delegated to do_brk_munmap(), and do_brk_flags() loses the support. While it
>> might be safe, it should be discussed in the patch that vm_brk_flags()
>> didn't actually need to support the unmap part, because x y z. And best if
>> there are some DEBUG_VM based assertions supporting that.
> 
> Yes, I do believe this to be true.  I don't think do_brk_flags() needs
> to unmap.  If you look at the brk() syscall, it actually ensures there is
> enough room for the expansion + vm_start_gap(next), so that one is
> probably safe.  brk() already depends on do_brk_munmap() doing the
> unmap.  The other callers are the elf and a.out loaders, which also
> don't appear to use the functionality.

Hopefully that doesn't rely on assuming no elf segments overlap, as
discussed above.

>   I thought the call was only for
> setting up for insertion into the rbtree.
> 
>> 
>> But then again, is the optimized scenario happening often enough to warrant it?
> 
> well, 12.2% use the optimization to the fullest, the rest fail faster.
> I am really after the faster failure optimization here.  I especially do
> not like the fact that vma_merge() gets the next vma and the next->next
> vma prior to seeing if it can be merged.  I get why, but is there really
> going to be an anon vma with the right flags, no file, etc, etc, etc
> often enough to try this?  In fact, it's not possible at all to need
> next->next when we unmap the area first.  Out of the 8 cases in
> vma_merge, only 1, 2, and 3 are possible.  Of the 3 possibilities, I am
> suggesting that 2 is really the only one we should check as 1 and 3 are
> so unlikely.
> 
> 
>> 
>> >  	if (populate && !ret)
>> >  		mm_populate(addr, len);
>> >  	return ret;
>>

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index c5f92666d145..e4c8ce377f2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,17 +188,22 @@  static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
-		struct list_head *uf);
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+			 unsigned long newbrk, unsigned long oldbrk,
+			 struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+			unsigned long addr, unsigned long request,
+			unsigned long flags);
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
 	unsigned long newbrk, oldbrk, origbrk;
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *next;
+	struct vm_area_struct *brkvma, *next = NULL;
 	unsigned long min_brk;
 	bool populate;
 	bool downgraded = false;
 	LIST_HEAD(uf);
+	MA_STATE(mas, &mm->mm_mt, 0, 0);
 
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
@@ -238,37 +243,56 @@  SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto success;
 	}
 
-	/*
-	 * Always allow shrinking brk.
-	 * __do_munmap() may downgrade mmap_lock to read.
-	 */
-	if (brk <= mm->brk) {
+	mas_set(&mas, newbrk);
+	brkvma = mas_walk(&mas);
+	if (brkvma) { // munmap necessary, there is something at newbrk.
+		/*
+		 * Always allow shrinking brk.
+		 * do_brk_munmap() may downgrade mmap_lock to read.
+		 */
 		int ret;
 
+		if (brkvma->vm_start >= oldbrk)
+			goto out; // mapping intersects with an existing non-brk vma.
 		/*
-		 * mm->brk must to be protected by write mmap_lock so update it
-		 * before downgrading mmap_lock. When __do_munmap() fails,
-		 * mm->brk will be restored from origbrk.
+		 * mm->brk must be protected by write mmap_lock.
+		 * do_brk_munmap() may downgrade the lock,  so update it
+		 * before calling do_brk_munmap().
 		 */
 		mm->brk = brk;
-		ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
-		if (ret < 0) {
-			mm->brk = origbrk;
-			goto out;
-		} else if (ret == 1) {
+		mas.last = oldbrk - 1;
+		ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+		if (ret == 1)  {
 			downgraded = true;
-		}
-		goto success;
-	}
+			goto success;
+		} else if (!ret)
+			goto success;
 
+		mm->brk = origbrk;
+		goto out;
+	}
+	/* Only check if the next VMA is within the stack_guard_gap of the
+	 * expansion area */
+	next = mas_next(&mas, newbrk + PAGE_SIZE + stack_guard_gap);
 	/* Check against existing mmap mappings. */
-	next = find_vma(mm, oldbrk);
 	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
 		goto out;
 
+	brkvma = mas_prev(&mas, mm->start_brk);
+	if (brkvma) {
+		if (brkvma->vm_start >= oldbrk)
+			goto out; // Trying to map over another vma.
+
+		if (brkvma->vm_end <= min_brk) {
+			brkvma = NULL;
+			mas_reset(&mas);
+		}
+	}
+
 	/* Ok, looks good - let it rip. */
-	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+	if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
 		goto out;
+
 	mm->brk = brk;
 
 success:
@@ -1989,6 +2013,7 @@  EXPORT_SYMBOL(get_unmapped_area);
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma;
+	MA_STATE(mas, &mm->mm_mt, addr, addr);
 
 	mmap_assert_locked(mm);
 	/* Check the cache first. */
@@ -1996,7 +2021,7 @@  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 	if (likely(vma))
 		return vma;
 
-	vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX);
+	vma = mas_find(&mas, -1);
 	if (vma)
 		vmacache_update(addr, vma);
 	return vma;
@@ -2713,16 +2738,105 @@  SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 }
 
 /*
- *  this is really a simplified "do_mmap".  it only handles
- *  anonymous maps.  eventually we may be able to do some
- *  brk-specific accounting here.
+ * do_brk_munmap() - Unmap a partial vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Unmaps a partial VMA mapping.  Does not handle alignment; downgrades the
+ * lock if possible.
+ * Returns: 1 on success.
+ */
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+			 unsigned long newbrk, unsigned long oldbrk,
+			 struct list_head *uf)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct unmap;
+	unsigned long unmap_pages;
+	int ret = 1;
+
+	arch_unmap(mm, newbrk, oldbrk);
+
+	if (likely(vma->vm_start >= newbrk)) { // remove entire mapping(s)
+		mas_set(mas, newbrk);
+		if (vma->vm_start != newbrk)
+			mas_reset(mas); // cause a re-walk for the first overlap.
+		ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
+		goto munmap_full_vma;
+	}
+
+	vma_init(&unmap, mm);
+	unmap.vm_start = newbrk;
+	unmap.vm_end = oldbrk;
+	ret = userfaultfd_unmap_prep(&unmap, newbrk, oldbrk, uf);
+	if (ret)
+		return ret;
+	ret = 1;
+
+	// Change the oldbrk of vma to the newbrk of the munmap area
+	vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0);
+	if (vma->anon_vma) {
+		anon_vma_lock_write(vma->anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vma);
+	}
+
+	vma->vm_end = newbrk;
+	if (vma_mas_remove(&unmap, mas))
+		goto mas_store_fail;
+
+	vmacache_invalidate(vma->vm_mm);
+	if (vma->anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		anon_vma_unlock_write(vma->anon_vma);
+	}
+
+	unmap_pages = vma_pages(&unmap);
+	if (unmap.vm_flags & VM_LOCKED) {
+		mm->locked_vm -= unmap_pages;
+		munlock_vma_pages_range(&unmap, newbrk, oldbrk);
+	}
+
+	mmap_write_downgrade(mm);
+	unmap_region(mm, &unmap, vma, newbrk, oldbrk);
+	/* Statistics */
+	vm_stat_account(mm, unmap.vm_flags, -unmap_pages);
+	if (unmap.vm_flags & VM_ACCOUNT)
+		vm_unacct_memory(unmap_pages);
+
+munmap_full_vma:
+	validate_mm_mt(mm);
+	return ret;
+
+mas_store_fail:
+	vma->vm_end = oldbrk;
+	if (vma->anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		anon_vma_unlock_write(vma->anon_vma);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @vma: The vma to expand, or NULL
+ * @addr: The start address
+ * @len: The length of the increase
+ * @flags: The VMA flags
+ *
+ * Extend the brk VMA from addr to addr + len.  If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA.  Eventually we may be able to
+ * do some brk-specific accounting here.
  */
-static int do_brk_flags(unsigned long addr, unsigned long len,
-			unsigned long flags, struct list_head *uf)
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+			unsigned long addr, unsigned long len,
+			unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *prev;
-	pgoff_t pgoff = addr >> PAGE_SHIFT;
+	struct vm_area_struct *prev = NULL;
 	int error;
 	unsigned long mapped_addr;
 	validate_mm_mt(mm);
@@ -2740,11 +2854,7 @@  static int do_brk_flags(unsigned long addr, unsigned long len,
 	if (error)
 		return error;
 
-	/* Clear old maps, set up prev and uf */
-	if (munmap_vma_range(mm, addr, len, &prev, uf))
-		return -ENOMEM;
-
-	/* Check against address space limits *after* clearing old maps... */
+	/* Check against address space limits by the changed size */
 	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
@@ -2754,28 +2864,57 @@  static int do_brk_flags(unsigned long addr, unsigned long len,
 	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	/* Can we just expand an old private anonymous mapping? */
-	vma = vma_merge(mm, prev, addr, addr + len, flags,
-			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
-	if (vma)
-		goto out;
+	mas->last = addr + len - 1;
+	if (vma) {
+		/* Expand the existing vma if possible; almost never a singular
+		 * list, so this will almost always fail. */
 
-	/*
-	 * create a vma struct for an anonymous mapping
-	 */
-	vma = vm_area_alloc(mm);
-	if (!vma) {
-		vm_unacct_memory(len >> PAGE_SHIFT);
-		return -ENOMEM;
+		if ((!vma->anon_vma ||
+		     list_is_singular(&vma->anon_vma_chain)) &&
+		     ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
+			mas->index = vma->vm_start;
+
+			vma_adjust_trans_huge(vma, addr, addr + len, 0);
+			if (vma->anon_vma) {
+				anon_vma_lock_write(vma->anon_vma);
+				anon_vma_interval_tree_pre_update_vma(vma);
+			}
+			vma->vm_end = addr + len;
+			vma->vm_flags |= VM_SOFTDIRTY;
+			if (mas_store_gfp(mas, vma, GFP_KERNEL))
+				goto mas_mod_fail;
+
+			if (vma->anon_vma) {
+				anon_vma_interval_tree_post_update_vma(vma);
+				anon_vma_unlock_write(vma->anon_vma);
+			}
+			khugepaged_enter_vma_merge(vma, flags);
+			goto out;
+		}
+		prev = vma;
 	}
+	mas->index = addr;
+	mas_walk(mas);
+
+	/* create a vma struct for an anonymous mapping */
+	vma = vm_area_alloc(mm);
+	if (!vma)
+		goto vma_alloc_fail;
 
 	vma_set_anonymous(vma);
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
-	vma->vm_pgoff = pgoff;
+	vma->vm_pgoff = addr >> PAGE_SHIFT;
 	vma->vm_flags = flags;
 	vma->vm_page_prot = vm_get_page_prot(flags);
-	vma_link(mm, vma, prev);
+	if (vma_mas_store(vma, mas))
+		goto mas_store_fail;
+
+	if (!prev)
+		prev = mas_prev(mas, 0);
+
+	__vma_link_list(mm, vma, prev);
+	mm->map_count++;
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -2785,15 +2924,31 @@  static int do_brk_flags(unsigned long addr, unsigned long len,
 	vma->vm_flags |= VM_SOFTDIRTY;
 	validate_mm_mt(mm);
 	return 0;
+
+mas_store_fail:
+	vm_area_free(vma);
+vma_alloc_fail:
+	vm_unacct_memory(len >> PAGE_SHIFT);
+	return -ENOMEM;
+
+mas_mod_fail:
+	vma->vm_end = addr;
+	if (vma->anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		anon_vma_unlock_write(vma->anon_vma);
+	}
+	return -ENOMEM;
+
 }
 
 int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = NULL;
 	unsigned long len;
 	int ret;
 	bool populate;
-	LIST_HEAD(uf);
+	MA_STATE(mas, &mm->mm_mt, addr, addr);
 
 	len = PAGE_ALIGN(request);
 	if (len < request)
@@ -2804,10 +2959,11 @@  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
 
-	ret = do_brk_flags(addr, len, flags, &uf);
+	// This vma left intentionally blank.
+	mas_walk(&mas);
+	ret = do_brk_flags(&mas, vma, addr, len, flags);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	mmap_write_unlock(mm);
-	userfaultfd_unmap_complete(mm, &uf);
 	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;