[v3,13/16] mm/mmap: Avoid zeroing vma tree in mmap_region()

Message ID 20240704182718.2653918-14-Liam.Howlett@oracle.com (mailing list archive)
State New
Series Avoid MAP_FIXED gap exposure

Commit Message

Liam R. Howlett July 4, 2024, 6:27 p.m. UTC
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>

Instead of zeroing the vma tree and then overwriting the area, let the
area be overwritten and then clean up the gathered vmas using
vms_complete_munmap_vmas().

In the case of a driver mapping over existing vmas, the PTEs are cleared
using the helper vms_complete_pte_clear().

Temporarily keep track of the number of pages that will be removed and
reduce the charged amount.

This also drops the validate_mm() call in the vma_expand() function.
It is necessary to drop the validate as it would fail since the mm
map_count would be incorrect during a vma expansion, prior to the
cleanup from vms_complete_munmap_vmas().

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
 mm/internal.h |  1 +
 mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
 2 files changed, 37 insertions(+), 25 deletions(-)

Comments

Lorenzo Stoakes July 8, 2024, 12:18 p.m. UTC | #1
On Thu, Jul 04, 2024 at 02:27:15PM GMT, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
>
> Instead of zeroing the vma tree and then overwriting the area, let the
> area be overwritten and then clean up the gathered vmas using
> vms_complete_munmap_vmas().
>
> In the case of a driver mapping over existing vmas, the PTEs are cleared
> using the helper vms_complete_pte_clear().
>
> Temporarily keep track of the number of pages that will be removed and
> reduce the charged amount.
>
> This also drops the validate_mm() call in the vma_expand() function.
> It is necessary to drop the validate as it would fail since the mm
> map_count would be incorrect during a vma expansion, prior to the
> cleanup from vms_complete_munmap_vmas().
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> ---
>  mm/internal.h |  1 +
>  mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
>  2 files changed, 37 insertions(+), 25 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 4c9f06669cc4..fae4a1bba732 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
>  	unsigned long stack_vm;
>  	unsigned long data_vm;
>  	bool unlock;			/* Unlock after the munmap */
> +	bool cleared_ptes;		/* If the PTE are cleared already */
>  };
>
>  void __meminit __init_single_page(struct page *page, unsigned long pfn,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5d458c5f080e..0c334eeae8cd 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
>  }
>
>  static unsigned long count_vma_pages_range(struct mm_struct *mm,
> -		unsigned long addr, unsigned long end)
> +		unsigned long addr, unsigned long end,
> +		unsigned long *nr_accounted)
>  {
>  	VMA_ITERATOR(vmi, mm, addr);
>  	struct vm_area_struct *vma;
>  	unsigned long nr_pages = 0;
>
> +	*nr_accounted = 0;
>  	for_each_vma_range(vmi, vma, end) {
>  		unsigned long vm_start = max(addr, vma->vm_start);
>  		unsigned long vm_end = min(end, vma->vm_end);
>
>  		nr_pages += PHYS_PFN(vm_end - vm_start);
> +		if (vma->vm_flags & VM_ACCOUNT)
> +			*nr_accounted += PHYS_PFN(vm_end - vm_start);

We're duplicating the PHYS_PFN(vm_end - vm_start) thing, probably worth
adding something like:

unsigned long num_pages = PHYS_PFN(vm_end - vm_start);
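
i.e. within the loop, something along these lines (a sketch only - the
local's name is whatever you prefer):

	for_each_vma_range(vmi, vma, end) {
		unsigned long vm_start = max(addr, vma->vm_start);
		unsigned long vm_end = min(end, vma->vm_end);
		/* Convert once, use for both counters. */
		unsigned long num_pages = PHYS_PFN(vm_end - vm_start);

		nr_pages += num_pages;
		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += num_pages;
	}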

Side-note, but it'd be nice to sort out the inconsistency of PHYS_PFN()
vs. (end - start) >> PAGE_SHIFT. This is probably not a huge deal though...

>  	}
>
>  	return nr_pages;
> @@ -522,6 +526,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
>  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
>  	vms->unmap_start = FIRST_USER_ADDRESS;
>  	vms->unmap_end = USER_PGTABLES_CEILING;
> +	vms->cleared_ptes = false;
>  }
>
>  /*
> @@ -730,7 +735,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	vma_iter_store(vmi, vma);
>
>  	vma_complete(&vp, vmi, vma->vm_mm);
> -	validate_mm(vma->vm_mm);

Since we're dropping this here, do we need to re-add this back somewhere
where we are confident the state will be consistent?

>  	return 0;
>
>  nomem:
> @@ -2612,6 +2616,9 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
>  {
>  	struct mmu_gather tlb;
>
> +	if (vms->cleared_ptes)
> +		return;
> +
>  	/*
>  	 * We can free page tables without write-locking mmap_lock because VMAs
>  	 * were isolated before we downgraded mmap_lock.
> @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
>  	mas_set(mas_detach, 1);
>  	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
>  	tlb_finish_mmu(&tlb);
> +	vms->cleared_ptes = true;
>  }
>
>  /*
> @@ -2936,24 +2944,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	unsigned long merge_start = addr, merge_end = end;
>  	bool writable_file_mapping = false;
>  	pgoff_t vm_pgoff;
> -	int error;
> +	int error = -ENOMEM;
>  	VMA_ITERATOR(vmi, mm, addr);
> +	unsigned long nr_pages, nr_accounted;
>
> -	/* Check against address space limit. */
> -	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> -		unsigned long nr_pages;
> -
> -		/*
> -		 * MAP_FIXED may remove pages of mappings that intersects with
> -		 * requested mapping. Account for the pages it would unmap.
> -		 */
> -		nr_pages = count_vma_pages_range(mm, addr, end);
> -
> -		if (!may_expand_vm(mm, vm_flags,
> -					(len >> PAGE_SHIFT) - nr_pages))
> -			return -ENOMEM;
> -	}
> +	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
>
> +	/* Check against address space limit. */
> +	/*
> +	 * MAP_FIXED may remove pages of mappings that intersects with requested
> +	 * mapping. Account for the pages it would unmap.
> +	 */

Utter pedantry, but could these comments be combined? Bit ugly to have one
after another like this.

> +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> +		return -ENOMEM;
>
>  	if (unlikely(!can_modify_mm(mm, addr, end)))
>  		return -EPERM;
> @@ -2971,14 +2974,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
>  			return -ENOMEM;
>
> -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> -			return -ENOMEM;
> -
> -		vms_complete_munmap_vmas(&vms, &mas_detach);
>  		next = vms.next;
>  		prev = vms.prev;
>  		vma = NULL;
>  	} else {
> +		/* Minimal setup of vms */
> +		vms.nr_pages = 0;

I'm not a huge fan of having vms be uninitialised other than this field and
then to rely on no further code change accidentally using an uninitialised
field. This is kind of asking for bugs.

Can we not find a way to sensibly initialise it somehow?

>  		next = vma_next(&vmi);
>  		prev = vma_prev(&vmi);
>  		if (prev)
> @@ -2990,8 +2991,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 */
>  	if (accountable_mapping(file, vm_flags)) {
>  		charged = len >> PAGE_SHIFT;
> +		charged -= nr_accounted;
>  		if (security_vm_enough_memory_mm(mm, charged))
> -			return -ENOMEM;
> +			goto abort_munmap;
> +		vms.nr_accounted = 0;

This is kind of expanding the 'vms possibly uninitialised apart from selected
fields' pattern, makes me worry.

>  		vm_flags |= VM_ACCOUNT;
>  	}
>
> @@ -3040,10 +3043,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 * not unmapped, but the maps are removed from the list.
>  	 */
>  	vma = vm_area_alloc(mm);
> -	if (!vma) {
> -		error = -ENOMEM;
> +	if (!vma)
>  		goto unacct_error;
> -	}
>
>  	vma_iter_config(&vmi, addr, end);
>  	vma_set_range(vma, addr, end, pgoff);
> @@ -3052,6 +3053,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
>  	if (file) {
>  		vma->vm_file = get_file(file);
> +		/* call_mmap() map PTE, so ensure there are no existing PTEs */

Typo? Should this be 'call_mmap() maps PTEs, so ensure there are no
existing PTEs'? I feel like this could be reworded something like:

'call_map() may map PTEs, so clear any that may be pending unmap ahead of
time.'

> +		if (vms.nr_pages)
> +			vms_complete_pte_clear(&vms, &mas_detach, true);
>  		error = call_mmap(file, vma);
>  		if (error)
>  			goto unmap_and_free_vma;
> @@ -3142,6 +3146,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  expanded:
>  	perf_event_mmap(vma);
>
> +	if (vms.nr_pages)
> +		vms_complete_munmap_vmas(&vms, &mas_detach);
> +

Hang on, if we already did this in the if (file) branch above, might we end
up calling this twice? I didn't see vms.nr_pages get set to zero or
decremented anywhere (unless I missed it)?

>  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>  	if (vm_flags & VM_LOCKED) {
>  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> @@ -3189,6 +3196,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  unacct_error:
>  	if (charged)
>  		vm_unacct_memory(charged);
> +
> +abort_munmap:
> +	if (vms.nr_pages)
> +		abort_munmap_vmas(&mas_detach);
>  	validate_mm(mm);
>  	return error;
>  }
> --
> 2.43.0
>

In general I like the approach and you've made it very clear how you've
altered this behaviour.

However I have a few concerns (as well as some trivial comments) above. With
those cleared up we'll be good to go!

Liam R. Howlett July 8, 2024, 7:10 p.m. UTC | #2
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240708 08:18]:
> On Thu, Jul 04, 2024 at 02:27:15PM GMT, Liam R. Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> >
> > Instead of zeroing the vma tree and then overwriting the area, let the
> > area be overwritten and then clean up the gathered vmas using
> > vms_complete_munmap_vmas().
> >
> > In the case of a driver mapping over existing vmas, the PTEs are cleared
> > using the helper vms_complete_pte_clear().
> >
> > Temporarily keep track of the number of pages that will be removed and
> > reduce the charged amount.
> >
> > This also drops the validate_mm() call in the vma_expand() function.
> > It is necessary to drop the validate as it would fail since the mm
> > map_count would be incorrect during a vma expansion, prior to the
> > cleanup from vms_complete_munmap_vmas().
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> > ---
> >  mm/internal.h |  1 +
> >  mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
> >  2 files changed, 37 insertions(+), 25 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 4c9f06669cc4..fae4a1bba732 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
> >  	unsigned long stack_vm;
> >  	unsigned long data_vm;
> >  	bool unlock;			/* Unlock after the munmap */
> > +	bool cleared_ptes;		/* If the PTE are cleared already */
> >  };
> >
> >  void __meminit __init_single_page(struct page *page, unsigned long pfn,
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 5d458c5f080e..0c334eeae8cd 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> >  }
> >
> >  static unsigned long count_vma_pages_range(struct mm_struct *mm,
> > -		unsigned long addr, unsigned long end)
> > +		unsigned long addr, unsigned long end,
> > +		unsigned long *nr_accounted)
> >  {
> >  	VMA_ITERATOR(vmi, mm, addr);
> >  	struct vm_area_struct *vma;
> >  	unsigned long nr_pages = 0;
> >
> > +	*nr_accounted = 0;
> >  	for_each_vma_range(vmi, vma, end) {
> >  		unsigned long vm_start = max(addr, vma->vm_start);
> >  		unsigned long vm_end = min(end, vma->vm_end);
> >
> >  		nr_pages += PHYS_PFN(vm_end - vm_start);
> > +		if (vma->vm_flags & VM_ACCOUNT)
> > +			*nr_accounted += PHYS_PFN(vm_end - vm_start);
> 
> We're duplicating the PHYS_PFN(vm_end - vm_start) thing, probably worth
> adding something like:
> 
> unsigned long num_pages = PHYS_PFN(vm_end - vm_start);
> 
> Side-note, but it'd be nice to sort out the inconsistency of PHYS_PFN()
> vs. (end - start) >> PAGE_SHIFT. This is probably not a huge deal though...

I split this out into another patch for easier reviewing.

> 
> >  	}
> >
> >  	return nr_pages;
> > @@ -522,6 +526,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> >  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> >  	vms->unmap_start = FIRST_USER_ADDRESS;
> >  	vms->unmap_end = USER_PGTABLES_CEILING;
> > +	vms->cleared_ptes = false;
> >  }
> >
> >  /*
> > @@ -730,7 +735,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  	vma_iter_store(vmi, vma);
> >
> >  	vma_complete(&vp, vmi, vma->vm_mm);
> > -	validate_mm(vma->vm_mm);
> 
> Since we're dropping this here, do we need to re-add this back somewhere
> where we are confident the state will be consistent?

The vma_expand() function is used in two places - one is in the mmap.c
file which can no longer validate the mm until the munmap is complete.
The other is in fs/exec.c which cannot call the validate_mm().  So
to add this call back, I'd have to add a wrapper to vma_expand() to call
the validate_mm() function for debug builds.

Really all this code in fs/exec.c doesn't belong there so we don't need
to do an extra function wrapper just to call validate_mm(). And you have
a patch to do that which is out for review!
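
Just to show the shape of what that wrapper would be (hypothetical, not
something I'm proposing - the name is made up, and it assumes vma_expand()'s
current argument list):

	/* Would exist only so debug builds could validate after an expand. */
	static int vma_expand_and_validate(struct vma_iterator *vmi,
			struct vm_area_struct *vma, unsigned long start,
			unsigned long end, pgoff_t pgoff,
			struct vm_area_struct *next)
	{
		int ret = vma_expand(vmi, vma, start, end, pgoff, next);

		validate_mm(vma->vm_mm);
		return ret;
	}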

> 
> >  	return 0;
> >
> >  nomem:
> > @@ -2612,6 +2616,9 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> >  {
> >  	struct mmu_gather tlb;
> >
> > +	if (vms->cleared_ptes)
> > +		return;
> > +
> >  	/*
> >  	 * We can free page tables without write-locking mmap_lock because VMAs
> >  	 * were isolated before we downgraded mmap_lock.
> > @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> >  	mas_set(mas_detach, 1);
> >  	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> >  	tlb_finish_mmu(&tlb);
> > +	vms->cleared_ptes = true;
> >  }
> >
> >  /*
> > @@ -2936,24 +2944,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  	unsigned long merge_start = addr, merge_end = end;
> >  	bool writable_file_mapping = false;
> >  	pgoff_t vm_pgoff;
> > -	int error;
> > +	int error = -ENOMEM;
> >  	VMA_ITERATOR(vmi, mm, addr);
> > +	unsigned long nr_pages, nr_accounted;
> >
> > -	/* Check against address space limit. */
> > -	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > -		unsigned long nr_pages;
> > -
> > -		/*
> > -		 * MAP_FIXED may remove pages of mappings that intersects with
> > -		 * requested mapping. Account for the pages it would unmap.
> > -		 */
> > -		nr_pages = count_vma_pages_range(mm, addr, end);
> > -
> > -		if (!may_expand_vm(mm, vm_flags,
> > -					(len >> PAGE_SHIFT) - nr_pages))
> > -			return -ENOMEM;
> > -	}
> > +	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
> >
> > +	/* Check against address space limit. */
> > +	/*
> > +	 * MAP_FIXED may remove pages of mappings that intersects with requested
> > +	 * mapping. Account for the pages it would unmap.
> > +	 */
> 
> Utter pedantry, but could these comments be combined? Bit ugly to have one
> after another like this.

Since this was mainly a relocation, I didn't want to change it too much
but since you asked, I'll do it.

> 
> > +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> > +		return -ENOMEM;
> >
> >  	if (unlikely(!can_modify_mm(mm, addr, end)))
> >  		return -EPERM;
> > @@ -2971,14 +2974,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
> >  			return -ENOMEM;
> >
> > -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> > -			return -ENOMEM;
> > -
> > -		vms_complete_munmap_vmas(&vms, &mas_detach);
> >  		next = vms.next;
> >  		prev = vms.prev;
> >  		vma = NULL;
> >  	} else {
> > +		/* Minimal setup of vms */
> > +		vms.nr_pages = 0;
> 
> I'm not a huge fan of having vms be uninitialised other than this field and
> then to rely on no further code change accidentally using an uninitialised
> field. This is kind of asking for bugs.
> 
> Can we not find a way to sensibly initialise it somehow?

Yes, I can switch to the same sort of thing as the maple state and
initialize things as empty.
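
Roughly like this (a sketch only - the helper name and exact field list are
illustrative; the point is that mmap_region() always runs one initialiser so
no member of vms is ever read uninitialised):

	static inline void vms_init_empty(struct vma_munmap_struct *vms)
	{
		*vms = (struct vma_munmap_struct) {
			.nr_pages = 0,
			.nr_accounted = 0,
			.cleared_ptes = false,
			/* every other member is zeroed by the compound literal */
		};
	}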

> 
> >  		next = vma_next(&vmi);
> >  		prev = vma_prev(&vmi);
> >  		if (prev)
> > @@ -2990,8 +2991,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  	 */
> >  	if (accountable_mapping(file, vm_flags)) {
> >  		charged = len >> PAGE_SHIFT;
> > +		charged -= nr_accounted;
> >  		if (security_vm_enough_memory_mm(mm, charged))
> > -			return -ENOMEM;
> > +			goto abort_munmap;
> > +		vms.nr_accounted = 0;
> 
> > This is kind of expanding the 'vms possibly uninitialised apart from selected
> fields' pattern, makes me worry.

I'll fix this with an init of the struct that will always be called.

> 
> >  		vm_flags |= VM_ACCOUNT;
> >  	}
> >
> > @@ -3040,10 +3043,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  	 * not unmapped, but the maps are removed from the list.
> >  	 */
> >  	vma = vm_area_alloc(mm);
> > -	if (!vma) {
> > -		error = -ENOMEM;
> > +	if (!vma)
> >  		goto unacct_error;
> > -	}
> >
> >  	vma_iter_config(&vmi, addr, end);
> >  	vma_set_range(vma, addr, end, pgoff);
> > @@ -3052,6 +3053,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> >  	if (file) {
> >  		vma->vm_file = get_file(file);
> > +		/* call_mmap() map PTE, so ensure there are no existing PTEs */
> 
> Typo? Should this be 'call_mmap() maps PTEs, so ensure there are no
> existing PTEs'? I feel like this could be reworded something like:
> 
> 'call_map() may map PTEs, so clear any that may be pending unmap ahead of
> time.'

I had changed this already to 'call_mmap() may map PTE, so ensure there
are no existing PTEs'.  That way it's still one line and more descriptive
than what I had.

> 
> > +		if (vms.nr_pages)
> > +			vms_complete_pte_clear(&vms, &mas_detach, true);
> >  		error = call_mmap(file, vma);
> >  		if (error)
> >  			goto unmap_and_free_vma;
> > @@ -3142,6 +3146,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  expanded:
> >  	perf_event_mmap(vma);
> >
> > +	if (vms.nr_pages)
> > +		vms_complete_munmap_vmas(&vms, &mas_detach);
> > +
> 
> Hang on, if we already did this in the if (file) branch above, might we end
> up calling this twice? I didn't see vms.nr_pages get set to zero or
> decremented anywhere (unless I missed it)?

No, we called the new helper vms_complete_pte_clear(), which will avoid
clearing the ptes by the added flag vms->cleared_ptes in the second
call.

Above, I modified vms_complete_pte_clear() to check vms->cleared_ptes
prior to clearing the ptes, so it will only be cleared if it needs
clearing.

I debated moving this nr_pages check within vms_complete_munmap_vmas(),
but that would add an unnecessary check to the munmap() path.  Avoiding
both checks seemed too much code (yet another static inline, or such).
I also wanted to keep the sanity of nr_pages checking to a single
function - as you highlighted it could be a path to insanity.

Considering I'll switch this to a VMS_INIT(), I think that I could pass
it through and do the logic within the static inline at the expense of
the munmap() having a few extra instructions (but no cache hits, so not
a really big deal).
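
In other words, the variant I debated would look roughly like this (sketch
only, body elided; signature as in the posted series):

	static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
			struct ma_state *mas_detach)
	{
		/* Nothing was gathered - cheap early exit for the common case. */
		if (!vms->nr_pages)
			return;

		/* ... completion work from the posted patch, unchanged ... */
	}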

> 
> >  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> >  	if (vm_flags & VM_LOCKED) {
> >  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> > @@ -3189,6 +3196,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  unacct_error:
> >  	if (charged)
> >  		vm_unacct_memory(charged);
> > +
> > +abort_munmap:
> > +	if (vms.nr_pages)
> > +		abort_munmap_vmas(&mas_detach);
> >  	validate_mm(mm);
> >  	return error;
> >  }
> > --
> > 2.43.0
> >
> 
> In general I like the approach and you've made it very clear how you've
> altered this behaviour.
> 
> However I have a few concerns (as well as some trivial comments) above. With
> those cleared up we'll be good to go!

Lorenzo Stoakes July 9, 2024, 2:27 p.m. UTC | #3
On Mon, Jul 08, 2024 at 03:10:10PM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240708 08:18]:
> > On Thu, Jul 04, 2024 at 02:27:15PM GMT, Liam R. Howlett wrote:
> > > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> > >
> > > Instead of zeroing the vma tree and then overwriting the area, let the
> > > area be overwritten and then clean up the gathered vmas using
> > > vms_complete_munmap_vmas().
> > >
> > > In the case of a driver mapping over existing vmas, the PTEs are cleared
> > > using the helper vms_complete_pte_clear().
> > >
> > > Temporarily keep track of the number of pages that will be removed and
> > > reduce the charged amount.
> > >
> > > This also drops the validate_mm() call in the vma_expand() function.
> > > It is necessary to drop the validate as it would fail since the mm
> > > map_count would be incorrect during a vma expansion, prior to the
> > > cleanup from vms_complete_munmap_vmas().
> > >
> > > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> > > ---
> > >  mm/internal.h |  1 +
> > >  mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
> > >  2 files changed, 37 insertions(+), 25 deletions(-)
> > >
> > > diff --git a/mm/internal.h b/mm/internal.h
> > > index 4c9f06669cc4..fae4a1bba732 100644
> > > --- a/mm/internal.h
> > > +++ b/mm/internal.h
> > > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
> > >  	unsigned long stack_vm;
> > >  	unsigned long data_vm;
> > >  	bool unlock;			/* Unlock after the munmap */
> > > +	bool cleared_ptes;		/* If the PTE are cleared already */
> > >  };
> > >
> > >  void __meminit __init_single_page(struct page *page, unsigned long pfn,
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 5d458c5f080e..0c334eeae8cd 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > >  }
> > >
> > >  static unsigned long count_vma_pages_range(struct mm_struct *mm,
> > > -		unsigned long addr, unsigned long end)
> > > +		unsigned long addr, unsigned long end,
> > > +		unsigned long *nr_accounted)
> > >  {
> > >  	VMA_ITERATOR(vmi, mm, addr);
> > >  	struct vm_area_struct *vma;
> > >  	unsigned long nr_pages = 0;
> > >
> > > +	*nr_accounted = 0;
> > >  	for_each_vma_range(vmi, vma, end) {
> > >  		unsigned long vm_start = max(addr, vma->vm_start);
> > >  		unsigned long vm_end = min(end, vma->vm_end);
> > >
> > >  		nr_pages += PHYS_PFN(vm_end - vm_start);
> > > +		if (vma->vm_flags & VM_ACCOUNT)
> > > +			*nr_accounted += PHYS_PFN(vm_end - vm_start);
> >
> > We're duplicating the PHYS_PFN(vm_end - vm_start) thing, probably worth
> > adding something like:
> >
> > unsigned long num_pages = PHYS_PFN(vm_end - vm_start);
> >
> > Side-note, but it'd be nice to sort out the inconsistency of PHYS_PFN()
> > vs. (end - start) >> PAGE_SHIFT. This is probably not a huge deal though...
>
> I split this out into another patch for easier reviewing.

Yeah I noticed, inevitably :) the PHYS_PFN(...) duplication persisted, a
small thing obviously but covered in the subsequent commit.

>
> >
> > >  	}
> > >
> > >  	return nr_pages;
> > > @@ -522,6 +526,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> > >  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> > >  	vms->unmap_start = FIRST_USER_ADDRESS;
> > >  	vms->unmap_end = USER_PGTABLES_CEILING;
> > > +	vms->cleared_ptes = false;
> > >  }
> > >
> > >  /*
> > > @@ -730,7 +735,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > >  	vma_iter_store(vmi, vma);
> > >
> > >  	vma_complete(&vp, vmi, vma->vm_mm);
> > > -	validate_mm(vma->vm_mm);
> >
> > Since we're dropping this here, do we need to re-add this back somewhere
> > where we are confident the state will be consistent?
>
> The vma_expand() function is used in two places - one is in the mmap.c
> file which can no longer validate the mm until the munmap is complete.
> The other is in fs/exec.c which cannot call the validate_mm().  So
> to add this call back, I'd have to add a wrapper to vma_expand() to call
> the validate_mm() function for debug builds.
>
> Really all this code in fs/exec.c doesn't belong there so we don't need
> to do an extra function wrapper just to call validate_mm(). And you have
> a patch to do that which is out for review!

Indeed :) perhaps we should add it back to the wrapper?

>
> >
> > >  	return 0;
> > >
> > >  nomem:
> > > @@ -2612,6 +2616,9 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> > >  {
> > >  	struct mmu_gather tlb;
> > >
> > > +	if (vms->cleared_ptes)
> > > +		return;
> > > +
> > >  	/*
> > >  	 * We can free page tables without write-locking mmap_lock because VMAs
> > >  	 * were isolated before we downgraded mmap_lock.
> > > @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> > >  	mas_set(mas_detach, 1);
> > >  	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> > >  	tlb_finish_mmu(&tlb);
> > > +	vms->cleared_ptes = true;
> > >  }
> > >
> > >  /*
> > > @@ -2936,24 +2944,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  	unsigned long merge_start = addr, merge_end = end;
> > >  	bool writable_file_mapping = false;
> > >  	pgoff_t vm_pgoff;
> > > -	int error;
> > > +	int error = -ENOMEM;
> > >  	VMA_ITERATOR(vmi, mm, addr);
> > > +	unsigned long nr_pages, nr_accounted;
> > >
> > > -	/* Check against address space limit. */
> > > -	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > > -		unsigned long nr_pages;
> > > -
> > > -		/*
> > > -		 * MAP_FIXED may remove pages of mappings that intersects with
> > > -		 * requested mapping. Account for the pages it would unmap.
> > > -		 */
> > > -		nr_pages = count_vma_pages_range(mm, addr, end);
> > > -
> > > -		if (!may_expand_vm(mm, vm_flags,
> > > -					(len >> PAGE_SHIFT) - nr_pages))
> > > -			return -ENOMEM;
> > > -	}
> > > +	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
> > >
> > > +	/* Check against address space limit. */
> > > +	/*
> > > +	 * MAP_FIXED may remove pages of mappings that intersects with requested
> > > +	 * mapping. Account for the pages it would unmap.
> > > +	 */
> >
> > Utter pedantry, but could these comments be combined? Bit ugly to have one
> > after another like this.
>
> Since this was mainly a relocation, I didn't want to change it too much
> but since you asked, I'll do it.

Thanks, obviously a highly pedantic nit this one!

>
> >
> > > +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> > > +		return -ENOMEM;
> > >
> > >  	if (unlikely(!can_modify_mm(mm, addr, end)))
> > >  		return -EPERM;
> > > @@ -2971,14 +2974,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
> > >  			return -ENOMEM;
> > >
> > > -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> > > -			return -ENOMEM;
> > > -
> > > -		vms_complete_munmap_vmas(&vms, &mas_detach);
> > >  		next = vms.next;
> > >  		prev = vms.prev;
> > >  		vma = NULL;
> > >  	} else {
> > > +		/* Minimal setup of vms */
> > > +		vms.nr_pages = 0;
> >
> > I'm not a huge fan of having vms be uninitialised other than this field and
> > then to rely on no further code change accidentally using an uninitialised
> > field. This is kind of asking for bugs.
> >
> > Can we not find a way to sensibly initialise it somehow?
>
> Yes, I can switch to the same sort of thing as the maple state and
> initialize things as empty.

Thanks.

>
> >
> > >  		next = vma_next(&vmi);
> > >  		prev = vma_prev(&vmi);
> > >  		if (prev)
> > > @@ -2990,8 +2991,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  	 */
> > >  	if (accountable_mapping(file, vm_flags)) {
> > >  		charged = len >> PAGE_SHIFT;
> > > +		charged -= nr_accounted;
> > >  		if (security_vm_enough_memory_mm(mm, charged))
> > > -			return -ENOMEM;
> > > +			goto abort_munmap;
> > > +		vms.nr_accounted = 0;
> >
> > This is kind of expanding the 'vms possibly uninitialised apart from selected
> > fields' pattern, makes me worry.
>
> I'll fix this with an init of the struct that will always be called.

Thanks.

>
> >
> > >  		vm_flags |= VM_ACCOUNT;
> > >  	}
> > >
> > > @@ -3040,10 +3043,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  	 * not unmapped, but the maps are removed from the list.
> > >  	 */
> > >  	vma = vm_area_alloc(mm);
> > > -	if (!vma) {
> > > -		error = -ENOMEM;
> > > +	if (!vma)
> > >  		goto unacct_error;
> > > -	}
> > >
> > >  	vma_iter_config(&vmi, addr, end);
> > >  	vma_set_range(vma, addr, end, pgoff);
> > > @@ -3052,6 +3053,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >
> > >  	if (file) {
> > >  		vma->vm_file = get_file(file);
> > > +		/* call_mmap() map PTE, so ensure there are no existing PTEs */
> >
> > Typo? Should this be 'call_mmap() maps PTEs, so ensure there are no
> > existing PTEs'? I feel like this could be reworded something like:
> >
> > 'call_map() may map PTEs, so clear any that may be pending unmap ahead of
> > time.'
>
> I had changed this already to 'call_mmap() may map PTE, so ensure there
> are no existing PTEs'.  That way it's still one line and more descriptive
> than what I had.

That works!

>
> >
> > > +		if (vms.nr_pages)
> > > +			vms_complete_pte_clear(&vms, &mas_detach, true);
> > >  		error = call_mmap(file, vma);
> > >  		if (error)
> > >  			goto unmap_and_free_vma;
> > > @@ -3142,6 +3146,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  expanded:
> > >  	perf_event_mmap(vma);
> > >
> > > +	if (vms.nr_pages)
> > > +		vms_complete_munmap_vmas(&vms, &mas_detach);
> > > +
> >
> > Hang on, if we already did this in the if (file) branch above, might we end
> > up calling this twice? I didn't see vms.nr_pages get set to zero or
> > decremented anywhere (unless I missed it)?
>
> No, we called the new helper vms_complete_pte_clear(), which will avoid
> clearing the ptes by the added flag vms->cleared_ptes in the second
> call.
>
> Above, I modified vms_complete_pte_clear() to check vms->cleared_ptes
> prior to clearing the ptes, so it will only be cleared if it needs
> clearing.
>
> I debated moving this nr_pages check within vms_complete_munmap_vmas(),
> but that would add an unnecessary check to the munmap() path.  Avoiding
> both checks seemed too much code (yet another static inline, or such).
> I also wanted to keep the sanity of nr_pages checking to a single
> function - as you highlighted it could be a path to insanity.
>
> Considering I'll switch this ti a VMS_INIT(), I think that I could pass
> it through and do the logic within the static inline at the expense of
> the munmap() having a few extra instructions (but no cache hits, so not
> a really big deal).

Yeah it's a bit confusing that the rest of vms_complete_munmap_vmas() is
potentially run twice even if the vms_complete_pte_clear() exits early due
to vms->cleared_ptes being set.

>
> >
> > >  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> > >  	if (vm_flags & VM_LOCKED) {
> > >  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> > > @@ -3189,6 +3196,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  unacct_error:
> > >  	if (charged)
> > >  		vm_unacct_memory(charged);
> > > +
> > > +abort_munmap:
> > > +	if (vms.nr_pages)
> > > +		abort_munmap_vmas(&mas_detach);
> > >  	validate_mm(mm);
> > >  	return error;
> > >  }
> > > --
> > > 2.43.0
> > >
> >
> > In general I like the approach and you've made it very clear how you've
> > altered this behaviour.
> >
> > However I have a few concerns (as well as some trivial comments) above. With
> > those cleared up we'll be good to go!

Liam R. Howlett July 9, 2024, 6:43 p.m. UTC | #4
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240709 10:27]:
> On Mon, Jul 08, 2024 at 03:10:10PM GMT, Liam R. Howlett wrote:
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240708 08:18]:
> > > On Thu, Jul 04, 2024 at 02:27:15PM GMT, Liam R. Howlett wrote:
> > > > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> > > >
> > > > Instead of zeroing the vma tree and then overwriting the area, let the
> > > > area be overwritten and then clean up the gathered vmas using
> > > > vms_complete_munmap_vmas().
> > > >
> > > > In the case of a driver mapping over existing vmas, the PTEs are cleared
> > > > using the helper vms_complete_pte_clear().
> > > >
> > > > Temporarily keep track of the number of pages that will be removed and
> > > > reduce the charged amount.
> > > >
> > > > This also drops the validate_mm() call in the vma_expand() function.
> > > > It is necessary to drop the validate as it would fail since the mm
> > > > map_count would be incorrect during a vma expansion, prior to the
> > > > cleanup from vms_complete_munmap_vmas().
> > > >
> > > > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> > > > ---
> > > >  mm/internal.h |  1 +
> > > >  mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
> > > >  2 files changed, 37 insertions(+), 25 deletions(-)
> > > >
> > > > diff --git a/mm/internal.h b/mm/internal.h
> > > > index 4c9f06669cc4..fae4a1bba732 100644
> > > > --- a/mm/internal.h
> > > > +++ b/mm/internal.h
> > > > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
> > > >  	unsigned long stack_vm;
> > > >  	unsigned long data_vm;
> > > >  	bool unlock;			/* Unlock after the munmap */
> > > > +	bool cleared_ptes;		/* If the PTE are cleared already */
> > > >  };
> > > >
> > > >  void __meminit __init_single_page(struct page *page, unsigned long pfn,
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 5d458c5f080e..0c334eeae8cd 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
...
> > > > @@ -522,6 +526,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> > > >  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> > > >  	vms->unmap_start = FIRST_USER_ADDRESS;
> > > >  	vms->unmap_end = USER_PGTABLES_CEILING;
> > > > +	vms->cleared_ptes = false;
> > > >  }
> > > >
> > > >  /*
> > > > @@ -730,7 +735,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > >  	vma_iter_store(vmi, vma);
> > > >
> > > >  	vma_complete(&vp, vmi, vma->vm_mm);
> > > > -	validate_mm(vma->vm_mm);
> > >
> > > Since we're dropping this here, do we need to re-add this back somewhere
> > > where we are confident the state will be consistent?
> >
> > The vma_expand() function is used in two places - one is in the mmap.c
> > file which can no longer validate the mm until the munmap is complete.
> > The other is in fs/exec.c which cannot call the validate_mm().  So
> > to add this call back, I'd have to add a wrapper to vma_expand() to call
> > the validate_mm() function for debug builds.
> >
> > Really all this code in fs/exec.c doesn't belong there so we don't need
> > to do an extra function wrapper just to call validate_mm(). And you have
> > a patch to do that which is out for review!
> 
> Indeed :) perhaps we should add it back to the wrapper?
> 
...

> > >
> > > > +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> > > > +		return -ENOMEM;
> > > >
> > > >  	if (unlikely(!can_modify_mm(mm, addr, end)))
> > > >  		return -EPERM;
> > > > @@ -2971,14 +2974,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > >  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
> > > >  			return -ENOMEM;
> > > >
> > > > -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> > > > -			return -ENOMEM;
> > > > -
> > > > -		vms_complete_munmap_vmas(&vms, &mas_detach);
> > > >  		next = vms.next;
> > > >  		prev = vms.prev;
> > > >  		vma = NULL;
> > > >  	} else {
> > > > +		/* Minimal setup of vms */
> > > > +		vms.nr_pages = 0;
> > >
...

> > > > @@ -3052,6 +3053,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > >
> > > >  	if (file) {
> > > >  		vma->vm_file = get_file(file);
> > > > +		/* call_mmap() map PTE, so ensure there are no existing PTEs */
...
> > > > +		if (vms.nr_pages)
> > > > +			vms_complete_pte_clear(&vms, &mas_detach, true);
> > > >  		error = call_mmap(file, vma);
> > > >  		if (error)
> > > >  			goto unmap_and_free_vma;
> > > > @@ -3142,6 +3146,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > >  expanded:
> > > >  	perf_event_mmap(vma);
> > > >
> > > > +	if (vms.nr_pages)
> > > > +		vms_complete_munmap_vmas(&vms, &mas_detach);
> > > > +
> > >
> > > Hang on, if we already did this in the if (file) branch above, might we end
> > > up calling this twice? I didn't see vms.nr_pages get set to zero or
> > > decremented anywhere (unless I missed it)?
> >
> > No, we called the new helper vms_complete_pte_clear(), which will avoid
> > clearing the ptes by the added flag vms->cleared_ptes in the second
> > call.
> >
> > Above, I modified vms_complete_pte_clear() to check vms->cleared_ptes
> > prior to clearing the ptes, so it will only be cleared if it needs
> > clearing.
> >
> > I debated moving this nr_pages check within vms_complete_munmap_vmas(),
> > but that would add an unnecessary check to the munmap() path.  Avoiding
> > both checks seemed too much code (yet another static inline, or such).
> > I also wanted to keep the sanity of nr_pages checking to a single
> > function - as you highlighted it could be a path to insanity.
> >
> > Considering I'll switch this to a VMS_INIT(), I think that I could pass
> > it through and do the logic within the static inline at the expense of
> > the munmap() having a few extra instructions (but no cache hits, so not
> > a really big deal).
> 
> Yeah it's a bit confusing that the rest of vms_complete_munmap_vmas() is
> potentially run twice even if the vms_complete_pte_clear() exits early due
> to vms->cleared_ptes being set.

vms_complete_munmap_vmas() is never run twice; it's only ever run once.
vms_complete_pte_clear() is called from vms_complete_munmap_vmas(), but
will do nothing if cleared_ptes == true, which is set at the end of the
pte_clear() itself, and initialized as false.

Hopefully this becomes more obvious with the change to an INIT_VMS()
paradigm.  I think I'll change the name of vms_complete_pte_clear() in
an attempt to make this more obvious as well (remove the _complete,
probably).

Thanks,
Liam

Patch

diff --git a/mm/internal.h b/mm/internal.h
index 4c9f06669cc4..fae4a1bba732 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1503,6 +1503,7 @@  struct vma_munmap_struct {
 	unsigned long stack_vm;
 	unsigned long data_vm;
 	bool unlock;			/* Unlock after the munmap */
+	bool cleared_ptes;		/* If the PTE are cleared already */
 };
 
 void __meminit __init_single_page(struct page *page, unsigned long pfn,
diff --git a/mm/mmap.c b/mm/mmap.c
index 5d458c5f080e..0c334eeae8cd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -401,17 +401,21 @@  anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
 }
 
 static unsigned long count_vma_pages_range(struct mm_struct *mm,
-		unsigned long addr, unsigned long end)
+		unsigned long addr, unsigned long end,
+		unsigned long *nr_accounted)
 {
 	VMA_ITERATOR(vmi, mm, addr);
 	struct vm_area_struct *vma;
 	unsigned long nr_pages = 0;
 
+	*nr_accounted = 0;
 	for_each_vma_range(vmi, vma, end) {
 		unsigned long vm_start = max(addr, vma->vm_start);
 		unsigned long vm_end = min(end, vma->vm_end);
 
 		nr_pages += PHYS_PFN(vm_end - vm_start);
+		if (vma->vm_flags & VM_ACCOUNT)
+			*nr_accounted += PHYS_PFN(vm_end - vm_start);
 	}
 
 	return nr_pages;
@@ -522,6 +526,7 @@  static inline void init_vma_munmap(struct vma_munmap_struct *vms,
 	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
 	vms->unmap_start = FIRST_USER_ADDRESS;
 	vms->unmap_end = USER_PGTABLES_CEILING;
+	vms->cleared_ptes = false;
 }
 
 /*
@@ -730,7 +735,6 @@  int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	vma_iter_store(vmi, vma);
 
 	vma_complete(&vp, vmi, vma->vm_mm);
-	validate_mm(vma->vm_mm);
 	return 0;
 
 nomem:
@@ -2612,6 +2616,9 @@  static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
 {
 	struct mmu_gather tlb;
 
+	if (vms->cleared_ptes)
+		return;
+
 	/*
 	 * We can free page tables without write-locking mmap_lock because VMAs
 	 * were isolated before we downgraded mmap_lock.
@@ -2624,6 +2631,7 @@  static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
 	tlb_finish_mmu(&tlb);
+	vms->cleared_ptes = true;
 }
 
 /*
@@ -2936,24 +2944,19 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long merge_start = addr, merge_end = end;
 	bool writable_file_mapping = false;
 	pgoff_t vm_pgoff;
-	int error;
+	int error = -ENOMEM;
 	VMA_ITERATOR(vmi, mm, addr);
+	unsigned long nr_pages, nr_accounted;
 
-	/* Check against address space limit. */
-	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
-		unsigned long nr_pages;
-
-		/*
-		 * MAP_FIXED may remove pages of mappings that intersects with
-		 * requested mapping. Account for the pages it would unmap.
-		 */
-		nr_pages = count_vma_pages_range(mm, addr, end);
-
-		if (!may_expand_vm(mm, vm_flags,
-					(len >> PAGE_SHIFT) - nr_pages))
-			return -ENOMEM;
-	}
+	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
 
+	/* Check against address space limit. */
+	/*
+	 * MAP_FIXED may remove pages of mappings that intersects with requested
+	 * mapping. Account for the pages it would unmap.
+	 */
+	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
+		return -ENOMEM;
 
 	if (unlikely(!can_modify_mm(mm, addr, end)))
 		return -EPERM;
@@ -2971,14 +2974,12 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		if (vms_gather_munmap_vmas(&vms, &mas_detach))
 			return -ENOMEM;
 
-		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
-			return -ENOMEM;
-
-		vms_complete_munmap_vmas(&vms, &mas_detach);
 		next = vms.next;
 		prev = vms.prev;
 		vma = NULL;
 	} else {
+		/* Minimal setup of vms */
+		vms.nr_pages = 0;
 		next = vma_next(&vmi);
 		prev = vma_prev(&vmi);
 		if (prev)
@@ -2990,8 +2991,10 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	 */
 	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
+		charged -= nr_accounted;
 		if (security_vm_enough_memory_mm(mm, charged))
-			return -ENOMEM;
+			goto abort_munmap;
+		vms.nr_accounted = 0;
 		vm_flags |= VM_ACCOUNT;
 	}
 
@@ -3040,10 +3043,8 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * not unmapped, but the maps are removed from the list.
 	 */
 	vma = vm_area_alloc(mm);
-	if (!vma) {
-		error = -ENOMEM;
+	if (!vma)
 		goto unacct_error;
-	}
 
 	vma_iter_config(&vmi, addr, end);
 	vma_set_range(vma, addr, end, pgoff);
@@ -3052,6 +3053,9 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 
 	if (file) {
 		vma->vm_file = get_file(file);
+		/* call_mmap() map PTE, so ensure there are no existing PTEs */
+		if (vms.nr_pages)
+			vms_complete_pte_clear(&vms, &mas_detach, true);
 		error = call_mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
@@ -3142,6 +3146,9 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 expanded:
 	perf_event_mmap(vma);
 
+	if (vms.nr_pages)
+		vms_complete_munmap_vmas(&vms, &mas_detach);
+
 	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
@@ -3189,6 +3196,10 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
+
+abort_munmap:
+	if (vms.nr_pages)
+		abort_munmap_vmas(&mas_detach);
 	validate_mm(mm);
 	return error;
 }