
[v5,1/2] mm: Add MREMAP_DONTUNMAP to mremap().

Message ID 20200214040952.43195-1-bgeffon@google.com (mailing list archive)
State New, archived
Series [v5,1/2] mm: Add MREMAP_DONTUNMAP to mremap().

Commit Message

Brian Geffon Feb. 14, 2020, 4:09 a.m. UTC
When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. The remap operation
will be performed as it would have been normally, by moving the
page tables over to the new mapping. The old vma will have any locked
flags cleared and will be left with no page tables, and any userfaultfds
that were watching that range will continue watching it.

For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
a VMA, you MUST use the MREMAP_MAYMOVE flag. The final result is two
equally sized VMAs, where the destination contains the PTEs of the source.
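
As a concrete illustration, a minimal userspace sketch of the call (this
example is mine, not part of the patch; MREMAP_DONTUNMAP will not be in
libc headers yet, so it is defined by hand from the uapi change below):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	#ifndef MREMAP_DONTUNMAP
	#define MREMAP_DONTUNMAP 4	/* value from this patch's uapi change */
	#endif

	int main(void)
	{
		size_t len = 4 * 4096;

		/* Private anonymous mapping: the only kind this flag accepts. */
		void *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (src == MAP_FAILED)
			return 1;

		/* MREMAP_MAYMOVE is mandatory with MREMAP_DONTUNMAP. The page
		 * tables move to dst; src stays mapped but has no PTEs. */
		void *dst = mremap(src, len, len,
				   MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
		if (dst == MAP_FAILED)
			return 1;

		printf("moved %p to %p; %p is still a valid mapping\n",
		       src, dst, src);
		return 0;
	}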

We hope to use this in Chrome OS, where, with userfaultfd, we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android; Lokesh Gidra has said
that "As part of using userfaultfd for GC, we'll have to move the physical
pages of the Java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the Java
heap, its virtual mapping will be removed as well. Therefore, we'll
have to perform an mmap immediately after. This is not only time-consuming
but also opens a time window where a native thread may call mmap and
reserve the Java heap's address range for its own usage. This flag
solves the problem."
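
The Chrome OS/Android flow could look roughly like this (a sketch of my
own under stated assumptions, not code from this series; error handling
elided): register a userfaultfd on the range first, then move the pages
away with MREMAP_DONTUNMAP, so the now-empty source range keeps raising
MISSING faults that can be serviced from the written-back copy.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MREMAP_DONTUNMAP
	#define MREMAP_DONTUNMAP 4
	#endif

	/* Move len bytes at src away for writeback while keeping the
	 * original range mapped and watched by userfaultfd. */
	static void *detach_for_writeback(void *src, size_t len)
	{
		int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)src, .len = len },
			.mode  = UFFDIO_REGISTER_MODE_MISSING,
		};

		ioctl(uffd, UFFDIO_API, &api);
		/* Watch src for missing-page faults before the move. */
		ioctl(uffd, UFFDIO_REGISTER, &reg);

		/* src stays mapped (and registered) but loses its PTEs, so
		 * later touches of it fault and report through uffd. */
		return mremap(src, len, len,
			      MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
	}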

  v4 -> v5:
    - Correct commit message to more accurately reflect the behavior.
    - Clear VM_LOCKED and VM_LOCKONFAULT on the old vma.
Signed-off-by: Brian Geffon <bgeffon@google.com>

---
 include/uapi/linux/mman.h |   5 +-
 mm/mremap.c               | 106 ++++++++++++++++++++++++++++++--------
 2 files changed, 88 insertions(+), 23 deletions(-)

Comments

Kirill A . Shutemov Feb. 14, 2020, 2:28 p.m. UTC | #1
On Thu, Feb 13, 2020 at 08:09:51PM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. The remap operation
> will be performed as it would have been normally, by moving the
> page tables over to the new mapping. The old vma will have any locked
> flags cleared and will be left with no page tables, and any userfaultfds
> that were watching that range will continue watching it.
> 
> For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
> the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA, you MUST use the MREMAP_MAYMOVE flag. The final result is two
> equally sized VMAs, where the destination contains the PTEs of the source.
> 
> We hope to use this in Chrome OS, where, with userfaultfd, we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> 
> This feature also has a use case in Android; Lokesh Gidra has said
> that "As part of using userfaultfd for GC, we'll have to move the physical
> pages of the Java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the Java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> have to perform an mmap immediately after. This is not only time-consuming
> but also opens a time window where a native thread may call mmap and
> reserve the Java heap's address range for its own usage. This flag
> solves the problem."
> 
>   v4 -> v5:
>     - Correct commit message to more accurately reflect the behavior.
>     - Clear VM_LOCKED and VM_LOCKONFAULT on the old vma.
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> 
> ---
>  include/uapi/linux/mman.h |   5 +-
>  mm/mremap.c               | 106 ++++++++++++++++++++++++++++++--------
>  2 files changed, 88 insertions(+), 23 deletions(-)
> 
> diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> index fc1a64c3447b..923cc162609c 100644
> --- a/include/uapi/linux/mman.h
> +++ b/include/uapi/linux/mman.h
> @@ -5,8 +5,9 @@
>  #include <asm/mman.h>
>  #include <asm-generic/hugetlb_encode.h>
>  
> -#define MREMAP_MAYMOVE	1
> -#define MREMAP_FIXED	2
> +#define MREMAP_MAYMOVE		1
> +#define MREMAP_FIXED		2
> +#define MREMAP_DONTUNMAP	4
>  
>  #define OVERCOMMIT_GUESS		0
>  #define OVERCOMMIT_ALWAYS		1
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 1fc8a29fbe3f..a2a792fdbc64 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>  static unsigned long move_vma(struct vm_area_struct *vma,
>  		unsigned long old_addr, unsigned long old_len,
>  		unsigned long new_len, unsigned long new_addr,
> -		bool *locked, struct vm_userfaultfd_ctx *uf,
> -		struct list_head *uf_unmap)
> +		bool *locked, unsigned long flags,
> +		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct vm_area_struct *new_vma;
> @@ -408,11 +408,49 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  	if (unlikely(vma->vm_flags & VM_PFNMAP))
>  		untrack_pfn_moved(vma);
>  
> +	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> +		if (vm_flags & VM_ACCOUNT) {
> +			/* Always put back VM_ACCOUNT since we won't unmap */
> +			vma->vm_flags |= VM_ACCOUNT;
> +
> +			vm_acct_memory(vma_pages(new_vma));
> +		}
> +
> +		/*
> +		 * locked_vm accounting: if the mapping remained the same size
> +		 * it will have just moved and we don't need to touch locked_vm
> +		 * because we skip the do_munmap. If the mapping shrunk before
> +		 * being moved then the do_munmap on that portion will have
> +		 * adjusted locked_vm. Only if the mapping grows do we need to
> +		 * do something special; the reason is locked_vm only accounts
> +		 * for old_len, but we're now adding new_len - old_len locked
> +		 * bytes to the new mapping.
> +		 */
> +		if (vm_flags & VM_LOCKED) {
> +			/* We always clear VM_LOCKED[ONFAULT] on the old vma */
> +			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> +
> +			if (new_len > old_len) {
> +				mm->locked_vm +=
> +					(new_len - old_len) >> PAGE_SHIFT;
> +				*locked = true;

This level of code indentation suggests that code restructuring is
required.

> +			}
> +		}
> +
> +		goto out;
> +	}
> +
>  	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
>  		/* OOM: unable to split vma, just get accounts right */
>  		vm_unacct_memory(excess >> PAGE_SHIFT);
>  		excess = 0;
>  	}
> +
> +	if (vm_flags & VM_LOCKED) {
> +		mm->locked_vm += new_len >> PAGE_SHIFT;
> +		*locked = true;
> +	}

I don't follow why this is required.

> +out:
>  	mm->hiwater_vm = hiwater_vm;
>  
>  	/* Restore VM_ACCOUNT if one or two pieces of vma left */
> @@ -422,16 +460,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  			vma->vm_next->vm_flags |= VM_ACCOUNT;
>  	}
>  
> -	if (vm_flags & VM_LOCKED) {
> -		mm->locked_vm += new_len >> PAGE_SHIFT;
> -		*locked = true;
> -	}
> -

Ah. You moved this piece. Why?

>  	return new_addr;
>  }
>  
>  static struct vm_area_struct *vma_to_resize(unsigned long addr,
> -	unsigned long old_len, unsigned long new_len, unsigned long *p)
> +	unsigned long old_len, unsigned long new_len, unsigned long flags,
> +	unsigned long *p)
>  {
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = find_vma(mm, addr);
> @@ -453,6 +487,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
>  		return ERR_PTR(-EINVAL);
>  	}
>  
> +	if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
> +			vma->vm_flags & VM_SHARED))
> +		return ERR_PTR(-EINVAL);
> +
>  	if (is_vm_hugetlb_page(vma))
>  		return ERR_PTR(-EINVAL);
>  
> @@ -497,7 +535,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
>  
>  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  		unsigned long new_addr, unsigned long new_len, bool *locked,
> -		struct vm_userfaultfd_ctx *uf,
> +		unsigned long flags, struct vm_userfaultfd_ctx *uf,
>  		struct list_head *uf_unmap_early,
>  		struct list_head *uf_unmap)
>  {
> @@ -505,7 +543,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  	struct vm_area_struct *vma;
>  	unsigned long ret = -EINVAL;
>  	unsigned long charged = 0;
> -	unsigned long map_flags;
> +	unsigned long map_flags = 0;
>  
>  	if (offset_in_page(new_addr))
>  		goto out;
> @@ -534,9 +572,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
>  		return -ENOMEM;
>  
> -	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
> -	if (ret)
> -		goto out;
> +	if (flags & MREMAP_FIXED) {

I think it has to be

	if (!(flags & MREMAP_DONTUNMAP)) {

No?


> +		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
> +		if (ret)
> +			goto out;
> +	}
>  
>  	if (old_len >= new_len) {
>  		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
> @@ -545,13 +585,26 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  		old_len = new_len;
>  	}
>  
> -	vma = vma_to_resize(addr, old_len, new_len, &charged);
> +	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
>  	if (IS_ERR(vma)) {
>  		ret = PTR_ERR(vma);
>  		goto out;
>  	}
>  
> -	map_flags = MAP_FIXED;
> +	/*
> +	 * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
> +	 * check that we can expand by new_len and vma_to_resize will handle
> +	 * the vma growing which is (new_len - old_len).
> +	 */
> +	if (flags & MREMAP_DONTUNMAP &&
> +		!may_expand_vm(mm, vma->vm_flags, new_len >> PAGE_SHIFT)) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	if (flags & MREMAP_FIXED)
> +		map_flags |= MAP_FIXED;
> +
>  	if (vma->vm_flags & VM_MAYSHARE)
>  		map_flags |= MAP_SHARED;
>  
> @@ -561,10 +614,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  	if (offset_in_page(ret))
>  		goto out1;
>  
> -	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> +	/* We got a new mapping */
> +	if (!(flags & MREMAP_FIXED))
> +		new_addr = ret;
> +
> +	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
>  		       uf_unmap);
> +
>  	if (!(offset_in_page(ret)))
>  		goto out;
> +
>  out1:
>  	vm_unacct_memory(charged);
>  
> @@ -609,12 +668,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  	addr = untagged_addr(addr);
>  	new_addr = untagged_addr(new_addr);
>  
> -	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> +	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
>  		return ret;
>  
>  	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
>  		return ret;
>  
> +	/* MREMAP_DONTUNMAP is always a move */
> +	if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_MAYMOVE))
> +		return ret;
> +
>  	if (offset_in_page(addr))
>  		return ret;
>  
> @@ -632,9 +695,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  	if (down_write_killable(&current->mm->mmap_sem))
>  		return -EINTR;
>  
> -	if (flags & MREMAP_FIXED) {
> +	if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {

	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {

>  		ret = mremap_to(addr, old_len, new_addr, new_len,
> -				&locked, &uf, &uf_unmap_early, &uf_unmap);
> +				&locked, flags, &uf, &uf_unmap_early,
> +				&uf_unmap);
>  		goto out;
>  	}
>  
> @@ -662,7 +726,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  	/*
>  	 * Ok, we need to grow..
>  	 */
> -	vma = vma_to_resize(addr, old_len, new_len, &charged);
> +	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
>  	if (IS_ERR(vma)) {
>  		ret = PTR_ERR(vma);
>  		goto out;
> @@ -712,7 +776,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  		}
>  
>  		ret = move_vma(vma, addr, old_len, new_len, new_addr,
> -			       &locked, &uf, &uf_unmap);
> +			       &locked, flags, &uf, &uf_unmap);
>  	}
>  out:
>  	if (offset_in_page(ret)) {
> -- 
> 2.25.0.265.gbab2e86ba0-goog
>
Brian Geffon Feb. 14, 2020, 6:46 p.m. UTC | #2
Hi Kirill,

> > -     if (vm_flags & VM_LOCKED) {
> > -             mm->locked_vm += new_len >> PAGE_SHIFT;
> > -             *locked = true;
> > -     }
> > -
>
> Ah. You moved this piece. Why?

Because we're not unmapping, the accounting changes. In the normal
(non-MREMAP_DONTUNMAP) case, do_munmap would have adjusted mm->locked_vm
by decreasing it by old_len, so we then have to add back new_len. But
since we're not doing the unmap, I want to skip that increase by new_len
and adjust accordingly. In the MREMAP_DONTUNMAP case, if the VMA got
smaller, then the do_munmap on the removed portion would have decreased
locked_vm by old_len - new_len, and the accounting is correct. In the
case of an unchanged VMA size there is nothing to do; only in the case
where it grows are we responsible for adding new_len - old_len, because
of the decision to jump over that block. With that, the accounting is
right for all cases.

If we were to keep the original block and not jump over it, we would
have to subtract old_len bytes, and then we'd be doing the same thing
but special-casing the situation where new_len < old_len: the unmap of
the removed part would already have reduced locked_vm by old_len -
new_len, so subtracting the full old_len would be too much and we'd
have to add old_len - new_len back in. I hope that explains it all.
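
To make the arithmetic concrete (hypothetical page counts, not from the
patch): say a fully locked 16-page VMA is moved with MREMAP_DONTUNMAP
and grows to 20 pages. locked_vm starts at 16; do_munmap is skipped, so
nothing is subtracted, and the block above adds new_len - old_len = 4,
giving 20, which is correct since the old vma has VM_LOCKED cleared and
only the new mapping is locked. If it instead shrank to 12 pages, the
earlier do_munmap of the trailing 4 pages already brought locked_vm down
to 12, and no further adjustment is needed.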

By doing it this way, IMO it makes it easier to see how the locked_vm
accounting is happening, because the locked_vm incrementing happens in
only one of two places based on the type of remap that is happening.
But I can definitely clean up the code a bit to drop the levels of
indentation, maybe like this:

	/*
	 * locked_vm accounting: if the mapping remained the same size
	 * it will have just moved and we don't need to touch locked_vm
	 * because we skip the do_munmap. If the mapping shrunk before
	 * being moved then the do_munmap on that portion will have
	 * adjusted locked_vm. Only if the mapping grows do we need to
	 * do something special; the reason is locked_vm only accounts
	 * for old_len, but we're now adding new_len - old_len locked
	 * bytes to the new mapping.
	 */
	if (vm_flags & VM_LOCKED && new_len > old_len) {
		mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
		*locked = true;
	}

	/* We always clear VM_LOCKED[ONFAULT] on the old vma */
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
	goto out;
}

Having only one place where locked_vm is accounted and adjusted, based
on the type of remap, seems like it will be easier to follow and less
error-prone later. What do you think about this?

> > +     if (flags & MREMAP_FIXED) {
>
> I think it has to be
>
>         if (!(flags & MREMAP_DONTUNMAP)) {
>
> No?

No. Because we dropped the requirement to use MREMAP_FIXED with
MREMAP_DONTUNMAP, when we're not using MREMAP_FIXED we don't need to
unmap anything at the destination if it already exists:
get_unmapped_area() below will not be using the MAP_FIXED flag either,
and will instead search for a new unmapped area. If we were to change it
as you suggest, then we wouldn't be able to do MREMAP_FIXED |
MREMAP_DONTUNMAP, so I think this is correct.
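
In other words (an illustrative fragment of mine, assuming src and len
are an existing private anonymous mapping and its size, and fixed_addr
is a page-aligned destination the caller chose):

	/* Kernel picks the destination: nothing to unmap there first. */
	dst = mremap(src, len, len,
		     MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);

	/* Caller picks the destination: anything already mapped at
	 * fixed_addr must go, hence the do_munmap() guarded by
	 * MREMAP_FIXED. */
	dst = mremap(src, len, len,
		     MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_DONTUNMAP,
		     fixed_addr);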

> > -     if (flags & MREMAP_FIXED) {
> > +     if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {
>
>         if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {

Sure, I can change that.

If you're good with all of that I can mail a new patch today.

Thanks again,
Brian

Patch

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@ 
 #include <asm/mman.h>
 #include <asm-generic/hugetlb_encode.h>
 
-#define MREMAP_MAYMOVE	1
-#define MREMAP_FIXED	2
+#define MREMAP_MAYMOVE		1
+#define MREMAP_FIXED		2
+#define MREMAP_DONTUNMAP	4
 
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29fbe3f..a2a792fdbc64 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@  unsigned long move_page_tables(struct vm_area_struct *vma,
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -408,11 +408,49 @@  static unsigned long move_vma(struct vm_area_struct *vma,
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);
 
+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT) {
+			/* Always put back VM_ACCOUNT since we won't unmap */
+			vma->vm_flags |= VM_ACCOUNT;
+
+			vm_acct_memory(vma_pages(new_vma));
+		}
+
+		/*
+		 * locked_vm accounting: if the mapping remained the same size
+		 * it will have just moved and we don't need to touch locked_vm
+		 * because we skip the do_munmap. If the mapping shrunk before
+		 * being moved then the do_munmap on that portion will have
+		 * adjusted locked_vm. Only if the mapping grows do we need to
+		 * do something special; the reason is locked_vm only accounts
+		 * for old_len, but we're now adding new_len - old_len locked
+		 * bytes to the new mapping.
+		 */
+		if (vm_flags & VM_LOCKED) {
+			/* We always clear VM_LOCKED[ONFAULT] on the old vma */
+			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+			if (new_len > old_len) {
+				mm->locked_vm +=
+					(new_len - old_len) >> PAGE_SHIFT;
+				*locked = true;
+			}
+		}
+
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		*locked = true;
+	}
+out:
 	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -422,16 +460,12 @@  static unsigned long move_vma(struct vm_area_struct *vma,
 			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	return new_addr;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, unsigned long flags,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
@@ -453,6 +487,10 @@  static struct vm_area_struct *vma_to_resize(unsigned long addr,
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+			vma->vm_flags & VM_SHARED))
+		return ERR_PTR(-EINVAL);
+
 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
 
@@ -497,7 +535,7 @@  static struct vm_area_struct *vma_to_resize(unsigned long addr,
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
@@ -505,7 +543,7 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
-	unsigned long map_flags;
+	unsigned long map_flags = 0;
 
 	if (offset_in_page(new_addr))
 		goto out;
@@ -534,9 +572,11 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
 		return -ENOMEM;
 
-	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
-	if (ret)
-		goto out;
+	if (flags & MREMAP_FIXED) {
+		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+		if (ret)
+			goto out;
+	}
 
 	if (old_len >= new_len) {
 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
@@ -545,13 +585,26 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		old_len = new_len;
 	}
 
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
 	}
 
-	map_flags = MAP_FIXED;
+	/*
+	 * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
+	 * check that we can expand by new_len and vma_to_resize will handle
+	 * the vma growing which is (new_len - old_len).
+	 */
+	if (flags & MREMAP_DONTUNMAP &&
+		!may_expand_vm(mm, vma->vm_flags, new_len >> PAGE_SHIFT)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (flags & MREMAP_FIXED)
+		map_flags |= MAP_FIXED;
+
 	if (vma->vm_flags & VM_MAYSHARE)
 		map_flags |= MAP_SHARED;
 
@@ -561,10 +614,16 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (offset_in_page(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	/* We got a new mapping */
+	if (!(flags & MREMAP_FIXED))
+		new_addr = ret;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
+
 	if (!(offset_in_page(ret)))
 		goto out;
+
 out1:
 	vm_unacct_memory(charged);
 
@@ -609,12 +668,16 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	addr = untagged_addr(addr);
 	new_addr = untagged_addr(new_addr);
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
 
+	/* MREMAP_DONTUNMAP is always a move */
+	if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_MAYMOVE))
+		return ret;
+
 	if (offset_in_page(addr))
 		return ret;
 
@@ -632,9 +695,10 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	if (down_write_killable(&current->mm->mmap_sem))
 		return -EINTR;
 
-	if (flags & MREMAP_FIXED) {
+	if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}
 
@@ -662,7 +726,7 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
@@ -712,7 +776,7 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 
 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-			       &locked, &uf, &uf_unmap);
+			       &locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {