diff mbox series

[v2] mm: Add MREMAP_DONTUNMAP to mremap().

Message ID 20200124190625.257659-1-bgeffon@google.com (mailing list archive)
State New, archived
Headers show
Series [v2] mm: Add MREMAP_DONTUNMAP to mremap(). | expand

Commit Message

Brian Geffon Jan. 24, 2020, 7:06 p.m. UTC
When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. Instead it will be
cleared as if a brand new anonymous, private mapping had been created
atomically as part of the mremap() call.  If a userfaultfd was watching
the source, it will continue to watch the new mapping.  For a mapping
that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
also used. The final result is two equally sized VMAs where the
destination contains the PTEs of the source.

We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."

Signed-off-by: Brian Geffon <bgeffon@google.com>
---
 include/uapi/linux/mman.h |  5 +++--
 mm/mremap.c               | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 9 deletions(-)

Comments

Nathan Chancellor Jan. 26, 2020, 5:16 a.m. UTC | #1
Hi Brian,

On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.  If a userfaultfd was watching
> the source, it will continue to watch the new mapping.  For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used. The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.
> 
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> 
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
> 
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> ---
>  include/uapi/linux/mman.h |  5 +++--
>  mm/mremap.c               | 37 ++++++++++++++++++++++++++++++-------
>  2 files changed, 33 insertions(+), 9 deletions(-)
> 
> diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> index fc1a64c3447b..923cc162609c 100644
> --- a/include/uapi/linux/mman.h
> +++ b/include/uapi/linux/mman.h
> @@ -5,8 +5,9 @@
>  #include <asm/mman.h>
>  #include <asm-generic/hugetlb_encode.h>
>  
> -#define MREMAP_MAYMOVE	1
> -#define MREMAP_FIXED	2
> +#define MREMAP_MAYMOVE		1
> +#define MREMAP_FIXED		2
> +#define MREMAP_DONTUNMAP	4
>  
>  #define OVERCOMMIT_GUESS		0
>  #define OVERCOMMIT_ALWAYS		1
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 122938dcec15..bf97c3eb538b 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>  static unsigned long move_vma(struct vm_area_struct *vma,
>  		unsigned long old_addr, unsigned long old_len,
>  		unsigned long new_len, unsigned long new_addr,
> -		bool *locked, struct vm_userfaultfd_ctx *uf,
> -		struct list_head *uf_unmap)
> +		bool *locked, unsigned long flags,
> +		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct vm_area_struct *new_vma;
> @@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  	if (unlikely(vma->vm_flags & VM_PFNMAP))
>  		untrack_pfn_moved(vma);
>  
> +	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> +		if (vm_flags & VM_ACCOUNT)
> +			vma->vm_flags |= VM_ACCOUNT;
> +
> +		goto out;
> +	}
> +
>  	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
>  		/* OOM: unable to split vma, just get accounts right */
>  		vm_unacct_memory(excess >> PAGE_SHIFT);
> @@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  			vma->vm_next->vm_flags |= VM_ACCOUNT;
>  	}
>  
> +out:
>  	if (vm_flags & VM_LOCKED) {
>  		mm->locked_vm += new_len >> PAGE_SHIFT;
>  		*locked = true;
> @@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
>  
>  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  		unsigned long new_addr, unsigned long new_len, bool *locked,
> -		struct vm_userfaultfd_ctx *uf,
> +		unsigned long flags, struct vm_userfaultfd_ctx *uf,
>  		struct list_head *uf_unmap_early,
>  		struct list_head *uf_unmap)
>  {
> @@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  		old_len = new_len;
>  	}
>  
> +	/*
> +	 * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
> +	 * check that we can expand by old_len and vma_to_resize will handle
> +	 * the vma growing.
> +	 */
> +	if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
> +				vma->vm_flags, old_len >> PAGE_SHIFT))) {

We received a Clang build report that vma is used uninitialized here
(they aren't being publicly sent to LKML due to GCC vs Clang
warning/error overlap):

https://groups.google.com/d/msg/clang-built-linux/gE5wRaeHdSI/xVA0MBQVEgAJ

Sure enough, vma is initialized first in the next block. Not sure if
this section should be moved below that initialization or if something
else should be done to resolve it but that dereference will obviously be
fatal.

Cheers,
Nathan

> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
>  	vma = vma_to_resize(addr, old_len, new_len, &charged);
>  	if (IS_ERR(vma)) {
>  		ret = PTR_ERR(vma);
> @@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
>  	if (IS_ERR_VALUE(ret))
>  		goto out1;
>  
> -	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> +	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
>  		       uf_unmap);
>  	if (!(offset_in_page(ret)))
>  		goto out;
> @@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  	addr = untagged_addr(addr);
>  	new_addr = untagged_addr(new_addr);
>  
> -	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> +	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
>  		return ret;
>  
>  	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
>  		return ret;
>  
> +	if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
> +		return ret;
> +
>  	if (offset_in_page(addr))
>  		return ret;
>  
> @@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  
>  	if (flags & MREMAP_FIXED) {
>  		ret = mremap_to(addr, old_len, new_addr, new_len,
> -				&locked, &uf, &uf_unmap_early, &uf_unmap);
> +				&locked, flags, &uf, &uf_unmap_early,
> +				&uf_unmap);
>  		goto out;
>  	}
>  
> @@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>  		}
>  
>  		ret = move_vma(vma, addr, old_len, new_len, new_addr,
> -			       &locked, &uf, &uf_unmap);
> +			       &locked, flags, &uf, &uf_unmap);
>  	}
>  out:
>  	if (offset_in_page(ret)) {
> -- 
> 2.25.0.341.g760bfbb309-goog
>
Kirill A . Shutemov Jan. 26, 2020, 10:06 p.m. UTC | #2
On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.  If a userfaultfd was watching
> the source, it will continue to watch the new mapping.  For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used.

Implies? From code it looks like it requires MREMAP_FIXED. And
MREMAP_FIXED requires MREMAP_MAYMOVE. That's strange flag chaining.
I don't really see need in such dependencies. It maybe indeed implied, not
requied.

> The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.

Could you clarify rmap situation here? How the rmap hierarchy will look
like before and after the operation. Can the new VMA merge with the old
one? Sounds fishy to me.

> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> 
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
Brian Geffon Jan. 27, 2020, 2:21 a.m. UTC | #3
Hi Nathan,
Thank you! That was an oversight on my part. I'll address it in the next patch.

Brian


On Sat, Jan 25, 2020 at 9:16 PM Nathan Chancellor
<natechancellor@gmail.com> wrote:
>
> Hi Brian,
>
> On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call.  If a userfaultfd was watching
> > the source, it will continue to watch the new mapping.  For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used. The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
> >
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
> >
> > Signed-off-by: Brian Geffon <bgeffon@google.com>
> > ---
> >  include/uapi/linux/mman.h |  5 +++--
> >  mm/mremap.c               | 37 ++++++++++++++++++++++++++++++-------
> >  2 files changed, 33 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> > index fc1a64c3447b..923cc162609c 100644
> > --- a/include/uapi/linux/mman.h
> > +++ b/include/uapi/linux/mman.h
> > @@ -5,8 +5,9 @@
> >  #include <asm/mman.h>
> >  #include <asm-generic/hugetlb_encode.h>
> >
> > -#define MREMAP_MAYMOVE       1
> > -#define MREMAP_FIXED 2
> > +#define MREMAP_MAYMOVE               1
> > +#define MREMAP_FIXED         2
> > +#define MREMAP_DONTUNMAP     4
> >
> >  #define OVERCOMMIT_GUESS             0
> >  #define OVERCOMMIT_ALWAYS            1
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index 122938dcec15..bf97c3eb538b 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> >  static unsigned long move_vma(struct vm_area_struct *vma,
> >               unsigned long old_addr, unsigned long old_len,
> >               unsigned long new_len, unsigned long new_addr,
> > -             bool *locked, struct vm_userfaultfd_ctx *uf,
> > -             struct list_head *uf_unmap)
> > +             bool *locked, unsigned long flags,
> > +             struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> >  {
> >       struct mm_struct *mm = vma->vm_mm;
> >       struct vm_area_struct *new_vma;
> > @@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> >       if (unlikely(vma->vm_flags & VM_PFNMAP))
> >               untrack_pfn_moved(vma);
> >
> > +     if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> > +             if (vm_flags & VM_ACCOUNT)
> > +                     vma->vm_flags |= VM_ACCOUNT;
> > +
> > +             goto out;
> > +     }
> > +
> >       if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
> >               /* OOM: unable to split vma, just get accounts right */
> >               vm_unacct_memory(excess >> PAGE_SHIFT);
> > @@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> >                       vma->vm_next->vm_flags |= VM_ACCOUNT;
> >       }
> >
> > +out:
> >       if (vm_flags & VM_LOCKED) {
> >               mm->locked_vm += new_len >> PAGE_SHIFT;
> >               *locked = true;
> > @@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
> >
> >  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> >               unsigned long new_addr, unsigned long new_len, bool *locked,
> > -             struct vm_userfaultfd_ctx *uf,
> > +             unsigned long flags, struct vm_userfaultfd_ctx *uf,
> >               struct list_head *uf_unmap_early,
> >               struct list_head *uf_unmap)
> >  {
> > @@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> >               old_len = new_len;
> >       }
> >
> > +     /*
> > +      * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
> > +      * check that we can expand by old_len and vma_to_resize will handle
> > +      * the vma growing.
> > +      */
> > +     if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
> > +                             vma->vm_flags, old_len >> PAGE_SHIFT))) {
>
> We received a Clang build report that vma is used uninitialized here
> (they aren't being publicly sent to LKML due to GCC vs Clang
> warning/error overlap):
>
> https://groups.google.com/d/msg/clang-built-linux/gE5wRaeHdSI/xVA0MBQVEgAJ
>
> Sure enough, vma is initialized first in the next block. Not sure if
> this section should be moved below that initialization or if something
> else should be done to resolve it but that dereference will obviously be
> fatal.
>
> Cheers,
> Nathan
>
> > +             ret = -ENOMEM;
> > +             goto out;
> > +     }
> > +
> >       vma = vma_to_resize(addr, old_len, new_len, &charged);
> >       if (IS_ERR(vma)) {
> >               ret = PTR_ERR(vma);
> > @@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> >       if (IS_ERR_VALUE(ret))
> >               goto out1;
> >
> > -     ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> > +     ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
> >                      uf_unmap);
> >       if (!(offset_in_page(ret)))
> >               goto out;
> > @@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> >       addr = untagged_addr(addr);
> >       new_addr = untagged_addr(new_addr);
> >
> > -     if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> > +     if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
> >               return ret;
> >
> >       if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
> >               return ret;
> >
> > +     if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
> > +             return ret;
> > +
> >       if (offset_in_page(addr))
> >               return ret;
> >
> > @@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> >
> >       if (flags & MREMAP_FIXED) {
> >               ret = mremap_to(addr, old_len, new_addr, new_len,
> > -                             &locked, &uf, &uf_unmap_early, &uf_unmap);
> > +                             &locked, flags, &uf, &uf_unmap_early,
> > +                             &uf_unmap);
> >               goto out;
> >       }
> >
> > @@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> >               }
> >
> >               ret = move_vma(vma, addr, old_len, new_len, new_addr,
> > -                            &locked, &uf, &uf_unmap);
> > +                            &locked, flags, &uf, &uf_unmap);
> >       }
> >  out:
> >       if (offset_in_page(ret)) {
> > --
> > 2.25.0.341.g760bfbb309-goog
> >
Florian Weimer Jan. 27, 2020, 10:13 a.m. UTC | #4
* Brian Geffon:

> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.  If a userfaultfd was watching
> the source, it will continue to watch the new mapping.  For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used. The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.

What will be the protection flags of the source mapping?  Will they
remain unchanged?  Or PROT_NONE?

Thanks,
Florian
Brian Geffon Jan. 27, 2020, 10:33 p.m. UTC | #5
Hi Florian,
copy_vma will make a copy of the existing VMA leaving the old VMA
unchanged, so the source keeps its existing protections, this is what
makes it very useful along with userfaultfd.

Thanks,
Brian


On Mon, Jan 27, 2020 at 2:13 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Brian Geffon:
>
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call.  If a userfaultfd was watching
> > the source, it will continue to watch the new mapping.  For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used. The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
>
> What will be the protection flags of the source mapping?  Will they
> remain unchanged?  Or PROT_NONE?
>
> Thanks,
> Florian
>
Brian Geffon Jan. 28, 2020, 1:35 a.m. UTC | #6
Hi Kirill,
Thanks for taking the time to look at this. I'll update the wording to
make it clear that MREMAP_FIXED is required with MREMAP_DONTUNMAP.

Regarding rmap, you're completely right I'm going to roll a new patch
which will call unlink_anon_vmas() to make sure the rmap is correct,
I'll also explicitly check that the vma is anonymous and not shared
returning EINVAL if not, how does that sound?

Brian

On Sun, Jan 26, 2020 at 2:06 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call.  If a userfaultfd was watching
> > the source, it will continue to watch the new mapping.  For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used.
>
> Implies? From code it looks like it requires MREMAP_FIXED. And
> MREMAP_FIXED requires MREMAP_MAYMOVE. That's strange flag chaining.
> I don't really see need in such dependencies. It maybe indeed implied, not
> requied.
>
> > The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
>
> Could you clarify rmap situation here? How the rmap hierarchy will look
> like before and after the operation. Can the new VMA merge with the old
> one? Sounds fishy to me.
>
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
>
> --
>  Kirill A. Shutemov
Kirill A . Shutemov Jan. 29, 2020, 10:46 a.m. UTC | #7
On Mon, Jan 27, 2020 at 05:35:40PM -0800, Brian Geffon wrote:
> Hi Kirill,
> Thanks for taking the time to look at this. I'll update the wording to
> make it clear that MREMAP_FIXED is required with MREMAP_DONTUNMAP.

I still think that chaining flags is strange. The new flag requires all
existing.

And based on the use case you probably don't really need 'fixed'
semantics all the time. The user should be fine with moving the mapping
*somewhere*, not neccessary to the given address.

BTW, name of the flag is confusing. My initial reaction was that it is
variant of MREMAP_FIXED that does't anything at the target address.
Like MAP_FIXED vs. MAP_FIXED_NOREPLACE.

Any better options for the flag name? (I have none)

> Regarding rmap, you're completely right I'm going to roll a new patch
> which will call unlink_anon_vmas() to make sure the rmap is correct,
> I'll also explicitly check that the vma is anonymous and not shared
> returning EINVAL if not, how does that sound?

Fine, I guess. But let me see the end result.
Florian Weimer Jan. 30, 2020, 12:19 p.m. UTC | #8
* Brian Geffon:

> Hi Florian,
> copy_vma will make a copy of the existing VMA leaving the old VMA
> unchanged, so the source keeps its existing protections, this is what
> makes it very useful along with userfaultfd.

I see.  On the other hand, it's impossible to get the PROT_NONE behavior
by a subsequent mprotect call because the mremap has to fail in some
cases in vm.overcommit_memory=2 mode.  But maybe that other behavior can
be provided with a different flag if it turns out to be useful in the
future.

Thanks,
Florian
Brian Geffon Feb. 1, 2020, 9:03 p.m. UTC | #9
Hi Kirill,

On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> And based on the use case you probably don't really need 'fixed'
> semantics all the time. The user should be fine with moving the mapping
> *somewhere*, not neccessary to the given address

This is true and and it simplifies things a bit as for the outlined
use cases the user would not be required to mmap the destination
before hand. Part of the reason I chose to require MREMAP_FIXED was
because mremap need not move the mapping if it can shrink/grow in
place and it seemed a bit awkward to have "MUSTMOVE" behavior without
MAP_FIXED. I'll make this change to drop the requirement on
MREMAP_FIXED in my next patch.

> BTW, name of the flag is confusing. My initial reaction was that it is
> variant of MREMAP_FIXED that does't anything at the target address.
> Like MAP_FIXED vs. MAP_FIXED_NOREPLACE.
>
> Any better options for the flag name? (I have none)

I see your point. Perhaps MREMAP_MOVEPAGES or MREMAP_KEEP_SOURCE? I
struggle to come up with a single name that encapsulates this behavior
but I'll try to think of other ideas before I mail the next patch.
Given that we will drop the requirement on MREMAP_FIXED, perhaps
MOVEPAGES is the better option as it captures that the mapping WILL be
moved?

Thanks again for taking the time to look at this.

Best,
Brian
Brian Geffon Feb. 2, 2020, 4:17 a.m. UTC | #10
On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
> Any better options for the flag name? (I have none)

The other option is that it's broken up into two new flags the first
MREMAP_MUSTMOVE which can be used regardless of whether or not you're
leaving the original mapping mapped. This would do exactly what it
describes: move the mapping to a new address with or without
MREMAP_FIXED, this keeps consistency with MAYMOVE.

The second flag would be the new MREMAP_DONTUNMAP flag which requires
MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.

What are your thoughts on this?
Kirill A . Shutemov Feb. 3, 2020, 1:09 p.m. UTC | #11
On Sun, Feb 02, 2020 at 05:17:53AM +0100, Brian Geffon wrote:
> On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> > Any better options for the flag name? (I have none)
> 
> The other option is that it's broken up into two new flags the first
> MREMAP_MUSTMOVE which can be used regardless of whether or not you're
> leaving the original mapping mapped. This would do exactly what it
> describes: move the mapping to a new address with or without
> MREMAP_FIXED, this keeps consistency with MAYMOVE.
> 
> The second flag would be the new MREMAP_DONTUNMAP flag which requires
> MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.
> 
> What are your thoughts on this?

Sounds reasonable.

MREMAP_DONTUNMAP doesn't really convey that you move pages to the new
mapping, but leave empty mapping behind. But I guess there's only so much
you can encode into the name. (Patch to the man page should do the rest)
Brian Geffon Feb. 7, 2020, 8:42 p.m. UTC | #12
Hi Kirill,
I started a new thread https://lkml.org/lkml/2020/2/7/640 for my v4
patch. But I wanted to quickly address your comments. Regarding the
concern around the rmap, no changes actually need to be made. If we
were to unlink_anon_vma(vma) and then set vma->anon_vma = NULL, that
would be fine but then as soon as there was a fault the same anon_vma
would be attached since it's a private anonymous mapping. So there is
really nothing to do regarding the rmap.

I considered the two flag approach but since I could not come up with
a concrete use case of MREMAP_MUSTMOVE I decided to just leave the
single MREMAP_DONTUNMAP flag, the two flag approach would be only for
clarifying the operations so I'm not sure it's worth it. (Still trying
to come up with a better name). But I've attached a man page diff to
the latest patch.

Thanks,
Brian

On Mon, Feb 3, 2020 at 5:09 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Sun, Feb 02, 2020 at 05:17:53AM +0100, Brian Geffon wrote:
> > On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > > Any better options for the flag name? (I have none)
> >
> > The other option is that it's broken up into two new flags the first
> > MREMAP_MUSTMOVE which can be used regardless of whether or not you're
> > leaving the original mapping mapped. This would do exactly what it
> > describes: move the mapping to a new address with or without
> > MREMAP_FIXED, this keeps consistency with MAYMOVE.
> >
> > The second flag would be the new MREMAP_DONTUNMAP flag which requires
> > MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.
> >
> > What are your thoughts on this?
>
> Sounds reasonable.
>
> MREMAP_DONTUNMAP doesn't really convey that you move pages to the new
> mapping, but leave empty mapping behind. But I guess there's only so much
> you can encode into the name. (Patch to the man page should do the rest)
>
> --
>  Kirill A. Shutemov
Kirill A . Shutemov Feb. 10, 2020, 10:35 a.m. UTC | #13
On Fri, Feb 07, 2020 at 12:42:15PM -0800, Brian Geffon wrote:
> Hi Kirill,
> I started a new thread https://lkml.org/lkml/2020/2/7/640 for my v4
> patch. But I wanted to quickly address your comments. Regarding the
> concern around the rmap, no changes actually need to be made. If we
> were to unlink_anon_vma(vma) and then set vma->anon_vma = NULL, that
> would be fine but then as soon as there was a fault the same anon_vma
> would be attached since it's a private anonymous mapping. So there is
> really nothing to do regarding the rmap.

Okay.

My worry was that we create a new VMA with the same anon_vma *and*
vm_pgoff, but I just realized we can do the same with the current
mremap(2) plus following mmap(2) in the old place. So it's not regression.

I guess we are fine here.

> I considered the two flag approach but since I could not come up with
> a concrete use case of MREMAP_MUSTMOVE I decided to just leave the
> single MREMAP_DONTUNMAP flag, the two flag approach would be only for
> clarifying the operations so I'm not sure it's worth it. (Still trying
> to come up with a better name). But I've attached a man page diff to
> the latest patch.

At least it doesn't have 'FIXED' semantics forced on user. It's fine with
me.
diff mbox series

Patch

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@ 
 #include <asm/mman.h>
 #include <asm-generic/hugetlb_encode.h>
 
-#define MREMAP_MAYMOVE	1
-#define MREMAP_FIXED	2
+#define MREMAP_MAYMOVE		1
+#define MREMAP_FIXED		2
+#define MREMAP_DONTUNMAP	4
 
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..bf97c3eb538b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@  unsigned long move_page_tables(struct vm_area_struct *vma,
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -408,6 +408,13 @@  static unsigned long move_vma(struct vm_area_struct *vma,
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);
 
+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT)
+			vma->vm_flags |= VM_ACCOUNT;
+
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -422,6 +429,7 @@  static unsigned long move_vma(struct vm_area_struct *vma,
 			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
+out:
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		*locked = true;
@@ -497,7 +505,7 @@  static struct vm_area_struct *vma_to_resize(unsigned long addr,
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
@@ -545,6 +553,17 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		old_len = new_len;
 	}
 
+	/*
+	 * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
+	 * check that we can expand by old_len and vma_to_resize will handle
+	 * the vma growing.
+	 */
+	if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
+				vma->vm_flags, old_len >> PAGE_SHIFT))) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	vma = vma_to_resize(addr, old_len, new_len, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
@@ -561,7 +580,7 @@  static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (IS_ERR_VALUE(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
 	if (!(offset_in_page(ret)))
 		goto out;
@@ -609,12 +628,15 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	addr = untagged_addr(addr);
 	new_addr = untagged_addr(new_addr);
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
 
+	if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
+		return ret;
+
 	if (offset_in_page(addr))
 		return ret;
 
@@ -634,7 +656,8 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}
 
@@ -712,7 +735,7 @@  SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 
 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-			       &locked, &uf, &uf_unmap);
+			       &locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {