diff mbox series

[v2,3/3] userfaultfd: use per-vma locks in userfaultfd operations

Message ID 20240129193512.123145-4-lokeshgidra@google.com (mailing list archive)
State Handled Elsewhere
Delegated to: Paul Moore
Headers show
Series per-vma locks in userfaultfd | expand

Commit Message

Lokesh Gidra Jan. 29, 2024, 7:35 p.m. UTC
All userfaultfd operations, except write-protect, opportunistically use
per-vma locks to lock vmas. If we fail then fall back to locking
mmap-lock in read-mode.

Write-protect operation requires mmap_lock as it iterates over multiple vmas.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              |  13 +--
 include/linux/userfaultfd_k.h |   5 +-
 mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
 3 files changed, 122 insertions(+), 71 deletions(-)

Comments

Liam R. Howlett Jan. 29, 2024, 8:36 p.m. UTC | #1
* Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> All userfaultfd operations, except write-protect, opportunistically use
> per-vma locks to lock vmas. If we fail then fall back to locking
> mmap-lock in read-mode.
> 
> Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
>  fs/userfaultfd.c              |  13 +--
>  include/linux/userfaultfd_k.h |   5 +-
>  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
>  3 files changed, 122 insertions(+), 71 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index c00a021bcce4..60dcfafdc11a 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(mm)) {
> -		mmap_read_lock(mm);
> -
> -		/* Re-check after taking map_changing_lock */
> -		down_read(&ctx->map_changing_lock);
> -		if (likely(!atomic_read(&ctx->mmap_changing)))
> -			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> -					 uffdio_move.len, uffdio_move.mode);
> -		else
> -			ret = -EAGAIN;
> -		up_read(&ctx->map_changing_lock);
> -		mmap_read_unlock(mm);
> +		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> +				 uffdio_move.len, uffdio_move.mode);
>  		mmput(mm);
>  	} else {
>  		return -ESRCH;
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 3210c3552976..05d59f74fc88 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
>  /* move_pages */
>  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
>  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 flags);
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 flags);
>  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
>  			struct vm_area_struct *dst_vma,
>  			struct vm_area_struct *src_vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 6e2ca04ab04d..d55bf18b80db 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -19,20 +19,39 @@
>  #include <asm/tlb.h>
>  #include "internal.h"
>  
> -static __always_inline
> -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> -				    unsigned long dst_start,
> -				    unsigned long len)
> +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> +{
> +	BUG_ON(!vma && !*mmap_locked);
> +
> +	if (*mmap_locked) {
> +		mmap_read_unlock(mm);
> +		*mmap_locked = false;
> +	} else
> +		vma_end_read(vma);

You are missing braces here.

This function is small so it could be inline, although I hope the
compiler would get that right for us.

I don't think this small helper is worth it, considering you are
altering a pointer in here, which makes things harder to follow (not to
mention the locking).  The only code that depends on this update is a
single place, which already assigns a custom variable after the function
return.

> +}
> +
> +/*
> + * Search for VMA and make sure it is stable either by locking it or taking
> + * mmap_lock.

This function returns something that isn't documented and also sets a
boolean which is passed in as a pointer which also is lacking from the
documentation.

> + */
> +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> +					    unsigned long dst_start,
> +					    unsigned long len,
> +					    bool *mmap_locked)
>  {
> +	struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);

lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
VMA.  It is not possible for dst_start to be outside the range.

> +	if (!dst_vma) {

BUG_ON(mmap_locked) ?

> +		mmap_read_lock(dst_mm);
> +		*mmap_locked = true;
> +		dst_vma = find_vma(dst_mm, dst_start);

find_vma() walks to dst_start and searches upwards from that address.
This is functionally different than what you have asked for above.  You
will not see an issue as you have coded it - but it may be suboptimal
since a start address lower than the VMA you are looking for can be
found... however, later you check the range falls between the dst_start
and dst_start + len.

If you expect the dst_start to always be within the VMA range and not
lower, then you should use vma_lookup().

If you want to search upwards from dst_start for a VMA then you should
move the range check below into this brace.

> +	}
> +
>  	/*
>  	 * Make sure that the dst range is both valid and fully within a
>  	 * single existing vma.
>  	 */
> -	struct vm_area_struct *dst_vma;
> -
> -	dst_vma = find_vma(dst_mm, dst_start);
>  	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> -		return NULL;
> +		goto unpin;
>  
>  	/*
>  	 * Check the vma is registered in uffd, this is required to
> @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
>  	 * time.
>  	 */
>  	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> -		return NULL;
> +		goto unpin;
>  
>  	return dst_vma;
> +
> +unpin:
> +	unpin_vma(dst_mm, dst_vma, mmap_locked);
> +	return NULL;
>  }
>  
>  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>  #ifdef CONFIG_HUGETLB_PAGE
>  /*
>   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> - * called with mmap_lock held, it will release mmap_lock before returning.
> + * called with either vma-lock or mmap_lock held, it will release the lock
> + * before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      struct userfaultfd_ctx *ctx,
> @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      unsigned long dst_start,
>  					      unsigned long src_start,
>  					      unsigned long len,
> -					      uffd_flags_t flags)
> +					      uffd_flags_t flags,
> +					      bool *mmap_locked)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
>  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
>  		up_read(&ctx->map_changing_lock);
> -		mmap_read_unlock(dst_mm);
> +		unpin_vma(dst_mm, dst_vma, mmap_locked);
>  		return -EINVAL;
>  	}
>  
> @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 */
>  	if (!dst_vma) {
>  		err = -ENOENT;
> -		dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> -			goto out_unlock;
> +		dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> +					       len, mmap_locked);
> +		if (!dst_vma)
> +			goto out;
> +		if (!is_vm_hugetlb_page(dst_vma))
> +			goto out_unlock_vma;
>  
>  		err = -EINVAL;
>  		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> +			goto out_unlock_vma;
> +
> +		/*
> +		 * If memory mappings are changing because of non-cooperative
> +		 * operation (e.g. mremap) running in parallel, bail out and
> +		 * request the user to retry later
> +		 */
> +		down_read(&ctx->map_changing_lock);
> +		err = -EAGAIN;
> +		if (atomic_read(&ctx->mmap_changing))
>  			goto out_unlock;
>  
>  		vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  		if (unlikely(err == -ENOENT)) {
>  			up_read(&ctx->map_changing_lock);
> -			mmap_read_unlock(dst_mm);
> +			unpin_vma(dst_mm, dst_vma, mmap_locked);
>  			BUG_ON(!folio);
>  
>  			err = copy_folio_from_user(folio,
> @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				err = -EFAULT;
>  				goto out;
>  			}
> -			mmap_read_lock(dst_mm);
> -			down_read(&ctx->map_changing_lock);
> -			/*
> -			 * If memory mappings are changing because of non-cooperative
> -			 * operation (e.g. mremap) running in parallel, bail out and
> -			 * request the user to retry later
> -			 */
> -			if (atomic_read(ctx->mmap_changing)) {
> -				err = -EAGAIN;
> -				break;
> -			}

... Okay, this is where things get confusing.

How about this: Don't do this locking/boolean dance.

Instead, do something like this:
In mm/memory.c, below lock_vma_under_rcu(), but something like this

struct vm_area_struct *lock_vma(struct mm_struct *mm,
	unsigned long addr))	/* or some better name.. */
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, addr);

	if (vma)
		return vma;

	mmap_read_lock(mm);
	vma = lookup_vma(mm, addr);
	if (vma)
		vma_start_read(vma); /* Won't fail */

	mmap_read_unlock(mm);
	return vma;
}

Now, we know we have a vma that's vma locked if there is a vma.  The vma
won't go away - you have it locked.  The mmap lock is held for even
less time for your worse case, and the code gets easier to follow.

Once you are done with the vma do a vma_end_read(vma).  Don't forget to
do this!

Now the comment above such a function should state that the vma needs to
be vma_end_read(vma), or that could go undetected..  It might be worth
adding a unlock_vma() counterpart to vma_end_read(vma) even.


>  
>  			dst_vma = NULL;
>  			goto retry;
> @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> -	mmap_read_unlock(dst_mm);
> +out_unlock_vma:
> +	unpin_vma(dst_mm, dst_vma, mmap_locked);
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
>  				    unsigned long dst_start,
>  				    unsigned long src_start,
>  				    unsigned long len,
> -				    uffd_flags_t flags);
> +				    uffd_flags_t flags,
> +				    bool *mmap_locked);

Just a thought, tabbing in twice for each argument would make this more
compact.


>  #endif /* CONFIG_HUGETLB_PAGE */
>  
>  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	unsigned long src_addr, dst_addr;
>  	long copied;
>  	struct folio *folio;
> +	bool mmap_locked = false;
>  
>  	/*
>  	 * Sanitize the command parameters:
> @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	copied = 0;
>  	folio = NULL;
>  retry:
> -	mmap_read_lock(dst_mm);
> +	/*
> +	 * Make sure the vma is not shared, that the dst range is
> +	 * both valid and fully within a single existing vma.
> +	 */
> +	err = -ENOENT;
> +	dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> +	if (!dst_vma)
> +		goto out;
>  
>  	/*
>  	 * If memory mappings are changing because of non-cooperative
> @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
> -	/*
> -	 * Make sure the vma is not shared, that the dst range is
> -	 * both valid and fully within a single existing vma.
> -	 */
> -	err = -ENOENT;
> -	dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -	if (!dst_vma)
> -		goto out_unlock;
> -
>  	err = -EINVAL;
>  	/*
>  	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	 * If this is a HUGETLB vma, pass off to appropriate routine
>  	 */
>  	if (is_vm_hugetlb_page(dst_vma))
> -		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> -					     src_start, len, flags);
> +		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> +					     len, flags, &mmap_locked);
>  
>  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>  		goto out_unlock;
> @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  			void *kaddr;
>  
>  			up_read(&ctx->map_changing_lock);
> -			mmap_read_unlock(dst_mm);
> +			unpin_vma(dst_mm, dst_vma, &mmap_locked);
> +
>  			BUG_ON(!folio);
>  
>  			kaddr = kmap_local_folio(folio, 0);
> @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> -	mmap_read_unlock(dst_mm);
> +	unpin_vma(dst_mm, dst_vma, &mmap_locked);
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * @len: length of the virtual memory range
>   * @mode: flags from uffdio_move.mode
>   *
> - * Must be called with mmap_lock held for read.
> - *
>   * move_pages() remaps arbitrary anonymous pages atomically in zero
>   * copy. It only works on non shared anonymous pages because those can
>   * be relocated without generating non linear anon_vmas in the rmap
> @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * could be obtained. This is the only additional complexity added to
>   * the rmap code to provide this anonymous page remapping functionality.
>   */
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 mode)
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 mode)
>  {
> +	struct mm_struct *mm = ctx->mm;
>  	struct vm_area_struct *src_vma, *dst_vma;
>  	unsigned long src_addr, dst_addr;
>  	pmd_t *src_pmd, *dst_pmd;
>  	long err = -EINVAL;
>  	ssize_t moved = 0;
> +	bool mmap_locked = false;
>  
>  	/* Sanitize the command parameters. */
>  	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  	    WARN_ON_ONCE(dst_start + len <= dst_start))
>  		goto out;

Ah, is this safe for rmap?  I think you need to leave this read lock.

>  
> +	dst_vma = NULL;
> +	src_vma = lock_vma_under_rcu(mm, src_start);
> +	if (src_vma) {
> +		dst_vma = lock_vma_under_rcu(mm, dst_start);
> +		if (!dst_vma)
> +			vma_end_read(src_vma);
> +	}
> +
> +	/* If we failed to lock both VMAs, fall back to mmap_lock */
> +	if (!dst_vma) {
> +		mmap_read_lock(mm);
> +		mmap_locked = true;
> +		src_vma = find_vma(mm, src_start);
> +		if (!src_vma)
> +			goto out_unlock_mmap;
> +		dst_vma = find_vma(mm, dst_start);

Again, there is a difference in how find_vma and lock_vam_under_rcu
works.

> +		if (!dst_vma)
> +			goto out_unlock_mmap;
> +	}
> +
> +	/* Re-check after taking map_changing_lock */
> +	down_read(&ctx->map_changing_lock);
> +	if (likely(atomic_read(&ctx->mmap_changing))) {
> +		err = -EAGAIN;
> +		goto out_unlock;
> +	}
>  	/*
>  	 * Make sure the vma is not shared, that the src and dst remap
>  	 * ranges are both valid and fully within a single existing
>  	 * vma.
>  	 */
> -	src_vma = find_vma(mm, src_start);
> -	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> -		goto out;
> +	if (src_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
>  	if (src_start < src_vma->vm_start ||
>  	    src_start + len > src_vma->vm_end)
> -		goto out;
> +		goto out_unlock;
>  
> -	dst_vma = find_vma(mm, dst_start);
> -	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> -		goto out;
> +	if (dst_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
>  	if (dst_start < dst_vma->vm_start ||
>  	    dst_start + len > dst_vma->vm_end)
> -		goto out;
> +		goto out_unlock;
>  
>  	err = validate_move_areas(ctx, src_vma, dst_vma);
>  	if (err)
> -		goto out;
> +		goto out_unlock;
>  
>  	for (src_addr = src_start, dst_addr = dst_start;
>  	     src_addr < src_start + len;) {
> @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  		moved += step_size;
>  	}
>  
> +out_unlock:
> +	up_read(&ctx->map_changing_lock);
> +out_unlock_mmap:
> +	if (mmap_locked)
> +		mmap_read_unlock(mm);
> +	else {
> +		vma_end_read(dst_vma);
> +		vma_end_read(src_vma);
> +	}
>  out:
>  	VM_WARN_ON(moved < 0);
>  	VM_WARN_ON(err > 0);
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 
>
Suren Baghdasaryan Jan. 29, 2024, 8:52 p.m. UTC | #2
On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > All userfaultfd operations, except write-protect, opportunistically use
> > per-vma locks to lock vmas. If we fail then fall back to locking
> > mmap-lock in read-mode.
> >
> > Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> >
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> >  fs/userfaultfd.c              |  13 +--
> >  include/linux/userfaultfd_k.h |   5 +-
> >  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
> >  3 files changed, 122 insertions(+), 71 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index c00a021bcce4..60dcfafdc11a 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> >               return -EINVAL;
> >
> >       if (mmget_not_zero(mm)) {
> > -             mmap_read_lock(mm);
> > -
> > -             /* Re-check after taking map_changing_lock */
> > -             down_read(&ctx->map_changing_lock);
> > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > -                                      uffdio_move.len, uffdio_move.mode);
> > -             else
> > -                     ret = -EAGAIN;
> > -             up_read(&ctx->map_changing_lock);
> > -             mmap_read_unlock(mm);
> > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > +                              uffdio_move.len, uffdio_move.mode);
> >               mmput(mm);
> >       } else {
> >               return -ESRCH;
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 3210c3552976..05d59f74fc88 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> >  /* move_pages */
> >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 flags);
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 flags);
> >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> >                       struct vm_area_struct *dst_vma,
> >                       struct vm_area_struct *src_vma,
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 6e2ca04ab04d..d55bf18b80db 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -19,20 +19,39 @@
> >  #include <asm/tlb.h>
> >  #include "internal.h"
> >
> > -static __always_inline
> > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > -                                 unsigned long dst_start,
> > -                                 unsigned long len)
> > +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> > +{
> > +     BUG_ON(!vma && !*mmap_locked);
> > +
> > +     if (*mmap_locked) {
> > +             mmap_read_unlock(mm);
> > +             *mmap_locked = false;
> > +     } else
> > +             vma_end_read(vma);
>
> You are missing braces here.
>
> This function is small so it could be inline, although I hope the
> compiler would get that right for us.
>
> I don't think this small helper is worth it, considering you are
> altering a pointer in here, which makes things harder to follow (not to
> mention the locking).  The only code that depends on this update is a
> single place, which already assigns a custom variable after the function
> return.
>
> > +}
> > +
> > +/*
> > + * Search for VMA and make sure it is stable either by locking it or taking
> > + * mmap_lock.
>
> This function returns something that isn't documented and also sets a
> boolean which is passed in as a pointer which also is lacking from the
> documentation.
>
> > + */
> > +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> > +                                         unsigned long dst_start,
> > +                                         unsigned long len,
> > +                                         bool *mmap_locked)
> >  {
> > +     struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
>
> lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
> VMA.  It is not possible for dst_start to be outside the range.
>
> > +     if (!dst_vma) {
>
> BUG_ON(mmap_locked) ?
>
> > +             mmap_read_lock(dst_mm);
> > +             *mmap_locked = true;
> > +             dst_vma = find_vma(dst_mm, dst_start);
>
> find_vma() walks to dst_start and searches upwards from that address.
> This is functionally different than what you have asked for above.  You
> will not see an issue as you have coded it - but it may be suboptimal
> since a start address lower than the VMA you are looking for can be
> found... however, later you check the range falls between the dst_start
> and dst_start + len.
>
> If you expect the dst_start to always be within the VMA range and not
> lower, then you should use vma_lookup().
>
> If you want to search upwards from dst_start for a VMA then you should
> move the range check below into this brace.
>
> > +     }
> > +
> >       /*
> >        * Make sure that the dst range is both valid and fully within a
> >        * single existing vma.
> >        */
> > -     struct vm_area_struct *dst_vma;
> > -
> > -     dst_vma = find_vma(dst_mm, dst_start);
> >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > -             return NULL;
> > +             goto unpin;
> >
> >       /*
> >        * Check the vma is registered in uffd, this is required to
> > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> >        * time.
> >        */
> >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > -             return NULL;
> > +             goto unpin;
> >
> >       return dst_vma;
> > +
> > +unpin:
> > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > +     return NULL;
> >  }
> >
> >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  #ifdef CONFIG_HUGETLB_PAGE
> >  /*
> >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > - * called with mmap_lock held, it will release mmap_lock before returning.
> > + * called with either vma-lock or mmap_lock held, it will release the lock
> > + * before returning.
> >   */
> >  static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             struct userfaultfd_ctx *ctx,
> > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             unsigned long dst_start,
> >                                             unsigned long src_start,
> >                                             unsigned long len,
> > -                                           uffd_flags_t flags)
> > +                                           uffd_flags_t flags,
> > +                                           bool *mmap_locked)
> >  {
> >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        */
> >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> >               up_read(&ctx->map_changing_lock);
> > -             mmap_read_unlock(dst_mm);
> > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> >               return -EINVAL;
> >       }
> >
> > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        */
> >       if (!dst_vma) {
> >               err = -ENOENT;
> > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > -                     goto out_unlock;
> > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > +                                            len, mmap_locked);
> > +             if (!dst_vma)
> > +                     goto out;
> > +             if (!is_vm_hugetlb_page(dst_vma))
> > +                     goto out_unlock_vma;
> >
> >               err = -EINVAL;
> >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > +                     goto out_unlock_vma;
> > +
> > +             /*
> > +              * If memory mappings are changing because of non-cooperative
> > +              * operation (e.g. mremap) running in parallel, bail out and
> > +              * request the user to retry later
> > +              */
> > +             down_read(&ctx->map_changing_lock);
> > +             err = -EAGAIN;
> > +             if (atomic_read(&ctx->mmap_changing))
> >                       goto out_unlock;
> >
> >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >               if (unlikely(err == -ENOENT)) {
> >                       up_read(&ctx->map_changing_lock);
> > -                     mmap_read_unlock(dst_mm);
> > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> >                       BUG_ON(!folio);
> >
> >                       err = copy_folio_from_user(folio,
> > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                               err = -EFAULT;
> >                               goto out;
> >                       }
> > -                     mmap_read_lock(dst_mm);
> > -                     down_read(&ctx->map_changing_lock);
> > -                     /*
> > -                      * If memory mappings are changing because of non-cooperative
> > -                      * operation (e.g. mremap) running in parallel, bail out and
> > -                      * request the user to retry later
> > -                      */
> > -                     if (atomic_read(ctx->mmap_changing)) {
> > -                             err = -EAGAIN;
> > -                             break;
> > -                     }
>
> ... Okay, this is where things get confusing.
>
> How about this: Don't do this locking/boolean dance.
>
> Instead, do something like this:
> In mm/memory.c, below lock_vma_under_rcu(), but something like this
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         vma = lookup_vma(mm, addr);
>         if (vma)
>                 vma_start_read(vma); /* Won't fail */

Please don't assume vma_start_read() won't fail even when you have
mmap_read_lock(). See the comment in vma_start_read() about the
possibility of an overflow producing false negatives.

>
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> Now, we know we have a vma that's vma locked if there is a vma.  The vma
> won't go away - you have it locked.  The mmap lock is held for even
> less time for your worse case, and the code gets easier to follow.
>
> Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> do this!
>
> Now the comment above such a function should state that the vma needs to
> be vma_end_read(vma), or that could go undetected..  It might be worth
> adding a unlock_vma() counterpart to vma_end_read(vma) even.

Locking VMA while holding mmap_read_lock is an interesting usage
pattern I haven't seen yet. I think this should work quite well!

>
>
> >
> >                       dst_vma = NULL;
> >                       goto retry;
> > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > -     mmap_read_unlock(dst_mm);
> > +out_unlock_vma:
> > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> >                                   unsigned long dst_start,
> >                                   unsigned long src_start,
> >                                   unsigned long len,
> > -                                 uffd_flags_t flags);
> > +                                 uffd_flags_t flags,
> > +                                 bool *mmap_locked);
>
> Just a thought, tabbing in twice for each argument would make this more
> compact.
>
>
> >  #endif /* CONFIG_HUGETLB_PAGE */
> >
> >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       unsigned long src_addr, dst_addr;
> >       long copied;
> >       struct folio *folio;
> > +     bool mmap_locked = false;
> >
> >       /*
> >        * Sanitize the command parameters:
> > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       copied = 0;
> >       folio = NULL;
> >  retry:
> > -     mmap_read_lock(dst_mm);
> > +     /*
> > +      * Make sure the vma is not shared, that the dst range is
> > +      * both valid and fully within a single existing vma.
> > +      */
> > +     err = -ENOENT;
> > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > +     if (!dst_vma)
> > +             goto out;
> >
> >       /*
> >        * If memory mappings are changing because of non-cooperative
> > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       if (atomic_read(&ctx->mmap_changing))
> >               goto out_unlock;
> >
> > -     /*
> > -      * Make sure the vma is not shared, that the dst range is
> > -      * both valid and fully within a single existing vma.
> > -      */
> > -     err = -ENOENT;
> > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -     if (!dst_vma)
> > -             goto out_unlock;
> > -
> >       err = -EINVAL;
> >       /*
> >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >        * If this is a HUGETLB vma, pass off to appropriate routine
> >        */
> >       if (is_vm_hugetlb_page(dst_vma))
> > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > -                                          src_start, len, flags);
> > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > +                                          len, flags, &mmap_locked);
> >
> >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> >               goto out_unlock;
> > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >                       void *kaddr;
> >
> >                       up_read(&ctx->map_changing_lock);
> > -                     mmap_read_unlock(dst_mm);
> > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > +
> >                       BUG_ON(!folio);
> >
> >                       kaddr = kmap_local_folio(folio, 0);
> > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > -     mmap_read_unlock(dst_mm);
> > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * @len: length of the virtual memory range
> >   * @mode: flags from uffdio_move.mode
> >   *
> > - * Must be called with mmap_lock held for read.
> > - *
> >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> >   * copy. It only works on non shared anonymous pages because those can
> >   * be relocated without generating non linear anon_vmas in the rmap
> > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * could be obtained. This is the only additional complexity added to
> >   * the rmap code to provide this anonymous page remapping functionality.
> >   */
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 mode)
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 mode)
> >  {
> > +     struct mm_struct *mm = ctx->mm;
> >       struct vm_area_struct *src_vma, *dst_vma;
> >       unsigned long src_addr, dst_addr;
> >       pmd_t *src_pmd, *dst_pmd;
> >       long err = -EINVAL;
> >       ssize_t moved = 0;
> > +     bool mmap_locked = false;
> >
> >       /* Sanitize the command parameters. */
> >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >           WARN_ON_ONCE(dst_start + len <= dst_start))
> >               goto out;
>
> Ah, is this safe for rmap?  I think you need to leave this read lock.
>
> >
> > +     dst_vma = NULL;
> > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > +     if (src_vma) {
> > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > +             if (!dst_vma)
> > +                     vma_end_read(src_vma);
> > +     }
> > +
> > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > +     if (!dst_vma) {
> > +             mmap_read_lock(mm);
> > +             mmap_locked = true;
> > +             src_vma = find_vma(mm, src_start);
> > +             if (!src_vma)
> > +                     goto out_unlock_mmap;
> > +             dst_vma = find_vma(mm, dst_start);
>
> Again, there is a difference in how find_vma and lock_vam_under_rcu
> works.
>
> > +             if (!dst_vma)
> > +                     goto out_unlock_mmap;
> > +     }
> > +
> > +     /* Re-check after taking map_changing_lock */
> > +     down_read(&ctx->map_changing_lock);
> > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > +             err = -EAGAIN;
> > +             goto out_unlock;
> > +     }
> >       /*
> >        * Make sure the vma is not shared, that the src and dst remap
> >        * ranges are both valid and fully within a single existing
> >        * vma.
> >        */
> > -     src_vma = find_vma(mm, src_start);
> > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > +     if (src_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> >       if (src_start < src_vma->vm_start ||
> >           src_start + len > src_vma->vm_end)
> > -             goto out;
> > +             goto out_unlock;
> >
> > -     dst_vma = find_vma(mm, dst_start);
> > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > +     if (dst_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> >       if (dst_start < dst_vma->vm_start ||
> >           dst_start + len > dst_vma->vm_end)
> > -             goto out;
> > +             goto out_unlock;
> >
> >       err = validate_move_areas(ctx, src_vma, dst_vma);
> >       if (err)
> > -             goto out;
> > +             goto out_unlock;
> >
> >       for (src_addr = src_start, dst_addr = dst_start;
> >            src_addr < src_start + len;) {
> > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >               moved += step_size;
> >       }
> >
> > +out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> > +out_unlock_mmap:
> > +     if (mmap_locked)
> > +             mmap_read_unlock(mm);
> > +     else {
> > +             vma_end_read(dst_vma);
> > +             vma_end_read(src_vma);
> > +     }
> >  out:
> >       VM_WARN_ON(moved < 0);
> >       VM_WARN_ON(err > 0);
> > --
> > 2.43.0.429.g432eaa2c6b-goog
> >
> >
Liam R. Howlett Jan. 29, 2024, 9:18 p.m. UTC | #3
* Suren Baghdasaryan <surenb@google.com> [240129 15:53]:
> On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett

...

> > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >               if (unlikely(err == -ENOENT)) {
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >                       BUG_ON(!folio);
> > >
> > >                       err = copy_folio_from_user(folio,
> > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                               err = -EFAULT;
> > >                               goto out;
> > >                       }
> > > -                     mmap_read_lock(dst_mm);
> > > -                     down_read(&ctx->map_changing_lock);
> > > -                     /*
> > > -                      * If memory mappings are changing because of non-cooperative
> > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > -                      * request the user to retry later
> > > -                      */
> > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > -                             err = -EAGAIN;
> > > -                             break;
> > > -                     }
> >
> > ... Okay, this is where things get confusing.
> >
> > How about this: Don't do this locking/boolean dance.
> >
> > Instead, do something like this:
> > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> >                 vma_start_read(vma); /* Won't fail */
> 
> Please don't assume vma_start_read() won't fail even when you have
> mmap_read_lock(). See the comment in vma_start_read() about the
> possibility of an overflow producing false negatives.

I did say something *like* this...

Thanks for catching my mistake.

> 
> >
> >         mmap_read_unlock(mm);
> >         return vma;
> > }
> >
> > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > won't go away - you have it locked.  The mmap lock is held for even
> > less time for your worse case, and the code gets easier to follow.
> >
> > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > do this!
> >
> > Now the comment above such a function should state that the vma needs to
> > be vma_end_read(vma), or that could go undetected..  It might be worth
> > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> 
> Locking VMA while holding mmap_read_lock is an interesting usage
> pattern I haven't seen yet. I think this should work quite well!

What concerns me is this working too well - for instance someone *ahem*
binder *ahem* forever and always isolating their VMA, or someone
forgetting to unlock and never noticing.

vma->vm_lock->lock being locked should be caught by lockdep on exit
though.

...

Thanks,
Liam
Lokesh Gidra Jan. 30, 2024, 12:28 a.m. UTC | #4
On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 29, 2024 at 12:36 PM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 14:35]:
> > > All userfaultfd operations, except write-protect, opportunistically use
> > > per-vma locks to lock vmas. If we fail then fall back to locking
> > > mmap-lock in read-mode.
> > >
> > > Write-protect operation requires mmap_lock as it iterates over multiple vmas.
> > >
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > >  fs/userfaultfd.c              |  13 +--
> > >  include/linux/userfaultfd_k.h |   5 +-
> > >  mm/userfaultfd.c              | 175 +++++++++++++++++++++++-----------
> > >  3 files changed, 122 insertions(+), 71 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index c00a021bcce4..60dcfafdc11a 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> > >               return -EINVAL;
> > >
> > >       if (mmget_not_zero(mm)) {
> > > -             mmap_read_lock(mm);
> > > -
> > > -             /* Re-check after taking map_changing_lock */
> > > -             down_read(&ctx->map_changing_lock);
> > > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > > -                                      uffdio_move.len, uffdio_move.mode);
> > > -             else
> > > -                     ret = -EAGAIN;
> > > -             up_read(&ctx->map_changing_lock);
> > > -             mmap_read_unlock(mm);
> > > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > > +                              uffdio_move.len, uffdio_move.mode);
> > >               mmput(mm);
> > >       } else {
> > >               return -ESRCH;
> > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > > index 3210c3552976..05d59f74fc88 100644
> > > --- a/include/linux/userfaultfd_k.h
> > > +++ b/include/linux/userfaultfd_k.h
> > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> > >  /* move_pages */
> > >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> > >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 flags);
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 flags);
> > >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> > >                       struct vm_area_struct *dst_vma,
> > >                       struct vm_area_struct *src_vma,
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 6e2ca04ab04d..d55bf18b80db 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -19,20 +19,39 @@
> > >  #include <asm/tlb.h>
> > >  #include "internal.h"
> > >
> > > -static __always_inline
> > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > -                                 unsigned long dst_start,
> > > -                                 unsigned long len)
> > > +void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
> > > +{
> > > +     BUG_ON(!vma && !*mmap_locked);
> > > +
> > > +     if (*mmap_locked) {
> > > +             mmap_read_unlock(mm);
> > > +             *mmap_locked = false;
> > > +     } else
> > > +             vma_end_read(vma);
> >
> > You are missing braces here.
> >
> > This function is small so it could be inline, although I hope the
> > compiler would get that right for us.
> >
> > I don't think this small helper is worth it, considering you are
> > altering a pointer in here, which makes things harder to follow (not to
> > mention the locking).  The only code that depends on this update is a
> > single place, which already assigns a custom variable after the function
> > return.
> >
Sure. I'll replace unpin_vma() calls with inlined unlocking.
> > > +}
> > > +
> > > +/*
> > > + * Search for VMA and make sure it is stable either by locking it or taking
> > > + * mmap_lock.
> >
> > This function returns something that isn't documented and also sets a
> > boolean which is passed in as a pointer which also is lacking from the
> > documentation.
> >

I'll fix the comment in next version.
> > > + */
> > > +struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
> > > +                                         unsigned long dst_start,
> > > +                                         unsigned long len,
> > > +                                         bool *mmap_locked)
> > >  {
> > > +     struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
> >
> > lock_vma_under_rcu() calls mas_walk(), which goes to dst_start for the
> > VMA.  It is not possible for dst_start to be outside the range.
> >
> > > +     if (!dst_vma) {
> >
> > BUG_ON(mmap_locked) ?
> >
> > > +             mmap_read_lock(dst_mm);
> > > +             *mmap_locked = true;
> > > +             dst_vma = find_vma(dst_mm, dst_start);
> >
> > find_vma() walks to dst_start and searches upwards from that address.
> > This is functionally different than what you have asked for above.  You
> > will not see an issue as you have coded it - but it may be suboptimal
> > since a start address lower than the VMA you are looking for can be
> > found... however, later you check the range falls between the dst_start
> > and dst_start + len.
> >
> > If you expect the dst_start to always be within the VMA range and not
> > lower, then you should use vma_lookup().
> >

Thanks for informing. So vma_lookup() returns the vma for any address
within [vma->vm_start, vma->vm_end)?
> > If you want to search upwards from dst_start for a VMA then you should
> > move the range check below into this brace.
> >
> > > +     }
> > > +
> > >       /*
> > >        * Make sure that the dst range is both valid and fully within a
> > >        * single existing vma.
> > >        */
> > > -     struct vm_area_struct *dst_vma;
> > > -
> > > -     dst_vma = find_vma(dst_mm, dst_start);
> > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > -             return NULL;
> > > +             goto unpin;
> > >
> > >       /*
> > >        * Check the vma is registered in uffd, this is required to
> > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > >        * time.
> > >        */
> > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > -             return NULL;
> > > +             goto unpin;
> > >
> > >       return dst_vma;
> > > +
> > > +unpin:
> > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > +     return NULL;
> > >  }
> > >
> > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > >  #ifdef CONFIG_HUGETLB_PAGE
> > >  /*
> > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > + * before returning.
> > >   */
> > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             struct userfaultfd_ctx *ctx,
> > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             unsigned long dst_start,
> > >                                             unsigned long src_start,
> > >                                             unsigned long len,
> > > -                                           uffd_flags_t flags)
> > > +                                           uffd_flags_t flags,
> > > +                                           bool *mmap_locked)
> > >  {
> > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        */
> > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > >               up_read(&ctx->map_changing_lock);
> > > -             mmap_read_unlock(dst_mm);
> > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >               return -EINVAL;
> > >       }
> > >
> > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        */
> > >       if (!dst_vma) {
> > >               err = -ENOENT;
> > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > -                     goto out_unlock;
> > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > +                                            len, mmap_locked);
> > > +             if (!dst_vma)
> > > +                     goto out;
> > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > +                     goto out_unlock_vma;
> > >
> > >               err = -EINVAL;
> > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > +                     goto out_unlock_vma;
> > > +
> > > +             /*
> > > +              * If memory mappings are changing because of non-cooperative
> > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > +              * request the user to retry later
> > > +              */
> > > +             down_read(&ctx->map_changing_lock);
> > > +             err = -EAGAIN;
> > > +             if (atomic_read(&ctx->mmap_changing))
> > >                       goto out_unlock;
> > >
> > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >               if (unlikely(err == -ENOENT)) {
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >                       BUG_ON(!folio);
> > >
> > >                       err = copy_folio_from_user(folio,
> > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                               err = -EFAULT;
> > >                               goto out;
> > >                       }
> > > -                     mmap_read_lock(dst_mm);
> > > -                     down_read(&ctx->map_changing_lock);
> > > -                     /*
> > > -                      * If memory mappings are changing because of non-cooperative
> > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > -                      * request the user to retry later
> > > -                      */
> > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > -                             err = -EAGAIN;
> > > -                             break;
> > > -                     }
> >
> > ... Okay, this is where things get confusing.
> >
> > How about this: Don't do this locking/boolean dance.
> >
> > Instead, do something like this:
> > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> >                 vma_start_read(vma); /* Won't fail */
>
> Please don't assume vma_start_read() won't fail even when you have
> mmap_read_lock(). See the comment in vma_start_read() about the
> possibility of an overflow producing false negatives.
>
> >
> >         mmap_read_unlock(mm);
> >         return vma;
> > }
> >
> > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > won't go away - you have it locked.  The mmap lock is held for even
> > less time for your worse case, and the code gets easier to follow.

Your suggestion is definitely simpler and easier to follow, but due to
the overflow situation that Suren pointed out, I would still need to
keep the locking/boolean dance, no? IIUC, even if I were to return
EAGAIN to the userspace, there is no guarantee that subsequent ioctls
on the same vma will succeed due to the same overflow, until someone
acquires and releases mmap_lock in write-mode.
Also, sometimes it seems insufficient whether we managed to lock vma
or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
anonymous vma) exists. If not then it bails out.
So it seems to me that we have to provide some fall back in
userfaultfd operations which executes with mmap_lock in read-mode.
> >
> > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > do this!
> >
> > Now the comment above such a function should state that the vma needs to
> > be vma_end_read(vma), or that could go undetected..  It might be worth
> > adding a unlock_vma() counterpart to vma_end_read(vma) even.
>
> Locking VMA while holding mmap_read_lock is an interesting usage
> pattern I haven't seen yet. I think this should work quite well!
>
> >
> >
> > >
> > >                       dst_vma = NULL;
> > >                       goto retry;
> > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > -     mmap_read_unlock(dst_mm);
> > > +out_unlock_vma:
> > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > >                                   unsigned long dst_start,
> > >                                   unsigned long src_start,
> > >                                   unsigned long len,
> > > -                                 uffd_flags_t flags);
> > > +                                 uffd_flags_t flags,
> > > +                                 bool *mmap_locked);
> >
> > Just a thought, tabbing in twice for each argument would make this more
> > compact.
> >
> >
> > >  #endif /* CONFIG_HUGETLB_PAGE */
> > >
> > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       unsigned long src_addr, dst_addr;
> > >       long copied;
> > >       struct folio *folio;
> > > +     bool mmap_locked = false;
> > >
> > >       /*
> > >        * Sanitize the command parameters:
> > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       copied = 0;
> > >       folio = NULL;
> > >  retry:
> > > -     mmap_read_lock(dst_mm);
> > > +     /*
> > > +      * Make sure the vma is not shared, that the dst range is
> > > +      * both valid and fully within a single existing vma.
> > > +      */
> > > +     err = -ENOENT;
> > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > +     if (!dst_vma)
> > > +             goto out;
> > >
> > >       /*
> > >        * If memory mappings are changing because of non-cooperative
> > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       if (atomic_read(&ctx->mmap_changing))
> > >               goto out_unlock;
> > >
> > > -     /*
> > > -      * Make sure the vma is not shared, that the dst range is
> > > -      * both valid and fully within a single existing vma.
> > > -      */
> > > -     err = -ENOENT;
> > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -     if (!dst_vma)
> > > -             goto out_unlock;
> > > -
> > >       err = -EINVAL;
> > >       /*
> > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > >        */
> > >       if (is_vm_hugetlb_page(dst_vma))
> > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > -                                          src_start, len, flags);
> > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > +                                          len, flags, &mmap_locked);
> > >
> > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > >               goto out_unlock;
> > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >                       void *kaddr;
> > >
> > >                       up_read(&ctx->map_changing_lock);
> > > -                     mmap_read_unlock(dst_mm);
> > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > +
> > >                       BUG_ON(!folio);
> > >
> > >                       kaddr = kmap_local_folio(folio, 0);
> > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > -     mmap_read_unlock(dst_mm);
> > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * @len: length of the virtual memory range
> > >   * @mode: flags from uffdio_move.mode
> > >   *
> > > - * Must be called with mmap_lock held for read.
> > > - *
> > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > >   * copy. It only works on non shared anonymous pages because those can
> > >   * be relocated without generating non linear anon_vmas in the rmap
> > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * could be obtained. This is the only additional complexity added to
> > >   * the rmap code to provide this anonymous page remapping functionality.
> > >   */
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 mode)
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > >  {
> > > +     struct mm_struct *mm = ctx->mm;
> > >       struct vm_area_struct *src_vma, *dst_vma;
> > >       unsigned long src_addr, dst_addr;
> > >       pmd_t *src_pmd, *dst_pmd;
> > >       long err = -EINVAL;
> > >       ssize_t moved = 0;
> > > +     bool mmap_locked = false;
> > >
> > >       /* Sanitize the command parameters. */
> > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > >               goto out;
> >
> > Ah, is this safe for rmap?  I think you need to leave this read lock.
> >
I didn't fully understand you here.
> > >
> > > +     dst_vma = NULL;
> > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > +     if (src_vma) {
> > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > +             if (!dst_vma)
> > > +                     vma_end_read(src_vma);
> > > +     }
> > > +
> > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > +     if (!dst_vma) {
> > > +             mmap_read_lock(mm);
> > > +             mmap_locked = true;
> > > +             src_vma = find_vma(mm, src_start);
> > > +             if (!src_vma)
> > > +                     goto out_unlock_mmap;
> > > +             dst_vma = find_vma(mm, dst_start);
> >
> > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > works.

Sure, I'll use vma_lookup() instead of find_vma().
> >
> > > +             if (!dst_vma)
> > > +                     goto out_unlock_mmap;
> > > +     }
> > > +
> > > +     /* Re-check after taking map_changing_lock */
> > > +     down_read(&ctx->map_changing_lock);
> > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > +             err = -EAGAIN;
> > > +             goto out_unlock;
> > > +     }
> > >       /*
> > >        * Make sure the vma is not shared, that the src and dst remap
> > >        * ranges are both valid and fully within a single existing
> > >        * vma.
> > >        */
> > > -     src_vma = find_vma(mm, src_start);
> > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > -             goto out;
> > > +     if (src_vma->vm_flags & VM_SHARED)
> > > +             goto out_unlock;
> > >       if (src_start < src_vma->vm_start ||
> > >           src_start + len > src_vma->vm_end)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > > -     dst_vma = find_vma(mm, dst_start);
> > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > -             goto out;
> > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > +             goto out_unlock;
> > >       if (dst_start < dst_vma->vm_start ||
> > >           dst_start + len > dst_vma->vm_end)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > >       if (err)
> > > -             goto out;
> > > +             goto out_unlock;
> > >
> > >       for (src_addr = src_start, dst_addr = dst_start;
> > >            src_addr < src_start + len;) {
> > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > >               moved += step_size;
> > >       }
> > >
> > > +out_unlock:
> > > +     up_read(&ctx->map_changing_lock);
> > > +out_unlock_mmap:
> > > +     if (mmap_locked)
> > > +             mmap_read_unlock(mm);
> > > +     else {
> > > +             vma_end_read(dst_vma);
> > > +             vma_end_read(src_vma);
> > > +     }
> > >  out:
> > >       VM_WARN_ON(moved < 0);
> > >       VM_WARN_ON(err > 0);
> > > --
> > > 2.43.0.429.g432eaa2c6b-goog
> > >
> > >
Liam R. Howlett Jan. 30, 2024, 2:58 a.m. UTC | #5
* Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >

...

> 
> Thanks for informing. So vma_lookup() returns the vma for any address
> within [vma->vm_start, vma->vm_end)?

No.  It returns the vma that contains the address passed.  If there
isn't one, you will get NULL.  This is why the range check is not
needed.

find_vma() walks to the address passed and if it is NULL, it returns a
vma that has a higher start address than the one passed (or, rarely NULL
if it runs off the edge).

> > > If you want to search upwards from dst_start for a VMA then you should
> > > move the range check below into this brace.
> > >
> > > > +     }
> > > > +
> > > >       /*
> > > >        * Make sure that the dst range is both valid and fully within a
> > > >        * single existing vma.
> > > >        */
> > > > -     struct vm_area_struct *dst_vma;
> > > > -
> > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > -             return NULL;
> > > > +             goto unpin;
> > > >
> > > >       /*
> > > >        * Check the vma is registered in uffd, this is required to
> > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > >        * time.
> > > >        */
> > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > -             return NULL;
> > > > +             goto unpin;
> > > >
> > > >       return dst_vma;
> > > > +
> > > > +unpin:
> > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > +     return NULL;
> > > >  }
> > > >
> > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > >  /*
> > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > + * before returning.
> > > >   */
> > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                                             struct userfaultfd_ctx *ctx,
> > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                                             unsigned long dst_start,
> > > >                                             unsigned long src_start,
> > > >                                             unsigned long len,
> > > > -                                           uffd_flags_t flags)
> > > > +                                           uffd_flags_t flags,
> > > > +                                           bool *mmap_locked)
> > > >  {
> > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >        */
> > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > >               up_read(&ctx->map_changing_lock);
> > > > -             mmap_read_unlock(dst_mm);
> > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >               return -EINVAL;
> > > >       }
> > > >
> > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >        */
> > > >       if (!dst_vma) {
> > > >               err = -ENOENT;
> > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > -                     goto out_unlock;
> > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > +                                            len, mmap_locked);
> > > > +             if (!dst_vma)
> > > > +                     goto out;
> > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > +                     goto out_unlock_vma;
> > > >
> > > >               err = -EINVAL;
> > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > +                     goto out_unlock_vma;
> > > > +
> > > > +             /*
> > > > +              * If memory mappings are changing because of non-cooperative
> > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > +              * request the user to retry later
> > > > +              */
> > > > +             down_read(&ctx->map_changing_lock);
> > > > +             err = -EAGAIN;
> > > > +             if (atomic_read(&ctx->mmap_changing))
> > > >                       goto out_unlock;
> > > >
> > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >
> > > >               if (unlikely(err == -ENOENT)) {
> > > >                       up_read(&ctx->map_changing_lock);
> > > > -                     mmap_read_unlock(dst_mm);
> > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >                       BUG_ON(!folio);
> > > >
> > > >                       err = copy_folio_from_user(folio,
> > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >                               err = -EFAULT;
> > > >                               goto out;
> > > >                       }
> > > > -                     mmap_read_lock(dst_mm);
> > > > -                     down_read(&ctx->map_changing_lock);
> > > > -                     /*
> > > > -                      * If memory mappings are changing because of non-cooperative
> > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > -                      * request the user to retry later
> > > > -                      */
> > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > -                             err = -EAGAIN;
> > > > -                             break;
> > > > -                     }
> > >
> > > ... Okay, this is where things get confusing.
> > >
> > > How about this: Don't do this locking/boolean dance.
> > >
> > > Instead, do something like this:
> > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > >
> > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > >         unsigned long addr))    /* or some better name.. */
> > > {
> > >         struct vm_area_struct *vma;
> > >
> > >         vma = lock_vma_under_rcu(mm, addr);
> > >
> > >         if (vma)
> > >                 return vma;
> > >
> > >         mmap_read_lock(mm);
> > >         vma = lookup_vma(mm, addr);
> > >         if (vma)
> > >                 vma_start_read(vma); /* Won't fail */
> >
> > Please don't assume vma_start_read() won't fail even when you have
> > mmap_read_lock(). See the comment in vma_start_read() about the
> > possibility of an overflow producing false negatives.
> >
> > >
> > >         mmap_read_unlock(mm);
> > >         return vma;
> > > }
> > >
> > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > won't go away - you have it locked.  The mmap lock is held for even
> > > less time for your worse case, and the code gets easier to follow.
> 
> Your suggestion is definitely simpler and easier to follow, but due to
> the overflow situation that Suren pointed out, I would still need to
> keep the locking/boolean dance, no? IIUC, even if I were to return
> EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> on the same vma will succeed due to the same overflow, until someone
> acquires and releases mmap_lock in write-mode.
> Also, sometimes it seems insufficient whether we managed to lock vma
> or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> anonymous vma) exists. If not then it bails out.
> So it seems to me that we have to provide some fall back in
> userfaultfd operations which executes with mmap_lock in read-mode.

Fair enough, what if we didn't use the sequence number and just locked
the vma directly?

/* This will wait on the vma lock, so once we return it's locked */
void vma_aquire_read_lock(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
	down_read(&vma->vm_lock->lock);
}

struct vm_area_struct *lock_vma(struct mm_struct *mm,
        unsigned long addr))    /* or some better name.. */
{
        struct vm_area_struct *vma;

        vma = lock_vma_under_rcu(mm, addr);
        if (vma)
                return vma;

        mmap_read_lock(mm);
	/* mm sequence cannot change, no mm writers anyways.
	 * find_mergeable_anon_vma is only a concern in the page fault
	 * path
	 * start/end won't change under the mmap_lock
	 * vma won't become detached as we have the mmap_lock in read
	 * We are now sure no writes will change the VMA
	 * So let's make sure no other context is isolating the vma
	 */
        vma = lookup_vma(mm, addr);
        if (vma)
                vma_aquire_read_lock(vma);

        mmap_read_unlock(mm);
        return vma;
}

I'm betting that avoiding the mmap_lock most of the time is good, but
then holding it just to lock the vma will have extremely rare collisions
- and they will be short lived.

This would allow us to simplify your code.

> > >
> > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > do this!
> > >
> > > Now the comment above such a function should state that the vma needs to
> > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> >
> > Locking VMA while holding mmap_read_lock is an interesting usage
> > pattern I haven't seen yet. I think this should work quite well!
> >
> > >
> > >
> > > >
> > > >                       dst_vma = NULL;
> > > >                       goto retry;
> > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > >
> > > >  out_unlock:
> > > >       up_read(&ctx->map_changing_lock);
> > > > -     mmap_read_unlock(dst_mm);
> > > > +out_unlock_vma:
> > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > >  out:
> > > >       if (folio)
> > > >               folio_put(folio);
> > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > >                                   unsigned long dst_start,
> > > >                                   unsigned long src_start,
> > > >                                   unsigned long len,
> > > > -                                 uffd_flags_t flags);
> > > > +                                 uffd_flags_t flags,
> > > > +                                 bool *mmap_locked);
> > >
> > > Just a thought, tabbing in twice for each argument would make this more
> > > compact.
> > >
> > >
> > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > >
> > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       unsigned long src_addr, dst_addr;
> > > >       long copied;
> > > >       struct folio *folio;
> > > > +     bool mmap_locked = false;
> > > >
> > > >       /*
> > > >        * Sanitize the command parameters:
> > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       copied = 0;
> > > >       folio = NULL;
> > > >  retry:
> > > > -     mmap_read_lock(dst_mm);
> > > > +     /*
> > > > +      * Make sure the vma is not shared, that the dst range is
> > > > +      * both valid and fully within a single existing vma.
> > > > +      */
> > > > +     err = -ENOENT;
> > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > +     if (!dst_vma)
> > > > +             goto out;
> > > >
> > > >       /*
> > > >        * If memory mappings are changing because of non-cooperative
> > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >       if (atomic_read(&ctx->mmap_changing))
> > > >               goto out_unlock;
> > > >
> > > > -     /*
> > > > -      * Make sure the vma is not shared, that the dst range is
> > > > -      * both valid and fully within a single existing vma.
> > > > -      */
> > > > -     err = -ENOENT;
> > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > -     if (!dst_vma)
> > > > -             goto out_unlock;
> > > > -
> > > >       err = -EINVAL;
> > > >       /*
> > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > >        */
> > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > -                                          src_start, len, flags);
> > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > +                                          len, flags, &mmap_locked);
> > > >
> > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > >               goto out_unlock;
> > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >                       void *kaddr;
> > > >
> > > >                       up_read(&ctx->map_changing_lock);
> > > > -                     mmap_read_unlock(dst_mm);
> > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > +
> > > >                       BUG_ON(!folio);
> > > >
> > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > >
> > > >  out_unlock:
> > > >       up_read(&ctx->map_changing_lock);
> > > > -     mmap_read_unlock(dst_mm);
> > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > >  out:
> > > >       if (folio)
> > > >               folio_put(folio);
> > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > >   * @len: length of the virtual memory range
> > > >   * @mode: flags from uffdio_move.mode
> > > >   *
> > > > - * Must be called with mmap_lock held for read.
> > > > - *
> > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > >   * copy. It only works on non shared anonymous pages because those can
> > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > >   * could be obtained. This is the only additional complexity added to
> > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > >   */
> > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > -                unsigned long dst_start, unsigned long src_start,
> > > > -                unsigned long len, __u64 mode)
> > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > >  {
> > > > +     struct mm_struct *mm = ctx->mm;
> > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > >       unsigned long src_addr, dst_addr;
> > > >       pmd_t *src_pmd, *dst_pmd;
> > > >       long err = -EINVAL;
> > > >       ssize_t moved = 0;
> > > > +     bool mmap_locked = false;
> > > >
> > > >       /* Sanitize the command parameters. */
> > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > >               goto out;
> > >
> > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > >
> I didn't fully understand you here.

Sorry, I'm confused on how your locking scheme avoids rmap from trying
to use the VMA with the atomic increment part.

> > > >
> > > > +     dst_vma = NULL;
> > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > +     if (src_vma) {
> > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > +             if (!dst_vma)
> > > > +                     vma_end_read(src_vma);
> > > > +     }
> > > > +
> > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > +     if (!dst_vma) {
> > > > +             mmap_read_lock(mm);
> > > > +             mmap_locked = true;
> > > > +             src_vma = find_vma(mm, src_start);
> > > > +             if (!src_vma)
> > > > +                     goto out_unlock_mmap;
> > > > +             dst_vma = find_vma(mm, dst_start);
> > >
> > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > works.
> 
> Sure, I'll use vma_lookup() instead of find_vma().

Be sure it fits with what you are doing, I'm not entire sure it's right
to switch.  If it is not right then I don't think you can use
lock_vma_under_rcu() - but we can work around that too.

> > >
> > > > +             if (!dst_vma)
> > > > +                     goto out_unlock_mmap;
> > > > +     }
> > > > +
> > > > +     /* Re-check after taking map_changing_lock */
> > > > +     down_read(&ctx->map_changing_lock);
> > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > +             err = -EAGAIN;
> > > > +             goto out_unlock;
> > > > +     }
> > > >       /*
> > > >        * Make sure the vma is not shared, that the src and dst remap
> > > >        * ranges are both valid and fully within a single existing
> > > >        * vma.
> > > >        */
> > > > -     src_vma = find_vma(mm, src_start);
> > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > -             goto out;
> > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > +             goto out_unlock;
> > > >       if (src_start < src_vma->vm_start ||
> > > >           src_start + len > src_vma->vm_end)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > > -     dst_vma = find_vma(mm, dst_start);
> > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > -             goto out;
> > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > +             goto out_unlock;
> > > >       if (dst_start < dst_vma->vm_start ||
> > > >           dst_start + len > dst_vma->vm_end)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > >       if (err)
> > > > -             goto out;
> > > > +             goto out_unlock;
> > > >
> > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > >            src_addr < src_start + len;) {
> > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > >               moved += step_size;
> > > >       }
> > > >
> > > > +out_unlock:
> > > > +     up_read(&ctx->map_changing_lock);
> > > > +out_unlock_mmap:
> > > > +     if (mmap_locked)
> > > > +             mmap_read_unlock(mm);
> > > > +     else {
> > > > +             vma_end_read(dst_vma);
> > > > +             vma_end_read(src_vma);
> > > > +     }
> > > >  out:
> > > >       VM_WARN_ON(moved < 0);
> > > >       VM_WARN_ON(err > 0);
> > > > --
> > > > 2.43.0.429.g432eaa2c6b-goog
> > > >
> > > >
Lokesh Gidra Jan. 31, 2024, 2:49 a.m. UTC | #6
On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
>
> ...
>
> >
> > Thanks for informing. So vma_lookup() returns the vma for any address
> > within [vma->vm_start, vma->vm_end)?
>
> No.  It returns the vma that contains the address passed.  If there
> isn't one, you will get NULL.  This is why the range check is not
> needed.

This is what we need. IIUC, with vma_lookup() and lock_vma_under_rcu()
the only validation required is

if (vma && vma->vm_end >= dst_start + len)

Thanks for clarifying.
>
> find_vma() walks to the address passed and if it is NULL, it returns a
> vma that has a higher start address than the one passed (or, rarely NULL
> if it runs off the edge).
>
> > > > If you want to search upwards from dst_start for a VMA then you should
> > > > move the range check below into this brace.
> > > >
> > > > > +     }
> > > > > +
> > > > >       /*
> > > > >        * Make sure that the dst range is both valid and fully within a
> > > > >        * single existing vma.
> > > > >        */
> > > > > -     struct vm_area_struct *dst_vma;
> > > > > -
> > > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       /*
> > > > >        * Check the vma is registered in uffd, this is required to
> > > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > >        * time.
> > > > >        */
> > > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       return dst_vma;
> > > > > +
> > > > > +unpin:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > > +     return NULL;
> > > > >  }
> > > > >
> > > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > > >  /*
> > > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > > + * before returning.
> > > > >   */
> > > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             struct userfaultfd_ctx *ctx,
> > > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             unsigned long dst_start,
> > > > >                                             unsigned long src_start,
> > > > >                                             unsigned long len,
> > > > > -                                           uffd_flags_t flags)
> > > > > +                                           uffd_flags_t flags,
> > > > > +                                           bool *mmap_locked)
> > > > >  {
> > > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > > >               up_read(&ctx->map_changing_lock);
> > > > > -             mmap_read_unlock(dst_mm);
> > > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >               return -EINVAL;
> > > > >       }
> > > > >
> > > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (!dst_vma) {
> > > > >               err = -ENOENT;
> > > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > > -                     goto out_unlock;
> > > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > > +                                            len, mmap_locked);
> > > > > +             if (!dst_vma)
> > > > > +                     goto out;
> > > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > >
> > > > >               err = -EINVAL;
> > > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > > +
> > > > > +             /*
> > > > > +              * If memory mappings are changing because of non-cooperative
> > > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > > +              * request the user to retry later
> > > > > +              */
> > > > > +             down_read(&ctx->map_changing_lock);
> > > > > +             err = -EAGAIN;
> > > > > +             if (atomic_read(&ctx->mmap_changing))
> > > > >                       goto out_unlock;
> > > > >
> > > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >               if (unlikely(err == -ENOENT)) {
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       err = copy_folio_from_user(folio,
> > > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                               err = -EFAULT;
> > > > >                               goto out;
> > > > >                       }
> > > > > -                     mmap_read_lock(dst_mm);
> > > > > -                     down_read(&ctx->map_changing_lock);
> > > > > -                     /*
> > > > > -                      * If memory mappings are changing because of non-cooperative
> > > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > > -                      * request the user to retry later
> > > > > -                      */
> > > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > > -                             err = -EAGAIN;
> > > > > -                             break;
> > > > > -                     }
> > > >
> > > > ... Okay, this is where things get confusing.
> > > >
> > > > How about this: Don't do this locking/boolean dance.
> > > >
> > > > Instead, do something like this:
> > > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > >                 vma_start_read(vma); /* Won't fail */
> > >
> > > Please don't assume vma_start_read() won't fail even when you have
> > > mmap_read_lock(). See the comment in vma_start_read() about the
> > > possibility of an overflow producing false negatives.
> > >
> > > >
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> > > >
> > > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > > won't go away - you have it locked.  The mmap lock is held for even
> > > > less time for your worse case, and the code gets easier to follow.
> >
> > Your suggestion is definitely simpler and easier to follow, but due to
> > the overflow situation that Suren pointed out, I would still need to
> > keep the locking/boolean dance, no? IIUC, even if I were to return
> > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > on the same vma will succeed due to the same overflow, until someone
> > acquires and releases mmap_lock in write-mode.
> > Also, sometimes it seems insufficient whether we managed to lock vma
> > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > anonymous vma) exists. If not then it bails out.
> > So it seems to me that we have to provide some fall back in
> > userfaultfd operations which executes with mmap_lock in read-mode.
>
> Fair enough, what if we didn't use the sequence number and just locked
> the vma directly?

Looks good to me, unless someone else has any objections.
>
> /* This will wait on the vma lock, so once we return it's locked */
> void vma_aquire_read_lock(struct vm_area_struct *vma)
> {
>         mmap_assert_locked(vma->vm_mm);
>         down_read(&vma->vm_lock->lock);
> }
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         /* mm sequence cannot change, no mm writers anyways.
>          * find_mergeable_anon_vma is only a concern in the page fault
>          * path
>          * start/end won't change under the mmap_lock
>          * vma won't become detached as we have the mmap_lock in read
>          * We are now sure no writes will change the VMA
>          * So let's make sure no other context is isolating the vma
>          */
>         vma = lookup_vma(mm, addr);
>         if (vma)
We can take care of anon_vma as well here right? I can take a bool
parameter ('prepare_anon' or something) and then:

           if (vma) {
                    if (prepare_anon && vma_is_anonymous(vma)) &&
!anon_vma_prepare(vma)) {
                                      vma = ERR_PTR(-ENOMEM);
                                      goto out_unlock;
                   }
>                 vma_aquire_read_lock(vma);
           }
out_unlock:
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> I'm betting that avoiding the mmap_lock most of the time is good, but
> then holding it just to lock the vma will have extremely rare collisions
> - and they will be short lived.
>
> This would allow us to simplify your code.

Agreed! Thanks for the suggestion.
>
> > > >
> > > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > > do this!
> > > >
> > > > Now the comment above such a function should state that the vma needs to
> > > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> > >
> > > Locking VMA while holding mmap_read_lock is an interesting usage
> > > pattern I haven't seen yet. I think this should work quite well!
> > >
> > > >
> > > >
> > > > >
> > > > >                       dst_vma = NULL;
> > > > >                       goto retry;
> > > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +out_unlock_vma:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > > >                                   unsigned long dst_start,
> > > > >                                   unsigned long src_start,
> > > > >                                   unsigned long len,
> > > > > -                                 uffd_flags_t flags);
> > > > > +                                 uffd_flags_t flags,
> > > > > +                                 bool *mmap_locked);
> > > >
> > > > Just a thought, tabbing in twice for each argument would make this more
> > > > compact.
> > > >
> > > >
> > > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > > >
> > > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       long copied;
> > > > >       struct folio *folio;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /*
> > > > >        * Sanitize the command parameters:
> > > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       copied = 0;
> > > > >       folio = NULL;
> > > > >  retry:
> > > > > -     mmap_read_lock(dst_mm);
> > > > > +     /*
> > > > > +      * Make sure the vma is not shared, that the dst range is
> > > > > +      * both valid and fully within a single existing vma.
> > > > > +      */
> > > > > +     err = -ENOENT;
> > > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > > +     if (!dst_vma)
> > > > > +             goto out;
> > > > >
> > > > >       /*
> > > > >        * If memory mappings are changing because of non-cooperative
> > > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       if (atomic_read(&ctx->mmap_changing))
> > > > >               goto out_unlock;
> > > > >
> > > > > -     /*
> > > > > -      * Make sure the vma is not shared, that the dst range is
> > > > > -      * both valid and fully within a single existing vma.
> > > > > -      */
> > > > > -     err = -ENOENT;
> > > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -     if (!dst_vma)
> > > > > -             goto out_unlock;
> > > > > -
> > > > >       err = -EINVAL;
> > > > >       /*
> > > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > > >        */
> > > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > > -                                          src_start, len, flags);
> > > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > > +                                          len, flags, &mmap_locked);
> > > > >
> > > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > > >               goto out_unlock;
> > > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >                       void *kaddr;
> > > > >
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > +
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * @len: length of the virtual memory range
> > > > >   * @mode: flags from uffdio_move.mode
> > > > >   *
> > > > > - * Must be called with mmap_lock held for read.
> > > > > - *
> > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * could be obtained. This is the only additional complexity added to
> > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > >   */
> > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > -                unsigned long len, __u64 mode)
> > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > >  {
> > > > > +     struct mm_struct *mm = ctx->mm;
> > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > >       long err = -EINVAL;
> > > > >       ssize_t moved = 0;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /* Sanitize the command parameters. */
> > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > >               goto out;
> > > >
> > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > >
> > I didn't fully understand you here.
>
> Sorry, I'm confused on how your locking scheme avoids rmap from trying
> to use the VMA with the atomic increment part.
>
> > > > >
> > > > > +     dst_vma = NULL;
> > > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > > +     if (src_vma) {
> > > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > > +             if (!dst_vma)
> > > > > +                     vma_end_read(src_vma);
> > > > > +     }
> > > > > +
> > > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > > +     if (!dst_vma) {
> > > > > +             mmap_read_lock(mm);
> > > > > +             mmap_locked = true;
> > > > > +             src_vma = find_vma(mm, src_start);
> > > > > +             if (!src_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +             dst_vma = find_vma(mm, dst_start);
> > > >
> > > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > > works.
> >
> > Sure, I'll use vma_lookup() instead of find_vma().
>
> Be sure it fits with what you are doing, I'm not entire sure it's right
> to switch.  If it is not right then I don't think you can use
> lock_vma_under_rcu() - but we can work around that too.
>
> > > >
> > > > > +             if (!dst_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +     }
> > > > > +
> > > > > +     /* Re-check after taking map_changing_lock */
> > > > > +     down_read(&ctx->map_changing_lock);
> > > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > > +             err = -EAGAIN;
> > > > > +             goto out_unlock;
> > > > > +     }
> > > > >       /*
> > > > >        * Make sure the vma is not shared, that the src and dst remap
> > > > >        * ranges are both valid and fully within a single existing
> > > > >        * vma.
> > > > >        */
> > > > > -     src_vma = find_vma(mm, src_start);
> > > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (src_start < src_vma->vm_start ||
> > > > >           src_start + len > src_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > > -     dst_vma = find_vma(mm, dst_start);
> > > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (dst_start < dst_vma->vm_start ||
> > > > >           dst_start + len > dst_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > > >       if (err)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > > >            src_addr < src_start + len;) {
> > > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >               moved += step_size;
> > > > >       }
> > > > >
> > > > > +out_unlock:
> > > > > +     up_read(&ctx->map_changing_lock);
> > > > > +out_unlock_mmap:
> > > > > +     if (mmap_locked)
> > > > > +             mmap_read_unlock(mm);
> > > > > +     else {
> > > > > +             vma_end_read(dst_vma);
> > > > > +             vma_end_read(src_vma);
> > > > > +     }
> > > > >  out:
> > > > >       VM_WARN_ON(moved < 0);
> > > > >       VM_WARN_ON(err > 0);
> > > > > --
> > > > > 2.43.0.429.g432eaa2c6b-goog
> > > > >
> > > > >
Suren Baghdasaryan Jan. 31, 2024, 3:03 a.m. UTC | #7
On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
>
> ...
>
> >
> > Thanks for informing. So vma_lookup() returns the vma for any address
> > within [vma->vm_start, vma->vm_end)?
>
> No.  It returns the vma that contains the address passed.  If there
> isn't one, you will get NULL.  This is why the range check is not
> needed.
>
> find_vma() walks to the address passed and if it is NULL, it returns a
> vma that has a higher start address than the one passed (or, rarely NULL
> if it runs off the edge).
>
> > > > If you want to search upwards from dst_start for a VMA then you should
> > > > move the range check below into this brace.
> > > >
> > > > > +     }
> > > > > +
> > > > >       /*
> > > > >        * Make sure that the dst range is both valid and fully within a
> > > > >        * single existing vma.
> > > > >        */
> > > > > -     struct vm_area_struct *dst_vma;
> > > > > -
> > > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > > >       if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       /*
> > > > >        * Check the vma is registered in uffd, this is required to
> > > > > @@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > >        * time.
> > > > >        */
> > > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > > -             return NULL;
> > > > > +             goto unpin;
> > > > >
> > > > >       return dst_vma;
> > > > > +
> > > > > +unpin:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > > +     return NULL;
> > > > >  }
> > > > >
> > > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > > @@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > > >  /*
> > > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > > + * before returning.
> > > > >   */
> > > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             struct userfaultfd_ctx *ctx,
> > > > > @@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                                             unsigned long dst_start,
> > > > >                                             unsigned long src_start,
> > > > >                                             unsigned long len,
> > > > > -                                           uffd_flags_t flags)
> > > > > +                                           uffd_flags_t flags,
> > > > > +                                           bool *mmap_locked)
> > > > >  {
> > > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > > >       int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > > >               up_read(&ctx->map_changing_lock);
> > > > > -             mmap_read_unlock(dst_mm);
> > > > > +             unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >               return -EINVAL;
> > > > >       }
> > > > >
> > > > > @@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >        */
> > > > >       if (!dst_vma) {
> > > > >               err = -ENOENT;
> > > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > > -                     goto out_unlock;
> > > > > +             dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
> > > > > +                                            len, mmap_locked);
> > > > > +             if (!dst_vma)
> > > > > +                     goto out;
> > > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > >
> > > > >               err = -EINVAL;
> > > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > > +                     goto out_unlock_vma;
> > > > > +
> > > > > +             /*
> > > > > +              * If memory mappings are changing because of non-cooperative
> > > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > > +              * request the user to retry later
> > > > > +              */
> > > > > +             down_read(&ctx->map_changing_lock);
> > > > > +             err = -EAGAIN;
> > > > > +             if (atomic_read(&ctx->mmap_changing))
> > > > >                       goto out_unlock;
> > > > >
> > > > >               vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > @@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >               if (unlikely(err == -ENOENT)) {
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       err = copy_folio_from_user(folio,
> > > > > @@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >                               err = -EFAULT;
> > > > >                               goto out;
> > > > >                       }
> > > > > -                     mmap_read_lock(dst_mm);
> > > > > -                     down_read(&ctx->map_changing_lock);
> > > > > -                     /*
> > > > > -                      * If memory mappings are changing because of non-cooperative
> > > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > > -                      * request the user to retry later
> > > > > -                      */
> > > > > -                     if (atomic_read(ctx->mmap_changing)) {
> > > > > -                             err = -EAGAIN;
> > > > > -                             break;
> > > > > -                     }
> > > >
> > > > ... Okay, this is where things get confusing.
> > > >
> > > > How about this: Don't do this locking/boolean dance.
> > > >
> > > > Instead, do something like this:
> > > > In mm/memory.c, below lock_vma_under_rcu(), but something like this
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > >                 vma_start_read(vma); /* Won't fail */
> > >
> > > Please don't assume vma_start_read() won't fail even when you have
> > > mmap_read_lock(). See the comment in vma_start_read() about the
> > > possibility of an overflow producing false negatives.
> > >
> > > >
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> > > >
> > > > Now, we know we have a vma that's vma locked if there is a vma.  The vma
> > > > won't go away - you have it locked.  The mmap lock is held for even
> > > > less time for your worse case, and the code gets easier to follow.
> >
> > Your suggestion is definitely simpler and easier to follow, but due to
> > the overflow situation that Suren pointed out, I would still need to
> > keep the locking/boolean dance, no? IIUC, even if I were to return
> > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > on the same vma will succeed due to the same overflow, until someone
> > acquires and releases mmap_lock in write-mode.
> > Also, sometimes it seems insufficient whether we managed to lock vma
> > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > anonymous vma) exists. If not then it bails out.
> > So it seems to me that we have to provide some fall back in
> > userfaultfd operations which executes with mmap_lock in read-mode.
>
> Fair enough, what if we didn't use the sequence number and just locked
> the vma directly?
>
> /* This will wait on the vma lock, so once we return it's locked */
> void vma_aquire_read_lock(struct vm_area_struct *vma)
> {
>         mmap_assert_locked(vma->vm_mm);
>         down_read(&vma->vm_lock->lock);
> }
>
> struct vm_area_struct *lock_vma(struct mm_struct *mm,
>         unsigned long addr))    /* or some better name.. */
> {
>         struct vm_area_struct *vma;
>
>         vma = lock_vma_under_rcu(mm, addr);
>         if (vma)
>                 return vma;
>
>         mmap_read_lock(mm);
>         /* mm sequence cannot change, no mm writers anyways.
>          * find_mergeable_anon_vma is only a concern in the page fault
>          * path
>          * start/end won't change under the mmap_lock
>          * vma won't become detached as we have the mmap_lock in read
>          * We are now sure no writes will change the VMA
>          * So let's make sure no other context is isolating the vma
>          */
>         vma = lookup_vma(mm, addr);
>         if (vma)
>                 vma_aquire_read_lock(vma);
>
>         mmap_read_unlock(mm);
>         return vma;
> }
>
> I'm betting that avoiding the mmap_lock most of the time is good, but
> then holding it just to lock the vma will have extremely rare collisions
> - and they will be short lived.
>
> This would allow us to simplify your code.
>
> > > >
> > > > Once you are done with the vma do a vma_end_read(vma).  Don't forget to
> > > > do this!
> > > >
> > > > Now the comment above such a function should state that the vma needs to
> > > > be vma_end_read(vma), or that could go undetected..  It might be worth
> > > > adding a unlock_vma() counterpart to vma_end_read(vma) even.
> > >
> > > Locking VMA while holding mmap_read_lock is an interesting usage
> > > pattern I haven't seen yet. I think this should work quite well!
> > >
> > > >
> > > >
> > > > >
> > > > >                       dst_vma = NULL;
> > > > >                       goto retry;
> > > > > @@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +out_unlock_vma:
> > > > > +     unpin_vma(dst_mm, dst_vma, mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> > > > >                                   unsigned long dst_start,
> > > > >                                   unsigned long src_start,
> > > > >                                   unsigned long len,
> > > > > -                                 uffd_flags_t flags);
> > > > > +                                 uffd_flags_t flags,
> > > > > +                                 bool *mmap_locked);
> > > >
> > > > Just a thought, tabbing in twice for each argument would make this more
> > > > compact.
> > > >
> > > >
> > > > >  #endif /* CONFIG_HUGETLB_PAGE */
> > > > >
> > > > >  static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > > > > @@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       long copied;
> > > > >       struct folio *folio;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /*
> > > > >        * Sanitize the command parameters:
> > > > > @@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       copied = 0;
> > > > >       folio = NULL;
> > > > >  retry:
> > > > > -     mmap_read_lock(dst_mm);
> > > > > +     /*
> > > > > +      * Make sure the vma is not shared, that the dst range is
> > > > > +      * both valid and fully within a single existing vma.
> > > > > +      */
> > > > > +     err = -ENOENT;
> > > > > +     dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
> > > > > +     if (!dst_vma)
> > > > > +             goto out;
> > > > >
> > > > >       /*
> > > > >        * If memory mappings are changing because of non-cooperative
> > > > > @@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >       if (atomic_read(&ctx->mmap_changing))
> > > > >               goto out_unlock;
> > > > >
> > > > > -     /*
> > > > > -      * Make sure the vma is not shared, that the dst range is
> > > > > -      * both valid and fully within a single existing vma.
> > > > > -      */
> > > > > -     err = -ENOENT;
> > > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > -     if (!dst_vma)
> > > > > -             goto out_unlock;
> > > > > -
> > > > >       err = -EINVAL;
> > > > >       /*
> > > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > > @@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >        * If this is a HUGETLB vma, pass off to appropriate routine
> > > > >        */
> > > > >       if (is_vm_hugetlb_page(dst_vma))
> > > > > -             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > > > > -                                          src_start, len, flags);
> > > > > +             return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
> > > > > +                                          len, flags, &mmap_locked);
> > > > >
> > > > >       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> > > > >               goto out_unlock;
> > > > > @@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >                       void *kaddr;
> > > > >
> > > > >                       up_read(&ctx->map_changing_lock);
> > > > > -                     mmap_read_unlock(dst_mm);
> > > > > +                     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > +
> > > > >                       BUG_ON(!folio);
> > > > >
> > > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > >
> > > > >  out_unlock:
> > > > >       up_read(&ctx->map_changing_lock);
> > > > > -     mmap_read_unlock(dst_mm);
> > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > >  out:
> > > > >       if (folio)
> > > > >               folio_put(folio);
> > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * @len: length of the virtual memory range
> > > > >   * @mode: flags from uffdio_move.mode
> > > > >   *
> > > > > - * Must be called with mmap_lock held for read.
> > > > > - *
> > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > >   * could be obtained. This is the only additional complexity added to
> > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > >   */
> > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > -                unsigned long len, __u64 mode)
> > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > >  {
> > > > > +     struct mm_struct *mm = ctx->mm;
> > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > >       unsigned long src_addr, dst_addr;
> > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > >       long err = -EINVAL;
> > > > >       ssize_t moved = 0;
> > > > > +     bool mmap_locked = false;
> > > > >
> > > > >       /* Sanitize the command parameters. */
> > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > >               goto out;
> > > >
> > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > >
> > I didn't fully understand you here.
>
> Sorry, I'm confused on how your locking scheme avoids rmap from trying
> to use the VMA with the atomic increment part.

I'm also a bit confused. Which atomic increment are you referring to?
AFAIU move_pages() will lock both src_vma and dst_vma, so even if rmap
finds them it can't modify them, no?

>
> > > > >
> > > > > +     dst_vma = NULL;
> > > > > +     src_vma = lock_vma_under_rcu(mm, src_start);
> > > > > +     if (src_vma) {
> > > > > +             dst_vma = lock_vma_under_rcu(mm, dst_start);
> > > > > +             if (!dst_vma)
> > > > > +                     vma_end_read(src_vma);
> > > > > +     }
> > > > > +
> > > > > +     /* If we failed to lock both VMAs, fall back to mmap_lock */
> > > > > +     if (!dst_vma) {
> > > > > +             mmap_read_lock(mm);
> > > > > +             mmap_locked = true;
> > > > > +             src_vma = find_vma(mm, src_start);
> > > > > +             if (!src_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +             dst_vma = find_vma(mm, dst_start);
> > > >
> > > > Again, there is a difference in how find_vma and lock_vam_under_rcu
> > > > works.
> >
> > Sure, I'll use vma_lookup() instead of find_vma().
>
> Be sure it fits with what you are doing, I'm not entire sure it's right
> to switch.  If it is not right then I don't think you can use
> lock_vma_under_rcu() - but we can work around that too.
>
> > > >
> > > > > +             if (!dst_vma)
> > > > > +                     goto out_unlock_mmap;
> > > > > +     }
> > > > > +
> > > > > +     /* Re-check after taking map_changing_lock */
> > > > > +     down_read(&ctx->map_changing_lock);
> > > > > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > > > > +             err = -EAGAIN;
> > > > > +             goto out_unlock;
> > > > > +     }
> > > > >       /*
> > > > >        * Make sure the vma is not shared, that the src and dst remap
> > > > >        * ranges are both valid and fully within a single existing
> > > > >        * vma.
> > > > >        */
> > > > > -     src_vma = find_vma(mm, src_start);
> > > > > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (src_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (src_start < src_vma->vm_start ||
> > > > >           src_start + len > src_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > > -     dst_vma = find_vma(mm, dst_start);
> > > > > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > > > > -             goto out;
> > > > > +     if (dst_vma->vm_flags & VM_SHARED)
> > > > > +             goto out_unlock;
> > > > >       if (dst_start < dst_vma->vm_start ||
> > > > >           dst_start + len > dst_vma->vm_end)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       err = validate_move_areas(ctx, src_vma, dst_vma);
> > > > >       if (err)
> > > > > -             goto out;
> > > > > +             goto out_unlock;
> > > > >
> > > > >       for (src_addr = src_start, dst_addr = dst_start;
> > > > >            src_addr < src_start + len;) {
> > > > > @@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > >               moved += step_size;
> > > > >       }
> > > > >
> > > > > +out_unlock:
> > > > > +     up_read(&ctx->map_changing_lock);
> > > > > +out_unlock_mmap:
> > > > > +     if (mmap_locked)
> > > > > +             mmap_read_unlock(mm);
> > > > > +     else {
> > > > > +             vma_end_read(dst_vma);
> > > > > +             vma_end_read(src_vma);
> > > > > +     }
> > > > >  out:
> > > > >       VM_WARN_ON(moved < 0);
> > > > >       VM_WARN_ON(err > 0);
> > > > > --
> > > > > 2.43.0.429.g432eaa2c6b-goog
> > > > >
> > > > >
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Liam R. Howlett Jan. 31, 2024, 9:41 p.m. UTC | #8
* Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> >

...

> >
> > > Your suggestion is definitely simpler and easier to follow, but due to
> > > the overflow situation that Suren pointed out, I would still need to
> > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > on the same vma will succeed due to the same overflow, until someone
> > > acquires and releases mmap_lock in write-mode.
> > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > anonymous vma) exists. If not then it bails out.
> > > So it seems to me that we have to provide some fall back in
> > > userfaultfd operations which executes with mmap_lock in read-mode.
> >
> > Fair enough, what if we didn't use the sequence number and just locked
> > the vma directly?
> 
> Looks good to me, unless someone else has any objections.
> >
> > /* This will wait on the vma lock, so once we return it's locked */
> > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > {
> >         mmap_assert_locked(vma->vm_mm);
> >         down_read(&vma->vm_lock->lock);
> > }
> >
> > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> >         unsigned long addr))    /* or some better name.. */
> > {
> >         struct vm_area_struct *vma;
> >
> >         vma = lock_vma_under_rcu(mm, addr);
> >         if (vma)
> >                 return vma;
> >
> >         mmap_read_lock(mm);
> >         /* mm sequence cannot change, no mm writers anyways.
> >          * find_mergeable_anon_vma is only a concern in the page fault
> >          * path
> >          * start/end won't change under the mmap_lock
> >          * vma won't become detached as we have the mmap_lock in read
> >          * We are now sure no writes will change the VMA
> >          * So let's make sure no other context is isolating the vma
> >          */
> >         vma = lookup_vma(mm, addr);
> >         if (vma)
> We can take care of anon_vma as well here right? I can take a bool
> parameter ('prepare_anon' or something) and then:
> 
>            if (vma) {
>                     if (prepare_anon && vma_is_anonymous(vma)) &&
> !anon_vma_prepare(vma)) {
>                                       vma = ERR_PTR(-ENOMEM);
>                                       goto out_unlock;
>                    }
> >                 vma_aquire_read_lock(vma);
>            }
> out_unlock:
> >         mmap_read_unlock(mm);
> >         return vma;
> > }

Do you need this?  I didn't think this was happening in the code as
written?  If you need it I would suggest making it happen always and
ditch the flag until a user needs this variant, but document what's
going on in here or even have a better name.

Thanks,
Liam
Liam R. Howlett Jan. 31, 2024, 9:43 p.m. UTC | #9
* Suren Baghdasaryan <surenb@google.com> [240130 22:03]:
> On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:

...

> > > > > > @@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >
> > > > > >  out_unlock:
> > > > > >       up_read(&ctx->map_changing_lock);
> > > > > > -     mmap_read_unlock(dst_mm);
> > > > > > +     unpin_vma(dst_mm, dst_vma, &mmap_locked);
> > > > > >  out:
> > > > > >       if (folio)
> > > > > >               folio_put(folio);
> > > > > > @@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * @len: length of the virtual memory range
> > > > > >   * @mode: flags from uffdio_move.mode
> > > > > >   *
> > > > > > - * Must be called with mmap_lock held for read.
> > > > > > - *
> > > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > > @@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * could be obtained. This is the only additional complexity added to
> > > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > > >   */
> > > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > > -                unsigned long len, __u64 mode)
> > > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > > >  {
> > > > > > +     struct mm_struct *mm = ctx->mm;
> > > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > > >       unsigned long src_addr, dst_addr;
> > > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > > >       long err = -EINVAL;
> > > > > >       ssize_t moved = 0;
> > > > > > +     bool mmap_locked = false;
> > > > > >
> > > > > >       /* Sanitize the command parameters. */
> > > > > >       if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
> > > > > > @@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > > >               goto out;
> > > > >
> > > > > Ah, is this safe for rmap?  I think you need to leave this read lock.
> > > > >
> > > I didn't fully understand you here.
> >
> > Sorry, I'm confused on how your locking scheme avoids rmap from trying
> > to use the VMA with the atomic increment part.
> 
> I'm also a bit confused. Which atomic increment are you referring to?
> AFAIU move_pages() will lock both src_vma and dst_vma, so even if rmap
> finds them it can't modify them, no?

The uffd atomic, mmap_changing.

...
Suren Baghdasaryan Feb. 5, 2024, 9:46 p.m. UTC | #10
On Wed, Jan 31, 2024 at 1:41 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> > On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > >
>
> ...
>
> > >
> > > > Your suggestion is definitely simpler and easier to follow, but due to
> > > > the overflow situation that Suren pointed out, I would still need to
> > > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > > on the same vma will succeed due to the same overflow, until someone
> > > > acquires and releases mmap_lock in write-mode.
> > > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > > anonymous vma) exists. If not then it bails out.
> > > > So it seems to me that we have to provide some fall back in
> > > > userfaultfd operations which executes with mmap_lock in read-mode.
> > >
> > > Fair enough, what if we didn't use the sequence number and just locked
> > > the vma directly?
> >
> > Looks good to me, unless someone else has any objections.
> > >
> > > /* This will wait on the vma lock, so once we return it's locked */
> > > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > > {
> > >         mmap_assert_locked(vma->vm_mm);
> > >         down_read(&vma->vm_lock->lock);
> > > }
> > >
> > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > >         unsigned long addr))    /* or some better name.. */
> > > {
> > >         struct vm_area_struct *vma;
> > >
> > >         vma = lock_vma_under_rcu(mm, addr);
> > >         if (vma)
> > >                 return vma;
> > >
> > >         mmap_read_lock(mm);
> > >         /* mm sequence cannot change, no mm writers anyways.
> > >          * find_mergeable_anon_vma is only a concern in the page fault
> > >          * path
> > >          * start/end won't change under the mmap_lock
> > >          * vma won't become detached as we have the mmap_lock in read
> > >          * We are now sure no writes will change the VMA
> > >          * So let's make sure no other context is isolating the vma
> > >          */
> > >         vma = lookup_vma(mm, addr);
> > >         if (vma)
> > We can take care of anon_vma as well here right? I can take a bool
> > parameter ('prepare_anon' or something) and then:
> >
> >            if (vma) {
> >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > !anon_vma_prepare(vma)) {
> >                                       vma = ERR_PTR(-ENOMEM);
> >                                       goto out_unlock;
> >                    }
> > >                 vma_aquire_read_lock(vma);
> >            }
> > out_unlock:
> > >         mmap_read_unlock(mm);
> > >         return vma;
> > > }
>
> Do you need this?  I didn't think this was happening in the code as
> written?  If you need it I would suggest making it happen always and
> ditch the flag until a user needs this variant, but document what's
> going on in here or even have a better name.

I think yes, you do need this. I can see calls to anon_vma_prepare()
under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
in mfill_atomic(). This means, just like in the pagefault path, we
modify vma->anon_vma under mmap_read_lock protection which guarantees
that adjacent VMAs won't change. This is important because
__anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
neighboring VMAs to be stable. Per-VMA lock guarantees stability of
the VMA we locked but not of its neighbors, therefore holding per-VMA
lock while calling anon_vma_prepare() is not enough. The solution
Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
therefore would avoid the issue.


>
> Thanks,
> Liam
Lokesh Gidra Feb. 5, 2024, 9:54 p.m. UTC | #11
On Mon, Feb 5, 2024 at 1:47 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Jan 31, 2024 at 1:41 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240130 21:49]:
> > > On Mon, Jan 29, 2024 at 6:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240129 19:28]:
> > > > > On Mon, Jan 29, 2024 at 12:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > >
> >
> > ...
> >
> > > >
> > > > > Your suggestion is definitely simpler and easier to follow, but due to
> > > > > the overflow situation that Suren pointed out, I would still need to
> > > > > keep the locking/boolean dance, no? IIUC, even if I were to return
> > > > > EAGAIN to the userspace, there is no guarantee that subsequent ioctls
> > > > > on the same vma will succeed due to the same overflow, until someone
> > > > > acquires and releases mmap_lock in write-mode.
> > > > > Also, sometimes it seems insufficient whether we managed to lock vma
> > > > > or not. For instance, lock_vma_under_rcu() checks if anon_vma (for
> > > > > anonymous vma) exists. If not then it bails out.
> > > > > So it seems to me that we have to provide some fall back in
> > > > > userfaultfd operations which executes with mmap_lock in read-mode.
> > > >
> > > > Fair enough, what if we didn't use the sequence number and just locked
> > > > the vma directly?
> > >
> > > Looks good to me, unless someone else has any objections.
> > > >
> > > > /* This will wait on the vma lock, so once we return it's locked */
> > > > void vma_aquire_read_lock(struct vm_area_struct *vma)
> > > > {
> > > >         mmap_assert_locked(vma->vm_mm);
> > > >         down_read(&vma->vm_lock->lock);
> > > > }
> > > >
> > > > struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > >         unsigned long addr))    /* or some better name.. */
> > > > {
> > > >         struct vm_area_struct *vma;
> > > >
> > > >         vma = lock_vma_under_rcu(mm, addr);
> > > >         if (vma)
> > > >                 return vma;
> > > >
> > > >         mmap_read_lock(mm);
> > > >         /* mm sequence cannot change, no mm writers anyways.
> > > >          * find_mergeable_anon_vma is only a concern in the page fault
> > > >          * path
> > > >          * start/end won't change under the mmap_lock
> > > >          * vma won't become detached as we have the mmap_lock in read
> > > >          * We are now sure no writes will change the VMA
> > > >          * So let's make sure no other context is isolating the vma
> > > >          */
> > > >         vma = lookup_vma(mm, addr);
> > > >         if (vma)
> > > We can take care of anon_vma as well here right? I can take a bool
> > > parameter ('prepare_anon' or something) and then:
> > >
> > >            if (vma) {
> > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > !anon_vma_prepare(vma)) {
> > >                                       vma = ERR_PTR(-ENOMEM);
> > >                                       goto out_unlock;
> > >                    }
> > > >                 vma_aquire_read_lock(vma);
> > >            }
> > > out_unlock:
> > > >         mmap_read_unlock(mm);
> > > >         return vma;
> > > > }
> >
> > Do you need this?  I didn't think this was happening in the code as
> > written?  If you need it I would suggest making it happen always and
> > ditch the flag until a user needs this variant, but document what's
> > going on in here or even have a better name.
>
> I think yes, you do need this. I can see calls to anon_vma_prepare()
> under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> in mfill_atomic(). This means, just like in the pagefault path, we
> modify vma->anon_vma under mmap_read_lock protection which guarantees
> that adjacent VMAs won't change. This is important because
> __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> the VMA we locked but not of its neighbors, therefore holding per-VMA
> lock while calling anon_vma_prepare() is not enough. The solution
> Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> therefore would avoid the issue.
>
Thanks, Suren.
anon_vma_prepare() is also called in validate_move_areas() via move_pages().
>
> >
> > Thanks,
> > Liam
Liam R. Howlett Feb. 5, 2024, 10 p.m. UTC | #12
* Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
...

> > > > We can take care of anon_vma as well here right? I can take a bool
> > > > parameter ('prepare_anon' or something) and then:
> > > >
> > > >            if (vma) {
> > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > !anon_vma_prepare(vma)) {
> > > >                                       vma = ERR_PTR(-ENOMEM);
> > > >                                       goto out_unlock;
> > > >                    }
> > > > >                 vma_aquire_read_lock(vma);
> > > >            }
> > > > out_unlock:
> > > > >         mmap_read_unlock(mm);
> > > > >         return vma;
> > > > > }
> > >
> > > Do you need this?  I didn't think this was happening in the code as
> > > written?  If you need it I would suggest making it happen always and
> > > ditch the flag until a user needs this variant, but document what's
> > > going on in here or even have a better name.
> >
> > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > in mfill_atomic(). This means, just like in the pagefault path, we
> > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > that adjacent VMAs won't change. This is important because
> > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > lock while calling anon_vma_prepare() is not enough. The solution
> > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > therefore would avoid the issue.
> >

...

> anon_vma_prepare() is also called in validate_move_areas() via move_pages().

Probably worth doing it unconditionally and have a comment as to why it
is necessary.

Does this avoid your locking workaround?

Thanks,
Liam
Lokesh Gidra Feb. 5, 2024, 10:24 p.m. UTC | #13
On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> ...
>
> > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > parameter ('prepare_anon' or something) and then:
> > > > >
> > > > >            if (vma) {
> > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > !anon_vma_prepare(vma)) {
> > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > >                                       goto out_unlock;
> > > > >                    }
> > > > > >                 vma_aquire_read_lock(vma);
> > > > >            }
> > > > > out_unlock:
> > > > > >         mmap_read_unlock(mm);
> > > > > >         return vma;
> > > > > > }
> > > >
> > > > Do you need this?  I didn't think this was happening in the code as
> > > > written?  If you need it I would suggest making it happen always and
> > > > ditch the flag until a user needs this variant, but document what's
> > > > going on in here or even have a better name.
> > >
> > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > that adjacent VMAs won't change. This is important because
> > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > lock while calling anon_vma_prepare() is not enough. The solution
> > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > therefore would avoid the issue.
> > >
>
> ...
>
> > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
>
> Probably worth doing it unconditionally and have a comment as to why it
> is necessary.
>
The src_vma (in case of move_pages()) doesn't need to have it.

The only reason I'm not inclined to make it unconditional is what if
some future user of lock_vma() doesn't need it for their purpose? Why
allocate anon_vma in that case.

> Does this avoid your locking workaround?

Not sure which workaround you are referring to. I am almost done
implementing your suggestion. Very soon will share the next version of
the patch-set.
>
> Thanks,
> Liam
Liam R. Howlett Feb. 6, 2024, 2:35 p.m. UTC | #14
* Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > ...
> >
> > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > parameter ('prepare_anon' or something) and then:
> > > > > >
> > > > > >            if (vma) {
> > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > !anon_vma_prepare(vma)) {
> > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > >                                       goto out_unlock;
> > > > > >                    }
> > > > > > >                 vma_aquire_read_lock(vma);
> > > > > >            }
> > > > > > out_unlock:
> > > > > > >         mmap_read_unlock(mm);
> > > > > > >         return vma;
> > > > > > > }
> > > > >
> > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > written?  If you need it I would suggest making it happen always and
> > > > > ditch the flag until a user needs this variant, but document what's
> > > > > going on in here or even have a better name.
> > > >
> > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > that adjacent VMAs won't change. This is important because
> > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > therefore would avoid the issue.
> > > >
> >
> > ...
> >
> > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> >
> > Probably worth doing it unconditionally and have a comment as to why it
> > is necessary.
> >
> The src_vma (in case of move_pages()) doesn't need to have it.
> 
> The only reason I'm not inclined to make it unconditional is what if
> some future user of lock_vma() doesn't need it for their purpose? Why
> allocate anon_vma in that case.

Because there isn't a user and it'll add a flag that's a constant.  If
there is a need for the flag later then it can be added at that time.
Maybe there will never be a user and we've just complicated the code for
no reason.  Don't implement features that aren't necessary, especially
if there is no intent to use them.

> 
> > Does this avoid your locking workaround?
> 
> Not sure which workaround you are referring to. I am almost done
> implementing your suggestion. Very soon will share the next version of
> the patch-set.

The locking dance with the flags indicating if it's per-vma lock or
mmap_lock.
Lokesh Gidra Feb. 6, 2024, 4:26 p.m. UTC | #15
On Tue, Feb 6, 2024 at 6:35 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> > On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > > ...
> > >
> > > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > > parameter ('prepare_anon' or something) and then:
> > > > > > >
> > > > > > >            if (vma) {
> > > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > > !anon_vma_prepare(vma)) {
> > > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > > >                                       goto out_unlock;
> > > > > > >                    }
> > > > > > > >                 vma_aquire_read_lock(vma);
> > > > > > >            }
> > > > > > > out_unlock:
> > > > > > > >         mmap_read_unlock(mm);
> > > > > > > >         return vma;
> > > > > > > > }
> > > > > >
> > > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > > written?  If you need it I would suggest making it happen always and
> > > > > > ditch the flag until a user needs this variant, but document what's
> > > > > > going on in here or even have a better name.
> > > > >
> > > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > > that adjacent VMAs won't change. This is important because
> > > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > > therefore would avoid the issue.
> > > > >
> > >
> > > ...
> > >
> > > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> > >
> > > Probably worth doing it unconditionally and have a comment as to why it
> > > is necessary.
> > >
> > The src_vma (in case of move_pages()) doesn't need to have it.
> >
> > The only reason I'm not inclined to make it unconditional is what if
> > some future user of lock_vma() doesn't need it for their purpose? Why
> > allocate anon_vma in that case.
>
> Because there isn't a user and it'll add a flag that's a constant.  If
> there is a need for the flag later then it can be added at that time.
> Maybe there will never be a user and we've just complicated the code for
> no reason.  Don't implement features that aren't necessary, especially
> if there is no intent to use them.
>

I'm not too attached to the idea of keeping it conditional. But I have
already sent v3 which currently does it conditionally. Please take a
look at it. Along with any other comments/changes that I get, I'll
also make it unconditional in v4, if you say so.
> >
> > > Does this avoid your locking workaround?
> >
> > Not sure which workaround you are referring to. I am almost done
> > implementing your suggestion. Very soon will share the next version of
> > the patch-set.
>
> The locking dance with the flags indicating if it's per-vma lock or
> mmap_lock.
>
That dance was not because of anon_vma. It's just that I hadn't
realized that we can do it the way you suggested :) I really liked
your suggestion and is implemented in v3. PTAL.
Liam R. Howlett Feb. 6, 2024, 5:07 p.m. UTC | #16
* Lokesh Gidra <lokeshgidra@google.com> [240206 11:26]:
> On Tue, Feb 6, 2024 at 6:35 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240205 17:24]:
> > > On Mon, Feb 5, 2024 at 2:00 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240205 16:55]:
> > > > ...
> > > >
> > > > > > > > We can take care of anon_vma as well here right? I can take a bool
> > > > > > > > parameter ('prepare_anon' or something) and then:
> > > > > > > >
> > > > > > > >            if (vma) {
> > > > > > > >                     if (prepare_anon && vma_is_anonymous(vma)) &&
> > > > > > > > !anon_vma_prepare(vma)) {
> > > > > > > >                                       vma = ERR_PTR(-ENOMEM);
> > > > > > > >                                       goto out_unlock;
> > > > > > > >                    }
> > > > > > > > >                 vma_aquire_read_lock(vma);
> > > > > > > >            }
> > > > > > > > out_unlock:
> > > > > > > > >         mmap_read_unlock(mm);
> > > > > > > > >         return vma;
> > > > > > > > > }
> > > > > > >
> > > > > > > Do you need this?  I didn't think this was happening in the code as
> > > > > > > written?  If you need it I would suggest making it happen always and
> > > > > > > ditch the flag until a user needs this variant, but document what's
> > > > > > > going on in here or even have a better name.
> > > > > >
> > > > > > I think yes, you do need this. I can see calls to anon_vma_prepare()
> > > > > > under mmap_read_lock() protection in both mfill_atomic_hugetlb() and
> > > > > > in mfill_atomic(). This means, just like in the pagefault path, we
> > > > > > modify vma->anon_vma under mmap_read_lock protection which guarantees
> > > > > > that adjacent VMAs won't change. This is important because
> > > > > > __anon_vma_prepare() uses find_mergeable_anon_vma() that needs the
> > > > > > neighboring VMAs to be stable. Per-VMA lock guarantees stability of
> > > > > > the VMA we locked but not of its neighbors, therefore holding per-VMA
> > > > > > lock while calling anon_vma_prepare() is not enough. The solution
> > > > > > Lokesh suggests would call anon_vma_prepare() under mmap_read_lock and
> > > > > > therefore would avoid the issue.
> > > > > >
> > > >
> > > > ...
> > > >
> > > > > anon_vma_prepare() is also called in validate_move_areas() via move_pages().
> > > >
> > > > Probably worth doing it unconditionally and have a comment as to why it
> > > > is necessary.
> > > >
> > > The src_vma (in case of move_pages()) doesn't need to have it.
> > >
> > > The only reason I'm not inclined to make it unconditional is what if
> > > some future user of lock_vma() doesn't need it for their purpose? Why
> > > allocate anon_vma in that case.
> >
> > Because there isn't a user and it'll add a flag that's a constant.  If
> > there is a need for the flag later then it can be added at that time.
> > Maybe there will never be a user and we've just complicated the code for
> > no reason.  Don't implement features that aren't necessary, especially
> > if there is no intent to use them.
> >
> 
> I'm not too attached to the idea of keeping it conditional. But I have
> already sent v3 which currently does it conditionally. Please take a
> look at it. Along with any other comments/changes that I get, I'll
> also make it unconditional in v4, if you say so.

well, you use it conditionally, so it does have use.  It was not clear
in your comment above that you were going to use it.  I am not sure
about the dst/src needing/not needing it.  If you have a user, then
leave it in.

Thanks,
Liam
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c00a021bcce4..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,17 +2005,8 @@  static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-
-		/* Re-check after taking map_changing_lock */
-		down_read(&ctx->map_changing_lock);
-		if (likely(!atomic_read(&ctx->mmap_changing)))
-			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
-					 uffdio_move.len, uffdio_move.mode);
-		else
-			ret = -EAGAIN;
-		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(mm);
+		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
 		mmput(mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3210c3552976..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -138,9 +138,8 @@  extern long uffd_wp_range(struct vm_area_struct *vma,
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 flags);
 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
 			struct vm_area_struct *dst_vma,
 			struct vm_area_struct *src_vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 6e2ca04ab04d..d55bf18b80db 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -19,20 +19,39 @@ 
 #include <asm/tlb.h>
 #include "internal.h"
 
-static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
-				    unsigned long dst_start,
-				    unsigned long len)
+void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
+{
+	BUG_ON(!vma && !*mmap_locked);
+
+	if (*mmap_locked) {
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+	} else
+		vma_end_read(vma);
+}
+
+/*
+ * Search for VMA and make sure it is stable either by locking it or taking
+ * mmap_lock.
+ */
+struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
+					    unsigned long dst_start,
+					    unsigned long len,
+					    bool *mmap_locked)
 {
+	struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
+	if (!dst_vma) {
+		mmap_read_lock(dst_mm);
+		*mmap_locked = true;
+		dst_vma = find_vma(dst_mm, dst_start);
+	}
+
 	/*
 	 * Make sure that the dst range is both valid and fully within a
 	 * single existing vma.
 	 */
-	struct vm_area_struct *dst_vma;
-
-	dst_vma = find_vma(dst_mm, dst_start);
 	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
-		return NULL;
+		goto unpin;
 
 	/*
 	 * Check the vma is registered in uffd, this is required to
@@ -40,9 +59,13 @@  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 	 * time.
 	 */
 	if (!dst_vma->vm_userfaultfd_ctx.ctx)
-		return NULL;
+		goto unpin;
 
 	return dst_vma;
+
+unpin:
+	unpin_vma(dst_mm, dst_vma, mmap_locked);
+	return NULL;
 }
 
 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
@@ -350,7 +373,8 @@  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
 					      struct userfaultfd_ctx *ctx,
@@ -358,7 +382,8 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      uffd_flags_t flags)
+					      uffd_flags_t flags,
+					      bool *mmap_locked)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -380,7 +405,7 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
 		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(dst_mm);
+		unpin_vma(dst_mm, dst_vma, mmap_locked);
 		return -EINVAL;
 	}
 
@@ -404,12 +429,25 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (!dst_vma) {
 		err = -ENOENT;
-		dst_vma = find_dst_vma(dst_mm, dst_start, len);
-		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
-			goto out_unlock;
+		dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
+					       len, mmap_locked);
+		if (!dst_vma)
+			goto out;
+		if (!is_vm_hugetlb_page(dst_vma))
+			goto out_unlock_vma;
 
 		err = -EINVAL;
 		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+			goto out_unlock_vma;
+
+		/*
+		 * If memory mappings are changing because of non-cooperative
+		 * operation (e.g. mremap) running in parallel, bail out and
+		 * request the user to retry later
+		 */
+		down_read(&ctx->map_changing_lock);
+		err = -EAGAIN;
+		if (atomic_read(&ctx->mmap_changing))
 			goto out_unlock;
 
 		vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -465,7 +503,7 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 		if (unlikely(err == -ENOENT)) {
 			up_read(&ctx->map_changing_lock);
-			mmap_read_unlock(dst_mm);
+			unpin_vma(dst_mm, dst_vma, mmap_locked);
 			BUG_ON(!folio);
 
 			err = copy_folio_from_user(folio,
@@ -474,17 +512,6 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 				err = -EFAULT;
 				goto out;
 			}
-			mmap_read_lock(dst_mm);
-			down_read(&ctx->map_changing_lock);
-			/*
-			 * If memory mappings are changing because of non-cooperative
-			 * operation (e.g. mremap) running in parallel, bail out and
-			 * request the user to retry later
-			 */
-			if (atomic_read(ctx->mmap_changing)) {
-				err = -EAGAIN;
-				break;
-			}
 
 			dst_vma = NULL;
 			goto retry;
@@ -505,7 +532,8 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
-	mmap_read_unlock(dst_mm);
+out_unlock_vma:
+	unpin_vma(dst_mm, dst_vma, mmap_locked);
 out:
 	if (folio)
 		folio_put(folio);
@@ -521,7 +549,8 @@  extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
 				    unsigned long dst_start,
 				    unsigned long src_start,
 				    unsigned long len,
-				    uffd_flags_t flags);
+				    uffd_flags_t flags,
+				    bool *mmap_locked);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
@@ -581,6 +610,7 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	unsigned long src_addr, dst_addr;
 	long copied;
 	struct folio *folio;
+	bool mmap_locked = false;
 
 	/*
 	 * Sanitize the command parameters:
@@ -597,7 +627,14 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	copied = 0;
 	folio = NULL;
 retry:
-	mmap_read_lock(dst_mm);
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+	err = -ENOENT;
+	dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
+	if (!dst_vma)
+		goto out;
 
 	/*
 	 * If memory mappings are changing because of non-cooperative
@@ -609,15 +646,6 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
-	/*
-	 * Make sure the vma is not shared, that the dst range is
-	 * both valid and fully within a single existing vma.
-	 */
-	err = -ENOENT;
-	dst_vma = find_dst_vma(dst_mm, dst_start, len);
-	if (!dst_vma)
-		goto out_unlock;
-
 	err = -EINVAL;
 	/*
 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -638,8 +666,8 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
-					     src_start, len, flags);
+		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start
+					     len, flags, &mmap_locked);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -699,7 +727,8 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			void *kaddr;
 
 			up_read(&ctx->map_changing_lock);
-			mmap_read_unlock(dst_mm);
+			unpin_vma(dst_mm, dst_vma, &mmap_locked);
+
 			BUG_ON(!folio);
 
 			kaddr = kmap_local_folio(folio, 0);
@@ -730,7 +759,7 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
-	mmap_read_unlock(dst_mm);
+	unpin_vma(dst_mm, dst_vma, &mmap_locked);
 out:
 	if (folio)
 		folio_put(folio);
@@ -1285,8 +1314,6 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * @len: length of the virtual memory range
  * @mode: flags from uffdio_move.mode
  *
- * Must be called with mmap_lock held for read.
- *
  * move_pages() remaps arbitrary anonymous pages atomically in zero
  * copy. It only works on non shared anonymous pages because those can
  * be relocated without generating non linear anon_vmas in the rmap
@@ -1353,15 +1380,16 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * could be obtained. This is the only additional complexity added to
  * the rmap code to provide this anonymous page remapping functionality.
  */
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 mode)
 {
+	struct mm_struct *mm = ctx->mm;
 	struct vm_area_struct *src_vma, *dst_vma;
 	unsigned long src_addr, dst_addr;
 	pmd_t *src_pmd, *dst_pmd;
 	long err = -EINVAL;
 	ssize_t moved = 0;
+	bool mmap_locked = false;
 
 	/* Sanitize the command parameters. */
 	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
@@ -1374,28 +1402,52 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 	    WARN_ON_ONCE(dst_start + len <= dst_start))
 		goto out;
 
+	dst_vma = NULL;
+	src_vma = lock_vma_under_rcu(mm, src_start);
+	if (src_vma) {
+		dst_vma = lock_vma_under_rcu(mm, dst_start);
+		if (!dst_vma)
+			vma_end_read(src_vma);
+	}
+
+	/* If we failed to lock both VMAs, fall back to mmap_lock */
+	if (!dst_vma) {
+		mmap_read_lock(mm);
+		mmap_locked = true;
+		src_vma = find_vma(mm, src_start);
+		if (!src_vma)
+			goto out_unlock_mmap;
+		dst_vma = find_vma(mm, dst_start);
+		if (!dst_vma)
+			goto out_unlock_mmap;
+	}
+
+	/* Re-check after taking map_changing_lock */
+	down_read(&ctx->map_changing_lock);
+	if (likely(atomic_read(&ctx->mmap_changing))) {
+		err = -EAGAIN;
+		goto out_unlock;
+	}
 	/*
 	 * Make sure the vma is not shared, that the src and dst remap
 	 * ranges are both valid and fully within a single existing
 	 * vma.
 	 */
-	src_vma = find_vma(mm, src_start);
-	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
-		goto out;
+	if (src_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
 	if (src_start < src_vma->vm_start ||
 	    src_start + len > src_vma->vm_end)
-		goto out;
+		goto out_unlock;
 
-	dst_vma = find_vma(mm, dst_start);
-	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-		goto out;
+	if (dst_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
 	if (dst_start < dst_vma->vm_start ||
 	    dst_start + len > dst_vma->vm_end)
-		goto out;
+		goto out_unlock;
 
 	err = validate_move_areas(ctx, src_vma, dst_vma);
 	if (err)
-		goto out;
+		goto out_unlock;
 
 	for (src_addr = src_start, dst_addr = dst_start;
 	     src_addr < src_start + len;) {
@@ -1512,6 +1564,15 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 		moved += step_size;
 	}
 
+out_unlock:
+	up_read(&ctx->map_changing_lock);
+out_unlock_mmap:
+	if (mmap_locked)
+		mmap_read_unlock(mm);
+	else {
+		vma_end_read(dst_vma);
+		vma_end_read(src_vma);
+	}
 out:
 	VM_WARN_ON(moved < 0);
 	VM_WARN_ON(err > 0);