[v4,3/3] userfaultfd: use per-vma locks in userfaultfd operations

Message ID: 20240208212204.2043140-4-lokeshgidra@google.com (mailing list archive)
State: Handled Elsewhere
Delegated to: Paul Moore
Series: per-vma locks in userfaultfd

Commit Message

Lokesh Gidra Feb. 8, 2024, 9:22 p.m. UTC
All userfaultfd operations, except write-protect, opportunistically use
per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
critical section.

Write-protect operation requires mmap_lock as it iterates over multiple
vmas.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              |  13 +-
 include/linux/userfaultfd_k.h |   5 +-
 mm/userfaultfd.c              | 356 ++++++++++++++++++++++++++--------
 3 files changed, 275 insertions(+), 99 deletions(-)
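
For orientation, the core of the series is a "per-vma lock first, mmap_lock as fallback" lookup. A minimal sketch of that pattern under CONFIG_PER_VMA_LOCK, simplified from the patch's lock_vma() quoted below (anon_vma preparation and error reporting omitted; the helper name is illustrative only):

static struct vm_area_struct *uffd_lock_vma_sketch(struct mm_struct *mm,
						   unsigned long address)
{
	struct vm_area_struct *vma;

	/* Fast path: take the per-vma read lock without touching mmap_lock. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma)
		return vma;

	/* Fallback: look the vma up under mmap_lock and read-lock it there. */
	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (vma)
		down_read(&vma->vm_lock->lock);
	mmap_read_unlock(mm);

	return vma;	/* NULL if no vma contains @address */
}

The vma stays read-locked on return and is dropped with vma_end_read(); write-protect keeps using mmap_lock because it iterates over multiple vmas.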

Comments

Liam R. Howlett Feb. 9, 2024, 3:06 a.m. UTC | #1
* Lokesh Gidra <lokeshgidra@google.com> [240208 16:22]:
> All userfaultfd operations, except write-protect, opportunistically use
> per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> critical section.
> 
> Write-protect operation requires mmap_lock as it iterates over multiple
> vmas.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
>  fs/userfaultfd.c              |  13 +-
>  include/linux/userfaultfd_k.h |   5 +-
>  mm/userfaultfd.c              | 356 ++++++++++++++++++++++++++--------
>  3 files changed, 275 insertions(+), 99 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index c00a021bcce4..60dcfafdc11a 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(mm)) {
> -		mmap_read_lock(mm);
> -
> -		/* Re-check after taking map_changing_lock */
> -		down_read(&ctx->map_changing_lock);
> -		if (likely(!atomic_read(&ctx->mmap_changing)))
> -			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> -					 uffdio_move.len, uffdio_move.mode);
> -		else
> -			ret = -EAGAIN;
> -		up_read(&ctx->map_changing_lock);
> -		mmap_read_unlock(mm);
> +		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> +				 uffdio_move.len, uffdio_move.mode);
>  		mmput(mm);
>  	} else {
>  		return -ESRCH;
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 3210c3552976..05d59f74fc88 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
>  /* move_pages */
>  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
>  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 flags);
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 flags);
>  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
>  			struct vm_area_struct *dst_vma,
>  			struct vm_area_struct *src_vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 74aad0831e40..1e25768b2136 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -19,20 +19,12 @@
>  #include <asm/tlb.h>
>  #include "internal.h"
>  
> -static __always_inline
> -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> -				    unsigned long dst_start,
> -				    unsigned long len)

You could probably leave the __always_inline for this.
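
That is, the replacement helper could keep the annotation; roughly (restating the patch's validate_dst_vma()):

static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma,
					     unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/* The vma must be registered with userfaultfd (VM_MAYWRITE check). */
	return dst_vma->vm_userfaultfd_ctx.ctx != NULL;
}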

> +static bool validate_dst_vma(struct vm_area_struct *dst_vma,
> +			     unsigned long dst_end)
>  {
> -	/*
> -	 * Make sure that the dst range is both valid and fully within a
> -	 * single existing vma.
> -	 */
> -	struct vm_area_struct *dst_vma;
> -
> -	dst_vma = find_vma(dst_mm, dst_start);
> -	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> -		return NULL;
> +	/* Make sure that the dst range is fully within dst_vma. */
> +	if (dst_end > dst_vma->vm_end)
> +		return false;
>  
>  	/*
>  	 * Check the vma is registered in uffd, this is required to
> @@ -40,11 +32,125 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
>  	 * time.
>  	 */
>  	if (!dst_vma->vm_userfaultfd_ctx.ctx)
> -		return NULL;
> +		return false;
> +
> +	return true;
> +}
> +
> +#ifdef CONFIG_PER_VMA_LOCK
> +/*
> + * lock_vma() - Lookup and lock vma corresponding to @address.
> + * @mm: mm to search vma in.
> + * @address: address that the vma should contain.
> + * @prepare_anon: If true, then prepare the vma (if private) with anon_vma.
> + *
> + * Should be called without holding mmap_lock. vma should be unlocked after use
> + * with unlock_vma().
> + *
> + * Return: A locked vma containing @address, NULL if no vma is found, or
> + * -ENOMEM if anon_vma couldn't be allocated.
> + */
> +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> +				       unsigned long address,
> +				       bool prepare_anon)
> +{
> +	struct vm_area_struct *vma;
> +
> +	vma = lock_vma_under_rcu(mm, address);
> +	if (vma) {
> +		/*
> +		 * lock_vma_under_rcu() only checks anon_vma for private
> +		 * anonymous mappings. But we need to ensure it is assigned in
> +		 * private file-backed vmas as well.
> +		 */
> +		if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> +		    !vma->anon_vma)
> +			vma_end_read(vma);
> +		else
> +			return vma;
> +	}
> +
> +	mmap_read_lock(mm);
> +	vma = vma_lookup(mm, address);
> +	if (vma) {
> +		if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> +		    anon_vma_prepare(vma)) {
> +			vma = ERR_PTR(-ENOMEM);
> +		} else {
> +			/*
> +			 * We cannot use vma_start_read() as it may fail due to
> +			 * false locked (see comment in vma_start_read()). We
> +			 * can avoid that by directly locking vm_lock under
> +			 * mmap_lock, which guarantees that nobody can lock the
> +			 * vma for write (vma_start_write()) under us.
> +			 */
> +			down_read(&vma->vm_lock->lock);
> +		}
> +	}
> +
> +	mmap_read_unlock(mm);
> +	return vma;
> +}
> +
> +static void unlock_vma(struct vm_area_struct *vma)
> +{
> +	vma_end_read(vma);
> +}
> +
> +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
> +						    unsigned long dst_start,
> +						    unsigned long len)
> +{
> +	struct vm_area_struct *dst_vma;
> +
> +	/* Ensure anon_vma is assigned for private vmas */
> +	dst_vma = lock_vma(dst_mm, dst_start, true);
> +
> +	if (!dst_vma)
> +		return ERR_PTR(-ENOENT);
> +
> +	if (PTR_ERR(dst_vma) == -ENOMEM)
> +		return dst_vma;
> +
> +	if (!validate_dst_vma(dst_vma, dst_start + len))
> +		goto out_unlock;
>  
>  	return dst_vma;
> +out_unlock:
> +	unlock_vma(dst_vma);
> +	return ERR_PTR(-ENOENT);
>  }
>  
> +#else
> +
> +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> +						       unsigned long dst_start,
> +						       unsigned long len)
> +{
> +	struct vm_area_struct *dst_vma;
> +	int err = -ENOENT;
> +
> +	mmap_read_lock(dst_mm);
> +	dst_vma = vma_lookup(dst_mm, dst_start);
> +	if (!dst_vma)
> +		goto out_unlock;
> +
> +	/* Ensure anon_vma is assigned for private vmas */
> +	if (!(dst_vma->vm_flags & VM_SHARED) && anon_vma_prepare(dst_vma)) {
> +		err = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	if (!validate_dst_vma(dst_vma, dst_start + len))
> +		goto out_unlock;
> +
> +	return dst_vma;
> +out_unlock:
> +	mmap_read_unlock(dst_mm);
> +	return ERR_PTR(err);
> +}
> +#endif
> +
>  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
>  static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
>  				 unsigned long dst_addr)
> @@ -350,7 +456,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>  #ifdef CONFIG_HUGETLB_PAGE
>  /*
>   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> - * called with mmap_lock held, it will release mmap_lock before returning.
> + * called with either vma-lock or mmap_lock held, it will release the lock
> + * before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      struct userfaultfd_ctx *ctx,
> @@ -361,7 +468,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  					      uffd_flags_t flags)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
> -	int vm_shared = dst_vma->vm_flags & VM_SHARED;
>  	ssize_t err;
>  	pte_t *dst_pte;
>  	unsigned long src_addr, dst_addr;
> @@ -380,7 +486,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
>  		up_read(&ctx->map_changing_lock);
> +#ifdef CONFIG_PER_VMA_LOCK
> +		unlock_vma(dst_vma);
> +#else
>  		mmap_read_unlock(dst_mm);
> +#endif
>  		return -EINVAL;
>  	}
>  
> @@ -403,24 +513,32 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 * retry, dst_vma will be set to NULL and we must lookup again.
>  	 */
>  	if (!dst_vma) {
> +#ifdef CONFIG_PER_VMA_LOCK
> +		dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> +#else
> +		dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> +#endif
> +		if (IS_ERR(dst_vma)) {
> +			err = PTR_ERR(dst_vma);
> +			goto out;
> +		}
> +
>  		err = -ENOENT;
> -		dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> -			goto out_unlock;
> +		if (!is_vm_hugetlb_page(dst_vma))
> +			goto out_unlock_vma;
>  
>  		err = -EINVAL;
>  		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> -			goto out_unlock;
> -
> -		vm_shared = dst_vma->vm_flags & VM_SHARED;
> -	}
> +			goto out_unlock_vma;
>  
> -	/*
> -	 * If not shared, ensure the dst_vma has a anon_vma.
> -	 */
> -	err = -ENOMEM;
> -	if (!vm_shared) {
> -		if (unlikely(anon_vma_prepare(dst_vma)))
> +		/*
> +		 * If memory mappings are changing because of non-cooperative
> +		 * operation (e.g. mremap) running in parallel, bail out and
> +		 * request the user to retry later
> +		 */
> +		down_read(&ctx->map_changing_lock);
> +		err = -EAGAIN;
> +		if (atomic_read(&ctx->mmap_changing))
>  			goto out_unlock;
>  	}
>  
> @@ -465,7 +583,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  		if (unlikely(err == -ENOENT)) {
>  			up_read(&ctx->map_changing_lock);
> +#ifdef CONFIG_PER_VMA_LOCK
> +			unlock_vma(dst_vma);
> +#else
>  			mmap_read_unlock(dst_mm);
> +#endif
>  			BUG_ON(!folio);
>  
>  			err = copy_folio_from_user(folio,
> @@ -474,17 +596,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				err = -EFAULT;
>  				goto out;
>  			}
> -			mmap_read_lock(dst_mm);
> -			down_read(&ctx->map_changing_lock);
> -			/*
> -			 * If memory mappings are changing because of non-cooperative
> -			 * operation (e.g. mremap) running in parallel, bail out and
> -			 * request the user to retry later
> -			 */
> -			if (atomic_read(&ctx->mmap_changing)) {
> -				err = -EAGAIN;
> -				break;
> -			}
>  
>  			dst_vma = NULL;
>  			goto retry;
> @@ -505,7 +616,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> +out_unlock_vma:
> +#ifdef CONFIG_PER_VMA_LOCK
> +	unlock_vma(dst_vma);
> +#else
>  	mmap_read_unlock(dst_mm);
> +#endif
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -597,7 +713,19 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	copied = 0;
>  	folio = NULL;
>  retry:
> -	mmap_read_lock(dst_mm);
> +	/*
> +	 * Make sure the vma is not shared, that the dst range is
> +	 * both valid and fully within a single existing vma.
> +	 */
> +#ifdef CONFIG_PER_VMA_LOCK
> +	dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> +#else
> +	dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> +#endif
> +	if (IS_ERR(dst_vma)) {
> +		err = PTR_ERR(dst_vma);
> +		goto out;
> +	}
>  
>  	/*
>  	 * If memory mappings are changing because of non-cooperative
> @@ -609,15 +737,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
> -	/*
> -	 * Make sure the vma is not shared, that the dst range is
> -	 * both valid and fully within a single existing vma.
> -	 */
> -	err = -ENOENT;
> -	dst_vma = find_dst_vma(dst_mm, dst_start, len);
> -	if (!dst_vma)
> -		goto out_unlock;
> -
>  	err = -EINVAL;
>  	/*
>  	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> @@ -647,16 +766,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
>  		goto out_unlock;
>  
> -	/*
> -	 * Ensure the dst_vma has a anon_vma or this page
> -	 * would get a NULL anon_vma when moved in the
> -	 * dst_vma.
> -	 */
> -	err = -ENOMEM;
> -	if (!(dst_vma->vm_flags & VM_SHARED) &&
> -	    unlikely(anon_vma_prepare(dst_vma)))
> -		goto out_unlock;
> -
>  	while (src_addr < src_start + len) {
>  		pmd_t dst_pmdval;
>  
> @@ -699,7 +808,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  			void *kaddr;
>  
>  			up_read(&ctx->map_changing_lock);
> +#ifdef CONFIG_PER_VMA_LOCK
> +			unlock_vma(dst_vma);
> +#else
>  			mmap_read_unlock(dst_mm);
> +#endif
>  			BUG_ON(!folio);
>  
>  			kaddr = kmap_local_folio(folio, 0);
> @@ -730,7 +843,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  
>  out_unlock:
>  	up_read(&ctx->map_changing_lock);
> +#ifdef CONFIG_PER_VMA_LOCK
> +	unlock_vma(dst_vma);
> +#else
>  	mmap_read_unlock(dst_mm);
> +#endif
>  out:
>  	if (folio)
>  		folio_put(folio);
> @@ -1267,16 +1384,67 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>  	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
>  		return -EINVAL;
>  
> -	/*
> -	 * Ensure the dst_vma has a anon_vma or this page
> -	 * would get a NULL anon_vma when moved in the
> -	 * dst_vma.
> -	 */
> -	if (unlikely(anon_vma_prepare(dst_vma)))
> -		return -ENOMEM;
> +	return 0;
> +}
> +
> +#ifdef CONFIG_PER_VMA_LOCK
> +static int find_and_lock_vmas(struct mm_struct *mm,
> +			      unsigned long dst_start,
> +			      unsigned long src_start,
> +			      struct vm_area_struct **dst_vmap,
> +			      struct vm_area_struct **src_vmap)
> +{
> +	int err;
> +
> +	/* There is no need to prepare anon_vma for src_vma */
> +	*src_vmap = lock_vma(mm, src_start, false);
> +	if (!*src_vmap)
> +		return -ENOENT;
> +
> +	/* Ensure anon_vma is assigned for anonymous vma */
> +	*dst_vmap = lock_vma(mm, dst_start, true);
> +	err = -ENOENT;
> +	if (!*dst_vmap)
> +		goto out_unlock;
> +
> +	err = -ENOMEM;
> +	if (PTR_ERR(*dst_vmap) == -ENOMEM)
> +		goto out_unlock;

If you change lock_vma() to return the vma or ERR_PTR(-ENOENT) /
ERR_PTR(-ENOMEM), then you could change this to check IS_ERR() and
return the PTR_ERR().

You could also use IS_ERR_OR_NULL here, but the first suggestion will
simplify your life for find_and_lock_dst_vma() and the error type to
return.

What you have here will work though.
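
As a rough sketch of that contract (illustrative only, not the posted code), lock_vma() would return either the locked vma or an ERR_PTR(), never NULL:

static struct vm_area_struct *lock_vma(struct mm_struct *mm,
				       unsigned long address,
				       bool prepare_anon)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/* Private file-backed vmas may still need an anon_vma. */
		if (!prepare_anon || (vma->vm_flags & VM_SHARED) ||
		    vma->anon_vma)
			return vma;
		vma_end_read(vma);	/* retry under mmap_lock below */
	}

	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (!vma) {
		vma = ERR_PTR(-ENOENT);
	} else if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
		   anon_vma_prepare(vma)) {
		vma = ERR_PTR(-ENOMEM);
	} else {
		/* Lock vm_lock under mmap_lock so no writer can race us. */
		down_read(&vma->vm_lock->lock);
	}
	mmap_read_unlock(mm);

	return vma;
}

Callers would then collapse to a single check, e.g.:

	*dst_vmap = lock_vma(mm, dst_start, true);
	if (IS_ERR(*dst_vmap)) {
		err = PTR_ERR(*dst_vmap);
		goto out_unlock;	/* drop *src_vmap */
	}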

>  
>  	return 0;
> +out_unlock:
> +	unlock_vma(*src_vmap);
> +	return err;
>  }
> +#else
> +static int lock_mm_and_find_vmas(struct mm_struct *mm,
> +				 unsigned long dst_start,
> +				 unsigned long src_start,
> +				 struct vm_area_struct **dst_vmap,
> +				 struct vm_area_struct **src_vmap)
> +{
> +	int err = -ENOENT;

Nit: new line after declarations.

> +	mmap_read_lock(mm);
> +
> +	*src_vmap = vma_lookup(mm, src_start);
> +	if (!*src_vmap)
> +		goto out_unlock;
> +
> +	*dst_vmap = vma_lookup(mm, dst_start);
> +	if (!*dst_vmap)
> +		goto out_unlock;
> +
> +	/* Ensure anon_vma is assigned */
> +	err = -ENOMEM;
> +	if (vma_is_anonymous(*dst_vmap) && anon_vma_prepare(*dst_vmap))
> +		goto out_unlock;
> +
> +	return 0;
> +out_unlock:
> +	mmap_read_unlock(mm);
> +	return err;
> +}
> +#endif
>  
>  /**
>   * move_pages - move arbitrary anonymous pages of an existing vma
> @@ -1287,8 +1455,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * @len: length of the virtual memory range
>   * @mode: flags from uffdio_move.mode
>   *
> - * Must be called with mmap_lock held for read.
> - *

Will either use the mmap_lock in read mode or per-vma locking ?

>   * move_pages() remaps arbitrary anonymous pages atomically in zero
>   * copy. It only works on non shared anonymous pages because those can
>   * be relocated without generating non linear anon_vmas in the rmap
> @@ -1355,10 +1521,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
>   * could be obtained. This is the only additional complexity added to
>   * the rmap code to provide this anonymous page remapping functionality.
>   */
> -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> -		   unsigned long dst_start, unsigned long src_start,
> -		   unsigned long len, __u64 mode)
> +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +		   unsigned long src_start, unsigned long len, __u64 mode)
>  {
> +	struct mm_struct *mm = ctx->mm;

You dropped the argument, but left the comment for the argument.

>  	struct vm_area_struct *src_vma, *dst_vma;
>  	unsigned long src_addr, dst_addr;
>  	pmd_t *src_pmd, *dst_pmd;
> @@ -1376,28 +1542,40 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  	    WARN_ON_ONCE(dst_start + len <= dst_start))
>  		goto out;
>  
> +#ifdef CONFIG_PER_VMA_LOCK
> +	err = find_and_lock_vmas(mm, dst_start, src_start,
> +				 &dst_vma, &src_vma);
> +#else
> +	err = lock_mm_and_find_vmas(mm, dst_start, src_start,
> +				    &dst_vma, &src_vma);
> +#endif

I was hoping you could hide this completely, but it's probably better to
show what's going on and the function names document it well.
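
For reference, hiding it would roughly mean giving both configurations one entry point, e.g. (a sketch; the name uffd_move_lock is hypothetical):

#ifdef CONFIG_PER_VMA_LOCK
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start, unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	return find_and_lock_vmas(mm, dst_start, src_start, dst_vmap, src_vmap);
}
#else
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start, unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	return lock_mm_and_find_vmas(mm, dst_start, src_start, dst_vmap, src_vmap);
}
#endif

move_pages() would then call uffd_move_lock() unconditionally, at the cost of the locking scheme being less visible at the call site.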

> +	if (err)
> +		goto out;
> +
> +	/* Re-check after taking map_changing_lock */
> +	down_read(&ctx->map_changing_lock);
> +	if (likely(atomic_read(&ctx->mmap_changing))) {
> +		err = -EAGAIN;
> +		goto out_unlock;
> +	}
>  	/*
>  	 * Make sure the vma is not shared, that the src and dst remap
>  	 * ranges are both valid and fully within a single existing
>  	 * vma.
>  	 */
> -	src_vma = find_vma(mm, src_start);
> -	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> -		goto out;
> -	if (src_start < src_vma->vm_start ||
> -	    src_start + len > src_vma->vm_end)
> -		goto out;
> +	if (src_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
> +	if (src_start + len > src_vma->vm_end)
> +		goto out_unlock;
>  
> -	dst_vma = find_vma(mm, dst_start);
> -	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> -		goto out;
> -	if (dst_start < dst_vma->vm_start ||
> -	    dst_start + len > dst_vma->vm_end)
> -		goto out;
> +	if (dst_vma->vm_flags & VM_SHARED)
> +		goto out_unlock;
> +	if (dst_start + len > dst_vma->vm_end)
> +		goto out_unlock;
>  
>  	err = validate_move_areas(ctx, src_vma, dst_vma);
>  	if (err)
> -		goto out;
> +		goto out_unlock;
>  
>  	for (src_addr = src_start, dst_addr = dst_start;
>  	     src_addr < src_start + len;) {
> @@ -1514,6 +1692,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
>  		moved += step_size;
>  	}
>  
> +out_unlock:
> +	up_read(&ctx->map_changing_lock);
> +#ifdef CONFIG_PER_VMA_LOCK
> +	unlock_vma(dst_vma);
> +	unlock_vma(src_vma);
> +#else
> +	mmap_read_unlock(mm);
> +#endif
>  out:
>  	VM_WARN_ON(moved < 0);
>  	VM_WARN_ON(err > 0);
> -- 
> 2.43.0.687.g38aa6559b0-goog
>
Lokesh Gidra Feb. 9, 2024, 6:01 p.m. UTC | #2
On Thu, Feb 8, 2024 at 7:07 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240208 16:22]:
> > All userfaultfd operations, except write-protect, opportunistically use
> > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > critical section.
> >
> > Write-protect operation requires mmap_lock as it iterates over multiple
> > vmas.
> >
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> >  fs/userfaultfd.c              |  13 +-
> >  include/linux/userfaultfd_k.h |   5 +-
> >  mm/userfaultfd.c              | 356 ++++++++++++++++++++++++++--------
> >  3 files changed, 275 insertions(+), 99 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index c00a021bcce4..60dcfafdc11a 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> >               return -EINVAL;
> >
> >       if (mmget_not_zero(mm)) {
> > -             mmap_read_lock(mm);
> > -
> > -             /* Re-check after taking map_changing_lock */
> > -             down_read(&ctx->map_changing_lock);
> > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > -                                      uffdio_move.len, uffdio_move.mode);
> > -             else
> > -                     ret = -EAGAIN;
> > -             up_read(&ctx->map_changing_lock);
> > -             mmap_read_unlock(mm);
> > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > +                              uffdio_move.len, uffdio_move.mode);
> >               mmput(mm);
> >       } else {
> >               return -ESRCH;
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index 3210c3552976..05d59f74fc88 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> >  /* move_pages */
> >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 flags);
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 flags);
> >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> >                       struct vm_area_struct *dst_vma,
> >                       struct vm_area_struct *src_vma,
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 74aad0831e40..1e25768b2136 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -19,20 +19,12 @@
> >  #include <asm/tlb.h>
> >  #include "internal.h"
> >
> > -static __always_inline
> > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > -                                 unsigned long dst_start,
> > -                                 unsigned long len)
>
> You could probably leave the __always_inline for this.

Sure
>
> > +static bool validate_dst_vma(struct vm_area_struct *dst_vma,
> > +                          unsigned long dst_end)
> >  {
> > -     /*
> > -      * Make sure that the dst range is both valid and fully within a
> > -      * single existing vma.
> > -      */
> > -     struct vm_area_struct *dst_vma;
> > -
> > -     dst_vma = find_vma(dst_mm, dst_start);
> > -     if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > -             return NULL;
> > +     /* Make sure that the dst range is fully within dst_vma. */
> > +     if (dst_end > dst_vma->vm_end)
> > +             return false;
> >
> >       /*
> >        * Check the vma is registered in uffd, this is required to
> > @@ -40,11 +32,125 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> >        * time.
> >        */
> >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > -             return NULL;
> > +             return false;
> > +
> > +     return true;
> > +}
> > +
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +/*
> > + * lock_vma() - Lookup and lock vma corresponding to @address.
> > + * @mm: mm to search vma in.
> > + * @address: address that the vma should contain.
> > + * @prepare_anon: If true, then prepare the vma (if private) with anon_vma.
> > + *
> > + * Should be called without holding mmap_lock. vma should be unlocked after use
> > + * with unlock_vma().
> > + *
> > + * Return: A locked vma containing @address, NULL if no vma is found, or
> > + * -ENOMEM if anon_vma couldn't be allocated.
> > + */
> > +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > +                                    unsigned long address,
> > +                                    bool prepare_anon)
> > +{
> > +     struct vm_area_struct *vma;
> > +
> > +     vma = lock_vma_under_rcu(mm, address);
> > +     if (vma) {
> > +             /*
> > +              * lock_vma_under_rcu() only checks anon_vma for private
> > +              * anonymous mappings. But we need to ensure it is assigned in
> > +              * private file-backed vmas as well.
> > +              */
> > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > +                 !vma->anon_vma)
> > +                     vma_end_read(vma);
> > +             else
> > +                     return vma;
> > +     }
> > +
> > +     mmap_read_lock(mm);
> > +     vma = vma_lookup(mm, address);
> > +     if (vma) {
> > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > +                 anon_vma_prepare(vma)) {
> > +                     vma = ERR_PTR(-ENOMEM);
> > +             } else {
> > +                     /*
> > +                      * We cannot use vma_start_read() as it may fail due to
> > +                      * false locked (see comment in vma_start_read()). We
> > +                      * can avoid that by directly locking vm_lock under
> > +                      * mmap_lock, which guarantees that nobody can lock the
> > +                      * vma for write (vma_start_write()) under us.
> > +                      */
> > +                     down_read(&vma->vm_lock->lock);
> > +             }
> > +     }
> > +
> > +     mmap_read_unlock(mm);
> > +     return vma;
> > +}
> > +
> > +static void unlock_vma(struct vm_area_struct *vma)
> > +{
> > +     vma_end_read(vma);
> > +}
> > +
> > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
> > +                                                 unsigned long dst_start,
> > +                                                 unsigned long len)
> > +{
> > +     struct vm_area_struct *dst_vma;
> > +
> > +     /* Ensure anon_vma is assigned for private vmas */
> > +     dst_vma = lock_vma(dst_mm, dst_start, true);
> > +
> > +     if (!dst_vma)
> > +             return ERR_PTR(-ENOENT);
> > +
> > +     if (PTR_ERR(dst_vma) == -ENOMEM)
> > +             return dst_vma;
> > +
> > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > +             goto out_unlock;
> >
> >       return dst_vma;
> > +out_unlock:
> > +     unlock_vma(dst_vma);
> > +     return ERR_PTR(-ENOENT);
> >  }
> >
> > +#else
> > +
> > +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> > +                                                    unsigned long dst_start,
> > +                                                    unsigned long len)
> > +{
> > +     struct vm_area_struct *dst_vma;
> > +     int err = -ENOENT;
> > +
> > +     mmap_read_lock(dst_mm);
> > +     dst_vma = vma_lookup(dst_mm, dst_start);
> > +     if (!dst_vma)
> > +             goto out_unlock;
> > +
> > +     /* Ensure anon_vma is assigned for private vmas */
> > +     if (!(dst_vma->vm_flags & VM_SHARED) && anon_vma_prepare(dst_vma)) {
> > +             err = -ENOMEM;
> > +             goto out_unlock;
> > +     }
> > +
> > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > +             goto out_unlock;
> > +
> > +     return dst_vma;
> > +out_unlock:
> > +     mmap_read_unlock(dst_mm);
> > +     return ERR_PTR(err);
> > +}
> > +#endif
> > +
> >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> >  static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
> >                                unsigned long dst_addr)
> > @@ -350,7 +456,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  #ifdef CONFIG_HUGETLB_PAGE
> >  /*
> >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > - * called with mmap_lock held, it will release mmap_lock before returning.
> > + * called with either vma-lock or mmap_lock held, it will release the lock
> > + * before returning.
> >   */
> >  static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             struct userfaultfd_ctx *ctx,
> > @@ -361,7 +468,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                                             uffd_flags_t flags)
> >  {
> >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > -     int vm_shared = dst_vma->vm_flags & VM_SHARED;
> >       ssize_t err;
> >       pte_t *dst_pte;
> >       unsigned long src_addr, dst_addr;
> > @@ -380,7 +486,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        */
> >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> >               up_read(&ctx->map_changing_lock);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +             unlock_vma(dst_vma);
> > +#else
> >               mmap_read_unlock(dst_mm);
> > +#endif
> >               return -EINVAL;
> >       }
> >
> > @@ -403,24 +513,32 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        * retry, dst_vma will be set to NULL and we must lookup again.
> >        */
> >       if (!dst_vma) {
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +             dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > +#else
> > +             dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > +#endif
> > +             if (IS_ERR(dst_vma)) {
> > +                     err = PTR_ERR(dst_vma);
> > +                     goto out;
> > +             }
> > +
> >               err = -ENOENT;
> > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > -                     goto out_unlock;
> > +             if (!is_vm_hugetlb_page(dst_vma))
> > +                     goto out_unlock_vma;
> >
> >               err = -EINVAL;
> >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > -                     goto out_unlock;
> > -
> > -             vm_shared = dst_vma->vm_flags & VM_SHARED;
> > -     }
> > +                     goto out_unlock_vma;
> >
> > -     /*
> > -      * If not shared, ensure the dst_vma has a anon_vma.
> > -      */
> > -     err = -ENOMEM;
> > -     if (!vm_shared) {
> > -             if (unlikely(anon_vma_prepare(dst_vma)))
> > +             /*
> > +              * If memory mappings are changing because of non-cooperative
> > +              * operation (e.g. mremap) running in parallel, bail out and
> > +              * request the user to retry later
> > +              */
> > +             down_read(&ctx->map_changing_lock);
> > +             err = -EAGAIN;
> > +             if (atomic_read(&ctx->mmap_changing))
> >                       goto out_unlock;
> >       }
> >
> > @@ -465,7 +583,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >               if (unlikely(err == -ENOENT)) {
> >                       up_read(&ctx->map_changing_lock);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +                     unlock_vma(dst_vma);
> > +#else
> >                       mmap_read_unlock(dst_mm);
> > +#endif
> >                       BUG_ON(!folio);
> >
> >                       err = copy_folio_from_user(folio,
> > @@ -474,17 +596,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >                               err = -EFAULT;
> >                               goto out;
> >                       }
> > -                     mmap_read_lock(dst_mm);
> > -                     down_read(&ctx->map_changing_lock);
> > -                     /*
> > -                      * If memory mappings are changing because of non-cooperative
> > -                      * operation (e.g. mremap) running in parallel, bail out and
> > -                      * request the user to retry later
> > -                      */
> > -                     if (atomic_read(&ctx->mmap_changing)) {
> > -                             err = -EAGAIN;
> > -                             break;
> > -                     }
> >
> >                       dst_vma = NULL;
> >                       goto retry;
> > @@ -505,7 +616,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > +out_unlock_vma:
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     unlock_vma(dst_vma);
> > +#else
> >       mmap_read_unlock(dst_mm);
> > +#endif
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -597,7 +713,19 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       copied = 0;
> >       folio = NULL;
> >  retry:
> > -     mmap_read_lock(dst_mm);
> > +     /*
> > +      * Make sure the vma is not shared, that the dst range is
> > +      * both valid and fully within a single existing vma.
> > +      */
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > +#else
> > +     dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > +#endif
> > +     if (IS_ERR(dst_vma)) {
> > +             err = PTR_ERR(dst_vma);
> > +             goto out;
> > +     }
> >
> >       /*
> >        * If memory mappings are changing because of non-cooperative
> > @@ -609,15 +737,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >       if (atomic_read(&ctx->mmap_changing))
> >               goto out_unlock;
> >
> > -     /*
> > -      * Make sure the vma is not shared, that the dst range is
> > -      * both valid and fully within a single existing vma.
> > -      */
> > -     err = -ENOENT;
> > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > -     if (!dst_vma)
> > -             goto out_unlock;
> > -
> >       err = -EINVAL;
> >       /*
> >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > @@ -647,16 +766,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >           uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> >               goto out_unlock;
> >
> > -     /*
> > -      * Ensure the dst_vma has a anon_vma or this page
> > -      * would get a NULL anon_vma when moved in the
> > -      * dst_vma.
> > -      */
> > -     err = -ENOMEM;
> > -     if (!(dst_vma->vm_flags & VM_SHARED) &&
> > -         unlikely(anon_vma_prepare(dst_vma)))
> > -             goto out_unlock;
> > -
> >       while (src_addr < src_start + len) {
> >               pmd_t dst_pmdval;
> >
> > @@ -699,7 +808,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >                       void *kaddr;
> >
> >                       up_read(&ctx->map_changing_lock);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +                     unlock_vma(dst_vma);
> > +#else
> >                       mmap_read_unlock(dst_mm);
> > +#endif
> >                       BUG_ON(!folio);
> >
> >                       kaddr = kmap_local_folio(folio, 0);
> > @@ -730,7 +843,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >
> >  out_unlock:
> >       up_read(&ctx->map_changing_lock);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     unlock_vma(dst_vma);
> > +#else
> >       mmap_read_unlock(dst_mm);
> > +#endif
> >  out:
> >       if (folio)
> >               folio_put(folio);
> > @@ -1267,16 +1384,67 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >       if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
> >               return -EINVAL;
> >
> > -     /*
> > -      * Ensure the dst_vma has a anon_vma or this page
> > -      * would get a NULL anon_vma when moved in the
> > -      * dst_vma.
> > -      */
> > -     if (unlikely(anon_vma_prepare(dst_vma)))
> > -             return -ENOMEM;
> > +     return 0;
> > +}
> > +
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static int find_and_lock_vmas(struct mm_struct *mm,
> > +                           unsigned long dst_start,
> > +                           unsigned long src_start,
> > +                           struct vm_area_struct **dst_vmap,
> > +                           struct vm_area_struct **src_vmap)
> > +{
> > +     int err;
> > +
> > +     /* There is no need to prepare anon_vma for src_vma */
> > +     *src_vmap = lock_vma(mm, src_start, false);
> > +     if (!*src_vmap)
> > +             return -ENOENT;
> > +
> > +     /* Ensure anon_vma is assigned for anonymous vma */
> > +     *dst_vmap = lock_vma(mm, dst_start, true);
> > +     err = -ENOENT;
> > +     if (!*dst_vmap)
> > +             goto out_unlock;
> > +
> > +     err = -ENOMEM;
> > +     if (PTR_ERR(*dst_vmap) == -ENOMEM)
> > +             goto out_unlock;
>
> If you change lock_vma() to return the vma or ERR_PTR(-ENOENT) /
> ERR_PTR(-ENOMEM), then you could change this to check IS_ERR() and
> return the PTR_ERR().
>
> You could also use IS_ERR_OR_NULL here, but the first suggestion will
> simplify your life for find_and_lock_dst_vma() and the error type to
> return.

Good suggestion. I'll make the change. Thanks
>
> What you have here will work though.
>
> >
> >       return 0;
> > +out_unlock:
> > +     unlock_vma(*src_vmap);
> > +     return err;
> >  }
> > +#else
> > +static int lock_mm_and_find_vmas(struct mm_struct *mm,
> > +                              unsigned long dst_start,
> > +                              unsigned long src_start,
> > +                              struct vm_area_struct **dst_vmap,
> > +                              struct vm_area_struct **src_vmap)
> > +{
> > +     int err = -ENOENT;
>
> Nit: new line after declarations.
>
> > +     mmap_read_lock(mm);
> > +
> > +     *src_vmap = vma_lookup(mm, src_start);
> > +     if (!*src_vmap)
> > +             goto out_unlock;
> > +
> > +     *dst_vmap = vma_lookup(mm, dst_start);
> > +     if (!*dst_vmap)
> > +             goto out_unlock;
> > +
> > +     /* Ensure anon_vma is assigned */
> > +     err = -ENOMEM;
> > +     if (vma_is_anonymous(*dst_vmap) && anon_vma_prepare(*dst_vmap))
> > +             goto out_unlock;
> > +
> > +     return 0;
> > +out_unlock:
> > +     mmap_read_unlock(mm);
> > +     return err;
> > +}
> > +#endif
> >
> >  /**
> >   * move_pages - move arbitrary anonymous pages of an existing vma
> > @@ -1287,8 +1455,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * @len: length of the virtual memory range
> >   * @mode: flags from uffdio_move.mode
> >   *
> > - * Must be called with mmap_lock held for read.
> > - *
>
> Will either use the mmap_lock in read mode or per-vma locking ?

Makes sense. Will add it.
>
> >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> >   * copy. It only works on non shared anonymous pages because those can
> >   * be relocated without generating non linear anon_vmas in the rmap
> > @@ -1355,10 +1521,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> >   * could be obtained. This is the only additional complexity added to
> >   * the rmap code to provide this anonymous page remapping functionality.
> >   */
> > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > -                unsigned long dst_start, unsigned long src_start,
> > -                unsigned long len, __u64 mode)
> > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > +                unsigned long src_start, unsigned long len, __u64 mode)
> >  {
> > +     struct mm_struct *mm = ctx->mm;
>
> You dropped the argument, but left the comment for the argument.

Thanks, will fix it.
>
> >       struct vm_area_struct *src_vma, *dst_vma;
> >       unsigned long src_addr, dst_addr;
> >       pmd_t *src_pmd, *dst_pmd;
> > @@ -1376,28 +1542,40 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >           WARN_ON_ONCE(dst_start + len <= dst_start))
> >               goto out;
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     err = find_and_lock_vmas(mm, dst_start, src_start,
> > +                              &dst_vma, &src_vma);
> > +#else
> > +     err = lock_mm_and_find_vmas(mm, dst_start, src_start,
> > +                                 &dst_vma, &src_vma);
> > +#endif
>
> I was hoping you could hide this completely, but it's probably better to
> show what's going on and the function names document it well.

I wanted to hide the unlock as it's called several times, but then I
thought you wanted explicit calls to mmap_read_unlock(), so I didn't
hide it. If you are ok with it, can I define unlock_vma() for
!CONFIG_PER_VMA_LOCK as well, calling mmap_read_unlock()?
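
Something like the following sketch of that proposal (assuming the fallback can take the mm from vma->vm_mm):

#ifdef CONFIG_PER_VMA_LOCK
static void unlock_vma(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}
#else
static void unlock_vma(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

Note that move_pages() locks both vmas under a single mmap_read_lock() in the !CONFIG_PER_VMA_LOCK case, so that call site would still need to unlock only once there.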
>
> > +     if (err)
> > +             goto out;
> > +
> > +     /* Re-check after taking map_changing_lock */
> > +     down_read(&ctx->map_changing_lock);
> > +     if (likely(atomic_read(&ctx->mmap_changing))) {
> > +             err = -EAGAIN;
> > +             goto out_unlock;
> > +     }
> >       /*
> >        * Make sure the vma is not shared, that the src and dst remap
> >        * ranges are both valid and fully within a single existing
> >        * vma.
> >        */
> > -     src_vma = find_vma(mm, src_start);
> > -     if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > -     if (src_start < src_vma->vm_start ||
> > -         src_start + len > src_vma->vm_end)
> > -             goto out;
> > +     if (src_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> > +     if (src_start + len > src_vma->vm_end)
> > +             goto out_unlock;
> >
> > -     dst_vma = find_vma(mm, dst_start);
> > -     if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > -             goto out;
> > -     if (dst_start < dst_vma->vm_start ||
> > -         dst_start + len > dst_vma->vm_end)
> > -             goto out;
> > +     if (dst_vma->vm_flags & VM_SHARED)
> > +             goto out_unlock;
> > +     if (dst_start + len > dst_vma->vm_end)
> > +             goto out_unlock;
> >
> >       err = validate_move_areas(ctx, src_vma, dst_vma);
> >       if (err)
> > -             goto out;
> > +             goto out_unlock;
> >
> >       for (src_addr = src_start, dst_addr = dst_start;
> >            src_addr < src_start + len;) {
> > @@ -1514,6 +1692,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> >               moved += step_size;
> >       }
> >
> > +out_unlock:
> > +     up_read(&ctx->map_changing_lock);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     unlock_vma(dst_vma);
> > +     unlock_vma(src_vma);
> > +#else
> > +     mmap_read_unlock(mm);
> > +#endif
> >  out:
> >       VM_WARN_ON(moved < 0);
> >       VM_WARN_ON(err > 0);
> > --
> > 2.43.0.687.g38aa6559b0-goog
> >
Liam R. Howlett Feb. 9, 2024, 7:06 p.m. UTC | #3
* Lokesh Gidra <lokeshgidra@google.com> [240209 13:02]:
> On Thu, Feb 8, 2024 at 7:07 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240208 16:22]:
> > > All userfaultfd operations, except write-protect, opportunistically use
> > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > critical section.
> > >
> > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > vmas.
> > >
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > >  fs/userfaultfd.c              |  13 +-
> > >  include/linux/userfaultfd_k.h |   5 +-
> > >  mm/userfaultfd.c              | 356 ++++++++++++++++++++++++++--------
> > >  3 files changed, 275 insertions(+), 99 deletions(-)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index c00a021bcce4..60dcfafdc11a 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> > >               return -EINVAL;
> > >
> > >       if (mmget_not_zero(mm)) {
> > > -             mmap_read_lock(mm);
> > > -
> > > -             /* Re-check after taking map_changing_lock */
> > > -             down_read(&ctx->map_changing_lock);
> > > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > > -                                      uffdio_move.len, uffdio_move.mode);
> > > -             else
> > > -                     ret = -EAGAIN;
> > > -             up_read(&ctx->map_changing_lock);
> > > -             mmap_read_unlock(mm);
> > > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > > +                              uffdio_move.len, uffdio_move.mode);
> > >               mmput(mm);
> > >       } else {
> > >               return -ESRCH;
> > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > > index 3210c3552976..05d59f74fc88 100644
> > > --- a/include/linux/userfaultfd_k.h
> > > +++ b/include/linux/userfaultfd_k.h
> > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> > >  /* move_pages */
> > >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> > >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 flags);
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 flags);
> > >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> > >                       struct vm_area_struct *dst_vma,
> > >                       struct vm_area_struct *src_vma,
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 74aad0831e40..1e25768b2136 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -19,20 +19,12 @@
> > >  #include <asm/tlb.h>
> > >  #include "internal.h"
> > >
> > > -static __always_inline
> > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > -                                 unsigned long dst_start,
> > > -                                 unsigned long len)
> >
> > You could probably leave the __always_inline for this.
> 
> Sure
> >
> > > +static bool validate_dst_vma(struct vm_area_struct *dst_vma,
> > > +                          unsigned long dst_end)
> > >  {
> > > -     /*
> > > -      * Make sure that the dst range is both valid and fully within a
> > > -      * single existing vma.
> > > -      */
> > > -     struct vm_area_struct *dst_vma;
> > > -
> > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > -     if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > -             return NULL;
> > > +     /* Make sure that the dst range is fully within dst_vma. */
> > > +     if (dst_end > dst_vma->vm_end)
> > > +             return false;
> > >
> > >       /*
> > >        * Check the vma is registered in uffd, this is required to
> > > @@ -40,11 +32,125 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > >        * time.
> > >        */
> > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > -             return NULL;
> > > +             return false;
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +/*
> > > + * lock_vma() - Lookup and lock vma corresponding to @address.
> > > + * @mm: mm to search vma in.
> > > + * @address: address that the vma should contain.
> > > + * @prepare_anon: If true, then prepare the vma (if private) with anon_vma.
> > > + *
> > > + * Should be called without holding mmap_lock. vma should be unlocked after use
> > > + * with unlock_vma().
> > > + *
> > > + * Return: A locked vma containing @address, NULL if no vma is found, or
> > > + * -ENOMEM if anon_vma couldn't be allocated.
> > > + */
> > > +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > +                                    unsigned long address,
> > > +                                    bool prepare_anon)
> > > +{
> > > +     struct vm_area_struct *vma;
> > > +
> > > +     vma = lock_vma_under_rcu(mm, address);
> > > +     if (vma) {
> > > +             /*
> > > +              * lock_vma_under_rcu() only checks anon_vma for private
> > > +              * anonymous mappings. But we need to ensure it is assigned in
> > > +              * private file-backed vmas as well.
> > > +              */
> > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > +                 !vma->anon_vma)
> > > +                     vma_end_read(vma);
> > > +             else
> > > +                     return vma;
> > > +     }
> > > +
> > > +     mmap_read_lock(mm);
> > > +     vma = vma_lookup(mm, address);
> > > +     if (vma) {
> > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > +                 anon_vma_prepare(vma)) {
> > > +                     vma = ERR_PTR(-ENOMEM);
> > > +             } else {
> > > +                     /*
> > > +                      * We cannot use vma_start_read() as it may fail due to
> > > +                      * false locked (see comment in vma_start_read()). We
> > > +                      * can avoid that by directly locking vm_lock under
> > > +                      * mmap_lock, which guarantees that nobody can lock the
> > > +                      * vma for write (vma_start_write()) under us.
> > > +                      */
> > > +                     down_read(&vma->vm_lock->lock);
> > > +             }
> > > +     }
> > > +
> > > +     mmap_read_unlock(mm);
> > > +     return vma;
> > > +}
> > > +
> > > +static void unlock_vma(struct vm_area_struct *vma)
> > > +{
> > > +     vma_end_read(vma);
> > > +}
> > > +
> > > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
> > > +                                                 unsigned long dst_start,
> > > +                                                 unsigned long len)
> > > +{
> > > +     struct vm_area_struct *dst_vma;
> > > +
> > > +     /* Ensure anon_vma is assigned for private vmas */
> > > +     dst_vma = lock_vma(dst_mm, dst_start, true);
> > > +
> > > +     if (!dst_vma)
> > > +             return ERR_PTR(-ENOENT);
> > > +
> > > +     if (PTR_ERR(dst_vma) == -ENOMEM)
> > > +             return dst_vma;
> > > +
> > > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > > +             goto out_unlock;
> > >
> > >       return dst_vma;
> > > +out_unlock:
> > > +     unlock_vma(dst_vma);
> > > +     return ERR_PTR(-ENOENT);
> > >  }
> > >
> > > +#else
> > > +
> > > +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> > > +                                                    unsigned long dst_start,
> > > +                                                    unsigned long len)
> > > +{
> > > +     struct vm_area_struct *dst_vma;
> > > +     int err = -ENOENT;
> > > +
> > > +     mmap_read_lock(dst_mm);
> > > +     dst_vma = vma_lookup(dst_mm, dst_start);
> > > +     if (!dst_vma)
> > > +             goto out_unlock;
> > > +
> > > +     /* Ensure anon_vma is assigned for private vmas */
> > > +     if (!(dst_vma->vm_flags & VM_SHARED) && anon_vma_prepare(dst_vma)) {
> > > +             err = -ENOMEM;
> > > +             goto out_unlock;
> > > +     }
> > > +
> > > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > > +             goto out_unlock;
> > > +
> > > +     return dst_vma;
> > > +out_unlock:
> > > +     mmap_read_unlock(dst_mm);
> > > +     return ERR_PTR(err);
> > > +}
> > > +#endif
> > > +
> > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > >  static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
> > >                                unsigned long dst_addr)
> > > @@ -350,7 +456,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > >  #ifdef CONFIG_HUGETLB_PAGE
> > >  /*
> > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > + * before returning.
> > >   */
> > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             struct userfaultfd_ctx *ctx,
> > > @@ -361,7 +468,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                                             uffd_flags_t flags)
> > >  {
> > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > -     int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > >       ssize_t err;
> > >       pte_t *dst_pte;
> > >       unsigned long src_addr, dst_addr;
> > > @@ -380,7 +486,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        */
> > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > >               up_read(&ctx->map_changing_lock);
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +             unlock_vma(dst_vma);
> > > +#else
> > >               mmap_read_unlock(dst_mm);
> > > +#endif
> > >               return -EINVAL;
> > >       }
> > >
> > > @@ -403,24 +513,32 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >        * retry, dst_vma will be set to NULL and we must lookup again.
> > >        */
> > >       if (!dst_vma) {
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +             dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > > +#else
> > > +             dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > > +#endif
> > > +             if (IS_ERR(dst_vma)) {
> > > +                     err = PTR_ERR(dst_vma);
> > > +                     goto out;
> > > +             }
> > > +
> > >               err = -ENOENT;
> > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > -                     goto out_unlock;
> > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > +                     goto out_unlock_vma;
> > >
> > >               err = -EINVAL;
> > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > -                     goto out_unlock;
> > > -
> > > -             vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > -     }
> > > +                     goto out_unlock_vma;
> > >
> > > -     /*
> > > -      * If not shared, ensure the dst_vma has a anon_vma.
> > > -      */
> > > -     err = -ENOMEM;
> > > -     if (!vm_shared) {
> > > -             if (unlikely(anon_vma_prepare(dst_vma)))
> > > +             /*
> > > +              * If memory mappings are changing because of non-cooperative
> > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > +              * request the user to retry later
> > > +              */
> > > +             down_read(&ctx->map_changing_lock);
> > > +             err = -EAGAIN;
> > > +             if (atomic_read(&ctx->mmap_changing))
> > >                       goto out_unlock;
> > >       }
> > >
> > > @@ -465,7 +583,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >               if (unlikely(err == -ENOENT)) {
> > >                       up_read(&ctx->map_changing_lock);
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +                     unlock_vma(dst_vma);
> > > +#else
> > >                       mmap_read_unlock(dst_mm);
> > > +#endif
> > >                       BUG_ON(!folio);
> > >
> > >                       err = copy_folio_from_user(folio,
> > > @@ -474,17 +596,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >                               err = -EFAULT;
> > >                               goto out;
> > >                       }
> > > -                     mmap_read_lock(dst_mm);
> > > -                     down_read(&ctx->map_changing_lock);
> > > -                     /*
> > > -                      * If memory mappings are changing because of non-cooperative
> > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > -                      * request the user to retry later
> > > -                      */
> > > -                     if (atomic_read(&ctx->mmap_changing)) {
> > > -                             err = -EAGAIN;
> > > -                             break;
> > > -                     }
> > >
> > >                       dst_vma = NULL;
> > >                       goto retry;
> > > @@ -505,7 +616,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > +out_unlock_vma:
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +     unlock_vma(dst_vma);
> > > +#else
> > >       mmap_read_unlock(dst_mm);
> > > +#endif
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -597,7 +713,19 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       copied = 0;
> > >       folio = NULL;
> > >  retry:
> > > -     mmap_read_lock(dst_mm);
> > > +     /*
> > > +      * Make sure the vma is not shared, that the dst range is
> > > +      * both valid and fully within a single existing vma.
> > > +      */
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +     dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > > +#else
> > > +     dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > > +#endif
> > > +     if (IS_ERR(dst_vma)) {
> > > +             err = PTR_ERR(dst_vma);
> > > +             goto out;
> > > +     }
> > >
> > >       /*
> > >        * If memory mappings are changing because of non-cooperative
> > > @@ -609,15 +737,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >       if (atomic_read(&ctx->mmap_changing))
> > >               goto out_unlock;
> > >
> > > -     /*
> > > -      * Make sure the vma is not shared, that the dst range is
> > > -      * both valid and fully within a single existing vma.
> > > -      */
> > > -     err = -ENOENT;
> > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > -     if (!dst_vma)
> > > -             goto out_unlock;
> > > -
> > >       err = -EINVAL;
> > >       /*
> > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > @@ -647,16 +766,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >           uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > >               goto out_unlock;
> > >
> > > -     /*
> > > -      * Ensure the dst_vma has a anon_vma or this page
> > > -      * would get a NULL anon_vma when moved in the
> > > -      * dst_vma.
> > > -      */
> > > -     err = -ENOMEM;
> > > -     if (!(dst_vma->vm_flags & VM_SHARED) &&
> > > -         unlikely(anon_vma_prepare(dst_vma)))
> > > -             goto out_unlock;
> > > -
> > >       while (src_addr < src_start + len) {
> > >               pmd_t dst_pmdval;
> > >
> > > @@ -699,7 +808,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >                       void *kaddr;
> > >
> > >                       up_read(&ctx->map_changing_lock);
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +                     unlock_vma(dst_vma);
> > > +#else
> > >                       mmap_read_unlock(dst_mm);
> > > +#endif
> > >                       BUG_ON(!folio);
> > >
> > >                       kaddr = kmap_local_folio(folio, 0);
> > > @@ -730,7 +843,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >
> > >  out_unlock:
> > >       up_read(&ctx->map_changing_lock);
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +     unlock_vma(dst_vma);
> > > +#else
> > >       mmap_read_unlock(dst_mm);
> > > +#endif
> > >  out:
> > >       if (folio)
> > >               folio_put(folio);
> > > @@ -1267,16 +1384,67 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >       if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
> > >               return -EINVAL;
> > >
> > > -     /*
> > > -      * Ensure the dst_vma has a anon_vma or this page
> > > -      * would get a NULL anon_vma when moved in the
> > > -      * dst_vma.
> > > -      */
> > > -     if (unlikely(anon_vma_prepare(dst_vma)))
> > > -             return -ENOMEM;
> > > +     return 0;
> > > +}
> > > +
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +static int find_and_lock_vmas(struct mm_struct *mm,
> > > +                           unsigned long dst_start,
> > > +                           unsigned long src_start,
> > > +                           struct vm_area_struct **dst_vmap,
> > > +                           struct vm_area_struct **src_vmap)
> > > +{
> > > +     int err;
> > > +
> > > +     /* There is no need to prepare anon_vma for src_vma */
> > > +     *src_vmap = lock_vma(mm, src_start, false);
> > > +     if (!*src_vmap)
> > > +             return -ENOENT;
> > > +
> > > +     /* Ensure anon_vma is assigned for anonymous vma */
> > > +     *dst_vmap = lock_vma(mm, dst_start, true);
> > > +     err = -ENOENT;
> > > +     if (!*dst_vmap)
> > > +             goto out_unlock;
> > > +
> > > +     err = -ENOMEM;
> > > +     if (PTR_ERR(*dst_vmap) == -ENOMEM)
> > > +             goto out_unlock;
> >
> > If you change lock_vma() to return the vma or ERR_PTR(-ENOENT) /
> > ERR_PTR(-ENOMEM), then you could change this to check IS_ERR() and
> > return the PTR_ERR().
> >
> > You could also use IS_ERR_OR_NULL here, but the first suggestion will
> > simplify your life for find_and_lock_dst_vma() and for deciding which
> > error type to return.
> 
> Good suggestion. I'll make the change. Thanks
> >
> > What you have here will work though.
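
To make that concrete, a rough sketch of the ERR_PTR() variant (untested,
names taken from the patch above).  lock_vma()'s slow path would return
ERR_PTR(-ENOENT) instead of NULL:

	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (!vma) {
		vma = ERR_PTR(-ENOENT);
	} else if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
		   anon_vma_prepare(vma)) {
		vma = ERR_PTR(-ENOMEM);
	} else {
		/* See the comment above about not using vma_start_read() */
		down_read(&vma->vm_lock->lock);
	}
	mmap_read_unlock(mm);
	return vma;

and find_and_lock_vmas() then reduces to a pair of IS_ERR() checks:

	*src_vmap = lock_vma(mm, src_start, false);
	if (IS_ERR(*src_vmap))
		return PTR_ERR(*src_vmap);

	*dst_vmap = lock_vma(mm, dst_start, true);
	if (IS_ERR(*dst_vmap)) {
		/* drop the src vma lock taken above */
		unlock_vma(*src_vmap);
		return PTR_ERR(*dst_vmap);
	}

	return 0;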
> >
> > >
> > >       return 0;
> > > +out_unlock:
> > > +     unlock_vma(*src_vmap);
> > > +     return err;
> > >  }
> > > +#else
> > > +static int lock_mm_and_find_vmas(struct mm_struct *mm,
> > > +                              unsigned long dst_start,
> > > +                              unsigned long src_start,
> > > +                              struct vm_area_struct **dst_vmap,
> > > +                              struct vm_area_struct **src_vmap)
> > > +{
> > > +     int err = -ENOENT;
> >
> > Nit: new line after declarations.
> >
> > > +     mmap_read_lock(mm);
> > > +
> > > +     *src_vmap = vma_lookup(mm, src_start);
> > > +     if (!*src_vmap)
> > > +             goto out_unlock;
> > > +
> > > +     *dst_vmap = vma_lookup(mm, dst_start);
> > > +     if (!*dst_vmap)
> > > +             goto out_unlock;
> > > +
> > > +     /* Ensure anon_vma is assigned */
> > > +     err = -ENOMEM;
> > > +     if (vma_is_anonymous(*dst_vmap) && anon_vma_prepare(*dst_vmap))
> > > +             goto out_unlock;
> > > +
> > > +     return 0;
> > > +out_unlock:
> > > +     mmap_read_unlock(mm);
> > > +     return err;
> > > +}
> > > +#endif
> > >
> > >  /**
> > >   * move_pages - move arbitrary anonymous pages of an existing vma
> > > @@ -1287,8 +1455,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * @len: length of the virtual memory range
> > >   * @mode: flags from uffdio_move.mode
> > >   *
> > > - * Must be called with mmap_lock held for read.
> > > - *
> >
> > Will either use the mmap_lock in read mode or per-vma locking?
> 
> Makes sense. Will add it.
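
e.g. something along these lines (wording just a suggestion):

 * Will either take the mmap_lock in read mode or per-vma locks on the
 * src/dst vmas, and will release whichever lock was taken before returning.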
> >
> > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > >   * copy. It only works on non shared anonymous pages because those can
> > >   * be relocated without generating non linear anon_vmas in the rmap
> > > @@ -1355,10 +1521,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > >   * could be obtained. This is the only additional complexity added to
> > >   * the rmap code to provide this anonymous page remapping functionality.
> > >   */
> > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > -                unsigned long dst_start, unsigned long src_start,
> > > -                unsigned long len, __u64 mode)
> > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > >  {
> > > +     struct mm_struct *mm = ctx->mm;
> >
> > You dropped the argument, but left the comment for the argument.
> 
> Thanks, will fix it.
> >
> > >       struct vm_area_struct *src_vma, *dst_vma;
> > >       unsigned long src_addr, dst_addr;
> > >       pmd_t *src_pmd, *dst_pmd;
> > > @@ -1376,28 +1542,40 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > >               goto out;
> > >
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +     err = find_and_lock_vmas(mm, dst_start, src_start,
> > > +                              &dst_vma, &src_vma);
> > > +#else
> > > +     err = lock_mm_and_find_vmas(mm, dst_start, src_start,
> > > +                                 &dst_vma, &src_vma);
> > > +#endif
> >
> > I was hoping you could hide this completely, but it's probably better to
> > show what's going on and the function names document it well.
> 
> I wanted to hide unlock as it's called several times, but then I
> thought you wanted explicit calls to mmap_read_unlock() so didn't hide
> it. If you are ok can I define unlock_vma() for !CONFIG_PER_VMA_LOCK
> as well, calling mmap_read_unlock()?

My bigger problem was with the name.  We can't have unlock_vma()
just unlock the mm - it is confusing to read and I think it'll lead to
misunderstandings of what is really going on here.

Whatever you decide to do is fine as long as it's clear what's going on.
I think this is clear while hiding it could also be clear with the right
function name - I'm not sure what that would be; naming is hard.

Thanks,
Liam
Lokesh Gidra Feb. 9, 2024, 7:21 p.m. UTC | #4
On Fri, Feb 9, 2024 at 11:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240209 13:02]:
> > On Thu, Feb 8, 2024 at 7:07 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > I was hoping you could hide this completely, but it's probably better to
> > > show what's going on and the function names document it well.
> >
> > I wanted to hide unlock as it's called several times, but then I
> > thought you wanted explicit calls to mmap_read_unlock() so didn't hide
> > it. If you are ok can I define unlock_vma() for !CONFIG_PER_VMA_LOCK
> > as well, calling mmap_read_unlock()?
>
> My bigger problem was with the name.  We can't have unlock_vma()
> just unlock the mm - it is confusing to read and I think it'll lead to
> misunderstandings of what is really going on here.
>
> Whatever you decide to do is fine as long as it's clear what's going on.
> I think this is clear while hiding it could also be clear with the right
> function name - I'm not sure what that would be; naming is hard.

Maybe unlock_mm_or_vma()? If not, then I'll just keep it as is.

Naming is indeed hard!
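
Roughly, such a helper could look like this (sketch only; the mm argument
would simply be unused in the CONFIG_PER_VMA_LOCK case):

#ifdef CONFIG_PER_VMA_LOCK
static void unlock_mm_or_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	/* drops the per-vma lock taken by lock_vma()/find_and_lock_dst_vma() */
	vma_end_read(vma);
}
#else
static void unlock_mm_or_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	/* without per-vma locks the vma is only stable under mmap_lock */
	mmap_read_unlock(mm);
}
#endif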

>
> Thanks,
> Liam
>
Liam R. Howlett Feb. 9, 2024, 7:31 p.m. UTC | #5
* Lokesh Gidra <lokeshgidra@google.com> [240209 14:21]:
> On Fri, Feb 9, 2024 at 11:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240209 13:02]:
> > > On Thu, Feb 8, 2024 at 7:07 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > I was hoping you could hide this completely, but it's probably better to
> > > > show what's going on and the function names document it well.
> > >
> > > I wanted to hide unlock as it's called several times, but then I
> > > thought you wanted explicit calls to mmap_read_unlock() so didn't hide
> > > it. If you are ok can I define unlock_vma() for !CONFIG_PER_VMA_LOCK
> > > as well, calling mmap_read_unlock()?
> >
> > My bigger problem was with the name.  We can't have unlock_vma()
> > just unlock the mm - it is confusing to read and I think it'll lead to
> > misunderstandings of what is really going on here.
> >
> > Whatever you decide to do is fine as long as it's clear what's going on.
> > I think this is clear while hiding it could also be clear with the right
> > function name - I'm not sure what that would be; naming is hard.
> 
> Maybe unlock_mm_or_vma() ? If not then I'll just keep it as is.
> 

Maybe just leave it as it is unless someone else has issue with it.
Using some form of uffd_unlock name runs into the question of that
atomic and the new lock.
Lokesh Gidra Feb. 9, 2024, 8:58 p.m. UTC | #6
On Fri, Feb 9, 2024 at 11:31 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240209 14:21]:
> > On Fri, Feb 9, 2024 at 11:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240209 13:02]:
> > > > On Thu, Feb 8, 2024 at 7:07 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > >
> > > > > * Lokesh Gidra <lokeshgidra@google.com> [240208 16:22]:
> > > > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > > > critical section.
> > > > > >
> > > > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > > > vmas.
> > > > > >
> > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > > ---
> > > > > >  fs/userfaultfd.c              |  13 +-
> > > > > >  include/linux/userfaultfd_k.h |   5 +-
> > > > > >  mm/userfaultfd.c              | 356 ++++++++++++++++++++++++++--------
> > > > > >  3 files changed, 275 insertions(+), 99 deletions(-)
> > > > > >
> > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > > > index c00a021bcce4..60dcfafdc11a 100644
> > > > > > --- a/fs/userfaultfd.c
> > > > > > +++ b/fs/userfaultfd.c
> > > > > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
> > > > > >               return -EINVAL;
> > > > > >
> > > > > >       if (mmget_not_zero(mm)) {
> > > > > > -             mmap_read_lock(mm);
> > > > > > -
> > > > > > -             /* Re-check after taking map_changing_lock */
> > > > > > -             down_read(&ctx->map_changing_lock);
> > > > > > -             if (likely(!atomic_read(&ctx->mmap_changing)))
> > > > > > -                     ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
> > > > > > -                                      uffdio_move.len, uffdio_move.mode);
> > > > > > -             else
> > > > > > -                     ret = -EAGAIN;
> > > > > > -             up_read(&ctx->map_changing_lock);
> > > > > > -             mmap_read_unlock(mm);
> > > > > > +             ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
> > > > > > +                              uffdio_move.len, uffdio_move.mode);
> > > > > >               mmput(mm);
> > > > > >       } else {
> > > > > >               return -ESRCH;
> > > > > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > > > > > index 3210c3552976..05d59f74fc88 100644
> > > > > > --- a/include/linux/userfaultfd_k.h
> > > > > > +++ b/include/linux/userfaultfd_k.h
> > > > > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
> > > > > >  /* move_pages */
> > > > > >  void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> > > > > >  void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> > > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > > -                unsigned long len, __u64 flags);
> > > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > > +                unsigned long src_start, unsigned long len, __u64 flags);
> > > > > >  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> > > > > >                       struct vm_area_struct *dst_vma,
> > > > > >                       struct vm_area_struct *src_vma,
> > > > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > > > > index 74aad0831e40..1e25768b2136 100644
> > > > > > --- a/mm/userfaultfd.c
> > > > > > +++ b/mm/userfaultfd.c
> > > > > > @@ -19,20 +19,12 @@
> > > > > >  #include <asm/tlb.h>
> > > > > >  #include "internal.h"
> > > > > >
> > > > > > -static __always_inline
> > > > > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > > > -                                 unsigned long dst_start,
> > > > > > -                                 unsigned long len)
> > > > >
> > > > > You could probably leave the __always_inline for this.
> > > >
> > > > Sure
> > > > >
> > > > > > +static bool validate_dst_vma(struct vm_area_struct *dst_vma,
> > > > > > +                          unsigned long dst_end)
> > > > > >  {
> > > > > > -     /*
> > > > > > -      * Make sure that the dst range is both valid and fully within a
> > > > > > -      * single existing vma.
> > > > > > -      */
> > > > > > -     struct vm_area_struct *dst_vma;
> > > > > > -
> > > > > > -     dst_vma = find_vma(dst_mm, dst_start);
> > > > > > -     if (!range_in_vma(dst_vma, dst_start, dst_start + len))
> > > > > > -             return NULL;
> > > > > > +     /* Make sure that the dst range is fully within dst_vma. */
> > > > > > +     if (dst_end > dst_vma->vm_end)
> > > > > > +             return false;
> > > > > >
> > > > > >       /*
> > > > > >        * Check the vma is registered in uffd, this is required to
> > > > > > @@ -40,11 +32,125 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
> > > > > >        * time.
> > > > > >        */
> > > > > >       if (!dst_vma->vm_userfaultfd_ctx.ctx)
> > > > > > -             return NULL;
> > > > > > +             return false;
> > > > > > +
> > > > > > +     return true;
> > > > > > +}
> > > > > > +
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +/*
> > > > > > + * lock_vma() - Lookup and lock vma corresponding to @address.
> > > > > > + * @mm: mm to search vma in.
> > > > > > + * @address: address that the vma should contain.
> > > > > > + * @prepare_anon: If true, then prepare the vma (if private) with anon_vma.
> > > > > > + *
> > > > > > + * Should be called without holding mmap_lock. vma should be unlocked after use
> > > > > > + * with unlock_vma().
> > > > > > + *
> > > > > > + * Return: A locked vma containing @address, NULL if no vma is found, or
> > > > > > + * -ENOMEM if anon_vma couldn't be allocated.
> > > > > > + */
> > > > > > +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > > > > +                                    unsigned long address,
> > > > > > +                                    bool prepare_anon)
> > > > > > +{
> > > > > > +     struct vm_area_struct *vma;
> > > > > > +
> > > > > > +     vma = lock_vma_under_rcu(mm, address);
> > > > > > +     if (vma) {
> > > > > > +             /*
> > > > > > +              * lock_vma_under_rcu() only checks anon_vma for private
> > > > > > +              * anonymous mappings. But we need to ensure it is assigned in
> > > > > > +              * private file-backed vmas as well.
> > > > > > +              */
> > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > +                 !vma->anon_vma)
> > > > > > +                     vma_end_read(vma);
> > > > > > +             else
> > > > > > +                     return vma;
> > > > > > +     }
> > > > > > +
> > > > > > +     mmap_read_lock(mm);
> > > > > > +     vma = vma_lookup(mm, address);
> > > > > > +     if (vma) {
> > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > +                 anon_vma_prepare(vma)) {
> > > > > > +                     vma = ERR_PTR(-ENOMEM);
> > > > > > +             } else {
> > > > > > +                     /*
> > > > > > +                      * We cannot use vma_start_read() as it may fail due to
> > > > > > +                      * false locked (see comment in vma_start_read()). We
> > > > > > +                      * can avoid that by directly locking vm_lock under
> > > > > > +                      * mmap_lock, which guarantees that nobody can lock the
> > > > > > +                      * vma for write (vma_start_write()) under us.
> > > > > > +                      */
> > > > > > +                     down_read(&vma->vm_lock->lock);
> > > > > > +             }
> > > > > > +     }
> > > > > > +
> > > > > > +     mmap_read_unlock(mm);
> > > > > > +     return vma;
> > > > > > +}
> > > > > > +
> > > > > > +static void unlock_vma(struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +     vma_end_read(vma);
> > > > > > +}
> > > > > > +
> > > > > > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
> > > > > > +                                                 unsigned long dst_start,
> > > > > > +                                                 unsigned long len)
> > > > > > +{
> > > > > > +     struct vm_area_struct *dst_vma;
> > > > > > +
> > > > > > +     /* Ensure anon_vma is assigned for private vmas */
> > > > > > +     dst_vma = lock_vma(dst_mm, dst_start, true);
> > > > > > +
> > > > > > +     if (!dst_vma)
> > > > > > +             return ERR_PTR(-ENOENT);
> > > > > > +
> > > > > > +     if (PTR_ERR(dst_vma) == -ENOMEM)
> > > > > > +             return dst_vma;
> > > > > > +
> > > > > > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > > > > > +             goto out_unlock;
> > > > > >
> > > > > >       return dst_vma;
> > > > > > +out_unlock:
> > > > > > +     unlock_vma(dst_vma);
> > > > > > +     return ERR_PTR(-ENOENT);
> > > > > >  }
> > > > > >
> > > > > > +#else
> > > > > > +
> > > > > > +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> > > > > > +                                                    unsigned long dst_start,
> > > > > > +                                                    unsigned long len)
> > > > > > +{
> > > > > > +     struct vm_area_struct *dst_vma;
> > > > > > +     int err = -ENOENT;
> > > > > > +
> > > > > > +     mmap_read_lock(dst_mm);
> > > > > > +     dst_vma = vma_lookup(dst_mm, dst_start);
> > > > > > +     if (!dst_vma)
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     /* Ensure anon_vma is assigned for private vmas */
> > > > > > +     if (!(dst_vma->vm_flags & VM_SHARED) && anon_vma_prepare(dst_vma)) {
> > > > > > +             err = -ENOMEM;
> > > > > > +             goto out_unlock;
> > > > > > +     }
> > > > > > +
> > > > > > +     if (!validate_dst_vma(dst_vma, dst_start + len))
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     return dst_vma;
> > > > > > +out_unlock:
> > > > > > +     mmap_read_unlock(dst_mm);
> > > > > > +     return ERR_PTR(err);
> > > > > > +}
> > > > > > +#endif
> > > > > > +
> > > > > >  /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
> > > > > >  static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
> > > > > >                                unsigned long dst_addr)
> > > > > > @@ -350,7 +456,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > > > > >  #ifdef CONFIG_HUGETLB_PAGE
> > > > > >  /*
> > > > > >   * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
> > > > > > - * called with mmap_lock held, it will release mmap_lock before returning.
> > > > > > + * called with either vma-lock or mmap_lock held, it will release the lock
> > > > > > + * before returning.
> > > > > >   */
> > > > > >  static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >                                             struct userfaultfd_ctx *ctx,
> > > > > > @@ -361,7 +468,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >                                             uffd_flags_t flags)
> > > > > >  {
> > > > > >       struct mm_struct *dst_mm = dst_vma->vm_mm;
> > > > > > -     int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > >       ssize_t err;
> > > > > >       pte_t *dst_pte;
> > > > > >       unsigned long src_addr, dst_addr;
> > > > > > @@ -380,7 +486,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >        */
> > > > > >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > > > > >               up_read(&ctx->map_changing_lock);
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +             unlock_vma(dst_vma);
> > > > > > +#else
> > > > > >               mmap_read_unlock(dst_mm);
> > > > > > +#endif
> > > > > >               return -EINVAL;
> > > > > >       }
> > > > > >
> > > > > > @@ -403,24 +513,32 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >        * retry, dst_vma will be set to NULL and we must lookup again.
> > > > > >        */
> > > > > >       if (!dst_vma) {
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +             dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > > > > > +#else
> > > > > > +             dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > > > > > +#endif
> > > > > > +             if (IS_ERR(dst_vma)) {
> > > > > > +                     err = PTR_ERR(dst_vma);
> > > > > > +                     goto out;
> > > > > > +             }
> > > > > > +
> > > > > >               err = -ENOENT;
> > > > > > -             dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > > -             if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
> > > > > > -                     goto out_unlock;
> > > > > > +             if (!is_vm_hugetlb_page(dst_vma))
> > > > > > +                     goto out_unlock_vma;
> > > > > >
> > > > > >               err = -EINVAL;
> > > > > >               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
> > > > > > -                     goto out_unlock;
> > > > > > -
> > > > > > -             vm_shared = dst_vma->vm_flags & VM_SHARED;
> > > > > > -     }
> > > > > > +                     goto out_unlock_vma;
> > > > > >
> > > > > > -     /*
> > > > > > -      * If not shared, ensure the dst_vma has a anon_vma.
> > > > > > -      */
> > > > > > -     err = -ENOMEM;
> > > > > > -     if (!vm_shared) {
> > > > > > -             if (unlikely(anon_vma_prepare(dst_vma)))
> > > > > > +             /*
> > > > > > +              * If memory mappings are changing because of non-cooperative
> > > > > > +              * operation (e.g. mremap) running in parallel, bail out and
> > > > > > +              * request the user to retry later
> > > > > > +              */
> > > > > > +             down_read(&ctx->map_changing_lock);
> > > > > > +             err = -EAGAIN;
> > > > > > +             if (atomic_read(&ctx->mmap_changing))
> > > > > >                       goto out_unlock;
> > > > > >       }
> > > > > >
> > > > > > @@ -465,7 +583,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >
> > > > > >               if (unlikely(err == -ENOENT)) {
> > > > > >                       up_read(&ctx->map_changing_lock);
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +                     unlock_vma(dst_vma);
> > > > > > +#else
> > > > > >                       mmap_read_unlock(dst_mm);
> > > > > > +#endif
> > > > > >                       BUG_ON(!folio);
> > > > > >
> > > > > >                       err = copy_folio_from_user(folio,
> > > > > > @@ -474,17 +596,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >                               err = -EFAULT;
> > > > > >                               goto out;
> > > > > >                       }
> > > > > > -                     mmap_read_lock(dst_mm);
> > > > > > -                     down_read(&ctx->map_changing_lock);
> > > > > > -                     /*
> > > > > > -                      * If memory mappings are changing because of non-cooperative
> > > > > > -                      * operation (e.g. mremap) running in parallel, bail out and
> > > > > > -                      * request the user to retry later
> > > > > > -                      */
> > > > > > -                     if (atomic_read(&ctx->mmap_changing)) {
> > > > > > -                             err = -EAGAIN;
> > > > > > -                             break;
> > > > > > -                     }
> > > > > >
> > > > > >                       dst_vma = NULL;
> > > > > >                       goto retry;
> > > > > > @@ -505,7 +616,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > > > > >
> > > > > >  out_unlock:
> > > > > >       up_read(&ctx->map_changing_lock);
> > > > > > +out_unlock_vma:
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +     unlock_vma(dst_vma);
> > > > > > +#else
> > > > > >       mmap_read_unlock(dst_mm);
> > > > > > +#endif
> > > > > >  out:
> > > > > >       if (folio)
> > > > > >               folio_put(folio);
> > > > > > @@ -597,7 +713,19 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >       copied = 0;
> > > > > >       folio = NULL;
> > > > > >  retry:
> > > > > > -     mmap_read_lock(dst_mm);
> > > > > > +     /*
> > > > > > +      * Make sure the vma is not shared, that the dst range is
> > > > > > +      * both valid and fully within a single existing vma.
> > > > > > +      */
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +     dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
> > > > > > +#else
> > > > > > +     dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
> > > > > > +#endif
> > > > > > +     if (IS_ERR(dst_vma)) {
> > > > > > +             err = PTR_ERR(dst_vma);
> > > > > > +             goto out;
> > > > > > +     }
> > > > > >
> > > > > >       /*
> > > > > >        * If memory mappings are changing because of non-cooperative
> > > > > > @@ -609,15 +737,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >       if (atomic_read(&ctx->mmap_changing))
> > > > > >               goto out_unlock;
> > > > > >
> > > > > > -     /*
> > > > > > -      * Make sure the vma is not shared, that the dst range is
> > > > > > -      * both valid and fully within a single existing vma.
> > > > > > -      */
> > > > > > -     err = -ENOENT;
> > > > > > -     dst_vma = find_dst_vma(dst_mm, dst_start, len);
> > > > > > -     if (!dst_vma)
> > > > > > -             goto out_unlock;
> > > > > > -
> > > > > >       err = -EINVAL;
> > > > > >       /*
> > > > > >        * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
> > > > > > @@ -647,16 +766,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >           uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > > > > >               goto out_unlock;
> > > > > >
> > > > > > -     /*
> > > > > > -      * Ensure the dst_vma has a anon_vma or this page
> > > > > > -      * would get a NULL anon_vma when moved in the
> > > > > > -      * dst_vma.
> > > > > > -      */
> > > > > > -     err = -ENOMEM;
> > > > > > -     if (!(dst_vma->vm_flags & VM_SHARED) &&
> > > > > > -         unlikely(anon_vma_prepare(dst_vma)))
> > > > > > -             goto out_unlock;
> > > > > > -
> > > > > >       while (src_addr < src_start + len) {
> > > > > >               pmd_t dst_pmdval;
> > > > > >
> > > > > > @@ -699,7 +808,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >                       void *kaddr;
> > > > > >
> > > > > >                       up_read(&ctx->map_changing_lock);
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +                     unlock_vma(dst_vma);
> > > > > > +#else
> > > > > >                       mmap_read_unlock(dst_mm);
> > > > > > +#endif
> > > > > >                       BUG_ON(!folio);
> > > > > >
> > > > > >                       kaddr = kmap_local_folio(folio, 0);
> > > > > > @@ -730,7 +843,11 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > > > >
> > > > > >  out_unlock:
> > > > > >       up_read(&ctx->map_changing_lock);
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +     unlock_vma(dst_vma);
> > > > > > +#else
> > > > > >       mmap_read_unlock(dst_mm);
> > > > > > +#endif
> > > > > >  out:
> > > > > >       if (folio)
> > > > > >               folio_put(folio);
> > > > > > @@ -1267,16 +1384,67 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >       if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
> > > > > >               return -EINVAL;
> > > > > >
> > > > > > -     /*
> > > > > > -      * Ensure the dst_vma has a anon_vma or this page
> > > > > > -      * would get a NULL anon_vma when moved in the
> > > > > > -      * dst_vma.
> > > > > > -      */
> > > > > > -     if (unlikely(anon_vma_prepare(dst_vma)))
> > > > > > -             return -ENOMEM;
> > > > > > +     return 0;
> > > > > > +}
> > > > > > +
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +static int find_and_lock_vmas(struct mm_struct *mm,
> > > > > > +                           unsigned long dst_start,
> > > > > > +                           unsigned long src_start,
> > > > > > +                           struct vm_area_struct **dst_vmap,
> > > > > > +                           struct vm_area_struct **src_vmap)
> > > > > > +{
> > > > > > +     int err;
> > > > > > +
> > > > > > +     /* There is no need to prepare anon_vma for src_vma */
> > > > > > +     *src_vmap = lock_vma(mm, src_start, false);
> > > > > > +     if (!*src_vmap)
> > > > > > +             return -ENOENT;
> > > > > > +
> > > > > > +     /* Ensure anon_vma is assigned for anonymous vma */
> > > > > > +     *dst_vmap = lock_vma(mm, dst_start, true);
> > > > > > +     err = -ENOENT;
> > > > > > +     if (!*dst_vmap)
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     err = -ENOMEM;
> > > > > > +     if (PTR_ERR(*dst_vmap) == -ENOMEM)
> > > > > > +             goto out_unlock;
> > > > >
> > > > > If you change lock_vma() to return the vma or ERR_PTR(-ENOENT) /
> > > > > ERR_PTR(-ENOMEM), then you could change this to check IS_ERR() and
> > > > > return the PTR_ERR().
> > > > >
> > > > > You could also use IS_ERR_OR_NULL here, but the first suggestion will
> > > > > simplify your life for find_and_lock_dst_vma() and the error type to
> > > > > return.
> > > >
> > > > Good suggestion. I'll make the change. Thanks
> > > > >
> > > > > What you have here will work though.
> > > > >
> > > > > >
> > > > > >       return 0;
> > > > > > +out_unlock:
> > > > > > +     unlock_vma(*src_vmap);
> > > > > > +     return err;
> > > > > >  }

The current implementation has a deadlock problem:

thread 1                                thread 2
--------                                --------
vma_start_read(dst_vma)
                                        mmap_write_lock()
                                        vma_start_write(src_vma)
vma_start_read(src_vma) fails
mmap_read_lock() blocks
                                        vma_start_write(dst_vma) blocks


I think the solution is to implement it this way:

long find_and_lock_vmas(...)
{
        dst_vma = lock_vma(mm, dst_start, true);
        if (IS_ERR(dst_vma))
                return PTR_ERR(dst_vma);

        src_vma = lock_vma_under_rcu(mm, src_start);
        if (!src_vma) {
                long err;

                if (mmap_read_trylock(mm)) {
                        src_vma = vma_lookup(mm, src_start);
                        if (src_vma) {
                                down_read(&src_vma->vm_lock->lock);
                                err = 0;
                        } else {
                                err = -ENOENT;
                        }
                        mmap_read_unlock(mm);
                } else {
                        vma_end_read(dst_vma);
                        err = lock_mm_and_find_vmas(...);
                        if (!err)
                                mmap_read_unlock(mm);
                }
                return err;
        }
        return 0;
}

Of course this would need defining lock_mm_and_find_vmas() regardless
of CONFIG_PER_VMA_LOCK. I can also remove the prepare_anon condition
in lock_vma().

> > > > > > +#else
> > > > > > +static int lock_mm_and_find_vmas(struct mm_struct *mm,
> > > > > > +                              unsigned long dst_start,
> > > > > > +                              unsigned long src_start,
> > > > > > +                              struct vm_area_struct **dst_vmap,
> > > > > > +                              struct vm_area_struct **src_vmap)
> > > > > > +{
> > > > > > +     int err = -ENOENT;
> > > > >
> > > > > Nit: new line after declarations.
> > > > >
> > > > > > +     mmap_read_lock(mm);
> > > > > > +
> > > > > > +     *src_vmap = vma_lookup(mm, src_start);
> > > > > > +     if (!*src_vmap)
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     *dst_vmap = vma_lookup(mm, dst_start);
> > > > > > +     if (!*dst_vmap)
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     /* Ensure anon_vma is assigned */
> > > > > > +     err = -ENOMEM;
> > > > > > +     if (vma_is_anonymous(*dst_vmap) && anon_vma_prepare(*dst_vmap))
> > > > > > +             goto out_unlock;
> > > > > > +
> > > > > > +     return 0;
> > > > > > +out_unlock:
> > > > > > +     mmap_read_unlock(mm);
> > > > > > +     return err;
> > > > > > +}
> > > > > > +#endif
> > > > > >
> > > > > >  /**
> > > > > >   * move_pages - move arbitrary anonymous pages of an existing vma
> > > > > > @@ -1287,8 +1455,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * @len: length of the virtual memory range
> > > > > >   * @mode: flags from uffdio_move.mode
> > > > > >   *
> > > > > > - * Must be called with mmap_lock held for read.
> > > > > > - *
> > > > >
> > > > > Will either use the mmap_lock in read mode or per-vma locking ?
> > > >
> > > > Makes sense. Will add it.
> > > > >
> > > > > >   * move_pages() remaps arbitrary anonymous pages atomically in zero
> > > > > >   * copy. It only works on non shared anonymous pages because those can
> > > > > >   * be relocated without generating non linear anon_vmas in the rmap
> > > > > > @@ -1355,10 +1521,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
> > > > > >   * could be obtained. This is the only additional complexity added to
> > > > > >   * the rmap code to provide this anonymous page remapping functionality.
> > > > > >   */
> > > > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > > -                unsigned long dst_start, unsigned long src_start,
> > > > > > -                unsigned long len, __u64 mode)
> > > > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> > > > > > +                unsigned long src_start, unsigned long len, __u64 mode)
> > > > > >  {
> > > > > > +     struct mm_struct *mm = ctx->mm;
> > > > >
> > > > > You dropped the argument, but left the comment for the argument.
> > > >
> > > > Thanks, will fix it.
> > > > >
> > > > > >       struct vm_area_struct *src_vma, *dst_vma;
> > > > > >       unsigned long src_addr, dst_addr;
> > > > > >       pmd_t *src_pmd, *dst_pmd;
> > > > > > @@ -1376,28 +1542,40 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> > > > > >           WARN_ON_ONCE(dst_start + len <= dst_start))
> > > > > >               goto out;
> > > > > >
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +     err = find_and_lock_vmas(mm, dst_start, src_start,
> > > > > > +                              &dst_vma, &src_vma);
> > > > > > +#else
> > > > > > +     err = lock_mm_and_find_vmas(mm, dst_start, src_start,
> > > > > > +                                 &dst_vma, &src_vma);
> > > > > > +#endif
> > > > >
> > > > > I was hoping you could hide this completely, but it's probably better to
> > > > > show what's going on and the function names document it well.
> > > >
> > > > I wanted to hide unlock as it's called several times, but then I
> > > > thought you wanted explicit calls to mmap_read_unlock() so didn't hide
> > > > it. If you are ok can I define unlock_vma() for !CONFIG_PER_VMA_LOCK
> > > > as well, calling mmap_read_unlock()?
> > >
> > > My bigger problem was with the name.  We can't have unlock_vma()
> > > just unlock the mm - it is confusing to read and I think it'll lead to
> > > misunderstandings of what is really going on here.
> > >
> > > Whatever you decide to do is fine as long as it's clear what's going on.
> > > I think this is clear while hiding it could also be clear with the right
> > > function name - I'm not sure what that would be; naming is hard.
> >
> > Maybe unlock_mm_or_vma() ? If not then I'll just keep it as is.
> >
>
> Maybe just leave it as it is unless someone else has issue with it.
> Using some form of uffd_unlock name runs into the question of that
> atomic and the new lock.

Sure. I'll leave it as it is.
>
>
Liam R. Howlett Feb. 12, 2024, 3:19 p.m. UTC | #7
* Lokesh Gidra <lokeshgidra@google.com> [240209 15:59]:
> On Fri, Feb 9, 2024 at 11:31 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
...

> > > > > > > +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > > > > > +                                    unsigned long address,
> > > > > > > +                                    bool prepare_anon)
> > > > > > > +{
> > > > > > > +     struct vm_area_struct *vma;
> > > > > > > +
> > > > > > > +     vma = lock_vma_under_rcu(mm, address);
> > > > > > > +     if (vma) {
> > > > > > > +             /*
> > > > > > > +              * lock_vma_under_rcu() only checks anon_vma for private
> > > > > > > +              * anonymous mappings. But we need to ensure it is assigned in
> > > > > > > +              * private file-backed vmas as well.
> > > > > > > +              */
> > > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > > +                 !vma->anon_vma)
> > > > > > > +                     vma_end_read(vma);
> > > > > > > +             else
> > > > > > > +                     return vma;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     mmap_read_lock(mm);
> > > > > > > +     vma = vma_lookup(mm, address);
> > > > > > > +     if (vma) {
> > > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > > +                 anon_vma_prepare(vma)) {
> > > > > > > +                     vma = ERR_PTR(-ENOMEM);
> > > > > > > +             } else {
> > > > > > > +                     /*
> > > > > > > +                      * We cannot use vma_start_read() as it may fail due to
> > > > > > > +                      * false locked (see comment in vma_start_read()). We
> > > > > > > +                      * can avoid that by directly locking vm_lock under
> > > > > > > +                      * mmap_lock, which guarantees that nobody can lock the
> > > > > > > +                      * vma for write (vma_start_write()) under us.
> > > > > > > +                      */
> > > > > > > +                     down_read(&vma->vm_lock->lock);
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     mmap_read_unlock(mm);
> > > > > > > +     return vma;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void unlock_vma(struct vm_area_struct *vma)
> > > > > > > +{
> > > > > > > +     vma_end_read(vma);
> > > > > > > +}
> > > > > > > +
...

> 
> The current implementation has a deadlock problem:
> 
> thread 1
>                                      thread 2
> 
> vma_start_read(dst_vma)
> 
> 
>                                          mmap_write_lock()
> 
>                                          vma_start_write(src_vma)
> vma_start_read(src_vma) fails
> mmap_read_lock() blocks
> 
> 
>                                         vma_start_write(dst_vma)
> blocks
> 
> 
> I think the solution is to implement it this way:
> 
> long find_and_lock_vmas(...)
> {
>               dst_vma = lock_vma(mm, dst_start, true);
>               if (IS_ERR(dst_vma))
>                           return PTR_ERR(vma);
> 
>               src_vma = lock_vma_under_rcu(mm, src_start);
>               if (!src_vma) {
>                             long err;
>                             if (mmap_read_trylock(mm)) {
>                                             src_vma = vma_lookup(mm, src_start);
>                                             if (src_vma) {
>                                                         down_read(&src_vma->vm_lock->lock);
>                                                         err = 0;
>                                             } else {
>                                                        err = -ENOENT;
>                                             }
>                                             mmap_read_unlock(mm);
>                                } else {
>                                            vma_end_read(dst_vma);
>                                            err = lock_mm_and_find_vmas(...);
>                                            if (!err) {

Right now lock_mm_and_find_vmas() doesn't use the per-vma locking, so
you'd have to lock those here too.  I'm sure you realise that, but this
is very convoluted.

>                                                          mmap_read_unlock(mm);
>                                            }
>                                 }
>                                 return err;

On contention you will now abort vs block.

>               }
>               return 0;
> }
> 
> Of course this would need defining lock_mm_and_find_vmas() regardless
> of CONFIG_PER_VMA_LOCK. I can also remove the prepare_anon condition
> in lock_vma().

You are adding a lot of complexity for a relatively rare case, which is
probably not worth optimising.

I think you'd be better served by something like :

if (likely(src_vma))
	return 0;

/* Undo any locking */
vma_end_read(dst_vma);

/* Fall back to locking both in mmap_lock critical section */
mmap_read_lock();
/*
 * probably worth creating an inline function for a single vma
 * find/populate that can be used in lock_vma() that does the anon vma
 * population?
 */
dst_vma = lock_vma_populate_anon();
...

src_vma = lock_vma_under_rcu(mm, src_start);
...


mmap_read_unlock();
return 0;

src_failed:
	vma_end_read(dst_vma);
dst_failed:
	mmap_read_unlock();
	return err;

Thanks,
Liam

...
Lokesh Gidra Feb. 12, 2024, 6:08 p.m. UTC | #8
On Mon, Feb 12, 2024 at 7:20 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240209 15:59]:
> > On Fri, Feb 9, 2024 at 11:31 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> ...
>
> > > > > > > > +static struct vm_area_struct *lock_vma(struct mm_struct *mm,
> > > > > > > > +                                    unsigned long address,
> > > > > > > > +                                    bool prepare_anon)
> > > > > > > > +{
> > > > > > > > +     struct vm_area_struct *vma;
> > > > > > > > +
> > > > > > > > +     vma = lock_vma_under_rcu(mm, address);
> > > > > > > > +     if (vma) {
> > > > > > > > +             /*
> > > > > > > > +              * lock_vma_under_rcu() only checks anon_vma for private
> > > > > > > > +              * anonymous mappings. But we need to ensure it is assigned in
> > > > > > > > +              * private file-backed vmas as well.
> > > > > > > > +              */
> > > > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > > > +                 !vma->anon_vma)
> > > > > > > > +                     vma_end_read(vma);
> > > > > > > > +             else
> > > > > > > > +                     return vma;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     mmap_read_lock(mm);
> > > > > > > > +     vma = vma_lookup(mm, address);
> > > > > > > > +     if (vma) {
> > > > > > > > +             if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
> > > > > > > > +                 anon_vma_prepare(vma)) {
> > > > > > > > +                     vma = ERR_PTR(-ENOMEM);
> > > > > > > > +             } else {
> > > > > > > > +                     /*
> > > > > > > > +                      * We cannot use vma_start_read() as it may fail due to
> > > > > > > > +                      * false locked (see comment in vma_start_read()). We
> > > > > > > > +                      * can avoid that by directly locking vm_lock under
> > > > > > > > +                      * mmap_lock, which guarantees that nobody can lock the
> > > > > > > > +                      * vma for write (vma_start_write()) under us.
> > > > > > > > +                      */
> > > > > > > > +                     down_read(&vma->vm_lock->lock);
> > > > > > > > +             }
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     mmap_read_unlock(mm);
> > > > > > > > +     return vma;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void unlock_vma(struct vm_area_struct *vma)
> > > > > > > > +{
> > > > > > > > +     vma_end_read(vma);
> > > > > > > > +}
> > > > > > > > +
> ...
>
> >
> > The current implementation has a deadlock problem:
> >
> > thread 1
> >                                      thread 2
> >
> > vma_start_read(dst_vma)
> >
> >
> >                                          mmap_write_lock()
> >
> >                                          vma_start_write(src_vma)
> > vma_start_read(src_vma) fails
> > mmap_read_lock() blocks
> >
> >
> >                                         vma_start_write(dst_vma)
> > blocks
> >
> >
> > I think the solution is to implement it this way:
> >
> > long find_and_lock_vmas(...)
> > {
> >               dst_vma = lock_vma(mm, dst_start, true);
> >               if (IS_ERR(dst_vma))
> >                           return PTR_ERR(vma);
> >
> >               src_vma = lock_vma_under_rcu(mm, src_start);
> >               if (!src_vma) {
> >                             long err;
> >                             if (mmap_read_trylock(mm)) {
> >                                             src_vma = vma_lookup(mm, src_start);
> >                                             if (src_vma) {
> >                                                         down_read(&src_vma->vm_lock->lock);
> >                                                         err = 0;
> >                                             } else {
> >                                                        err = -ENOENT;
> >                                             }
> >                                             mmap_read_unlock(mm);
> >                                } else {
> >                                            vma_end_read(dst_vma);
> >                                            err = lock_mm_and_find_vmas(...);
> >                                            if (!err) {
>
> Right now lock_mm_and_find_vmas() doesn't use the per-vma locking, so
> you'd have to lock those here too.  I'm sure you realise that, but this
> is very convoluted.

That's right. I realized that after I sent this email.
>
> >                                                          mmap_read_unlock(mm);
> >                                            }
> >                                 }
> >                                 return err;
>
> On contention you will now abort vs block.

Is it? On contention mmap_read_trylock() will fail and we do the whole
operation using lock_mm_and_find_vmas() which blocks on mmap_lock. Am
I missing something?
>
> >               }
> >               return 0;
> > }
> >
> > Of course this would need defining lock_mm_and_find_vmas() regardless
> > of CONFIG_PER_VMA_LOCK. I can also remove the prepare_anon condition
> > in lock_vma().
>
> You are adding a lot of complexity for a relatively rare case, which is
> probably not worth optimising.
>
> I think you'd be better served by something like :
>
> if (likely(src_vma))
>         return 0;
>
> /* Undo any locking */
> vma_end_read(dst_vma);
>
> /* Fall back to locking both in mmap_lock critical section */

Agreed on reduced complexity. But as Suren pointed out in one of his
replies, lock_vma_under_rcu() may fail due to a seq overflow. That's
why lock_vma() uses vma_lookup() followed by a direct down_read() on
the vma lock. IMHO what we need here is exactly lock_mm_and_find_vmas(),
and the code can be further simplified as follows:

err = lock_mm_and_find_vmas(...);
if (!err) {
        down_read(dst_vma...);
        if (dst_vma != src_vma)
                down_read(src_vma...);
        mmap_read_unlock(mm);
}
return err;

(another thing I'm fixing in the next version is avoiding locking the
vma twice if both src_start and dst_start are in the same vma)
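
Spelled out, and assuming lock_mm_and_find_vmas() keeps the behaviour it
has in this patch (returns 0 with mmap_lock still held for read, or an
error with it already released), that fallback might look roughly like
the sketch below; the function name is only a placeholder:

static int find_and_lock_vmas_fallback(struct mm_struct *mm,
                                       unsigned long dst_start,
                                       unsigned long src_start,
                                       struct vm_area_struct **dst_vmap,
                                       struct vm_area_struct **src_vmap)
{
        int err;

        err = lock_mm_and_find_vmas(mm, dst_start, src_start,
                                    dst_vmap, src_vmap);
        if (err)
                return err;

        /*
         * Take the vma read locks directly while mmap_lock is still held,
         * as lock_vma() does, so vma_start_write() cannot slip in between;
         * lock the vma only once if both ranges fall in the same one.
         */
        down_read(&(*dst_vmap)->vm_lock->lock);
        if (*dst_vmap != *src_vmap)
                down_read(&(*src_vmap)->vm_lock->lock);
        mmap_read_unlock(mm);
        return 0;
}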

> mmap_read_lock();
> /*
>  * probably worth creating an inline function for a single vma
>  * find/populate that can be used in lock_vma() that does the anon vma
>  * population?
>  */
Good idea. This can simplify both lock_vma() as well as lock_mm_and_find_vmas().

> dst_vm = lock_vma_populate_anon();
> ...
>
> src_vma = lock_vma_under_rcu(mm, src_start);
> ...
>
>
> mmap_read_unlock();
> return 0;
>
> src_failed:
>         vma_end_read(dst_vma);
> dst_failed:
> mmap_read_unlock();
> return err;
>
> Thanks,
> Liam
>
> ...
>
Liam R. Howlett Feb. 12, 2024, 8:11 p.m. UTC | #9
* Lokesh Gidra <lokeshgidra@google.com> [240212 13:08]:
> On Mon, Feb 12, 2024 at 7:20 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
...

> > >
> > > The current implementation has a deadlock problem:
...

> > On contention you will now abort vs block.
> 
> Is it? On contention mmap_read_trylock() will fail and we do the whole
> operation using lock_mm_and_find_vmas() which blocks on mmap_lock. Am
> I missing something?

You are right, I missed the taking of the lock in the function call.

> >
> > >               }
> > >               return 0;
> > > }
> > >
> > > Of course this would need defining lock_mm_and_find_vmas() regardless
> > > of CONFIG_PER_VMA_LOCK. I can also remove the prepare_anon condition
> > > in lock_vma().
> >
> > You are adding a lot of complexity for a relatively rare case, which is
> > probably not worth optimising.
> >
...

> 
> Agreed on reduced complexity. But as Suren pointed out in one of his
> replies that lock_vma_under_rcu() may fail due to seq overflow. That's
> why lock_vma() uses vma_lookup() followed by direct down_read() on
> vma-lock.

I'd rather see another function that doesn't care about anon (I think
src is special that way?), and avoid splitting the locking across
functions as much as possible.


> IMHO what we need here is exactly lock_mm_and_find_vmas()
> and the code can be further simplified as follows:
> 
> err = lock_mm_and_find_vmas(...);
> if (!err) {
>           down_read(dst_vma...);
>           if (dst_vma != src_vma)
>                        down_read(src_vma....);
>           mmap_read_unlock(mm);
> }
> return err;

If we exactly needed lock_mm_and_find_vmas(), there wouldn't be three
lock/unlock calls depending on the return code.

The fact that lock_mm_and_find_vmas() returns with the mm locked or
unlocked depending on the return code is not reducing the complexity of
this code.

You could use a widget that does something with dst, and a different
widget that does something with src (if they are different).  The dst
widget can be used in lock_vma() and in lock_mm_and_find_vmas(), while
the src one can be used here and in lock_mm_and_find_vmas().  Neither
widget would touch the locks.  This way you can build functions that
keep the locking and unlocking co-located (except for the obvious
necessity of holding the mmap_read lock in the !per-vma case).

I've also thought of how you can name the abstraction in the functions:
use a 'prepare()' and 'complete()' pair to find/lock and unlock what you need.
Might be worth exploring?  If we fail to 'prepare()' then we don't need
to 'complete()', which means there won't be mismatched locking hanging
around.  Maybe it's too late to change to this sort of thing, but I
thought I'd mention it.
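
For illustration only, a prepare()/complete() pairing for the mfill path
could be as thin as the following; uffd_mfill_prepare()/uffd_mfill_complete()
are made-up names that just wrap the helpers already in this patch:

static struct vm_area_struct *uffd_mfill_prepare(struct mm_struct *dst_mm,
                                                 unsigned long dst_start,
                                                 unsigned long len)
{
#ifdef CONFIG_PER_VMA_LOCK
        return find_and_lock_dst_vma(dst_mm, dst_start, len);
#else
        return lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
#endif
}

static void uffd_mfill_complete(struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_PER_VMA_LOCK
        unlock_vma(dst_vma);
#else
        mmap_read_unlock(dst_vma->vm_mm);
#endif
}

A failed prepare() (an IS_ERR() return) is then simply never paired with
a complete(), so no mismatched unlock can hang around.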

Thanks,
Liam
Lokesh Gidra Feb. 12, 2024, 10:30 p.m. UTC | #10
On Mon, Feb 12, 2024 at 12:11 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240212 13:08]:
> > On Mon, Feb 12, 2024 at 7:20 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> ...
>
> > > >
> > > > The current implementation has a deadlock problem:
> ...
>
> > > On contention you will now abort vs block.
> >
> > Is it? On contention mmap_read_trylock() will fail and we do the whole
> > operation using lock_mm_and_find_vmas() which blocks on mmap_lock. Am
> > I missing something?
>
> You are right, I missed the taking of the lock in the function call.
>
> > >
> > > >               }
> > > >               return 0;
> > > > }
> > > >
> > > > Of course this would need defining lock_mm_and_find_vmas() regardless
> > > > of CONFIG_PER_VMA_LOCK. I can also remove the prepare_anon condition
> > > > in lock_vma().
> > >
> > > You are adding a lot of complexity for a relatively rare case, which is
> > > probably not worth optimising.
> > >
> ...
>
> >
> > Agreed on reduced complexity. But as Suren pointed out in one of his
> > replies that lock_vma_under_rcu() may fail due to seq overflow. That's
> > why lock_vma() uses vma_lookup() followed by direct down_read() on
> > vma-lock.
>
> I'd rather see another function that doesn't care about anon (I think
> src is special that way?), and avoid splitting the locking across
> functions as much as possible.
>
Fair point about not splitting locking across functions.
>
> > IMHO what we need here is exactly lock_mm_and_find_vmas()
> > and the code can be further simplified as follows:
> >
> > err = lock_mm_and_find_vmas(...);
> > if (!err) {
> >           down_read(dst_vma...);
> >           if (dst_vma != src_vma)
> >                        down_read(src_vma....);
> >           mmap_read_unlock(mm);
> > }
> > return err;
>
> If we exactly needed lock_mm_and_find_vmas(), there wouldn't be three
> lock/unlock calls depending on the return code.
>
> The fact that lock_mm_and_find_vmas() returns with the mm locked or
> unlocked depending on the return code is not reducing the complexity of
> this code.
>
> You could use a widget that does something with dst, and a different
> widget that does something with src (if they are different).  The dst
> widget can be used for the lock_vma(), and in the
> lock_mm_and_find_vmas(), while the src one can be used in this and the
> lock_mm_and_find_vmas(). Neither widget would touch the locks.  This way
> you can build your functions that have the locking and unlocking
> co-located (except the obvious necessity of holding the mmap_read lock
> for the !per-vma case).
>
I think I have managed to minimize the code duplication while not
complicating locking/unlocking.

I have added a find_vmas_mm_locked() handler which, as the name
suggests, expects mmap_lock to be held, finds the two vmas, and ensures
that anon_vma for dst_vma is populated. I call this handler from
lock_mm_and_find_vmas() and from find_and_lock_vmas() in the fallback case.

I have also introduced a handler for finding dst_vma and preparing its
anon_vma, which is used in lock_vma() and find_vmas_mm_locked().
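
As a rough sketch of how those pieces could fit together;
find_vmas_mm_locked() is the name used above, while
find_dst_vma_prepare_anon() is only a stand-in for the dst_vma helper,
not necessarily what the next version will call it:

static struct vm_area_struct *find_dst_vma_prepare_anon(struct mm_struct *mm,
                                                        unsigned long dst_start)
{
        struct vm_area_struct *vma = vma_lookup(mm, dst_start);

        if (!vma)
                return ERR_PTR(-ENOENT);
        /* Ensure anon_vma is assigned for private vmas */
        if (!(vma->vm_flags & VM_SHARED) && anon_vma_prepare(vma))
                return ERR_PTR(-ENOMEM);
        return vma;
}

/* Expects mmap_lock to be held; finds both vmas, prepares dst anon_vma. */
static int find_vmas_mm_locked(struct mm_struct *mm,
                               unsigned long dst_start,
                               unsigned long src_start,
                               struct vm_area_struct **dst_vmap,
                               struct vm_area_struct **src_vmap)
{
        struct vm_area_struct *vma;

        vma = find_dst_vma_prepare_anon(mm, dst_start);
        if (IS_ERR(vma))
                return PTR_ERR(vma);
        *dst_vmap = vma;

        *src_vmap = vma_lookup(mm, src_start);
        return *src_vmap ? 0 : -ENOENT;
}

lock_mm_and_find_vmas() is then little more than mmap_read_lock() plus
find_vmas_mm_locked(), and find_and_lock_vmas() can call the same helper
in its fallback path before taking the vma read locks.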

Sounds good?

> I've also thought of how you can name the abstraction in the functions:
> use a 'prepare() and complete()' to find/lock and unlock what you need.
> Might be worth exploring?  If we fail to 'prepare()' then we don't need
> to 'complete()', which means there won't be mismatched locking hanging
> around.  Maybe it's too late to change to this sort of thing, but I
> thought I'd mention it.
>
Nice suggestion! But after (fortunately) finding the function names
that are self-explanatory, dropping them seems like going in the wrong
direction. Please let me know if you think this is a missing piece. I
am open to incorporating this.

> Thanks,
> Liam
Liam R. Howlett Feb. 12, 2024, 10:53 p.m. UTC | #11
* Lokesh Gidra <lokeshgidra@google.com> [240212 17:31]:
> I have also introduced a handler for finding dst_vma and preparing its
> anon_vma, which is used in lock_vma() and find_vmas_mm_locked().
> 
> Sounds good?
> 
> > I've also thought of how you can name the abstraction in the functions:
> > use a 'prepare() and complete()' to find/lock and unlock what you need.
> > Might be worth exploring?  If we fail to 'prepare()' then we don't need
> > to 'complete()', which means there won't be mismatched locking hanging
> > around.  Maybe it's too late to change to this sort of thing, but I
> > thought I'd mention it.
> >
> Nice suggestion! But after (fortunately) finding the function names
> that are self-explanatory, dropping them seems like going in the wrong
> direction. Please let me know if you think this is a missing piece. I
> am open to incorporating this.

This plan sounds good, thanks!

Regards,
Liam
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c00a021bcce4..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,17 +2005,8 @@  static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-
-		/* Re-check after taking map_changing_lock */
-		down_read(&ctx->map_changing_lock);
-		if (likely(!atomic_read(&ctx->mmap_changing)))
-			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
-					 uffdio_move.len, uffdio_move.mode);
-		else
-			ret = -EAGAIN;
-		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(mm);
+		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
 		mmput(mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3210c3552976..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -138,9 +138,8 @@  extern long uffd_wp_range(struct vm_area_struct *vma,
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 flags);
 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
 			struct vm_area_struct *dst_vma,
 			struct vm_area_struct *src_vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 74aad0831e40..1e25768b2136 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -19,20 +19,12 @@ 
 #include <asm/tlb.h>
 #include "internal.h"
 
-static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
-				    unsigned long dst_start,
-				    unsigned long len)
+static bool validate_dst_vma(struct vm_area_struct *dst_vma,
+			     unsigned long dst_end)
 {
-	/*
-	 * Make sure that the dst range is both valid and fully within a
-	 * single existing vma.
-	 */
-	struct vm_area_struct *dst_vma;
-
-	dst_vma = find_vma(dst_mm, dst_start);
-	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
-		return NULL;
+	/* Make sure that the dst range is fully within dst_vma. */
+	if (dst_end > dst_vma->vm_end)
+		return false;
 
 	/*
 	 * Check the vma is registered in uffd, this is required to
@@ -40,11 +32,125 @@  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 	 * time.
 	 */
 	if (!dst_vma->vm_userfaultfd_ctx.ctx)
-		return NULL;
+		return false;
+
+	return true;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ * @prepare_anon: If true, then prepare the vma (if private) with anon_vma.
+ *
+ * Should be called without holding mmap_lock. vma should be unlocked after use
+ * with unlock_vma().
+ *
+ * Return: A locked vma containing @address, NULL if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+				       unsigned long address,
+				       bool prepare_anon)
+{
+	struct vm_area_struct *vma;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (vma) {
+		/*
+		 * lock_vma_under_rcu() only checks anon_vma for private
+		 * anonymous mappings. But we need to ensure it is assigned in
+		 * private file-backed vmas as well.
+		 */
+		if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
+		    !vma->anon_vma)
+			vma_end_read(vma);
+		else
+			return vma;
+	}
+
+	mmap_read_lock(mm);
+	vma = vma_lookup(mm, address);
+	if (vma) {
+		if (prepare_anon && !(vma->vm_flags & VM_SHARED) &&
+		    anon_vma_prepare(vma)) {
+			vma = ERR_PTR(-ENOMEM);
+		} else {
+			/*
+			 * We cannot use vma_start_read() as it may fail due to
+			 * false locked (see comment in vma_start_read()). We
+			 * can avoid that by directly locking vm_lock under
+			 * mmap_lock, which guarantees that nobody can lock the
+			 * vma for write (vma_start_write()) under us.
+			 */
+			down_read(&vma->vm_lock->lock);
+		}
+	}
+
+	mmap_read_unlock(mm);
+	return vma;
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+	vma_end_read(vma);
+}
+
+static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
+						    unsigned long dst_start,
+						    unsigned long len)
+{
+	struct vm_area_struct *dst_vma;
+
+	/* Ensure anon_vma is assigned for private vmas */
+	dst_vma = lock_vma(dst_mm, dst_start, true);
+
+	if (!dst_vma)
+		return ERR_PTR(-ENOENT);
+
+	if (PTR_ERR(dst_vma) == -ENOMEM)
+		return dst_vma;
+
+	if (!validate_dst_vma(dst_vma, dst_start + len))
+		goto out_unlock;
 
 	return dst_vma;
+out_unlock:
+	unlock_vma(dst_vma);
+	return ERR_PTR(-ENOENT);
 }
 
+#else
+
+static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
+						       unsigned long dst_start,
+						       unsigned long len)
+{
+	struct vm_area_struct *dst_vma;
+	int err = -ENOENT;
+
+	mmap_read_lock(dst_mm);
+	dst_vma = vma_lookup(dst_mm, dst_start);
+	if (!dst_vma)
+		goto out_unlock;
+
+	/* Ensure anon_vma is assigned for private vmas */
+	if (!(dst_vma->vm_flags & VM_SHARED) && anon_vma_prepare(dst_vma)) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (!validate_dst_vma(dst_vma, dst_start + len))
+		goto out_unlock;
+
+	return dst_vma;
+out_unlock:
+	mmap_read_unlock(dst_mm);
+	return ERR_PTR(err);
+}
+#endif
+
 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
 static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
 				 unsigned long dst_addr)
@@ -350,7 +456,8 @@  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
 					      struct userfaultfd_ctx *ctx,
@@ -361,7 +468,6 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 					      uffd_flags_t flags)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
-	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
 	unsigned long src_addr, dst_addr;
@@ -380,7 +486,11 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
 		up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+		unlock_vma(dst_vma);
+#else
 		mmap_read_unlock(dst_mm);
+#endif
 		return -EINVAL;
 	}
 
@@ -403,24 +513,32 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 * retry, dst_vma will be set to NULL and we must lookup again.
 	 */
 	if (!dst_vma) {
+#ifdef CONFIG_PER_VMA_LOCK
+		dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
+#else
+		dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
+#endif
+		if (IS_ERR(dst_vma)) {
+			err = PTR_ERR(dst_vma);
+			goto out;
+		}
+
 		err = -ENOENT;
-		dst_vma = find_dst_vma(dst_mm, dst_start, len);
-		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
-			goto out_unlock;
+		if (!is_vm_hugetlb_page(dst_vma))
+			goto out_unlock_vma;
 
 		err = -EINVAL;
 		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
-			goto out_unlock;
-
-		vm_shared = dst_vma->vm_flags & VM_SHARED;
-	}
+			goto out_unlock_vma;
 
-	/*
-	 * If not shared, ensure the dst_vma has a anon_vma.
-	 */
-	err = -ENOMEM;
-	if (!vm_shared) {
-		if (unlikely(anon_vma_prepare(dst_vma)))
+		/*
+		 * If memory mappings are changing because of non-cooperative
+		 * operation (e.g. mremap) running in parallel, bail out and
+		 * request the user to retry later
+		 */
+		down_read(&ctx->map_changing_lock);
+		err = -EAGAIN;
+		if (atomic_read(&ctx->mmap_changing))
 			goto out_unlock;
 	}
 
@@ -465,7 +583,11 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 		if (unlikely(err == -ENOENT)) {
 			up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+			unlock_vma(dst_vma);
+#else
 			mmap_read_unlock(dst_mm);
+#endif
 			BUG_ON(!folio);
 
 			err = copy_folio_from_user(folio,
@@ -474,17 +596,6 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 				err = -EFAULT;
 				goto out;
 			}
-			mmap_read_lock(dst_mm);
-			down_read(&ctx->map_changing_lock);
-			/*
-			 * If memory mappings are changing because of non-cooperative
-			 * operation (e.g. mremap) running in parallel, bail out and
-			 * request the user to retry later
-			 */
-			if (atomic_read(&ctx->mmap_changing)) {
-				err = -EAGAIN;
-				break;
-			}
 
 			dst_vma = NULL;
 			goto retry;
@@ -505,7 +616,12 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(dst_vma);
+#else
 	mmap_read_unlock(dst_mm);
+#endif
 out:
 	if (folio)
 		folio_put(folio);
@@ -597,7 +713,19 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	copied = 0;
 	folio = NULL;
 retry:
-	mmap_read_lock(dst_mm);
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+#ifdef CONFIG_PER_VMA_LOCK
+	dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
+#else
+	dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
+#endif
+	if (IS_ERR(dst_vma)) {
+		err = PTR_ERR(dst_vma);
+		goto out;
+	}
 
 	/*
 	 * If memory mappings are changing because of non-cooperative
@@ -609,15 +737,6 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
-	/*
-	 * Make sure the vma is not shared, that the dst range is
-	 * both valid and fully within a single existing vma.
-	 */
-	err = -ENOENT;
-	dst_vma = find_dst_vma(dst_mm, dst_start, len);
-	if (!dst_vma)
-		goto out_unlock;
-
 	err = -EINVAL;
 	/*
 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -647,16 +766,6 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
-	/*
-	 * Ensure the dst_vma has a anon_vma or this page
-	 * would get a NULL anon_vma when moved in the
-	 * dst_vma.
-	 */
-	err = -ENOMEM;
-	if (!(dst_vma->vm_flags & VM_SHARED) &&
-	    unlikely(anon_vma_prepare(dst_vma)))
-		goto out_unlock;
-
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
 
@@ -699,7 +808,11 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			void *kaddr;
 
 			up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+			unlock_vma(dst_vma);
+#else
 			mmap_read_unlock(dst_mm);
+#endif
 			BUG_ON(!folio);
 
 			kaddr = kmap_local_folio(folio, 0);
@@ -730,7 +843,11 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(dst_vma);
+#else
 	mmap_read_unlock(dst_mm);
+#endif
 out:
 	if (folio)
 		folio_put(folio);
@@ -1267,16 +1384,67 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
 	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
 		return -EINVAL;
 
-	/*
-	 * Ensure the dst_vma has a anon_vma or this page
-	 * would get a NULL anon_vma when moved in the
-	 * dst_vma.
-	 */
-	if (unlikely(anon_vma_prepare(dst_vma)))
-		return -ENOMEM;
+	return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static int find_and_lock_vmas(struct mm_struct *mm,
+			      unsigned long dst_start,
+			      unsigned long src_start,
+			      struct vm_area_struct **dst_vmap,
+			      struct vm_area_struct **src_vmap)
+{
+	int err;
+
+	/* There is no need to prepare anon_vma for src_vma */
+	*src_vmap = lock_vma(mm, src_start, false);
+	if (!*src_vmap)
+		return -ENOENT;
+
+	/* Ensure anon_vma is assigned for anonymous vma */
+	*dst_vmap = lock_vma(mm, dst_start, true);
+	err = -ENOENT;
+	if (!*dst_vmap)
+		goto out_unlock;
+
+	err = -ENOMEM;
+	if (PTR_ERR(*dst_vmap) == -ENOMEM)
+		goto out_unlock;
 
 	return 0;
+out_unlock:
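+	/* dst_vma lookup/lock failed; drop the lock already taken on src_vma */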
+	unlock_vma(*src_vmap);
+	return err;
 }
+#else
+static int lock_mm_and_find_vmas(struct mm_struct *mm,
+				 unsigned long dst_start,
+				 unsigned long src_start,
+				 struct vm_area_struct **dst_vmap,
+				 struct vm_area_struct **src_vmap)
+{
+	int err = -ENOENT;
+
+	mmap_read_lock(mm);
+
+	*src_vmap = vma_lookup(mm, src_start);
+	if (!*src_vmap)
+		goto out_unlock;
+
+	*dst_vmap = vma_lookup(mm, dst_start);
+	if (!*dst_vmap)
+		goto out_unlock;
+
+	/* Ensure anon_vma is assigned */
+	err = -ENOMEM;
+	if (vma_is_anonymous(*dst_vmap) && anon_vma_prepare(*dst_vmap))
+		goto out_unlock;
+
+	return 0;
+out_unlock:
+	mmap_read_unlock(mm);
+	return err;
+}
+#endif
 
 /**
  * move_pages - move arbitrary anonymous pages of an existing vma
@@ -1287,8 +1455,6 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * @len: length of the virtual memory range
  * @mode: flags from uffdio_move.mode
  *
- * Must be called with mmap_lock held for read.
- *
  * move_pages() remaps arbitrary anonymous pages atomically in zero
  * copy. It only works on non shared anonymous pages because those can
  * be relocated without generating non linear anon_vmas in the rmap
@@ -1355,10 +1521,10 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * could be obtained. This is the only additional complexity added to
  * the rmap code to provide this anonymous page remapping functionality.
  */
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 mode)
 {
+	struct mm_struct *mm = ctx->mm;
 	struct vm_area_struct *src_vma, *dst_vma;
 	unsigned long src_addr, dst_addr;
 	pmd_t *src_pmd, *dst_pmd;
@@ -1376,28 +1542,40 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 	    WARN_ON_ONCE(dst_start + len <= dst_start))
 		goto out;
 
+#ifdef CONFIG_PER_VMA_LOCK
+	err = find_and_lock_vmas(mm, dst_start, src_start,
+				 &dst_vma, &src_vma);
+#else
+	err = lock_mm_and_find_vmas(mm, dst_start, src_start,
+				    &dst_vma, &src_vma);
+#endif
+	if (err)
+		goto out;
+
+	/* Re-check after taking map_changing_lock */
+	down_read(&ctx->map_changing_lock);
+	if (unlikely(atomic_read(&ctx->mmap_changing))) {
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
 	/*
 	 * Make sure the vma is not shared, that the src and dst remap
 	 * ranges are both valid and fully within a single existing
 	 * vma.
 	 */
-	src_vma = find_vma(mm, src_start);
-	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
-		goto out;
-	if (src_start < src_vma->vm_start ||
-	    src_start + len > src_vma->vm_end)
-		goto out;
+	if (src_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
+	if (src_start + len > src_vma->vm_end)
+		goto out_unlock;
 
-	dst_vma = find_vma(mm, dst_start);
-	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-		goto out;
-	if (dst_start < dst_vma->vm_start ||
-	    dst_start + len > dst_vma->vm_end)
-		goto out;
+	if (dst_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
+	if (dst_start + len > dst_vma->vm_end)
+		goto out_unlock;
 
 	err = validate_move_areas(ctx, src_vma, dst_vma);
 	if (err)
-		goto out;
+		goto out_unlock;
 
 	for (src_addr = src_start, dst_addr = dst_start;
 	     src_addr < src_start + len;) {
@@ -1514,6 +1692,14 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 		moved += step_size;
 	}
 
+out_unlock:
+	up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(dst_vma);
+	unlock_vma(src_vma);
+#else
+	mmap_read_unlock(mm);
+#endif
 out:
 	VM_WARN_ON(moved < 0);
 	VM_WARN_ON(err > 0);