diff mbox series

[v5,3/3] userfaultfd: use per-vma locks in userfaultfd operations

Message ID 20240213001920.3551772-4-lokeshgidra@google.com (mailing list archive)
State Handled Elsewhere
Delegated to: Paul Moore
Headers show
Series per-vma locks in userfaultfd | expand

Commit Message

Lokesh Gidra Feb. 13, 2024, 12:19 a.m. UTC
All userfaultfd operations, except write-protect, opportunistically use
per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
critical section.

Write-protect operation requires mmap_lock as it iterates over multiple
vmas.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
 fs/userfaultfd.c              |  13 +-
 include/linux/userfaultfd_k.h |   5 +-
 mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
 3 files changed, 312 insertions(+), 98 deletions(-)

Comments

Liam R. Howlett Feb. 13, 2024, 3:33 a.m. UTC | #1
* Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> All userfaultfd operations, except write-protect, opportunistically use
> per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> critical section.
> 
> Write-protect operation requires mmap_lock as it iterates over multiple
> vmas.
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
>  fs/userfaultfd.c              |  13 +-
>  include/linux/userfaultfd_k.h |   5 +-
>  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
>  3 files changed, 312 insertions(+), 98 deletions(-)
> 
...

> +
> +static __always_inline
> +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
> +						 unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +
> +	mmap_assert_locked(mm);
> +	vma = vma_lookup(mm, addr);
> +	if (!vma)
> +		vma = ERR_PTR(-ENOENT);
> +	else if (!(vma->vm_flags & VM_SHARED) && anon_vma_prepare(vma))
> +		vma = ERR_PTR(-ENOMEM);

Nit: I just noticed that the code below says anon_vma_prepare() is unlikely.

...

> +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> +						       unsigned long dst_start,
> +						       unsigned long len)
> +{
> +	struct vm_area_struct *dst_vma;
> +	int err;
> +
> +	mmap_read_lock(dst_mm);
> +	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
> +	if (IS_ERR(dst_vma)) {
> +		err = PTR_ERR(dst_vma);

It's sort of odd you decode then re-encode this error, but it's correct
the way you have it written.  You could just encode ENOENT instead?

> +		goto out_unlock;
> +	}
> +
> +	if (validate_dst_vma(dst_vma, dst_start + len))
> +		return dst_vma;
> +
> +	err = -ENOENT;
> +out_unlock:
> +	mmap_read_unlock(dst_mm);
> +	return ERR_PTR(err);
>  }
> +#endif
>  
...

> +static __always_inline
> +long find_vmas_mm_locked(struct mm_struct *mm,

int would probably do?
> +			 unsigned long dst_start,
> +			 unsigned long src_start,
> +			 struct vm_area_struct **dst_vmap,
> +			 struct vm_area_struct **src_vmap)
> +{
> +	struct vm_area_struct *vma;
> +
> +	mmap_assert_locked(mm);
> +	vma = find_vma_and_prepare_anon(mm, dst_start);
> +	if (IS_ERR(vma))
> +		return PTR_ERR(vma);
> +
> +	*dst_vmap = vma;
> +	/* Skip finding src_vma if src_start is in dst_vma */
> +	if (src_start >= vma->vm_start && src_start < vma->vm_end)
> +		goto out_success;
> +
> +	vma = vma_lookup(mm, src_start);
> +	if (!vma)
> +		return -ENOENT;
> +out_success:
> +	*src_vmap = vma;
> +	return 0;
> +}
> +
> +#ifdef CONFIG_PER_VMA_LOCK
> +static long find_and_lock_vmas(struct mm_struct *mm,

This could also be an int return type, I must be missing something?

...

> +	*src_vmap = lock_vma_under_rcu(mm, src_start);
> +	if (likely(*src_vmap))
> +		return 0;
> +
> +	/* Undo any locking and retry in mmap_lock critical section */
> +	vma_end_read(*dst_vmap);
> +
> +	mmap_read_lock(mm);
> +	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
> +	if (!err) {
> +		/*
> +		 * See comment in lock_vma() as to why not using
> +		 * vma_start_read() here.
> +		 */
> +		down_read(&(*dst_vmap)->vm_lock->lock);
> +		if (*dst_vmap != *src_vmap)
> +			down_read(&(*src_vmap)->vm_lock->lock);
> +	}
> +	mmap_read_unlock(mm);
> +	return err;
> +}
> +#else
> +static long lock_mm_and_find_vmas(struct mm_struct *mm,
> +				  unsigned long dst_start,
> +				  unsigned long src_start,
> +				  struct vm_area_struct **dst_vmap,
> +				  struct vm_area_struct **src_vmap)
> +{
> +	long err;
> +
> +	mmap_read_lock(mm);
> +	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
> +	if (err)
> +		mmap_read_unlock(mm);
> +	return err;
>  }
> +#endif

This section is much easier to understand.  Thanks.

Thanks,
Liam
Lokesh Gidra Feb. 13, 2024, 11:24 a.m. UTC | #2
On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > All userfaultfd operations, except write-protect, opportunistically use
> > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > critical section.
> >
> > Write-protect operation requires mmap_lock as it iterates over multiple
> > vmas.
> >
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> >  fs/userfaultfd.c              |  13 +-
> >  include/linux/userfaultfd_k.h |   5 +-
> >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> >  3 files changed, 312 insertions(+), 98 deletions(-)
> >
> ...
>
> > +
> > +static __always_inline
> > +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
> > +                                              unsigned long addr)
> > +{
> > +     struct vm_area_struct *vma;
> > +
> > +     mmap_assert_locked(mm);
> > +     vma = vma_lookup(mm, addr);
> > +     if (!vma)
> > +             vma = ERR_PTR(-ENOENT);
> > +     else if (!(vma->vm_flags & VM_SHARED) && anon_vma_prepare(vma))
> > +             vma = ERR_PTR(-ENOMEM);
>
> Nit: I just noticed that the code below says anon_vma_prepare() is unlikely.
>
Thanks for catching this. I'll add it in next version.
> ...
>
> > +static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
> > +                                                    unsigned long dst_start,
> > +                                                    unsigned long len)
> > +{
> > +     struct vm_area_struct *dst_vma;
> > +     int err;
> > +
> > +     mmap_read_lock(dst_mm);
> > +     dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
> > +     if (IS_ERR(dst_vma)) {
> > +             err = PTR_ERR(dst_vma);
>
> It's sort of odd you decode then re-encode this error, but it's correct
> the way you have it written.  You could just encode ENOENT instead?

Thanks. It was an oversight. I'll fix it.
>
> > +             goto out_unlock;
> > +     }
> > +
> > +     if (validate_dst_vma(dst_vma, dst_start + len))
> > +             return dst_vma;
> > +
> > +     err = -ENOENT;
> > +out_unlock:
> > +     mmap_read_unlock(dst_mm);
> > +     return ERR_PTR(err);
> >  }
> > +#endif
> >
> ...
>
> > +static __always_inline
> > +long find_vmas_mm_locked(struct mm_struct *mm,
>
> int would probably do?
> > +                      unsigned long dst_start,
> > +                      unsigned long src_start,
> > +                      struct vm_area_struct **dst_vmap,
> > +                      struct vm_area_struct **src_vmap)
> > +{
> > +     struct vm_area_struct *vma;
> > +
> > +     mmap_assert_locked(mm);
> > +     vma = find_vma_and_prepare_anon(mm, dst_start);
> > +     if (IS_ERR(vma))
> > +             return PTR_ERR(vma);
> > +
> > +     *dst_vmap = vma;
> > +     /* Skip finding src_vma if src_start is in dst_vma */
> > +     if (src_start >= vma->vm_start && src_start < vma->vm_end)
> > +             goto out_success;
> > +
> > +     vma = vma_lookup(mm, src_start);
> > +     if (!vma)
> > +             return -ENOENT;
> > +out_success:
> > +     *src_vmap = vma;
> > +     return 0;
> > +}
> > +
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static long find_and_lock_vmas(struct mm_struct *mm,
>
> This could also be an int return type, I must be missing something?

If you look at ERR_PTR() etc. macros, they all use 'long' for
conversions. Also, this file uses long/ssize_t/int at different
places. So I went in favor of long. I'm sure int would work just fine
too. Let me know if you want me to change it to int.
>
> ...
>
> > +     *src_vmap = lock_vma_under_rcu(mm, src_start);
> > +     if (likely(*src_vmap))
> > +             return 0;
> > +
> > +     /* Undo any locking and retry in mmap_lock critical section */
> > +     vma_end_read(*dst_vmap);
> > +
> > +     mmap_read_lock(mm);
> > +     err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
> > +     if (!err) {
> > +             /*
> > +              * See comment in lock_vma() as to why not using
> > +              * vma_start_read() here.
> > +              */
> > +             down_read(&(*dst_vmap)->vm_lock->lock);
> > +             if (*dst_vmap != *src_vmap)
> > +                     down_read(&(*src_vmap)->vm_lock->lock);
> > +     }
> > +     mmap_read_unlock(mm);
> > +     return err;
> > +}
> > +#else
> > +static long lock_mm_and_find_vmas(struct mm_struct *mm,
> > +                               unsigned long dst_start,
> > +                               unsigned long src_start,
> > +                               struct vm_area_struct **dst_vmap,
> > +                               struct vm_area_struct **src_vmap)
> > +{
> > +     long err;
> > +
> > +     mmap_read_lock(mm);
> > +     err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
> > +     if (err)
> > +             mmap_read_unlock(mm);
> > +     return err;
> >  }
> > +#endif
>
> This section is much easier to understand.  Thanks.

I'm glad finally the patch is easier to follow. Thanks so much for
your prompt reviews.
>
> Thanks,
> Liam
Liam R. Howlett Feb. 13, 2024, 5:06 p.m. UTC | #3
* Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > All userfaultfd operations, except write-protect, opportunistically use
> > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > critical section.
> > >
> > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > vmas.
> > >
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > >  fs/userfaultfd.c              |  13 +-
> > >  include/linux/userfaultfd_k.h |   5 +-
> > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > >
> > ...

I just remembered an issue with the mmap tree that exists today that you
needs to be accounted for in this change.

If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
today.

This is a necessity to avoid a race of removal/replacement of a VMA in
the mmap(MAP_FIXED) case.  In this case, we munmap() prior to mmap()'ing
an area - which means you could see a NULL when there never should have
been a null.

Although this would be exceedingly rare, you need to handle this case.

Sorry I missed this earlier,
Liam
Lokesh Gidra Feb. 13, 2024, 6:14 p.m. UTC | #4
On Tue, Feb 13, 2024 at 9:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> > On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > critical section.
> > > >
> > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > vmas.
> > > >
> > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > ---
> > > >  fs/userfaultfd.c              |  13 +-
> > > >  include/linux/userfaultfd_k.h |   5 +-
> > > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > > >
> > > ...
>
> I just remembered an issue with the mmap tree that exists today that you
> needs to be accounted for in this change.
>
> If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
> today.

Unless I'm missing something, isn't that already handled in the patch?
We get the VMA outside mmap_lock critical section only via
lock_vma_under_rcu() (in lock_vma() and find_and_lock_vmas()) and in
both cases if we get NULL in return, we retry in mmap_lock critical
section with vma_lookup(). Wouldn't that suffice?
>
> This is a necessity to avoid a race of removal/replacement of a VMA in
> the mmap(MAP_FIXED) case.  In this case, we munmap() prior to mmap()'ing
> an area - which means you could see a NULL when there never should have
> been a null.
>
> Although this would be exceedingly rare, you need to handle this case.
>
> Sorry I missed this earlier,
> Liam
Suren Baghdasaryan Feb. 13, 2024, 6:25 p.m. UTC | #5
On Tue, Feb 13, 2024 at 10:14 AM Lokesh Gidra <lokeshgidra@google.com> wrote:
>
> On Tue, Feb 13, 2024 at 9:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> > > On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > > critical section.
> > > > >
> > > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > > vmas.
> > > > >
> > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > ---
> > > > >  fs/userfaultfd.c              |  13 +-
> > > > >  include/linux/userfaultfd_k.h |   5 +-
> > > > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > > > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > > > >
> > > > ...
> >
> > I just remembered an issue with the mmap tree that exists today that you
> > needs to be accounted for in this change.
> >
> > If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
> > today.
>
> Unless I'm missing something, isn't that already handled in the patch?
> We get the VMA outside mmap_lock critical section only via
> lock_vma_under_rcu() (in lock_vma() and find_and_lock_vmas()) and in
> both cases if we get NULL in return, we retry in mmap_lock critical
> section with vma_lookup(). Wouldn't that suffice?

I think that case is handled correctly by lock_vma().

Sorry for coming back a bit late. The overall patch looks quite good
but the all these #ifdef CONFIG_PER_VMA_LOCK seem unnecessary to me.
Why find_and_lock_vmas() and lock_mm_and_find_vmas() be called the
same name (find_and_lock_vmas()) and in one case it would lock only
the VMA and in the other case it takes mmap_lock? Similarly
unlock_vma() would in one case unlock the VMA and in the other drop
the mmap_lock? That would remove all these #ifdefs from the code.
Maybe this was already discussed?

> >
> > This is a necessity to avoid a race of removal/replacement of a VMA in
> > the mmap(MAP_FIXED) case.  In this case, we munmap() prior to mmap()'ing
> > an area - which means you could see a NULL when there never should have
> > been a null.
> >
> > Although this would be exceedingly rare, you need to handle this case.
> >
> > Sorry I missed this earlier,
> > Liam
Liam R. Howlett Feb. 13, 2024, 6:49 p.m. UTC | #6
* Suren Baghdasaryan <surenb@google.com> [240213 13:25]:
> On Tue, Feb 13, 2024 at 10:14 AM Lokesh Gidra <lokeshgidra@google.com> wrote:
> >
> > On Tue, Feb 13, 2024 at 9:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> > > > On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > >
> > > > > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > > > critical section.
> > > > > >
> > > > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > > > vmas.
> > > > > >
> > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > > ---
> > > > > >  fs/userfaultfd.c              |  13 +-
> > > > > >  include/linux/userfaultfd_k.h |   5 +-
> > > > > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > > > > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > > > > >
> > > > > ...
> > >
> > > I just remembered an issue with the mmap tree that exists today that you
> > > needs to be accounted for in this change.
> > >
> > > If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
> > > today.
> >
> > Unless I'm missing something, isn't that already handled in the patch?
> > We get the VMA outside mmap_lock critical section only via
> > lock_vma_under_rcu() (in lock_vma() and find_and_lock_vmas()) and in
> > both cases if we get NULL in return, we retry in mmap_lock critical
> > section with vma_lookup(). Wouldn't that suffice?
> 
> I think that case is handled correctly by lock_vma().

Yeah, it looks good.  I had a bit of a panic as I forgot to check that
and I was thinking of a previous version.  I rechecked and v5 looks
good.

> 
> Sorry for coming back a bit late. The overall patch looks quite good
> but the all these #ifdef CONFIG_PER_VMA_LOCK seem unnecessary to me.
> Why find_and_lock_vmas() and lock_mm_and_find_vmas() be called the
> same name (find_and_lock_vmas()) and in one case it would lock only
> the VMA and in the other case it takes mmap_lock? Similarly
> unlock_vma() would in one case unlock the VMA and in the other drop
> the mmap_lock? That would remove all these #ifdefs from the code.
> Maybe this was already discussed?

Yes, I don't think we should be locking the mm in lock_vma(), as it
makes things hard to follow.

We could use something like uffd_prepare(), uffd_complete() but I
thought of those names rather late in the cycle, but I've already caused
many iterations of this patch set and that clean up didn't seem as vital
as simplicity and clarity of the locking code.

Thanks,
Liam
Suren Baghdasaryan Feb. 13, 2024, 6:56 p.m. UTC | #7
On Tue, Feb 13, 2024 at 10:49 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [240213 13:25]:
> > On Tue, Feb 13, 2024 at 10:14 AM Lokesh Gidra <lokeshgidra@google.com> wrote:
> > >
> > > On Tue, Feb 13, 2024 at 9:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > >
> > > > * Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> > > > > On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > > >
> > > > > > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > > > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > > > > critical section.
> > > > > > >
> > > > > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > > > > vmas.
> > > > > > >
> > > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > > > ---
> > > > > > >  fs/userfaultfd.c              |  13 +-
> > > > > > >  include/linux/userfaultfd_k.h |   5 +-
> > > > > > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > > > > > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > > > > > >
> > > > > > ...
> > > >
> > > > I just remembered an issue with the mmap tree that exists today that you
> > > > needs to be accounted for in this change.
> > > >
> > > > If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
> > > > today.
> > >
> > > Unless I'm missing something, isn't that already handled in the patch?
> > > We get the VMA outside mmap_lock critical section only via
> > > lock_vma_under_rcu() (in lock_vma() and find_and_lock_vmas()) and in
> > > both cases if we get NULL in return, we retry in mmap_lock critical
> > > section with vma_lookup(). Wouldn't that suffice?
> >
> > I think that case is handled correctly by lock_vma().
>
> Yeah, it looks good.  I had a bit of a panic as I forgot to check that
> and I was thinking of a previous version.  I rechecked and v5 looks
> good.
>
> >
> > Sorry for coming back a bit late. The overall patch looks quite good
> > but the all these #ifdef CONFIG_PER_VMA_LOCK seem unnecessary to me.
> > Why find_and_lock_vmas() and lock_mm_and_find_vmas() be called the
> > same name (find_and_lock_vmas()) and in one case it would lock only
> > the VMA and in the other case it takes mmap_lock? Similarly
> > unlock_vma() would in one case unlock the VMA and in the other drop
> > the mmap_lock? That would remove all these #ifdefs from the code.
> > Maybe this was already discussed?
>
> Yes, I don't think we should be locking the mm in lock_vma(), as it
> makes things hard to follow.
>
> We could use something like uffd_prepare(), uffd_complete() but I
> thought of those names rather late in the cycle, but I've already caused
> many iterations of this patch set and that clean up didn't seem as vital
> as simplicity and clarity of the locking code.

Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
better I'm fine with it but all these #ifdef's sprinkled around don't
contribute to the readability.
Anyway, I don't see this as a blocker, just nice to have.

>
> Thanks,
> Liam
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Lokesh Gidra Feb. 13, 2024, 7:18 p.m. UTC | #8
On Tue, Feb 13, 2024 at 10:57 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Feb 13, 2024 at 10:49 AM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Suren Baghdasaryan <surenb@google.com> [240213 13:25]:
> > > On Tue, Feb 13, 2024 at 10:14 AM Lokesh Gidra <lokeshgidra@google.com> wrote:
> > > >
> > > > On Tue, Feb 13, 2024 at 9:06 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > >
> > > > > * Lokesh Gidra <lokeshgidra@google.com> [240213 06:25]:
> > > > > > On Mon, Feb 12, 2024 at 7:33 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > > > > >
> > > > > > > * Lokesh Gidra <lokeshgidra@google.com> [240212 19:19]:
> > > > > > > > All userfaultfd operations, except write-protect, opportunistically use
> > > > > > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock
> > > > > > > > critical section.
> > > > > > > >
> > > > > > > > Write-protect operation requires mmap_lock as it iterates over multiple
> > > > > > > > vmas.
> > > > > > > >
> > > > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > > > > > ---
> > > > > > > >  fs/userfaultfd.c              |  13 +-
> > > > > > > >  include/linux/userfaultfd_k.h |   5 +-
> > > > > > > >  mm/userfaultfd.c              | 392 ++++++++++++++++++++++++++--------
> > > > > > > >  3 files changed, 312 insertions(+), 98 deletions(-)
> > > > > > > >
> > > > > > > ...
> > > > >
> > > > > I just remembered an issue with the mmap tree that exists today that you
> > > > > needs to be accounted for in this change.
> > > > >
> > > > > If you hit a NULL VMA, you need to fall back to the mmap_lock() scenario
> > > > > today.
> > > >
> > > > Unless I'm missing something, isn't that already handled in the patch?
> > > > We get the VMA outside mmap_lock critical section only via
> > > > lock_vma_under_rcu() (in lock_vma() and find_and_lock_vmas()) and in
> > > > both cases if we get NULL in return, we retry in mmap_lock critical
> > > > section with vma_lookup(). Wouldn't that suffice?
> > >
> > > I think that case is handled correctly by lock_vma().
> >
> > Yeah, it looks good.  I had a bit of a panic as I forgot to check that
> > and I was thinking of a previous version.  I rechecked and v5 looks
> > good.
> >
> > >
> > > Sorry for coming back a bit late. The overall patch looks quite good
> > > but the all these #ifdef CONFIG_PER_VMA_LOCK seem unnecessary to me.
> > > Why find_and_lock_vmas() and lock_mm_and_find_vmas() be called the
> > > same name (find_and_lock_vmas()) and in one case it would lock only
> > > the VMA and in the other case it takes mmap_lock? Similarly
> > > unlock_vma() would in one case unlock the VMA and in the other drop
> > > the mmap_lock? That would remove all these #ifdefs from the code.
> > > Maybe this was already discussed?
> >
> > Yes, I don't think we should be locking the mm in lock_vma(), as it
> > makes things hard to follow.
> >
> > We could use something like uffd_prepare(), uffd_complete() but I
> > thought of those names rather late in the cycle, but I've already caused
> > many iterations of this patch set and that clean up didn't seem as vital
> > as simplicity and clarity of the locking code.

I anyway have to send another version to fix the error handling that
you reported earlier. I can take care of this in that version.

mfill_atomic...() functions (annoyingly) have to sometimes unlock and
relock. Using prepare/complete in that context seems incompatible.

>
> Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
> better I'm fine with it but all these #ifdef's sprinkled around don't
> contribute to the readability.

I'll wait for an agreement on this because I too don't like using so
many ifdef's either.

Since these functions are supposed to have prototype depending on
mfill/move, how about the following names:

uffd_lock_mfill_vma()/uffd_unlock_mfill_vma()
uffd_lock_move_vmas()/uffd_unlock_move_vmas()

Of course, I'm open to other suggestions as well.

> Anyway, I don't see this as a blocker, just nice to have.
>
> >
> > Thanks,
> > Liam
> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >
Liam R. Howlett Feb. 13, 2024, 7:21 p.m. UTC | #9
* Suren Baghdasaryan <surenb@google.com> [240213 13:57]:
...

> >
> > Yes, I don't think we should be locking the mm in lock_vma(), as it
> > makes things hard to follow.
> >
> > We could use something like uffd_prepare(), uffd_complete() but I
> > thought of those names rather late in the cycle, but I've already caused
> > many iterations of this patch set and that clean up didn't seem as vital
> > as simplicity and clarity of the locking code.
> 
> Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
> better I'm fine with it but all these #ifdef's sprinkled around don't
> contribute to the readability.

The issue I have is the vma in the name - we're not doing anything to
the vma when we mmap_lock.

> Anyway, I don't see this as a blocker, just nice to have.

Yes, that's how I see it as well.
Liam R. Howlett Feb. 13, 2024, 7:27 p.m. UTC | #10
* Lokesh Gidra <lokeshgidra@google.com> [240213 14:18]:
...

> > > We could use something like uffd_prepare(), uffd_complete() but I
> > > thought of those names rather late in the cycle, but I've already caused
> > > many iterations of this patch set and that clean up didn't seem as vital
> > > as simplicity and clarity of the locking code.
> 
> I anyway have to send another version to fix the error handling that
> you reported earlier. I can take care of this in that version.
> 
> mfill_atomic...() functions (annoyingly) have to sometimes unlock and
> relock. Using prepare/complete in that context seems incompatible.
> 
> >
> > Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
> > better I'm fine with it but all these #ifdef's sprinkled around don't
> > contribute to the readability.
> 
> I'll wait for an agreement on this because I too don't like using so
> many ifdef's either.
> 
> Since these functions are supposed to have prototype depending on
> mfill/move, how about the following names:
> 
> uffd_lock_mfill_vma()/uffd_unlock_mfill_vma()
> uffd_lock_move_vmas()/uffd_unlock_move_vmas()
> 
> Of course, I'm open to other suggestions as well.
> 

I'm happy with those if you remove the vma/vmas from the name.
Suren Baghdasaryan Feb. 13, 2024, 7:30 p.m. UTC | #11
On Tue, Feb 13, 2024 at 11:27 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240213 14:18]:
> ...
>
> > > > We could use something like uffd_prepare(), uffd_complete() but I
> > > > thought of those names rather late in the cycle, but I've already caused
> > > > many iterations of this patch set and that clean up didn't seem as vital
> > > > as simplicity and clarity of the locking code.
> >
> > I anyway have to send another version to fix the error handling that
> > you reported earlier. I can take care of this in that version.
> >
> > mfill_atomic...() functions (annoyingly) have to sometimes unlock and
> > relock. Using prepare/complete in that context seems incompatible.
> >
> > >
> > > Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
> > > better I'm fine with it but all these #ifdef's sprinkled around don't
> > > contribute to the readability.
> >
> > I'll wait for an agreement on this because I too don't like using so
> > many ifdef's either.
> >
> > Since these functions are supposed to have prototype depending on
> > mfill/move, how about the following names:
> >
> > uffd_lock_mfill_vma()/uffd_unlock_mfill_vma()
> > uffd_lock_move_vmas()/uffd_unlock_move_vmas()
> >
> > Of course, I'm open to other suggestions as well.
> >
>
> I'm happy with those if you remove the vma/vmas from the name.

Sounds good to me.

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Lokesh Gidra Feb. 13, 2024, 7:36 p.m. UTC | #12
On Tue, Feb 13, 2024 at 11:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Feb 13, 2024 at 11:27 AM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Lokesh Gidra <lokeshgidra@google.com> [240213 14:18]:
> > ...
> >
> > > > > We could use something like uffd_prepare(), uffd_complete() but I
> > > > > thought of those names rather late in the cycle, but I've already caused
> > > > > many iterations of this patch set and that clean up didn't seem as vital
> > > > > as simplicity and clarity of the locking code.
> > >
> > > I anyway have to send another version to fix the error handling that
> > > you reported earlier. I can take care of this in that version.
> > >
> > > mfill_atomic...() functions (annoyingly) have to sometimes unlock and
> > > relock. Using prepare/complete in that context seems incompatible.
> > >
> > > >
> > > > Maybe lock_vma_for_uffd()/unlock_vma_for_uffd()? Whatever name is
> > > > better I'm fine with it but all these #ifdef's sprinkled around don't
> > > > contribute to the readability.
> > >
> > > I'll wait for an agreement on this because I too don't like using so
> > > many ifdef's either.
> > >
> > > Since these functions are supposed to have prototype depending on
> > > mfill/move, how about the following names:
> > >
> > > uffd_lock_mfill_vma()/uffd_unlock_mfill_vma()
> > > uffd_lock_move_vmas()/uffd_unlock_move_vmas()
> > >
> > > Of course, I'm open to other suggestions as well.
> > >
> >
> > I'm happy with those if you remove the vma/vmas from the name.
>
> Sounds good to me.
>
Sure. I'll do that:

Asking to avoid any more iterations: these functions should call the
currently defined ones or should replace them. For instance, should I
do the following:

#ifdef CONFIG_PER_VMA_LOCK
... uffd_mfill_lock()
{
        return find_and_lock_dst_vma(...);
}
#else
...uffd_mfill_lock()
{
       return lock_mm_and_find_dst_vma(...);
}
#endif

or have the function replace
find_and_lock_dst_vma()/lock_mm_and_find_dst_vma() ?

> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >
Liam R. Howlett Feb. 13, 2024, 7:51 p.m. UTC | #13
* Lokesh Gidra <lokeshgidra@google.com> [240213 14:37]:
...

> Asking to avoid any more iterations: these functions should call the
> currently defined ones or should replace them. For instance, should I
> do the following:
> 
> #ifdef CONFIG_PER_VMA_LOCK
> ... uffd_mfill_lock()
> {
>         return find_and_lock_dst_vma(...);
> }
> #else
> ...uffd_mfill_lock()
> {
>        return lock_mm_and_find_dst_vma(...);
> }
> #endif
> 
> or have the function replace
> find_and_lock_dst_vma()/lock_mm_and_find_dst_vma() ?

Since the two have the same prototype, then you can replace the function
names directly.

The other side should take the vma and use vma->vm_mm to get the mm to
unlock the mmap_lock in the !CONFIG_PER_VMA_LOCK.  That way those
prototypes also match and can use the same names directly.

move_pages() requires unlocking two VMAs or one, so pass both VMAs
through and do the check in there.  This, unfortunately means that one
of the VMAs will not be used in the !CONFIG_PER_VMA_LOCK case.  You
could add an assert to ensure src_vma is locked prior to using dst_vma
to unlock the mmap_lock(), to avoid potential bot emails.

Thanks,
Liam
Lokesh Gidra Feb. 13, 2024, 7:55 p.m. UTC | #14
On Tue, Feb 13, 2024 at 11:51 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lokesh Gidra <lokeshgidra@google.com> [240213 14:37]:
> ...
>
> > Asking to avoid any more iterations: these functions should call the
> > currently defined ones or should replace them. For instance, should I
> > do the following:
> >
> > #ifdef CONFIG_PER_VMA_LOCK
> > ... uffd_mfill_lock()
> > {
> >         return find_and_lock_dst_vma(...);
> > }
> > #else
> > ...uffd_mfill_lock()
> > {
> >        return lock_mm_and_find_dst_vma(...);
> > }
> > #endif
> >
> > or have the function replace
> > find_and_lock_dst_vma()/lock_mm_and_find_dst_vma() ?
>
> Since the two have the same prototype, then you can replace the function
> names directly.
>
> The other side should take the vma and use vma->vm_mm to get the mm to
> unlock the mmap_lock in the !CONFIG_PER_VMA_LOCK.  That way those
> prototypes also match and can use the same names directly.
>
> move_pages() requires unlocking two VMAs or one, so pass both VMAs
> through and do the check in there.  This, unfortunately means that one
> of the VMAs will not be used in the !CONFIG_PER_VMA_LOCK case.  You
> could add an assert to ensure src_vma is locked prior to using dst_vma
> to unlock the mmap_lock(), to avoid potential bot emails.
>
Perfect. Will do that and address the other comments you had on v5 as well.

Regarding int vs long for 'err' type, I'm assuming you are ok with my
explanation and I should keep long?

> Thanks,
> Liam
Liam R. Howlett Feb. 13, 2024, 8:40 p.m. UTC | #15
* Lokesh Gidra <lokeshgidra@google.com> [240213 14:55]:
...

> 
> Regarding int vs long for 'err' type, I'm assuming you are ok with my
> explanation and I should keep long?

For such things, I check for consensus in existing code:
mm/mmap.c - int is used when not returning an address or struct
fs/userfaultfd.c - decoded to int before returning

I'd use int to keep consistent with fs/userfaultfd.c

Thanks,
Liam
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c00a021bcce4..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,17 +2005,8 @@  static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-
-		/* Re-check after taking map_changing_lock */
-		down_read(&ctx->map_changing_lock);
-		if (likely(!atomic_read(&ctx->mmap_changing)))
-			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
-					 uffdio_move.len, uffdio_move.mode);
-		else
-			ret = -EAGAIN;
-		up_read(&ctx->map_changing_lock);
-		mmap_read_unlock(mm);
+		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
 		mmput(mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3210c3552976..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -138,9 +138,8 @@  extern long uffd_wp_range(struct vm_area_struct *vma,
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 flags);
 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
 			struct vm_area_struct *dst_vma,
 			struct vm_area_struct *src_vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 74aad0831e40..eb7ff220f315 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,19 +20,11 @@ 
 #include "internal.h"
 
 static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
-				    unsigned long dst_start,
-				    unsigned long len)
+bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
 {
-	/*
-	 * Make sure that the dst range is both valid and fully within a
-	 * single existing vma.
-	 */
-	struct vm_area_struct *dst_vma;
-
-	dst_vma = find_vma(dst_mm, dst_start);
-	if (!range_in_vma(dst_vma, dst_start, dst_start + len))
-		return NULL;
+	/* Make sure that the dst range is fully within dst_vma. */
+	if (dst_end > dst_vma->vm_end)
+		return false;
 
 	/*
 	 * Check the vma is registered in uffd, this is required to
@@ -40,10 +32,118 @@  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 	 * time.
 	 */
 	if (!dst_vma->vm_userfaultfd_ctx.ctx)
-		return NULL;
+		return false;
+
+	return true;
+}
+
+static __always_inline
+struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
+						 unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	mmap_assert_locked(mm);
+	vma = vma_lookup(mm, addr);
+	if (!vma)
+		vma = ERR_PTR(-ENOENT);
+	else if (!(vma->vm_flags & VM_SHARED) && anon_vma_prepare(vma))
+		vma = ERR_PTR(-ENOMEM);
+
+	return vma;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ *
+ * Should be called without holding mmap_lock. vma should be unlocked after use
+ * with unlock_vma().
+ *
+ * Return: A locked vma containing @address, -ENOENT if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct vm_area_struct *vma;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (vma) {
+		/*
+		 * lock_vma_under_rcu() only checks anon_vma for private
+		 * anonymous mappings. But we need to ensure it is assigned in
+		 * private file-backed vmas as well.
+		 */
+		if (!(vma->vm_flags & VM_SHARED) && !vma->anon_vma)
+			vma_end_read(vma);
+		else
+			return vma;
+	}
+
+	mmap_read_lock(mm);
+	vma = find_vma_and_prepare_anon(mm, address);
+	if (!IS_ERR(vma)) {
+		/*
+		 * We cannot use vma_start_read() as it may fail due to
+		 * false locked (see comment in vma_start_read()). We
+		 * can avoid that by directly locking vm_lock under
+		 * mmap_lock, which guarantees that nobody can lock the
+		 * vma for write (vma_start_write()) under us.
+		 */
+		down_read(&vma->vm_lock->lock);
+	}
+
+	mmap_read_unlock(mm);
+	return vma;
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+	vma_end_read(vma);
+}
+
+static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm,
+						    unsigned long dst_start,
+						    unsigned long len)
+{
+	struct vm_area_struct *dst_vma;
 
-	return dst_vma;
+	dst_vma = lock_vma(dst_mm, dst_start);
+	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
+		return dst_vma;
+
+	unlock_vma(dst_vma);
+	return ERR_PTR(-ENOENT);
+}
+
+#else
+
+static struct vm_area_struct *lock_mm_and_find_dst_vma(struct mm_struct *dst_mm,
+						       unsigned long dst_start,
+						       unsigned long len)
+{
+	struct vm_area_struct *dst_vma;
+	int err;
+
+	mmap_read_lock(dst_mm);
+	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
+	if (IS_ERR(dst_vma)) {
+		err = PTR_ERR(dst_vma);
+		goto out_unlock;
+	}
+
+	if (validate_dst_vma(dst_vma, dst_start + len))
+		return dst_vma;
+
+	err = -ENOENT;
+out_unlock:
+	mmap_read_unlock(dst_mm);
+	return ERR_PTR(err);
 }
+#endif
 
 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
 static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
@@ -350,7 +450,8 @@  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
 					      struct userfaultfd_ctx *ctx,
@@ -361,7 +462,6 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 					      uffd_flags_t flags)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
-	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
 	unsigned long src_addr, dst_addr;
@@ -380,7 +480,11 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
 		up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+		unlock_vma(dst_vma);
+#else
 		mmap_read_unlock(dst_mm);
+#endif
 		return -EINVAL;
 	}
 
@@ -403,24 +507,32 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 	 * retry, dst_vma will be set to NULL and we must lookup again.
 	 */
 	if (!dst_vma) {
+#ifdef CONFIG_PER_VMA_LOCK
+		dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
+#else
+		dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
+#endif
+		if (IS_ERR(dst_vma)) {
+			err = PTR_ERR(dst_vma);
+			goto out;
+		}
+
 		err = -ENOENT;
-		dst_vma = find_dst_vma(dst_mm, dst_start, len);
-		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
-			goto out_unlock;
+		if (!is_vm_hugetlb_page(dst_vma))
+			goto out_unlock_vma;
 
 		err = -EINVAL;
 		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
-			goto out_unlock;
-
-		vm_shared = dst_vma->vm_flags & VM_SHARED;
-	}
+			goto out_unlock_vma;
 
-	/*
-	 * If not shared, ensure the dst_vma has a anon_vma.
-	 */
-	err = -ENOMEM;
-	if (!vm_shared) {
-		if (unlikely(anon_vma_prepare(dst_vma)))
+		/*
+		 * If memory mappings are changing because of non-cooperative
+		 * operation (e.g. mremap) running in parallel, bail out and
+		 * request the user to retry later
+		 */
+		down_read(&ctx->map_changing_lock);
+		err = -EAGAIN;
+		if (atomic_read(&ctx->mmap_changing))
 			goto out_unlock;
 	}
 
@@ -465,7 +577,11 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 		if (unlikely(err == -ENOENT)) {
 			up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+			unlock_vma(dst_vma);
+#else
 			mmap_read_unlock(dst_mm);
+#endif
 			BUG_ON(!folio);
 
 			err = copy_folio_from_user(folio,
@@ -474,17 +590,6 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 				err = -EFAULT;
 				goto out;
 			}
-			mmap_read_lock(dst_mm);
-			down_read(&ctx->map_changing_lock);
-			/*
-			 * If memory mappings are changing because of non-cooperative
-			 * operation (e.g. mremap) running in parallel, bail out and
-			 * request the user to retry later
-			 */
-			if (atomic_read(&ctx->mmap_changing)) {
-				err = -EAGAIN;
-				break;
-			}
 
 			dst_vma = NULL;
 			goto retry;
@@ -505,7 +610,12 @@  static __always_inline ssize_t mfill_atomic_hugetlb(
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(dst_vma);
+#else
 	mmap_read_unlock(dst_mm);
+#endif
 out:
 	if (folio)
 		folio_put(folio);
@@ -597,7 +707,19 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	copied = 0;
 	folio = NULL;
 retry:
-	mmap_read_lock(dst_mm);
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+#ifdef CONFIG_PER_VMA_LOCK
+	dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len);
+#else
+	dst_vma = lock_mm_and_find_dst_vma(dst_mm, dst_start, len);
+#endif
+	if (IS_ERR(dst_vma)) {
+		err = PTR_ERR(dst_vma);
+		goto out;
+	}
 
 	/*
 	 * If memory mappings are changing because of non-cooperative
@@ -609,15 +731,6 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
-	/*
-	 * Make sure the vma is not shared, that the dst range is
-	 * both valid and fully within a single existing vma.
-	 */
-	err = -ENOENT;
-	dst_vma = find_dst_vma(dst_mm, dst_start, len);
-	if (!dst_vma)
-		goto out_unlock;
-
 	err = -EINVAL;
 	/*
 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -647,16 +760,6 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
-	/*
-	 * Ensure the dst_vma has a anon_vma or this page
-	 * would get a NULL anon_vma when moved in the
-	 * dst_vma.
-	 */
-	err = -ENOMEM;
-	if (!(dst_vma->vm_flags & VM_SHARED) &&
-	    unlikely(anon_vma_prepare(dst_vma)))
-		goto out_unlock;
-
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
 
@@ -699,7 +802,11 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			void *kaddr;
 
 			up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+			unlock_vma(dst_vma);
+#else
 			mmap_read_unlock(dst_mm);
+#endif
 			BUG_ON(!folio);
 
 			kaddr = kmap_local_folio(folio, 0);
@@ -730,7 +837,11 @@  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 out_unlock:
 	up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(dst_vma);
+#else
 	mmap_read_unlock(dst_mm);
+#endif
 out:
 	if (folio)
 		folio_put(folio);
@@ -1267,27 +1378,119 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
 	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
 		return -EINVAL;
 
+	return 0;
+}
+
+static __always_inline
+long find_vmas_mm_locked(struct mm_struct *mm,
+			 unsigned long dst_start,
+			 unsigned long src_start,
+			 struct vm_area_struct **dst_vmap,
+			 struct vm_area_struct **src_vmap)
+{
+	struct vm_area_struct *vma;
+
+	mmap_assert_locked(mm);
+	vma = find_vma_and_prepare_anon(mm, dst_start);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
+
+	*dst_vmap = vma;
+	/* Skip finding src_vma if src_start is in dst_vma */
+	if (src_start >= vma->vm_start && src_start < vma->vm_end)
+		goto out_success;
+
+	vma = vma_lookup(mm, src_start);
+	if (!vma)
+		return -ENOENT;
+out_success:
+	*src_vmap = vma;
+	return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static long find_and_lock_vmas(struct mm_struct *mm,
+			       unsigned long dst_start,
+			       unsigned long src_start,
+			       struct vm_area_struct **dst_vmap,
+			       struct vm_area_struct **src_vmap)
+{
+	struct vm_area_struct *vma;
+	long err;
+
+	vma = lock_vma(mm, dst_start);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
+
+	*dst_vmap = vma;
 	/*
-	 * Ensure the dst_vma has a anon_vma or this page
-	 * would get a NULL anon_vma when moved in the
-	 * dst_vma.
+	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
+	 * that we don't lock the same vma twice.
 	 */
-	if (unlikely(anon_vma_prepare(dst_vma)))
-		return -ENOMEM;
+	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
+		*src_vmap = vma;
+		return 0;
+	}
 
-	return 0;
+	/*
+	 * Using lock_vma() to get src_vma can lead to following deadlock:
+	 *
+	 * Thread1				Thread2
+	 * -------				-------
+	 * vma_start_read(dst_vma)
+	 *					mmap_write_lock(mm)
+	 *					vma_start_write(src_vma)
+	 * vma_start_read(src_vma)
+	 * mmap_read_lock(mm)
+	 *					vma_start_write(dst_vma)
+	 */
+	*src_vmap = lock_vma_under_rcu(mm, src_start);
+	if (likely(*src_vmap))
+		return 0;
+
+	/* Undo any locking and retry in mmap_lock critical section */
+	vma_end_read(*dst_vmap);
+
+	mmap_read_lock(mm);
+	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+	if (!err) {
+		/*
+		 * See comment in lock_vma() as to why not using
+		 * vma_start_read() here.
+		 */
+		down_read(&(*dst_vmap)->vm_lock->lock);
+		if (*dst_vmap != *src_vmap)
+			down_read(&(*src_vmap)->vm_lock->lock);
+	}
+	mmap_read_unlock(mm);
+	return err;
+}
+#else
+static long lock_mm_and_find_vmas(struct mm_struct *mm,
+				  unsigned long dst_start,
+				  unsigned long src_start,
+				  struct vm_area_struct **dst_vmap,
+				  struct vm_area_struct **src_vmap)
+{
+	long err;
+
+	mmap_read_lock(mm);
+	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+	if (err)
+		mmap_read_unlock(mm);
+	return err;
 }
+#endif
 
 /**
  * move_pages - move arbitrary anonymous pages of an existing vma
  * @ctx: pointer to the userfaultfd context
- * @mm: the address space to move pages
  * @dst_start: start of the destination virtual memory range
  * @src_start: start of the source virtual memory range
  * @len: length of the virtual memory range
  * @mode: flags from uffdio_move.mode
  *
- * Must be called with mmap_lock held for read.
+ * It will either use the mmap_lock in read mode or per-vma locks
  *
  * move_pages() remaps arbitrary anonymous pages atomically in zero
  * copy. It only works on non shared anonymous pages because those can
@@ -1355,10 +1558,10 @@  static int validate_move_areas(struct userfaultfd_ctx *ctx,
  * could be obtained. This is the only additional complexity added to
  * the rmap code to provide this anonymous page remapping functionality.
  */
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
-		   unsigned long dst_start, unsigned long src_start,
-		   unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+		   unsigned long src_start, unsigned long len, __u64 mode)
 {
+	struct mm_struct *mm = ctx->mm;
 	struct vm_area_struct *src_vma, *dst_vma;
 	unsigned long src_addr, dst_addr;
 	pmd_t *src_pmd, *dst_pmd;
@@ -1376,28 +1579,40 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 	    WARN_ON_ONCE(dst_start + len <= dst_start))
 		goto out;
 
+#ifdef CONFIG_PER_VMA_LOCK
+	err = find_and_lock_vmas(mm, dst_start, src_start,
+				 &dst_vma, &src_vma);
+#else
+	err = lock_mm_and_find_vmas(mm, dst_start, src_start,
+				    &dst_vma, &src_vma);
+#endif
+	if (err)
+		goto out;
+
+	/* Re-check after taking map_changing_lock */
+	err = -EAGAIN;
+	down_read(&ctx->map_changing_lock);
+	if (likely(atomic_read(&ctx->mmap_changing)))
+		goto out_unlock;
 	/*
 	 * Make sure the vma is not shared, that the src and dst remap
 	 * ranges are both valid and fully within a single existing
 	 * vma.
 	 */
-	src_vma = find_vma(mm, src_start);
-	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
-		goto out;
-	if (src_start < src_vma->vm_start ||
-	    src_start + len > src_vma->vm_end)
-		goto out;
+	err = -EINVAL;
+	if (src_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
+	if (src_start + len > src_vma->vm_end)
+		goto out_unlock;
 
-	dst_vma = find_vma(mm, dst_start);
-	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
-		goto out;
-	if (dst_start < dst_vma->vm_start ||
-	    dst_start + len > dst_vma->vm_end)
-		goto out;
+	if (dst_vma->vm_flags & VM_SHARED)
+		goto out_unlock;
+	if (dst_start + len > dst_vma->vm_end)
+		goto out_unlock;
 
 	err = validate_move_areas(ctx, src_vma, dst_vma);
 	if (err)
-		goto out;
+		goto out_unlock;
 
 	for (src_addr = src_start, dst_addr = dst_start;
 	     src_addr < src_start + len;) {
@@ -1514,6 +1729,15 @@  ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
 		moved += step_size;
 	}
 
+out_unlock:
+	up_read(&ctx->map_changing_lock);
+#ifdef CONFIG_PER_VMA_LOCK
+	unlock_vma(src_vma);
+	if (src_vma != dst_vma)
+		unlock_vma(dst_vma);
+#else
+	mmap_read_unlock(mm);
+#endif
 out:
 	VM_WARN_ON(moved < 0);
 	VM_WARN_ON(err > 0);