Message ID | 20240206010919.1109005-4-lokeshgidra@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | per-vma locks in userfaultfd | expand |
* Lokesh Gidra <lokeshgidra@google.com> [240205 20:10]: > All userfaultfd operations, except write-protect, opportunistically use > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > critical section. > > Write-protect operation requires mmap_lock as it iterates over multiple > vmas. > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> > --- > fs/userfaultfd.c | 13 +- > include/linux/mm.h | 16 +++ > include/linux/userfaultfd_k.h | 5 +- > mm/memory.c | 48 +++++++ > mm/userfaultfd.c | 242 +++++++++++++++++++++------------- > 5 files changed, 222 insertions(+), 102 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index c00a021bcce4..60dcfafdc11a 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, > return -EINVAL; > > if (mmget_not_zero(mm)) { > - mmap_read_lock(mm); > - > - /* Re-check after taking map_changing_lock */ > - down_read(&ctx->map_changing_lock); > - if (likely(!atomic_read(&ctx->mmap_changing))) > - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, > - uffdio_move.len, uffdio_move.mode); > - else > - ret = -EAGAIN; > - up_read(&ctx->map_changing_lock); > - mmap_read_unlock(mm); > + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, > + uffdio_move.len, uffdio_move.mode); > mmput(mm); > } else { > return -ESRCH; > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0d1f98ab0c72..e69dfe2edcce 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -753,6 +753,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > mmap_read_unlock(vmf->vma->vm_mm); > } > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > +{ > + vma_end_read(vma); > +} > + > static inline void assert_fault_locked(struct vm_fault *vmf) > { > if (vmf->flags & FAULT_FLAG_VMA_LOCK) > @@ -774,6 +779,9 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) > { mmap_assert_write_locked(vma->vm_mm); } > static inline void vma_mark_detached(struct vm_area_struct *vma, > bool detached) {} > +static inline void vma_acquire_read_lock(struct vm_area_struct *vma) { > + mmap_assert_locked(vma->vm_mm); > +} > > static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > unsigned long address) > @@ -786,6 +794,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > mmap_read_unlock(vmf->vma->vm_mm); > } > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > +{ > + mmap_read_unlock(mm); > +} > + Instead of passing two variables and only using one based on configuration of kernel build, why not use vma->vm_mm to mmap_read_unlock() and just pass the vma? It is odd to call unlock_vma() which maps to mmap_read_unlock(). Could we have this abstraction depend on CONFIG_PER_VMA_LOCK in uffd so that reading the code remains clear? You seem to have pretty much two versions of each function already. If you do that, then we can leave unlock_vma() undefined if !CONFIG_PER_VMA_LOCK. > static inline void assert_fault_locked(struct vm_fault *vmf) > { > mmap_assert_locked(vmf->vma->vm_mm); > @@ -794,6 +807,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf) > #endif /* CONFIG_PER_VMA_LOCK */ > > extern const struct vm_operations_struct vma_dummy_vm_ops; > +extern struct vm_area_struct *lock_vma(struct mm_struct *mm, > + unsigned long address, > + bool prepare_anon); > > /* > * WARNING: vma_init does not initialize vma->vm_lock. > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index 3210c3552976..05d59f74fc88 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, > /* move_pages */ > void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); > void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, > - unsigned long dst_start, unsigned long src_start, > - unsigned long len, __u64 flags); > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > + unsigned long src_start, unsigned long len, __u64 flags); > int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, > struct vm_area_struct *dst_vma, > struct vm_area_struct *src_vma, > diff --git a/mm/memory.c b/mm/memory.c > index b05fd28dbce1..393ab3b0d6f3 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -5760,8 +5760,56 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > count_vm_vma_lock_event(VMA_LOCK_ABORT); > return NULL; > } > + > +static void vma_acquire_read_lock(struct vm_area_struct *vma) > +{ > + /* > + * We cannot use vma_start_read() as it may fail due to false locked > + * (see comment in vma_start_read()). We can avoid that by directly > + * locking vm_lock under mmap_lock, which guarantees that nobody could > + * have locked the vma for write (vma_start_write()). > + */ > + mmap_assert_locked(vma->vm_mm); > + down_read(&vma->vm_lock->lock); > +} > #endif /* CONFIG_PER_VMA_LOCK */ > > +/* > + * lock_vma() - Lookup and lock VMA corresponding to @address. Missing arguments in the comment > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > + * > + * Should be called without holding mmap_lock. VMA should be unlocked after use > + * with unlock_vma(). > + * > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > + * -ENOMEM if anon_vma couldn't be allocated. > + */ > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > + unsigned long address, > + bool prepare_anon) > +{ > + struct vm_area_struct *vma; > + > + vma = lock_vma_under_rcu(mm, address); > + Nit: extra new line > + if (vma) > + return vma; > + > + mmap_read_lock(mm); > + vma = vma_lookup(mm, address); > + if (vma) { > + if (prepare_anon && vma_is_anonymous(vma) && > + anon_vma_prepare(vma)) > + vma = ERR_PTR(-ENOMEM); > + else > + vma_acquire_read_lock(vma); > + } > + > + if (IS_ENABLED(CONFIG_PER_VMA_LOCK) || !vma || PTR_ERR(vma) == -ENOMEM) > + mmap_read_unlock(mm); > + return vma; > +} > + It is also very odd that lock_vma() may, in fact, be locking the mm. It seems like there is a layer of abstraction missing here, where your code would either lock the vma or lock the mm - like you had before, but without the confusing semantics of unlocking with a flag. That is, we know what to do to unlock based on CONFIG_PER_VMA_LOCK, but it isn't always used. Maybe my comments were not clear on what I was thinking on the locking plan. I was thinking that, in the CONFIG_PER_VMA_LOCK case, you could have a lock_vma() which does the per-vma locking which you can use in your code. You could call lock_vma() in some uffd helper function that would do what is required (limit checking, etc) and return a locked vma. The counterpart of that would be another helper function that would do what was required under the mmap_read lock (limit check, etc). The unlocking would be entirely config dependant as you have today. Just write the few functions you have twice: once for per-vma lock support, once without it. Since we now can ensure the per-vma lock is taken in the per-vma lock path (or it failed), then you don't need to mmap_locked boolean you had in the previous version. You solved the unlock issue already, but it should be abstracted so uffd calls the underlying unlock vs vma_unlock() doing an mmap_read_unlock() - because that's very confusing to see. I'd drop the vma from the function names that lock the mm or the vma as well. Thanks, Liam
On Tue, Feb 6, 2024 at 2:09 AM Lokesh Gidra <lokeshgidra@google.com> wrote: > All userfaultfd operations, except write-protect, opportunistically use > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > critical section. > > Write-protect operation requires mmap_lock as it iterates over multiple > vmas. > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> [...] > diff --git a/mm/memory.c b/mm/memory.c > index b05fd28dbce1..393ab3b0d6f3 100644 > --- a/mm/memory.c > +++ b/mm/memory.c [...] > +/* > + * lock_vma() - Lookup and lock VMA corresponding to @address. > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > + * > + * Should be called without holding mmap_lock. VMA should be unlocked after use > + * with unlock_vma(). > + * > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > + * -ENOMEM if anon_vma couldn't be allocated. > + */ > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > + unsigned long address, > + bool prepare_anon) > +{ > + struct vm_area_struct *vma; > + > + vma = lock_vma_under_rcu(mm, address); > + > + if (vma) > + return vma; > + > + mmap_read_lock(mm); > + vma = vma_lookup(mm, address); > + if (vma) { > + if (prepare_anon && vma_is_anonymous(vma) && > + anon_vma_prepare(vma)) > + vma = ERR_PTR(-ENOMEM); > + else > + vma_acquire_read_lock(vma); This new code only calls anon_vma_prepare() for VMAs where vma_is_anonymous() is true (meaning they are private anonymous). [...] > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 74aad0831e40..64e22e467e4f 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -19,20 +19,25 @@ > #include <asm/tlb.h> > #include "internal.h" > > -static __always_inline > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, > - unsigned long dst_start, > - unsigned long len) > +/* Search for VMA and make sure it is valid. */ > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm, > + unsigned long dst_start, > + unsigned long len) > { > - /* > - * Make sure that the dst range is both valid and fully within a > - * single existing vma. > - */ > struct vm_area_struct *dst_vma; > > - dst_vma = find_vma(dst_mm, dst_start); > - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) > - return NULL; > + /* Ensure anon_vma is assigned for anonymous vma */ > + dst_vma = lock_vma(dst_mm, dst_start, true); lock_vma() is now used by find_and_lock_dst_vma(), which is used by mfill_atomic(). > + if (!dst_vma) > + return ERR_PTR(-ENOENT); > + > + if (PTR_ERR(dst_vma) == -ENOMEM) > + return dst_vma; > + > + /* Make sure that the dst range is fully within dst_vma. */ > + if (dst_start + len > dst_vma->vm_end) > + goto out_unlock; > > /* > * Check the vma is registered in uffd, this is required to [...] > @@ -597,7 +599,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > copied = 0; > folio = NULL; > retry: > - mmap_read_lock(dst_mm); > + /* > + * Make sure the vma is not shared, that the dst range is > + * both valid and fully within a single existing vma. > + */ > + dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len); > + if (IS_ERR(dst_vma)) { > + err = PTR_ERR(dst_vma); > + goto out; > + } > > /* > * If memory mappings are changing because of non-cooperative > @@ -609,15 +619,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > if (atomic_read(&ctx->mmap_changing)) > goto out_unlock; > > - /* > - * Make sure the vma is not shared, that the dst range is > - * both valid and fully within a single existing vma. > - */ > - err = -ENOENT; > - dst_vma = find_dst_vma(dst_mm, dst_start, len); > - if (!dst_vma) > - goto out_unlock; > - > err = -EINVAL; > /* > * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but > @@ -647,16 +648,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) > goto out_unlock; > > - /* > - * Ensure the dst_vma has a anon_vma or this page > - * would get a NULL anon_vma when moved in the > - * dst_vma. > - */ > - err = -ENOMEM; > - if (!(dst_vma->vm_flags & VM_SHARED) && > - unlikely(anon_vma_prepare(dst_vma))) > - goto out_unlock; But the check mfill_atomic() used to do was different, it checked for VM_SHARED. Each VMA has one of these three types: 1. shared (marked by VM_SHARED; does not have an anon_vma) 2. private file-backed (needs to have anon_vma when storing PTEs) 3. private anonymous (what vma_is_anonymous() detects; needs to have anon_vma when storing PTEs) This old code would call anon_vma_prepare() for both private VMA types (which is correct). The new code only calls anon_vma_prepare() for private anonymous VMAs, not for private file-backed ones. I think this code will probably crash with a BUG_ON() in __folio_set_anon() if you try to use userfaultfd to insert a PTE into a private file-backed VMA of a shmem file. (Which you should be able to get by creating a file in /dev/shm/ and then mapping that file with mmap(NULL, <size>, PROT_READ|PROT_WRITE, MAP_PRIVATE, <fd>, 0).)
On Tue, Feb 6, 2024 at 9:05 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Lokesh Gidra <lokeshgidra@google.com> [240205 20:10]: > > All userfaultfd operations, except write-protect, opportunistically use > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > > critical section. > > > > Write-protect operation requires mmap_lock as it iterates over multiple > > vmas. > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> > > --- > > fs/userfaultfd.c | 13 +- > > include/linux/mm.h | 16 +++ > > include/linux/userfaultfd_k.h | 5 +- > > mm/memory.c | 48 +++++++ > > mm/userfaultfd.c | 242 +++++++++++++++++++++------------- > > 5 files changed, 222 insertions(+), 102 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index c00a021bcce4..60dcfafdc11a 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, > > return -EINVAL; > > > > if (mmget_not_zero(mm)) { > > - mmap_read_lock(mm); > > - > > - /* Re-check after taking map_changing_lock */ > > - down_read(&ctx->map_changing_lock); > > - if (likely(!atomic_read(&ctx->mmap_changing))) > > - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, > > - uffdio_move.len, uffdio_move.mode); > > - else > > - ret = -EAGAIN; > > - up_read(&ctx->map_changing_lock); > > - mmap_read_unlock(mm); > > + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, > > + uffdio_move.len, uffdio_move.mode); > > mmput(mm); > > } else { > > return -ESRCH; > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 0d1f98ab0c72..e69dfe2edcce 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -753,6 +753,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > > mmap_read_unlock(vmf->vma->vm_mm); > > } > > > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > > +{ > > + vma_end_read(vma); > > +} > > + > > static inline void assert_fault_locked(struct vm_fault *vmf) > > { > > if (vmf->flags & FAULT_FLAG_VMA_LOCK) > > @@ -774,6 +779,9 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) > > { mmap_assert_write_locked(vma->vm_mm); } > > static inline void vma_mark_detached(struct vm_area_struct *vma, > > bool detached) {} > > +static inline void vma_acquire_read_lock(struct vm_area_struct *vma) { > > + mmap_assert_locked(vma->vm_mm); > > +} > > > > static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > unsigned long address) > > @@ -786,6 +794,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > > mmap_read_unlock(vmf->vma->vm_mm); > > } > > > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > > +{ > > + mmap_read_unlock(mm); > > +} > > + > > Instead of passing two variables and only using one based on > configuration of kernel build, why not use vma->vm_mm to > mmap_read_unlock() and just pass the vma? > > It is odd to call unlock_vma() which maps to mmap_read_unlock(). Could > we have this abstraction depend on CONFIG_PER_VMA_LOCK in uffd so that > reading the code remains clear? You seem to have pretty much two > versions of each function already. If you do that, then we can leave > unlock_vma() undefined if !CONFIG_PER_VMA_LOCK. > > > static inline void assert_fault_locked(struct vm_fault *vmf) > > { > > mmap_assert_locked(vmf->vma->vm_mm); > > @@ -794,6 +807,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf) > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > extern const struct vm_operations_struct vma_dummy_vm_ops; > > +extern struct vm_area_struct *lock_vma(struct mm_struct *mm, > > + unsigned long address, > > + bool prepare_anon); > > > > /* > > * WARNING: vma_init does not initialize vma->vm_lock. > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > index 3210c3552976..05d59f74fc88 100644 > > --- a/include/linux/userfaultfd_k.h > > +++ b/include/linux/userfaultfd_k.h > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, > > /* move_pages */ > > void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); > > void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, > > - unsigned long dst_start, unsigned long src_start, > > - unsigned long len, __u64 flags); > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > > + unsigned long src_start, unsigned long len, __u64 flags); > > int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, > > struct vm_area_struct *dst_vma, > > struct vm_area_struct *src_vma, > > diff --git a/mm/memory.c b/mm/memory.c > > index b05fd28dbce1..393ab3b0d6f3 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -5760,8 +5760,56 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > count_vm_vma_lock_event(VMA_LOCK_ABORT); > > return NULL; > > } > > + > > +static void vma_acquire_read_lock(struct vm_area_struct *vma) > > +{ > > + /* > > + * We cannot use vma_start_read() as it may fail due to false locked > > + * (see comment in vma_start_read()). We can avoid that by directly > > + * locking vm_lock under mmap_lock, which guarantees that nobody could > > + * have locked the vma for write (vma_start_write()). > > + */ > > + mmap_assert_locked(vma->vm_mm); > > + down_read(&vma->vm_lock->lock); > > +} > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > +/* > > + * lock_vma() - Lookup and lock VMA corresponding to @address. > > Missing arguments in the comment > > > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > > + * > > + * Should be called without holding mmap_lock. VMA should be unlocked after use > > + * with unlock_vma(). > > + * > > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > > + * -ENOMEM if anon_vma couldn't be allocated. > > + */ > > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > > + unsigned long address, > > + bool prepare_anon) > > +{ > > + struct vm_area_struct *vma; > > + > > + vma = lock_vma_under_rcu(mm, address); > > + > > Nit: extra new line > > > + if (vma) > > + return vma; > > + > > + mmap_read_lock(mm); > > + vma = vma_lookup(mm, address); > > + if (vma) { > > + if (prepare_anon && vma_is_anonymous(vma) && > > + anon_vma_prepare(vma)) > > + vma = ERR_PTR(-ENOMEM); > > + else > > + vma_acquire_read_lock(vma); > > + } > > + > > + if (IS_ENABLED(CONFIG_PER_VMA_LOCK) || !vma || PTR_ERR(vma) == -ENOMEM) > > + mmap_read_unlock(mm); > > + return vma; > > +} > > + > > It is also very odd that lock_vma() may, in fact, be locking the mm. It > seems like there is a layer of abstraction missing here, where your code > would either lock the vma or lock the mm - like you had before, but > without the confusing semantics of unlocking with a flag. That is, we > know what to do to unlock based on CONFIG_PER_VMA_LOCK, but it isn't > always used. > > Maybe my comments were not clear on what I was thinking on the locking > plan. I was thinking that, in the CONFIG_PER_VMA_LOCK case, you could > have a lock_vma() which does the per-vma locking which you can use in > your code. You could call lock_vma() in some uffd helper function that > would do what is required (limit checking, etc) and return a locked vma. > > The counterpart of that would be another helper function that would do > what was required under the mmap_read lock (limit check, etc). The > unlocking would be entirely config dependant as you have today. > > Just write the few functions you have twice: once for per-vma lock > support, once without it. Since we now can ensure the per-vma lock is > taken in the per-vma lock path (or it failed), then you don't need to > mmap_locked boolean you had in the previous version. You solved the > unlock issue already, but it should be abstracted so uffd calls the > underlying unlock vs vma_unlock() doing an mmap_read_unlock() - because > that's very confusing to see. > > I'd drop the vma from the function names that lock the mm or the vma as > well. > > Thanks, > Liam I got it now. I'll make the changes in the next version. Would it be ok to make lock_vma()/unlock_vma() (in case of CONFIG_PER_VMA_LOCK) also be defined in mm/userfaultfd.c? The reason I say this is because first there are no other users of these functions. And also due to what Jann pointed out about anon_vma. lock_vma_under_rcu() (rightly) only checks for private+anonymous case and not private+file-backed case. So lock_vma() implementation is getting very userfaultfd specific IMO.
On Tue, Feb 6, 2024 at 10:28 AM Jann Horn <jannh@google.com> wrote: > > On Tue, Feb 6, 2024 at 2:09 AM Lokesh Gidra <lokeshgidra@google.com> wrote: > > All userfaultfd operations, except write-protect, opportunistically use > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > > critical section. > > > > Write-protect operation requires mmap_lock as it iterates over multiple > > vmas. > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> > [...] > > diff --git a/mm/memory.c b/mm/memory.c > > index b05fd28dbce1..393ab3b0d6f3 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > [...] > > +/* > > + * lock_vma() - Lookup and lock VMA corresponding to @address. > > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > > + * > > + * Should be called without holding mmap_lock. VMA should be unlocked after use > > + * with unlock_vma(). > > + * > > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > > + * -ENOMEM if anon_vma couldn't be allocated. > > + */ > > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > > + unsigned long address, > > + bool prepare_anon) > > +{ > > + struct vm_area_struct *vma; > > + > > + vma = lock_vma_under_rcu(mm, address); > > + > > + if (vma) > > + return vma; > > + > > + mmap_read_lock(mm); > > + vma = vma_lookup(mm, address); > > + if (vma) { > > + if (prepare_anon && vma_is_anonymous(vma) && > > + anon_vma_prepare(vma)) > > + vma = ERR_PTR(-ENOMEM); > > + else > > + vma_acquire_read_lock(vma); > > This new code only calls anon_vma_prepare() for VMAs where > vma_is_anonymous() is true (meaning they are private anonymous). > > [...] > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index 74aad0831e40..64e22e467e4f 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -19,20 +19,25 @@ > > #include <asm/tlb.h> > > #include "internal.h" > > > > -static __always_inline > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, > > - unsigned long dst_start, > > - unsigned long len) > > +/* Search for VMA and make sure it is valid. */ > > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm, > > + unsigned long dst_start, > > + unsigned long len) > > { > > - /* > > - * Make sure that the dst range is both valid and fully within a > > - * single existing vma. > > - */ > > struct vm_area_struct *dst_vma; > > > > - dst_vma = find_vma(dst_mm, dst_start); > > - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) > > - return NULL; > > + /* Ensure anon_vma is assigned for anonymous vma */ > > + dst_vma = lock_vma(dst_mm, dst_start, true); > > lock_vma() is now used by find_and_lock_dst_vma(), which is used by > mfill_atomic(). > > > + if (!dst_vma) > > + return ERR_PTR(-ENOENT); > > + > > + if (PTR_ERR(dst_vma) == -ENOMEM) > > + return dst_vma; > > + > > + /* Make sure that the dst range is fully within dst_vma. */ > > + if (dst_start + len > dst_vma->vm_end) > > + goto out_unlock; > > > > /* > > * Check the vma is registered in uffd, this is required to > [...] > > @@ -597,7 +599,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > copied = 0; > > folio = NULL; > > retry: > > - mmap_read_lock(dst_mm); > > + /* > > + * Make sure the vma is not shared, that the dst range is > > + * both valid and fully within a single existing vma. > > + */ > > + dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len); > > + if (IS_ERR(dst_vma)) { > > + err = PTR_ERR(dst_vma); > > + goto out; > > + } > > > > /* > > * If memory mappings are changing because of non-cooperative > > @@ -609,15 +619,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > if (atomic_read(&ctx->mmap_changing)) > > goto out_unlock; > > > > - /* > > - * Make sure the vma is not shared, that the dst range is > > - * both valid and fully within a single existing vma. > > - */ > > - err = -ENOENT; > > - dst_vma = find_dst_vma(dst_mm, dst_start, len); > > - if (!dst_vma) > > - goto out_unlock; > > - > > err = -EINVAL; > > /* > > * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but > > @@ -647,16 +648,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) > > goto out_unlock; > > > > - /* > > - * Ensure the dst_vma has a anon_vma or this page > > - * would get a NULL anon_vma when moved in the > > - * dst_vma. > > - */ > > - err = -ENOMEM; > > - if (!(dst_vma->vm_flags & VM_SHARED) && > > - unlikely(anon_vma_prepare(dst_vma))) > > - goto out_unlock; > > But the check mfill_atomic() used to do was different, it checked for VM_SHARED. Thanks so much for catching this. > > Each VMA has one of these three types: > > 1. shared (marked by VM_SHARED; does not have an anon_vma) > 2. private file-backed (needs to have anon_vma when storing PTEs) > 3. private anonymous (what vma_is_anonymous() detects; needs to have > anon_vma when storing PTEs) As in the case of mfill_atomic(), it seems to me that checking for VM_SHARED flag will cover both (2) and (3) right? > > This old code would call anon_vma_prepare() for both private VMA types > (which is correct). The new code only calls anon_vma_prepare() for > private anonymous VMAs, not for private file-backed ones. I think this > code will probably crash with a BUG_ON() in __folio_set_anon() if you > try to use userfaultfd to insert a PTE into a private file-backed VMA > of a shmem file. (Which you should be able to get by creating a file > in /dev/shm/ and then mapping that file with mmap(NULL, <size>, > PROT_READ|PROT_WRITE, MAP_PRIVATE, <fd>, 0).)
* Lokesh Gidra <lokeshgidra@google.com> [240207 13:48]: > On Tue, Feb 6, 2024 at 9:05 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > * Lokesh Gidra <lokeshgidra@google.com> [240205 20:10]: > > > All userfaultfd operations, except write-protect, opportunistically use > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > > > critical section. > > > > > > Write-protect operation requires mmap_lock as it iterates over multiple > > > vmas. > > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> > > > --- > > > fs/userfaultfd.c | 13 +- > > > include/linux/mm.h | 16 +++ > > > include/linux/userfaultfd_k.h | 5 +- > > > mm/memory.c | 48 +++++++ > > > mm/userfaultfd.c | 242 +++++++++++++++++++++------------- > > > 5 files changed, 222 insertions(+), 102 deletions(-) > > > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > > index c00a021bcce4..60dcfafdc11a 100644 > > > --- a/fs/userfaultfd.c > > > +++ b/fs/userfaultfd.c > > > @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, > > > return -EINVAL; > > > > > > if (mmget_not_zero(mm)) { > > > - mmap_read_lock(mm); > > > - > > > - /* Re-check after taking map_changing_lock */ > > > - down_read(&ctx->map_changing_lock); > > > - if (likely(!atomic_read(&ctx->mmap_changing))) > > > - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, > > > - uffdio_move.len, uffdio_move.mode); > > > - else > > > - ret = -EAGAIN; > > > - up_read(&ctx->map_changing_lock); > > > - mmap_read_unlock(mm); > > > + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, > > > + uffdio_move.len, uffdio_move.mode); > > > mmput(mm); > > > } else { > > > return -ESRCH; > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > index 0d1f98ab0c72..e69dfe2edcce 100644 > > > --- a/include/linux/mm.h > > > +++ b/include/linux/mm.h > > > @@ -753,6 +753,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > > > mmap_read_unlock(vmf->vma->vm_mm); > > > } > > > > > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > > > +{ > > > + vma_end_read(vma); > > > +} > > > + > > > static inline void assert_fault_locked(struct vm_fault *vmf) > > > { > > > if (vmf->flags & FAULT_FLAG_VMA_LOCK) > > > @@ -774,6 +779,9 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) > > > { mmap_assert_write_locked(vma->vm_mm); } > > > static inline void vma_mark_detached(struct vm_area_struct *vma, > > > bool detached) {} > > > +static inline void vma_acquire_read_lock(struct vm_area_struct *vma) { > > > + mmap_assert_locked(vma->vm_mm); > > > +} > > > > > > static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > > unsigned long address) > > > @@ -786,6 +794,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) > > > mmap_read_unlock(vmf->vma->vm_mm); > > > } > > > > > > +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) > > > +{ > > > + mmap_read_unlock(mm); > > > +} > > > + > > > > Instead of passing two variables and only using one based on > > configuration of kernel build, why not use vma->vm_mm to > > mmap_read_unlock() and just pass the vma? > > > > It is odd to call unlock_vma() which maps to mmap_read_unlock(). Could > > we have this abstraction depend on CONFIG_PER_VMA_LOCK in uffd so that > > reading the code remains clear? You seem to have pretty much two > > versions of each function already. If you do that, then we can leave > > unlock_vma() undefined if !CONFIG_PER_VMA_LOCK. > > > > > static inline void assert_fault_locked(struct vm_fault *vmf) > > > { > > > mmap_assert_locked(vmf->vma->vm_mm); > > > @@ -794,6 +807,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf) > > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > > > extern const struct vm_operations_struct vma_dummy_vm_ops; > > > +extern struct vm_area_struct *lock_vma(struct mm_struct *mm, > > > + unsigned long address, > > > + bool prepare_anon); > > > > > > /* > > > * WARNING: vma_init does not initialize vma->vm_lock. > > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > > index 3210c3552976..05d59f74fc88 100644 > > > --- a/include/linux/userfaultfd_k.h > > > +++ b/include/linux/userfaultfd_k.h > > > @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, > > > /* move_pages */ > > > void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); > > > void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); > > > -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, > > > - unsigned long dst_start, unsigned long src_start, > > > - unsigned long len, __u64 flags); > > > +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, > > > + unsigned long src_start, unsigned long len, __u64 flags); > > > int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, > > > struct vm_area_struct *dst_vma, > > > struct vm_area_struct *src_vma, > > > diff --git a/mm/memory.c b/mm/memory.c > > > index b05fd28dbce1..393ab3b0d6f3 100644 > > > --- a/mm/memory.c > > > +++ b/mm/memory.c > > > @@ -5760,8 +5760,56 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > > count_vm_vma_lock_event(VMA_LOCK_ABORT); > > > return NULL; > > > } > > > + > > > +static void vma_acquire_read_lock(struct vm_area_struct *vma) > > > +{ > > > + /* > > > + * We cannot use vma_start_read() as it may fail due to false locked > > > + * (see comment in vma_start_read()). We can avoid that by directly > > > + * locking vm_lock under mmap_lock, which guarantees that nobody could > > > + * have locked the vma for write (vma_start_write()). > > > + */ > > > + mmap_assert_locked(vma->vm_mm); > > > + down_read(&vma->vm_lock->lock); > > > +} > > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > > > +/* > > > + * lock_vma() - Lookup and lock VMA corresponding to @address. > > > > Missing arguments in the comment > > > > > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > > > + * > > > + * Should be called without holding mmap_lock. VMA should be unlocked after use > > > + * with unlock_vma(). > > > + * > > > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > > > + * -ENOMEM if anon_vma couldn't be allocated. > > > + */ > > > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > > > + unsigned long address, > > > + bool prepare_anon) > > > +{ > > > + struct vm_area_struct *vma; > > > + > > > + vma = lock_vma_under_rcu(mm, address); > > > + > > > > Nit: extra new line > > > > > + if (vma) > > > + return vma; > > > + > > > + mmap_read_lock(mm); > > > + vma = vma_lookup(mm, address); > > > + if (vma) { > > > + if (prepare_anon && vma_is_anonymous(vma) && > > > + anon_vma_prepare(vma)) > > > + vma = ERR_PTR(-ENOMEM); > > > + else > > > + vma_acquire_read_lock(vma); > > > + } > > > + > > > + if (IS_ENABLED(CONFIG_PER_VMA_LOCK) || !vma || PTR_ERR(vma) == -ENOMEM) > > > + mmap_read_unlock(mm); > > > + return vma; > > > +} > > > + > > > > It is also very odd that lock_vma() may, in fact, be locking the mm. It > > seems like there is a layer of abstraction missing here, where your code > > would either lock the vma or lock the mm - like you had before, but > > without the confusing semantics of unlocking with a flag. That is, we > > know what to do to unlock based on CONFIG_PER_VMA_LOCK, but it isn't > > always used. > > > > Maybe my comments were not clear on what I was thinking on the locking > > plan. I was thinking that, in the CONFIG_PER_VMA_LOCK case, you could > > have a lock_vma() which does the per-vma locking which you can use in > > your code. You could call lock_vma() in some uffd helper function that > > would do what is required (limit checking, etc) and return a locked vma. > > > > The counterpart of that would be another helper function that would do > > what was required under the mmap_read lock (limit check, etc). The > > unlocking would be entirely config dependant as you have today. > > > > Just write the few functions you have twice: once for per-vma lock > > support, once without it. Since we now can ensure the per-vma lock is > > taken in the per-vma lock path (or it failed), then you don't need to > > mmap_locked boolean you had in the previous version. You solved the > > unlock issue already, but it should be abstracted so uffd calls the > > underlying unlock vs vma_unlock() doing an mmap_read_unlock() - because > > that's very confusing to see. > > > > I'd drop the vma from the function names that lock the mm or the vma as > > well. > > > > Thanks, > > Liam > > I got it now. I'll make the changes in the next version. > > Would it be ok to make lock_vma()/unlock_vma() (in case of > CONFIG_PER_VMA_LOCK) also be defined in mm/userfaultfd.c? The reason I > say this is because first there are no other users of these functions. > And also due to what Jann pointed out about anon_vma. > lock_vma_under_rcu() (rightly) only checks for private+anonymous case > and not private+file-backed case. So lock_vma() implementation is > getting very userfaultfd specific IMO. Yes, this sounds reasonable. Looking forward to the next revision. Thanks, Liam
On Wed, Feb 7, 2024 at 7:50 PM Lokesh Gidra <lokeshgidra@google.com> wrote: > On Tue, Feb 6, 2024 at 10:28 AM Jann Horn <jannh@google.com> wrote: > > On Tue, Feb 6, 2024 at 2:09 AM Lokesh Gidra <lokeshgidra@google.com> wrote: > > > All userfaultfd operations, except write-protect, opportunistically use > > > per-vma locks to lock vmas. On failure, attempt again inside mmap_lock > > > critical section. > > > > > > Write-protect operation requires mmap_lock as it iterates over multiple > > > vmas. > > > > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> > > [...] > > > diff --git a/mm/memory.c b/mm/memory.c > > > index b05fd28dbce1..393ab3b0d6f3 100644 > > > --- a/mm/memory.c > > > +++ b/mm/memory.c > > [...] > > > +/* > > > + * lock_vma() - Lookup and lock VMA corresponding to @address. > > > + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. > > > + * > > > + * Should be called without holding mmap_lock. VMA should be unlocked after use > > > + * with unlock_vma(). > > > + * > > > + * Return: A locked VMA containing @address, NULL of no VMA is found, or > > > + * -ENOMEM if anon_vma couldn't be allocated. > > > + */ > > > +struct vm_area_struct *lock_vma(struct mm_struct *mm, > > > + unsigned long address, > > > + bool prepare_anon) > > > +{ > > > + struct vm_area_struct *vma; > > > + > > > + vma = lock_vma_under_rcu(mm, address); > > > + > > > + if (vma) > > > + return vma; > > > + > > > + mmap_read_lock(mm); > > > + vma = vma_lookup(mm, address); > > > + if (vma) { > > > + if (prepare_anon && vma_is_anonymous(vma) && > > > + anon_vma_prepare(vma)) > > > + vma = ERR_PTR(-ENOMEM); > > > + else > > > + vma_acquire_read_lock(vma); > > > > This new code only calls anon_vma_prepare() for VMAs where > > vma_is_anonymous() is true (meaning they are private anonymous). > > > > [...] > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > > index 74aad0831e40..64e22e467e4f 100644 > > > --- a/mm/userfaultfd.c > > > +++ b/mm/userfaultfd.c > > > @@ -19,20 +19,25 @@ > > > #include <asm/tlb.h> > > > #include "internal.h" > > > > > > -static __always_inline > > > -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, > > > - unsigned long dst_start, > > > - unsigned long len) > > > +/* Search for VMA and make sure it is valid. */ > > > +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm, > > > + unsigned long dst_start, > > > + unsigned long len) > > > { > > > - /* > > > - * Make sure that the dst range is both valid and fully within a > > > - * single existing vma. > > > - */ > > > struct vm_area_struct *dst_vma; > > > > > > - dst_vma = find_vma(dst_mm, dst_start); > > > - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) > > > - return NULL; > > > + /* Ensure anon_vma is assigned for anonymous vma */ > > > + dst_vma = lock_vma(dst_mm, dst_start, true); > > > > lock_vma() is now used by find_and_lock_dst_vma(), which is used by > > mfill_atomic(). > > > > > + if (!dst_vma) > > > + return ERR_PTR(-ENOENT); > > > + > > > + if (PTR_ERR(dst_vma) == -ENOMEM) > > > + return dst_vma; > > > + > > > + /* Make sure that the dst range is fully within dst_vma. */ > > > + if (dst_start + len > dst_vma->vm_end) > > > + goto out_unlock; > > > > > > /* > > > * Check the vma is registered in uffd, this is required to > > [...] > > > @@ -597,7 +599,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > > copied = 0; > > > folio = NULL; > > > retry: > > > - mmap_read_lock(dst_mm); > > > + /* > > > + * Make sure the vma is not shared, that the dst range is > > > + * both valid and fully within a single existing vma. > > > + */ > > > + dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len); > > > + if (IS_ERR(dst_vma)) { > > > + err = PTR_ERR(dst_vma); > > > + goto out; > > > + } > > > > > > /* > > > * If memory mappings are changing because of non-cooperative > > > @@ -609,15 +619,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > > if (atomic_read(&ctx->mmap_changing)) > > > goto out_unlock; > > > > > > - /* > > > - * Make sure the vma is not shared, that the dst range is > > > - * both valid and fully within a single existing vma. > > > - */ > > > - err = -ENOENT; > > > - dst_vma = find_dst_vma(dst_mm, dst_start, len); > > > - if (!dst_vma) > > > - goto out_unlock; > > > - > > > err = -EINVAL; > > > /* > > > * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but > > > @@ -647,16 +648,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, > > > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) > > > goto out_unlock; > > > > > > - /* > > > - * Ensure the dst_vma has a anon_vma or this page > > > - * would get a NULL anon_vma when moved in the > > > - * dst_vma. > > > - */ > > > - err = -ENOMEM; > > > - if (!(dst_vma->vm_flags & VM_SHARED) && > > > - unlikely(anon_vma_prepare(dst_vma))) > > > - goto out_unlock; > > > > But the check mfill_atomic() used to do was different, it checked for VM_SHARED. > > Thanks so much for catching this. > > > > Each VMA has one of these three types: > > > > 1. shared (marked by VM_SHARED; does not have an anon_vma) > > 2. private file-backed (needs to have anon_vma when storing PTEs) > > 3. private anonymous (what vma_is_anonymous() detects; needs to have > > anon_vma when storing PTEs) > > As in the case of mfill_atomic(), it seems to me that checking for > VM_SHARED flag will cover both (2) and (3) right? Yes, I think that should work.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c00a021bcce4..60dcfafdc11a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(mm)) { - mmap_read_lock(mm); - - /* Re-check after taking map_changing_lock */ - down_read(&ctx->map_changing_lock); - if (likely(!atomic_read(&ctx->mmap_changing))) - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, - uffdio_move.len, uffdio_move.mode); - else - ret = -EAGAIN; - up_read(&ctx->map_changing_lock); - mmap_read_unlock(mm); + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d1f98ab0c72..e69dfe2edcce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -753,6 +753,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + vma_end_read(vma); +} + static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -774,6 +779,9 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline void vma_acquire_read_lock(struct vm_area_struct *vma) { + mmap_assert_locked(vma->vm_mm); +} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -786,6 +794,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void unlock_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + mmap_read_unlock(mm); +} + static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); @@ -794,6 +807,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf) #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; +extern struct vm_area_struct *lock_vma(struct mm_struct *mm, + unsigned long address, + bool prepare_anon); /* * WARNING: vma_init does not initialize vma->vm_lock. diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3210c3552976..05d59f74fc88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 flags); +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, diff --git a/mm/memory.c b/mm/memory.c index b05fd28dbce1..393ab3b0d6f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5760,8 +5760,56 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } + +static void vma_acquire_read_lock(struct vm_area_struct *vma) +{ + /* + * We cannot use vma_start_read() as it may fail due to false locked + * (see comment in vma_start_read()). We can avoid that by directly + * locking vm_lock under mmap_lock, which guarantees that nobody could + * have locked the vma for write (vma_start_write()). + */ + mmap_assert_locked(vma->vm_mm); + down_read(&vma->vm_lock->lock); +} #endif /* CONFIG_PER_VMA_LOCK */ +/* + * lock_vma() - Lookup and lock VMA corresponding to @address. + * @prepare_anon: If true, then prepare the VMA (if anonymous) with anon_vma. + * + * Should be called without holding mmap_lock. VMA should be unlocked after use + * with unlock_vma(). + * + * Return: A locked VMA containing @address, NULL of no VMA is found, or + * -ENOMEM if anon_vma couldn't be allocated. + */ +struct vm_area_struct *lock_vma(struct mm_struct *mm, + unsigned long address, + bool prepare_anon) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, address); + + if (vma) + return vma; + + mmap_read_lock(mm); + vma = vma_lookup(mm, address); + if (vma) { + if (prepare_anon && vma_is_anonymous(vma) && + anon_vma_prepare(vma)) + vma = ERR_PTR(-ENOMEM); + else + vma_acquire_read_lock(vma); + } + + if (IS_ENABLED(CONFIG_PER_VMA_LOCK) || !vma || PTR_ERR(vma) == -ENOMEM) + mmap_read_unlock(mm); + return vma; +} + #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 74aad0831e40..64e22e467e4f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -19,20 +19,25 @@ #include <asm/tlb.h> #include "internal.h" -static __always_inline -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len) +/* Search for VMA and make sure it is valid. */ +static struct vm_area_struct *find_and_lock_dst_vma(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) { - /* - * Make sure that the dst range is both valid and fully within a - * single existing vma. - */ struct vm_area_struct *dst_vma; - dst_vma = find_vma(dst_mm, dst_start); - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) - return NULL; + /* Ensure anon_vma is assigned for anonymous vma */ + dst_vma = lock_vma(dst_mm, dst_start, true); + + if (!dst_vma) + return ERR_PTR(-ENOENT); + + if (PTR_ERR(dst_vma) == -ENOMEM) + return dst_vma; + + /* Make sure that the dst range is fully within dst_vma. */ + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; /* * Check the vma is registered in uffd, this is required to @@ -40,9 +45,12 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) - return NULL; + goto out_unlock; return dst_vma; +out_unlock: + unlock_vma(dst_mm, dst_vma); + return ERR_PTR(-ENOENT); } /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ @@ -350,7 +358,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is - * called with mmap_lock held, it will release mmap_lock before returning. + * called with either vma-lock or mmap_lock held, it will release the lock + * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( struct userfaultfd_ctx *ctx, @@ -361,7 +370,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb( uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; - int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -380,7 +388,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + unlock_vma(dst_mm, dst_vma); return -EINVAL; } @@ -403,24 +411,28 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { + dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) - goto out_unlock; + if (!is_vm_hugetlb_page(dst_vma)) + goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) - goto out_unlock; + goto out_unlock_vma; - vm_shared = dst_vma->vm_flags & VM_SHARED; - } - - /* - * If not shared, ensure the dst_vma has a anon_vma. - */ - err = -ENOMEM; - if (!vm_shared) { - if (unlikely(anon_vma_prepare(dst_vma))) + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } @@ -465,7 +477,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + unlock_vma(dst_mm, dst_vma); BUG_ON(!folio); err = copy_folio_from_user(folio, @@ -474,17 +486,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb( err = -EFAULT; goto out; } - mmap_read_lock(dst_mm); - down_read(&ctx->map_changing_lock); - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - if (atomic_read(&ctx->mmap_changing)) { - err = -EAGAIN; - break; - } dst_vma = NULL; goto retry; @@ -505,7 +506,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); +out_unlock_vma: + unlock_vma(dst_mm, dst_vma); out: if (folio) folio_put(folio); @@ -597,7 +599,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, copied = 0; folio = NULL; retry: - mmap_read_lock(dst_mm); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = find_and_lock_dst_vma(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } /* * If memory mappings are changing because of non-cooperative @@ -609,15 +619,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, if (atomic_read(&ctx->mmap_changing)) goto out_unlock; - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma) - goto out_unlock; - err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but @@ -647,16 +648,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. - */ - err = -ENOMEM; - if (!(dst_vma->vm_flags & VM_SHARED) && - unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; - while (src_addr < src_start + len) { pmd_t dst_pmdval; @@ -699,7 +690,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, void *kaddr; up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + unlock_vma(dst_mm, dst_vma); + BUG_ON(!folio); kaddr = kmap_local_folio(folio, 0); @@ -730,7 +722,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + unlock_vma(dst_mm, dst_vma); out: if (folio) folio_put(folio); @@ -1267,16 +1259,82 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. - */ - if (unlikely(anon_vma_prepare(dst_vma))) - return -ENOMEM; + return 0; +} + +#ifdef CONFIG_PER_VMA_LOCK +static int find_and_lock_move_vmas(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err; + + /* There is no need to prepare anon_vma for src_vma */ + *src_vmap = lock_vma(mm, src_start, false); + if (!*src_vmap) + return -ENOENT; + + /* Ensure anon_vma is assigned for anonymous vma */ + *dst_vmap = lock_vma(mm, dst_start, true); + err = -ENOENT; + if (!*dst_vmap) + goto out_unlock; + + err = -ENOMEM; + if (PTR_ERR(*dst_vmap) == -ENOMEM) + goto out_unlock; return 0; +out_unlock: + unlock_vma(mm, *src_vmap); + return err; +} + +static void unlock_move_vmas(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + unlock_vma(mm, dst_vma); + unlock_vma(mm, src_vma); } +#else +static int find_and_lock_move_vmas(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err = -ENOENT; + mmap_read_lock(mm); + + *src_vmap = vma_lookup(mm, src_start); + if (!*src_vmap) + goto out_unlock; + + *dst_vmap = vma_lookup(mm, dst_start); + if (!*dst_vmap) + goto out_unlock; + + /* Ensure anon_vma is assigned */ + err = -ENOMEM; + if (vma_is_anonymous(*dst_vmap) && !anon_vma_prepare(*dst_vmap)) + goto out_unlock; + + return 0; +out_unlock: + mmap_read_unlock(mm); + return err; +} + +static void unlock_move_vmas(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + mmap_read_unlock(mm); +} +#endif /** * move_pages - move arbitrary anonymous pages of an existing vma @@ -1287,8 +1345,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * - * Must be called with mmap_lock held for read. - * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can * be relocated without generating non linear anon_vmas in the rmap @@ -1355,10 +1411,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, * could be obtained. This is the only additional complexity added to * the rmap code to provide this anonymous page remapping functionality. */ -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 mode) +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) { + struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr; pmd_t *src_pmd, *dst_pmd; @@ -1376,28 +1432,35 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, WARN_ON_ONCE(dst_start + len <= dst_start)) goto out; + err = find_and_lock_move_vmas(mm, dst_start, src_start, + &dst_vma, &src_vma); + if (err) + goto out; + + /* Re-check after taking map_changing_lock */ + down_read(&ctx->map_changing_lock); + if (likely(atomic_read(&ctx->mmap_changing))) { + err = -EAGAIN; + goto out_unlock; + } /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. */ - src_vma = find_vma(mm, src_start); - if (!src_vma || (src_vma->vm_flags & VM_SHARED)) - goto out; - if (src_start < src_vma->vm_start || - src_start + len > src_vma->vm_end) - goto out; + if (src_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (src_start + len > src_vma->vm_end) + goto out_unlock; - dst_vma = find_vma(mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out; - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out; + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) - goto out; + goto out_unlock; for (src_addr = src_start, dst_addr = dst_start; src_addr < src_start + len;) { @@ -1514,6 +1577,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, moved += step_size; } +out_unlock: + up_read(&ctx->map_changing_lock); + unlock_move_vmas(mm, dst_vma, src_vma); out: VM_WARN_ON(moved < 0); VM_WARN_ON(err > 0);
All userfaultfd operations, except write-protect, opportunistically use per-vma locks to lock vmas. On failure, attempt again inside mmap_lock critical section. Write-protect operation requires mmap_lock as it iterates over multiple vmas. Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> --- fs/userfaultfd.c | 13 +- include/linux/mm.h | 16 +++ include/linux/userfaultfd_k.h | 5 +- mm/memory.c | 48 +++++++ mm/userfaultfd.c | 242 +++++++++++++++++++++------------- 5 files changed, 222 insertions(+), 102 deletions(-)