[v2,4/6] mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once

Message ID 20230801220733.1987762-5-surenb@google.com (mailing list archive)
State New
Series make vma locking more obvious

Commit Message

Suren Baghdasaryan Aug. 1, 2023, 10:07 p.m. UTC
Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is
not obvious and makes it hard to understand where vma locking is happening.
Also, in some cases (like in dup_userfaultfd()) the vma should be locked
earlier than the vm_flags modification. To make locking more visible, change these
functions to assert that the vma write lock is taken and explicitly lock
the vma beforehand. Fix userfaultfd functions which should lock the vma
earlier.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c    |  1 +
 drivers/infiniband/hw/hfi1/file_ops.c |  1 +
 fs/userfaultfd.c                      |  6 ++++++
 include/linux/mm.h                    | 10 +++++++---
 mm/madvise.c                          |  5 ++---
 mm/mlock.c                            |  3 ++-
 mm/mprotect.c                         |  1 +
 7 files changed, 20 insertions(+), 7 deletions(-)
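
A minimal sketch of the calling convention this patch establishes (the
names below are generic placeholders; the authoritative call sites are
the mm/madvise.c, mm/mlock.c and mm/mprotect.c hunks in the patch):

	/* Before this patch: vm_flags_reset() write-locked the vma itself. */
	vm_flags_reset(vma, newflags);

	/* After this patch: the caller write-locks the vma explicitly, and
	 * vm_flags_reset() only asserts that the lock is held. */
	vma_start_write(vma);
	vm_flags_reset(vma, newflags);

Taking the lock explicitly also allows it to be taken earlier than the
vm_flags update where that matters, as the fs/userfaultfd.c hunks do.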

Comments

Liam R. Howlett Aug. 2, 2023, 5:13 p.m. UTC | #1
* Suren Baghdasaryan <surenb@google.com> [230801 18:07]:
> Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is
> not obvious and makes it hard to understand where vma locking is happening.
> Also, in some cases (like in dup_userfaultfd()) the vma should be locked
> earlier than the vm_flags modification. To make locking more visible, change these
> functions to assert that the vma write lock is taken and explicitly lock
> the vma beforehand. Fix userfaultfd functions which should lock the vma
> earlier.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  arch/powerpc/kvm/book3s_hv_uvmem.c    |  1 +
>  drivers/infiniband/hw/hfi1/file_ops.c |  1 +
>  fs/userfaultfd.c                      |  6 ++++++
>  include/linux/mm.h                    | 10 +++++++---
>  mm/madvise.c                          |  5 ++---
>  mm/mlock.c                            |  3 ++-
>  mm/mprotect.c                         |  1 +
>  7 files changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 709ebd578394..e2d6f9327f77 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
>  			ret = H_STATE;
>  			break;
>  		}
> +		vma_start_write(vma);
>  		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
>  		vm_flags = vma->vm_flags;
>  		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
> diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
> index a5ab22cedd41..5920bfc1e1c5 100644
> --- a/drivers/infiniband/hw/hfi1/file_ops.c
> +++ b/drivers/infiniband/hw/hfi1/file_ops.c
> @@ -344,6 +344,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
>  		goto done;
>  	}
>  
> +	vma_start_write(vma);
>  	/*
>  	 * vm_pgoff is used as a buffer selector cookie.  Always mmap from
>  	 * the beginning.
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7cecd49e078b..6cde95533dcd 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
>  		mmap_write_lock(mm);
>  		for_each_vma(vmi, vma) {
>  			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> +				vma_start_write(vma);
>  				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  				userfaultfd_set_vm_flags(vma,
>  							 vma->vm_flags & ~__VM_UFFD_FLAGS);
> @@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  
>  	octx = vma->vm_userfaultfd_ctx.ctx;
>  	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> +		vma_start_write(vma);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
>  		return 0;
> @@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  		atomic_inc(&ctx->mmap_changing);
>  	} else {
>  		/* Drop uffd context if remap feature not enabled */
> +		vma_start_write(vma);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
>  	}
> @@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
>  			prev = vma;
>  		}
>  
> +		vma_start_write(vma);
>  		userfaultfd_set_vm_flags(vma, new_flags);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  	}
> @@ -1502,6 +1506,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		 * the next vma was merged into the current one and
>  		 * the current one has not been updated yet.
>  		 */
> +		vma_start_write(vma);
>  		userfaultfd_set_vm_flags(vma, new_flags);
>  		vma->vm_userfaultfd_ctx.ctx = ctx;
>  
> @@ -1685,6 +1690,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  		 * the next vma was merged into the current one and
>  		 * the current one has not been updated yet.
>  		 */
> +		vma_start_write(vma);
>  		userfaultfd_set_vm_flags(vma, new_flags);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 262b5f44101d..2c720c9bb1ae 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -780,18 +780,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
>  	ACCESS_PRIVATE(vma, __vm_flags) = flags;
>  }
>  
> -/* Use when VMA is part of the VMA tree and modifications need coordination */
> +/*
> + * Use when VMA is part of the VMA tree and modifications need coordination.
> + * Note: vm_flags_reset() and vm_flags_reset_once() do not lock the vma;
> + * it must be write-locked explicitly beforehand.
> + */
>  static inline void vm_flags_reset(struct vm_area_struct *vma,
>  				  vm_flags_t flags)
>  {
> -	vma_start_write(vma);
> +	vma_assert_write_locked(vma);
>  	vm_flags_init(vma, flags);
>  }
>  
>  static inline void vm_flags_reset_once(struct vm_area_struct *vma,
>  				       vm_flags_t flags)
>  {
> -	vma_start_write(vma);
> +	vma_assert_write_locked(vma);
>  	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
>  }
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index bfe0e06427bd..507b1d299fec 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma,
>  	}
>  
>  success:
> -	/*
> -	 * vm_flags is protected by the mmap_lock held in write mode.
> -	 */
> +	/* vm_flags is protected by the mmap_lock held in write mode. */
> +	vma_start_write(vma);
>  	vm_flags_reset(vma, new_flags);
>  	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
>  		error = replace_anon_vma_name(vma, anon_name);
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 479e09d0994c..06bdfab83b58 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -387,6 +387,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
>  	 */
>  	if (newflags & VM_LOCKED)
>  		newflags |= VM_IO;
> +	vma_start_write(vma);
>  	vm_flags_reset_once(vma, newflags);
>  
>  	lru_add_drain();
> @@ -461,9 +462,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	 * It's okay if try_to_unmap_one unmaps a page just after we
>  	 * set VM_LOCKED, populate_vma_page_range will bring it back.
>  	 */
> -
>  	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
>  		/* No work to do, and mlocking twice would be wrong */
> +		vma_start_write(vma);
>  		vm_flags_reset(vma, newflags);
>  	} else {
>  		mlock_vma_pages_range(vma, start, end, newflags);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 3aef1340533a..362e190a8f81 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -657,6 +657,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
>  	 * vm_flags and vm_page_prot are protected by the mmap_lock
>  	 * held in write mode.
>  	 */
> +	vma_start_write(vma);
>  	vm_flags_reset(vma, newflags);
>  	if (vma_wants_manual_pte_write_upgrade(vma))
>  		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
> -- 
> 2.41.0.585.gd2178a4bd4-goog
>
Linus Torvalds Aug. 2, 2023, 5:49 p.m. UTC | #2
On Tue, 1 Aug 2023 at 15:07, Suren Baghdasaryan <surenb@google.com> wrote:
>
>       To make locking more visible, change these
> functions to assert that the vma write lock is taken and explicitly lock
> the vma beforehand.

So I obviously think this is a good change, but the fact that it
touched driver files makes me go "we're still doing something wrong".

I'm not super-happy with hfi1_file_mmap() doing something like
vma_start_write(), in that I *really* don't think drivers should ever
have to think about issues like this.

And I think it's unnecessary.  This is the mmap op in the
hfi1_file_ops, and I think that any actual mmap() code had _better_
had locked the new vma before asking any driver to set things up (and
the assert would catch it if somebody didn't).

I realize that it doesn't hurt in a technical sense, but I think
having drivers call these VM-internal subtle locking functions does
hurt in a maintenance sense, so we should make sure to not have it.

                   Linus
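
A hedged sketch of the split Linus is describing (illustrative only, not
the actual mm/mmap.c code; it assumes the core mmap path already
write-locks a newly created vma before handing it to a driver):

	/* The core mmap path owns the locking... */
	vma_start_write(vma);
	/* ...so the driver's ->mmap op runs with the vma write-locked, and
	 * any vm_flags_reset() on that path merely asserts the lock. */
	error = call_mmap(file, vma);

With that guarantee, the vma_start_write() call added to
hfi1_file_mmap() is redundant, and the assertion alone is enough to
catch a core path that forgot to lock.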
Suren Baghdasaryan Aug. 2, 2023, 6:09 p.m. UTC | #3
On Wed, Aug 2, 2023 at 10:49 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Tue, 1 Aug 2023 at 15:07, Suren Baghdasaryan <surenb@google.com> wrote:
> >
> >       To make locking more visible, change these
> > functions to assert that the vma write lock is taken and explicitly lock
> > the vma beforehand.
>
> So I obviously think this is a good change, but the fact that it
> touched driver files makes me go "we're still doing something wrong".
>
> I'm not super-happy with hfi1_file_mmap() doing something like
> vma_start_write(), in that I *really* don't think drivers should ever
> have to think about issues like this.
>
> And I think it's unnecessary.  This is the mmap op in the
> hfi1_file_ops, and I think that any actual mmap() code had _better_
> have locked the new vma before asking any driver to set things up (and
> the assert would catch it if somebody didn't).
>
> I realize that it doesn't hurt in a technical sense, but I think
> having drivers call these VM-internal subtle locking functions does
> hurt in a maintenance sense, so we should make sure to not have it.

Ok, IOW the vma would be already locked before mmap() is called...
Just to confirm, you are suggesting to remove vma_start_write() call
from hfi1_file_mmap() and let vm_flags_reset() generate an assertion
if it's ever called with an unlocked vma, correct?

>
>                    Linus
Linus Torvalds Aug. 2, 2023, 6:53 p.m. UTC | #4
On Wed, 2 Aug 2023 at 11:09, Suren Baghdasaryan <surenb@google.com> wrote:
>
> Ok, IOW the vma would be already locked before mmap() is called...

Yup.

> Just to confirm, you are suggesting to remove vma_start_write() call
> from hfi1_file_mmap() and let vm_flags_reset() generate an assertion
> if it's ever called with an unlocked vma, correct?

Correct.

               Linus
Suren Baghdasaryan Aug. 2, 2023, 8:21 p.m. UTC | #5
On Wed, Aug 2, 2023 at 11:54 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Wed, 2 Aug 2023 at 11:09, Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > Ok, IOW the vma would be already locked before mmap() is called...
>
> Yup.
>
> > Just to confirm, you are suggesting to remove vma_start_write() call
> > from hfi1_file_mmap() and let vm_flags_reset() generate an assertion
> > if it's ever called with an unlocked vma, correct?
>
> Correct.

Got it. Will update in the next version. Thanks!

>
>                Linus
>
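
For reference, vma_assert_write_locked() at the time of this series
reduces to roughly the following (paraphrased from include/linux/mm.h
with CONFIG_PER_VMA_LOCK enabled; the exact form may differ):

	static inline void vma_assert_write_locked(struct vm_area_struct *vma)
	{
		/* The mmap_lock must be held for write... */
		mmap_assert_write_locked(vma->vm_mm);
		/* ...and the vma must have been marked write-locked since the
		 * mmap_lock was taken, i.e. vma_start_write() was called. */
		VM_BUG_ON_VMA(vma->vm_lock_seq != vma->vm_mm->mm_lock_seq, vma);
	}

So a vm_flags_reset() on an unlocked vma now trips the debug assertion
instead of silently taking the lock.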

Patch

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 709ebd578394..e2d6f9327f77 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -410,6 +410,7 @@  static int kvmppc_memslot_page_merge(struct kvm *kvm,
 			ret = H_STATE;
 			break;
 		}
+		vma_start_write(vma);
 		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
 		vm_flags = vma->vm_flags;
 		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index a5ab22cedd41..5920bfc1e1c5 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -344,6 +344,7 @@  static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		goto done;
 	}
 
+	vma_start_write(vma);
 	/*
 	 * vm_pgoff is used as a buffer selector cookie.  Always mmap from
 	 * the beginning.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..6cde95533dcd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -667,6 +667,7 @@  static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		mmap_write_lock(mm);
 		for_each_vma(vmi, vma) {
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+				vma_start_write(vma);
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 				userfaultfd_set_vm_flags(vma,
 							 vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -702,6 +703,7 @@  int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma_start_write(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
 		return 0;
@@ -783,6 +785,7 @@  void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 		atomic_inc(&ctx->mmap_changing);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
+		vma_start_write(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
 	}
@@ -940,6 +943,7 @@  static int userfaultfd_release(struct inode *inode, struct file *file)
 			prev = vma;
 		}
 
+		vma_start_write(vma);
 		userfaultfd_set_vm_flags(vma, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
@@ -1502,6 +1506,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_start_write(vma);
 		userfaultfd_set_vm_flags(vma, new_flags);
 		vma->vm_userfaultfd_ctx.ctx = ctx;
 
@@ -1685,6 +1690,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_start_write(vma);
 		userfaultfd_set_vm_flags(vma, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 262b5f44101d..2c720c9bb1ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -780,18 +780,22 @@  static inline void vm_flags_init(struct vm_area_struct *vma,
 	ACCESS_PRIVATE(vma, __vm_flags) = flags;
 }
 
-/* Use when VMA is part of the VMA tree and modifications need coordination */
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination.
+ * Note: vm_flags_reset() and vm_flags_reset_once() do not lock the vma;
+ * it must be write-locked explicitly beforehand.
+ */
 static inline void vm_flags_reset(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	vma_start_write(vma);
+	vma_assert_write_locked(vma);
 	vm_flags_init(vma, flags);
 }
 
 static inline void vm_flags_reset_once(struct vm_area_struct *vma,
 				       vm_flags_t flags)
 {
-	vma_start_write(vma);
+	vma_assert_write_locked(vma);
 	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
 }
 
diff --git a/mm/madvise.c b/mm/madvise.c
index bfe0e06427bd..507b1d299fec 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -173,9 +173,8 @@  static int madvise_update_vma(struct vm_area_struct *vma,
 	}
 
 success:
-	/*
-	 * vm_flags is protected by the mmap_lock held in write mode.
-	 */
+	/* vm_flags is protected by the mmap_lock held in write mode. */
+	vma_start_write(vma);
 	vm_flags_reset(vma, new_flags);
 	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
 		error = replace_anon_vma_name(vma, anon_name);
diff --git a/mm/mlock.c b/mm/mlock.c
index 479e09d0994c..06bdfab83b58 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -387,6 +387,7 @@  static void mlock_vma_pages_range(struct vm_area_struct *vma,
 	 */
 	if (newflags & VM_LOCKED)
 		newflags |= VM_IO;
+	vma_start_write(vma);
 	vm_flags_reset_once(vma, newflags);
 
 	lru_add_drain();
@@ -461,9 +462,9 @@  static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 * It's okay if try_to_unmap_one unmaps a page just after we
 	 * set VM_LOCKED, populate_vma_page_range will bring it back.
 	 */
-
 	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
 		/* No work to do, and mlocking twice would be wrong */
+		vma_start_write(vma);
 		vm_flags_reset(vma, newflags);
 	} else {
 		mlock_vma_pages_range(vma, start, end, newflags);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3aef1340533a..362e190a8f81 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -657,6 +657,7 @@  mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
 	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
+	vma_start_write(vma);
 	vm_flags_reset(vma, newflags);
 	if (vma_wants_manual_pte_write_upgrade(vma))
 		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;