[RFC,RESEND,06/28] mm: mark VMA as locked whenever vma->vm_flags are modified

Message ID 20220901173516.702122-7-surenb@google.com (mailing list archive)
State New, archived
Series per-VMA locks proposal

Commit Message

Suren Baghdasaryan Sept. 1, 2022, 5:34 p.m. UTC
VMA flag modifications should be done under VMA lock to prevent concurrent
page fault handling in that area.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/task_mmu.c | 1 +
 fs/userfaultfd.c   | 6 ++++++
 mm/madvise.c       | 1 +
 mm/mlock.c         | 2 ++
 mm/mmap.c          | 1 +
 mm/mprotect.c      | 1 +
 6 files changed, 12 insertions(+)
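
For reference, the pattern every hunk below applies is roughly the following (a sketch only; vma_mark_locked() is the helper introduced earlier in this series, and new_flags is just a placeholder):

	/*
	 * Writer side: mmap_lock is held for write. Marking the VMA as
	 * locked before the update makes a concurrent page fault that
	 * relies only on the per-VMA lock back off and retry under
	 * mmap_lock, so it never observes vm_flags mid-update.
	 */
	mmap_assert_write_locked(vma->vm_mm);
	vma_mark_locked(vma);
	vma->vm_flags = new_flags;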

Comments

Laurent Dufour Sept. 6, 2022, 2:26 p.m. UTC | #1
On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> VMA flag modifications should be done under VMA lock to prevent concurrent
> page fault handling in that area.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 1 +
>  fs/userfaultfd.c   | 6 ++++++
>  mm/madvise.c       | 1 +
>  mm/mlock.c         | 2 ++
>  mm/mmap.c          | 1 +
>  mm/mprotect.c      | 1 +
>  6 files changed, 12 insertions(+)

There are a few changes also done in driver space, for instance:

*** arch/x86/kernel/cpu/sgx/driver.c:
sgx_mmap[98]                   vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND |
VM_DONTDUMP | VM_IO;
*** arch/x86/kernel/cpu/sgx/virt.c:
sgx_vepc_mmap[108]             vma->vm_flags |= VM_PFNMAP | VM_IO |
VM_DONTDUMP | VM_DONTCOPY;
*** drivers/dax/device.c:
dax_mmap[311]                  vma->vm_flags |= VM_HUGEPAGE;

I guess these changes to vm_flags should be protected as well, or at least
checked one by one.
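
For example, in sgx_mmap() the change would presumably be just the following (a sketch only; each of these driver sites still needs to be checked individually):

	/* arch/x86/kernel/cpu/sgx/driver.c, in sgx_mmap() -- illustrative only */
	vma_mark_locked(vma);	/* lock the VMA before touching vm_flags */
	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;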

> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 4e0023643f8b..ceffa5c2c650 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1285,6 +1285,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
>  			for (vma = mm->mmap; vma; vma = vma->vm_next) {
>  				if (!(vma->vm_flags & VM_SOFTDIRTY))
>  					continue;
> +				vma_mark_locked(vma);
>  				vma->vm_flags &= ~VM_SOFTDIRTY;
>  				vma_set_page_prot(vma);
>  			}
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 175de70e3adf..fe557b3d1c07 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -620,6 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
>  		mmap_write_lock(mm);
>  		for (vma = mm->mmap; vma; vma = vma->vm_next)
>  			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> +				vma_mark_locked(vma);
>  				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  				vma->vm_flags &= ~__VM_UFFD_FLAGS;
>  			}
> @@ -653,6 +654,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  
>  	octx = vma->vm_userfaultfd_ctx.ctx;
>  	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> +		vma_mark_locked(vma);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  		vma->vm_flags &= ~__VM_UFFD_FLAGS;
>  		return 0;
> @@ -734,6 +736,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  		atomic_inc(&ctx->mmap_changing);
>  	} else {
>  		/* Drop uffd context if remap feature not enabled */
> +		vma_mark_locked(vma);
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  		vma->vm_flags &= ~__VM_UFFD_FLAGS;
>  	}
> @@ -891,6 +894,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
>  			vma = prev;
>  		else
>  			prev = vma;
> +		vma_mark_locked(vma);
>  		vma->vm_flags = new_flags;
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  	}
> @@ -1449,6 +1453,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		 * the next vma was merged into the current one and
>  		 * the current one has not been updated yet.
>  		 */
> +		vma_mark_locked(vma);
>  		vma->vm_flags = new_flags;
>  		vma->vm_userfaultfd_ctx.ctx = ctx;
>  
> @@ -1630,6 +1635,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
>  		 * the next vma was merged into the current one and
>  		 * the current one has not been updated yet.
>  		 */
> +		vma_mark_locked(vma);
>  		vma->vm_flags = new_flags;
>  		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 5f0f0948a50e..a173f0025abd 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -181,6 +181,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
>  	/*
>  	 * vm_flags is protected by the mmap_lock held in write mode.
>  	 */
> +	vma_mark_locked(vma);
>  	vma->vm_flags = new_flags;
>  	if (!vma->vm_file) {
>  		error = replace_anon_vma_name(vma, anon_name);
> diff --git a/mm/mlock.c b/mm/mlock.c
> index b14e929084cc..f62e1a4d05f2 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -380,6 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
>  	 */
>  	if (newflags & VM_LOCKED)
>  		newflags |= VM_IO;
> +	vma_mark_locked(vma);
>  	WRITE_ONCE(vma->vm_flags, newflags);
>  
>  	lru_add_drain();
> @@ -456,6 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  
>  	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
>  		/* No work to do, and mlocking twice would be wrong */
> +		vma_mark_locked(vma);
>  		vma->vm_flags = newflags;
>  	} else {
>  		mlock_vma_pages_range(vma, start, end, newflags);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 693e6776be39..f89c9b058105 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  out:
>  	perf_event_mmap(vma);
>  
> +	vma_mark_locked(vma);
>  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>  	if (vm_flags & VM_LOCKED) {
>  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||

I guess this doesn't really have an impact, but the call to vma_mark_locked(vma)
could be done only in the case where the vm_flags field is actually touched.
Something like this:

	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
					is_vm_hugetlb_page(vma) ||
-					vma == get_gate_vma(current->mm))
+					vma == get_gate_vma(current->mm)) {
+			vma_mark_locked(vma);
			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
-		else
+		} else
			mm->locked_vm += (len >> PAGE_SHIFT);
	}


> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index bc6bddd156ca..df47fc21b0e4 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -621,6 +621,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  	 * vm_flags and vm_page_prot are protected by the mmap_lock
>  	 * held in write mode.
>  	 */
> +	vma_mark_locked(vma);
>  	vma->vm_flags = newflags;
>  	/*
>  	 * We want to check manually if we can change individual PTEs writable
Suren Baghdasaryan Sept. 6, 2022, 7 p.m. UTC | #2
On Tue, Sep 6, 2022 at 7:27 AM Laurent Dufour <ldufour@linux.ibm.com> wrote:
>
> On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> > VMA flag modifications should be done under VMA lock to prevent concurrent
> > page fault handling in that area.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/proc/task_mmu.c | 1 +
> >  fs/userfaultfd.c   | 6 ++++++
> >  mm/madvise.c       | 1 +
> >  mm/mlock.c         | 2 ++
> >  mm/mmap.c          | 1 +
> >  mm/mprotect.c      | 1 +
> >  6 files changed, 12 insertions(+)
>
> There are a few changes also done in driver space, for instance:
>
> *** arch/x86/kernel/cpu/sgx/driver.c:
> sgx_mmap[98]                   vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND |
> VM_DONTDUMP | VM_IO;
> *** arch/x86/kernel/cpu/sgx/virt.c:
> sgx_vepc_mmap[108]             vma->vm_flags |= VM_PFNMAP | VM_IO |
> VM_DONTDUMP | VM_DONTCOPY;
> *** drivers/dax/device.c:
> dax_mmap[311]                  vma->vm_flags |= VM_HUGEPAGE;
>
> I guess these changes to vm_flags should be protected as well, or at least
> checked one by one.

Thanks for noting these! I'll add necessary locking here and will look
for other places I might have missed.

>
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 4e0023643f8b..ceffa5c2c650 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -1285,6 +1285,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> >                       for (vma = mm->mmap; vma; vma = vma->vm_next) {
> >                               if (!(vma->vm_flags & VM_SOFTDIRTY))
> >                                       continue;
> > +                             vma_mark_locked(vma);
> >                               vma->vm_flags &= ~VM_SOFTDIRTY;
> >                               vma_set_page_prot(vma);
> >                       }
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 175de70e3adf..fe557b3d1c07 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -620,6 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> >               mmap_write_lock(mm);
> >               for (vma = mm->mmap; vma; vma = vma->vm_next)
> >                       if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> > +                             vma_mark_locked(vma);
> >                               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >                               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> >                       }
> > @@ -653,6 +654,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> >
> >       octx = vma->vm_userfaultfd_ctx.ctx;
> >       if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> > +             vma_mark_locked(vma);
> >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> >               return 0;
> > @@ -734,6 +736,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> >               atomic_inc(&ctx->mmap_changing);
> >       } else {
> >               /* Drop uffd context if remap feature not enabled */
> > +             vma_mark_locked(vma);
> >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> >       }
> > @@ -891,6 +894,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
> >                       vma = prev;
> >               else
> >                       prev = vma;
> > +             vma_mark_locked(vma);
> >               vma->vm_flags = new_flags;
> >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >       }
> > @@ -1449,6 +1453,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >                * the next vma was merged into the current one and
> >                * the current one has not been updated yet.
> >                */
> > +             vma_mark_locked(vma);
> >               vma->vm_flags = new_flags;
> >               vma->vm_userfaultfd_ctx.ctx = ctx;
> >
> > @@ -1630,6 +1635,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> >                * the next vma was merged into the current one and
> >                * the current one has not been updated yet.
> >                */
> > +             vma_mark_locked(vma);
> >               vma->vm_flags = new_flags;
> >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 5f0f0948a50e..a173f0025abd 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -181,6 +181,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> >       /*
> >        * vm_flags is protected by the mmap_lock held in write mode.
> >        */
> > +     vma_mark_locked(vma);
> >       vma->vm_flags = new_flags;
> >       if (!vma->vm_file) {
> >               error = replace_anon_vma_name(vma, anon_name);
> > diff --git a/mm/mlock.c b/mm/mlock.c
> > index b14e929084cc..f62e1a4d05f2 100644
> > --- a/mm/mlock.c
> > +++ b/mm/mlock.c
> > @@ -380,6 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
> >        */
> >       if (newflags & VM_LOCKED)
> >               newflags |= VM_IO;
> > +     vma_mark_locked(vma);
> >       WRITE_ONCE(vma->vm_flags, newflags);
> >
> >       lru_add_drain();
> > @@ -456,6 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >
> >       if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
> >               /* No work to do, and mlocking twice would be wrong */
> > +             vma_mark_locked(vma);
> >               vma->vm_flags = newflags;
> >       } else {
> >               mlock_vma_pages_range(vma, start, end, newflags);
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 693e6776be39..f89c9b058105 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  out:
> >       perf_event_mmap(vma);
> >
> > +     vma_mark_locked(vma);
> >       vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> >       if (vm_flags & VM_LOCKED) {
> >               if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
>
> I guess this doesn't really have an impact, but the call to vma_mark_locked(vma)
> could be done only in the case where the vm_flags field is actually touched.
> Something like this:
>
>         vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>         if (vm_flags & VM_LOCKED) {
>                 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
>                                         is_vm_hugetlb_page(vma) ||
> -                                       vma == get_gate_vma(current->mm))
> +                                       vma == get_gate_vma(current->mm)) {
> +                       vma_mark_locked(vma);
>                         vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> -               else
> +               } else
>                         mm->locked_vm += (len >> PAGE_SHIFT);
>         }
>
>
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index bc6bddd156ca..df47fc21b0e4 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -621,6 +621,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >        * vm_flags and vm_page_prot are protected by the mmap_lock
> >        * held in write mode.
> >        */
> > +     vma_mark_locked(vma);
> >       vma->vm_flags = newflags;
> >       /*
> >        * We want to check manually if we can change individual PTEs writable
>
Liam R. Howlett Sept. 6, 2022, 8 p.m. UTC | #3
* Suren Baghdasaryan <surenb@google.com> [220906 15:01]:
> On Tue, Sep 6, 2022 at 7:27 AM Laurent Dufour <ldufour@linux.ibm.com> wrote:
> >
> > On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> > > VMA flag modifications should be done under VMA lock to prevent concurrent
> > > page fault handling in that area.
> > >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > ---
> > >  fs/proc/task_mmu.c | 1 +
> > >  fs/userfaultfd.c   | 6 ++++++
> > >  mm/madvise.c       | 1 +
> > >  mm/mlock.c         | 2 ++
> > >  mm/mmap.c          | 1 +
> > >  mm/mprotect.c      | 1 +
> > >  6 files changed, 12 insertions(+)
> >
> > There are a few changes also done in driver space, for instance:
> >
> > *** arch/x86/kernel/cpu/sgx/driver.c:
> > sgx_mmap[98]                   vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND |
> > VM_DONTDUMP | VM_IO;
> > *** arch/x86/kernel/cpu/sgx/virt.c:
> > sgx_vepc_mmap[108]             vma->vm_flags |= VM_PFNMAP | VM_IO |
> > VM_DONTDUMP | VM_DONTCOPY;
> > *** drivers/dax/device.c:
> > dax_mmap[311]                  vma->vm_flags |= VM_HUGEPAGE;
> >
> > I guess these changes to vm_flags should be protected as well, or at least
> > checked one by one.
> 
> Thanks for noting these! I'll add necessary locking here and will look
> for other places I might have missed.

Would an inline set/clear bit function be worthwhile for vm_flags?  If
it is, then a name change to vm_flags may get the compiler to catch any
missed cases.  There don't seem to be many cases (12 inserts) so maybe
not.

> 
> >
> > >
> > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > > index 4e0023643f8b..ceffa5c2c650 100644
> > > --- a/fs/proc/task_mmu.c
> > > +++ b/fs/proc/task_mmu.c
> > > @@ -1285,6 +1285,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> > >                       for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > >                               if (!(vma->vm_flags & VM_SOFTDIRTY))
> > >                                       continue;
> > > +                             vma_mark_locked(vma);
> > >                               vma->vm_flags &= ~VM_SOFTDIRTY;
> > >                               vma_set_page_prot(vma);
> > >                       }
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 175de70e3adf..fe557b3d1c07 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -620,6 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> > >               mmap_write_lock(mm);
> > >               for (vma = mm->mmap; vma; vma = vma->vm_next)
> > >                       if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> > > +                             vma_mark_locked(vma);
> > >                               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > >                               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > >                       }
> > > @@ -653,6 +654,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > >
> > >       octx = vma->vm_userfaultfd_ctx.ctx;
> > >       if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> > > +             vma_mark_locked(vma);
> > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > >               return 0;
> > > @@ -734,6 +736,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> > >               atomic_inc(&ctx->mmap_changing);
> > >       } else {
> > >               /* Drop uffd context if remap feature not enabled */
> > > +             vma_mark_locked(vma);
> > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > >       }
> > > @@ -891,6 +894,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
> > >                       vma = prev;
> > >               else
> > >                       prev = vma;
> > > +             vma_mark_locked(vma);
> > >               vma->vm_flags = new_flags;
> > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > >       }
> > > @@ -1449,6 +1453,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > >                * the next vma was merged into the current one and
> > >                * the current one has not been updated yet.
> > >                */
> > > +             vma_mark_locked(vma);
> > >               vma->vm_flags = new_flags;
> > >               vma->vm_userfaultfd_ctx.ctx = ctx;
> > >
> > > @@ -1630,6 +1635,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > >                * the next vma was merged into the current one and
> > >                * the current one has not been updated yet.
> > >                */
> > > +             vma_mark_locked(vma);
> > >               vma->vm_flags = new_flags;
> > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > >
> > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > index 5f0f0948a50e..a173f0025abd 100644
> > > --- a/mm/madvise.c
> > > +++ b/mm/madvise.c
> > > @@ -181,6 +181,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> > >       /*
> > >        * vm_flags is protected by the mmap_lock held in write mode.
> > >        */
> > > +     vma_mark_locked(vma);
> > >       vma->vm_flags = new_flags;
> > >       if (!vma->vm_file) {
> > >               error = replace_anon_vma_name(vma, anon_name);
> > > diff --git a/mm/mlock.c b/mm/mlock.c
> > > index b14e929084cc..f62e1a4d05f2 100644
> > > --- a/mm/mlock.c
> > > +++ b/mm/mlock.c
> > > @@ -380,6 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
> > >        */
> > >       if (newflags & VM_LOCKED)
> > >               newflags |= VM_IO;
> > > +     vma_mark_locked(vma);
> > >       WRITE_ONCE(vma->vm_flags, newflags);
> > >
> > >       lru_add_drain();
> > > @@ -456,6 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> > >
> > >       if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
> > >               /* No work to do, and mlocking twice would be wrong */
> > > +             vma_mark_locked(vma);
> > >               vma->vm_flags = newflags;
> > >       } else {
> > >               mlock_vma_pages_range(vma, start, end, newflags);
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 693e6776be39..f89c9b058105 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  out:
> > >       perf_event_mmap(vma);
> > >
> > > +     vma_mark_locked(vma);
> > >       vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> > >       if (vm_flags & VM_LOCKED) {
> > >               if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> >
> > I guess this doesn't really have an impact, but the call to vma_mark_locked(vma)
> > could be done only in the case where the vm_flags field is actually touched.
> > Something like this:
> >
> >         vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> >         if (vm_flags & VM_LOCKED) {
> >                 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> >                                         is_vm_hugetlb_page(vma) ||
> > -                                       vma == get_gate_vma(current->mm))
> > +                                       vma == get_gate_vma(current->mm)) {
> > +                       vma_mark_locked(vma);
> >                         vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> > -               else
> > +               } else
> >                         mm->locked_vm += (len >> PAGE_SHIFT);
> >         }
> >
> >
> > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > index bc6bddd156ca..df47fc21b0e4 100644
> > > --- a/mm/mprotect.c
> > > +++ b/mm/mprotect.c
> > > @@ -621,6 +621,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > >        * vm_flags and vm_page_prot are protected by the mmap_lock
> > >        * held in write mode.
> > >        */
> > > +     vma_mark_locked(vma);
> > >       vma->vm_flags = newflags;
> > >       /*
> > >        * We want to check manually if we can change individual PTEs writable
> >
Suren Baghdasaryan Sept. 6, 2022, 8:13 p.m. UTC | #4
On Tue, Sep 6, 2022 at 1:00 PM Liam Howlett <liam.howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [220906 15:01]:
> > On Tue, Sep 6, 2022 at 7:27 AM Laurent Dufour <ldufour@linux.ibm.com> wrote:
> > >
> > > On 01/09/2022 at 19:34, Suren Baghdasaryan wrote:
> > > > VMA flag modifications should be done under VMA lock to prevent concurrent
> > > > page fault handling in that area.
> > > >
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > > ---
> > > >  fs/proc/task_mmu.c | 1 +
> > > >  fs/userfaultfd.c   | 6 ++++++
> > > >  mm/madvise.c       | 1 +
> > > >  mm/mlock.c         | 2 ++
> > > >  mm/mmap.c          | 1 +
> > > >  mm/mprotect.c      | 1 +
> > > >  6 files changed, 12 insertions(+)
> > >
> > > There are a few changes also done in driver space, for instance:
> > >
> > > *** arch/x86/kernel/cpu/sgx/driver.c:
> > > sgx_mmap[98]                   vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND |
> > > VM_DONTDUMP | VM_IO;
> > > *** arch/x86/kernel/cpu/sgx/virt.c:
> > > sgx_vepc_mmap[108]             vma->vm_flags |= VM_PFNMAP | VM_IO |
> > > VM_DONTDUMP | VM_DONTCOPY;
> > > *** drivers/dax/device.c:
> > > dax_mmap[311]                  vma->vm_flags |= VM_HUGEPAGE;
> > >
> > > I guess these changes to vm_flags should be protected as well, or at least
> > > checked one by one.
> >
> > Thanks for noting these! I'll add necessary locking here and will look
> > for other places I might have missed.
>
> Would an inline set/clear bit function be worthwhile for vm_flags?  If
> it is, then a name change to vm_flags may get the compiler to catch any
> missed cases.  There don't seem to be many cases (12 inserts) so maybe
> not.

That would probably simplify the maintenance of the flags in the future,
and we can add vma_mark_locked() directly in the set/clear functions.
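
Something along these lines, maybe (a rough sketch only; the helper names here are made up and not part of this series):

	static inline void vma_set_vm_flags(struct vm_area_struct *vma,
					    unsigned long flags)
	{
		vma_mark_locked(vma);
		vma->vm_flags |= flags;
	}

	static inline void vma_clear_vm_flags(struct vm_area_struct *vma,
					      unsigned long flags)
	{
		vma_mark_locked(vma);
		vma->vm_flags &= ~flags;
	}

Renaming the field itself (e.g. to __vm_flags) would then let the compiler flag any direct writes that were missed, as you suggest.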

>
> >
> > >
> > > >
> > > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > > > index 4e0023643f8b..ceffa5c2c650 100644
> > > > --- a/fs/proc/task_mmu.c
> > > > +++ b/fs/proc/task_mmu.c
> > > > @@ -1285,6 +1285,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> > > >                       for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > > >                               if (!(vma->vm_flags & VM_SOFTDIRTY))
> > > >                                       continue;
> > > > +                             vma_mark_locked(vma);
> > > >                               vma->vm_flags &= ~VM_SOFTDIRTY;
> > > >                               vma_set_page_prot(vma);
> > > >                       }
> > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > > index 175de70e3adf..fe557b3d1c07 100644
> > > > --- a/fs/userfaultfd.c
> > > > +++ b/fs/userfaultfd.c
> > > > @@ -620,6 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> > > >               mmap_write_lock(mm);
> > > >               for (vma = mm->mmap; vma; vma = vma->vm_next)
> > > >                       if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
> > > > +                             vma_mark_locked(vma);
> > > >                               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > > >                               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > > >                       }
> > > > @@ -653,6 +654,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > > >
> > > >       octx = vma->vm_userfaultfd_ctx.ctx;
> > > >       if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > > >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > > >               return 0;
> > > > @@ -734,6 +736,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> > > >               atomic_inc(&ctx->mmap_changing);
> > > >       } else {
> > > >               /* Drop uffd context if remap feature not enabled */
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > > >               vma->vm_flags &= ~__VM_UFFD_FLAGS;
> > > >       }
> > > > @@ -891,6 +894,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
> > > >                       vma = prev;
> > > >               else
> > > >                       prev = vma;
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_flags = new_flags;
> > > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > > >       }
> > > > @@ -1449,6 +1453,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> > > >                * the next vma was merged into the current one and
> > > >                * the current one has not been updated yet.
> > > >                */
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_flags = new_flags;
> > > >               vma->vm_userfaultfd_ctx.ctx = ctx;
> > > >
> > > > @@ -1630,6 +1635,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> > > >                * the next vma was merged into the current one and
> > > >                * the current one has not been updated yet.
> > > >                */
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_flags = new_flags;
> > > >               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> > > >
> > > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > > index 5f0f0948a50e..a173f0025abd 100644
> > > > --- a/mm/madvise.c
> > > > +++ b/mm/madvise.c
> > > > @@ -181,6 +181,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> > > >       /*
> > > >        * vm_flags is protected by the mmap_lock held in write mode.
> > > >        */
> > > > +     vma_mark_locked(vma);
> > > >       vma->vm_flags = new_flags;
> > > >       if (!vma->vm_file) {
> > > >               error = replace_anon_vma_name(vma, anon_name);
> > > > diff --git a/mm/mlock.c b/mm/mlock.c
> > > > index b14e929084cc..f62e1a4d05f2 100644
> > > > --- a/mm/mlock.c
> > > > +++ b/mm/mlock.c
> > > > @@ -380,6 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
> > > >        */
> > > >       if (newflags & VM_LOCKED)
> > > >               newflags |= VM_IO;
> > > > +     vma_mark_locked(vma);
> > > >       WRITE_ONCE(vma->vm_flags, newflags);
> > > >
> > > >       lru_add_drain();
> > > > @@ -456,6 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > >
> > > >       if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
> > > >               /* No work to do, and mlocking twice would be wrong */
> > > > +             vma_mark_locked(vma);
> > > >               vma->vm_flags = newflags;
> > > >       } else {
> > > >               mlock_vma_pages_range(vma, start, end, newflags);
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 693e6776be39..f89c9b058105 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
> > > > @@ -1818,6 +1818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > >  out:
> > > >       perf_event_mmap(vma);
> > > >
> > > > +     vma_mark_locked(vma);
> > > >       vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> > > >       if (vm_flags & VM_LOCKED) {
> > > >               if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> > >
> > > I guess this doesn't really have an impact, but the call to vma_mark_locked(vma)
> > > could be done only in the case where the vm_flags field is actually touched.
> > > Something like this:
> > >
> > >         vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> > >         if (vm_flags & VM_LOCKED) {
> > >                 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> > >                                         is_vm_hugetlb_page(vma) ||
> > > -                                       vma == get_gate_vma(current->mm))
> > > +                                       vma == get_gate_vma(current->mm)) {
> > > +                       vma_mark_locked(vma);
> > >                         vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> > > -               else
> > > +               } else
> > >                         mm->locked_vm += (len >> PAGE_SHIFT);
> > >         }
> > >
> > >
> > > > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > > > index bc6bddd156ca..df47fc21b0e4 100644
> > > > --- a/mm/mprotect.c
> > > > +++ b/mm/mprotect.c
> > > > @@ -621,6 +621,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > > >        * vm_flags and vm_page_prot are protected by the mmap_lock
> > > >        * held in write mode.
> > > >        */
> > > > +     vma_mark_locked(vma);
> > > >       vma->vm_flags = newflags;
> > > >       /*
> > > >        * We want to check manually if we can change individual PTEs writable
> > >
>

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0023643f8b..ceffa5c2c650 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1285,6 +1285,7 @@  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
 				if (!(vma->vm_flags & VM_SOFTDIRTY))
 					continue;
+				vma_mark_locked(vma);
 				vma->vm_flags &= ~VM_SOFTDIRTY;
 				vma_set_page_prot(vma);
 			}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 175de70e3adf..fe557b3d1c07 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -620,6 +620,7 @@  static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		mmap_write_lock(mm);
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+				vma_mark_locked(vma);
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
@@ -653,6 +654,7 @@  int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma_mark_locked(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 		return 0;
@@ -734,6 +736,7 @@  void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 		atomic_inc(&ctx->mmap_changing);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
+		vma_mark_locked(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
@@ -891,6 +894,7 @@  static int userfaultfd_release(struct inode *inode, struct file *file)
 			vma = prev;
 		else
 			prev = vma;
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
@@ -1449,6 +1453,7 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx.ctx = ctx;
 
@@ -1630,6 +1635,7 @@  static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
+		vma_mark_locked(vma);
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..a173f0025abd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -181,6 +181,7 @@  static int madvise_update_vma(struct vm_area_struct *vma,
 	/*
 	 * vm_flags is protected by the mmap_lock held in write mode.
 	 */
+	vma_mark_locked(vma);
 	vma->vm_flags = new_flags;
 	if (!vma->vm_file) {
 		error = replace_anon_vma_name(vma, anon_name);
diff --git a/mm/mlock.c b/mm/mlock.c
index b14e929084cc..f62e1a4d05f2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -380,6 +380,7 @@  static void mlock_vma_pages_range(struct vm_area_struct *vma,
 	 */
 	if (newflags & VM_LOCKED)
 		newflags |= VM_IO;
+	vma_mark_locked(vma);
 	WRITE_ONCE(vma->vm_flags, newflags);
 
 	lru_add_drain();
@@ -456,6 +457,7 @@  static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
 		/* No work to do, and mlocking twice would be wrong */
+		vma_mark_locked(vma);
 		vma->vm_flags = newflags;
 	} else {
 		mlock_vma_pages_range(vma, start, end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 693e6776be39..f89c9b058105 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1818,6 +1818,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
 	perf_event_mmap(vma);
 
+	vma_mark_locked(vma);
 	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bc6bddd156ca..df47fc21b0e4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -621,6 +621,7 @@  mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
+	vma_mark_locked(vma);
 	vma->vm_flags = newflags;
 	/*
 	 * We want to check manually if we can change individual PTEs writable