[v2,4/9] fs/procfs: use per-VMA RCU-protected locking in PROCMAP_QUERY API

Message ID 20240524041032.1048094-5-andrii@kernel.org (mailing list archive)
State New
Series ioctl()-based API to query VMAs from /proc/<pid>/maps

Commit Message

Andrii Nakryiko May 24, 2024, 4:10 a.m. UTC
Attempt to use the RCU-protected per-VMA lock when looking up the
requested VMA, falling back to mmap_lock only if taking the per-VMA lock
fails. This is done so that querying VMAs doesn't interfere with other
critical tasks, like page fault handling.

This was suggested by the mm folks, and we make use of a newly added
internal API that works like find_vma() but tries to use the per-VMA lock.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 fs/proc/task_mmu.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

Comments

Liam R. Howlett May 24, 2024, 7:47 p.m. UTC | #1
* Andrii Nakryiko <andrii@kernel.org> [240524 00:10]:
> Attempt to use RCU-protected per-VAM lock when looking up requested VMA
> as much as possible, only falling back to mmap_lock if per-VMA lock
> failed. This is done so that querying of VMAs doesn't interfere with
> other critical tasks, like page fault handling.
> 
> This has been suggested by mm folks, and we make use of a newly added
> internal API that works like find_vma(), but tries to use per-VMA lock.

Thanks for doing this.

> 
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  fs/proc/task_mmu.c | 42 ++++++++++++++++++++++++++++++++++--------
>  1 file changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 8ad547efd38d..2b14d06d1def 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -389,12 +389,30 @@ static int pid_maps_open(struct inode *inode, struct file *file)
>  )
>  
>  static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> -						 unsigned long addr, u32 flags)
> +						 unsigned long addr, u32 flags,
> +						 bool *mm_locked)
>  {
>  	struct vm_area_struct *vma;
> +	bool mmap_locked;
> +
> +	*mm_locked = mmap_locked = false;
>  
>  next_vma:
> -	vma = find_vma(mm, addr);
> +	if (!mmap_locked) {
> +		/* if we haven't yet acquired mmap_lock, try to use less disruptive per-VMA */
> +		vma = find_and_lock_vma_rcu(mm, addr);
> +		if (IS_ERR(vma)) {

There is a chance that find_and_lock_vma_rcu() will return NULL in a
situation where NULL should never be possible.

If you follow the MAP_FIXED call to mmap(), you'll land in mmap_region(),
which does two operations: a munmap(), then the mmap().  Since this was
done behind the lock, it was fine.  Now that we're transitioning to rcu
readers, it's less ideal: there is a race where a reader will see that
transient gap.  In this implementation we may return NULL if the
MAP_FIXED mapping is at the end of the address space.
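
For illustration, the window looks roughly like this (a rough sketch of
the interleaving, not actual code paths):

	mmap(MAP_FIXED) writer              PROCMAP_QUERY rcu reader
	----------------------              -------------------------
	munmap() the old VMA at addr
	                                    find_and_lock_vma_rcu(mm, addr)
	                                      sees the transient gap, so it
	                                      returns the next VMA instead
	                                      (or NULL at the end of the
	                                      address space)
	map the new VMA at addr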

It might also cause issues if we are searching for a specific address
and end up skipping a VMA that is currently being inserted by MAP_FIXED.

The page fault handler doesn't have this issue as it looks for a
specific address then falls back to the lock if one is not found.
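
The fault-handler pattern, for comparison, looks roughly like the sketch
below (illustrative only; the wrapper name is made up, lock_vma_under_rcu()
is the existing per-VMA lookup used by the fault path):

/* Rough sketch of the fault-handler style lookup: exact address only,
 * falling back to mmap_lock when nothing can be locked under RCU.
 * (Hypothetical helper, not part of this patch.)
 */
static struct vm_area_struct *lookup_vma_at(struct mm_struct *mm,
					    unsigned long addr)
{
	struct vm_area_struct *vma;

	/* only succeeds if a VMA covers addr and its lock can be taken */
	vma = lock_vma_under_rcu(mm, addr);
	if (vma)
		return vma;

	/* no covering VMA (or contention): retry under mmap_lock */
	if (mmap_read_lock_killable(mm))
		return ERR_PTR(-EINTR);
	return find_vma(mm, addr);	/* caller must drop mmap_lock */
}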

This problem needs to be fixed prior to shifting the existing proc maps
file to using rcu read locks as well.  We have a solution that isn't
upstream or on the ML, but is being tested and will go upstream.

> +			/* failed to take per-VMA lock, fallback to mmap_lock */
> +			if (mmap_read_lock_killable(mm))
> +				return ERR_PTR(-EINTR);
> +
> +			*mm_locked = mmap_locked = true;
> +			vma = find_vma(mm, addr);

If you lock the vma here and then drop the mmap lock, you should be
able to simplify the code by avoiding passing the mmap_locked variable
around.

It also means we don't need an unlock_vma() call that suggests we are
ending the vma read but may actually be unlocking the mm.

This is exactly why I think we need a common pattern and infrastructure
to do this sort of walking.

Please have a look at the userfaultfd patches here [1].  Note that
vma_start_read() cannot be used inside the mmap_read_lock() critical
section.
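
For illustration, the fallback shape being suggested here (a rough
sketch modeled loosely on those userfaultfd patches; the helper name is
hypothetical) would be something like:

/* find the VMA under mmap_lock, take its read lock directly (since
 * vma_start_read() can't be used here), then drop mmap_lock so the rest
 * of the query only ever holds the per-VMA lock */
static struct vm_area_struct *query_vma_fallback(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	if (mmap_read_lock_killable(mm))
		return ERR_PTR(-EINTR);
	vma = find_vma(mm, addr);
	if (vma)
		down_read(&vma->vm_lock->lock);	/* pin before dropping mmap_lock */
	mmap_read_unlock(mm);
	return vma;	/* NULL if there is no VMA at or above addr */
}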

> +		}
> +	} else {
> +		/* if we have mmap_lock, get through the search as fast as possible */
> +		vma = find_vma(mm, addr);

I think the only way we get here is if we are contending on the mmap
lock.  This is actually where we should try to avoid holding the lock?

> +	}
>  
>  	/* no VMA found */
>  	if (!vma)
> @@ -428,18 +446,25 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
>  skip_vma:
>  	/*
>  	 * If the user needs closest matching VMA, keep iterating.
> +	 * But before we proceed we might need to unlock current VMA.
>  	 */
>  	addr = vma->vm_end;
> +	if (!mmap_locked)
> +		vma_end_read(vma);
>  	if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
>  		goto next_vma;
>  no_vma:
> -	mmap_read_unlock(mm);
> +	if (mmap_locked)
> +		mmap_read_unlock(mm);
>  	return ERR_PTR(-ENOENT);
>  }
>  
> -static void unlock_vma(struct vm_area_struct *vma)
> +static void unlock_vma(struct vm_area_struct *vma, bool mm_locked)

Confusing function name, since it may not be doing anything with the
vma lock.

>  {
> -	mmap_read_unlock(vma->vm_mm);
> +	if (mm_locked)
> +		mmap_read_unlock(vma->vm_mm);
> +	else
> +		vma_end_read(vma);
>  }
>  
>  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> @@ -447,6 +472,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
>  	struct procmap_query karg;
>  	struct vm_area_struct *vma;
>  	struct mm_struct *mm;
> +	bool mm_locked;
>  	const char *name = NULL;
>  	char *name_buf = NULL;
>  	__u64 usize;
> @@ -475,7 +501,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
>  	if (!mm || !mmget_not_zero(mm))
>  		return -ESRCH;
>  
> -	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
> +	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags, &mm_locked);
>  	if (IS_ERR(vma)) {
>  		mmput(mm);
>  		return PTR_ERR(vma);
> @@ -542,7 +568,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
>  	}
>  
>  	/* unlock vma/mm_struct and put mm_struct before copying data to user */
> -	unlock_vma(vma);
> +	unlock_vma(vma, mm_locked);
>  	mmput(mm);
>  
>  	if (karg.vma_name_size && copy_to_user((void __user *)karg.vma_name_addr,
> @@ -558,7 +584,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
>  	return 0;
>  
>  out:
> -	unlock_vma(vma);
> +	unlock_vma(vma, mm_locked);
>  	mmput(mm);
>  	kfree(name_buf);
>  	return err;
> -- 
> 2.43.0
> 

[1]. https://lore.kernel.org/linux-mm/20240215182756.3448972-5-lokeshgidra@google.com/

Thanks,
Liam
Andrii Nakryiko May 28, 2024, 8:36 p.m. UTC | #2
On Fri, May 24, 2024 at 12:48 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Andrii Nakryiko <andrii@kernel.org> [240524 00:10]:
> > Attempt to use RCU-protected per-VAM lock when looking up requested VMA
> > as much as possible, only falling back to mmap_lock if per-VMA lock
> > failed. This is done so that querying of VMAs doesn't interfere with
> > other critical tasks, like page fault handling.
> >
> > This has been suggested by mm folks, and we make use of a newly added
> > internal API that works like find_vma(), but tries to use per-VMA lock.
>
> Thanks for doing this.
>
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  fs/proc/task_mmu.c | 42 ++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 34 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 8ad547efd38d..2b14d06d1def 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -389,12 +389,30 @@ static int pid_maps_open(struct inode *inode, struct file *file)
> >  )
> >
> >  static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> > -                                              unsigned long addr, u32 flags)
> > +                                              unsigned long addr, u32 flags,
> > +                                              bool *mm_locked)
> >  {
> >       struct vm_area_struct *vma;
> > +     bool mmap_locked;
> > +
> > +     *mm_locked = mmap_locked = false;
> >
> >  next_vma:
> > -     vma = find_vma(mm, addr);
> > +     if (!mmap_locked) {
> > +             /* if we haven't yet acquired mmap_lock, try to use less disruptive per-VMA */
> > +             vma = find_and_lock_vma_rcu(mm, addr);
> > +             if (IS_ERR(vma)) {
>
> There is a chance that find_and_lock_vma_rcu() will return NULL when
> there should never be a NULL.
>
> If you follow the MAP_FIXED call to mmap(), you'll land in map_region()
> which does two operations: munmap(), then the mmap().  Since this was
> behind a lock, it was fine.  Now that we're transitioning to rcu
> readers, it's less ideal.  We have a race where we will see that gap.
> In this implementation we may return NULL if the MAP_FIXED is at the end
> of the address space.
>
> It might also cause issues if we are searching for a specific address
> and we will skip a VMA that is currently being inserted by MAP_FIXED.
>
> The page fault handler doesn't have this issue as it looks for a
> specific address then falls back to the lock if one is not found.
>
> This problem needs to be fixed prior to shifting the existing proc maps
> file to using rcu read locks as well.  We have a solution that isn't
> upstream or on the ML, but is being tested and will go upstream.

Ok, any ETA for that? Can it be retrofitted into
find_and_lock_vma_rcu() once the fix lands? It's not ideal, but I
think it's acceptable (for now) for this new API to have this race,
given it seems quite unlikely to be hit in practice.

Worst case, we can leave the per-VMA RCU-protected bits out until we
have this solution in place, and then add it back when ready.

>
> > +                     /* failed to take per-VMA lock, fallback to mmap_lock */
> > +                     if (mmap_read_lock_killable(mm))
> > +                             return ERR_PTR(-EINTR);
> > +
> > +                     *mm_locked = mmap_locked = true;
> > +                     vma = find_vma(mm, addr);
>
> If you lock the vma here then drop the mmap lock, then you should be
> able to simplify the code by avoiding the passing of the mmap_locked
> variable around.
>
> It also means we don't need to do an unlokc_vma() call, which indicates
> we are going to end the vma read but actually may be unlocking the mm.
>
> This is exactly why I think we need a common pattern and infrastructure
> to do this sort of walking.
>
> Please have a look at userfaultfd patches here [1].  Note that
> vma_start_read() cannot be used in the mmap_read_lock() critical
> section.

Ok, so you'd like me to do something like below, right?

vma = find_vma(mm, addr);
if (vma)
    down_read(&vma->vm_lock->lock);
mmap_read_unlock(mm);

... and for the rest of the logic, always assume we hold the per-VMA lock. ...


The problem here is that I don't think we can assume the per-VMA lock,
because it's gated by CONFIG_PER_VMA_LOCK, so we'll have to deal with
this mmap_locked flag either way. Or am I missing something?

I don't think the flag makes things that much worse, tbh, but I'm
happy to accommodate any better solution that would work regardless of
CONFIG_PER_VMA_LOCK.

>
> > +             }
> > +     } else {
> > +             /* if we have mmap_lock, get through the search as fast as possible */
> > +             vma = find_vma(mm, addr);
>
> I think the only way we get here is if we are contending on the mmap
> lock.  This is actually where we should try to avoid holding the lock?
>
> > +     }
> >
> >       /* no VMA found */
> >       if (!vma)
> > @@ -428,18 +446,25 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> >  skip_vma:
> >       /*
> >        * If the user needs closest matching VMA, keep iterating.
> > +      * But before we proceed we might need to unlock current VMA.
> >        */
> >       addr = vma->vm_end;
> > +     if (!mmap_locked)
> > +             vma_end_read(vma);
> >       if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
> >               goto next_vma;
> >  no_vma:
> > -     mmap_read_unlock(mm);
> > +     if (mmap_locked)
> > +             mmap_read_unlock(mm);
> >       return ERR_PTR(-ENOENT);
> >  }
> >
> > -static void unlock_vma(struct vm_area_struct *vma)
> > +static void unlock_vma(struct vm_area_struct *vma, bool mm_locked)
>
> Confusing function name, since it may not be doing anything with the
> vma lock.

Would "unlock_vma_or_mm()" be ok?

>
> >  {
> > -     mmap_read_unlock(vma->vm_mm);
> > +     if (mm_locked)
> > +             mmap_read_unlock(vma->vm_mm);
> > +     else
> > +             vma_end_read(vma);
> >  }
> >
> >  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> > @@ -447,6 +472,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> >       struct procmap_query karg;
> >       struct vm_area_struct *vma;
> >       struct mm_struct *mm;
> > +     bool mm_locked;
> >       const char *name = NULL;
> >       char *name_buf = NULL;
> >       __u64 usize;
> > @@ -475,7 +501,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> >       if (!mm || !mmget_not_zero(mm))
> >               return -ESRCH;
> >
> > -     vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
> > +     vma = query_matching_vma(mm, karg.query_addr, karg.query_flags, &mm_locked);
> >       if (IS_ERR(vma)) {
> >               mmput(mm);
> >               return PTR_ERR(vma);
> > @@ -542,7 +568,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> >       }
> >
> >       /* unlock vma/mm_struct and put mm_struct before copying data to user */
> > -     unlock_vma(vma);
> > +     unlock_vma(vma, mm_locked);
> >       mmput(mm);
> >
> >       if (karg.vma_name_size && copy_to_user((void __user *)karg.vma_name_addr,
> > @@ -558,7 +584,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
> >       return 0;
> >
> >  out:
> > -     unlock_vma(vma);
> > +     unlock_vma(vma, mm_locked);
> >       mmput(mm);
> >       kfree(name_buf);
> >       return err;
> > --
> > 2.43.0
> >
>
> [1]. https://lore.kernel.org/linux-mm/20240215182756.3448972-5-lokeshgidra@google.com/
>
> Thanks,
> Liam
Liam R. Howlett May 31, 2024, 1:37 p.m. UTC | #3
* Andrii Nakryiko <andrii.nakryiko@gmail.com> [240528 16:37]:
> On Fri, May 24, 2024 at 12:48 PM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Andrii Nakryiko <andrii@kernel.org> [240524 00:10]:
> > > Attempt to use RCU-protected per-VAM lock when looking up requested VMA
> > > as much as possible, only falling back to mmap_lock if per-VMA lock
> > > failed. This is done so that querying of VMAs doesn't interfere with
> > > other critical tasks, like page fault handling.
> > >
> > > This has been suggested by mm folks, and we make use of a newly added
> > > internal API that works like find_vma(), but tries to use per-VMA lock.
> >
> > Thanks for doing this.
> >
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > >  fs/proc/task_mmu.c | 42 ++++++++++++++++++++++++++++++++++--------
> > >  1 file changed, 34 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > > index 8ad547efd38d..2b14d06d1def 100644
> > > --- a/fs/proc/task_mmu.c
> > > +++ b/fs/proc/task_mmu.c
> > > @@ -389,12 +389,30 @@ static int pid_maps_open(struct inode *inode, struct file *file)
> > >  )
> > >
> > >  static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> > > -                                              unsigned long addr, u32 flags)
> > > +                                              unsigned long addr, u32 flags,
> > > +                                              bool *mm_locked)
> > >  {
> > >       struct vm_area_struct *vma;
> > > +     bool mmap_locked;
> > > +
> > > +     *mm_locked = mmap_locked = false;
> > >
> > >  next_vma:
> > > -     vma = find_vma(mm, addr);
> > > +     if (!mmap_locked) {
> > > +             /* if we haven't yet acquired mmap_lock, try to use less disruptive per-VMA */
> > > +             vma = find_and_lock_vma_rcu(mm, addr);
> > > +             if (IS_ERR(vma)) {
> >
> > There is a chance that find_and_lock_vma_rcu() will return NULL when
> > there should never be a NULL.
> >
> > If you follow the MAP_FIXED call to mmap(), you'll land in map_region()
> > which does two operations: munmap(), then the mmap().  Since this was
> > behind a lock, it was fine.  Now that we're transitioning to rcu
> > readers, it's less ideal.  We have a race where we will see that gap.
> > In this implementation we may return NULL if the MAP_FIXED is at the end
> > of the address space.
> >
> > It might also cause issues if we are searching for a specific address
> > and we will skip a VMA that is currently being inserted by MAP_FIXED.
> >
> > The page fault handler doesn't have this issue as it looks for a
> > specific address then falls back to the lock if one is not found.
> >
> > This problem needs to be fixed prior to shifting the existing proc maps
> > file to using rcu read locks as well.  We have a solution that isn't
> > upstream or on the ML, but is being tested and will go upstream.
> 
> Ok, any ETA for that? Can it be retrofitted into
> find_and_lock_vma_rcu() once the fix lands? It's not ideal, but I
> think it's acceptable (for now) for this new API to have this race,
> given it seems quite unlikely to be hit in practice.
> 
> Worst case, we can leave the per-VMA RCU-protected bits out until we
> have this solution in place, and then add it back when ready.

I've sent the patches to Suren for testing with the /proc/<pid>/maps work
he is doing, since he could recreate this issue, but I think he is busy
with other things.  They are isolated to the mm changes, so I can send
you the same patches to include in this patch set.  This does increase
the risk of issues with the patch set, so you can have a look and decide
how you want to proceed.

> 
> >
> > > +                     /* failed to take per-VMA lock, fallback to mmap_lock */
> > > +                     if (mmap_read_lock_killable(mm))
> > > +                             return ERR_PTR(-EINTR);
> > > +
> > > +                     *mm_locked = mmap_locked = true;
> > > +                     vma = find_vma(mm, addr);
> >
> > If you lock the vma here then drop the mmap lock, then you should be
> > able to simplify the code by avoiding the passing of the mmap_locked
> > variable around.
> >
> > It also means we don't need to do an unlokc_vma() call, which indicates
> > we are going to end the vma read but actually may be unlocking the mm.
> >
> > This is exactly why I think we need a common pattern and infrastructure
> > to do this sort of walking.
> >
> > Please have a look at userfaultfd patches here [1].  Note that
> > vma_start_read() cannot be used in the mmap_read_lock() critical
> > section.
> 
> Ok, so you'd like me to do something like below, right?
> 
> vma = find_vma(mm, addr);
> if (vma)
>     down_read(&vma->vm_lock->lock)
> mmap_read_unlock(mm);
> 
> ... and for the rest of logic always assume having per-VMA lock. ...
> 
> 
> The problem here is that I think we can't assume per-VMA lock, because
> it's gated by CONFIG_PER_VMA_LOCK, so I think we'll have to deal with
> this mmap_locked flag either way. Or am I missing anything?

Whether the per-VMA lock is used depends on CONFIG_PER_VMA_LOCK, so
that flag tells us which lock has been taken.

> 
> I don't think the flag makes things that much worse, tbh, but I'm
> happy to accommodate any better solution that would work regardless of
> CONFIG_PER_VMA_LOCK.
> 
> >
> > > +             }
> > > +     } else {
> > > +             /* if we have mmap_lock, get through the search as fast as possible */
> > > +             vma = find_vma(mm, addr);
> >
> > I think the only way we get here is if we are contending on the mmap
> > lock.  This is actually where we should try to avoid holding the lock?
> >
> > > +     }
> > >
> > >       /* no VMA found */
> > >       if (!vma)
> > > @@ -428,18 +446,25 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> > >  skip_vma:
> > >       /*
> > >        * If the user needs closest matching VMA, keep iterating.
> > > +      * But before we proceed we might need to unlock current VMA.
> > >        */
> > >       addr = vma->vm_end;
> > > +     if (!mmap_locked)
> > > +             vma_end_read(vma);
> > >       if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
> > >               goto next_vma;
> > >  no_vma:
> > > -     mmap_read_unlock(mm);
> > > +     if (mmap_locked)
> > > +             mmap_read_unlock(mm);
> > >       return ERR_PTR(-ENOENT);
> > >  }
> > >
> > > -static void unlock_vma(struct vm_area_struct *vma)
> > > +static void unlock_vma(struct vm_area_struct *vma, bool mm_locked)
> >
> > Confusing function name, since it may not be doing anything with the
> > vma lock.
> 
> Would "unlock_vma_or_mm()" be ok?

The approach that seemed clearest in the userfaultfd code
(mm/userfaultfd.c) focuses on what we are undoing rather than on which
lock we are unlocking.  Instead of saying "unlock one or the other" we
have uffd_mfill_unlock(), with two versions of that function taking the
same argument.  This way the same blocks of code call the same thing,
with a different lock/unlock happening based on the CONFIG_PER_VMA_LOCK
compile-time option.  If that makes sense to you, then I'd prefer it
over the other options - none are ideal.
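
Applied to this patch, that would look roughly like the sketch below
(hypothetical names, modeled on uffd_mfill_unlock(); it assumes the
lookup side always leaves exactly one lock held):

#ifdef CONFIG_PER_VMA_LOCK
/* the lookup ends up holding only the per-VMA read lock (mmap_lock is
 * dropped right after the fallback path takes the VMA lock) */
static void query_vma_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}
#else
/* without per-VMA locks the lookup simply keeps mmap_lock held */
static void query_vma_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif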

Note that people didn't like the "unlock_" name, even on static
functions, as it implies it can be used everywhere and may conflict with
a global function in the future [1].

[1] https://lore.kernel.org/linux-mm/20240426144506.1290619-4-willy@infradead.org/

Thanks,
Liam
Andrii Nakryiko May 31, 2024, 4:37 p.m. UTC | #4
On Fri, May 31, 2024 at 6:38 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Andrii Nakryiko <andrii.nakryiko@gmail.com> [240528 16:37]:
> > On Fri, May 24, 2024 at 12:48 PM Liam R. Howlett
> > <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Andrii Nakryiko <andrii@kernel.org> [240524 00:10]:
> > > > Attempt to use RCU-protected per-VAM lock when looking up requested VMA
> > > > as much as possible, only falling back to mmap_lock if per-VMA lock
> > > > failed. This is done so that querying of VMAs doesn't interfere with
> > > > other critical tasks, like page fault handling.
> > > >
> > > > This has been suggested by mm folks, and we make use of a newly added
> > > > internal API that works like find_vma(), but tries to use per-VMA lock.
> > >
> > > Thanks for doing this.
> > >
> > > >
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > ---
> > > >  fs/proc/task_mmu.c | 42 ++++++++++++++++++++++++++++++++++--------
> > > >  1 file changed, 34 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > > > index 8ad547efd38d..2b14d06d1def 100644
> > > > --- a/fs/proc/task_mmu.c
> > > > +++ b/fs/proc/task_mmu.c
> > > > @@ -389,12 +389,30 @@ static int pid_maps_open(struct inode *inode, struct file *file)
> > > >  )
> > > >
> > > >  static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> > > > -                                              unsigned long addr, u32 flags)
> > > > +                                              unsigned long addr, u32 flags,
> > > > +                                              bool *mm_locked)
> > > >  {
> > > >       struct vm_area_struct *vma;
> > > > +     bool mmap_locked;
> > > > +
> > > > +     *mm_locked = mmap_locked = false;
> > > >
> > > >  next_vma:
> > > > -     vma = find_vma(mm, addr);
> > > > +     if (!mmap_locked) {
> > > > +             /* if we haven't yet acquired mmap_lock, try to use less disruptive per-VMA */
> > > > +             vma = find_and_lock_vma_rcu(mm, addr);
> > > > +             if (IS_ERR(vma)) {
> > >
> > > There is a chance that find_and_lock_vma_rcu() will return NULL when
> > > there should never be a NULL.
> > >
> > > If you follow the MAP_FIXED call to mmap(), you'll land in map_region()
> > > which does two operations: munmap(), then the mmap().  Since this was
> > > behind a lock, it was fine.  Now that we're transitioning to rcu
> > > readers, it's less ideal.  We have a race where we will see that gap.
> > > In this implementation we may return NULL if the MAP_FIXED is at the end
> > > of the address space.
> > >
> > > It might also cause issues if we are searching for a specific address
> > > and we will skip a VMA that is currently being inserted by MAP_FIXED.
> > >
> > > The page fault handler doesn't have this issue as it looks for a
> > > specific address then falls back to the lock if one is not found.
> > >
> > > This problem needs to be fixed prior to shifting the existing proc maps
> > > file to using rcu read locks as well.  We have a solution that isn't
> > > upstream or on the ML, but is being tested and will go upstream.
> >
> > Ok, any ETA for that? Can it be retrofitted into
> > find_and_lock_vma_rcu() once the fix lands? It's not ideal, but I
> > think it's acceptable (for now) for this new API to have this race,
> > given it seems quite unlikely to be hit in practice.
> >
> > Worst case, we can leave the per-VMA RCU-protected bits out until we
> > have this solution in place, and then add it back when ready.
>
> I've sent the patches to Suren for testing on the /proc/<pid>/maps he is
> doing as he could recreate this issue, but I think he is busy with other
> things.  They are isolated to the mm changes so I can send you the same
> patches to include in this patch set.  This does increase the risk of
> issues with the patch set, so you can have a look and decide how you
> want to proceed.

Please do send them; without seeing them it's hard to make a judgement
call on whether it's ok to proceed without them. If the changes are
transparent to the rest of my changes in this patch set, of course I'd
prefer to keep this simpler and land them separately.

>
> >
> > >
> > > > +                     /* failed to take per-VMA lock, fallback to mmap_lock */
> > > > +                     if (mmap_read_lock_killable(mm))
> > > > +                             return ERR_PTR(-EINTR);
> > > > +
> > > > +                     *mm_locked = mmap_locked = true;
> > > > +                     vma = find_vma(mm, addr);
> > >
> > > If you lock the vma here then drop the mmap lock, then you should be
> > > able to simplify the code by avoiding the passing of the mmap_locked
> > > variable around.
> > >
> > > It also means we don't need to do an unlokc_vma() call, which indicates
> > > we are going to end the vma read but actually may be unlocking the mm.
> > >
> > > This is exactly why I think we need a common pattern and infrastructure
> > > to do this sort of walking.
> > >
> > > Please have a look at userfaultfd patches here [1].  Note that
> > > vma_start_read() cannot be used in the mmap_read_lock() critical
> > > section.
> >
> > Ok, so you'd like me to do something like below, right?
> >
> > vma = find_vma(mm, addr);
> > if (vma)
> >     down_read(&vma->vm_lock->lock)
> > mmap_read_unlock(mm);
> >
> > ... and for the rest of logic always assume having per-VMA lock. ...
> >
> >
> > The problem here is that I think we can't assume per-VMA lock, because
> > it's gated by CONFIG_PER_VMA_LOCK, so I think we'll have to deal with
> > this mmap_locked flag either way. Or am I missing anything?
>
> The per-vma lock being used depends on the CONFIG_PER_VMA_LOCK, so that
> flag tells us which lock has been taken.
>
> >
> > I don't think the flag makes things that much worse, tbh, but I'm
> > happy to accommodate any better solution that would work regardless of
> > CONFIG_PER_VMA_LOCK.
> >
> > >
> > > > +             }
> > > > +     } else {
> > > > +             /* if we have mmap_lock, get through the search as fast as possible */
> > > > +             vma = find_vma(mm, addr);
> > >
> > > I think the only way we get here is if we are contending on the mmap
> > > lock.  This is actually where we should try to avoid holding the lock?
> > >
> > > > +     }
> > > >
> > > >       /* no VMA found */
> > > >       if (!vma)
> > > > @@ -428,18 +446,25 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
> > > >  skip_vma:
> > > >       /*
> > > >        * If the user needs closest matching VMA, keep iterating.
> > > > +      * But before we proceed we might need to unlock current VMA.
> > > >        */
> > > >       addr = vma->vm_end;
> > > > +     if (!mmap_locked)
> > > > +             vma_end_read(vma);
> > > >       if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
> > > >               goto next_vma;
> > > >  no_vma:
> > > > -     mmap_read_unlock(mm);
> > > > +     if (mmap_locked)
> > > > +             mmap_read_unlock(mm);
> > > >       return ERR_PTR(-ENOENT);
> > > >  }
> > > >
> > > > -static void unlock_vma(struct vm_area_struct *vma)
> > > > +static void unlock_vma(struct vm_area_struct *vma, bool mm_locked)
> > >
> > > Confusing function name, since it may not be doing anything with the
> > > vma lock.
> >
> > Would "unlock_vma_or_mm()" be ok?
>
> The way that seemed most clear in the userfaultfd code
> (/mm/userfaultfd.c), seemed to focus on what we were undoing instead of
> the lock we were unlocking.  Instead of saying "unlock one or the other"
> we have "uffd_mfill_unlock()", and have two versions of that function
> that take the same argument.  This way we can have the same blocks of
> code calling the same thing, with a different lock/unlock happening
> based on the CONFIG_PER_VMA_LOCK compile time option.  If that makes
> sense to you, then I'd prefer it over the other options - none are
> ideal.

ok, two implementations of the same function sounds fine to me, I'll
do that in the next revision, thanks!

>
> Note that people didn't like the "unlock_" name, even on static
> functions as it implies it can be used everywhere and may conflict with
> a global function in the future [1].

yep, ack

>
> [1] https://lore.kernel.org/linux-mm/20240426144506.1290619-4-willy@infradead.org/
>
> Thanks,
> Liam

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8ad547efd38d..2b14d06d1def 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -389,12 +389,30 @@  static int pid_maps_open(struct inode *inode, struct file *file)
 )
 
 static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
-						 unsigned long addr, u32 flags)
+						 unsigned long addr, u32 flags,
+						 bool *mm_locked)
 {
 	struct vm_area_struct *vma;
+	bool mmap_locked;
+
+	*mm_locked = mmap_locked = false;
 
 next_vma:
-	vma = find_vma(mm, addr);
+	if (!mmap_locked) {
+		/* if we haven't yet acquired mmap_lock, try to use less disruptive per-VMA */
+		vma = find_and_lock_vma_rcu(mm, addr);
+		if (IS_ERR(vma)) {
+			/* failed to take per-VMA lock, fallback to mmap_lock */
+			if (mmap_read_lock_killable(mm))
+				return ERR_PTR(-EINTR);
+
+			*mm_locked = mmap_locked = true;
+			vma = find_vma(mm, addr);
+		}
+	} else {
+		/* if we have mmap_lock, get through the search as fast as possible */
+		vma = find_vma(mm, addr);
+	}
 
 	/* no VMA found */
 	if (!vma)
@@ -428,18 +446,25 @@  static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
 skip_vma:
 	/*
 	 * If the user needs closest matching VMA, keep iterating.
+	 * But before we proceed we might need to unlock current VMA.
 	 */
 	addr = vma->vm_end;
+	if (!mmap_locked)
+		vma_end_read(vma);
 	if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
 		goto next_vma;
 no_vma:
-	mmap_read_unlock(mm);
+	if (mmap_locked)
+		mmap_read_unlock(mm);
 	return ERR_PTR(-ENOENT);
 }
 
-static void unlock_vma(struct vm_area_struct *vma)
+static void unlock_vma(struct vm_area_struct *vma, bool mm_locked)
 {
-	mmap_read_unlock(vma->vm_mm);
+	if (mm_locked)
+		mmap_read_unlock(vma->vm_mm);
+	else
+		vma_end_read(vma);
 }
 
 static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
@@ -447,6 +472,7 @@  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
+	bool mm_locked;
 	const char *name = NULL;
 	char *name_buf = NULL;
 	__u64 usize;
@@ -475,7 +501,7 @@  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;
 
-	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags, &mm_locked);
 	if (IS_ERR(vma)) {
 		mmput(mm);
 		return PTR_ERR(vma);
@@ -542,7 +568,7 @@  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	}
 
 	/* unlock vma/mm_struct and put mm_struct before copying data to user */
-	unlock_vma(vma);
+	unlock_vma(vma, mm_locked);
 	mmput(mm);
 
 	if (karg.vma_name_size && copy_to_user((void __user *)karg.vma_name_addr,
@@ -558,7 +584,7 @@  static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	return 0;
 
 out:
-	unlock_vma(vma);
+	unlock_vma(vma, mm_locked);
 	mmput(mm);
 	kfree(name_buf);
 	return err;