diff mbox series

[v6,10/16] mm: replace vm_lock and detached flag with a reference count

Message ID 20241216192419.2970941-11-surenb@google.com (mailing list archive)
State New
Headers show
Series move per-vma lock into vm_area_struct | expand

Commit Message

Suren Baghdasaryan Dec. 16, 2024, 7:24 p.m. UTC
rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space for each vm_area_struct. However vma_lock has
two important specifics which can be used to replace rw_semaphore
with a simpler structure:
1. Readers never wait. They try to take the vma_lock and fall back to
mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock
because writers first take mmap_lock in write mode.
Because of these requirements, full rw_semaphore functionality is not
needed and we can replace rw_semaphore and the vma->detached flag with
a refcount (vm_refcnt).
When vma is in detached state, vm_refcnt is 0 and only a call to
vma_mark_attached() can take it out of this state. Note that unlike
before, now we enforce both vma_mark_attached() and vma_mark_detached()
to be done only after vma has been write-locked. vma_mark_attached()
changes vm_refcnt to 1 to indicate that it has been attached to the vma
tree. When a reader takes read lock, it increments vm_refcnt, unless the
top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
a writer. When writer takes write lock, it both increments vm_refcnt and
sets the top usable bit to indicate its presence. If there are readers,
writer will wait using newly introduced mm->vma_writer_wait. Since all
writers take mmap_lock in write mode first, there can be only one writer
at a time. The last reader to release the lock will signal the writer
to wake up.
refcount might overflow if there are many competing readers, in which case
read-locking will fail. Readers are expected to handle such failures.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h               | 95 ++++++++++++++++++++++++--------
 include/linux/mm_types.h         | 23 ++++----
 kernel/fork.c                    |  9 +--
 mm/init-mm.c                     |  1 +
 mm/memory.c                      | 33 +++++++----
 tools/testing/vma/linux/atomic.h |  5 ++
 tools/testing/vma/vma_internal.h | 57 ++++++++++---------
 7 files changed, 147 insertions(+), 76 deletions(-)

Comments

Peter Zijlstra Dec. 16, 2024, 8:42 p.m. UTC | #1
On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * after it has been unlocked.
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
> +	if (oldcnt & VMA_STATE_LOCKED ||
> +	    unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {

You likely want that unlikely to cover both conditions :-)

> +		vma_refcount_put(vma);
>  		return false;
>  	}
> +
>  	return true;
>  }
Suren Baghdasaryan Dec. 16, 2024, 8:53 p.m. UTC | #2
On Mon, Dec 16, 2024 at 12:42 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> > @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >        * after it has been unlocked.
> >        * This pairs with RELEASE semantics in vma_end_write_all().
> >        */
> > +     if (oldcnt & VMA_STATE_LOCKED ||
> > +         unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
>
> You likely want that unlikely to cover both conditions :-)

True. VMA_STATE_LOCKED is set only while the writer is updating the
vm_lock_seq and that's a narrow window. I'll make that change in the
next revision. Thanks!

>
> > +             vma_refcount_put(vma);
> >               return false;
> >       }
> > +
> >       return true;
> >  }
Peter Zijlstra Dec. 16, 2024, 9:15 p.m. UTC | #3
On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:

FWIW, I find the whole VMA_STATE_{A,DE}TATCHED thing awkward. And
perhaps s/VMA_STATE_LOCKED/VMA_LOCK_OFFSET/ ?

Also, perhaps:

#define VMA_REF_LIMIT	(VMA_LOCK_OFFSET - 2)

> @@ -699,10 +700,27 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
>  #ifdef CONFIG_PER_VMA_LOCK
>  static inline void vma_lock_init(struct vm_area_struct *vma)
>  {
> -	init_rwsem(&vma->vm_lock.lock);
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	static struct lock_class_key lockdep_key;
> +
> +	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
> +#endif
> +	refcount_set(&vma->vm_refcnt, VMA_STATE_DETACHED);
>  	vma->vm_lock_seq = UINT_MAX;

Depending on how you do the actual allocation (GFP_ZERO) you might want
to avoid that vm_refcount store entirely.

Perhaps instead write: VM_WARN_ON(refcount_read(&vma->vm_refcnt));

> @@ -813,25 +849,42 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>  
>  static inline void vma_assert_locked(struct vm_area_struct *vma)
>  {
> -	if (!rwsem_is_locked(&vma->vm_lock.lock))
> +	if (refcount_read(&vma->vm_refcnt) <= VMA_STATE_ATTACHED)
	if (is_vma_detached(vma))

>  		vma_assert_write_locked(vma);
>  }
>  
> -static inline void vma_mark_attached(struct vm_area_struct *vma)
> +/*
> + * WARNING: to avoid racing with vma_mark_attached(), should be called either
> + * under mmap_write_lock or when the object has been isolated under
> + * mmap_write_lock, ensuring no competing writers.
> + */
> +static inline bool is_vma_detached(struct vm_area_struct *vma)
>  {
> -	vma->detached = false;
> +	return refcount_read(&vma->vm_refcnt) == VMA_STATE_DETACHED;
	return !refcount_read(&vma->vm_refcnt);
>  }
>  
> -static inline void vma_mark_detached(struct vm_area_struct *vma)
> +static inline void vma_mark_attached(struct vm_area_struct *vma)
>  {
> -	/* When detaching vma should be write-locked */
>  	vma_assert_write_locked(vma);
> -	vma->detached = true;
> +
> +	if (is_vma_detached(vma))
> +		refcount_set(&vma->vm_refcnt, VMA_STATE_ATTACHED);

Urgh, so it would be really good to not call this at all them not 0.
I've not tried to untangle the mess, but this is really awkward. Surely
you don't add it to the mas multiple times either.

Also:

	refcount_set(&vma->vm_refcnt, 1);

is so much clearer.

That is, should this not live in vma_iter_store*(), right before
mas_store_gfp() ?

Also, ISTR having to set vm_lock_seq right before it?

>  }
>  
> -static inline bool is_vma_detached(struct vm_area_struct *vma)
> +static inline void vma_mark_detached(struct vm_area_struct *vma)
>  {
> -	return vma->detached;
> +	vma_assert_write_locked(vma);
> +
> +	if (is_vma_detached(vma))
> +		return;

Again, this just reads like confusion :/ Surely you don't have the same
with mas_detach?

> +
> +	/* We are the only writer, so no need to use vma_refcount_put(). */
> +	if (!refcount_dec_and_test(&vma->vm_refcnt)) {
> +		/*
> +		 * Reader must have temporarily raised vm_refcnt but it will
> +		 * drop it without using the vma since vma is write-locked.
> +		 */
> +	}
>  }
Peter Zijlstra Dec. 16, 2024, 9:37 p.m. UTC | #4
On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> +static inline void vma_refcount_put(struct vm_area_struct *vma)
> +{
> +	int refcnt;
> +
> +	if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +
> +		if (refcnt & VMA_STATE_LOCKED)
> +			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> +	}
> +}
> +
>  /*
>   * Try to read-lock a vma. The function is allowed to occasionally yield false
>   * locked result to avoid performance overhead, in which case we fall back to
> @@ -710,6 +728,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
>   */
>  static inline bool vma_start_read(struct vm_area_struct *vma)
>  {
> +	int oldcnt;
> +
>  	/*
>  	 * Check before locking. A race might cause false locked result.
>  	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> @@ -720,13 +740,20 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
>  		return false;
>  
> +
> +	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> +	/* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
> +	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> +						      VMA_STATE_LOCKED - 2))) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>  		return false;
> +	}
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>  
>  	/*
> +	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
>  	 * False unlocked result is impossible because we modify and check
> +	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
>  	 * modification invalidates all existing locks.
>  	 *
>  	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * after it has been unlocked.
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
> +	if (oldcnt & VMA_STATE_LOCKED ||
> +	    unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> +		vma_refcount_put(vma);

Suppose we have detach race with a concurrent RCU lookup like:

					vma = mas_lookup();

	vma_start_write();
	mas_detach();
					vma_start_read()
					rwsem_acquire_read()
					inc // success
	vma_mark_detach();
	dec_and_test // assumes 1->0
		     // is actually 2->1

					if (vm_lock_seq == vma->vm_mm_mm_lock_seq) // true
					  vma_refcount_put
					    dec_and_test() // 1->0
					      *NO* rwsem_release()



>  		return false;
>  	}
> +
>  	return true;
>  }
Suren Baghdasaryan Dec. 16, 2024, 9:44 p.m. UTC | #5
On Mon, Dec 16, 2024 at 1:38 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > +{
> > +     int refcnt;
> > +
> > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
> > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > +
> > +             if (refcnt & VMA_STATE_LOCKED)
> > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> > +     }
> > +}
> > +
> >  /*
> >   * Try to read-lock a vma. The function is allowed to occasionally yield false
> >   * locked result to avoid performance overhead, in which case we fall back to
> > @@ -710,6 +728,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
> >   */
> >  static inline bool vma_start_read(struct vm_area_struct *vma)
> >  {
> > +     int oldcnt;
> > +
> >       /*
> >        * Check before locking. A race might cause false locked result.
> >        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> > @@ -720,13 +740,20 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> >               return false;
> >
> > +
> > +     rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > +     /* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
> > +     if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> > +                                                   VMA_STATE_LOCKED - 2))) {
> > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> >               return false;
> > +     }
> > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> >
> >       /*
> > +      * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
> >        * False unlocked result is impossible because we modify and check
> > +      * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
> >        * modification invalidates all existing locks.
> >        *
> >        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> > @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >        * after it has been unlocked.
> >        * This pairs with RELEASE semantics in vma_end_write_all().
> >        */
> > +     if (oldcnt & VMA_STATE_LOCKED ||
> > +         unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> > +             vma_refcount_put(vma);
>
> Suppose we have detach race with a concurrent RCU lookup like:
>
>                                         vma = mas_lookup();
>
>         vma_start_write();
>         mas_detach();
>                                         vma_start_read()
>                                         rwsem_acquire_read()
>                                         inc // success
>         vma_mark_detach();
>         dec_and_test // assumes 1->0
>                      // is actually 2->1
>
>                                         if (vm_lock_seq == vma->vm_mm_mm_lock_seq) // true
>                                           vma_refcount_put
>                                             dec_and_test() // 1->0
>                                               *NO* rwsem_release()
>

Yes, this is possible. I think that's not a problem until we start
reusing the vmas and I deal with this race later in this patchset.
I think what you described here is the same race I mention in the
description of this patch:
https://lore.kernel.org/all/20241216192419.2970941-14-surenb@google.com/
I introduce vma_ensure_detached() in that patch to handle this case
and ensure that vmas are detached before they are returned into the
slab cache for reuse. Does that make sense?


>
>
> >               return false;
> >       }
> > +
> >       return true;
> >  }
Suren Baghdasaryan Dec. 16, 2024, 9:53 p.m. UTC | #6
On Mon, Dec 16, 2024 at 1:15 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
>
> FWIW, I find the whole VMA_STATE_{A,DE}TATCHED thing awkward. And

I'm bad with naming things, so any better suggestions are welcome.
Are you suggesting to drop VMA_STATE_{A,DE}TATCHED nomenclature and
use 0/1 values directly?

> perhaps s/VMA_STATE_LOCKED/VMA_LOCK_OFFSET/ ?

Sounds good. I'll change it to VMA_LOCK_OFFSET.

>
> Also, perhaps:
>
> #define VMA_REF_LIMIT   (VMA_LOCK_OFFSET - 2)

Ack.

>
> > @@ -699,10 +700,27 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
> >  #ifdef CONFIG_PER_VMA_LOCK
> >  static inline void vma_lock_init(struct vm_area_struct *vma)
> >  {
> > -     init_rwsem(&vma->vm_lock.lock);
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +     static struct lock_class_key lockdep_key;
> > +
> > +     lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
> > +#endif
> > +     refcount_set(&vma->vm_refcnt, VMA_STATE_DETACHED);
> >       vma->vm_lock_seq = UINT_MAX;
>
> Depending on how you do the actual allocation (GFP_ZERO) you might want
> to avoid that vm_refcount store entirely.

I think we could initialize it to 0 in the slab cache constructor and
when vma is freed we already ensure it's 0. So, even when reused it
will be in the correct 0 state.

>
> Perhaps instead write: VM_WARN_ON(refcount_read(&vma->vm_refcnt));

Yes, with the above approach that should work.

>
> > @@ -813,25 +849,42 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> >
> >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> >  {
> > -     if (!rwsem_is_locked(&vma->vm_lock.lock))
> > +     if (refcount_read(&vma->vm_refcnt) <= VMA_STATE_ATTACHED)
>         if (is_vma_detached(vma))
>
> >               vma_assert_write_locked(vma);
> >  }
> >
> > -static inline void vma_mark_attached(struct vm_area_struct *vma)
> > +/*
> > + * WARNING: to avoid racing with vma_mark_attached(), should be called either
> > + * under mmap_write_lock or when the object has been isolated under
> > + * mmap_write_lock, ensuring no competing writers.
> > + */
> > +static inline bool is_vma_detached(struct vm_area_struct *vma)
> >  {
> > -     vma->detached = false;
> > +     return refcount_read(&vma->vm_refcnt) == VMA_STATE_DETACHED;
>         return !refcount_read(&vma->vm_refcnt);
> >  }
> >
> > -static inline void vma_mark_detached(struct vm_area_struct *vma)
> > +static inline void vma_mark_attached(struct vm_area_struct *vma)
> >  {
> > -     /* When detaching vma should be write-locked */
> >       vma_assert_write_locked(vma);
> > -     vma->detached = true;
> > +
> > +     if (is_vma_detached(vma))
> > +             refcount_set(&vma->vm_refcnt, VMA_STATE_ATTACHED);
>
> Urgh, so it would be really good to not call this at all them not 0.
> I've not tried to untangle the mess, but this is really awkward. Surely
> you don't add it to the mas multiple times either.

The issue is that when we merge/split/shrink/grow vmas, we skip on
marking them detached while modifying them. Therefore from
vma_mark_attached() POV it will look like we are attaching an already
attached vma. I can try to clean that up if this is really a concern.

>
> Also:
>
>         refcount_set(&vma->vm_refcnt, 1);
>
> is so much clearer.

Ok, IIUC you are in favour of dropping VMA_STATE_ATTACHED/VMA_STATE_DETACHED.

>
> That is, should this not live in vma_iter_store*(), right before
> mas_store_gfp() ?

Currently it's done right *after* mas_store_gfp() but I was debating
with myself if it indeed should be *before* insertion into the tree...

>
> Also, ISTR having to set vm_lock_seq right before it?

Yes, vma_mark_attached() requires vma to be write-locked beforehand,
hence the above vma_assert_write_locked(). But oftentimes it's locked
not right before vma_mark_attached() because some other modification
functions also require vma to be write-locked.

>
> >  }
> >
> > -static inline bool is_vma_detached(struct vm_area_struct *vma)
> > +static inline void vma_mark_detached(struct vm_area_struct *vma)
> >  {
> > -     return vma->detached;
> > +     vma_assert_write_locked(vma);
> > +
> > +     if (is_vma_detached(vma))
> > +             return;
>
> Again, this just reads like confusion :/ Surely you don't have the same
> with mas_detach?

I'll double-check if we ever double-mark vma as detached.

Thanks for the review!

>
> > +
> > +     /* We are the only writer, so no need to use vma_refcount_put(). */
> > +     if (!refcount_dec_and_test(&vma->vm_refcnt)) {
> > +             /*
> > +              * Reader must have temporarily raised vm_refcnt but it will
> > +              * drop it without using the vma since vma is write-locked.
> > +              */
> > +     }
> >  }
Peter Zijlstra Dec. 16, 2024, 10 p.m. UTC | #7
On Mon, Dec 16, 2024 at 01:53:06PM -0800, Suren Baghdasaryan wrote:

> > That is, should this not live in vma_iter_store*(), right before
> > mas_store_gfp() ?
> 
> Currently it's done right *after* mas_store_gfp() but I was debating
> with myself if it indeed should be *before* insertion into the tree...

The moment it goes into the tree it becomes visible to RCU lookups, it's
a bit weird to have them with !refcnt at that point, but I don't suppose
it harms.
Peter Zijlstra Dec. 17, 2024, 10:30 a.m. UTC | #8
On Mon, Dec 16, 2024 at 01:44:45PM -0800, Suren Baghdasaryan wrote:
> On Mon, Dec 16, 2024 at 1:38 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> > > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > > +{
> > > +     int refcnt;
> > > +
> > > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
> > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > > +
> > > +             if (refcnt & VMA_STATE_LOCKED)
> > > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> > > +     }
> > > +}
> > > +
> > >  /*
> > >   * Try to read-lock a vma. The function is allowed to occasionally yield false
> > >   * locked result to avoid performance overhead, in which case we fall back to
> > > @@ -710,6 +728,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
> > >   */
> > >  static inline bool vma_start_read(struct vm_area_struct *vma)
> > >  {
> > > +     int oldcnt;
> > > +
> > >       /*
> > >        * Check before locking. A race might cause false locked result.
> > >        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> > > @@ -720,13 +740,20 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > >       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> > >               return false;
> > >
> > > +
> > > +     rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > > +     /* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
> > > +     if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> > > +                                                   VMA_STATE_LOCKED - 2))) {
> > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > >               return false;
> > > +     }
> > > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> > >
> > >       /*
> > > +      * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
> > >        * False unlocked result is impossible because we modify and check
> > > +      * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
> > >        * modification invalidates all existing locks.
> > >        *
> > >        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> > > @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > >        * after it has been unlocked.
> > >        * This pairs with RELEASE semantics in vma_end_write_all().
> > >        */
> > > +     if (oldcnt & VMA_STATE_LOCKED ||
> > > +         unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> > > +             vma_refcount_put(vma);
> >
> > Suppose we have detach race with a concurrent RCU lookup like:
> >
> >                                         vma = mas_lookup();
> >
> >         vma_start_write();
> >         mas_detach();
> >                                         vma_start_read()
> >                                         rwsem_acquire_read()
> >                                         inc // success
> >         vma_mark_detach();
> >         dec_and_test // assumes 1->0
> >                      // is actually 2->1
> >
> >                                         if (vm_lock_seq == vma->vm_mm_mm_lock_seq) // true
> >                                           vma_refcount_put
> >                                             dec_and_test() // 1->0
> >                                               *NO* rwsem_release()
> >
> 
> Yes, this is possible. I think that's not a problem until we start
> reusing the vmas and I deal with this race later in this patchset.
> I think what you described here is the same race I mention in the
> description of this patch:
> https://lore.kernel.org/all/20241216192419.2970941-14-surenb@google.com/
> I introduce vma_ensure_detached() in that patch to handle this case
> and ensure that vmas are detached before they are returned into the
> slab cache for reuse. Does that make sense?

So I just replied there, and no, I don't think it makes sense. Just put
the kmem_cache_free() in vma_refcount_put(), to be done on 0.

Anyway, my point was more about the weird entanglement of lockdep and
the refcount. Just pull the lockdep annotation out of _put() and put it
explicitly in the vma_start_read() error paths and vma_end_read().

Additionally, having vma_end_write() would allow you to put a lockdep
annotation in vma_{start,end}_write() -- which was I think the original
reason I proposed it a while back, that and having improved clarity when
reading the code, since explicitly marking the end of a section is
helpful.
Suren Baghdasaryan Dec. 17, 2024, 4:27 p.m. UTC | #9
On Tue, Dec 17, 2024 at 2:30 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Dec 16, 2024 at 01:44:45PM -0800, Suren Baghdasaryan wrote:
> > On Mon, Dec 16, 2024 at 1:38 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Dec 16, 2024 at 11:24:13AM -0800, Suren Baghdasaryan wrote:
> > > > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > > > +{
> > > > +     int refcnt;
> > > > +
> > > > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
> > > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > > > +
> > > > +             if (refcnt & VMA_STATE_LOCKED)
> > > > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> > > > +     }
> > > > +}
> > > > +
> > > >  /*
> > > >   * Try to read-lock a vma. The function is allowed to occasionally yield false
> > > >   * locked result to avoid performance overhead, in which case we fall back to
> > > > @@ -710,6 +728,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
> > > >   */
> > > >  static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >  {
> > > > +     int oldcnt;
> > > > +
> > > >       /*
> > > >        * Check before locking. A race might cause false locked result.
> > > >        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> > > > @@ -720,13 +740,20 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> > > >               return false;
> > > >
> > > > +
> > > > +     rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > > > +     /* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
> > > > +     if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> > > > +                                                   VMA_STATE_LOCKED - 2))) {
> > > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > > >               return false;
> > > > +     }
> > > > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> > > >
> > > >       /*
> > > > +      * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
> > > >        * False unlocked result is impossible because we modify and check
> > > > +      * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
> > > >        * modification invalidates all existing locks.
> > > >        *
> > > >        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> > > > @@ -734,10 +761,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >        * after it has been unlocked.
> > > >        * This pairs with RELEASE semantics in vma_end_write_all().
> > > >        */
> > > > +     if (oldcnt & VMA_STATE_LOCKED ||
> > > > +         unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> > > > +             vma_refcount_put(vma);
> > >
> > > Suppose we have detach race with a concurrent RCU lookup like:
> > >
> > >                                         vma = mas_lookup();
> > >
> > >         vma_start_write();
> > >         mas_detach();
> > >                                         vma_start_read()
> > >                                         rwsem_acquire_read()
> > >                                         inc // success
> > >         vma_mark_detach();
> > >         dec_and_test // assumes 1->0
> > >                      // is actually 2->1
> > >
> > >                                         if (vm_lock_seq == vma->vm_mm_mm_lock_seq) // true
> > >                                           vma_refcount_put
> > >                                             dec_and_test() // 1->0
> > >                                               *NO* rwsem_release()
> > >
> >
> > Yes, this is possible. I think that's not a problem until we start
> > reusing the vmas and I deal with this race later in this patchset.
> > I think what you described here is the same race I mention in the
> > description of this patch:
> > https://lore.kernel.org/all/20241216192419.2970941-14-surenb@google.com/
> > I introduce vma_ensure_detached() in that patch to handle this case
> > and ensure that vmas are detached before they are returned into the
> > slab cache for reuse. Does that make sense?
>
> So I just replied there, and no, I don't think it makes sense. Just put
> the kmem_cache_free() in vma_refcount_put(), to be done on 0.

That's very appealing indeed and makes things much simpler. The
problem I see with that is the case when we detach a vma from the tree
to isolate it, then do some cleanup and only then free it. That's done
in vms_gather_munmap_vmas() here:
https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
even might reattach detached vmas back:
https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
detached state is not final and we can't destroy the object that
reached this state. We could change states to: 0=unused (we can free
the object), 1=detached, 2=attached, etc. but then vma_start_read()
should do something like refcount_inc_more_than_one() instead of
refcount_inc_not_zero(). Would you be ok with such an approach?

>
> Anyway, my point was more about the weird entanglement of lockdep and
> the refcount. Just pull the lockdep annotation out of _put() and put it
> explicitly in the vma_start_read() error paths and vma_end_read().

Ok, I think that's easy.

>
> Additionally, having vma_end_write() would allow you to put a lockdep
> annotation in vma_{start,end}_write() -- which was I think the original
> reason I proposed it a while back, that and having improved clarity when
> reading the code, since explicitly marking the end of a section is
> helpful.

The vma->vmlock_dep_map is tracking vma->vm_refcnt, not the
vma->vm_lock_seq (similar to how today vma->vm_lock has its lockdep
tracking that rw_semaphore). If I implement vma_end_write() then it
will simply be something like:

void vma_end_write(vma)
{
         vma_assert_write_locked(vma);
         vma->vm_lock_seq = UINT_MAX;
}

so, vmlock_dep_map would not be involved.

If you want to track vma->vm_lock_seq with a separate lockdep, that
would be more complicated. Specifically for vma_end_write_all() that
would require us to call rwsem_release() on all locked vmas, however
we currently do not track individual locked vmas. vma_end_write_all()
allows us not to worry about tracking them, knowing that once we do
mmap_write_unlock() they all will get unlocked with one increment of
mm->mm_lock_seq. If your suggestion is to replace vma_end_write_all()
with vma_end_write() and unlock vmas individually across the mm code,
that would be a sizable effort. If that is indeed your ultimate goal,
I can do that as a separate project: introduce vma_end_write(),
gradually add them in required places (not yet sure how complex that
would be), then retire vma_end_write_all() and add a lockdep for
vma->vm_lock_seq.
Peter Zijlstra Dec. 18, 2024, 9:41 a.m. UTC | #10
On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:

> > So I just replied there, and no, I don't think it makes sense. Just put
> > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> 
> That's very appealing indeed and makes things much simpler. The
> problem I see with that is the case when we detach a vma from the tree
> to isolate it, then do some cleanup and only then free it. That's done
> in vms_gather_munmap_vmas() here:
> https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> even might reattach detached vmas back:
> https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> detached state is not final and we can't destroy the object that
> reached this state. 

Urgh, so that's the munmap() path, but arguably when that fails, the
map stays in place.

I think this means you're marking detached too soon; you should only
mark detached once you reach the point of no return.

That said, once you've reached the point of no return; and are about to
go remove the page-tables, you very much want to ensure a lack of
concurrency.

So perhaps waiting for out-standing readers at this point isn't crazy.

Also, I'm having a very hard time reading this maple tree stuff :/
Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
second tree, it does not in fact unlink them from the mm yet.

AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
mm -- and that being able to fail is mind boggling and I suppose is what
gives rise to much of this insanity :/

Anyway, I would expect remove_vma() to be the one that marks it detached
(it's already unreachable through vma_lookup() at this point) and there
you should wait for concurrent readers to bugger off.

> We could change states to: 0=unused (we can free
> the object), 1=detached, 2=attached, etc. but then vma_start_read()
> should do something like refcount_inc_more_than_one() instead of
> refcount_inc_not_zero(). Would you be ok with such an approach?

Urgh, I would strongly suggest ditching refcount_t if we go this route.
The thing is; refcount_t should remain a 'simple' straight forward
interface and not allow people to do the wrong thing. Its not meant to
be the kitchen sink -- we have atomic_t for that.

Anyway, the more common scheme at that point is using -1 for 'free', I
think folio->_mapcount uses that even. For that see:
atomic_add_negative*().

> > Additionally, having vma_end_write() would allow you to put a lockdep
> > annotation in vma_{start,end}_write() -- which was I think the original
> > reason I proposed it a while back, that and having improved clarity when
> > reading the code, since explicitly marking the end of a section is
> > helpful.
> 
> The vma->vmlock_dep_map is tracking vma->vm_refcnt, not the
> vma->vm_lock_seq (similar to how today vma->vm_lock has its lockdep
> tracking that rw_semaphore). If I implement vma_end_write() then it
> will simply be something like:
> 
> void vma_end_write(vma)
> {
>          vma_assert_write_locked(vma);
>          vma->vm_lock_seq = UINT_MAX;
> }
> 
> so, vmlock_dep_map would not be involved.

That's just weird; why would you not track vma_{start,end}_write() with
the exclusive side of the 'rwsem' dep_map ?

> If you want to track vma->vm_lock_seq with a separate lockdep, that
> would be more complicated. Specifically for vma_end_write_all() that
> would require us to call rwsem_release() on all locked vmas, however
> we currently do not track individual locked vmas. vma_end_write_all()
> allows us not to worry about tracking them, knowing that once we do
> mmap_write_unlock() they all will get unlocked with one increment of
> mm->mm_lock_seq. If your suggestion is to replace vma_end_write_all()
> with vma_end_write() and unlock vmas individually across the mm code,
> that would be a sizable effort. If that is indeed your ultimate goal,
> I can do that as a separate project: introduce vma_end_write(),
> gradually add them in required places (not yet sure how complex that
> would be), then retire vma_end_write_all() and add a lockdep for
> vma->vm_lock_seq.

Yeah, so ultimately I think it would be clearer if you explicitly mark
the point where the vma modification is 'done'. But I don't suppose we
have to do that here.
Peter Zijlstra Dec. 18, 2024, 10:06 a.m. UTC | #11
On Wed, Dec 18, 2024 at 10:41:04AM +0100, Peter Zijlstra wrote:
> On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:
> 
> > > So I just replied there, and no, I don't think it makes sense. Just put
> > > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> > 
> > That's very appealing indeed and makes things much simpler. The
> > problem I see with that is the case when we detach a vma from the tree
> > to isolate it, then do some cleanup and only then free it. That's done
> > in vms_gather_munmap_vmas() here:
> > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> > even might reattach detached vmas back:
> > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> > detached state is not final and we can't destroy the object that
> > reached this state. 
> 
> Urgh, so that's the munmap() path, but arguably when that fails, the
> map stays in place.
> 
> I think this means you're marking detached too soon; you should only
> mark detached once you reach the point of no return.
> 
> That said, once you've reached the point of no return; and are about to
> go remove the page-tables, you very much want to ensure a lack of
> concurrency.
> 
> So perhaps waiting for out-standing readers at this point isn't crazy.
> 
> Also, I'm having a very hard time reading this maple tree stuff :/
> Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
> second tree, it does not in fact unlink them from the mm yet.
> 
> AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
> mm -- and that being able to fail is mind boggling and I suppose is what
> gives rise to much of this insanity :/
> 
> Anyway, I would expect remove_vma() to be the one that marks it detached
> (it's already unreachable through vma_lookup() at this point) and there
> you should wait for concurrent readers to bugger off.

Also, I think vma_start_write() in that gather look is too early, you're
not actually going to change the VMA yet -- with obvious exception of
the split cases.

That too should probably come after you've passes all the fail/unwind
spots.

Something like so perhaps? (yeah, I know, I wrecked a bunch)

diff --git a/mm/vma.c b/mm/vma.c
index 8e31b7e25aeb..45d43adcbb36 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 
+	mas_for_each(mas_detach, vma, ULONG_MAX) {
+		vma_start_write(next);
+		vma_mark_detached(next, true);
+	}
+
 	mm = current->mm;
 	mm->map_count -= vms->vma_count;
 	mm->locked_vm -= vms->locked_vm;
@@ -1219,9 +1224,6 @@ static void reattach_vmas(struct ma_state *mas_detach)
 	struct vm_area_struct *vma;
 
 	mas_set(mas_detach, 0);
-	mas_for_each(mas_detach, vma, ULONG_MAX)
-		vma_mark_detached(vma, false);
-
 	__mt_destroy(mas_detach->tree);
 }
 
@@ -1289,13 +1291,11 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
 			if (error)
 				goto end_split_failed;
 		}
-		vma_start_write(next);
 		mas_set(mas_detach, vms->vma_count++);
 		error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
 		if (error)
 			goto munmap_gather_failed;
 
-		vma_mark_detached(next, true);
 		nrpages = vma_pages(next);
 
 		vms->nr_pages += nrpages;
@@ -1431,14 +1431,17 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	struct vma_munmap_struct vms;
 	int error;
 
+	error = mas_preallocate(vmi->mas);
+	if (error)
+		goto gather_failed;
+
 	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
 	error = vms_gather_munmap_vmas(&vms, &mas_detach);
 	if (error)
 		goto gather_failed;
 
 	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
-	if (error)
-		goto clear_tree_failed;
+	VM_WARN_ON(error);
 
 	/* Point of no return */
 	vms_complete_munmap_vmas(&vms, &mas_detach);
Liam R. Howlett Dec. 18, 2024, 3:37 p.m. UTC | #12
* Peter Zijlstra <peterz@infradead.org> [241218 05:06]:
> On Wed, Dec 18, 2024 at 10:41:04AM +0100, Peter Zijlstra wrote:
> > On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:
> > 
> > > > So I just replied there, and no, I don't think it makes sense. Just put
> > > > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> > > 
> > > That's very appealing indeed and makes things much simpler. The
> > > problem I see with that is the case when we detach a vma from the tree
> > > to isolate it, then do some cleanup and only then free it. That's done
> > > in vms_gather_munmap_vmas() here:
> > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> > > even might reattach detached vmas back:
> > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> > > detached state is not final and we can't destroy the object that
> > > reached this state. 
> > 
> > Urgh, so that's the munmap() path, but arguably when that fails, the
> > map stays in place.
> > 
> > I think this means you're marking detached too soon; you should only
> > mark detached once you reach the point of no return.
> > 
> > That said, once you've reached the point of no return; and are about to
> > go remove the page-tables, you very much want to ensure a lack of
> > concurrency.
> > 
> > So perhaps waiting for out-standing readers at this point isn't crazy.
> > 
> > Also, I'm having a very hard time reading this maple tree stuff :/
> > Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
> > second tree, it does not in fact unlink them from the mm yet.

Yes, that's correct.  I tried to make this clear with a gather/complete
naming like other areas of the mm.  I hope that helped.

Also, the comments for the function state that's what's going on:

 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree                                             
 * for removal at a later date.  Handles splitting first and last if necessary                                          
 * and marking the vmas as isolated.

... might be worth updating with new information.

> > 
> > AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
> > mm -- and that being able to fail is mind boggling and I suppose is what
> > gives rise to much of this insanity :/

This is also correct.  The maple tree is a b-tree variant that has
internal nodes.  When you write to it, including nulls, they are tracked
and may need to allocate.  This is a cost for rcu lookups; we will use
the same or less memory in the end but must maintain a consistent view
of the ranges.

But to put this into perspective, we get 16 nodes per 4k page, most
writes will use 1 or 3 of these from a kmem_cache, so we are talking
about a very unlikely possibility.  Except when syzbot decides to fail
random allocations.

We could preallocate for the write, but this section of the code is
GFP_KERNEL, so we don't.  Preallocation is an option to simplify the
failure path though... which is what you did below.

> > 
> > Anyway, I would expect remove_vma() to be the one that marks it detached
> > (it's already unreachable through vma_lookup() at this point) and there
> > you should wait for concurrent readers to bugger off.
> 
> Also, I think vma_start_write() in that gather look is too early, you're
> not actually going to change the VMA yet -- with obvious exception of
> the split cases.

The split needs to start the write on the vma to avoid anyone reading it
while it's being altered.

> 
> That too should probably come after you've passes all the fail/unwind
> spots.

Do you mean the split?  I'd like to move the split later as well..
tracking that is a pain and may need an extra vma for when one vma is
split twice before removing the middle part.

Actually, I think we need to allocate two (or at least one) vmas in this
case and just pass one through to unmap (written only to the mas_detach
tree?).  It would be nice to find a way to NOT need to do that even.. I
had tried to use a vma on the stack years ago, which didn't work out.

> 
> Something like so perhaps? (yeah, I know, I wrecked a bunch)
> 
> diff --git a/mm/vma.c b/mm/vma.c
> index 8e31b7e25aeb..45d43adcbb36 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
>  	struct vm_area_struct *vma;
>  	struct mm_struct *mm;
>  

mas_set(mas_detach, 0);

> +	mas_for_each(mas_detach, vma, ULONG_MAX) {
> +		vma_start_write(next);
> +		vma_mark_detached(next, true);
> +	}
> +
>  	mm = current->mm;
>  	mm->map_count -= vms->vma_count;
>  	mm->locked_vm -= vms->locked_vm;
> @@ -1219,9 +1224,6 @@ static void reattach_vmas(struct ma_state *mas_detach)
>  	struct vm_area_struct *vma;
>  

>  	mas_set(mas_detach, 0);
Drop the mas_set here.

> -	mas_for_each(mas_detach, vma, ULONG_MAX)
> -		vma_mark_detached(vma, false);
> -
>  	__mt_destroy(mas_detach->tree);
>  }
>  
> @@ -1289,13 +1291,11 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
>  			if (error)
>  				goto end_split_failed;
>  		}
> -		vma_start_write(next);
>  		mas_set(mas_detach, vms->vma_count++);
>  		error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
>  		if (error)
>  			goto munmap_gather_failed;
>  
> -		vma_mark_detached(next, true);
>  		nrpages = vma_pages(next);
>  
>  		vms->nr_pages += nrpages;
> @@ -1431,14 +1431,17 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	struct vma_munmap_struct vms;
>  	int error;
>  

The preallocation needs to know the range being stored to know what's
going to happen.

vma_iter_config(vmi, start, end);

> +	error = mas_preallocate(vmi->mas);

We haven't had a need to have a vma iterator preallocate for storing a
null, but we can add one for this.

> +	if (error)
> +		goto gather_failed;
> +
>  	init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
>  	error = vms_gather_munmap_vmas(&vms, &mas_detach);
>  	if (error)
>  		goto gather_failed;
>  

Drop this stuff.
>  	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
> -	if (error)
> -		goto clear_tree_failed;
> +	VM_WARN_ON(error);

Do this instead
vma_iter_config(vmi, start, end);
vma_iter_clear(vmi);

>  
>  	/* Point of no return */
>  	vms_complete_munmap_vmas(&vms, &mas_detach);
Suren Baghdasaryan Dec. 18, 2024, 3:42 p.m. UTC | #13
On Wed, Dec 18, 2024 at 1:41 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:
>
> > > So I just replied there, and no, I don't think it makes sense. Just put
> > > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> >
> > That's very appealing indeed and makes things much simpler. The
> > problem I see with that is the case when we detach a vma from the tree
> > to isolate it, then do some cleanup and only then free it. That's done
> > in vms_gather_munmap_vmas() here:
> > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> > even might reattach detached vmas back:
> > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> > detached state is not final and we can't destroy the object that
> > reached this state.
>
> Urgh, so that's the munmap() path, but arguably when that fails, the
> map stays in place.
>
> I think this means you're marking detached too soon; you should only
> mark detached once you reach the point of no return.
>
> That said, once you've reached the point of no return; and are about to
> go remove the page-tables, you very much want to ensure a lack of
> concurrency.
>
> So perhaps waiting for out-standing readers at this point isn't crazy.
>
> Also, I'm having a very hard time reading this maple tree stuff :/
> Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
> second tree, it does not in fact unlink them from the mm yet.

Yes, I think you are correct.

>
> AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
> mm -- and that being able to fail is mind boggling and I suppose is what
> gives rise to much of this insanity :/
>
> Anyway, I would expect remove_vma() to be the one that marks it detached
> (it's already unreachable through vma_lookup() at this point) and there
> you should wait for concurrent readers to bugger off.

There is an issue with that. Note that vms_complete_munmap_vmas()
that's calling remove_vma() might drop the mmap write lock, so
detaching without a write lock would break current rules.

>
> > We could change states to: 0=unused (we can free
> > the object), 1=detached, 2=attached, etc. but then vma_start_read()
> > should do something like refcount_inc_more_than_one() instead of
> > refcount_inc_not_zero(). Would you be ok with such an approach?
>
> Urgh, I would strongly suggest ditching refcount_t if we go this route.
> The thing is; refcount_t should remain a 'simple' straight forward
> interface and not allow people to do the wrong thing. Its not meant to
> be the kitchen sink -- we have atomic_t for that.

Ack. If we go this route I'll use atomics directly.

>
> Anyway, the more common scheme at that point is using -1 for 'free', I
> think folio->_mapcount uses that even. For that see:
> atomic_add_negative*().

Thanks for the reference.

>
> > > Additionally, having vma_end_write() would allow you to put a lockdep
> > > annotation in vma_{start,end}_write() -- which was I think the original
> > > reason I proposed it a while back, that and having improved clarity when
> > > reading the code, since explicitly marking the end of a section is
> > > helpful.
> >
> > The vma->vmlock_dep_map is tracking vma->vm_refcnt, not the
> > vma->vm_lock_seq (similar to how today vma->vm_lock has its lockdep
> > tracking that rw_semaphore). If I implement vma_end_write() then it
> > will simply be something like:
> >
> > void vma_end_write(vma)
> > {
> >          vma_assert_write_locked(vma);
> >          vma->vm_lock_seq = UINT_MAX;
> > }
> >
> > so, vmlock_dep_map would not be involved.
>
> That's just weird; why would you not track vma_{start,end}_write() with
> the exclusive side of the 'rwsem' dep_map ?
>
> > If you want to track vma->vm_lock_seq with a separate lockdep, that
> > would be more complicated. Specifically for vma_end_write_all() that
> > would require us to call rwsem_release() on all locked vmas, however
> > we currently do not track individual locked vmas. vma_end_write_all()
> > allows us not to worry about tracking them, knowing that once we do
> > mmap_write_unlock() they all will get unlocked with one increment of
> > mm->mm_lock_seq. If your suggestion is to replace vma_end_write_all()
> > with vma_end_write() and unlock vmas individually across the mm code,
> > that would be a sizable effort. If that is indeed your ultimate goal,
> > I can do that as a separate project: introduce vma_end_write(),
> > gradually add them in required places (not yet sure how complex that
> > would be), then retire vma_end_write_all() and add a lockdep for
> > vma->vm_lock_seq.
>
> Yeah, so ultimately I think it would be clearer if you explicitly mark
> the point where the vma modification is 'done'. But I don't suppose we
> have to do that here.

Ack.
Suren Baghdasaryan Dec. 18, 2024, 3:50 p.m. UTC | #14
On Wed, Dec 18, 2024 at 7:37 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Peter Zijlstra <peterz@infradead.org> [241218 05:06]:
> > On Wed, Dec 18, 2024 at 10:41:04AM +0100, Peter Zijlstra wrote:
> > > On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:
> > >
> > > > > So I just replied there, and no, I don't think it makes sense. Just put
> > > > > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> > > >
> > > > That's very appealing indeed and makes things much simpler. The
> > > > problem I see with that is the case when we detach a vma from the tree
> > > > to isolate it, then do some cleanup and only then free it. That's done
> > > > in vms_gather_munmap_vmas() here:
> > > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> > > > even might reattach detached vmas back:
> > > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> > > > detached state is not final and we can't destroy the object that
> > > > reached this state.
> > >
> > > Urgh, so that's the munmap() path, but arguably when that fails, the
> > > map stays in place.
> > >
> > > I think this means you're marking detached too soon; you should only
> > > mark detached once you reach the point of no return.
> > >
> > > That said, once you've reached the point of no return; and are about to
> > > go remove the page-tables, you very much want to ensure a lack of
> > > concurrency.
> > >
> > > So perhaps waiting for out-standing readers at this point isn't crazy.
> > >
> > > Also, I'm having a very hard time reading this maple tree stuff :/
> > > Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
> > > second tree, it does not in fact unlink them from the mm yet.
>
> Yes, that's correct.  I tried to make this clear with a gather/complete
> naming like other areas of the mm.  I hope that helped.
>
> Also, the comments for the function state that's what's going on:
>
>  * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
>  * for removal at a later date.  Handles splitting first and last if necessary
>  * and marking the vmas as isolated.
>
> ... might be worth updating with new information.
>
> > >
> > > AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
> > > mm -- and that being able to fail is mind boggling and I suppose is what
> > > gives rise to much of this insanity :/
>
> This is also correct.  The maple tree is a b-tree variant that has
> internal nodes.  When you write to it, including nulls, they are tracked
> and may need to allocate.  This is a cost for rcu lookups; we will use
> the same or less memory in the end but must maintain a consistent view
> of the ranges.
>
> But to put this into perspective, we get 16 nodes per 4k page, most
> writes will use 1 or 3 of these from a kmem_cache, so we are talking
> about a very unlikely possibility.  Except when syzbot decides to fail
> random allocations.
>
> We could preallocate for the write, but this section of the code is
> GFP_KERNEL, so we don't.  Preallocation is an option to simplify the
> failure path though... which is what you did below.
>
> > >
> > > Anyway, I would expect remove_vma() to be the one that marks it detached
> > > (it's already unreachable through vma_lookup() at this point) and there
> > > you should wait for concurrent readers to bugger off.
> >
> > Also, I think vma_start_write() in that gather look is too early, you're
> > not actually going to change the VMA yet -- with obvious exception of
> > the split cases.
>
> The split needs to start the write on the vma to avoid anyone reading it
> while it's being altered.

I think vma_start_write() should be done inside
vms_gather_munmap_vmas() for __mmap_prepare() to work correctly:

__mmap_prepare
    vms_gather_munmap_vmas
    vms_clean_up_area // clears PTEs
...
__mmap_complete
    vms_complete_munmap_vmas

If we do not write-lock the vmas inside vms_gather_munmap_vmas(), we
will be clearing PTEs from under a discoverable vma.
There might be other places like this too but I think we can move
vma_mark_detach() like you suggested without moving vma_start_write()
and that should be enough.

>
> >
> > That too should probably come after you've passes all the fail/unwind
> > spots.
>
> Do you mean the split?  I'd like to move the split later as well..
> tracking that is a pain and may need an extra vma for when one vma is
> split twice before removing the middle part.
>
> Actually, I think we need to allocate two (or at least one) vmas in this
> case and just pass one through to unmap (written only to the mas_detach
> tree?).  It would be nice to find a way to NOT need to do that even.. I
> had tried to use a vma on the stack years ago, which didn't work out.
>
> >
> > Something like so perhaps? (yeah, I know, I wrecked a bunch)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 8e31b7e25aeb..45d43adcbb36 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> >       struct vm_area_struct *vma;
> >       struct mm_struct *mm;
> >
>
> mas_set(mas_detach, 0);
>
> > +     mas_for_each(mas_detach, vma, ULONG_MAX) {
> > +             vma_start_write(next);
> > +             vma_mark_detached(next, true);
> > +     }
> > +
> >       mm = current->mm;
> >       mm->map_count -= vms->vma_count;
> >       mm->locked_vm -= vms->locked_vm;
> > @@ -1219,9 +1224,6 @@ static void reattach_vmas(struct ma_state *mas_detach)
> >       struct vm_area_struct *vma;
> >
>
> >       mas_set(mas_detach, 0);
> Drop the mas_set here.
>
> > -     mas_for_each(mas_detach, vma, ULONG_MAX)
> > -             vma_mark_detached(vma, false);
> > -
> >       __mt_destroy(mas_detach->tree);
> >  }
> >
> > @@ -1289,13 +1291,11 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> >                       if (error)
> >                               goto end_split_failed;
> >               }
> > -             vma_start_write(next);
> >               mas_set(mas_detach, vms->vma_count++);
> >               error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
> >               if (error)
> >                       goto munmap_gather_failed;
> >
> > -             vma_mark_detached(next, true);
> >               nrpages = vma_pages(next);
> >
> >               vms->nr_pages += nrpages;
> > @@ -1431,14 +1431,17 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >       struct vma_munmap_struct vms;
> >       int error;
> >
>
> The preallocation needs to know the range being stored to know what's
> going to happen.
>
> vma_iter_config(vmi, start, end);
>
> > +     error = mas_preallocate(vmi->mas);
>
> We haven't had a need to have a vma iterator preallocate for storing a
> null, but we can add one for this.
>
> > +     if (error)
> > +             goto gather_failed;
> > +
> >       init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
> >       error = vms_gather_munmap_vmas(&vms, &mas_detach);
> >       if (error)
> >               goto gather_failed;
> >
>
> Drop this stuff.
> >       error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
> > -     if (error)
> > -             goto clear_tree_failed;
> > +     VM_WARN_ON(error);
>
> Do this instead
> vma_iter_config(vmi, start, end);
> vma_iter_clear(vmi);
>
> >
> >       /* Point of no return */
> >       vms_complete_munmap_vmas(&vms, &mas_detach);
Suren Baghdasaryan Dec. 18, 2024, 3:57 p.m. UTC | #15
On Wed, Dec 18, 2024 at 7:37 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Peter Zijlstra <peterz@infradead.org> [241218 05:06]:
> > On Wed, Dec 18, 2024 at 10:41:04AM +0100, Peter Zijlstra wrote:
> > > On Tue, Dec 17, 2024 at 08:27:46AM -0800, Suren Baghdasaryan wrote:
> > >
> > > > > So I just replied there, and no, I don't think it makes sense. Just put
> > > > > the kmem_cache_free() in vma_refcount_put(), to be done on 0.
> > > >
> > > > That's very appealing indeed and makes things much simpler. The
> > > > problem I see with that is the case when we detach a vma from the tree
> > > > to isolate it, then do some cleanup and only then free it. That's done
> > > > in vms_gather_munmap_vmas() here:
> > > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1240 and we
> > > > even might reattach detached vmas back:
> > > > https://elixir.bootlin.com/linux/v6.12.5/source/mm/vma.c#L1312. IOW,
> > > > detached state is not final and we can't destroy the object that
> > > > reached this state.
> > >
> > > Urgh, so that's the munmap() path, but arguably when that fails, the
> > > map stays in place.
> > >
> > > I think this means you're marking detached too soon; you should only
> > > mark detached once you reach the point of no return.
> > >
> > > That said, once you've reached the point of no return; and are about to
> > > go remove the page-tables, you very much want to ensure a lack of
> > > concurrency.
> > >
> > > So perhaps waiting for out-standing readers at this point isn't crazy.
> > >
> > > Also, I'm having a very hard time reading this maple tree stuff :/
> > > Afaict vms_gather_munmap_vmas() only adds the VMAs to be removed to a
> > > second tree, it does not in fact unlink them from the mm yet.
>
> Yes, that's correct.  I tried to make this clear with a gather/complete
> naming like other areas of the mm.  I hope that helped.
>
> Also, the comments for the function state that's what's going on:
>
>  * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
>  * for removal at a later date.  Handles splitting first and last if necessary
>  * and marking the vmas as isolated.
>
> ... might be worth updating with new information.
>
> > >
> > > AFAICT it's vma_iter_clear_gfp() that actually wipes the vmas from the
> > > mm -- and that being able to fail is mind boggling and I suppose is what
> > > gives rise to much of this insanity :/
>
> This is also correct.  The maple tree is a b-tree variant that has
> internal nodes.  When you write to it, including nulls, they are tracked
> and may need to allocate.  This is a cost for rcu lookups; we will use
> the same or less memory in the end but must maintain a consistent view
> of the ranges.
>
> But to put this into perspective, we get 16 nodes per 4k page, most
> writes will use 1 or 3 of these from a kmem_cache, so we are talking
> about a very unlikely possibility.  Except when syzbot decides to fail
> random allocations.
>
> We could preallocate for the write, but this section of the code is
> GFP_KERNEL, so we don't.  Preallocation is an option to simplify the
> failure path though... which is what you did below.
>
> > >
> > > Anyway, I would expect remove_vma() to be the one that marks it detached
> > > (it's already unreachable through vma_lookup() at this point) and there
> > > you should wait for concurrent readers to bugger off.
> >
> > Also, I think vma_start_write() in that gather look is too early, you're
> > not actually going to change the VMA yet -- with obvious exception of
> > the split cases.
>
> The split needs to start the write on the vma to avoid anyone reading it
> while it's being altered.
>
> >
> > That too should probably come after you've passes all the fail/unwind
> > spots.
>
> Do you mean the split?  I'd like to move the split later as well..
> tracking that is a pain and may need an extra vma for when one vma is
> split twice before removing the middle part.
>
> Actually, I think we need to allocate two (or at least one) vmas in this
> case and just pass one through to unmap (written only to the mas_detach
> tree?).  It would be nice to find a way to NOT need to do that even.. I
> had tried to use a vma on the stack years ago, which didn't work out.
>
> >
> > Something like so perhaps? (yeah, I know, I wrecked a bunch)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 8e31b7e25aeb..45d43adcbb36 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> >       struct vm_area_struct *vma;
> >       struct mm_struct *mm;
> >
>
> mas_set(mas_detach, 0);
>
> > +     mas_for_each(mas_detach, vma, ULONG_MAX) {
> > +             vma_start_write(next);
> > +             vma_mark_detached(next, true);
> > +     }
> > +
> >       mm = current->mm;
> >       mm->map_count -= vms->vma_count;
> >       mm->locked_vm -= vms->locked_vm;
> > @@ -1219,9 +1224,6 @@ static void reattach_vmas(struct ma_state *mas_detach)
> >       struct vm_area_struct *vma;
> >
>
> >       mas_set(mas_detach, 0);
> Drop the mas_set here.
>
> > -     mas_for_each(mas_detach, vma, ULONG_MAX)
> > -             vma_mark_detached(vma, false);
> > -
> >       __mt_destroy(mas_detach->tree);
> >  }
> >
> > @@ -1289,13 +1291,11 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> >                       if (error)
> >                               goto end_split_failed;
> >               }
> > -             vma_start_write(next);
> >               mas_set(mas_detach, vms->vma_count++);
> >               error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
> >               if (error)
> >                       goto munmap_gather_failed;
> >
> > -             vma_mark_detached(next, true);
> >               nrpages = vma_pages(next);
> >
> >               vms->nr_pages += nrpages;
> > @@ -1431,14 +1431,17 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >       struct vma_munmap_struct vms;
> >       int error;
> >
>
> The preallocation needs to know the range being stored to know what's
> going to happen.
>
> vma_iter_config(vmi, start, end);
>
> > +     error = mas_preallocate(vmi->mas);
>
> We haven't had a need to have a vma iterator preallocate for storing a
> null, but we can add one for this.
>
> > +     if (error)
> > +             goto gather_failed;
> > +
> >       init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
> >       error = vms_gather_munmap_vmas(&vms, &mas_detach);
> >       if (error)
> >               goto gather_failed;
> >
>
> Drop this stuff.
> >       error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
> > -     if (error)
> > -             goto clear_tree_failed;
> > +     VM_WARN_ON(error);
>
> Do this instead
> vma_iter_config(vmi, start, end);
> vma_iter_clear(vmi);

Thanks for the input, Liam. Let me try to make a patch from these
suggestions and see where we end up and what might blow up.

>
> >
> >       /* Point of no return */
> >       vms_complete_munmap_vmas(&vms, &mas_detach);
Peter Zijlstra Dec. 18, 2024, 4:13 p.m. UTC | #16
On Wed, Dec 18, 2024 at 10:37:24AM -0500, Liam R. Howlett wrote:

> This is also correct.  The maple tree is a b-tree variant that has
> internal nodes.

Right, I remembered that much :-)

> > Also, I think vma_start_write() in that gather look is too early, you're
> > not actually going to change the VMA yet -- with obvious exception of
> > the split cases.
> 
> The split needs to start the write on the vma to avoid anyone reading it
> while it's being altered.

__split_vma() does vma_start_write() itself, so that should be good
already.

> > That too should probably come after you've passes all the fail/unwind
> > spots.
> 
> Do you mean the split? 

No, I means the detach muck :-)

> I'd like to move the split later as well..
> tracking that is a pain and may need an extra vma for when one vma is
> split twice before removing the middle part.
> 
> Actually, I think we need to allocate two (or at least one) vmas in this
> case and just pass one through to unmap (written only to the mas_detach
> tree?).  It would be nice to find a way to NOT need to do that even.. I
> had tried to use a vma on the stack years ago, which didn't work out.

Urgh yeah, vma on stack sounds like utter pain :-)
Peter Zijlstra Dec. 18, 2024, 4:18 p.m. UTC | #17
On Wed, Dec 18, 2024 at 07:50:34AM -0800, Suren Baghdasaryan wrote:

> I think vma_start_write() should be done inside
> vms_gather_munmap_vmas() for __mmap_prepare() to work correctly:
> 
> __mmap_prepare
>     vms_gather_munmap_vmas
>     vms_clean_up_area // clears PTEs
> ...
> __mmap_complete
>     vms_complete_munmap_vmas

I'm unsure what exactly you mean; __split_vma() will start_write on the
one that is broken up and the rest won't actually change until
vms_complete_munmap_vmas().

> If we do not write-lock the vmas inside vms_gather_munmap_vmas(), we
> will be clearing PTEs from under a discoverable vma.

You will not. vms_complete_munmap_vmas() will call remove_vma() to
remove PTEs IIRC, and if you do start_write() and detach() before
dropping mmap_lock_write, you should be good.

> There might be other places like this too but I think we can move
> vma_mark_detach() like you suggested without moving vma_start_write()
> and that should be enough.

I really don't see why you can't move vma_start_write() -- note that by
moving that after you've unhooked the vmas from the mm (which you have
by that point) you get the sync point you wanted.
Suren Baghdasaryan Dec. 18, 2024, 5:36 p.m. UTC | #18
On Wed, Dec 18, 2024 at 8:18 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 18, 2024 at 07:50:34AM -0800, Suren Baghdasaryan wrote:
>
> > I think vma_start_write() should be done inside
> > vms_gather_munmap_vmas() for __mmap_prepare() to work correctly:
> >
> > __mmap_prepare
> >     vms_gather_munmap_vmas
> >     vms_clean_up_area // clears PTEs
> > ...
> > __mmap_complete
> >     vms_complete_munmap_vmas
>
> I'm unsure what exactly you mean; __split_vma() will start_write on the
> one that is broken up and the rest won't actually change until
> vms_complete_munmap_vmas().

Ah, sorry, I missed the write-locking in the __split_vma(). Looks like
indeed vma_start_write() is not needed in vms_gather_munmap_vmas().

>
> > If we do not write-lock the vmas inside vms_gather_munmap_vmas(), we
> > will be clearing PTEs from under a discoverable vma.
>
> You will not. vms_complete_munmap_vmas() will call remove_vma() to
> remove PTEs IIRC, and if you do start_write() and detach() before
> dropping mmap_lock_write, you should be good.

Ok, I think we will have to move mmap_write_downgrade() inside
vms_complete_munmap_vmas() to be called after remove_vma().
vms_clear_ptes() is using vmas, so we can't move remove_vma() before
mmap_write_downgrade().

>
> > There might be other places like this too but I think we can move
> > vma_mark_detach() like you suggested without moving vma_start_write()
> > and that should be enough.
>
> I really don't see why you can't move vma_start_write() -- note that by
> moving that after you've unhooked the vmas from the mm (which you have
> by that point) you get the sync point you wanted.
>
>
Peter Zijlstra Dec. 18, 2024, 5:44 p.m. UTC | #19
On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:

> > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > remove PTEs IIRC, and if you do start_write() and detach() before
> > dropping mmap_lock_write, you should be good.
> 
> Ok, I think we will have to move mmap_write_downgrade() inside
> vms_complete_munmap_vmas() to be called after remove_vma().
> vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> mmap_write_downgrade().

Why ?!

vms_clear_ptes() and remove_vma() are fine where they are -- there is no
concurrency left at this point.

Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
which is *after* the vmas have been unhooked from the mm, you wait for
any concurrent user to go away.

And since they're unhooked, there can't be any new users.

So you're the one and only user left, and code is fine the way it is.
Suren Baghdasaryan Dec. 18, 2024, 5:58 p.m. UTC | #20
On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
>
> > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > dropping mmap_lock_write, you should be good.
> >
> > Ok, I think we will have to move mmap_write_downgrade() inside
> > vms_complete_munmap_vmas() to be called after remove_vma().
> > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > mmap_write_downgrade().
>
> Why ?!
>
> vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> concurrency left at this point.
>
> Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> which is *after* the vmas have been unhooked from the mm, you wait for
> any concurrent user to go away.
>
> And since they're unhooked, there can't be any new users.
>
> So you're the one and only user left, and code is fine the way it is.

Ok, let me make sure I understand this part of your proposal. From
your earlier email:

@@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
vma_munmap_struct *vms,
        struct vm_area_struct *vma;
        struct mm_struct *mm;

+       mas_for_each(mas_detach, vma, ULONG_MAX) {
+               vma_start_write(next);
+               vma_mark_detached(next, true);
+       }
+
        mm = current->mm;
        mm->map_count -= vms->vma_count;
        mm->locked_vm -= vms->locked_vm;

This would mean:

vms_complete_munmap_vmas
           vma_start_write
           vma_mark_detached
           mmap_write_downgrade
           vms_clear_ptes
           remove_vma

And remove_vma will be just freeing the vmas. Is that correct?
I'm a bit confused because the original thinking was that
vma_mark_detached() would drop the last refcnt and if it's 0 we would
free the vma right there. If that's still what we want to do then I
think the above sequence should look like this:

vms_complete_munmap_vmas
           vms_clear_ptes
           remove_vma
               vma_start_write
               vma_mark_detached
           mmap_write_downgrade

because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
Please let me know which way you want to move forward.


>
>
Liam R. Howlett Dec. 18, 2024, 7 p.m. UTC | #21
* Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> >
> > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > dropping mmap_lock_write, you should be good.
> > >
> > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > mmap_write_downgrade().
> >
> > Why ?!
> >
> > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > concurrency left at this point.
> >
> > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > which is *after* the vmas have been unhooked from the mm, you wait for
> > any concurrent user to go away.
> >
> > And since they're unhooked, there can't be any new users.
> >
> > So you're the one and only user left, and code is fine the way it is.
> 
> Ok, let me make sure I understand this part of your proposal. From
> your earlier email:
> 
> @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> vma_munmap_struct *vms,
>         struct vm_area_struct *vma;
>         struct mm_struct *mm;
> 
> +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> +               vma_start_write(next);
> +               vma_mark_detached(next, true);
> +       }
> +
>         mm = current->mm;
>         mm->map_count -= vms->vma_count;
>         mm->locked_vm -= vms->locked_vm;
> 
> This would mean:
> 
> vms_complete_munmap_vmas
>            vma_start_write
>            vma_mark_detached
>            mmap_write_downgrade
>            vms_clear_ptes
>            remove_vma
> 
> And remove_vma will be just freeing the vmas. Is that correct?
> I'm a bit confused because the original thinking was that
> vma_mark_detached() would drop the last refcnt and if it's 0 we would
> free the vma right there. If that's still what we want to do then I
> think the above sequence should look like this:
> 
> vms_complete_munmap_vmas
>            vms_clear_ptes
>            remove_vma
>                vma_start_write
>                vma_mark_detached
>            mmap_write_downgrade
> 
> because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> Please let me know which way you want to move forward.
> 

Are we sure we're not causing issues with the MAP_FIXED path here?

With the above change, we'd be freeing the PTEs before marking the vmas
as detached or vma_start_write().
Suren Baghdasaryan Dec. 18, 2024, 7:07 p.m. UTC | #22
On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > >
> > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > dropping mmap_lock_write, you should be good.
> > > >
> > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > mmap_write_downgrade().
> > >
> > > Why ?!
> > >
> > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > concurrency left at this point.
> > >
> > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > any concurrent user to go away.
> > >
> > > And since they're unhooked, there can't be any new users.
> > >
> > > So you're the one and only user left, and code is fine the way it is.
> >
> > Ok, let me make sure I understand this part of your proposal. From
> > your earlier email:
> >
> > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > vma_munmap_struct *vms,
> >         struct vm_area_struct *vma;
> >         struct mm_struct *mm;
> >
> > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > +               vma_start_write(next);
> > +               vma_mark_detached(next, true);
> > +       }
> > +
> >         mm = current->mm;
> >         mm->map_count -= vms->vma_count;
> >         mm->locked_vm -= vms->locked_vm;
> >
> > This would mean:
> >
> > vms_complete_munmap_vmas
> >            vma_start_write
> >            vma_mark_detached
> >            mmap_write_downgrade
> >            vms_clear_ptes
> >            remove_vma
> >
> > And remove_vma will be just freeing the vmas. Is that correct?
> > I'm a bit confused because the original thinking was that
> > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > free the vma right there. If that's still what we want to do then I
> > think the above sequence should look like this:
> >
> > vms_complete_munmap_vmas
> >            vms_clear_ptes
> >            remove_vma
> >                vma_start_write
> >                vma_mark_detached
> >            mmap_write_downgrade
> >
> > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > Please let me know which way you want to move forward.
> >
>
> Are we sure we're not causing issues with the MAP_FIXED path here?
>
> With the above change, we'd be freeing the PTEs before marking the vmas
> as detached or vma_start_write().

IIUC when we call vms_complete_munmap_vmas() all vmas inside
mas_detach have been already write-locked, no?

>
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Suren Baghdasaryan Dec. 18, 2024, 7:29 p.m. UTC | #23
On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> <kernel-team@android.com> wrote:
> >
> > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > >
> > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > dropping mmap_lock_write, you should be good.
> > > > >
> > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > mmap_write_downgrade().
> > > >
> > > > Why ?!
> > > >
> > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > concurrency left at this point.
> > > >
> > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > any concurrent user to go away.
> > > >
> > > > And since they're unhooked, there can't be any new users.
> > > >
> > > > So you're the one and only user left, and code is fine the way it is.
> > >
> > > Ok, let me make sure I understand this part of your proposal. From
> > > your earlier email:
> > >
> > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > vma_munmap_struct *vms,
> > >         struct vm_area_struct *vma;
> > >         struct mm_struct *mm;
> > >
> > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > +               vma_start_write(next);
> > > +               vma_mark_detached(next, true);
> > > +       }
> > > +
> > >         mm = current->mm;
> > >         mm->map_count -= vms->vma_count;
> > >         mm->locked_vm -= vms->locked_vm;
> > >
> > > This would mean:
> > >
> > > vms_complete_munmap_vmas
> > >            vma_start_write
> > >            vma_mark_detached
> > >            mmap_write_downgrade
> > >            vms_clear_ptes
> > >            remove_vma
> > >
> > > And remove_vma will be just freeing the vmas. Is that correct?
> > > I'm a bit confused because the original thinking was that
> > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > free the vma right there. If that's still what we want to do then I
> > > think the above sequence should look like this:
> > >
> > > vms_complete_munmap_vmas
> > >            vms_clear_ptes
> > >            remove_vma
> > >                vma_start_write
> > >                vma_mark_detached
> > >            mmap_write_downgrade
> > >
> > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > Please let me know which way you want to move forward.
> > >
> >
> > Are we sure we're not causing issues with the MAP_FIXED path here?
> >
> > With the above change, we'd be freeing the PTEs before marking the vmas
> > as detached or vma_start_write().
>
> IIUC when we call vms_complete_munmap_vmas() all vmas inside
> mas_detach have been already write-locked, no?

Yeah, I think we can simply do this:

vms_complete_munmap_vmas
           vms_clear_ptes
           remove_vma
               vma_mark_detached
           mmap_write_downgrade

If my assumption is incorrect, assertion inside vma_mark_detached()
should trigger. I tried a quick test and so far nothing exploded.

>
> >
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >
Liam R. Howlett Dec. 18, 2024, 7:38 p.m. UTC | #24
* Suren Baghdasaryan <surenb@google.com> [241218 14:29]:
> On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> > <kernel-team@android.com> wrote:
> > >
> > > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > > >
> > > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > > dropping mmap_lock_write, you should be good.
> > > > > >
> > > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > > mmap_write_downgrade().
> > > > >
> > > > > Why ?!
> > > > >
> > > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > > concurrency left at this point.
> > > > >
> > > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > > any concurrent user to go away.
> > > > >
> > > > > And since they're unhooked, there can't be any new users.
> > > > >
> > > > > So you're the one and only user left, and code is fine the way it is.
> > > >
> > > > Ok, let me make sure I understand this part of your proposal. From
> > > > your earlier email:
> > > >
> > > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > > vma_munmap_struct *vms,
> > > >         struct vm_area_struct *vma;
> > > >         struct mm_struct *mm;
> > > >
> > > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > > +               vma_start_write(next);
> > > > +               vma_mark_detached(next, true);
> > > > +       }
> > > > +
> > > >         mm = current->mm;
> > > >         mm->map_count -= vms->vma_count;
> > > >         mm->locked_vm -= vms->locked_vm;
> > > >
> > > > This would mean:
> > > >
> > > > vms_complete_munmap_vmas
> > > >            vma_start_write
> > > >            vma_mark_detached
> > > >            mmap_write_downgrade
> > > >            vms_clear_ptes
> > > >            remove_vma
> > > >
> > > > And remove_vma will be just freeing the vmas. Is that correct?
> > > > I'm a bit confused because the original thinking was that
> > > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > > free the vma right there. If that's still what we want to do then I
> > > > think the above sequence should look like this:
> > > >
> > > > vms_complete_munmap_vmas
> > > >            vms_clear_ptes
> > > >            remove_vma
> > > >                vma_start_write
> > > >                vma_mark_detached
> > > >            mmap_write_downgrade
> > > >
> > > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > > Please let me know which way you want to move forward.
> > > >
> > >
> > > Are we sure we're not causing issues with the MAP_FIXED path here?
> > >
> > > With the above change, we'd be freeing the PTEs before marking the vmas
> > > as detached or vma_start_write().
> >
> > IIUC when we call vms_complete_munmap_vmas() all vmas inside
> > mas_detach have been already write-locked, no?

That's the way it is today - but I thought you were moving the lock to
the complete stage, not adding a new one? (why add a new one otherwise?)

> 
> Yeah, I think we can simply do this:
> 
> vms_complete_munmap_vmas
>            vms_clear_ptes
>            remove_vma
>                vma_mark_detached
>            mmap_write_downgrade
> 
> If my assumption is incorrect, assertion inside vma_mark_detached()
> should trigger. I tried a quick test and so far nothing exploded.
> 

If they are write locked, then the page faults are not a concern.  There
is also the rmap race that Jann found in mmap_region() [1].  This is
probably also fine since you are keeping the write lock in place earlier
on in the gather stage.  Note the ptes will already be cleared by the
time vms_complete_munmap_vmas() is called in this case.

[1] https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/

Thanks,
Liam
Suren Baghdasaryan Dec. 18, 2024, 8 p.m. UTC | #25
On Wed, Dec 18, 2024 at 11:38 AM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [241218 14:29]:
> > On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> > > <kernel-team@android.com> wrote:
> > > >
> > > > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > >
> > > > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > > > >
> > > > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > > > dropping mmap_lock_write, you should be good.
> > > > > > >
> > > > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > > > mmap_write_downgrade().
> > > > > >
> > > > > > Why ?!
> > > > > >
> > > > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > > > concurrency left at this point.
> > > > > >
> > > > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > > > any concurrent user to go away.
> > > > > >
> > > > > > And since they're unhooked, there can't be any new users.
> > > > > >
> > > > > > So you're the one and only user left, and code is fine the way it is.
> > > > >
> > > > > Ok, let me make sure I understand this part of your proposal. From
> > > > > your earlier email:
> > > > >
> > > > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > > > vma_munmap_struct *vms,
> > > > >         struct vm_area_struct *vma;
> > > > >         struct mm_struct *mm;
> > > > >
> > > > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > > > +               vma_start_write(next);
> > > > > +               vma_mark_detached(next, true);
> > > > > +       }
> > > > > +
> > > > >         mm = current->mm;
> > > > >         mm->map_count -= vms->vma_count;
> > > > >         mm->locked_vm -= vms->locked_vm;
> > > > >
> > > > > This would mean:
> > > > >
> > > > > vms_complete_munmap_vmas
> > > > >            vma_start_write
> > > > >            vma_mark_detached
> > > > >            mmap_write_downgrade
> > > > >            vms_clear_ptes
> > > > >            remove_vma
> > > > >
> > > > > And remove_vma will be just freeing the vmas. Is that correct?
> > > > > I'm a bit confused because the original thinking was that
> > > > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > > > free the vma right there. If that's still what we want to do then I
> > > > > think the above sequence should look like this:
> > > > >
> > > > > vms_complete_munmap_vmas
> > > > >            vms_clear_ptes
> > > > >            remove_vma
> > > > >                vma_start_write
> > > > >                vma_mark_detached
> > > > >            mmap_write_downgrade
> > > > >
> > > > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > > > Please let me know which way you want to move forward.
> > > > >
> > > >
> > > > Are we sure we're not causing issues with the MAP_FIXED path here?
> > > >
> > > > With the above change, we'd be freeing the PTEs before marking the vmas
> > > > as detached or vma_start_write().
> > >
> > > IIUC when we call vms_complete_munmap_vmas() all vmas inside
> > > mas_detach have been already write-locked, no?
>
> That's the way it is today - but I thought you were moving the lock to
> the complete stage, not adding a new one? (why add a new one otherwise?)

Is my understanding correct that mas_detach is populated by
vms_gather_munmap_vmas() only with vmas that went through
__split_vma() (and were write-locked there)? I don't see any path that
would add any other vma into mas_detach but maybe I'm missing
something?

>
> >
> > Yeah, I think we can simply do this:
> >
> > vms_complete_munmap_vmas
> >            vms_clear_ptes
> >            remove_vma
> >                vma_mark_detached
> >            mmap_write_downgrade
> >
> > If my assumption is incorrect, assertion inside vma_mark_detached()
> > should trigger. I tried a quick test and so far nothing exploded.
> >
>
> If they are write locked, then the page faults are not a concern.  There
> is also the rmap race that Jann found in mmap_region() [1].  This is
> probably also fine since you are keeping the write lock in place earlier
> on in the gather stage.  Note the ptes will already be cleared by the
> time vms_complete_munmap_vmas() is called in this case.
>
> [1] https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
>
> Thanks,
> Liam
>
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Liam R. Howlett Dec. 18, 2024, 8:38 p.m. UTC | #26
* Suren Baghdasaryan <surenb@google.com> [241218 15:01]:
> On Wed, Dec 18, 2024 at 11:38 AM 'Liam R. Howlett' via kernel-team
> <kernel-team@android.com> wrote:
> >
> > * Suren Baghdasaryan <surenb@google.com> [241218 14:29]:
> > > On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> > > > <kernel-team@android.com> wrote:
> > > > >
> > > > > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > > > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > >
> > > > > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > > > > >
> > > > > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > > > > dropping mmap_lock_write, you should be good.
> > > > > > > >
> > > > > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > > > > mmap_write_downgrade().
> > > > > > >
> > > > > > > Why ?!
> > > > > > >
> > > > > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > > > > concurrency left at this point.
> > > > > > >
> > > > > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > > > > any concurrent user to go away.
> > > > > > >
> > > > > > > And since they're unhooked, there can't be any new users.
> > > > > > >
> > > > > > > So you're the one and only user left, and code is fine the way it is.
> > > > > >
> > > > > > Ok, let me make sure I understand this part of your proposal. From
> > > > > > your earlier email:
> > > > > >
> > > > > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > > > > vma_munmap_struct *vms,
> > > > > >         struct vm_area_struct *vma;
> > > > > >         struct mm_struct *mm;
> > > > > >
> > > > > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > > > > +               vma_start_write(next);
> > > > > > +               vma_mark_detached(next, true);
> > > > > > +       }
> > > > > > +
> > > > > >         mm = current->mm;
> > > > > >         mm->map_count -= vms->vma_count;
> > > > > >         mm->locked_vm -= vms->locked_vm;
> > > > > >
> > > > > > This would mean:
> > > > > >
> > > > > > vms_complete_munmap_vmas
> > > > > >            vma_start_write
> > > > > >            vma_mark_detached
> > > > > >            mmap_write_downgrade
> > > > > >            vms_clear_ptes
> > > > > >            remove_vma
> > > > > >
> > > > > > And remove_vma will be just freeing the vmas. Is that correct?
> > > > > > I'm a bit confused because the original thinking was that
> > > > > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > > > > free the vma right there. If that's still what we want to do then I
> > > > > > think the above sequence should look like this:
> > > > > >
> > > > > > vms_complete_munmap_vmas
> > > > > >            vms_clear_ptes
> > > > > >            remove_vma
> > > > > >                vma_start_write
> > > > > >                vma_mark_detached
> > > > > >            mmap_write_downgrade
> > > > > >
> > > > > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > > > > Please let me know which way you want to move forward.
> > > > > >
> > > > >
> > > > > Are we sure we're not causing issues with the MAP_FIXED path here?
> > > > >
> > > > > With the above change, we'd be freeing the PTEs before marking the vmas
> > > > > as detached or vma_start_write().
> > > >
> > > > IIUC when we call vms_complete_munmap_vmas() all vmas inside
> > > > mas_detach have been already write-locked, no?
> >
> > That's the way it is today - but I thought you were moving the lock to
> > the complete stage, not adding a new one? (why add a new one otherwise?)
> 
> Is my understanding correct that mas_detach is populated by
> vms_gather_munmap_vmas() only with vmas that went through
> __split_vma() (and were write-locked there)? I don't see any path that
> would add any other vma into mas_detach but maybe I'm missing
> something?

No, that is not correct.

vms_gather_munmap_vmas() calls split on the first vma, then adds all
vmas that are within the range of the munmap() call.  Potentially
splitting the last vma and adding that in the
"if (next->vm_end > vms->end)" block.

Sometimes this is a single vma that gets split twice, sometimes no
splits happen and entire vmas are unmapped, sometimes it's just one vma
that isn't split.

My observation is the common case is a single vma, but besides that we
see 3, and sometimes 7 at a time, but it could be any number of vmas and
not all of them are split.

There is a loop for_each_vma_range() that does:

vma_start_write(next);
mas_set(mas_detach, vms->mas_count++);
mas_store_gfp(mas_detach, next, GFP_KERNEL);


> 
> >
> > >
> > > Yeah, I think we can simply do this:
> > >
> > > vms_complete_munmap_vmas
> > >            vms_clear_ptes
> > >            remove_vma
> > >                vma_mark_detached
> > >            mmap_write_downgrade
> > >
> > > If my assumption is incorrect, assertion inside vma_mark_detached()
> > > should trigger. I tried a quick test and so far nothing exploded.
> > >
> >
> > If they are write locked, then the page faults are not a concern.  There
> > is also the rmap race that Jann found in mmap_region() [1].  This is
> > probably also fine since you are keeping the write lock in place earlier
> > on in the gather stage.  Note the ptes will already be cleared by the
> > time vms_complete_munmap_vmas() is called in this case.
> >
> > [1] https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
> >
> > Thanks,
> > Liam
> >
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >
Suren Baghdasaryan Dec. 18, 2024, 9:53 p.m. UTC | #27
On Wed, Dec 18, 2024 at 12:38 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [241218 15:01]:
> > On Wed, Dec 18, 2024 at 11:38 AM 'Liam R. Howlett' via kernel-team
> > <kernel-team@android.com> wrote:
> > >
> > > * Suren Baghdasaryan <surenb@google.com> [241218 14:29]:
> > > > On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> > > > > <kernel-team@android.com> wrote:
> > > > > >
> > > > > > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > > > > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > > >
> > > > > > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > > > > > >
> > > > > > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > > > > > dropping mmap_lock_write, you should be good.
> > > > > > > > >
> > > > > > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > > > > > mmap_write_downgrade().
> > > > > > > >
> > > > > > > > Why ?!
> > > > > > > >
> > > > > > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > > > > > concurrency left at this point.
> > > > > > > >
> > > > > > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > > > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > > > > > any concurrent user to go away.
> > > > > > > >
> > > > > > > > And since they're unhooked, there can't be any new users.
> > > > > > > >
> > > > > > > > So you're the one and only user left, and code is fine the way it is.
> > > > > > >
> > > > > > > Ok, let me make sure I understand this part of your proposal. From
> > > > > > > your earlier email:
> > > > > > >
> > > > > > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > > > > > vma_munmap_struct *vms,
> > > > > > >         struct vm_area_struct *vma;
> > > > > > >         struct mm_struct *mm;
> > > > > > >
> > > > > > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > > > > > +               vma_start_write(next);
> > > > > > > +               vma_mark_detached(next, true);
> > > > > > > +       }
> > > > > > > +
> > > > > > >         mm = current->mm;
> > > > > > >         mm->map_count -= vms->vma_count;
> > > > > > >         mm->locked_vm -= vms->locked_vm;
> > > > > > >
> > > > > > > This would mean:
> > > > > > >
> > > > > > > vms_complete_munmap_vmas
> > > > > > >            vma_start_write
> > > > > > >            vma_mark_detached
> > > > > > >            mmap_write_downgrade
> > > > > > >            vms_clear_ptes
> > > > > > >            remove_vma
> > > > > > >
> > > > > > > And remove_vma will be just freeing the vmas. Is that correct?
> > > > > > > I'm a bit confused because the original thinking was that
> > > > > > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > > > > > free the vma right there. If that's still what we want to do then I
> > > > > > > think the above sequence should look like this:
> > > > > > >
> > > > > > > vms_complete_munmap_vmas
> > > > > > >            vms_clear_ptes
> > > > > > >            remove_vma
> > > > > > >                vma_start_write
> > > > > > >                vma_mark_detached
> > > > > > >            mmap_write_downgrade
> > > > > > >
> > > > > > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > > > > > Please let me know which way you want to move forward.
> > > > > > >
> > > > > >
> > > > > > Are we sure we're not causing issues with the MAP_FIXED path here?
> > > > > >
> > > > > > With the above change, we'd be freeing the PTEs before marking the vmas
> > > > > > as detached or vma_start_write().
> > > > >
> > > > > IIUC when we call vms_complete_munmap_vmas() all vmas inside
> > > > > mas_detach have been already write-locked, no?
> > >
> > > That's the way it is today - but I thought you were moving the lock to
> > > the complete stage, not adding a new one? (why add a new one otherwise?)
> >
> > Is my understanding correct that mas_detach is populated by
> > vms_gather_munmap_vmas() only with vmas that went through
> > __split_vma() (and were write-locked there)? I don't see any path that
> > would add any other vma into mas_detach but maybe I'm missing
> > something?
>
> No, that is not correct.
>
> vms_gather_munmap_vmas() calls split on the first vma, then adds all
> vmas that are within the range of the munmap() call.  Potentially
> splitting the last vma and adding that in the
> "if (next->vm_end > vms->end)" block.
>
> Sometimes this is a single vma that gets split twice, sometimes no
> splits happen and entire vmas are unmapped, sometimes it's just one vma
> that isn't split.
>
> My observation is the common case is a single vma, but besides that we
> see 3, and sometimes 7 at a time, but it could be any number of vmas and
> not all of them are split.
>
> There is a loop for_each_vma_range() that does:
>
> vma_start_write(next);
> mas_set(mas_detach, vms->mas_count++);
> mas_store_gfp(mas_detach, next, GFP_KERNEL);

Ah, ok I see now. I completely misunderstood what for_each_vma_range()
was doing.

Then I think vma_start_write() should remain inside
vms_gather_munmap_vmas() and all vmas in mas_detach should be
write-locked, even the ones we are not modifying. Otherwise what would
prevent the race I mentioned before?

__mmap_region
    __mmap_prepare
        vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
                                                      // some locked
by __split_vma(), some not locked

                                     lock_vma_under_rcu()
                                         vma = mas_walk // finds
unlocked vma also in mas_detach
                                         vma_start_read(vma) //
succeeds since vma is not locked
                                         // vma->detached, vm_start,
vm_end checks pass
                                     // vma is successfully read-locked

       vms_clean_up_area(mas_detach)
            vms_clear_ptes
                                     // steps on a cleared PTE
    __mmap_new_vma
        vma_set_range // installs new vma in the range
    __mmap_complete
        vms_complete_munmap_vmas // vmas are write-locked and detached
but it's too late



>
>
> >
> > >
> > > >
> > > > Yeah, I think we can simply do this:
> > > >
> > > > vms_complete_munmap_vmas
> > > >            vms_clear_ptes
> > > >            remove_vma
> > > >                vma_mark_detached
> > > >            mmap_write_downgrade
> > > >
> > > > If my assumption is incorrect, assertion inside vma_mark_detached()
> > > > should trigger. I tried a quick test and so far nothing exploded.
> > > >
> > >
> > > If they are write locked, then the page faults are not a concern.  There
> > > is also the rmap race that Jann found in mmap_region() [1].  This is
> > > probably also fine since you are keeping the write lock in place earlier
> > > on in the gather stage.  Note the ptes will already be cleared by the
> > > time vms_complete_munmap_vmas() is called in this case.
> > >
> > > [1] https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
> > >
> > > Thanks,
> > > Liam
> > >
> > > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> > >
Suren Baghdasaryan Dec. 18, 2024, 9:55 p.m. UTC | #28
On Wed, Dec 18, 2024 at 1:53 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Dec 18, 2024 at 12:38 PM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Suren Baghdasaryan <surenb@google.com> [241218 15:01]:
> > > On Wed, Dec 18, 2024 at 11:38 AM 'Liam R. Howlett' via kernel-team
> > > <kernel-team@android.com> wrote:
> > > >
> > > > * Suren Baghdasaryan <surenb@google.com> [241218 14:29]:
> > > > > On Wed, Dec 18, 2024 at 11:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > > > > On Wed, Dec 18, 2024 at 11:00 AM 'Liam R. Howlett' via kernel-team
> > > > > > <kernel-team@android.com> wrote:
> > > > > > >
> > > > > > > * Suren Baghdasaryan <surenb@google.com> [241218 12:58]:
> > > > > > > > On Wed, Dec 18, 2024 at 9:44 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Dec 18, 2024 at 09:36:42AM -0800, Suren Baghdasaryan wrote:
> > > > > > > > >
> > > > > > > > > > > You will not. vms_complete_munmap_vmas() will call remove_vma() to
> > > > > > > > > > > remove PTEs IIRC, and if you do start_write() and detach() before
> > > > > > > > > > > dropping mmap_lock_write, you should be good.
> > > > > > > > > >
> > > > > > > > > > Ok, I think we will have to move mmap_write_downgrade() inside
> > > > > > > > > > vms_complete_munmap_vmas() to be called after remove_vma().
> > > > > > > > > > vms_clear_ptes() is using vmas, so we can't move remove_vma() before
> > > > > > > > > > mmap_write_downgrade().
> > > > > > > > >
> > > > > > > > > Why ?!
> > > > > > > > >
> > > > > > > > > vms_clear_ptes() and remove_vma() are fine where they are -- there is no
> > > > > > > > > concurrency left at this point.
> > > > > > > > >
> > > > > > > > > Note that by doing vma_start_write() inside vms_complete_munmap_vmas(),
> > > > > > > > > which is *after* the vmas have been unhooked from the mm, you wait for
> > > > > > > > > any concurrent user to go away.
> > > > > > > > >
> > > > > > > > > And since they're unhooked, there can't be any new users.
> > > > > > > > >
> > > > > > > > > So you're the one and only user left, and code is fine the way it is.
> > > > > > > >
> > > > > > > > Ok, let me make sure I understand this part of your proposal. From
> > > > > > > > your earlier email:
> > > > > > > >
> > > > > > > > @@ -1173,6 +1173,11 @@ static void vms_complete_munmap_vmas(struct
> > > > > > > > vma_munmap_struct *vms,
> > > > > > > >         struct vm_area_struct *vma;
> > > > > > > >         struct mm_struct *mm;
> > > > > > > >
> > > > > > > > +       mas_for_each(mas_detach, vma, ULONG_MAX) {
> > > > > > > > +               vma_start_write(next);
> > > > > > > > +               vma_mark_detached(next, true);
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > >         mm = current->mm;
> > > > > > > >         mm->map_count -= vms->vma_count;
> > > > > > > >         mm->locked_vm -= vms->locked_vm;
> > > > > > > >
> > > > > > > > This would mean:
> > > > > > > >
> > > > > > > > vms_complete_munmap_vmas
> > > > > > > >            vma_start_write
> > > > > > > >            vma_mark_detached
> > > > > > > >            mmap_write_downgrade
> > > > > > > >            vms_clear_ptes
> > > > > > > >            remove_vma
> > > > > > > >
> > > > > > > > And remove_vma will be just freeing the vmas. Is that correct?
> > > > > > > > I'm a bit confused because the original thinking was that
> > > > > > > > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > > > > > > > free the vma right there. If that's still what we want to do then I
> > > > > > > > think the above sequence should look like this:
> > > > > > > >
> > > > > > > > vms_complete_munmap_vmas
> > > > > > > >            vms_clear_ptes
> > > > > > > >            remove_vma
> > > > > > > >                vma_start_write
> > > > > > > >                vma_mark_detached
> > > > > > > >            mmap_write_downgrade
> > > > > > > >
> > > > > > > > because vma_start_write+vma_mark_detached should be done under  mmap_write_lock.
> > > > > > > > Please let me know which way you want to move forward.
> > > > > > > >
> > > > > > >
> > > > > > > Are we sure we're not causing issues with the MAP_FIXED path here?
> > > > > > >
> > > > > > > With the above change, we'd be freeing the PTEs before marking the vmas
> > > > > > > as detached or vma_start_write().
> > > > > >
> > > > > > IIUC when we call vms_complete_munmap_vmas() all vmas inside
> > > > > > mas_detach have been already write-locked, no?
> > > >
> > > > That's the way it is today - but I thought you were moving the lock to
> > > > the complete stage, not adding a new one? (why add a new one otherwise?)
> > >
> > > Is my understanding correct that mas_detach is populated by
> > > vms_gather_munmap_vmas() only with vmas that went through
> > > __split_vma() (and were write-locked there)? I don't see any path that
> > > would add any other vma into mas_detach but maybe I'm missing
> > > something?
> >
> > No, that is not correct.
> >
> > vms_gather_munmap_vmas() calls split on the first vma, then adds all
> > vmas that are within the range of the munmap() call.  Potentially
> > splitting the last vma and adding that in the
> > "if (next->vm_end > vms->end)" block.
> >
> > Sometimes this is a single vma that gets split twice, sometimes no
> > splits happen and entire vmas are unmapped, sometimes it's just one vma
> > that isn't split.
> >
> > My observation is the common case is a single vma, but besides that we
> > see 3, and sometimes 7 at a time, but it could be any number of vmas and
> > not all of them are split.
> >
> > There is a loop for_each_vma_range() that does:
> >
> > vma_start_write(next);
> > mas_set(mas_detach, vms->mas_count++);
> > mas_store_gfp(mas_detach, next, GFP_KERNEL);
>
> Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> was doing.
>
> Then I think vma_start_write() should remain inside
> vms_gather_munmap_vmas() and all vmas in mas_detach should be
> write-locked, even the ones we are not modifying. Otherwise what would
> prevent the race I mentioned before?
>
> __mmap_region
>     __mmap_prepare
>         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
>                                                       // some locked
> by __split_vma(), some not locked
>
>                                      lock_vma_under_rcu()
>                                          vma = mas_walk // finds
> unlocked vma also in mas_detach
>                                          vma_start_read(vma) //
> succeeds since vma is not locked
>                                          // vma->detached, vm_start,
> vm_end checks pass
>                                      // vma is successfully read-locked
>
>        vms_clean_up_area(mas_detach)
>             vms_clear_ptes
>                                      // steps on a cleared PTE
>     __mmap_new_vma
>         vma_set_range // installs new vma in the range
>     __mmap_complete
>         vms_complete_munmap_vmas // vmas are write-locked and detached
> but it's too late

Sorry about the formatting. Without comments should look better:

__mmap_region
    __mmap_prepare
        vms_gather_munmap_vmas

                                     lock_vma_under_rcu()
                                         vma = mas_walk
                                         vma_start_read(vma)
                                         // vma is still valid and attached

       vms_clean_up_area(mas_detach)
            vms_clear_ptes
                                     // steps on a cleared PTE
    __mmap_new_vma
        vma_set_range
    __mmap_complete
        vms_complete_munmap_vmas

>
>
>
> >
> >
> > >
> > > >
> > > > >
> > > > > Yeah, I think we can simply do this:
> > > > >
> > > > > vms_complete_munmap_vmas
> > > > >            vms_clear_ptes
> > > > >            remove_vma
> > > > >                vma_mark_detached
> > > > >            mmap_write_downgrade
> > > > >
> > > > > If my assumption is incorrect, assertion inside vma_mark_detached()
> > > > > should trigger. I tried a quick test and so far nothing exploded.
> > > > >
> > > >
> > > > If they are write locked, then the page faults are not a concern.  There
> > > > is also the rmap race that Jann found in mmap_region() [1].  This is
> > > > probably also fine since you are keeping the write lock in place earlier
> > > > on in the gather stage.  Note the ptes will already be cleared by the
> > > > time vms_complete_munmap_vmas() is called in this case.
> > > >
> > > > [1] https://lore.kernel.org/all/CAG48ez0ZpGzxi=-5O_uGQ0xKXOmbjeQ0LjZsRJ1Qtf2X5eOr1w@mail.gmail.com/
> > > >
> > > > Thanks,
> > > > Liam
> > > >
> > > > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> > > >
Andrew Morton Dec. 19, 2024, 12:35 a.m. UTC | #29
On Wed, 18 Dec 2024 13:53:17 -0800 Suren Baghdasaryan <surenb@google.com> wrote:

> > There is a loop for_each_vma_range() that does:
> >
> > vma_start_write(next);
> > mas_set(mas_detach, vms->mas_count++);
> > mas_store_gfp(mas_detach, next, GFP_KERNEL);
> 
> Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> was doing.

I'll drop the v6 series from mm-unstable.
Suren Baghdasaryan Dec. 19, 2024, 12:47 a.m. UTC | #30
On Wed, Dec 18, 2024 at 4:36 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed, 18 Dec 2024 13:53:17 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > > There is a loop for_each_vma_range() that does:
> > >
> > > vma_start_write(next);
> > > mas_set(mas_detach, vms->mas_count++);
> > > mas_store_gfp(mas_detach, next, GFP_KERNEL);
> >
> > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > was doing.
>
> I'll drop the v6 series from mm-unstable.

Sounds good. v7 will be quite different.
Peter Zijlstra Dec. 19, 2024, 8:53 a.m. UTC | #31
On Wed, Dec 18, 2024 at 09:58:12AM -0800, Suren Baghdasaryan wrote:
	
> And remove_vma will be just freeing the vmas. Is that correct?

Yep.

> I'm a bit confused because the original thinking was that
> vma_mark_detached() would drop the last refcnt and if it's 0 we would
> free the vma right there. If that's still what we want to do then I
> think the above sequence should look like this:

Right; sorry about that. So my initial objection to that extra sync was
based on the reasons presented -- but having had to look at the unmap
path again (my mm-foo is somewhat rusty, I've not done much the past few
years) I realized that keeping a VMA alive beyond unmapping PTEs is just
plain daft.

So yes, back to your original semantics, but cleaned up to not need that
extra sync point -- instead relying on the natural placement of
vma_start_write() after unhooking from the mm. And not for reasons of
the race, but for reasons of integrity -- VMA without PTEs is asking for
more trouble.
Peter Zijlstra Dec. 19, 2024, 8:55 a.m. UTC | #32
On Wed, Dec 18, 2024 at 11:29:23AM -0800, Suren Baghdasaryan wrote:

> Yeah, I think we can simply do this:
> 
> vms_complete_munmap_vmas
>            vms_clear_ptes
>            remove_vma
>                vma_mark_detached
>            mmap_write_downgrade
> 
> If my assumption is incorrect, assertion inside vma_mark_detached()
> should trigger. I tried a quick test and so far nothing exploded.

I think that would be unfortunate and could cause regressions. I think
we want to keep vms_clear_ptes() under the read-lock.
Peter Zijlstra Dec. 19, 2024, 9:13 a.m. UTC | #33
On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:

> Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> was doing.
> 
> Then I think vma_start_write() should remain inside
> vms_gather_munmap_vmas() and all vmas in mas_detach should be

No, it must not. You really are not modifying anything yet (except the
split, which we've already noted mark write themselves).

> write-locked, even the ones we are not modifying. Otherwise what would
> prevent the race I mentioned before?
> 
> __mmap_region
>     __mmap_prepare
>         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
>                                                       // some locked
> by __split_vma(), some not locked
> 
>                                      lock_vma_under_rcu()
>                                          vma = mas_walk // finds
> unlocked vma also in mas_detach
>                                          vma_start_read(vma) //
> succeeds since vma is not locked
>                                          // vma->detached, vm_start,
> vm_end checks pass
>                                      // vma is successfully read-locked
> 
>        vms_clean_up_area(mas_detach)
>             vms_clear_ptes
>                                      // steps on a cleared PTE

So here we have the added complexity that the vma is not unhooked at
all. Is there anything that would prevent a concurrent gup_fast() from
doing the same -- touch a cleared PTE?

AFAICT two threads, one doing overlapping mmap() and the other doing
gup_fast() can result in exactly this scenario.

If we don't care about the GUP case, when I'm thinking we should not
care about the lockless RCU case either.

>     __mmap_new_vma
>         vma_set_range // installs new vma in the range
>     __mmap_complete
>         vms_complete_munmap_vmas // vmas are write-locked and detached
> but it's too late

But at this point that old vma really is unhooked, and the
vma_write_start() here will ensure readers are gone and it will clear
PTEs *again*.
Peter Zijlstra Dec. 19, 2024, 11:20 a.m. UTC | #34
On Thu, Dec 19, 2024 at 10:13:34AM +0100, Peter Zijlstra wrote:
> On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:
> 
> > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > was doing.
> > 
> > Then I think vma_start_write() should remain inside
> > vms_gather_munmap_vmas() and all vmas in mas_detach should be
> 
> No, it must not. You really are not modifying anything yet (except the
> split, which we've already noted mark write themselves).
> 
> > write-locked, even the ones we are not modifying. Otherwise what would
> > prevent the race I mentioned before?
> > 
> > __mmap_region
> >     __mmap_prepare
> >         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
> >                                                       // some locked
> > by __split_vma(), some not locked
> > 
> >                                      lock_vma_under_rcu()
> >                                          vma = mas_walk // finds
> > unlocked vma also in mas_detach
> >                                          vma_start_read(vma) //
> > succeeds since vma is not locked
> >                                          // vma->detached, vm_start,
> > vm_end checks pass
> >                                      // vma is successfully read-locked
> > 
> >        vms_clean_up_area(mas_detach)
> >             vms_clear_ptes
> >                                      // steps on a cleared PTE
> 
> So here we have the added complexity that the vma is not unhooked at
> all. Is there anything that would prevent a concurrent gup_fast() from
> doing the same -- touch a cleared PTE?
> 
> AFAICT two threads, one doing overlapping mmap() and the other doing
> gup_fast() can result in exactly this scenario.
> 
> If we don't care about the GUP case, when I'm thinking we should not
> care about the lockless RCU case either.

Also, at this point we'll just fail to find a page, and that is nothing
special. The problem with accessing an unmapped VMA is that the
page-table walk will instantiate page-tables.

Given this is an overlapping mmap -- we're going to need to those
page-tables anyway, so no harm done.

Only after the VMA is unlinked must we ensure we don't accidentally
re-instantiate page-tables.
Suren Baghdasaryan Dec. 19, 2024, 4:08 p.m. UTC | #35
On Thu, Dec 19, 2024 at 12:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 18, 2024 at 09:58:12AM -0800, Suren Baghdasaryan wrote:
>
> > And remove_vma will be just freeing the vmas. Is that correct?
>
> Yep.
>
> > I'm a bit confused because the original thinking was that
> > vma_mark_detached() would drop the last refcnt and if it's 0 we would
> > free the vma right there. If that's still what we want to do then I
> > think the above sequence should look like this:
>
> Right; sorry about that. So my initial objection to that extra sync was
> based on the reasons presented -- but having had to look at the unmap
> path again (my mm-foo is somewhat rusty, I've not done much the past few
> years) I realized that keeping a VMA alive beyond unmapping PTEs is just
> plain daft.
>
> So yes, back to your original semantics, but cleaned up to not need that
> extra sync point -- instead relying on the natural placement of
> vma_start_write() after unhooking from the mm. And not for reasons of
> the race, but for reasons of integrity -- VMA without PTEs is asking for
> more trouble.

Ack. Thanks for clarification!
Suren Baghdasaryan Dec. 19, 2024, 4:08 p.m. UTC | #36
On Thu, Dec 19, 2024 at 12:55 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 18, 2024 at 11:29:23AM -0800, Suren Baghdasaryan wrote:
>
> > Yeah, I think we can simply do this:
> >
> > vms_complete_munmap_vmas
> >            vms_clear_ptes
> >            remove_vma
> >                vma_mark_detached
> >            mmap_write_downgrade
> >
> > If my assumption is incorrect, assertion inside vma_mark_detached()
> > should trigger. I tried a quick test and so far nothing exploded.
>
> I think that would be unfortunate and could cause regressions. I think
> we want to keep vms_clear_ptes() under the read-lock.

Ok, I'll stop considering this option.
Suren Baghdasaryan Dec. 19, 2024, 4:14 p.m. UTC | #37
On Thu, Dec 19, 2024 at 1:13 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:
>
> > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > was doing.
> >
> > Then I think vma_start_write() should remain inside
> > vms_gather_munmap_vmas() and all vmas in mas_detach should be
>
> No, it must not. You really are not modifying anything yet (except the
> split, which we've already noted mark write themselves).
>
> > write-locked, even the ones we are not modifying. Otherwise what would
> > prevent the race I mentioned before?
> >
> > __mmap_region
> >     __mmap_prepare
> >         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
> >                                                       // some locked
> > by __split_vma(), some not locked
> >
> >                                      lock_vma_under_rcu()
> >                                          vma = mas_walk // finds
> > unlocked vma also in mas_detach
> >                                          vma_start_read(vma) //
> > succeeds since vma is not locked
> >                                          // vma->detached, vm_start,
> > vm_end checks pass
> >                                      // vma is successfully read-locked
> >
> >        vms_clean_up_area(mas_detach)
> >             vms_clear_ptes
> >                                      // steps on a cleared PTE
>
> So here we have the added complexity that the vma is not unhooked at
> all. Is there anything that would prevent a concurrent gup_fast() from
> doing the same -- touch a cleared PTE?
>
> AFAICT two threads, one doing overlapping mmap() and the other doing
> gup_fast() can result in exactly this scenario.
>
> If we don't care about the GUP case, when I'm thinking we should not
> care about the lockless RCU case either.
>
> >     __mmap_new_vma
> >         vma_set_range // installs new vma in the range
> >     __mmap_complete
> >         vms_complete_munmap_vmas // vmas are write-locked and detached
> > but it's too late
>
> But at this point that old vma really is unhooked, and the
> vma_write_start() here will ensure readers are gone and it will clear
> PTEs *again*.

So, to summarize, you want vma_start_write() and vma_mark_detached()
to be done when we are removing the vma from the tree, right?
Something like:

vma_start_write()
vma_iter_store()
vma_mark_detached()

And the race I described is not a real problem since the vma is still
in the tree, so gup_fast() does exactly that and will simply reinstall
the ptes.

>
>
Suren Baghdasaryan Dec. 19, 2024, 4:17 p.m. UTC | #38
On Thu, Dec 19, 2024 at 3:20 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 19, 2024 at 10:13:34AM +0100, Peter Zijlstra wrote:
> > On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:
> >
> > > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > > was doing.
> > >
> > > Then I think vma_start_write() should remain inside
> > > vms_gather_munmap_vmas() and all vmas in mas_detach should be
> >
> > No, it must not. You really are not modifying anything yet (except the
> > split, which we've already noted mark write themselves).
> >
> > > write-locked, even the ones we are not modifying. Otherwise what would
> > > prevent the race I mentioned before?
> > >
> > > __mmap_region
> > >     __mmap_prepare
> > >         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
> > >                                                       // some locked
> > > by __split_vma(), some not locked
> > >
> > >                                      lock_vma_under_rcu()
> > >                                          vma = mas_walk // finds
> > > unlocked vma also in mas_detach
> > >                                          vma_start_read(vma) //
> > > succeeds since vma is not locked
> > >                                          // vma->detached, vm_start,
> > > vm_end checks pass
> > >                                      // vma is successfully read-locked
> > >
> > >        vms_clean_up_area(mas_detach)
> > >             vms_clear_ptes
> > >                                      // steps on a cleared PTE
> >
> > So here we have the added complexity that the vma is not unhooked at
> > all. Is there anything that would prevent a concurrent gup_fast() from
> > doing the same -- touch a cleared PTE?
> >
> > AFAICT two threads, one doing overlapping mmap() and the other doing
> > gup_fast() can result in exactly this scenario.
> >
> > If we don't care about the GUP case, when I'm thinking we should not
> > care about the lockless RCU case either.
>
> Also, at this point we'll just fail to find a page, and that is nothing
> special. The problem with accessing an unmapped VMA is that the
> page-table walk will instantiate page-tables.
>
> Given this is an overlapping mmap -- we're going to need to those
> page-tables anyway, so no harm done.
>
> Only after the VMA is unlinked must we ensure we don't accidentally
> re-instantiate page-tables.

Got it. I'll need some time to digest all the input but I think I
understand more or less the overall direction. Thanks, Peter!
Liam R. Howlett Dec. 19, 2024, 5:16 p.m. UTC | #39
* Peter Zijlstra <peterz@infradead.org> [241219 06:20]:
> On Thu, Dec 19, 2024 at 10:13:34AM +0100, Peter Zijlstra wrote:
> > On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:
> > 
> > > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > > was doing.
> > > 
> > > Then I think vma_start_write() should remain inside
> > > vms_gather_munmap_vmas() and all vmas in mas_detach should be
> > 
> > No, it must not. You really are not modifying anything yet (except the
> > split, which we've already noted mark write themselves).
> > 
> > > write-locked, even the ones we are not modifying. Otherwise what would
> > > prevent the race I mentioned before?
> > > 
> > > __mmap_region
> > >     __mmap_prepare
> > >         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
> > >                                                       // some locked
> > > by __split_vma(), some not locked
> > > 
> > >                                      lock_vma_under_rcu()
> > >                                          vma = mas_walk // finds
> > > unlocked vma also in mas_detach
> > >                                          vma_start_read(vma) //
> > > succeeds since vma is not locked
> > >                                          // vma->detached, vm_start,
> > > vm_end checks pass
> > >                                      // vma is successfully read-locked
> > > 
> > >        vms_clean_up_area(mas_detach)
> > >             vms_clear_ptes
> > >                                      // steps on a cleared PTE
> > 
> > So here we have the added complexity that the vma is not unhooked at
> > all.

Well, hold on - it is taken out of the rmap/anon vma chain here.  It is
completely unhooked except the vma tree at this point.  We're not adding
complexity, we're dealing with it.


>Is there anything that would prevent a concurrent gup_fast() from
> > doing the same -- touch a cleared PTE?

Where does gup_fast() install PTEs?  Doesn't it bail once a READ_ONCE()
on any level returns no PTE?

> > 
> > AFAICT two threads, one doing overlapping mmap() and the other doing
> > gup_fast() can result in exactly this scenario.

The mmap() call will race with the gup_fast(), but either the nr_pinned
will be returned from gup_fast() before vms_clean_up_area() removes the
page table (or any higher level), or gup_fast() will find nothing.

> > 
> > If we don't care about the GUP case, when I'm thinking we should not
> > care about the lockless RCU case either.
> 
> Also, at this point we'll just fail to find a page, and that is nothing
> special. The problem with accessing an unmapped VMA is that the
> page-table walk will instantiate page-tables.

I think there is a problem if we are reinstalling page tables on a vma
that's about to be removed.  I think we are avoiding this with our
locking though?

> 
> Given this is an overlapping mmap -- we're going to need to those
> page-tables anyway, so no harm done.

Well, maybe?  The mapping may now be an anon vma vs a file backed, or
maybe it's PROT_NONE?

> Only after the VMA is unlinked must we ensure we don't accidentally
> re-instantiate page-tables.

It's not as simple as that, unfortunately.  There are vma callbacks for
drivers (or hugetlbfs, or whatever) that do other things.  So we need to
clean up the area before we are able to replace the vma and part of that
clean up is the page tables, or anon vma chain, and/or closing a file.

There are other ways of finding the vma as well, besides the vma tree.
We are following the locking so that we are safe from those perspectives
as well, and so the vma has to be unlinked in a few places in a certain
order.

Thanks,
Liam
Peter Zijlstra Dec. 19, 2024, 5:23 p.m. UTC | #40
On Thu, Dec 19, 2024 at 08:14:24AM -0800, Suren Baghdasaryan wrote:
> On Thu, Dec 19, 2024 at 1:13 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Dec 18, 2024 at 01:53:17PM -0800, Suren Baghdasaryan wrote:
> >
> > > Ah, ok I see now. I completely misunderstood what for_each_vma_range()
> > > was doing.
> > >
> > > Then I think vma_start_write() should remain inside
> > > vms_gather_munmap_vmas() and all vmas in mas_detach should be
> >
> > No, it must not. You really are not modifying anything yet (except the
> > split, which we've already noted mark write themselves).
> >
> > > write-locked, even the ones we are not modifying. Otherwise what would
> > > prevent the race I mentioned before?
> > >
> > > __mmap_region
> > >     __mmap_prepare
> > >         vms_gather_munmap_vmas // adds vmas to be unmapped into mas_detach,
> > >                                                       // some locked
> > > by __split_vma(), some not locked
> > >
> > >                                      lock_vma_under_rcu()
> > >                                          vma = mas_walk // finds
> > > unlocked vma also in mas_detach
> > >                                          vma_start_read(vma) //
> > > succeeds since vma is not locked
> > >                                          // vma->detached, vm_start,
> > > vm_end checks pass
> > >                                      // vma is successfully read-locked
> > >
> > >        vms_clean_up_area(mas_detach)
> > >             vms_clear_ptes
> > >                                      // steps on a cleared PTE
> >
> > So here we have the added complexity that the vma is not unhooked at
> > all. Is there anything that would prevent a concurrent gup_fast() from
> > doing the same -- touch a cleared PTE?
> >
> > AFAICT two threads, one doing overlapping mmap() and the other doing
> > gup_fast() can result in exactly this scenario.
> >
> > If we don't care about the GUP case, when I'm thinking we should not
> > care about the lockless RCU case either.
> >
> > >     __mmap_new_vma
> > >         vma_set_range // installs new vma in the range
> > >     __mmap_complete
> > >         vms_complete_munmap_vmas // vmas are write-locked and detached
> > > but it's too late
> >
> > But at this point that old vma really is unhooked, and the
> > vma_write_start() here will ensure readers are gone and it will clear
> > PTEs *again*.
> 
> So, to summarize, you want vma_start_write() and vma_mark_detached()
> to be done when we are removing the vma from the tree, right?

*after*

> Something like:
 
  vma_iter_store()
  vma_start_write()
  vma_mark_detached()

By having vma_start_write() after being unlinked you get the guarantee
of no concurrency. New lookups cannot find you (because of that
vma_iter_store()) and existing readers will be waited for.

> And the race I described is not a real problem since the vma is still
> in the tree, so gup_fast() does exactly that and will simply reinstall
> the ptes.

Just so.
Peter Zijlstra Dec. 19, 2024, 5:42 p.m. UTC | #41
On Thu, Dec 19, 2024 at 12:16:45PM -0500, Liam R. Howlett wrote:

> Well, hold on - it is taken out of the rmap/anon vma chain here.  It is
> completely unhooked except the vma tree at this point.  We're not adding
> complexity, we're dealing with it.

So I'm not entirely sure I understand the details here -- this is again
about being able to do rollback when things fail?

There is comment above the vms_clean_up_area() call in __mmap_prepare(),
but its not making sense atm.

> >Is there anything that would prevent a concurrent gup_fast() from
> > > doing the same -- touch a cleared PTE?
> 
> Where does gup_fast() install PTEs?  Doesn't it bail once a READ_ONCE()
> on any level returns no PTE?

I think you're right, GUP doesn't, but any 'normal' page-table walker
will.

> > > AFAICT two threads, one doing overlapping mmap() and the other doing
> > > gup_fast() can result in exactly this scenario.
> 
> The mmap() call will race with the gup_fast(), but either the nr_pinned
> will be returned from gup_fast() before vms_clean_up_area() removes the
> page table (or any higher level), or gup_fast() will find nothing.

Agreed.

> > > If we don't care about the GUP case, when I'm thinking we should not
> > > care about the lockless RCU case either.
> > 
> > Also, at this point we'll just fail to find a page, and that is nothing
> > special. The problem with accessing an unmapped VMA is that the
> > page-table walk will instantiate page-tables.
> 
> I think there is a problem if we are reinstalling page tables on a vma
> that's about to be removed.  I think we are avoiding this with our
> locking though?

So this is purely about the overlapping part, right? We need to remove
the old pages, install the new mapping and have new pages populate the
thing.

But either way around, the range stays valid and page-tables stay
needed.

> > Given this is an overlapping mmap -- we're going to need to those
> > page-tables anyway, so no harm done.
> 
> Well, maybe?  The mapping may now be an anon vma vs a file backed, or
> maybe it's PROT_NONE?

The page-tables don't care about all that no? The only thing where it
matters is for things like THP, because that affects the level of
page-tables, but otherwise it's all page-table content (ptes).

> > Only after the VMA is unlinked must we ensure we don't accidentally
> > re-instantiate page-tables.
> 
> It's not as simple as that, unfortunately.  There are vma callbacks for
> drivers (or hugetlbfs, or whatever) that do other things.  So we need to
> clean up the area before we are able to replace the vma and part of that
> clean up is the page tables, or anon vma chain, and/or closing a file.
> 
> There are other ways of finding the vma as well, besides the vma tree.
> We are following the locking so that we are safe from those perspectives
> as well, and so the vma has to be unlinked in a few places in a certain
> order.

For RCU lookups only the mas tree matters -- and its left present there.

If you really want to block RCU readers, I would suggest punching a hole
in the mm_mt. All the traditional code won't notice anyway, this is all
with mmap_lock held for writing.
Liam R. Howlett Dec. 19, 2024, 6:18 p.m. UTC | #42
* Peter Zijlstra <peterz@infradead.org> [241219 12:42]:
> On Thu, Dec 19, 2024 at 12:16:45PM -0500, Liam R. Howlett wrote:
> 
> > Well, hold on - it is taken out of the rmap/anon vma chain here.  It is
> > completely unhooked except the vma tree at this point.  We're not adding
> > complexity, we're dealing with it.
> 
> So I'm not entirely sure I understand the details here -- this is again
> about being able to do rollback when things fail?

no, it's so that things can be ready for the replacement that won't
cause issues during insertion.  We can't have two VMAs in the rmap for
the same area, for instance.

I would like to be able to undo things, but if there's a file that's
closed then we can't undo that.

> 
> There is comment above the vms_clean_up_area() call in __mmap_prepare(),
> but its not making sense atm.
> 
> > >Is there anything that would prevent a concurrent gup_fast() from
> > > > doing the same -- touch a cleared PTE?
> > 
> > Where does gup_fast() install PTEs?  Doesn't it bail once a READ_ONCE()
> > on any level returns no PTE?
> 
> I think you're right, GUP doesn't, but any 'normal' page-table walker
> will.

Normal page-table walkers will be locked out by the page table lock
while the area is cleared, it cannot be re-walked after that because of
the vma lock (except rcu walkers, which is why these new locks are
needed).  Any direct page table walking won't be a problem because we've
removed any way to get to it - I think?

> 
> > > > AFAICT two threads, one doing overlapping mmap() and the other doing
> > > > gup_fast() can result in exactly this scenario.
> > 
> > The mmap() call will race with the gup_fast(), but either the nr_pinned
> > will be returned from gup_fast() before vms_clean_up_area() removes the
> > page table (or any higher level), or gup_fast() will find nothing.
> 
> Agreed.
> 
> > > > If we don't care about the GUP case, when I'm thinking we should not
> > > > care about the lockless RCU case either.
> > > 
> > > Also, at this point we'll just fail to find a page, and that is nothing
> > > special. The problem with accessing an unmapped VMA is that the
> > > page-table walk will instantiate page-tables.
> > 
> > I think there is a problem if we are reinstalling page tables on a vma
> > that's about to be removed.  I think we are avoiding this with our
> > locking though?
> 
> So this is purely about the overlapping part, right? We need to remove
> the old pages, install the new mapping and have new pages populate the
> thing.

No.  If we allow rcu readers to re-fault, we may hit a race where the
page fault has found the vma, but doesn't fault things in before the
ptes are removed or the vma is freed/reused.  Today that won't happen
because we mark the vma as going to be removed before the tree is
changed, so anyone reaching the vma will see it's not safe.

If we want to use different locking strategies for munmap() vs
MAP_FIXED, then we'd need to be sure the vma that is being freed is
isolated for removal and all readers are done before freeing.  I think
this is what you are trying to convey to Suren?

I don't like the idea of another locking strategy in munmap() vs
MAP_FIXED, especially if you think about who would be doing this..
basically no one.  There isn't a sane (legitimate) application that's
going to try and page fault in something that's being removed.

> 
> But either way around, the range stays valid and page-tables stay
> needed.
> 
> > > Given this is an overlapping mmap -- we're going to need to those
> > > page-tables anyway, so no harm done.
> > 
> > Well, maybe?  The mapping may now be an anon vma vs a file backed, or
> > maybe it's PROT_NONE?
> 
> The page-tables don't care about all that no? The only thing where it
> matters is for things like THP, because that affects the level of
> page-tables, but otherwise it's all page-table content (ptes).

It sounds racy, couldn't we read the old page table entry attributes and
act on it after the new attributes are set?

During the switch, after we drop the page table lock but haven't yet
inserted the new vma, then we'd run into a situation where the new
mapping may already be occupied if we don't have some sort of locking
here.  Wouldn't that be an issue as well?  It seems like there are a
number of things that could go bad?

> 
> > > Only after the VMA is unlinked must we ensure we don't accidentally
> > > re-instantiate page-tables.
> > 
> > It's not as simple as that, unfortunately.  There are vma callbacks for
> > drivers (or hugetlbfs, or whatever) that do other things.  So we need to
> > clean up the area before we are able to replace the vma and part of that
> > clean up is the page tables, or anon vma chain, and/or closing a file.
> > 
> > There are other ways of finding the vma as well, besides the vma tree.
> > We are following the locking so that we are safe from those perspectives
> > as well, and so the vma has to be unlinked in a few places in a certain
> > order.
> 
> For RCU lookups only the mas tree matters -- and its left present there.
> 
> If you really want to block RCU readers, I would suggest punching a hole
> in the mm_mt. All the traditional code won't notice anyway, this is all
> with mmap_lock held for writing.

We don't want to block all rcu readers, we want to block the rcu readers
that would see the problem - that is, anyone trying to read a particular
area.

Right now we can page fault in unpopulated vmas while writing other vmas
to the tree.  We are also moving more users to rcu reading to use the
vmas they need without waiting on writes to finish.

Maybe I don't understand your suggestion, but I would think punching a
hole would lose this advantage?

Thanks,
Liam
Peter Zijlstra Dec. 19, 2024, 6:46 p.m. UTC | #43
On Thu, Dec 19, 2024 at 01:18:23PM -0500, Liam R. Howlett wrote:

> > For RCU lookups only the mas tree matters -- and its left present there.
> > 
> > If you really want to block RCU readers, I would suggest punching a hole
> > in the mm_mt. All the traditional code won't notice anyway, this is all
> > with mmap_lock held for writing.
> 
> We don't want to block all rcu readers, we want to block the rcu readers
> that would see the problem - that is, anyone trying to read a particular
> area.
> 
> Right now we can page fault in unpopulated vmas while writing other vmas
> to the tree.  We are also moving more users to rcu reading to use the
> vmas they need without waiting on writes to finish.
> 
> Maybe I don't understand your suggestion, but I would think punching a
> hole would lose this advantage?

My suggestion was to remove the range stuck in mas_detach from mm_mt.
That is exactly the affected range, no?
Liam R. Howlett Dec. 19, 2024, 6:55 p.m. UTC | #44
* Peter Zijlstra <peterz@infradead.org> [241219 13:47]:
> On Thu, Dec 19, 2024 at 01:18:23PM -0500, Liam R. Howlett wrote:
> 
> > > For RCU lookups only the mas tree matters -- and its left present there.
> > > 
> > > If you really want to block RCU readers, I would suggest punching a hole
> > > in the mm_mt. All the traditional code won't notice anyway, this is all
> > > with mmap_lock held for writing.
> > 
> > We don't want to block all rcu readers, we want to block the rcu readers
> > that would see the problem - that is, anyone trying to read a particular
> > area.
> > 
> > Right now we can page fault in unpopulated vmas while writing other vmas
> > to the tree.  We are also moving more users to rcu reading to use the
> > vmas they need without waiting on writes to finish.
> > 
> > Maybe I don't understand your suggestion, but I would think punching a
> > hole would lose this advantage?
> 
> My suggestion was to remove the range stuck in mas_detach from mm_mt.
> That is exactly the affected range, no?

Yes.

But then looping over the vmas will show a gap where there should not be
a gap.

If we stop rcu readers entirely we lose the advantage.

This is exactly the issue that the locking dance was working around :)
Suren Baghdasaryan Dec. 20, 2024, 3:22 p.m. UTC | #45
On Thu, Dec 19, 2024 at 10:55 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Peter Zijlstra <peterz@infradead.org> [241219 13:47]:
> > On Thu, Dec 19, 2024 at 01:18:23PM -0500, Liam R. Howlett wrote:
> >
> > > > For RCU lookups only the mas tree matters -- and its left present there.
> > > >
> > > > If you really want to block RCU readers, I would suggest punching a hole
> > > > in the mm_mt. All the traditional code won't notice anyway, this is all
> > > > with mmap_lock held for writing.
> > >
> > > We don't want to block all rcu readers, we want to block the rcu readers
> > > that would see the problem - that is, anyone trying to read a particular
> > > area.
> > >
> > > Right now we can page fault in unpopulated vmas while writing other vmas
> > > to the tree.  We are also moving more users to rcu reading to use the
> > > vmas they need without waiting on writes to finish.
> > >
> > > Maybe I don't understand your suggestion, but I would think punching a
> > > hole would lose this advantage?
> >
> > My suggestion was to remove the range stuck in mas_detach from mm_mt.
> > That is exactly the affected range, no?
>
> Yes.
>
> But then looping over the vmas will show a gap where there should not be
> a gap.
>
> If we stop rcu readers entirely we lose the advantage.
>
> This is exactly the issue that the locking dance was working around :)

Sorry for not participating in the discussion, folks. I'm down with a
terrible flu and my brain is not working well. I'll catch up once I
get better.

>
Suren Baghdasaryan Dec. 23, 2024, 3:03 a.m. UTC | #46
On Thu, Dec 19, 2024 at 10:55 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Peter Zijlstra <peterz@infradead.org> [241219 13:47]:
> > On Thu, Dec 19, 2024 at 01:18:23PM -0500, Liam R. Howlett wrote:
> >
> > > > For RCU lookups only the mas tree matters -- and its left present there.
> > > >
> > > > If you really want to block RCU readers, I would suggest punching a hole
> > > > in the mm_mt. All the traditional code won't notice anyway, this is all
> > > > with mmap_lock held for writing.
> > >
> > > We don't want to block all rcu readers, we want to block the rcu readers
> > > that would see the problem - that is, anyone trying to read a particular
> > > area.
> > >
> > > Right now we can page fault in unpopulated vmas while writing other vmas
> > > to the tree.  We are also moving more users to rcu reading to use the
> > > vmas they need without waiting on writes to finish.
> > >
> > > Maybe I don't understand your suggestion, but I would think punching a
> > > hole would lose this advantage?
> >
> > My suggestion was to remove the range stuck in mas_detach from mm_mt.
> > That is exactly the affected range, no?
>
> Yes.
>
> But then looping over the vmas will show a gap where there should not be
> a gap.
>
> If we stop rcu readers entirely we lose the advantage.
>
> This is exactly the issue that the locking dance was working around :)

IOW we write-lock the entire range before removing any part of it for
the whole transaction to be atomic, correct?


Peter, you suggested the following pattern for ensuring vma is
detached with no possible readers:

vma_iter_store()
vma_start_write()
vma_mark_detached()

What do you think about this alternative?

vma_start_write()
...
vma_iter_store()
vma_mark_detached()
        vma_assert_write_locked(vma)
        if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt)))
                vma_start_write()

The second vma_start_write() is unlikely to be executed because the
vma is locked, vm_refcnt might be increased only temporarily by
readers before they realize the vma is locked and that's a very narrow
window. I think performance should not visibly suffer?
OTOH this would let us keep current locking patterns and would
guarantee that vma_mark_detached() always exits with a detached and
unused vma (less possibilities for someone not following an exact
pattern and ending up with a detached but still used vma).

>
Suren Baghdasaryan Dec. 26, 2024, 5:12 p.m. UTC | #47
On Sun, Dec 22, 2024 at 7:03 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Dec 19, 2024 at 10:55 AM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Peter Zijlstra <peterz@infradead.org> [241219 13:47]:
> > > On Thu, Dec 19, 2024 at 01:18:23PM -0500, Liam R. Howlett wrote:
> > >
> > > > > For RCU lookups only the mas tree matters -- and its left present there.
> > > > >
> > > > > If you really want to block RCU readers, I would suggest punching a hole
> > > > > in the mm_mt. All the traditional code won't notice anyway, this is all
> > > > > with mmap_lock held for writing.
> > > >
> > > > We don't want to block all rcu readers, we want to block the rcu readers
> > > > that would see the problem - that is, anyone trying to read a particular
> > > > area.
> > > >
> > > > Right now we can page fault in unpopulated vmas while writing other vmas
> > > > to the tree.  We are also moving more users to rcu reading to use the
> > > > vmas they need without waiting on writes to finish.
> > > >
> > > > Maybe I don't understand your suggestion, but I would think punching a
> > > > hole would lose this advantage?
> > >
> > > My suggestion was to remove the range stuck in mas_detach from mm_mt.
> > > That is exactly the affected range, no?
> >
> > Yes.
> >
> > But then looping over the vmas will show a gap where there should not be
> > a gap.
> >
> > If we stop rcu readers entirely we lose the advantage.
> >
> > This is exactly the issue that the locking dance was working around :)
>
> IOW we write-lock the entire range before removing any part of it for
> the whole transaction to be atomic, correct?
>
>
> Peter, you suggested the following pattern for ensuring vma is
> detached with no possible readers:
>
> vma_iter_store()
> vma_start_write()
> vma_mark_detached()
>
> What do you think about this alternative?
>
> vma_start_write()
> ...
> vma_iter_store()
> vma_mark_detached()
>         vma_assert_write_locked(vma)
>         if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt)))
>                 vma_start_write()
>
> The second vma_start_write() is unlikely to be executed because the
> vma is locked, vm_refcnt might be increased only temporarily by
> readers before they realize the vma is locked and that's a very narrow
> window. I think performance should not visibly suffer?
> OTOH this would let us keep current locking patterns and would
> guarantee that vma_mark_detached() always exits with a detached and
> unused vma (less possibilities for someone not following an exact
> pattern and ending up with a detached but still used vma).

I posted v7 of this patchset at
https://lore.kernel.org/all/20241226170710.1159679-1-surenb@google.com/
From the things we discussed, I didn't include the following:
- Changing vma locking patterns
- Changing do_vmi_align_munmap() to avoid reattach_vmas()
It seems we need more discussion for the first one and the second one
can be done completely independent from this patchset. I feel this
patchset is already quite large, so trying to keep its size
manageable.
Thanks,
Suren.

>
> >
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ccb8f2afeca8..d9edabc385b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -32,6 +32,7 @@ 
 #include <linux/memremap.h>
 #include <linux/slab.h>
 #include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -699,10 +700,27 @@  static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 #ifdef CONFIG_PER_VMA_LOCK
 static inline void vma_lock_init(struct vm_area_struct *vma)
 {
-	init_rwsem(&vma->vm_lock.lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	static struct lock_class_key lockdep_key;
+
+	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+	refcount_set(&vma->vm_refcnt, VMA_STATE_DETACHED);
 	vma->vm_lock_seq = UINT_MAX;
 }
 
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+	int refcnt;
+
+	if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+
+		if (refcnt & VMA_STATE_LOCKED)
+			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
+	}
+}
+
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -710,6 +728,8 @@  static inline void vma_lock_init(struct vm_area_struct *vma)
  */
 static inline bool vma_start_read(struct vm_area_struct *vma)
 {
+	int oldcnt;
+
 	/*
 	 * Check before locking. A race might cause false locked result.
 	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -720,13 +740,20 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
+
+	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+	/* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
+	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
+						      VMA_STATE_LOCKED - 2))) {
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
 		return false;
+	}
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 
 	/*
-	 * Overflow might produce false locked result.
+	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 *
 	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
@@ -734,10 +761,12 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * after it has been unlocked.
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
-	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->vm_lock.lock);
+	if (oldcnt & VMA_STATE_LOCKED ||
+	    unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
+		vma_refcount_put(vma);
 		return false;
 	}
+
 	return true;
 }
 
@@ -749,8 +778,17 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
  */
 static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
 {
+	int oldcnt;
+
 	mmap_assert_locked(vma->vm_mm);
-	down_read_nested(&vma->vm_lock.lock, subclass);
+	rwsem_acquire_read(&vma->vmlock_dep_map, subclass, 0, _RET_IP_);
+	/* Limit at VMA_STATE_LOCKED - 2 to leave one count for a writer */
+	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
+						      VMA_STATE_LOCKED - 2))) {
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+		return false;
+	}
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 	return true;
 }
 
@@ -762,15 +800,13 @@  static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
  */
 static inline bool vma_start_read_locked(struct vm_area_struct *vma)
 {
-	mmap_assert_locked(vma->vm_mm);
-	down_read(&vma->vm_lock.lock);
-	return true;
+	return vma_start_read_locked_nested(vma, 0);
 }
 
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->vm_lock.lock);
+	vma_refcount_put(vma);
 	rcu_read_unlock();
 }
 
@@ -813,25 +849,42 @@  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 
 static inline void vma_assert_locked(struct vm_area_struct *vma)
 {
-	if (!rwsem_is_locked(&vma->vm_lock.lock))
+	if (refcount_read(&vma->vm_refcnt) <= VMA_STATE_ATTACHED)
 		vma_assert_write_locked(vma);
 }
 
-static inline void vma_mark_attached(struct vm_area_struct *vma)
+/*
+ * WARNING: to avoid racing with vma_mark_attached(), should be called either
+ * under mmap_write_lock or when the object has been isolated under
+ * mmap_write_lock, ensuring no competing writers.
+ */
+static inline bool is_vma_detached(struct vm_area_struct *vma)
 {
-	vma->detached = false;
+	return refcount_read(&vma->vm_refcnt) == VMA_STATE_DETACHED;
 }
 
-static inline void vma_mark_detached(struct vm_area_struct *vma)
+static inline void vma_mark_attached(struct vm_area_struct *vma)
 {
-	/* When detaching vma should be write-locked */
 	vma_assert_write_locked(vma);
-	vma->detached = true;
+
+	if (is_vma_detached(vma))
+		refcount_set(&vma->vm_refcnt, VMA_STATE_ATTACHED);
 }
 
-static inline bool is_vma_detached(struct vm_area_struct *vma)
+static inline void vma_mark_detached(struct vm_area_struct *vma)
 {
-	return vma->detached;
+	vma_assert_write_locked(vma);
+
+	if (is_vma_detached(vma))
+		return;
+
+	/* We are the only writer, so no need to use vma_refcount_put(). */
+	if (!refcount_dec_and_test(&vma->vm_refcnt)) {
+		/*
+		 * Reader must have temporarily raised vm_refcnt but it will
+		 * drop it without using the vma since vma is write-locked.
+		 */
+	}
 }
 
 static inline void release_fault_lock(struct vm_fault *vmf)
@@ -896,10 +949,6 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
-	/* vma is not locked, can't use vma_mark_detached() */
-	vma->detached = true;
-#endif
 	vma_numab_state_init(vma);
 	vma_lock_init(vma);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 825f6328f9e5..803f718c007c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -19,6 +19,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
 #include <linux/percpu_counter.h>
+#include <linux/types.h>
 
 #include <asm/mmu.h>
 
@@ -599,9 +600,9 @@  static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
 }
 #endif
 
-struct vma_lock {
-	struct rw_semaphore lock;
-};
+#define VMA_STATE_DETACHED	0x0
+#define VMA_STATE_ATTACHED	0x1
+#define VMA_STATE_LOCKED	0x40000000
 
 struct vma_numab_state {
 	/*
@@ -679,19 +680,13 @@  struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
-	/*
-	 * Flag to indicate areas detached from the mm->mm_mt tree.
-	 * Unstable RCU readers are allowed to read this.
-	 */
-	bool detached;
-
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
-	 *  - vm_lock->lock (in write mode)
+	 *  - vm_refcnt VMA_STATE_LOCKED is set
 	 * Can be read reliably while holding one of:
 	 *  - mmap_lock (in read or write mode)
-	 *  - vm_lock->lock (in read or write mode)
+	 *  - vm_refcnt VMA_STATE_LOCKED is set or vm_refcnt > VMA_STATE_ATTACHED
 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
 	 * while holding nothing (except RCU to keep the VMA struct allocated).
 	 *
@@ -754,7 +749,10 @@  struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_PER_VMA_LOCK
 	/* Unstable RCU readers are allowed to read this. */
-	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
+	refcount_t vm_refcnt ____cacheline_aligned_in_smp;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map vmlock_dep_map;
+#endif
 #endif
 } __randomize_layout;
 
@@ -889,6 +887,7 @@  struct mm_struct {
 					  * by mmlist_lock
 					  */
 #ifdef CONFIG_PER_VMA_LOCK
+		struct rcuwait vma_writer_wait;
 		/*
 		 * This field has lock-like semantics, meaning it is sometimes
 		 * accessed with ACQUIRE/RELEASE semantics.
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cb19c23e892..283909d082cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -465,10 +465,6 @@  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	data_race(memcpy(new, orig, sizeof(*new)));
 	vma_lock_init(new);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
-	/* vma is not locked, can't use vma_mark_detached() */
-	new->detached = true;
-#endif
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
@@ -488,8 +484,6 @@  static void vm_area_free_rcu_cb(struct rcu_head *head)
 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
 						  vm_rcu);
 
-	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -1228,6 +1222,9 @@  static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
 	mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+	rcuwait_init(&mm->vma_writer_wait);
+#endif
 }
 
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 6af3ad675930..4600e7605cab 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,6 +40,7 @@  struct mm_struct init_mm = {
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 #ifdef CONFIG_PER_VMA_LOCK
+	.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
 	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
 #endif
 	.user_ns	= &init_user_ns,
diff --git a/mm/memory.c b/mm/memory.c
index c6356ea703d8..cff132003e24 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6331,7 +6331,25 @@  struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
 #ifdef CONFIG_PER_VMA_LOCK
 void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 {
-	down_write(&vma->vm_lock.lock);
+	bool detached;
+
+	/*
+	 * If vma is detached then only vma_mark_attached() can raise the
+	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+	 */
+	if (!refcount_inc_not_zero(&vma->vm_refcnt)) {
+		WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+		return;
+	}
+
+	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+	/* vma is attached, set the writer present bit */
+	refcount_add(VMA_STATE_LOCKED, &vma->vm_refcnt);
+	/* wait until state is VMA_STATE_ATTACHED + (VMA_STATE_LOCKED + 1) */
+	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+		   refcount_read(&vma->vm_refcnt) == VMA_STATE_ATTACHED + (VMA_STATE_LOCKED + 1),
+		   TASK_UNINTERRUPTIBLE);
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
 	 * from the early lockless pessimistic check in vma_start_read().
@@ -6339,7 +6357,10 @@  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-	up_write(&vma->vm_lock.lock);
+	detached = refcount_sub_and_test(VMA_STATE_LOCKED + 1,
+					 &vma->vm_refcnt);
+	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+	VM_BUG_ON_VMA(detached, vma); /* vma should remain attached */
 }
 EXPORT_SYMBOL_GPL(__vma_start_write);
 
@@ -6355,7 +6376,6 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	struct vm_area_struct *vma;
 
 	rcu_read_lock();
-retry:
 	vma = mas_walk(&mas);
 	if (!vma)
 		goto inval;
@@ -6363,13 +6383,6 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	if (!vma_start_read(vma))
 		goto inval;
 
-	/* Check if the VMA got isolated after we found it */
-	if (is_vma_detached(vma)) {
-		vma_end_read(vma);
-		count_vm_vma_lock_event(VMA_LOCK_MISS);
-		/* The area was replaced with another one */
-		goto retry;
-	}
 	/*
 	 * At this point, we have a stable reference to a VMA: The VMA is
 	 * locked and we know it hasn't already been isolated.
diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h
index e01f66f98982..2e2021553196 100644
--- a/tools/testing/vma/linux/atomic.h
+++ b/tools/testing/vma/linux/atomic.h
@@ -9,4 +9,9 @@ 
 #define atomic_set(x, y) do {} while (0)
 #define U8_MAX UCHAR_MAX
 
+#ifndef atomic_cmpxchg_relaxed
+#define  atomic_cmpxchg_relaxed		uatomic_cmpxchg
+#define  atomic_cmpxchg_release         uatomic_cmpxchg
+#endif /* atomic_cmpxchg_relaxed */
+
 #endif	/* _LINUX_ATOMIC_H */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 0cdc5f8c3d60..b55556b16060 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -25,7 +25,7 @@ 
 #include <linux/maple_tree.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
-#include <linux/rwsem.h>
+#include <linux/refcount.h>
 
 extern unsigned long stack_guard_gap;
 #ifdef CONFIG_MMU
@@ -132,10 +132,6 @@  typedef __bitwise unsigned int vm_fault_t;
  */
 #define pr_warn_once pr_err
 
-typedef struct refcount_struct {
-	atomic_t refs;
-} refcount_t;
-
 struct kref {
 	refcount_t refcount;
 };
@@ -228,15 +224,14 @@  struct mm_struct {
 	unsigned long def_flags;
 };
 
-struct vma_lock {
-	struct rw_semaphore lock;
-};
-
-
 struct file {
 	struct address_space	*f_mapping;
 };
 
+#define VMA_STATE_DETACHED	0x0
+#define VMA_STATE_ATTACHED	0x1
+#define VMA_STATE_LOCKED	0x40000000
+
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
 
@@ -264,16 +259,13 @@  struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
-	/* Flag to indicate areas detached from the mm->mm_mt tree */
-	bool detached;
-
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
-	 *  - vm_lock.lock (in write mode)
+	 *  - vm_refcnt VMA_STATE_LOCKED is set
 	 * Can be read reliably while holding one of:
 	 *  - mmap_lock (in read or write mode)
-	 *  - vm_lock.lock (in read or write mode)
+	 *  - vm_refcnt VMA_STATE_LOCKED is set or vm_refcnt > VMA_STATE_ATTACHED
 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
 	 * while holding nothing (except RCU to keep the VMA struct allocated).
 	 *
@@ -282,7 +274,6 @@  struct vm_area_struct {
 	 * slowpath.
 	 */
 	unsigned int vm_lock_seq;
-	struct vma_lock vm_lock;
 #endif
 
 	/*
@@ -335,6 +326,10 @@  struct vm_area_struct {
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Unstable RCU readers are allowed to read this. */
+	refcount_t vm_refcnt;
+#endif
 } __randomize_layout;
 
 struct vm_fault {};
@@ -461,21 +456,37 @@  static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
 
 static inline void vma_lock_init(struct vm_area_struct *vma)
 {
-	init_rwsem(&vma->vm_lock.lock);
+	refcount_set(&vma->vm_refcnt, VMA_STATE_DETACHED);
 	vma->vm_lock_seq = UINT_MAX;
 }
 
-static inline void vma_mark_attached(struct vm_area_struct *vma)
+static inline bool is_vma_detached(struct vm_area_struct *vma)
 {
-	vma->detached = false;
+	return refcount_read(&vma->vm_refcnt) == VMA_STATE_DETACHED;
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *);
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+
+	if (is_vma_detached(vma))
+		refcount_set(&vma->vm_refcnt, VMA_STATE_ATTACHED);
+}
+
 static inline void vma_mark_detached(struct vm_area_struct *vma)
 {
-	/* When detaching vma should be write-locked */
 	vma_assert_write_locked(vma);
-	vma->detached = true;
+
+	if (is_vma_detached(vma))
+		return;
+
+	if (!refcount_dec_and_test(&vma->vm_refcnt)) {
+		/*
+		 * Reader must have temporarily raised vm_refcnt but it will
+		 * drop it without using the vma since vma is write-locked.
+		 */
+	}
 }
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
@@ -488,8 +499,6 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	/* vma is not locked, can't use vma_mark_detached() */
-	vma->detached = true;
 	vma_lock_init(vma);
 }
 
@@ -515,8 +524,6 @@  static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	memcpy(new, orig, sizeof(*new));
 	vma_lock_init(new);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
-	/* vma is not locked, can't use vma_mark_detached() */
-	new->detached = true;
 
 	return new;
 }