[RFC] vm: align vma allocation and move the lock back into the struct

Message ID: 20240808185949.1094891-1-mjguzik@gmail.com (mailing list archive)
State: New
Series: [RFC] vm: align vma allocation and move the lock back into the struct

Commit Message

Mateusz Guzik Aug. 8, 2024, 6:59 p.m. UTC
ACHTUNG: this is more of a request for benchmarking than a patch
proposal at this stage

I was pointed at your patch which moved the vma lock to a separate
allocation [1]. The commit message does not say anything about making
sure the object itself is allocated with proper alignment and I found
that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
not be the problem.
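
For reference, this is how both caches are created in the stock kernel
(the lines are visible in the kernel/fork.c hunk below; quoted here just
for context):

	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);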

I verified with a simple one-liner that on a stock kernel the vmas keep
roaming around with a 16-byte alignment:
# bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
@[16]: 39
@[0]: 46
@[32]: 53
@[48]: 56

Note the stock vma lock cache also lacks the alignment flag. While I
have not verified it experimentally, if the locks are also roaming it
would mean that 2 unrelated vmas can false-share locks. If the patch
below is a bust, the flag should probably be added to that cache.
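
By that I mean roughly adding the flag to the lock cache creation, i.e.
something like (untested):

	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);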

The patch has a slapped-around removal of the vma lock cache + HWCACHE_ALIGN
for the vma cache. I left a dummy pointer in place so as not to change the
relative offsets of the current fields. It does compile without
CONFIG_PER_VMA_LOCK.

Vlastimil says you tested a case where the struct got bloated to 256
bytes, but the lock remained separate. It is unclear to me if this
happened with allocations made with the HWCACHE_ALIGN flag though.

There is 0 urgency on my end, but it would be nice if you could try
this out with your test rig.

1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u

---
 include/linux/mm.h       | 18 +++++++--------
 include/linux/mm_types.h | 10 ++++-----
 kernel/fork.c            | 47 ++++------------------------------------
 mm/userfaultfd.c         |  6 ++---
 4 files changed, 19 insertions(+), 62 deletions(-)

Comments

Suren Baghdasaryan Aug. 8, 2024, 7:38 p.m. UTC | #1
On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> ACHTUNG: this is more of a request for benchmarking than a patch
> proposal at this stage
>
> I was pointed at your patch which moved the vma lock to a separate
> allocation [1]. The commit message does not say anything about making
> sure the object itself is allocated with proper alignment and I found
> that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> not be the problem.
>
> I verified with a simple one-liner that on a stock kernel the vmas keep
> roaming around with a 16-byte alignment:
> # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> @[16]: 39
> @[0]: 46
> @[32]: 53
> @[48]: 56
>
> Note the stock vma lock cache also lacks the alignment flag. While I
> have not verified experimentally, if they are also roaming it would mean
> that 2 unrelated vmas can false-share locks. If the patch below is a
> bust, the flag should probably be added to that one.
>
> The patch has slapped-around vma lock cache removal + HWCACHE_ALIGN for the
> vma cache. I left a pointer to not change relative offsets between
> current fields. It does compile without CONFIG_PER_VMA_LOCK.
>
> Vlastimil says you tested a case where the struct got bloated to 256
> bytes, but the lock remained separate. It is unclear to me if this
> happened with allocations made with the HWCACHE_ALIGN flag though.
>
> There is 0 urgency on my end, but it would be nice if you could try
> this out with your test rig.

Hi Mateusz,
Sure, I'll give it a spin, but I'm not optimistic. Your code looks
almost identical to my latest attempt, where I tried placing vm_lock
into different cachelines, including a separate one, and using
HWCACHE_ALIGN. And yet all my attempts showed a regression.
Just FYI, the test I'm using is the pft-threads test from the mmtests
suite. I'll post results this evening.
Thanks,
Suren.

>
> 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
>
> ---
>  include/linux/mm.h       | 18 +++++++--------
>  include/linux/mm_types.h | 10 ++++-----
>  kernel/fork.c            | 47 ++++------------------------------------
>  mm/userfaultfd.c         |  6 ++---
>  4 files changed, 19 insertions(+), 62 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 43b40334e9b2..6d8b668d3deb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
>                 return false;
>
> -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
>                 return false;
>
>         /*
> @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>          * This pairs with RELEASE semantics in vma_end_write_all().
>          */
>         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> -               up_read(&vma->vm_lock->lock);
> +               up_read(&vma->vm_lock);
>                 return false;
>         }
>         return true;
> @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  static inline void vma_end_read(struct vm_area_struct *vma)
>  {
>         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> -       up_read(&vma->vm_lock->lock);
> +       up_read(&vma->vm_lock);
>         rcu_read_unlock();
>  }
>
> @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>         if (__is_vma_write_locked(vma, &mm_lock_seq))
>                 return;
>
> -       down_write(&vma->vm_lock->lock);
> +       down_write(&vma->vm_lock);
>         /*
>          * We should use WRITE_ONCE() here because we can have concurrent reads
>          * from the early lockless pessimistic check in vma_start_read().
> @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>          */
>         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> -       up_write(&vma->vm_lock->lock);
> +       up_write(&vma->vm_lock);
>  }
>
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>
>  static inline void vma_assert_locked(struct vm_area_struct *vma)
>  {
> -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> +       if (!rwsem_is_locked(&vma->vm_lock))
>                 vma_assert_write_locked(vma);
>  }
>
> @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>
>  extern const struct vm_operations_struct vma_dummy_vm_ops;
>
> -/*
> - * WARNING: vma_init does not initialize vma->vm_lock.
> - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> - */
>  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
>         memset(vma, 0, sizeof(*vma));
> @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>         INIT_LIST_HEAD(&vma->anon_vma_chain);
>         vma_mark_detached(vma, false);
>         vma_numab_state_init(vma);
> +       init_rwsem(&vma->vm_lock);
> +       vma->vm_lock_seq = -1;
>  }
>
>  /* Use when VMA is not part of the VMA tree and needs no locking */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 003619fab20e..caffdb4eeb94 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
>  }
>  #endif
>
> -struct vma_lock {
> -       struct rw_semaphore lock;
> -};
> -
>  struct vma_numab_state {
>         /*
>          * Initialised as time in 'jiffies' after which VMA
> @@ -716,8 +712,7 @@ struct vm_area_struct {
>          * slowpath.
>          */
>         int vm_lock_seq;
> -       /* Unstable RCU readers are allowed to read this. */
> -       struct vma_lock *vm_lock;
> +       void *vm_dummy;
>  #endif
>
>         /*
> @@ -770,6 +765,9 @@ struct vm_area_struct {
>         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
>  #endif
>         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> +#endif
>  } __randomize_layout;
>
>  #ifdef CONFIG_NUMA
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 92bfe56c9fed..eab04a24d5f1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>
> -#ifdef CONFIG_PER_VMA_LOCK
> -
> -/* SLAB cache for vm_area_struct.lock */
> -static struct kmem_cache *vma_lock_cachep;
> -
> -static bool vma_lock_alloc(struct vm_area_struct *vma)
> -{
> -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> -       if (!vma->vm_lock)
> -               return false;
> -
> -       init_rwsem(&vma->vm_lock->lock);
> -       vma->vm_lock_seq = -1;
> -
> -       return true;
> -}
> -
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> -}
> -
> -#else /* CONFIG_PER_VMA_LOCK */
> -
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> -
> -#endif /* CONFIG_PER_VMA_LOCK */
> -
>  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  {
>         struct vm_area_struct *vma;
> @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>                 return NULL;
>
>         vma_init(vma, mm);
> -       if (!vma_lock_alloc(vma)) {
> -               kmem_cache_free(vm_area_cachep, vma);
> -               return NULL;
> -       }
>
>         return vma;
>  }
> @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>          * will be reinitialized.
>          */
>         data_race(memcpy(new, orig, sizeof(*new)));
> -       if (!vma_lock_alloc(new)) {
> -               kmem_cache_free(vm_area_cachep, new);
> -               return NULL;
> -       }
> +       init_rwsem(&new->vm_lock);
> +       new->vm_lock_seq = -1;
>         INIT_LIST_HEAD(&new->anon_vma_chain);
>         vma_numab_state_init(new);
>         dup_anon_vma_name(orig, new);
> @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
>  {
>         vma_numab_state_free(vma);
>         free_anon_vma_name(vma);
> -       vma_lock_free(vma);
>         kmem_cache_free(vm_area_cachep, vma);
>  }
>
> @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
>                                                   vm_rcu);
>
>         /* The vma should not be locked while being destroyed. */
> -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
>         __vm_area_free(vma);
>  }
>  #endif
> @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
>                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
>                         NULL);
>
> -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> -#ifdef CONFIG_PER_VMA_LOCK
> -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> -#endif
> +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
>         mmap_init();
>         nsproxy_cache_init();
>  }
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 3b7715ecf292..e95ecb2063d2 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
>                  * mmap_lock, which guarantees that nobody can lock the
>                  * vma for write (vma_start_write()) under us.
>                  */
> -               down_read(&vma->vm_lock->lock);
> +               down_read(&vma->vm_lock);
>         }
>
>         mmap_read_unlock(mm);
> @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
>                  * See comment in uffd_lock_vma() as to why not using
>                  * vma_start_read() here.
>                  */
> -               down_read(&(*dst_vmap)->vm_lock->lock);
> +               down_read(&(*dst_vmap)->vm_lock);
>                 if (*dst_vmap != *src_vmap)
> -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> +                       down_read_nested(&(*src_vmap)->vm_lock,
>                                          SINGLE_DEPTH_NESTING);
>         }
>         mmap_read_unlock(mm);
> --
> 2.43.0
>
Mateusz Guzik Aug. 8, 2024, 8:03 p.m. UTC | #2
On Thu, Aug 8, 2024 at 9:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > ACHTUNG: this is more of a request for benchmarking than a patch
> > proposal at this stage
> >
> > I was pointed at your patch which moved the vma lock to a separate
> > allocation [1]. The commit message does not say anything about making
> > sure the object itself is allocated with proper alignment and I found
> > that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> > not be the problem.
> >
> > I verified with a simple one-liner that on a stock kernel the vmas keep
> > roaming around with a 16-byte alignment:
> > # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> > @[16]: 39
> > @[0]: 46
> > @[32]: 53
> > @[48]: 56
> >
> > Note the stock vma lock cache also lacks the alignment flag. While I
> > have not verified experimentally, if they are also roaming it would mean
> > that 2 unrelated vmas can false-share locks. If the patch below is a
> > bust, the flag should probably be added to that one.
> >
> > The patch has slapped-around vma lock cache removal + HWCACHE_ALIGN for the
> > vma cache. I left a pointer to not change relative offsets between
> > current fields. It does compile without CONFIG_PER_VMA_LOCK.
> >
> > Vlastimil says you tested a case where the struct got bloated to 256
> > bytes, but the lock remained separate. It is unclear to me if this
> > happened with allocations made with the HWCACHE_ALIGN flag though.
> >
> > There is 0 urgency on my end, but it would be nice if you could try
> > this out with your test rig.
>
> Hi Mateusz,
> Sure, I'll give it a spin but I'm not optimistic. Your code looks
> almost identical to my latest attempt where I tried placing vm_lock
> into different cachelines including a separate one and using
> HWCACHE_ALIGN. And yet all my attempts showed regression.
> Just FYI, the test I'm using is the pft-threads test from mmtests
> suite. I'll post results this evening.
> Thanks,
> Suren.

Ok, well maybe you did not leave the pointer in place? :)

It is plausible the problem is the on- vs off-CPU behavior of rwsems --
there is a corner case where they neglect to spin. Perf may go down
simply because there is less on-CPU time.

Thus when you bench, can you make sure to time(1) it?

For example with zsh I got:
./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads

39.35s user 445.45s system 390% cpu 124.04s (2:04.04) total
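
If your shell does not time commands in this format, prefixing the run
with GNU time should give equivalent numbers, e.g. (same config as above):

/usr/bin/time ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads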

I verified with offcputime-bpfcc -K that there is indeed a bunch of
pft going off CPU from down_read/down_write, even at the modest scale
this was running at in my case.
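
For example, capturing kernel stacks for 30 seconds during a run (the
duration and output file are just suggestions):

# offcputime-bpfcc -K 30 > pft-offcpu.stacks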

>
> >
> > 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
> >
> > ---
> >  include/linux/mm.h       | 18 +++++++--------
> >  include/linux/mm_types.h | 10 ++++-----
> >  kernel/fork.c            | 47 ++++------------------------------------
> >  mm/userfaultfd.c         |  6 ++---
> >  4 files changed, 19 insertions(+), 62 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 43b40334e9b2..6d8b668d3deb 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> >                 return false;
> >
> > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
> >                 return false;
> >
> >         /*
> > @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >          * This pairs with RELEASE semantics in vma_end_write_all().
> >          */
> >         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > -               up_read(&vma->vm_lock->lock);
> > +               up_read(&vma->vm_lock);
> >                 return false;
> >         }
> >         return true;
> > @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >  static inline void vma_end_read(struct vm_area_struct *vma)
> >  {
> >         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > -       up_read(&vma->vm_lock->lock);
> > +       up_read(&vma->vm_lock);
> >         rcu_read_unlock();
> >  }
> >
> > @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> >                 return;
> >
> > -       down_write(&vma->vm_lock->lock);
> > +       down_write(&vma->vm_lock);
> >         /*
> >          * We should use WRITE_ONCE() here because we can have concurrent reads
> >          * from the early lockless pessimistic check in vma_start_read().
> > @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> >          */
> >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > -       up_write(&vma->vm_lock->lock);
> > +       up_write(&vma->vm_lock);
> >  }
> >
> >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> >
> >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> >  {
> > -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> > +       if (!rwsem_is_locked(&vma->vm_lock))
> >                 vma_assert_write_locked(vma);
> >  }
> >
> > @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
> >
> >  extern const struct vm_operations_struct vma_dummy_vm_ops;
> >
> > -/*
> > - * WARNING: vma_init does not initialize vma->vm_lock.
> > - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> > - */
> >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >  {
> >         memset(vma, 0, sizeof(*vma));
> > @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >         INIT_LIST_HEAD(&vma->anon_vma_chain);
> >         vma_mark_detached(vma, false);
> >         vma_numab_state_init(vma);
> > +       init_rwsem(&vma->vm_lock);
> > +       vma->vm_lock_seq = -1;
> >  }
> >
> >  /* Use when VMA is not part of the VMA tree and needs no locking */
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 003619fab20e..caffdb4eeb94 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
> >  }
> >  #endif
> >
> > -struct vma_lock {
> > -       struct rw_semaphore lock;
> > -};
> > -
> >  struct vma_numab_state {
> >         /*
> >          * Initialised as time in 'jiffies' after which VMA
> > @@ -716,8 +712,7 @@ struct vm_area_struct {
> >          * slowpath.
> >          */
> >         int vm_lock_seq;
> > -       /* Unstable RCU readers are allowed to read this. */
> > -       struct vma_lock *vm_lock;
> > +       void *vm_dummy;
> >  #endif
> >
> >         /*
> > @@ -770,6 +765,9 @@ struct vm_area_struct {
> >         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> >  #endif
> >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> > +#endif
> >  } __randomize_layout;
> >
> >  #ifdef CONFIG_NUMA
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 92bfe56c9fed..eab04a24d5f1 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> >  /* SLAB cache for mm_struct structures (tsk->mm) */
> >  static struct kmem_cache *mm_cachep;
> >
> > -#ifdef CONFIG_PER_VMA_LOCK
> > -
> > -/* SLAB cache for vm_area_struct.lock */
> > -static struct kmem_cache *vma_lock_cachep;
> > -
> > -static bool vma_lock_alloc(struct vm_area_struct *vma)
> > -{
> > -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> > -       if (!vma->vm_lock)
> > -               return false;
> > -
> > -       init_rwsem(&vma->vm_lock->lock);
> > -       vma->vm_lock_seq = -1;
> > -
> > -       return true;
> > -}
> > -
> > -static inline void vma_lock_free(struct vm_area_struct *vma)
> > -{
> > -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> > -}
> > -
> > -#else /* CONFIG_PER_VMA_LOCK */
> > -
> > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> > -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> > -
> > -#endif /* CONFIG_PER_VMA_LOCK */
> > -
> >  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> >  {
> >         struct vm_area_struct *vma;
> > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> >                 return NULL;
> >
> >         vma_init(vma, mm);
> > -       if (!vma_lock_alloc(vma)) {
> > -               kmem_cache_free(vm_area_cachep, vma);
> > -               return NULL;
> > -       }
> >
> >         return vma;
> >  }
> > @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> >          * will be reinitialized.
> >          */
> >         data_race(memcpy(new, orig, sizeof(*new)));
> > -       if (!vma_lock_alloc(new)) {
> > -               kmem_cache_free(vm_area_cachep, new);
> > -               return NULL;
> > -       }
> > +       init_rwsem(&new->vm_lock);
> > +       new->vm_lock_seq = -1;
> >         INIT_LIST_HEAD(&new->anon_vma_chain);
> >         vma_numab_state_init(new);
> >         dup_anon_vma_name(orig, new);
> > @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> >  {
> >         vma_numab_state_free(vma);
> >         free_anon_vma_name(vma);
> > -       vma_lock_free(vma);
> >         kmem_cache_free(vm_area_cachep, vma);
> >  }
> >
> > @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> >                                                   vm_rcu);
> >
> >         /* The vma should not be locked while being destroyed. */
> > -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> > +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
> >         __vm_area_free(vma);
> >  }
> >  #endif
> > @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
> >                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> >                         NULL);
> >
> > -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> > -#ifdef CONFIG_PER_VMA_LOCK
> > -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> > -#endif
> > +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
> >         mmap_init();
> >         nsproxy_cache_init();
> >  }
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 3b7715ecf292..e95ecb2063d2 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
> >                  * mmap_lock, which guarantees that nobody can lock the
> >                  * vma for write (vma_start_write()) under us.
> >                  */
> > -               down_read(&vma->vm_lock->lock);
> > +               down_read(&vma->vm_lock);
> >         }
> >
> >         mmap_read_unlock(mm);
> > @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
> >                  * See comment in uffd_lock_vma() as to why not using
> >                  * vma_start_read() here.
> >                  */
> > -               down_read(&(*dst_vmap)->vm_lock->lock);
> > +               down_read(&(*dst_vmap)->vm_lock);
> >                 if (*dst_vmap != *src_vmap)
> > -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> > +                       down_read_nested(&(*src_vmap)->vm_lock,
> >                                          SINGLE_DEPTH_NESTING);
> >         }
> >         mmap_read_unlock(mm);
> > --
> > 2.43.0
> >
Suren Baghdasaryan Aug. 8, 2024, 9:19 p.m. UTC | #3
On Thu, Aug 8, 2024 at 1:04 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Thu, Aug 8, 2024 at 9:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > >
> > > ACHTUNG: this is more of a request for benchmarking than a patch
> > > proposal at this stage
> > >
> > > I was pointed at your patch which moved the vma lock to a separate
> > > allocation [1]. The commit message does not say anything about making
> > > sure the object itself is allocated with proper alignment and I found
> > > that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> > > not be the problem.
> > >
> > > I verified with a simple one-liner that on a stock kernel the vmas keep
> > > roaming around with a 16-byte alignment:
> > > # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> > > @[16]: 39
> > > @[0]: 46
> > > @[32]: 53
> > > @[48]: 56
> > >
> > > Note the stock vma lock cache also lacks the alignment flag. While I
> > > have not verified experimentally, if they are also roaming it would mean
> > > that 2 unrelated vmas can false-share locks. If the patch below is a
> > > bust, the flag should probably be added to that one.
> > >
> > > The patch has slapped-around vma lock cache removal + HWCACHE_ALIGN for the
> > > vma cache. I left a pointer to not change relative offsets between
> > > current fields. It does compile without CONFIG_PER_VMA_LOCK.
> > >
> > > Vlastimil says you tested a case where the struct got bloated to 256
> > > bytes, but the lock remained separate. It is unclear to me if this
> > > happened with allocations made with the HWCACHE_ALIGN flag though.
> > >
> > > There is 0 urgency on my end, but it would be nice if you could try
> > > this out with your test rig.
> >
> > Hi Mateusz,
> > Sure, I'll give it a spin but I'm not optimistic. Your code looks
> > almost identical to my latest attempt where I tried placing vm_lock
> > into different cachelines including a separate one and using
> > HWCACHE_ALIGN. And yet all my attempts showed regression.
> > Just FYI, the test I'm using is the pft-threads test from mmtests
> > suite. I'll post results this evening.
> > Thanks,
> > Suren.
>
> Ok, well maybe you did not leave the pointer in place? :)

True, maybe that will make a difference. I'll let you know soon.

>
> It is plausible the problem is on vs off cpu behavior of rwsems --
> there is a corner case where they neglect to spin. It is plausible
> perf goes down simply because there is less on cpu time.
>
> > Thus when you bench, can you make sure to time(1) it?

Sure, will do once I'm home. Thanks for the hints!

>
> For example with zsh I got:
> ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
>
> 39.35s user 445.45s system 390% cpu 124.04s (2:04.04) total
>
> I verified with offcputime-bpfcc -K that indeed there is a bunch of
> pft going off cpu from down_read/down_write even at the modest scale
> this was running in my case.
>
> >
> > >
> > > 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
> > >
> > > ---
> > >  include/linux/mm.h       | 18 +++++++--------
> > >  include/linux/mm_types.h | 10 ++++-----
> > >  kernel/fork.c            | 47 ++++------------------------------------
> > >  mm/userfaultfd.c         |  6 ++---
> > >  4 files changed, 19 insertions(+), 62 deletions(-)
> > >
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 43b40334e9b2..6d8b668d3deb 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > >         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > >                 return false;
> > >
> > > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > > +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
> > >                 return false;
> > >
> > >         /*
> > > @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > >          * This pairs with RELEASE semantics in vma_end_write_all().
> > >          */
> > >         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > > -               up_read(&vma->vm_lock->lock);
> > > +               up_read(&vma->vm_lock);
> > >                 return false;
> > >         }
> > >         return true;
> > > @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > >  {
> > >         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > > -       up_read(&vma->vm_lock->lock);
> > > +       up_read(&vma->vm_lock);
> > >         rcu_read_unlock();
> > >  }
> > >
> > > @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> > >                 return;
> > >
> > > -       down_write(&vma->vm_lock->lock);
> > > +       down_write(&vma->vm_lock);
> > >         /*
> > >          * We should use WRITE_ONCE() here because we can have concurrent reads
> > >          * from the early lockless pessimistic check in vma_start_read().
> > > @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> > >          */
> > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > > -       up_write(&vma->vm_lock->lock);
> > > +       up_write(&vma->vm_lock);
> > >  }
> > >
> > >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > >
> > >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> > >  {
> > > -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> > > +       if (!rwsem_is_locked(&vma->vm_lock))
> > >                 vma_assert_write_locked(vma);
> > >  }
> > >
> > > @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
> > >
> > >  extern const struct vm_operations_struct vma_dummy_vm_ops;
> > >
> > > -/*
> > > - * WARNING: vma_init does not initialize vma->vm_lock.
> > > - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> > > - */
> > >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > >  {
> > >         memset(vma, 0, sizeof(*vma));
> > > @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > >         INIT_LIST_HEAD(&vma->anon_vma_chain);
> > >         vma_mark_detached(vma, false);
> > >         vma_numab_state_init(vma);
> > > +       init_rwsem(&vma->vm_lock);
> > > +       vma->vm_lock_seq = -1;
> > >  }
> > >
> > >  /* Use when VMA is not part of the VMA tree and needs no locking */
> > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > index 003619fab20e..caffdb4eeb94 100644
> > > --- a/include/linux/mm_types.h
> > > +++ b/include/linux/mm_types.h
> > > @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > >  }
> > >  #endif
> > >
> > > -struct vma_lock {
> > > -       struct rw_semaphore lock;
> > > -};
> > > -
> > >  struct vma_numab_state {
> > >         /*
> > >          * Initialised as time in 'jiffies' after which VMA
> > > @@ -716,8 +712,7 @@ struct vm_area_struct {
> > >          * slowpath.
> > >          */
> > >         int vm_lock_seq;
> > > -       /* Unstable RCU readers are allowed to read this. */
> > > -       struct vma_lock *vm_lock;
> > > +       void *vm_dummy;
> > >  #endif
> > >
> > >         /*
> > > @@ -770,6 +765,9 @@ struct vm_area_struct {
> > >         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> > >  #endif
> > >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> > > +#endif
> > >  } __randomize_layout;
> > >
> > >  #ifdef CONFIG_NUMA
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 92bfe56c9fed..eab04a24d5f1 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> > >  /* SLAB cache for mm_struct structures (tsk->mm) */
> > >  static struct kmem_cache *mm_cachep;
> > >
> > > -#ifdef CONFIG_PER_VMA_LOCK
> > > -
> > > -/* SLAB cache for vm_area_struct.lock */
> > > -static struct kmem_cache *vma_lock_cachep;
> > > -
> > > -static bool vma_lock_alloc(struct vm_area_struct *vma)
> > > -{
> > > -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> > > -       if (!vma->vm_lock)
> > > -               return false;
> > > -
> > > -       init_rwsem(&vma->vm_lock->lock);
> > > -       vma->vm_lock_seq = -1;
> > > -
> > > -       return true;
> > > -}
> > > -
> > > -static inline void vma_lock_free(struct vm_area_struct *vma)
> > > -{
> > > -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> > > -}
> > > -
> > > -#else /* CONFIG_PER_VMA_LOCK */
> > > -
> > > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> > > -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> > > -
> > > -#endif /* CONFIG_PER_VMA_LOCK */
> > > -
> > >  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > >  {
> > >         struct vm_area_struct *vma;
> > > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > >                 return NULL;
> > >
> > >         vma_init(vma, mm);
> > > -       if (!vma_lock_alloc(vma)) {
> > > -               kmem_cache_free(vm_area_cachep, vma);
> > > -               return NULL;
> > > -       }
> > >
> > >         return vma;
> > >  }
> > > @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > >          * will be reinitialized.
> > >          */
> > >         data_race(memcpy(new, orig, sizeof(*new)));
> > > -       if (!vma_lock_alloc(new)) {
> > > -               kmem_cache_free(vm_area_cachep, new);
> > > -               return NULL;
> > > -       }
> > > +       init_rwsem(&new->vm_lock);
> > > +       new->vm_lock_seq = -1;
> > >         INIT_LIST_HEAD(&new->anon_vma_chain);
> > >         vma_numab_state_init(new);
> > >         dup_anon_vma_name(orig, new);
> > > @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> > >  {
> > >         vma_numab_state_free(vma);
> > >         free_anon_vma_name(vma);
> > > -       vma_lock_free(vma);
> > >         kmem_cache_free(vm_area_cachep, vma);
> > >  }
> > >
> > > @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> > >                                                   vm_rcu);
> > >
> > >         /* The vma should not be locked while being destroyed. */
> > > -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> > > +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
> > >         __vm_area_free(vma);
> > >  }
> > >  #endif
> > > @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
> > >                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > >                         NULL);
> > >
> > > -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> > > -#ifdef CONFIG_PER_VMA_LOCK
> > > -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> > > -#endif
> > > +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
> > >         mmap_init();
> > >         nsproxy_cache_init();
> > >  }
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 3b7715ecf292..e95ecb2063d2 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
> > >                  * mmap_lock, which guarantees that nobody can lock the
> > >                  * vma for write (vma_start_write()) under us.
> > >                  */
> > > -               down_read(&vma->vm_lock->lock);
> > > +               down_read(&vma->vm_lock);
> > >         }
> > >
> > >         mmap_read_unlock(mm);
> > > @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
> > >                  * See comment in uffd_lock_vma() as to why not using
> > >                  * vma_start_read() here.
> > >                  */
> > > -               down_read(&(*dst_vmap)->vm_lock->lock);
> > > +               down_read(&(*dst_vmap)->vm_lock);
> > >                 if (*dst_vmap != *src_vmap)
> > > -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> > > +                       down_read_nested(&(*src_vmap)->vm_lock,
> > >                                          SINGLE_DEPTH_NESTING);
> > >         }
> > >         mmap_read_unlock(mm);
> > > --
> > > 2.43.0
> > >
>
>
>
> --
> Mateusz Guzik <mjguzik gmail.com>
Suren Baghdasaryan Aug. 9, 2024, 3:57 a.m. UTC | #4
On Thu, Aug 8, 2024 at 9:19 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Aug 8, 2024 at 1:04 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > On Thu, Aug 8, 2024 at 9:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > >
> > > > ACHTUNG: this is more of a request for benchmarking than a patch
> > > > proposal at this stage
> > > >
> > > > I was pointed at your patch which moved the vma lock to a separate
> > > > allocation [1]. The commit message does not say anything about making
> > > > sure the object itself is allocated with proper alignment and I found
> > > > that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> > > > not be the problem.
> > > >
> > > > I verified with a simple one-liner that on a stock kernel the vmas keep
> > > > roaming around with a 16-byte alignment:
> > > > # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> > > > @[16]: 39
> > > > @[0]: 46
> > > > @[32]: 53
> > > > @[48]: 56
> > > >
> > > > Note the stock vma lock cache also lacks the alignment flag. While I
> > > > have not verified experimentally, if they are also roaming it would mean
> > > > that 2 unrelated vmas can false-share locks. If the patch below is a
> > > > bust, the flag should probably be added to that one.
> > > >
> > > > The patch has slapped-around vma lock cache removal + HWCACHE_ALIGN for the
> > > > vma cache. I left a pointer to not change relative offsets between
> > > > current fields. It does compile without CONFIG_PER_VMA_LOCK.
> > > >
> > > > Vlastimil says you tested a case where the struct got bloated to 256
> > > > bytes, but the lock remained separate. It is unclear to me if this
> > > > happened with allocations made with the HWCACHE_ALIGN flag though.
> > > >
> > > > There is 0 urgency on my end, but it would be nice if you could try
> > > > this out with your test rig.
> > >
> > > Hi Mateusz,
> > > Sure, I'll give it a spin but I'm not optimistic. Your code looks
> > > almost identical to my latest attempt where I tried placing vm_lock
> > > into different cachelines including a separate one and using
> > > HWCACHE_ALIGN. And yet all my attempts showed regression.
> > > Just FYI, the test I'm using is the pft-threads test from mmtests
> > > suite. I'll post results this evening.
> > > Thanks,
> > > Suren.
> >
> > Ok, well maybe you did not leave the pointer in place? :)
>
> True, maybe that will make a difference. I'll let you know soon.
>
> >
> > It is plausible the problem is on vs off cpu behavior of rwsems --
> > there is a corner case where they neglect to spin. It is plausible
> > perf goes down simply because there is less on cpu time.
> >
> > Thus when you bench, can you make sure to time(1) it?
>
> Sure, will do once I'm home. Thanks for the hints!

Unfortunately the same regression shows its ugly face:

compare-mmtests.pl Hmean results:
Hmean     faults/cpu-1    471264.4904 (   0.00%)   473085.6736 *   0.39%*
Hmean     faults/cpu-4    434571.7116 (   0.00%)   431214.3974 *  -0.77%*
Hmean     faults/cpu-7    407755.3217 (   0.00%)   395773.4052 *  -2.94%*
Hmean     faults/cpu-12   335604.9251 (   0.00%)   285426.3358 * -14.95%*
Hmean     faults/cpu-21   187588.9077 (   0.00%)   171227.7179 *  -8.72%*
Hmean     faults/cpu-30   140875.7878 (   0.00%)   124120.3437 * -11.89%*
Hmean     faults/cpu-48   106175.5493 (   0.00%)    93073.1499 * -12.34%*
Hmean     faults/cpu-56    92585.2536 (   0.00%)    82837.4299 * -10.53%*
Hmean     faults/sec-1    470924.4946 (   0.00%)   472730.9937 *   0.38%*
Hmean     faults/sec-4   1714823.8198 (   0.00%)  1693226.7248 *  -1.26%*
Hmean     faults/sec-7   2801395.1898 (   0.00%)  2717561.9417 *  -2.99%*
Hmean     faults/sec-12  3934168.2690 (   0.00%)  3319710.7540 * -15.62%*
Hmean     faults/sec-21  3736832.4592 (   0.00%)  3444687.9145 *  -7.82%*
Hmean     faults/sec-30  3845187.2636 (   0.00%)  3403585.7064 * -11.48%*
Hmean     faults/sec-48  4712317.7461 (   0.00%)  4180658.4710 * -11.28%*
Hmean     faults/sec-56  4873233.9844 (   0.00%)  4423608.6568 *  -9.23%*

This is the time(1) output with the baseline:
920.47user 7748.31system 18:30.85elapsed 780%CPU (0avgtext+0avgdata
26385096maxresident)k
140848inputs+19744outputs (66major+1583463207minor)pagefaults 0swaps

This is the time(1) output with your change:
1025.73user 8618.74system 19:10.79elapsed 838%CPU (0avgtext+0avgdata
26385116maxresident)k
16584inputs+19512outputs (61major+1583468687minor)pagefaults 0swaps

Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:

$ lscpu
Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   56
  On-line CPU(s) list:    0-55
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   14
    Socket(s):            2
    Stepping:             1
    CPU max MHz:          3500.0000
    CPU min MHz:          1200.0000
    BogoMIPS:             5188.26
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 pti intel_ppin ssbd ibrs ibpb stibp tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm rdt_a rdseed adx smap intel_pt xsaveopt cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts vnmi md_clear flush_l1d
Virtualization features:
  Virtualization:         VT-x
Caches (sum of all):
  L1d:                    896 KiB (28 instances)
  L1i:                    896 KiB (28 instances)
  L2:                     7 MiB (28 instances)
  L3:                     70 MiB (2 instances)
NUMA:
  NUMA node(s):           2
  NUMA node0 CPU(s):      0-13,28-41
  NUMA node1 CPU(s):      14-27,42-55

Any ideas?

>
> >
> > For example with zsh I got:
> > ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
> >
> > 39.35s user 445.45s system 390% cpu 124.04s (2:04.04) total
> >
> > I verified with offcputime-bpfcc -K that indeed there is a bunch of
> > pft going off cpu from down_read/down_write even at the modest scale
> > this was running in my case.
> >
> > >
> > > >
> > > > 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
> > > >
> > > > ---
> > > >  include/linux/mm.h       | 18 +++++++--------
> > > >  include/linux/mm_types.h | 10 ++++-----
> > > >  kernel/fork.c            | 47 ++++------------------------------------
> > > >  mm/userfaultfd.c         |  6 ++---
> > > >  4 files changed, 19 insertions(+), 62 deletions(-)
> > > >
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > index 43b40334e9b2..6d8b668d3deb 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > > >                 return false;
> > > >
> > > > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > > > +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
> > > >                 return false;
> > > >
> > > >         /*
> > > > @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >          * This pairs with RELEASE semantics in vma_end_write_all().
> > > >          */
> > > >         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > > > -               up_read(&vma->vm_lock->lock);
> > > > +               up_read(&vma->vm_lock);
> > > >                 return false;
> > > >         }
> > > >         return true;
> > > > @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > > >  {
> > > >         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > > > -       up_read(&vma->vm_lock->lock);
> > > > +       up_read(&vma->vm_lock);
> > > >         rcu_read_unlock();
> > > >  }
> > > >
> > > > @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> > > >                 return;
> > > >
> > > > -       down_write(&vma->vm_lock->lock);
> > > > +       down_write(&vma->vm_lock);
> > > >         /*
> > > >          * We should use WRITE_ONCE() here because we can have concurrent reads
> > > >          * from the early lockless pessimistic check in vma_start_read().
> > > > @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> > > >          */
> > > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > > > -       up_write(&vma->vm_lock->lock);
> > > > +       up_write(&vma->vm_lock);
> > > >  }
> > > >
> > > >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > > @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > >
> > > >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> > > >  {
> > > > -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> > > > +       if (!rwsem_is_locked(&vma->vm_lock))
> > > >                 vma_assert_write_locked(vma);
> > > >  }
> > > >
> > > > @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
> > > >
> > > >  extern const struct vm_operations_struct vma_dummy_vm_ops;
> > > >
> > > > -/*
> > > > - * WARNING: vma_init does not initialize vma->vm_lock.
> > > > - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> > > > - */
> > > >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > >  {
> > > >         memset(vma, 0, sizeof(*vma));
> > > > @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > >         INIT_LIST_HEAD(&vma->anon_vma_chain);
> > > >         vma_mark_detached(vma, false);
> > > >         vma_numab_state_init(vma);
> > > > +       init_rwsem(&vma->vm_lock);
> > > > +       vma->vm_lock_seq = -1;
> > > >  }
> > > >
> > > >  /* Use when VMA is not part of the VMA tree and needs no locking */
> > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > > index 003619fab20e..caffdb4eeb94 100644
> > > > --- a/include/linux/mm_types.h
> > > > +++ b/include/linux/mm_types.h
> > > > @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > > >  }
> > > >  #endif
> > > >
> > > > -struct vma_lock {
> > > > -       struct rw_semaphore lock;
> > > > -};
> > > > -
> > > >  struct vma_numab_state {
> > > >         /*
> > > >          * Initialised as time in 'jiffies' after which VMA
> > > > @@ -716,8 +712,7 @@ struct vm_area_struct {
> > > >          * slowpath.
> > > >          */
> > > >         int vm_lock_seq;
> > > > -       /* Unstable RCU readers are allowed to read this. */
> > > > -       struct vma_lock *vm_lock;
> > > > +       void *vm_dummy;
> > > >  #endif
> > > >
> > > >         /*
> > > > @@ -770,6 +765,9 @@ struct vm_area_struct {
> > > >         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> > > >  #endif
> > > >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> > > > +#endif
> > > >  } __randomize_layout;
> > > >
> > > >  #ifdef CONFIG_NUMA
> > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > index 92bfe56c9fed..eab04a24d5f1 100644
> > > > --- a/kernel/fork.c
> > > > +++ b/kernel/fork.c
> > > > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> > > >  /* SLAB cache for mm_struct structures (tsk->mm) */
> > > >  static struct kmem_cache *mm_cachep;
> > > >
> > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > -
> > > > -/* SLAB cache for vm_area_struct.lock */
> > > > -static struct kmem_cache *vma_lock_cachep;
> > > > -
> > > > -static bool vma_lock_alloc(struct vm_area_struct *vma)
> > > > -{
> > > > -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> > > > -       if (!vma->vm_lock)
> > > > -               return false;
> > > > -
> > > > -       init_rwsem(&vma->vm_lock->lock);
> > > > -       vma->vm_lock_seq = -1;
> > > > -
> > > > -       return true;
> > > > -}
> > > > -
> > > > -static inline void vma_lock_free(struct vm_area_struct *vma)
> > > > -{
> > > > -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> > > > -}
> > > > -
> > > > -#else /* CONFIG_PER_VMA_LOCK */
> > > > -
> > > > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> > > > -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> > > > -
> > > > -#endif /* CONFIG_PER_VMA_LOCK */
> > > > -
> > > >  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > >  {
> > > >         struct vm_area_struct *vma;
> > > > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > >                 return NULL;
> > > >
> > > >         vma_init(vma, mm);
> > > > -       if (!vma_lock_alloc(vma)) {
> > > > -               kmem_cache_free(vm_area_cachep, vma);
> > > > -               return NULL;
> > > > -       }
> > > >
> > > >         return vma;
> > > >  }
> > > > @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > > >          * will be reinitialized.
> > > >          */
> > > >         data_race(memcpy(new, orig, sizeof(*new)));
> > > > -       if (!vma_lock_alloc(new)) {
> > > > -               kmem_cache_free(vm_area_cachep, new);
> > > > -               return NULL;
> > > > -       }
> > > > +       init_rwsem(&new->vm_lock);
> > > > +       new->vm_lock_seq = -1;
> > > >         INIT_LIST_HEAD(&new->anon_vma_chain);
> > > >         vma_numab_state_init(new);
> > > >         dup_anon_vma_name(orig, new);
> > > > @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> > > >  {
> > > >         vma_numab_state_free(vma);
> > > >         free_anon_vma_name(vma);
> > > > -       vma_lock_free(vma);
> > > >         kmem_cache_free(vm_area_cachep, vma);
> > > >  }
> > > >
> > > > @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> > > >                                                   vm_rcu);
> > > >
> > > >         /* The vma should not be locked while being destroyed. */
> > > > -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> > > > +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
> > > >         __vm_area_free(vma);
> > > >  }
> > > >  #endif
> > > > @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
> > > >                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > >                         NULL);
> > > >
> > > > -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> > > > -#endif
> > > > +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
> > > >         mmap_init();
> > > >         nsproxy_cache_init();
> > > >  }
> > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > > index 3b7715ecf292..e95ecb2063d2 100644
> > > > --- a/mm/userfaultfd.c
> > > > +++ b/mm/userfaultfd.c
> > > > @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
> > > >                  * mmap_lock, which guarantees that nobody can lock the
> > > >                  * vma for write (vma_start_write()) under us.
> > > >                  */
> > > > -               down_read(&vma->vm_lock->lock);
> > > > +               down_read(&vma->vm_lock);
> > > >         }
> > > >
> > > >         mmap_read_unlock(mm);
> > > > @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
> > > >                  * See comment in uffd_lock_vma() as to why not using
> > > >                  * vma_start_read() here.
> > > >                  */
> > > > -               down_read(&(*dst_vmap)->vm_lock->lock);
> > > > +               down_read(&(*dst_vmap)->vm_lock);
> > > >                 if (*dst_vmap != *src_vmap)
> > > > -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> > > > +                       down_read_nested(&(*src_vmap)->vm_lock,
> > > >                                          SINGLE_DEPTH_NESTING);
> > > >         }
> > > >         mmap_read_unlock(mm);
> > > > --
> > > > 2.43.0
> > > >
> >
> >
> >
> > --
> > Mateusz Guzik <mjguzik gmail.com>
Mateusz Guzik Aug. 9, 2024, 8:14 a.m. UTC | #5
On Fri, Aug 9, 2024 at 5:57 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Aug 8, 2024 at 9:19 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Aug 8, 2024 at 1:04 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > >
> > > On Thu, Aug 8, 2024 at 9:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > > >
> > > > > ACHTUNG: this is more of a request for benchmarking than a patch
> > > > > proposal at this stage
> > > > >
> > > > > I was pointed at your patch which moved the vma lock to a separate
> > > > > allocation [1]. The commit message does not say anything about making
> > > > > sure the object itself is allocated with proper alignment and I found
> > > > > that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> > > > > not be the problem.
> > > > >
> > > > > I verified with a simple one-liner that on a stock kernel the vmas keep
> > > > > roaming around with a 16-byte alignment:
> > > > > # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> > > > > @[16]: 39
> > > > > @[0]: 46
> > > > > @[32]: 53
> > > > > @[48]: 56
> > > > >
> > > > > Note the stock vma lock cache also lacks the alignment flag. While I
> > > > > have not verified experimentally, if they are also romaing it would mean
> > > > > that 2 unrelated vmas can false-share locks. If the patch below is a
> > > > > bust, the flag should probably be added to that one.
> > > > >
> > > > > The patch has slapped-around vma lock cache removal + HWALLOC for the
> > > > > vma cache. I left a pointer to not change relative offsets between
> > > > > current fields. I does compile without CONFIG_PER_VMA_LOCK.
> > > > >
> > > > > Vlastimil says you tested a case where the struct got bloated to 256
> > > > > bytes, but the lock remained separate. It is unclear to me if this
> > > > > happened with allocations made with the HWCACHE_ALIGN flag though.
> > > > >
> > > > > There is 0 urgency on my end, but it would be nice if you could try
> > > > > this out with your test rig.
> > > >
> > > > Hi Mateusz,
> > > > Sure, I'll give it a spin but I'm not optimistic. Your code looks
> > > > almost identical to my latest attempt where I tried placing vm_lock
> > > > into different cachelines including a separate one and using
> > > > HWCACHE_ALIGN. And yet all my attempts showed regression.
> > > > Just FYI, the test I'm using is the pft-threads test from mmtests
> > > > suite. I'll post results today evening.
> > > > Thanks,
> > > > Suren.
> > >
> > > Ok, well maybe you did not leave the pointer in place? :)
> >
> > True, maybe that will make a difference. I'll let you know soon.
> >
> > >
> > > It is plausible the problem is on vs off cpu behavior of rwsems --
> > > there is a corner case where they neglect to spin. It is plausible
> > > perf goes down simply because there is less on cpu time.
> > >
> > > Thus you bench can you make sure to time(1)?
> >
> > Sure, will do once I'm home. Thanks for the hints!
>
> Unfortunately the same regression shows its ugly face:
>
> compare-mmtests.pl Hmean results:
> Hmean     faults/cpu-1    471264.4904 (   0.00%)   473085.6736 *   0.39%*
> Hmean     faults/cpu-4    434571.7116 (   0.00%)   431214.3974 *  -0.77%*
> Hmean     faults/cpu-7    407755.3217 (   0.00%)   395773.4052 *  -2.94%*
> Hmean     faults/cpu-12   335604.9251 (   0.00%)   285426.3358 * -14.95%*
> Hmean     faults/cpu-21   187588.9077 (   0.00%)   171227.7179 *  -8.72%*
> Hmean     faults/cpu-30   140875.7878 (   0.00%)   124120.3437 * -11.89%*
> Hmean     faults/cpu-48   106175.5493 (   0.00%)    93073.1499 * -12.34%*
> Hmean     faults/cpu-56    92585.2536 (   0.00%)    82837.4299 * -10.53%*
> Hmean     faults/sec-1    470924.4946 (   0.00%)   472730.9937 *   0.38%*
> Hmean     faults/sec-4   1714823.8198 (   0.00%)  1693226.7248 *  -1.26%*
> Hmean     faults/sec-7   2801395.1898 (   0.00%)  2717561.9417 *  -2.99%*
> Hmean     faults/sec-12  3934168.2690 (   0.00%)  3319710.7540 * -15.62%*
> Hmean     faults/sec-21  3736832.4592 (   0.00%)  3444687.9145 *  -7.82%*
> Hmean     faults/sec-30  3845187.2636 (   0.00%)  3403585.7064 * -11.48%*
> Hmean     faults/sec-48  4712317.7461 (   0.00%)  4180658.4710 * -11.28%*
> Hmean     faults/sec-56  4873233.9844 (   0.00%)  4423608.6568 *  -9.23%*
>
> This is the time(1) output with the baseline:
> 920.47user 7748.31system 18:30.85elapsed 780%CPU (0avgtext+0avgdata
> 26385096maxresident)k
> 140848inputs+19744outputs (66major+1583463207minor)pagefaults 0swaps
>
> This is the time(1) output with your change:
> 1025.73user 8618.74system 19:10.79elapsed 838%CPU (0avgtext+0avgdata
> 26385116maxresident)k
> 16584inputs+19512outputs (61major+1583468687minor)pagefaults 0swaps
>
> Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:
>

hrmpf. final cheap stab I forgot to mention is that plausibly this is
all about the adjacent cacheline prefetcher.

google-fu temporarily fails me, but there was a one-liner to toggle
that on Linux. Worst case you can flip it in the BIOS

if that does not change anything, I'm going to grab a numa box of
similar scale to poke around myself, but I don't have an ETA

even so, do you have a handy one-liner to run the case with 56
threads? *maybe* comparing instructions which generate cache misses
before/after will explain what's up

> $ lscpu
> Architecture:             x86_64
>   CPU op-mode(s):         32-bit, 64-bit
>   Address sizes:          46 bits physical, 48 bits virtual
>   Byte Order:             Little Endian
> CPU(s):                   56
>   On-line CPU(s) list:    0-55
> Vendor ID:                GenuineIntel
>   Model name:             Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
>     CPU family:           6
>     Model:                79
>     Thread(s) per core:   2
>     Core(s) per socket:   14
>     Socket(s):            2
>     Stepping:             1
>     CPU max MHz:          3500.0000
>     CPU min MHz:          1200.0000
>     BogoMIPS:             5188.26
>     Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep
> mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
> tm pbe syscall nx pdpe1gb rdtscp lm constant_ts
>                           c arch_perfmon pebs bts rep_good nopl
> xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor
> ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm p
>                           cid dca sse4_1 sse4_2 x2apic movbe popcnt
> tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch
> cpuid_fault epb cat_l3 cdp_l3 pti intel_ppin ss
>                           bd ibrs ibpb stibp tpr_shadow flexpriority
> ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms
> invpcid rtm cqm rdt_a rdseed adx smap intel_pt xsave
>                           opt cqm_llc cqm_occup_llc cqm_mbm_total
> cqm_mbm_local dtherm ida arat pln pts vnmi md_clear flush_l1d
> Virtualization features:
>   Virtualization:         VT-x
> Caches (sum of all):
>   L1d:                    896 KiB (28 instances)
>   L1i:                    896 KiB (28 instances)
>   L2:                     7 MiB (28 instances)
>   L3:                     70 MiB (2 instances)
> NUMA:
>   NUMA node(s):           2
>   NUMA node0 CPU(s):      0-13,28-41
>   NUMA node1 CPU(s):      14-27,42-55
>
> Any ideas?
>
>
>
>
>
>
>
>
> >
> > >
> > > For example with zsh I got:
> > > ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
> > >
> > > 39.35s user 445.45s system 390% cpu 124.04s (2:04.04) total
> > >
> > > I verified with offcputime-bpfcc -K that indeed there is a bunch of
> > > pft going off cpu from down_read/down_write even at the modest scale
> > > this was running in my case.
> > >
> > > >
> > > > >
> > > > > 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
> > > > >
> > > > > ---
> > > > >  include/linux/mm.h       | 18 +++++++--------
> > > > >  include/linux/mm_types.h | 10 ++++-----
> > > > >  kernel/fork.c            | 47 ++++------------------------------------
> > > > >  mm/userfaultfd.c         |  6 ++---
> > > > >  4 files changed, 19 insertions(+), 62 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > > index 43b40334e9b2..6d8b668d3deb 100644
> > > > > --- a/include/linux/mm.h
> > > > > +++ b/include/linux/mm.h
> > > > > @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > >         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > > > >                 return false;
> > > > >
> > > > > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > > > > +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
> > > > >                 return false;
> > > > >
> > > > >         /*
> > > > > @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > >          * This pairs with RELEASE semantics in vma_end_write_all().
> > > > >          */
> > > > >         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > > > > -               up_read(&vma->vm_lock->lock);
> > > > > +               up_read(&vma->vm_lock);
> > > > >                 return false;
> > > > >         }
> > > > >         return true;
> > > > > @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > > > >  {
> > > > >         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > > > > -       up_read(&vma->vm_lock->lock);
> > > > > +       up_read(&vma->vm_lock);
> > > > >         rcu_read_unlock();
> > > > >  }
> > > > >
> > > > > @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > > >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> > > > >                 return;
> > > > >
> > > > > -       down_write(&vma->vm_lock->lock);
> > > > > +       down_write(&vma->vm_lock);
> > > > >         /*
> > > > >          * We should use WRITE_ONCE() here because we can have concurrent reads
> > > > >          * from the early lockless pessimistic check in vma_start_read().
> > > > > @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > > >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> > > > >          */
> > > > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > > > > -       up_write(&vma->vm_lock->lock);
> > > > > +       up_write(&vma->vm_lock);
> > > > >  }
> > > > >
> > > > >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > > > @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > > >
> > > > >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> > > > >  {
> > > > > -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> > > > > +       if (!rwsem_is_locked(&vma->vm_lock))
> > > > >                 vma_assert_write_locked(vma);
> > > > >  }
> > > > >
> > > > > @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
> > > > >
> > > > >  extern const struct vm_operations_struct vma_dummy_vm_ops;
> > > > >
> > > > > -/*
> > > > > - * WARNING: vma_init does not initialize vma->vm_lock.
> > > > > - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> > > > > - */
> > > > >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > > >  {
> > > > >         memset(vma, 0, sizeof(*vma));
> > > > > @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > > >         INIT_LIST_HEAD(&vma->anon_vma_chain);
> > > > >         vma_mark_detached(vma, false);
> > > > >         vma_numab_state_init(vma);
> > > > > +       init_rwsem(&vma->vm_lock);
> > > > > +       vma->vm_lock_seq = -1;
> > > > >  }
> > > > >
> > > > >  /* Use when VMA is not part of the VMA tree and needs no locking */
> > > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > > > index 003619fab20e..caffdb4eeb94 100644
> > > > > --- a/include/linux/mm_types.h
> > > > > +++ b/include/linux/mm_types.h
> > > > > @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > > > >  }
> > > > >  #endif
> > > > >
> > > > > -struct vma_lock {
> > > > > -       struct rw_semaphore lock;
> > > > > -};
> > > > > -
> > > > >  struct vma_numab_state {
> > > > >         /*
> > > > >          * Initialised as time in 'jiffies' after which VMA
> > > > > @@ -716,8 +712,7 @@ struct vm_area_struct {
> > > > >          * slowpath.
> > > > >          */
> > > > >         int vm_lock_seq;
> > > > > -       /* Unstable RCU readers are allowed to read this. */
> > > > > -       struct vma_lock *vm_lock;
> > > > > +       void *vm_dummy;
> > > > >  #endif
> > > > >
> > > > >         /*
> > > > > @@ -770,6 +765,9 @@ struct vm_area_struct {
> > > > >         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> > > > >  #endif
> > > > >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> > > > > +#endif
> > > > >  } __randomize_layout;
> > > > >
> > > > >  #ifdef CONFIG_NUMA
> > > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > index 92bfe56c9fed..eab04a24d5f1 100644
> > > > > --- a/kernel/fork.c
> > > > > +++ b/kernel/fork.c
> > > > > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> > > > >  /* SLAB cache for mm_struct structures (tsk->mm) */
> > > > >  static struct kmem_cache *mm_cachep;
> > > > >
> > > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > > -
> > > > > -/* SLAB cache for vm_area_struct.lock */
> > > > > -static struct kmem_cache *vma_lock_cachep;
> > > > > -
> > > > > -static bool vma_lock_alloc(struct vm_area_struct *vma)
> > > > > -{
> > > > > -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> > > > > -       if (!vma->vm_lock)
> > > > > -               return false;
> > > > > -
> > > > > -       init_rwsem(&vma->vm_lock->lock);
> > > > > -       vma->vm_lock_seq = -1;
> > > > > -
> > > > > -       return true;
> > > > > -}
> > > > > -
> > > > > -static inline void vma_lock_free(struct vm_area_struct *vma)
> > > > > -{
> > > > > -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> > > > > -}
> > > > > -
> > > > > -#else /* CONFIG_PER_VMA_LOCK */
> > > > > -
> > > > > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> > > > > -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> > > > > -
> > > > > -#endif /* CONFIG_PER_VMA_LOCK */
> > > > > -
> > > > >  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > >  {
> > > > >         struct vm_area_struct *vma;
> > > > > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > >                 return NULL;
> > > > >
> > > > >         vma_init(vma, mm);
> > > > > -       if (!vma_lock_alloc(vma)) {
> > > > > -               kmem_cache_free(vm_area_cachep, vma);
> > > > > -               return NULL;
> > > > > -       }
> > > > >
> > > > >         return vma;
> > > > >  }
> > > > > @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > > > >          * will be reinitialized.
> > > > >          */
> > > > >         data_race(memcpy(new, orig, sizeof(*new)));
> > > > > -       if (!vma_lock_alloc(new)) {
> > > > > -               kmem_cache_free(vm_area_cachep, new);
> > > > > -               return NULL;
> > > > > -       }
> > > > > +       init_rwsem(&new->vm_lock);
> > > > > +       new->vm_lock_seq = -1;
> > > > >         INIT_LIST_HEAD(&new->anon_vma_chain);
> > > > >         vma_numab_state_init(new);
> > > > >         dup_anon_vma_name(orig, new);
> > > > > @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> > > > >  {
> > > > >         vma_numab_state_free(vma);
> > > > >         free_anon_vma_name(vma);
> > > > > -       vma_lock_free(vma);
> > > > >         kmem_cache_free(vm_area_cachep, vma);
> > > > >  }
> > > > >
> > > > > @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> > > > >                                                   vm_rcu);
> > > > >
> > > > >         /* The vma should not be locked while being destroyed. */
> > > > > -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> > > > > +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
> > > > >         __vm_area_free(vma);
> > > > >  }
> > > > >  #endif
> > > > > @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
> > > > >                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > >                         NULL);
> > > > >
> > > > > -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> > > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > > -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> > > > > -#endif
> > > > > +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
> > > > >         mmap_init();
> > > > >         nsproxy_cache_init();
> > > > >  }
> > > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > > > index 3b7715ecf292..e95ecb2063d2 100644
> > > > > --- a/mm/userfaultfd.c
> > > > > +++ b/mm/userfaultfd.c
> > > > > @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
> > > > >                  * mmap_lock, which guarantees that nobody can lock the
> > > > >                  * vma for write (vma_start_write()) under us.
> > > > >                  */
> > > > > -               down_read(&vma->vm_lock->lock);
> > > > > +               down_read(&vma->vm_lock);
> > > > >         }
> > > > >
> > > > >         mmap_read_unlock(mm);
> > > > > @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
> > > > >                  * See comment in uffd_lock_vma() as to why not using
> > > > >                  * vma_start_read() here.
> > > > >                  */
> > > > > -               down_read(&(*dst_vmap)->vm_lock->lock);
> > > > > +               down_read(&(*dst_vmap)->vm_lock);
> > > > >                 if (*dst_vmap != *src_vmap)
> > > > > -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> > > > > +                       down_read_nested(&(*src_vmap)->vm_lock,
> > > > >                                          SINGLE_DEPTH_NESTING);
> > > > >         }
> > > > >         mmap_read_unlock(mm);
> > > > > --
> > > > > 2.43.0
> > > > >
> > >
> > >
> > >
> > > --
> > > Mateusz Guzik <mjguzik gmail.com>
Suren Baghdasaryan Aug. 9, 2024, 2:59 p.m. UTC | #6
On Fri, Aug 9, 2024 at 8:15 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Fri, Aug 9, 2024 at 5:57 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Aug 8, 2024 at 9:19 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Thu, Aug 8, 2024 at 1:04 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > >
> > > > On Thu, Aug 8, 2024 at 9:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > > > >
> > > > > > ACHTUNG: this is more of a request for benchmarking than a patch
> > > > > > proposal at this stage
> > > > > >
> > > > > > I was pointed at your patch which moved the vma lock to a separate
> > > > > > allocation [1]. The commit message does not say anything about making
> > > > > > sure the object itself is allocated with proper alignment and I found
> > > > > > that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> > > > > > not be the problem.
> > > > > >
> > > > > > I verified with a simple one-liner than on a stock kernel the vmas keep
> > > > > > roaming around with a 16-byte alignment:
> > > > > > # bpftrace -e 'kretprobe:vm_area_alloc  { @[retval & 0x3f] = count(); }'
> > > > > > @[16]: 39
> > > > > > @[0]: 46
> > > > > > @[32]: 53
> > > > > > @[48]: 56
> > > > > >
> > > > > > Note the stock vma lock cache also lacks the alignment flag. While I
> > > > > > have not verified experimentally, if they are also romaing it would mean
> > > > > > that 2 unrelated vmas can false-share locks. If the patch below is a
> > > > > > bust, the flag should probably be added to that one.
> > > > > >
> > > > > > The patch has slapped-around vma lock cache removal + HWALLOC for the
> > > > > > vma cache. I left a pointer to not change relative offsets between
> > > > > > current fields. I does compile without CONFIG_PER_VMA_LOCK.
> > > > > >
> > > > > > Vlastimil says you tested a case where the struct got bloated to 256
> > > > > > bytes, but the lock remained separate. It is unclear to me if this
> > > > > > happened with allocations made with the HWCACHE_ALIGN flag though.
> > > > > >
> > > > > > There is 0 urgency on my end, but it would be nice if you could try
> > > > > > this out with your test rig.
> > > > >
> > > > > Hi Mateusz,
> > > > > Sure, I'll give it a spin but I'm not optimistic. Your code looks
> > > > > almost identical to my latest attempt where I tried placing vm_lock
> > > > > into different cachelines including a separate one and using
> > > > > HWCACHE_ALIGN. And yet all my attempts showed regression.
> > > > > Just FYI, the test I'm using is the pft-threads test from mmtests
> > > > > suite. I'll post results today evening.
> > > > > Thanks,
> > > > > Suren.
> > > >
> > > > Ok, well maybe you did not leave the pointer in place? :)
> > >
> > > True, maybe that will make a difference. I'll let you know soon.
> > >
> > > >
> > > > It is plausible the problem is on vs off cpu behavior of rwsems --
> > > > there is a corner case where they neglect to spin. It is plausible
> > > > perf goes down simply because there is less on cpu time.
> > > >
> > > > Thus you bench can you make sure to time(1)?
> > >
> > > Sure, will do once I'm home. Thanks for the hints!
> >
> > Unfortunately the same regression shows its ugly face:
> >
> > compare-mmtests.pl Hmean results:
> > Hmean     faults/cpu-1    471264.4904 (   0.00%)   473085.6736 *   0.39%*
> > Hmean     faults/cpu-4    434571.7116 (   0.00%)   431214.3974 *  -0.77%*
> > Hmean     faults/cpu-7    407755.3217 (   0.00%)   395773.4052 *  -2.94%*
> > Hmean     faults/cpu-12   335604.9251 (   0.00%)   285426.3358 * -14.95%*
> > Hmean     faults/cpu-21   187588.9077 (   0.00%)   171227.7179 *  -8.72%*
> > Hmean     faults/cpu-30   140875.7878 (   0.00%)   124120.3437 * -11.89%*
> > Hmean     faults/cpu-48   106175.5493 (   0.00%)    93073.1499 * -12.34%*
> > Hmean     faults/cpu-56    92585.2536 (   0.00%)    82837.4299 * -10.53%*
> > Hmean     faults/sec-1    470924.4946 (   0.00%)   472730.9937 *   0.38%*
> > Hmean     faults/sec-4   1714823.8198 (   0.00%)  1693226.7248 *  -1.26%*
> > Hmean     faults/sec-7   2801395.1898 (   0.00%)  2717561.9417 *  -2.99%*
> > Hmean     faults/sec-12  3934168.2690 (   0.00%)  3319710.7540 * -15.62%*
> > Hmean     faults/sec-21  3736832.4592 (   0.00%)  3444687.9145 *  -7.82%*
> > Hmean     faults/sec-30  3845187.2636 (   0.00%)  3403585.7064 * -11.48%*
> > Hmean     faults/sec-48  4712317.7461 (   0.00%)  4180658.4710 * -11.28%*
> > Hmean     faults/sec-56  4873233.9844 (   0.00%)  4423608.6568 *  -9.23%*
> >
> > This is the time(1) output with the baseline:
> > 920.47user 7748.31system 18:30.85elapsed 780%CPU (0avgtext+0avgdata
> > 26385096maxresident)k
> > 140848inputs+19744outputs (66major+1583463207minor)pagefaults 0swaps
> >
> > This is the time(1) output with your change:
> > 1025.73user 8618.74system 19:10.79elapsed 838%CPU (0avgtext+0avgdata
> > 26385116maxresident)k
> > 16584inputs+19512outputs (61major+1583468687minor)pagefaults 0swaps
> >
> > Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:
> >
>
> hrmpf. final cheap stab I forgot to mention is that plausibly this is
> all about the adjacent cacheline prefetcher.
>
> google-fu temporarily fails me, but there was a one-liner to toggle
> that on Linux. Worst case you can flip it in the BIOS

Is "L2 Adjacent Cache Line Prefetcher" described in
https://www.intel.com/content/dam/develop/external/us/en/documents/335592-sdm-vol-4.pdf
the feature you are referring to? Searching through some discussions,
it looks like I'll need the MSR tools to enable/disable it:
https://mirrors.edge.kernel.org/pub/linux/utils/cpu/msr-tools/. I'll
try poking with it over the weekend and see what happens.
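
From what I can tell the relevant knob is bit 1 of MSR 0x1a4
(MSR_MISC_FEATURE_CONTROL), so presumably something along these lines
should do it (this is just my reading of the Intel doc above, not
verified yet, and it assumes the other prefetcher-disable bits are
currently 0 -- I'll rdmsr first to check):

# modprobe msr
# rdmsr -a 0x1a4
# wrmsr -a 0x1a4 0x2
# wrmsr -a 0x1a4 0x0

where 0x2 should disable just the L2 adjacent cache line prefetcher on
all cores and 0x0 restores the default.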

> if that does not change anything, I'm going to grab a numa box of
> similar scale to poke around myself, but I don't have an ETA
>
> even so, do you have a handy one-liner to run the case with 56
> threads?

I've never tried restricting it to only 56 threads before, but I assume
that changing these lines in configs/config-workload-pft-threads:
export PFT_MIN_CLIENTS=1
export PFT_MAX_CLIENTS=$NUMCPUS
to
export PFT_MIN_CLIENTS=$NUMCPUS
export PFT_MAX_CLIENTS=$NUMCPUS

should do the trick. If it doesn't I'll ask Mel.

> *maybe* comparing instructions which generate cache misses
> before/after will explain what's up

Any instructions/docs on how to capture this info?
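
Unless you have a better recipe in mind, my naive plan would be to
profile the same pft run on both kernels with something like:

# perf record -e cache-misses -g -- ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
# perf report --sort=dso,symbol
# perf c2c record -- ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
# perf c2c report --stdio

and then compare where the misses (and any HITMs reported by perf c2c)
get attributed on the two kernels.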
Thanks,
Suren.

>
> > $ lscpu
> > Architecture:             x86_64
> >   CPU op-mode(s):         32-bit, 64-bit
> >   Address sizes:          46 bits physical, 48 bits virtual
> >   Byte Order:             Little Endian
> > CPU(s):                   56
> >   On-line CPU(s) list:    0-55
> > Vendor ID:                GenuineIntel
> >   Model name:             Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
> >     CPU family:           6
> >     Model:                79
> >     Thread(s) per core:   2
> >     Core(s) per socket:   14
> >     Socket(s):            2
> >     Stepping:             1
> >     CPU max MHz:          3500.0000
> >     CPU min MHz:          1200.0000
> >     BogoMIPS:             5188.26
> >     Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep
> > mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
> > tm pbe syscall nx pdpe1gb rdtscp lm constant_ts
> >                           c arch_perfmon pebs bts rep_good nopl
> > xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor
> > ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm p
> >                           cid dca sse4_1 sse4_2 x2apic movbe popcnt
> > tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch
> > cpuid_fault epb cat_l3 cdp_l3 pti intel_ppin ss
> >                           bd ibrs ibpb stibp tpr_shadow flexpriority
> > ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms
> > invpcid rtm cqm rdt_a rdseed adx smap intel_pt xsave
> >                           opt cqm_llc cqm_occup_llc cqm_mbm_total
> > cqm_mbm_local dtherm ida arat pln pts vnmi md_clear flush_l1d
> > Virtualization features:
> >   Virtualization:         VT-x
> > Caches (sum of all):
> >   L1d:                    896 KiB (28 instances)
> >   L1i:                    896 KiB (28 instances)
> >   L2:                     7 MiB (28 instances)
> >   L3:                     70 MiB (2 instances)
> > NUMA:
> >   NUMA node(s):           2
> >   NUMA node0 CPU(s):      0-13,28-41
> >   NUMA node1 CPU(s):      14-27,42-55
> >
> > Any ideas?
> >
> >
> >
> >
> >
> >
> >
> >
> > >
> > > >
> > > > For example with zsh I got:
> > > > ./run-mmtests.sh --no-monitor --config configs/config-workload-pft-threads
> > > >
> > > > 39.35s user 445.45s system 390% cpu 124.04s (2:04.04) total
> > > >
> > > > I verified with offcputime-bpfcc -K that indeed there is a bunch of
> > > > pft going off cpu from down_read/down_write even at the modest scale
> > > > this was running in my case.
> > > >
> > > > >
> > > > > >
> > > > > > 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/T/#u
> > > > > >
> > > > > > ---
> > > > > >  include/linux/mm.h       | 18 +++++++--------
> > > > > >  include/linux/mm_types.h | 10 ++++-----
> > > > > >  kernel/fork.c            | 47 ++++------------------------------------
> > > > > >  mm/userfaultfd.c         |  6 ++---
> > > > > >  4 files changed, 19 insertions(+), 62 deletions(-)
> > > > > >
> > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > > > index 43b40334e9b2..6d8b668d3deb 100644
> > > > > > --- a/include/linux/mm.h
> > > > > > +++ b/include/linux/mm.h
> > > > > > @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > > >         if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > > > > >                 return false;
> > > > > >
> > > > > > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > > > > > +       if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
> > > > > >                 return false;
> > > > > >
> > > > > >         /*
> > > > > > @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > > >          * This pairs with RELEASE semantics in vma_end_write_all().
> > > > > >          */
> > > > > >         if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > > > > > -               up_read(&vma->vm_lock->lock);
> > > > > > +               up_read(&vma->vm_lock);
> > > > > >                 return false;
> > > > > >         }
> > > > > >         return true;
> > > > > > @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > > > > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > > > > >  {
> > > > > >         rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > > > > > -       up_read(&vma->vm_lock->lock);
> > > > > > +       up_read(&vma->vm_lock);
> > > > > >         rcu_read_unlock();
> > > > > >  }
> > > > > >
> > > > > > @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > > > >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> > > > > >                 return;
> > > > > >
> > > > > > -       down_write(&vma->vm_lock->lock);
> > > > > > +       down_write(&vma->vm_lock);
> > > > > >         /*
> > > > > >          * We should use WRITE_ONCE() here because we can have concurrent reads
> > > > > >          * from the early lockless pessimistic check in vma_start_read().
> > > > > > @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> > > > > >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> > > > > >          */
> > > > > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > > > > > -       up_write(&vma->vm_lock->lock);
> > > > > > +       up_write(&vma->vm_lock);
> > > > > >  }
> > > > > >
> > > > > >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > > > > @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > > > > >
> > > > > >  static inline void vma_assert_locked(struct vm_area_struct *vma)
> > > > > >  {
> > > > > > -       if (!rwsem_is_locked(&vma->vm_lock->lock))
> > > > > > +       if (!rwsem_is_locked(&vma->vm_lock))
> > > > > >                 vma_assert_write_locked(vma);
> > > > > >  }
> > > > > >
> > > > > > @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
> > > > > >
> > > > > >  extern const struct vm_operations_struct vma_dummy_vm_ops;
> > > > > >
> > > > > > -/*
> > > > > > - * WARNING: vma_init does not initialize vma->vm_lock.
> > > > > > - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> > > > > > - */
> > > > > >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > > > >  {
> > > > > >         memset(vma, 0, sizeof(*vma));
> > > > > > @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > > > > >         INIT_LIST_HEAD(&vma->anon_vma_chain);
> > > > > >         vma_mark_detached(vma, false);
> > > > > >         vma_numab_state_init(vma);
> > > > > > +       init_rwsem(&vma->vm_lock);
> > > > > > +       vma->vm_lock_seq = -1;
> > > > > >  }
> > > > > >
> > > > > >  /* Use when VMA is not part of the VMA tree and needs no locking */
> > > > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > > > > index 003619fab20e..caffdb4eeb94 100644
> > > > > > --- a/include/linux/mm_types.h
> > > > > > +++ b/include/linux/mm_types.h
> > > > > > @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > > > > >  }
> > > > > >  #endif
> > > > > >
> > > > > > -struct vma_lock {
> > > > > > -       struct rw_semaphore lock;
> > > > > > -};
> > > > > > -
> > > > > >  struct vma_numab_state {
> > > > > >         /*
> > > > > >          * Initialised as time in 'jiffies' after which VMA
> > > > > > @@ -716,8 +712,7 @@ struct vm_area_struct {
> > > > > >          * slowpath.
> > > > > >          */
> > > > > >         int vm_lock_seq;
> > > > > > -       /* Unstable RCU readers are allowed to read this. */
> > > > > > -       struct vma_lock *vm_lock;
> > > > > > +       void *vm_dummy;
> > > > > >  #endif
> > > > > >
> > > > > >         /*
> > > > > > @@ -770,6 +765,9 @@ struct vm_area_struct {
> > > > > >         struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> > > > > >  #endif
> > > > > >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +       struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> > > > > > +#endif
> > > > > >  } __randomize_layout;
> > > > > >
> > > > > >  #ifdef CONFIG_NUMA
> > > > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > > index 92bfe56c9fed..eab04a24d5f1 100644
> > > > > > --- a/kernel/fork.c
> > > > > > +++ b/kernel/fork.c
> > > > > > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
> > > > > >  /* SLAB cache for mm_struct structures (tsk->mm) */
> > > > > >  static struct kmem_cache *mm_cachep;
> > > > > >
> > > > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > > > -
> > > > > > -/* SLAB cache for vm_area_struct.lock */
> > > > > > -static struct kmem_cache *vma_lock_cachep;
> > > > > > -
> > > > > > -static bool vma_lock_alloc(struct vm_area_struct *vma)
> > > > > > -{
> > > > > > -       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> > > > > > -       if (!vma->vm_lock)
> > > > > > -               return false;
> > > > > > -
> > > > > > -       init_rwsem(&vma->vm_lock->lock);
> > > > > > -       vma->vm_lock_seq = -1;
> > > > > > -
> > > > > > -       return true;
> > > > > > -}
> > > > > > -
> > > > > > -static inline void vma_lock_free(struct vm_area_struct *vma)
> > > > > > -{
> > > > > > -       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> > > > > > -}
> > > > > > -
> > > > > > -#else /* CONFIG_PER_VMA_LOCK */
> > > > > > -
> > > > > > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> > > > > > -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> > > > > > -
> > > > > > -#endif /* CONFIG_PER_VMA_LOCK */
> > > > > > -
> > > > > >  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > > >  {
> > > > > >         struct vm_area_struct *vma;
> > > > > > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > > >                 return NULL;
> > > > > >
> > > > > >         vma_init(vma, mm);
> > > > > > -       if (!vma_lock_alloc(vma)) {
> > > > > > -               kmem_cache_free(vm_area_cachep, vma);
> > > > > > -               return NULL;
> > > > > > -       }
> > > > > >
> > > > > >         return vma;
> > > > > >  }
> > > > > > @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > > > > >          * will be reinitialized.
> > > > > >          */
> > > > > >         data_race(memcpy(new, orig, sizeof(*new)));
> > > > > > -       if (!vma_lock_alloc(new)) {
> > > > > > -               kmem_cache_free(vm_area_cachep, new);
> > > > > > -               return NULL;
> > > > > > -       }
> > > > > > +       init_rwsem(&new->vm_lock);
> > > > > > +       new->vm_lock_seq = -1;
> > > > > >         INIT_LIST_HEAD(&new->anon_vma_chain);
> > > > > >         vma_numab_state_init(new);
> > > > > >         dup_anon_vma_name(orig, new);
> > > > > > @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
> > > > > >  {
> > > > > >         vma_numab_state_free(vma);
> > > > > >         free_anon_vma_name(vma);
> > > > > > -       vma_lock_free(vma);
> > > > > >         kmem_cache_free(vm_area_cachep, vma);
> > > > > >  }
> > > > > >
> > > > > > @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
> > > > > >                                                   vm_rcu);
> > > > > >
> > > > > >         /* The vma should not be locked while being destroyed. */
> > > > > > -       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> > > > > > +       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
> > > > > >         __vm_area_free(vma);
> > > > > >  }
> > > > > >  #endif
> > > > > > @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
> > > > > >                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > > >                         NULL);
> > > > > >
> > > > > > -       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> > > > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > > > -       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> > > > > > -#endif
> > > > > > +       vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
> > > > > >         mmap_init();
> > > > > >         nsproxy_cache_init();
> > > > > >  }
> > > > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > > > > index 3b7715ecf292..e95ecb2063d2 100644
> > > > > > --- a/mm/userfaultfd.c
> > > > > > +++ b/mm/userfaultfd.c
> > > > > > @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
> > > > > >                  * mmap_lock, which guarantees that nobody can lock the
> > > > > >                  * vma for write (vma_start_write()) under us.
> > > > > >                  */
> > > > > > -               down_read(&vma->vm_lock->lock);
> > > > > > +               down_read(&vma->vm_lock);
> > > > > >         }
> > > > > >
> > > > > >         mmap_read_unlock(mm);
> > > > > > @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
> > > > > >                  * See comment in uffd_lock_vma() as to why not using
> > > > > >                  * vma_start_read() here.
> > > > > >                  */
> > > > > > -               down_read(&(*dst_vmap)->vm_lock->lock);
> > > > > > +               down_read(&(*dst_vmap)->vm_lock);
> > > > > >                 if (*dst_vmap != *src_vmap)
> > > > > > -                       down_read_nested(&(*src_vmap)->vm_lock->lock,
> > > > > > +                       down_read_nested(&(*src_vmap)->vm_lock,
> > > > > >                                          SINGLE_DEPTH_NESTING);
> > > > > >         }
> > > > > >         mmap_read_unlock(mm);
> > > > > > --
> > > > > > 2.43.0
> > > > > >
> > > >
> > > >
> > > >
> > > > --
> > > > Mateusz Guzik <mjguzik gmail.com>
>
>
>
> --
> Mateusz Guzik <mjguzik gmail.com>
Vlastimil Babka Aug. 9, 2024, 3:09 p.m. UTC | #7
On 8/9/24 05:57, Suren Baghdasaryan wrote:
> Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:

I kinda doubt the NUMA aspect. Whether you allocate a vma that embeds a
lock, or a vma and immediately the separate lock, it's unlikely they would
end up on different nodes so from the NUMA perspective I don't see a
difference. And if they ended up on separate nodes, it would more likely be
worse for the case of separate locks, not better.
Suren Baghdasaryan Aug. 9, 2024, 4:56 p.m. UTC | #8
On Fri, Aug 9, 2024 at 3:09 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 8/9/24 05:57, Suren Baghdasaryan wrote:
> > Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:
>
> I kinda doubt the NUMA aspect. Whether you allocate a vma that embeds a
> lock, or a vma and immediately the separate lock, it's unlikely they would
> end up on different nodes so from the NUMA perspective I don't see a
> difference. And if they ended up on separate nodes, it would more likely be
> worse for the case of separate locks, not better.

I have an UMA machine. Will try the test there as well. It won't
provide hard proof but at least some possible hints.
Suren Baghdasaryan Aug. 11, 2024, 10:50 p.m. UTC | #9
On Fri, Aug 9, 2024 at 9:56 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Aug 9, 2024 at 3:09 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 8/9/24 05:57, Suren Baghdasaryan wrote:
> > > Maybe it has something to do with NUMA? The system I'm running has 2 NUMA nodes:
> >
> > I kinda doubt the NUMA aspect. Whether you allocate a vma that embeds a
> > lock, or a vma and immediately the separate lock, it's unlikely they would
> > end up on different nodes so from the NUMA perspective I don't see a
> > difference. And if they ended up on separate nodes, it would more likely be
> > worse for the case of separate locks, not better.
>
> I have an UMA machine. Will try the test there as well. It won't
> provide hard proof but at least some possible hints.

Ok, disabling adjacent cacheline prefetching seems to do the trick (or
at least cuts down the regression drastically):

Hmean     faults/cpu-1    470577.6434 (   0.00%)   470745.2649 *   0.04%*
Hmean     faults/cpu-4    445862.9701 (   0.00%)   445572.2252 *  -0.07%*
Hmean     faults/cpu-7    422516.4002 (   0.00%)   422677.5591 *   0.04%*
Hmean     faults/cpu-12   344483.7047 (   0.00%)   330476.7911 *  -4.07%*
Hmean     faults/cpu-21   192836.0188 (   0.00%)   195266.8071 *   1.26%*
Hmean     faults/cpu-30   140745.9472 (   0.00%)   140655.0459 *  -0.06%*
Hmean     faults/cpu-48   110507.4310 (   0.00%)   103802.1839 *  -6.07%*
Hmean     faults/cpu-56    93507.7919 (   0.00%)    95105.1875 *   1.71%*
Hmean     faults/sec-1    470232.3887 (   0.00%)   470404.6525 *   0.04%*
Hmean     faults/sec-4   1757368.9266 (   0.00%)  1752852.8697 *  -0.26%*
Hmean     faults/sec-7   2909554.8150 (   0.00%)  2915885.8739 *   0.22%*
Hmean     faults/sec-12  4033840.8719 (   0.00%)  3845165.3277 *  -4.68%*
Hmean     faults/sec-21  3845857.7079 (   0.00%)  3890316.8799 *   1.16%*
Hmean     faults/sec-30  3838607.4530 (   0.00%)  3838861.8142 *   0.01%*
Hmean     faults/sec-48  4882118.9701 (   0.00%)  4608985.0530 *  -5.59%*
Hmean     faults/sec-56  4933535.7567 (   0.00%)  5004208.3329 *   1.43%*

Now, how do we disable prefetching extra cachelines for vm_area_structs only?
Mateusz Guzik Aug. 12, 2024, 4:29 a.m. UTC | #10
On Mon, Aug 12, 2024 at 12:50 AM Suren Baghdasaryan <surenb@google.com> wrote:
> Ok, disabling adjacent cacheline prefetching seems to do the trick (or
> at least cuts down the regression drastically):
>
> Hmean     faults/cpu-1    470577.6434 (   0.00%)   470745.2649 *   0.04%*
> Hmean     faults/cpu-4    445862.9701 (   0.00%)   445572.2252 *  -0.07%*
> Hmean     faults/cpu-7    422516.4002 (   0.00%)   422677.5591 *   0.04%*
> Hmean     faults/cpu-12   344483.7047 (   0.00%)   330476.7911 *  -4.07%*
> Hmean     faults/cpu-21   192836.0188 (   0.00%)   195266.8071 *   1.26%*
> Hmean     faults/cpu-30   140745.9472 (   0.00%)   140655.0459 *  -0.06%*
> Hmean     faults/cpu-48   110507.4310 (   0.00%)   103802.1839 *  -6.07%*
> Hmean     faults/cpu-56    93507.7919 (   0.00%)    95105.1875 *   1.71%*
> Hmean     faults/sec-1    470232.3887 (   0.00%)   470404.6525 *   0.04%*
> Hmean     faults/sec-4   1757368.9266 (   0.00%)  1752852.8697 *  -0.26%*
> Hmean     faults/sec-7   2909554.8150 (   0.00%)  2915885.8739 *   0.22%*
> Hmean     faults/sec-12  4033840.8719 (   0.00%)  3845165.3277 *  -4.68%*
> Hmean     faults/sec-21  3845857.7079 (   0.00%)  3890316.8799 *   1.16%*
> Hmean     faults/sec-30  3838607.4530 (   0.00%)  3838861.8142 *   0.01%*
> Hmean     faults/sec-48  4882118.9701 (   0.00%)  4608985.0530 *  -5.59%*
> Hmean     faults/sec-56  4933535.7567 (   0.00%)  5004208.3329 *   1.43%*
>
> Now, how do we disable prefetching extra cachelines for vm_area_structs only?

I'm unaware of any mechanism of the sort.

The good news is that Broadwell is an old yeller and if memory serves
right the impact is not anywhere near this bad on newer
microarchitectures, making "merely" 64 alignment (used all over in the
kernel for amd64) a practical choice (not just for vma).

Also note that in your setup you are losing out on performance in
other multithreaded cases, unrelated to anything vma.

That aside as I mentioned earlier the dedicated vma lock cache results
in false sharing between separate vmas, except this particular
benchmark does not test for it (which in your setup should be visible
even if the cache grows the  SLAB_HWCACHE_ALIGN flag).

I think the thing to do here is to bench on other cpus and ignore the
Broadwell + adjacent cache line prefetcher result if they come back
fine -- the code should not be held hostage by an old yeller.

To that end I think it would be best to ask the LKP folks at Intel.
They are very approachable so there should be no problem arranging it
provided they have some spare capacity. I believe grabbing the From
person and the cc list from this thread will do it:
https://lore.kernel.org/oe-lkp/ZriCbCPF6I0JnbKi@xsang-OptiPlex-9020/ .
By default they would run their own suite, which presumably has some
overlap with this particular benchmark in terms of generated workload
(but I don't think they run *this* particular benchmark itself,
perhaps it would make sense to ask them to add it?). It's your call
here.

If there are still problems and the lock needs to remain separate, the
bare minimum damage-controlling measure would be to hwalign the vma
lock cache -- it won't affect the pft benchmark, but it should help
others.

Should the decision be to bring the lock back into the struct, I'll
note my patch is merely slapped together to a state where it can be
benchmarked and I have no interest in beating it into a committable
shape. You stated you already had an equivalent (modulo keeping
something in a space previously occupied by the pointer to the vma
lock), so as far as I'm concerned you can submit that with your
authorship.
Suren Baghdasaryan Aug. 12, 2024, 3:27 p.m. UTC | #11
On Sun, Aug 11, 2024 at 9:29 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Mon, Aug 12, 2024 at 12:50 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > Ok, disabling adjacent cacheline prefetching seems to do the trick (or
> > at least cuts down the regression drastically):
> >
> > Hmean     faults/cpu-1    470577.6434 (   0.00%)   470745.2649 *   0.04%*
> > Hmean     faults/cpu-4    445862.9701 (   0.00%)   445572.2252 *  -0.07%*
> > Hmean     faults/cpu-7    422516.4002 (   0.00%)   422677.5591 *   0.04%*
> > Hmean     faults/cpu-12   344483.7047 (   0.00%)   330476.7911 *  -4.07%*
> > Hmean     faults/cpu-21   192836.0188 (   0.00%)   195266.8071 *   1.26%*
> > Hmean     faults/cpu-30   140745.9472 (   0.00%)   140655.0459 *  -0.06%*
> > Hmean     faults/cpu-48   110507.4310 (   0.00%)   103802.1839 *  -6.07%*
> > Hmean     faults/cpu-56    93507.7919 (   0.00%)    95105.1875 *   1.71%*
> > Hmean     faults/sec-1    470232.3887 (   0.00%)   470404.6525 *   0.04%*
> > Hmean     faults/sec-4   1757368.9266 (   0.00%)  1752852.8697 *  -0.26%*
> > Hmean     faults/sec-7   2909554.8150 (   0.00%)  2915885.8739 *   0.22%*
> > Hmean     faults/sec-12  4033840.8719 (   0.00%)  3845165.3277 *  -4.68%*
> > Hmean     faults/sec-21  3845857.7079 (   0.00%)  3890316.8799 *   1.16%*
> > Hmean     faults/sec-30  3838607.4530 (   0.00%)  3838861.8142 *   0.01%*
> > Hmean     faults/sec-48  4882118.9701 (   0.00%)  4608985.0530 *  -5.59%*
> > Hmean     faults/sec-56  4933535.7567 (   0.00%)  5004208.3329 *   1.43%*
> >
> > Now, how do we disable prefetching extra cachelines for vm_area_structs only?
>
> I'm unaware of any mechanism of the sort.
>
> The good news is that Broadwell is an old yeller and if memory serves
> right the impact is not anywhere near this bad on newer
> microarchitectures, making "merely" 64 alignment (used all over in the
> kernel for amd64) a practical choice (not just for vma).

That's indeed good news if other archs are not that sensitive to this.

>
> Also note that in your setup you are losing out on performance in
> other multithreaded cases, unrelated to anything vma.
>
> That aside as I mentioned earlier the dedicated vma lock cache results
> in false sharing between separate vmas, except this particular
> benchmark does not test for it (which in your setup should be visible
> even if the cache grows the  SLAB_HWCACHE_ALIGN flag).

When implementing VMA locks I did experiment with SLAB_HWCACHE_ALIGN
for vm_lock cache using different benchmarks and didn't see
improvements above noise level. Do you know of some specific benchmark
that would possibly show improvement?

>
> I think the thing to do here is to bench on other cpus and ignore the
> Broadwell + adjacent cache line prefetcher result if they come back
> fine -- the code should not be held hostage by an old yeller.

That sounds like a good idea. Mel Gorman first reported this
regression when I was developing VMA locks and I believe he has a farm
of different machines to run mmtests on. CC'ing Mel.

Mel, would you be able to run PFT tests with the patch at
https://lore.kernel.org/all/20240808185949.1094891-1-mjguzik@gmail.com/
vs baseline on your farm? The goal is to see if any architecture other
than Broadwell shows performance regression.

>
> To that end I think it would be best to ask the LKP folks at Intel.
> They are very approachable so there should be no problem arranging it
> provided they have some spare capacity. I believe grabbing the From
> person and the cc list from this thread will do it:
> https://lore.kernel.org/oe-lkp/ZriCbCPF6I0JnbKi@xsang-OptiPlex-9020/ .
> By default they would run their own suite, which presumably has some
> overlap with this particular benchmark in terms of generated workload
> (but I don't think they run *this* particular benchmark itself,
> perhaps it would make sense to ask them to add it?). It's your call
> here.

Thanks for the suggestion. Let's see if Mel can use his farm first and
then we'll ask the Intel folks.

>
> If there are still problems and the lock needs to remain separate, the
> bare minimum damage-controlling measure would be to hwalign the vma
> lock cache -- it won't affect the pft benchmark, but it should help
> others.

Sure, but I'll need to measure the improvement and for that I need a
benchmark or a workload. Any suggestions?

>
> Should the decision be to bring the lock back into the struct, I'll
> note my patch is merely slapped together to a state where it can be
> benchmarked and I have no interest in beating it into a committable
> shape. You stated you already had an equivalent (modulo keeping
> something in a space previously occupied by the pointer to the vma
> lock), so as far as I'm concerned you can submit that with your
> authorship.

Thanks! If we end up doing that I'll keep you as Suggested-by and will
add a link to this thread.
Thanks,
Suren.

> --
> Mateusz Guzik <mjguzik gmail.com>
Mateusz Guzik Aug. 12, 2024, 4:30 p.m. UTC | #12
On Mon, Aug 12, 2024 at 5:27 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Sun, Aug 11, 2024 at 9:29 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > That aside as I mentioned earlier the dedicated vma lock cache results
> > in false sharing between separate vmas, except this particular
> > benchmark does not test for it (which in your setup should be visible
> > even if the cache grows the  SLAB_HWCACHE_ALIGN flag).
>
> When implementing VMA locks I did experiment with SLAB_HWCACHE_ALIGN
> for vm_lock cache using different benchmarks and didn't see
> improvements above noise level. Do you know of some specific benchmark
> that would possibly show improvement?
>

I don't know of anything specific; I'm saying that basic multicore
hygiene says these locks need to land in dedicated cachelines.

Consider the following: struct rw_semaphore is 40 bytes and the word
modified in the lock/unlock fast path is at offset 0.

I don't know how much waste there is in the allocator, but if it is
anything less than 24 bytes (which will obviously be the case) there
will be massive false sharing: 24 bytes of the *second* lock land in
the same cacheline as the first one, including the word which is
modified in the fast path. IOW, locks allocated this way are
guaranteed to keep bouncing.
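
If you want to eyeball the numbers straight from the debug info
(assuming pahole and a vmlinux built with DWARF are at hand):

# pahole -C rw_semaphore vmlinux
# pahole -C vma_lock vmlinux

which should show the 40-byte size and the fast path word at offset 0
mentioned above.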

I don't believe any effort is warranted to try to find a real scenario
with this problem, or to synthetically construct one.

> > If there are still problems and the lock needs to remain separate, the
> > bare minimum damage-controlling measure would be to hwalign the vma
> > lock cache -- it wont affect the pts benchmark, but it should help
> > others.
>
> Sure, but I'll need to measure the improvement and for that I need a
> benchmark or a workload. Any suggestions?
>

I believe I addressed this above.

If there is an individual who in your opinion is going to protest such
a patch on the grounds that no benchmark is being provided, I can give
them a talking to.

Even then, it may be that this bit won't be applicable anyway, so...

> >
> > Should the decision be to bring the lock back into the struct, I'll
> > note my patch is merely slapped together to a state where it can be
> > benchmarked and I have no interest in beating it into a committable
> > shape. You stated you already had an equivalent (modulo keeping
> > something in a space previously occupied by the pointer to the vma
> > lock), so as far as I'm concerned you can submit that with your
> > authorship.
>
> Thanks! If we end up doing that I'll keep you as Suggested-by and will
> add a link to this thread.

sgtm
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43b40334e9b2..6d8b668d3deb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -687,7 +687,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
 		return false;
 
 	/*
@@ -702,7 +702,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
 	if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->vm_lock->lock);
+		up_read(&vma->vm_lock);
 		return false;
 	}
 	return true;
@@ -711,7 +711,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->vm_lock->lock);
+	up_read(&vma->vm_lock);
 	rcu_read_unlock();
 }
 
@@ -740,7 +740,7 @@  static inline void vma_start_write(struct vm_area_struct *vma)
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	down_write(&vma->vm_lock->lock);
+	down_write(&vma->vm_lock);
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
 	 * from the early lockless pessimistic check in vma_start_read().
@@ -748,7 +748,7 @@  static inline void vma_start_write(struct vm_area_struct *vma)
 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-	up_write(&vma->vm_lock->lock);
+	up_write(&vma->vm_lock);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -760,7 +760,7 @@  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 
 static inline void vma_assert_locked(struct vm_area_struct *vma)
 {
-	if (!rwsem_is_locked(&vma->vm_lock->lock))
+	if (!rwsem_is_locked(&vma->vm_lock))
 		vma_assert_write_locked(vma);
 }
 
@@ -827,10 +827,6 @@  static inline void assert_fault_locked(struct vm_fault *vmf)
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
 
-/*
- * WARNING: vma_init does not initialize vma->vm_lock.
- * Use vm_area_alloc()/vm_area_free() if vma needs locking.
- */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	memset(vma, 0, sizeof(*vma));
@@ -839,6 +835,8 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_mark_detached(vma, false);
 	vma_numab_state_init(vma);
+	init_rwsem(&vma->vm_lock);
+	vma->vm_lock_seq = -1;
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 003619fab20e..caffdb4eeb94 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -615,10 +615,6 @@  static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
 }
 #endif
 
-struct vma_lock {
-	struct rw_semaphore lock;
-};
-
 struct vma_numab_state {
 	/*
 	 * Initialised as time in 'jiffies' after which VMA
@@ -716,8 +712,7 @@  struct vm_area_struct {
 	 * slowpath.
 	 */
 	int vm_lock_seq;
-	/* Unstable RCU readers are allowed to read this. */
-	struct vma_lock *vm_lock;
+	void *vm_dummy;
 #endif
 
 	/*
@@ -770,6 +765,9 @@  struct vm_area_struct {
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
+#endif
 } __randomize_layout;
 
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 92bfe56c9fed..eab04a24d5f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -436,35 +436,6 @@  static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-#ifdef CONFIG_PER_VMA_LOCK
-
-/* SLAB cache for vm_area_struct.lock */
-static struct kmem_cache *vma_lock_cachep;
-
-static bool vma_lock_alloc(struct vm_area_struct *vma)
-{
-	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
-	if (!vma->vm_lock)
-		return false;
-
-	init_rwsem(&vma->vm_lock->lock);
-	vma->vm_lock_seq = -1;
-
-	return true;
-}
-
-static inline void vma_lock_free(struct vm_area_struct *vma)
-{
-	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
-}
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
-static inline void vma_lock_free(struct vm_area_struct *vma) {}
-
-#endif /* CONFIG_PER_VMA_LOCK */
-
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -474,10 +445,6 @@  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 		return NULL;
 
 	vma_init(vma, mm);
-	if (!vma_lock_alloc(vma)) {
-		kmem_cache_free(vm_area_cachep, vma);
-		return NULL;
-	}
 
 	return vma;
 }
@@ -496,10 +463,8 @@  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	 * will be reinitialized.
 	 */
 	data_race(memcpy(new, orig, sizeof(*new)));
-	if (!vma_lock_alloc(new)) {
-		kmem_cache_free(vm_area_cachep, new);
-		return NULL;
-	}
+	init_rwsem(&new->vm_lock);
+	new->vm_lock_seq = -1;
 	INIT_LIST_HEAD(&new->anon_vma_chain);
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
@@ -511,7 +476,6 @@  void __vm_area_free(struct vm_area_struct *vma)
 {
 	vma_numab_state_free(vma);
 	free_anon_vma_name(vma);
-	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -522,7 +486,7 @@  static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3192,10 +3156,7 @@  void __init proc_caches_init(void)
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
-#ifdef CONFIG_PER_VMA_LOCK
-	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
-#endif
+	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
 	mmap_init();
 	nsproxy_cache_init();
 }
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3b7715ecf292..e95ecb2063d2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -92,7 +92,7 @@  static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
 		 * mmap_lock, which guarantees that nobody can lock the
 		 * vma for write (vma_start_write()) under us.
 		 */
-		down_read(&vma->vm_lock->lock);
+		down_read(&vma->vm_lock);
 	}
 
 	mmap_read_unlock(mm);
@@ -1468,9 +1468,9 @@  static int uffd_move_lock(struct mm_struct *mm,
 		 * See comment in uffd_lock_vma() as to why not using
 		 * vma_start_read() here.
 		 */
-		down_read(&(*dst_vmap)->vm_lock->lock);
+		down_read(&(*dst_vmap)->vm_lock);
 		if (*dst_vmap != *src_vmap)
-			down_read_nested(&(*src_vmap)->vm_lock->lock,
+			down_read_nested(&(*src_vmap)->vm_lock,
 					 SINGLE_DEPTH_NESTING);
 	}
 	mmap_read_unlock(mm);