
memcg: rearrange fields of mem_cgroup_per_node

Message ID 20240523034824.1255719-1-shakeel.butt@linux.dev (mailing list archive)
State New
Series memcg: rearrange fields of mem_cgroup_per_node

Commit Message

Shakeel Butt May 23, 2024, 3:48 a.m. UTC
The kernel test robot reported [1] a performance regression in the
will-it-scale test suite's page_fault2 test case for commit 70a64b7919cb
("memcg: dynamically allocate lruvec_stats"). After inspection, it seems
the commit unintentionally introduced false cacheline sharing.

After the commit, the fields of mem_cgroup_per_node which get read on the
performance-critical path share a cacheline with the fields which get
updated often on LRU page allocations and deallocations. This causes
contention on that cacheline, and workloads which manipulate a lot of LRU
pages regress, as the test report shows.

The solution is to rearrange the fields of mem_cgroup_per_node so that the
false sharing is eliminated: move all the read-only pointers to the start
of the struct, followed by the memcg-v1-only fields, and place the fields
which get updated often at the end.
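
To illustrate the layout principle outside the kernel, here is an editorial
sketch (not part of the patch): a toy userspace program in which one thread
keeps re-reading a pointer field while another keeps updating a counter in
the same struct. With both fields on one 64-byte cacheline, the writer's
stores keep stealing the line from the reader; building with -DPAD pushes
the counter onto its own line, standing in for the rearranged field order.
The struct, the field names and the 64-byte line size are assumptions made
up for this illustration.

/* gcc -O2 -pthread [-DPAD] false_sharing_demo.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct node_info {
	int * volatile read_mostly_ptr;		/* re-read on the hot path */
#ifdef PAD
	char pad[64 - sizeof(int *)];		/* push the counter to the next line */
#endif
	volatile unsigned long hot_counter;	/* updated constantly */
} __attribute__((aligned(64)));			/* keep the struct line-aligned */

static struct node_info info;
static int backing = 42;
static volatile long sink;

static void *reader(void *arg)
{
	(void)arg;
	for (long i = 0; i < 200000000L; i++)
		sink += *info.read_mostly_ptr;	/* volatile load hits the shared line */
	return NULL;
}

static void *writer(void *arg)
{
	(void)arg;
	for (long i = 0; i < 200000000L; i++)
		info.hot_counter++;		/* store dirties the shared line */
	return NULL;
}

int main(void)
{
	pthread_t r, w;
	struct timespec t0, t1;

	info.read_mostly_ptr = &backing;
	clock_gettime(CLOCK_MONOTONIC, &t0);
	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("counter=%lu elapsed=%.2fs\n", info.hot_counter,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}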

Experiment setup: ran fallocate1, fallocate2, page_fault1, page_fault2
and page_fault3 from the will-it-scale test suite inside a three-level-deep
memcg with /tmp mounted as tmpfs, on two different machines: one with a
single NUMA node and one with two nodes.

 $ ./[testcase]_processes -t $NR_CPUS -s 50

Results for single node, 52 CPU machine:

Testcase        base        with-patch

fallocate1      1031081     1431291  (38.80 %)
fallocate2      1029993     1421421  (38.00 %)
page_fault1     2269440     3405788  (50.07 %)
page_fault2     2375799     3572868  (50.30 %)
page_fault3     28641143    28673950 ( 0.11 %)

Results for dual node, 80 CPU machine:

Testcase        base        with-patch

fallocate1      2976288     3641185  (22.33 %)
fallocate2      2979366     3638181  (22.11 %)
page_fault1     6221790     7748245  (24.53 %)
page_fault2     6482854     7847698  (21.05 %)
page_fault3     28804324    28991870 ( 0.65 %)

Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202405171353.b56b845-oliver.sang@intel.com
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/memcontrol.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

Comments

Yosry Ahmed May 23, 2024, 4:35 a.m. UTC | #1
On Wed, May 22, 2024 at 8:48 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> The kernel test robot reported [1] a performance regression in the
> will-it-scale test suite's page_fault2 test case for commit 70a64b7919cb
> ("memcg: dynamically allocate lruvec_stats"). After inspection, it seems
> the commit unintentionally introduced false cacheline sharing.
>
> After the commit, the fields of mem_cgroup_per_node which get read on the
> performance-critical path share a cacheline with the fields which get
> updated often on LRU page allocations and deallocations. This causes
> contention on that cacheline, and workloads which manipulate a lot of LRU
> pages regress, as the test report shows.
>
> The solution is to rearrange the fields of mem_cgroup_per_node so that the
> false sharing is eliminated: move all the read-only pointers to the start
> of the struct, followed by the memcg-v1-only fields, and place the fields
> which get updated often at the end.
>
> Experiment setup: ran fallocate1, fallocate2, page_fault1, page_fault2
> and page_fault3 from the will-it-scale test suite inside a three-level-deep
> memcg with /tmp mounted as tmpfs, on two different machines: one with a
> single NUMA node and one with two nodes.
>
>  $ ./[testcase]_processes -t $NR_CPUS -s 50
>
> Results for single node, 52 CPU machine:
>
> Testcase        base        with-patch
>
> fallocate1      1031081     1431291  (38.80 %)
> fallocate2      1029993     1421421  (38.00 %)
> page_fault1     2269440     3405788  (50.07 %)
> page_fault2     2375799     3572868  (50.30 %)
> page_fault3     28641143    28673950 ( 0.11 %)
>
> Results for dual node, 80 CPU machine:
>
> Testcase        base        with-patch
>
> fallocate1      2976288     3641185  (22.33 %)
> fallocate2      2979366     3638181  (22.11 %)
> page_fault1     6221790     7748245  (24.53 %)
> page_fault2     6482854     7847698  (21.05 %)
> page_fault3     28804324    28991870 ( 0.65 %)

Great analysis :)

>
> Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats")
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202405171353.b56b845-oliver.sang@intel.com
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>  include/linux/memcontrol.h | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 030d34e9d117..16efd9737be9 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -96,23 +96,25 @@ struct mem_cgroup_reclaim_iter {
>   * per-node information in memory controller.
>   */
>  struct mem_cgroup_per_node {
> -       struct lruvec           lruvec;
> +       /* Keep the read-only fields at the start */
> +       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> +                                               /* use container_of        */
>
>         struct lruvec_stats_percpu __percpu     *lruvec_stats_percpu;
>         struct lruvec_stats                     *lruvec_stats;
> -
> -       unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
> -
> -       struct mem_cgroup_reclaim_iter  iter;
> -
>         struct shrinker_info __rcu      *shrinker_info;
>
> +       /* memcg-v1 only stuff in middle */
> +
>         struct rb_node          tree_node;      /* RB tree node */
>         unsigned long           usage_in_excess;/* Set to the value by which */
>                                                 /* the soft limit is exceeded*/
>         bool                    on_tree;
> -       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> -                                               /* use container_of        */

Do we need CACHELINE_PADDING() here (or maybe make struct lruvec
cache-aligned) to make sure the false cacheline sharing doesn't happen
again with the fields below, or is the idea that the fields that get
read in hot paths (memcg, lruvec_stats_percpu, lruvec_stats) are far
at the top, and the memcg v1 elements in the middle act as a buffer?

IOW, is sharing between the fields below and memcg v1 fields okay
because they are not read in the hot path? If yes, I believe it's
worth a comment. It can be easily missed if the memcg v1 soft limit is
removed later for example.
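
For reference, a purely illustrative sketch of the two options mentioned
here, using CACHELINE_PADDING() and ____cacheline_aligned_in_smp from
<linux/cache.h>. It is not from any posted patch; the memcg-v1 fields are
omitted and the exact placement is an assumption.

/* Option 1: explicit padding between the read-mostly pointers and the
 * frequently-updated fields, so the hot stores never land on the same
 * cacheline as the pointers, even if the v1 fields go away later.
 */
struct mem_cgroup_per_node {
	struct mem_cgroup	*memcg;
	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
	struct lruvec_stats			*lruvec_stats;
	struct shrinker_info __rcu	*shrinker_info;

	CACHELINE_PADDING(_pad1_);

	struct lruvec		lruvec;
	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
	struct mem_cgroup_reclaim_iter	iter;
};

/* Option 2: no explicit padding; cacheline-align the first
 * frequently-updated member instead, which implicitly pads the struct up
 * to a cacheline boundary before it:
 *
 *	struct lruvec		lruvec ____cacheline_aligned_in_smp;
 */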

> +
> +       /* Fields which get updated often at the end. */
> +       struct lruvec           lruvec;
> +       unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
> +       struct mem_cgroup_reclaim_iter  iter;
>  };
>
>  struct mem_cgroup_threshold {
> --
> 2.43.0
>
Shakeel Butt May 23, 2024, 5:34 a.m. UTC | #2
On Wed, May 22, 2024 at 09:35:57PM -0700, Yosry Ahmed wrote:
> On Wed, May 22, 2024 at 8:48 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
[...]
> >
> >  struct mem_cgroup_per_node {
> > -       struct lruvec           lruvec;
> > +       /* Keep the read-only fields at the start */
> > +       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > +                                               /* use container_of        */
> >
> >         struct lruvec_stats_percpu __percpu     *lruvec_stats_percpu;
> >         struct lruvec_stats                     *lruvec_stats;
> > -
> > -       unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
> > -
> > -       struct mem_cgroup_reclaim_iter  iter;
> > -
> >         struct shrinker_info __rcu      *shrinker_info;
> >
> > +       /* memcg-v1 only stuff in middle */
> > +
> >         struct rb_node          tree_node;      /* RB tree node */
> >         unsigned long           usage_in_excess;/* Set to the value by which */
> >                                                 /* the soft limit is exceeded*/
> >         bool                    on_tree;
> > -       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > -                                               /* use container_of        */
> 
> Do we need CACHELINE_PADDING() here (or maybe make struct lruvec
> cache-aligned) to make sure the false cacheline sharing doesn't happen
> again with the fields below, or is the idea that the fields that get
> read in hot paths (memcg, lruvec_stats_percpu, lruvec_stats) are far
> at the top, and the memcg v1 elements in the middle act as a buffer?
> 
> IOW, is sharing between the fields below and memcg v1 fields okay
> because they are not read in the hot path? If yes, I believe it's
> worth a comment. It can be easily missed if the memcg v1 soft limit is
> removed later for example.
> 

For 6.10, I wanted to keep the change simple, and yes, the idea is to use
the memcg v1 stuff as a buffer between the pointers and the
lruvec/lru_zone_size fields. For
6.11 or later kernels, I am planning to use some asserts to make sure
these fields don't share a cacheline, so later when we remove the
v1-only stuff, the asserts will make sure we keep the separate cacheline
property intact.
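
A sketch of the kind of compile-time check described here (illustrative
only, not from any posted patch): it could live right after the struct
definition in memcontrol.h. The CACHELINE_OF() helper is made up for this
sketch, and it assumes mem_cgroup_per_node allocations are cacheline
aligned.

#include <linux/build_bug.h>	/* static_assert() */
#include <linux/cache.h>	/* L1_CACHE_BYTES */
#include <linux/stddef.h>	/* offsetof() */

/* Which cacheline, relative to the start of the struct, a member falls on. */
#define CACHELINE_OF(type, member) \
	(offsetof(type, member) / L1_CACHE_BYTES)

/*
 * Fail the build if a read-mostly pointer ever shares a cacheline with the
 * frequently-updated lruvec, e.g. after the memcg-v1 "buffer" fields in the
 * middle are removed or compiled out.
 */
static_assert(CACHELINE_OF(struct mem_cgroup_per_node, lruvec_stats) !=
	      CACHELINE_OF(struct mem_cgroup_per_node, lruvec),
	      "read-mostly and frequently-updated fields share a cacheline");
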
Yosry Ahmed May 23, 2024, 5:52 a.m. UTC | #3
On Wed, May 22, 2024 at 10:34 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, May 22, 2024 at 09:35:57PM -0700, Yosry Ahmed wrote:
> > On Wed, May 22, 2024 at 8:48 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> [...]
> > >
> > >  struct mem_cgroup_per_node {
> > > -       struct lruvec           lruvec;
> > > +       /* Keep the read-only fields at the start */
> > > +       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > > +                                               /* use container_of        */
> > >
> > >         struct lruvec_stats_percpu __percpu     *lruvec_stats_percpu;
> > >         struct lruvec_stats                     *lruvec_stats;
> > > -
> > > -       unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
> > > -
> > > -       struct mem_cgroup_reclaim_iter  iter;
> > > -
> > >         struct shrinker_info __rcu      *shrinker_info;
> > >
> > > +       /* memcg-v1 only stuff in middle */
> > > +
> > >         struct rb_node          tree_node;      /* RB tree node */
> > >         unsigned long           usage_in_excess;/* Set to the value by which */
> > >                                                 /* the soft limit is exceeded*/
> > >         bool                    on_tree;
> > > -       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > > -                                               /* use container_of        */
> >
> > Do we need CACHELINE_PADDING() here (or maybe make struct lruvec
> > cache-aligned) to make sure the false cacheline sharing doesn't happen
> > again with the fields below, or is the idea that the fields that get
> > read in hot paths (memcg, lruvec_stats_percpu, lruvec_stats) are far
> > at the top, and the memcg v1 elements in the middle act as a buffer?
> >
> > IOW, is sharing between the fields below and memcg v1 fields okay
> > because they are not read in the hot path? If yes, I believe it's
> > worth a comment. It can be easily missed if the memcg v1 soft limit is
> > removed later for example.
> >
>
> For 6.10, I wanted to keep the change simple and yes, the memcg v1 stuff
> as a buffer between the pointers and lruvec/lru_zone_size fields. For

Fair enough, could we update the comment to explicitly mention this?

> 6.11 or later kernels, I am planning to use some asserts to make sure
> these fields don't share a cacheline, so later when we remove the
> v1-only stuff, the asserts will make sure we keep the separate cacheline
> property intact.
>

Makes sense to me.

With the comment update, feel free to add:
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Roman Gushchin May 23, 2024, 3:30 p.m. UTC | #4
On Wed, May 22, 2024 at 10:34:38PM -0700, Shakeel Butt wrote:
> On Wed, May 22, 2024 at 09:35:57PM -0700, Yosry Ahmed wrote:
> > On Wed, May 22, 2024 at 8:48 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> [...]
> > >
> > >  struct mem_cgroup_per_node {
> > > -       struct lruvec           lruvec;
> > > +       /* Keep the read-only fields at the start */
> > > +       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > > +                                               /* use container_of        */
> > >
> > >         struct lruvec_stats_percpu __percpu     *lruvec_stats_percpu;
> > >         struct lruvec_stats                     *lruvec_stats;
> > > -
> > > -       unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
> > > -
> > > -       struct mem_cgroup_reclaim_iter  iter;
> > > -
> > >         struct shrinker_info __rcu      *shrinker_info;
> > >
> > > +       /* memcg-v1 only stuff in middle */
> > > +
> > >         struct rb_node          tree_node;      /* RB tree node */
> > >         unsigned long           usage_in_excess;/* Set to the value by which */
> > >                                                 /* the soft limit is exceeded*/
> > >         bool                    on_tree;
> > > -       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
> > > -                                               /* use container_of        */
> > 
> > Do we need CACHELINE_PADDING() here (or maybe make struct lruvec
> > cache-aligned) to make sure the false cacheline sharing doesn't happen
> > again with the fields below, or is the idea that the fields that get
> > read in hot paths (memcg, lruvec_stats_percpu, lruvec_stats) are far
> > at the top, and the memcg v1 elements in the middle act as a buffer?

It's a good point. Once we compile out the memcg v1 stuff, this might
stop working.

> > 
> > IOW, is sharing between the fields below and memcg v1 fields okay
> > because they are not read in the hot path? If yes, I believe it's
> > worth a comment. It can be easily missed if the memcg v1 soft limit is
> > removed later for example.
> > 
> 
> For 6.10, I wanted to keep the change simple and yes, the memcg v1 stuff
> as a buffer between the pointers and lruvec/lru_zone_size fields. For
> 6.11 or later kernels, I am planning to use some asserts to make sure
> these fields don't share a cacheline, so later when we remove the
> v1-only stuff, the asserts will make sure we keep the separate cacheline
> property intact.

Sounds good. Once we have the memcg v1 stuff under a config option, we'll
put those asserts in.
Btw, I'm about to post the memcg-v1 separation patchset (today/tomorrow),
so it won't take long.
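
Purely as an illustration of that direction (not from the separation
patchset, and CONFIG_MEMCG_V1 is an assumed name for the config option):
once the v1-only fields can be compiled out, the "buffer" between the
read-mostly pointers and the hot fields can vanish, so the padding would
need to become explicit, e.g.:

struct mem_cgroup_per_node {
	/* Keep the read-only fields at the start */
	struct mem_cgroup	*memcg;
	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
	struct lruvec_stats			*lruvec_stats;
	struct shrinker_info __rcu	*shrinker_info;

#ifdef CONFIG_MEMCG_V1
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long		usage_in_excess;
	bool			on_tree;
#endif
	CACHELINE_PADDING(_pad1_);

	/* Fields which get updated often at the end. */
	struct lruvec		lruvec;
	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
	struct mem_cgroup_reclaim_iter	iter;
};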

Thanks!
Roman Gushchin May 23, 2024, 3:34 p.m. UTC | #5
On Wed, May 22, 2024 at 08:48:24PM -0700, Shakeel Butt wrote:
> The kernel test robot reported [1] a performance regression in the
> will-it-scale test suite's page_fault2 test case for commit 70a64b7919cb
> ("memcg: dynamically allocate lruvec_stats"). After inspection, it seems
> the commit unintentionally introduced false cacheline sharing.
>
> After the commit, the fields of mem_cgroup_per_node which get read on the
> performance-critical path share a cacheline with the fields which get
> updated often on LRU page allocations and deallocations. This causes
> contention on that cacheline, and workloads which manipulate a lot of LRU
> pages regress, as the test report shows.
>
> The solution is to rearrange the fields of mem_cgroup_per_node so that the
> false sharing is eliminated: move all the read-only pointers to the start
> of the struct, followed by the memcg-v1-only fields, and place the fields
> which get updated often at the end.
>
> Experiment setup: ran fallocate1, fallocate2, page_fault1, page_fault2
> and page_fault3 from the will-it-scale test suite inside a three-level-deep
> memcg with /tmp mounted as tmpfs, on two different machines: one with a
> single NUMA node and one with two nodes.
> 
>  $ ./[testcase]_processes -t $NR_CPUS -s 50
> 
> Results for single node, 52 CPU machine:
> 
> Testcase        base        with-patch
> 
> fallocate1      1031081     1431291  (38.80 %)
> fallocate2      1029993     1421421  (38.00 %)
> page_fault1     2269440     3405788  (50.07 %)
> page_fault2     2375799     3572868  (50.30 %)
> page_fault3     28641143    28673950 ( 0.11 %)
> 
> Results for dual node, 80 CPU machine:
> 
> Testcase        base        with-patch
> 
> fallocate1      2976288     3641185  (22.33 %)
> fallocate2      2979366     3638181  (22.11 %)
> page_fault1     6221790     7748245  (24.53 %)
> page_fault2     6482854     7847698  (21.05 %)
> page_fault3     28804324    28991870 ( 0.65 %)
> 
> Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats")
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202405171353.b56b845-oliver.sang@intel.com
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks!

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 030d34e9d117..16efd9737be9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -96,23 +96,25 @@  struct mem_cgroup_reclaim_iter {
  * per-node information in memory controller.
  */
 struct mem_cgroup_per_node {
-	struct lruvec		lruvec;
+	/* Keep the read-only fields at the start */
+	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
+						/* use container_of	   */
 
 	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
 	struct lruvec_stats			*lruvec_stats;
-
-	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
-
-	struct mem_cgroup_reclaim_iter	iter;
-
 	struct shrinker_info __rcu	*shrinker_info;
 
+	/* memcg-v1 only stuff in middle */
+
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	bool			on_tree;
-	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
-						/* use container_of	   */
+
+	/* Fields which get updated often at the end. */
+	struct lruvec		lruvec;
+	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
+	struct mem_cgroup_reclaim_iter	iter;
 };
 
 struct mem_cgroup_threshold {