diff mbox series

[2/3] mm: page_counter: rearrange struct page_counter fields

Message ID 20220822001737.4120417-3-shakeelb@google.com (mailing list archive)
State New
Headers show
Series memcg: optimizatize charge codepath | expand

Commit Message

Shakeel Butt Aug. 22, 2022, 12:17 a.m. UTC
With memcg v2 enabled, memcg->memory.usage is a very hot member for
the workloads doing memcg charging on multiple CPUs concurrently.
Particularly the network intensive workloads. In addition, there is a
false cache sharing between memory.usage and memory.high on the charge
path. This patch moves the usage into a separate cacheline and move all
the read most fields into separate cacheline.

To evaluate the impact of this optimization, on a 72 CPUs machine, we
ran the following workload in a three level of cgroup hierarchy with top
level having min and low setup appropriately. More specifically
memory.min equal to size of netperf binary and memory.low double of
that.

 $ netserver -6
 # 36 instances of netperf with following params
 $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

Results (average throughput of netperf):
Without (6.0-rc1)	10482.7 Mbps
With patch		12413.7 Mbps (18.4% improvement)

With the patch, the throughput improved by 18.4%.

One side-effect of this patch is the increase in the size of struct
mem_cgroup. However for the performance improvement, this additional
size is worth it. In addition there are opportunities to reduce the size
of struct mem_cgroup like deprecation of kmem and tcpmem page counters
and better packing.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
---
 include/linux/page_counter.h | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

Comments

Soheil Hassas Yeganeh Aug. 22, 2022, 12:24 a.m. UTC | #1
On Sun, Aug 21, 2022 at 8:18 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> With memcg v2 enabled, memcg->memory.usage is a very hot member for
> the workloads doing memcg charging on multiple CPUs concurrently.
> Particularly the network intensive workloads. In addition, there is a
> false cache sharing between memory.usage and memory.high on the charge
> path. This patch moves the usage into a separate cacheline and move all
> the read most fields into separate cacheline.
>
> To evaluate the impact of this optimization, on a 72 CPUs machine, we
> ran the following workload in a three level of cgroup hierarchy with top
> level having min and low setup appropriately. More specifically
> memory.min equal to size of netperf binary and memory.low double of
> that.
>
>  $ netserver -6
>  # 36 instances of netperf with following params
>  $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
>
> Results (average throughput of netperf):
> Without (6.0-rc1)       10482.7 Mbps
> With patch              12413.7 Mbps (18.4% improvement)
>
> With the patch, the throughput improved by 18.4%.

Shakeel, for my understanding: is this on top of the gains from the
previous patch?

> One side-effect of this patch is the increase in the size of struct
> mem_cgroup. However for the performance improvement, this additional
> size is worth it. In addition there are opportunities to reduce the size
> of struct mem_cgroup like deprecation of kmem and tcpmem page counters
> and better packing.
>
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> ---
>  include/linux/page_counter.h | 34 +++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 679591301994..8ce99bde645f 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -3,15 +3,27 @@
>  #define _LINUX_PAGE_COUNTER_H
>
>  #include <linux/atomic.h>
> +#include <linux/cache.h>
>  #include <linux/kernel.h>
>  #include <asm/page.h>
>
> +#if defined(CONFIG_SMP)
> +struct pc_padding {
> +       char x[0];
> +} ____cacheline_internodealigned_in_smp;
> +#define PC_PADDING(name)       struct pc_padding name
> +#else
> +#define PC_PADDING(name)
> +#endif
> +
>  struct page_counter {
> +       /*
> +        * Make sure 'usage' does not share cacheline with any other field. The
> +        * memcg->memory.usage is a hot member of struct mem_cgroup.
> +        */
> +       PC_PADDING(_pad1_);
>         atomic_long_t usage;
> -       unsigned long min;
> -       unsigned long low;
> -       unsigned long high;
> -       unsigned long max;
> +       PC_PADDING(_pad2_);
>
>         /* effective memory.min and memory.min usage tracking */
>         unsigned long emin;
> @@ -23,16 +35,16 @@ struct page_counter {
>         atomic_long_t low_usage;
>         atomic_long_t children_low_usage;
>
> -       /* legacy */
>         unsigned long watermark;
>         unsigned long failcnt;
>
> -       /*
> -        * 'parent' is placed here to be far from 'usage' to reduce
> -        * cache false sharing, as 'usage' is written mostly while
> -        * parent is frequently read for cgroup's hierarchical
> -        * counting nature.
> -        */
> +       /* Keep all the read most fields in a separete cacheline. */
> +       PC_PADDING(_pad3_);
> +
> +       unsigned long min;
> +       unsigned long low;
> +       unsigned long high;
> +       unsigned long max;
>         struct page_counter *parent;
>  };
>
> --
> 2.37.1.595.g718a3a8f04-goog
>
Feng Tang Aug. 22, 2022, 2:10 a.m. UTC | #2
On Mon, Aug 22, 2022 at 08:17:36AM +0800, Shakeel Butt wrote:
> With memcg v2 enabled, memcg->memory.usage is a very hot member for
> the workloads doing memcg charging on multiple CPUs concurrently.
> Particularly the network intensive workloads. In addition, there is a
> false cache sharing between memory.usage and memory.high on the charge
> path. This patch moves the usage into a separate cacheline and move all
> the read most fields into separate cacheline.
> 
> To evaluate the impact of this optimization, on a 72 CPUs machine, we
> ran the following workload in a three level of cgroup hierarchy with top
> level having min and low setup appropriately. More specifically
> memory.min equal to size of netperf binary and memory.low double of
> that.
> 
>  $ netserver -6
>  # 36 instances of netperf with following params
>  $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
> 
> Results (average throughput of netperf):
> Without (6.0-rc1)	10482.7 Mbps
> With patch		12413.7 Mbps (18.4% improvement)
> 
> With the patch, the throughput improved by 18.4%.
> 
> One side-effect of this patch is the increase in the size of struct
> mem_cgroup. However for the performance improvement, this additional
> size is worth it. In addition there are opportunities to reduce the size
> of struct mem_cgroup like deprecation of kmem and tcpmem page counters
> and better packing.
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Reported-by: kernel test robot <oliver.sang@intel.com>

Looks good to me, with one nit below. 

Reviewed-by: Feng Tang <feng.tang@intel.com>

> ---
>  include/linux/page_counter.h | 34 +++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 679591301994..8ce99bde645f 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -3,15 +3,27 @@
>  #define _LINUX_PAGE_COUNTER_H
>  
>  #include <linux/atomic.h>
> +#include <linux/cache.h>
>  #include <linux/kernel.h>
>  #include <asm/page.h>
>  
> +#if defined(CONFIG_SMP)
> +struct pc_padding {
> +	char x[0];
> +} ____cacheline_internodealigned_in_smp;
> +#define PC_PADDING(name)	struct pc_padding name
> +#else
> +#define PC_PADDING(name)
> +#endif

There are 2 similar padding definitions in mmzone.h and memcontrol.h:

	struct memcg_padding {
		char x[0];
	} ____cacheline_internodealigned_in_smp;
	#define MEMCG_PADDING(name)      struct memcg_padding name

	struct zone_padding {
		char x[0];
	} ____cacheline_internodealigned_in_smp;
	#define ZONE_PADDING(name)	struct zone_padding name;

Maybe we can generalize them, and lift it into include/cache.h? so
that more places can reuse it in future.

Thanks,
Feng
Shakeel Butt Aug. 22, 2022, 4:55 a.m. UTC | #3
On Sun, Aug 21, 2022 at 5:24 PM Soheil Hassas Yeganeh <soheil@google.com> wrote:
>
> On Sun, Aug 21, 2022 at 8:18 PM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > With memcg v2 enabled, memcg->memory.usage is a very hot member for
> > the workloads doing memcg charging on multiple CPUs concurrently.
> > Particularly the network intensive workloads. In addition, there is a
> > false cache sharing between memory.usage and memory.high on the charge
> > path. This patch moves the usage into a separate cacheline and move all
> > the read most fields into separate cacheline.
> >
> > To evaluate the impact of this optimization, on a 72 CPUs machine, we
> > ran the following workload in a three level of cgroup hierarchy with top
> > level having min and low setup appropriately. More specifically
> > memory.min equal to size of netperf binary and memory.low double of
> > that.
> >
> >  $ netserver -6
> >  # 36 instances of netperf with following params
> >  $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
> >
> > Results (average throughput of netperf):
> > Without (6.0-rc1)       10482.7 Mbps
> > With patch              12413.7 Mbps (18.4% improvement)
> >
> > With the patch, the throughput improved by 18.4%.
>
> Shakeel, for my understanding: is this on top of the gains from the
> previous patch?
>

No, this is independent of the previous patch. The cover letter has
the numbers for all three optimizations applied together.
Shakeel Butt Aug. 22, 2022, 4:59 a.m. UTC | #4
On Sun, Aug 21, 2022 at 7:12 PM Feng Tang <feng.tang@intel.com> wrote:
>
> On Mon, Aug 22, 2022 at 08:17:36AM +0800, Shakeel Butt wrote:
> > With memcg v2 enabled, memcg->memory.usage is a very hot member for
> > the workloads doing memcg charging on multiple CPUs concurrently.
> > Particularly the network intensive workloads. In addition, there is a
> > false cache sharing between memory.usage and memory.high on the charge
> > path. This patch moves the usage into a separate cacheline and move all
> > the read most fields into separate cacheline.
> >
> > To evaluate the impact of this optimization, on a 72 CPUs machine, we
> > ran the following workload in a three level of cgroup hierarchy with top
> > level having min and low setup appropriately. More specifically
> > memory.min equal to size of netperf binary and memory.low double of
> > that.
> >
> >  $ netserver -6
> >  # 36 instances of netperf with following params
> >  $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
> >
> > Results (average throughput of netperf):
> > Without (6.0-rc1)     10482.7 Mbps
> > With patch            12413.7 Mbps (18.4% improvement)
> >
> > With the patch, the throughput improved by 18.4%.
> >
> > One side-effect of this patch is the increase in the size of struct
> > mem_cgroup. However for the performance improvement, this additional
> > size is worth it. In addition there are opportunities to reduce the size
> > of struct mem_cgroup like deprecation of kmem and tcpmem page counters
> > and better packing.
> >
> > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > Reported-by: kernel test robot <oliver.sang@intel.com>
>
> Looks good to me, with one nit below.
>
> Reviewed-by: Feng Tang <feng.tang@intel.com>

Thanks.

>
> > ---
> >  include/linux/page_counter.h | 34 +++++++++++++++++++++++-----------
> >  1 file changed, 23 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> > index 679591301994..8ce99bde645f 100644
> > --- a/include/linux/page_counter.h
> > +++ b/include/linux/page_counter.h
> > @@ -3,15 +3,27 @@
> >  #define _LINUX_PAGE_COUNTER_H
> >
> >  #include <linux/atomic.h>
> > +#include <linux/cache.h>
> >  #include <linux/kernel.h>
> >  #include <asm/page.h>
> >
> > +#if defined(CONFIG_SMP)
> > +struct pc_padding {
> > +     char x[0];
> > +} ____cacheline_internodealigned_in_smp;
> > +#define PC_PADDING(name)     struct pc_padding name
> > +#else
> > +#define PC_PADDING(name)
> > +#endif
>
> There are 2 similar padding definitions in mmzone.h and memcontrol.h:
>
>         struct memcg_padding {
>                 char x[0];
>         } ____cacheline_internodealigned_in_smp;
>         #define MEMCG_PADDING(name)      struct memcg_padding name
>
>         struct zone_padding {
>                 char x[0];
>         } ____cacheline_internodealigned_in_smp;
>         #define ZONE_PADDING(name)      struct zone_padding name;
>
> Maybe we can generalize them, and lift it into include/cache.h? so
> that more places can reuse it in future.
>

This makes sense but let me do that in a separate patch.
Shakeel Butt Aug. 22, 2022, 3:06 p.m. UTC | #5
On Mon, Aug 22, 2022 at 3:23 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 22-08-22 00:17:36, Shakeel Butt wrote:
> > With memcg v2 enabled, memcg->memory.usage is a very hot member for
> > the workloads doing memcg charging on multiple CPUs concurrently.
> > Particularly the network intensive workloads. In addition, there is a
> > false cache sharing between memory.usage and memory.high on the charge
> > path. This patch moves the usage into a separate cacheline and move all
> > the read most fields into separate cacheline.
> >
> > To evaluate the impact of this optimization, on a 72 CPUs machine, we
> > ran the following workload in a three level of cgroup hierarchy with top
> > level having min and low setup appropriately. More specifically
> > memory.min equal to size of netperf binary and memory.low double of
> > that.
>
> Again the workload description is not particularly useful. I guess the
> only important aspect is the netserver part below and the number of CPUs
> because min and low setup doesn't have much to do with this, right? At
> least that is my reading of the memory.high mentioned above.
>

The experiment numbers below are for only this patch independently
i.e. the unnecessary min/low atomic xchg() is still happening for both
setups. I could run the experiment without setting min and low but I
wanted to keep the setup exactly the same for all three optimizations.

This patch and the following perf numbers shows only the impact of
removing false sharing in struct page_counter for memcg->memory on the
charging code path.

> >  $ netserver -6
> >  # 36 instances of netperf with following params
> >  $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
> >
> > Results (average throughput of netperf):
> > Without (6.0-rc1)     10482.7 Mbps
> > With patch            12413.7 Mbps (18.4% improvement)
> >
> > With the patch, the throughput improved by 18.4%.
> >
> > One side-effect of this patch is the increase in the size of struct
> > mem_cgroup. However for the performance improvement, this additional
> > size is worth it. In addition there are opportunities to reduce the size
> > of struct mem_cgroup like deprecation of kmem and tcpmem page counters
> > and better packing.
> >
> > Signed-off-by: Shakeel Butt <shakeelb@google.com>
> > Reported-by: kernel test robot <oliver.sang@intel.com>
> > ---
> >  include/linux/page_counter.h | 34 +++++++++++++++++++++++-----------
> >  1 file changed, 23 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> > index 679591301994..8ce99bde645f 100644
> > --- a/include/linux/page_counter.h
> > +++ b/include/linux/page_counter.h
> > @@ -3,15 +3,27 @@
> >  #define _LINUX_PAGE_COUNTER_H
> >
> >  #include <linux/atomic.h>
> > +#include <linux/cache.h>
> >  #include <linux/kernel.h>
> >  #include <asm/page.h>
> >
> > +#if defined(CONFIG_SMP)
> > +struct pc_padding {
> > +     char x[0];
> > +} ____cacheline_internodealigned_in_smp;
> > +#define PC_PADDING(name)     struct pc_padding name
> > +#else
> > +#define PC_PADDING(name)
> > +#endif
> > +
> >  struct page_counter {
> > +     /*
> > +      * Make sure 'usage' does not share cacheline with any other field. The
> > +      * memcg->memory.usage is a hot member of struct mem_cgroup.
> > +      */
> > +     PC_PADDING(_pad1_);
>
> Why don't you simply require alignment for the structure?

I don't just want the alignment of the structure. I want different
fields of this structure to not share the cache line. More
specifically the 'high' and 'usage' fields. With this change the usage
will be its own cache line, the read-most fields will be on separate
cache line and the fields which sometimes get updated on charge path
based on some condition will be a different cache line from the
previous two.
Michal Hocko Aug. 22, 2022, 3:15 p.m. UTC | #6
On Mon 22-08-22 08:06:14, Shakeel Butt wrote:
[...]
> > >  struct page_counter {
> > > +     /*
> > > +      * Make sure 'usage' does not share cacheline with any other field. The
> > > +      * memcg->memory.usage is a hot member of struct mem_cgroup.
> > > +      */
> > > +     PC_PADDING(_pad1_);
> >
> > Why don't you simply require alignment for the structure?
> 
> I don't just want the alignment of the structure. I want different
> fields of this structure to not share the cache line. More
> specifically the 'high' and 'usage' fields. With this change the usage
> will be its own cache line, the read-most fields will be on separate
> cache line and the fields which sometimes get updated on charge path
> based on some condition will be a different cache line from the
> previous two.

I do not follow. If you make an explicit requirement for the structure
alignement then the first field in the structure will be guarantied to
have that alignement and you achieve the rest to be in the other cache
line by adding padding behind that.
Shakeel Butt Aug. 22, 2022, 4:04 p.m. UTC | #7
On Mon, Aug 22, 2022 at 8:15 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 22-08-22 08:06:14, Shakeel Butt wrote:
> [...]
> > > >  struct page_counter {
> > > > +     /*
> > > > +      * Make sure 'usage' does not share cacheline with any other field. The
> > > > +      * memcg->memory.usage is a hot member of struct mem_cgroup.
> > > > +      */
> > > > +     PC_PADDING(_pad1_);
> > >
> > > Why don't you simply require alignment for the structure?
> >
> > I don't just want the alignment of the structure. I want different
> > fields of this structure to not share the cache line. More
> > specifically the 'high' and 'usage' fields. With this change the usage
> > will be its own cache line, the read-most fields will be on separate
> > cache line and the fields which sometimes get updated on charge path
> > based on some condition will be a different cache line from the
> > previous two.
>
> I do not follow. If you make an explicit requirement for the structure
> alignement then the first field in the structure will be guarantied to
> have that alignement and you achieve the rest to be in the other cache
> line by adding padding behind that.

Oh, you were talking explicitly about _pad1_, yes, we can remove it
and make the struct cache align. I will do it in the next version.
Roman Gushchin Aug. 22, 2022, 6:27 p.m. UTC | #8
On Mon, Aug 22, 2022 at 09:04:59AM -0700, Shakeel Butt wrote:
> On Mon, Aug 22, 2022 at 8:15 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 22-08-22 08:06:14, Shakeel Butt wrote:
> > [...]
> > > > >  struct page_counter {
> > > > > +     /*
> > > > > +      * Make sure 'usage' does not share cacheline with any other field. The
> > > > > +      * memcg->memory.usage is a hot member of struct mem_cgroup.
> > > > > +      */
> > > > > +     PC_PADDING(_pad1_);
> > > >
> > > > Why don't you simply require alignment for the structure?
> > >
> > > I don't just want the alignment of the structure. I want different
> > > fields of this structure to not share the cache line. More
> > > specifically the 'high' and 'usage' fields. With this change the usage
> > > will be its own cache line, the read-most fields will be on separate
> > > cache line and the fields which sometimes get updated on charge path
> > > based on some condition will be a different cache line from the
> > > previous two.
> >
> > I do not follow. If you make an explicit requirement for the structure
> > alignement then the first field in the structure will be guarantied to
> > have that alignement and you achieve the rest to be in the other cache
> > line by adding padding behind that.
> 
> Oh, you were talking explicitly about _pad1_, yes, we can remove it
> and make the struct cache align. I will do it in the next version.

Yes, please, it caught my eyes too.
With this change:
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Also, can you, please, include the numbers on the additional memory overhead?
I think it still worth it, just think we need to include them for a record.

Thanks!
diff mbox series

Patch

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 679591301994..8ce99bde645f 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -3,15 +3,27 @@ 
 #define _LINUX_PAGE_COUNTER_H
 
 #include <linux/atomic.h>
+#include <linux/cache.h>
 #include <linux/kernel.h>
 #include <asm/page.h>
 
+#if defined(CONFIG_SMP)
+struct pc_padding {
+	char x[0];
+} ____cacheline_internodealigned_in_smp;
+#define PC_PADDING(name)	struct pc_padding name
+#else
+#define PC_PADDING(name)
+#endif
+
 struct page_counter {
+	/*
+	 * Make sure 'usage' does not share cacheline with any other field. The
+	 * memcg->memory.usage is a hot member of struct mem_cgroup.
+	 */
+	PC_PADDING(_pad1_);
 	atomic_long_t usage;
-	unsigned long min;
-	unsigned long low;
-	unsigned long high;
-	unsigned long max;
+	PC_PADDING(_pad2_);
 
 	/* effective memory.min and memory.min usage tracking */
 	unsigned long emin;
@@ -23,16 +35,16 @@  struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;
 
-	/* legacy */
 	unsigned long watermark;
 	unsigned long failcnt;
 
-	/*
-	 * 'parent' is placed here to be far from 'usage' to reduce
-	 * cache false sharing, as 'usage' is written mostly while
-	 * parent is frequently read for cgroup's hierarchical
-	 * counting nature.
-	 */
+	/* Keep all the read most fields in a separete cacheline. */
+	PC_PADDING(_pad3_);
+
+	unsigned long min;
+	unsigned long low;
+	unsigned long high;
+	unsigned long max;
 	struct page_counter *parent;
 };