diff mbox series

[7/8] mm: memcontrol: consolidate lruvec stat flushing

Message ID 20210205182806.17220-8-hannes@cmpxchg.org (mailing list archive)
State New, archived
Headers show
Series mm: memcontrol: switch to rstat v2 | expand

Commit Message

Johannes Weiner Feb. 5, 2021, 6:28 p.m. UTC
There are two functions to flush the per-cpu data of an lruvec into
the rest of the cgroup tree: when the cgroup is being freed, and when
a CPU disappears during hotplug. The difference is whether all CPUs or
just one is being collected, but the rest of the flushing code is the
same. Merge them into one function and share the common code.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 74 +++++++++++++++++++------------------------------
 1 file changed, 28 insertions(+), 46 deletions(-)

Comments

Shakeel Butt Feb. 8, 2021, 2:28 a.m. UTC | #1
On Fri, Feb 5, 2021 at 10:28 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> There are two functions to flush the per-cpu data of an lruvec into
> the rest of the cgroup tree: when the cgroup is being freed, and when
> a CPU disappears during hotplug. The difference is whether all CPUs or
> just one is being collected, but the rest of the flushing code is the
> same. Merge them into one function and share the common code.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Reviewed-by: Shakeel Butt <shakeelb@google.com>

BTW what about the lruvec stats? Why not convert them to rstat as well?
Michal Hocko Feb. 8, 2021, 1:54 p.m. UTC | #2
On Fri 05-02-21 13:28:05, Johannes Weiner wrote:
> There are two functions to flush the per-cpu data of an lruvec into
> the rest of the cgroup tree: when the cgroup is being freed, and when
> a CPU disappears during hotplug. The difference is whether all CPUs or
> just one is being collected, but the rest of the flushing code is the
> same. Merge them into one function and share the common code.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Yes, this looks much better/cleaner.

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
>  mm/memcontrol.c | 74 +++++++++++++++++++------------------------------
>  1 file changed, 28 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 5dc0bd53b64a..490357945f2c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2410,39 +2410,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
>  	mutex_unlock(&percpu_charge_mutex);
>  }
>  
> -static int memcg_hotplug_cpu_dead(unsigned int cpu)
> +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
>  {
> -	struct memcg_stock_pcp *stock;
> -	struct mem_cgroup *memcg;
> -
> -	stock = &per_cpu(memcg_stock, cpu);
> -	drain_stock(stock);
> +	int nid;
>  
> -	for_each_mem_cgroup(memcg) {
> +	for_each_node(nid) {
> +		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
> +		unsigned long stat[NR_VM_NODE_STAT_ITEMS];
> +		struct batched_lruvec_stat *lstatc;
>  		int i;
>  
> +		lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
>  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> -			int nid;
> +			stat[i] = lstatc->count[i];
> +			lstatc->count[i] = 0;
> +		}
>  
> -			for_each_node(nid) {
> -				struct batched_lruvec_stat *lstatc;
> -				struct mem_cgroup_per_node *pn;
> -				long x;
> +		do {
> +			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
> +				atomic_long_add(stat[i], &pn->lruvec_stat[i]);
> +		} while ((pn = parent_nodeinfo(pn, nid)));
> +	}
> +}
>  
> -				pn = memcg->nodeinfo[nid];
> -				lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
> +static int memcg_hotplug_cpu_dead(unsigned int cpu)
> +{
> +	struct memcg_stock_pcp *stock;
> +	struct mem_cgroup *memcg;
>  
> -				x = lstatc->count[i];
> -				lstatc->count[i] = 0;
> +	stock = &per_cpu(memcg_stock, cpu);
> +	drain_stock(stock);
>  
> -				if (x) {
> -					do {
> -						atomic_long_add(x, &pn->lruvec_stat[i]);
> -					} while ((pn = parent_nodeinfo(pn, nid)));
> -				}
> -			}
> -		}
> -	}
> +	for_each_mem_cgroup(memcg)
> +		memcg_flush_lruvec_page_state(memcg, cpu);
>  
>  	return 0;
>  }
> @@ -3636,27 +3636,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>  	}
>  }
>  
> -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg)
> -{
> -	int node;
> -
> -	for_each_node(node) {
> -		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
> -		unsigned long stat[NR_VM_NODE_STAT_ITEMS] = { 0 };
> -		struct mem_cgroup_per_node *pi;
> -		int cpu, i;
> -
> -		for_each_online_cpu(cpu)
> -			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
> -				stat[i] += per_cpu(
> -					pn->lruvec_stat_cpu->count[i], cpu);
> -
> -		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
> -			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
> -				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
> -	}
> -}
> -
>  #ifdef CONFIG_MEMCG_KMEM
>  static int memcg_online_kmem(struct mem_cgroup *memcg)
>  {
> @@ -5192,12 +5171,15 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
>  
>  static void mem_cgroup_free(struct mem_cgroup *memcg)
>  {
> +	int cpu;
> +
>  	memcg_wb_domain_exit(memcg);
>  	/*
>  	 * Flush percpu lruvec stats to guarantee the value
>  	 * correctness on parent's and all ancestor levels.
>  	 */
> -	memcg_flush_lruvec_page_state(memcg);
> +	for_each_online_cpu(cpu)
> +		memcg_flush_lruvec_page_state(memcg, cpu);
>  	__mem_cgroup_free(memcg);
>  }
>  
> -- 
> 2.30.0
>
Tejun Heo Feb. 8, 2021, 4:02 p.m. UTC | #3
Hello,

On Mon, Feb 08, 2021 at 03:54:14PM -0500, Johannes Weiner wrote:
> We probably do need a better solution for the lruvecs as well, but in
> this case it just started holding up fixing the memory.stat issue for
> no reason and so I tabled it for another patch series.

rstat doesn't currently have a flush throttling mechanism cuz it doesn't
expect readers to be super hot but adding one should be pretty easy - e.g.
it can just keep track of the number of updates on this cpu since the last
flush and then flush iff it's above a certain threshold. Shouldn't be
too difficult to match or exceed the performance and error characteristics
of the existing code.

Thanks.
Johannes Weiner Feb. 8, 2021, 8:54 p.m. UTC | #4
On Sun, Feb 07, 2021 at 06:28:37PM -0800, Shakeel Butt wrote:
> On Fri, Feb 5, 2021 at 10:28 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > There are two functions to flush the per-cpu data of an lruvec into
> > the rest of the cgroup tree: when the cgroup is being freed, and when
> > a CPU disappears during hotplug. The difference is whether all CPUs or
> > just one is being collected, but the rest of the flushing code is the
> > same. Merge them into one function and share the common code.
> >
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> Reviewed-by: Shakeel Butt <shakeelb@google.com>

Thanks!

> BTW what about the lruvec stats? Why not convert them to rstat as well?

Great question.

I actually started this series with the lruvec stats included, but I'm
worried about the readers being too hot to use rstat (in its current
shape, at least). For example, the refault code accesses the lruvec
stats for every page that is refaulting - at the root level, in case
of global reclaim. With an active workload, that would result in a
very high rate of whole-tree flushes.

We probably do need a better solution for the lruvecs as well, but in
this case it just started holding up fixing the memory.stat issue for
no reason and so I tabled it for another patch series.
diff mbox series

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5dc0bd53b64a..490357945f2c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2410,39 +2410,39 @@  static void drain_all_stock(struct mem_cgroup *root_memcg)
 	mutex_unlock(&percpu_charge_mutex);
 }
 
-static int memcg_hotplug_cpu_dead(unsigned int cpu)
+static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
 {
-	struct memcg_stock_pcp *stock;
-	struct mem_cgroup *memcg;
-
-	stock = &per_cpu(memcg_stock, cpu);
-	drain_stock(stock);
+	int nid;
 
-	for_each_mem_cgroup(memcg) {
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+		unsigned long stat[NR_VM_NODE_STAT_ITEMS];
+		struct batched_lruvec_stat *lstatc;
 		int i;
 
+		lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-			int nid;
+			stat[i] = lstatc->count[i];
+			lstatc->count[i] = 0;
+		}
 
-			for_each_node(nid) {
-				struct batched_lruvec_stat *lstatc;
-				struct mem_cgroup_per_node *pn;
-				long x;
+		do {
+			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+				atomic_long_add(stat[i], &pn->lruvec_stat[i]);
+		} while ((pn = parent_nodeinfo(pn, nid)));
+	}
+}
 
-				pn = memcg->nodeinfo[nid];
-				lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *memcg;
 
-				x = lstatc->count[i];
-				lstatc->count[i] = 0;
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
 
-				if (x) {
-					do {
-						atomic_long_add(x, &pn->lruvec_stat[i]);
-					} while ((pn = parent_nodeinfo(pn, nid)));
-				}
-			}
-		}
-	}
+	for_each_mem_cgroup(memcg)
+		memcg_flush_lruvec_page_state(memcg, cpu);
 
 	return 0;
 }
@@ -3636,27 +3636,6 @@  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 	}
 }
 
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg)
-{
-	int node;
-
-	for_each_node(node) {
-		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
-		unsigned long stat[NR_VM_NODE_STAT_ITEMS] = { 0 };
-		struct mem_cgroup_per_node *pi;
-		int cpu, i;
-
-		for_each_online_cpu(cpu)
-			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-				stat[i] += per_cpu(
-					pn->lruvec_stat_cpu->count[i], cpu);
-
-		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
-			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
-	}
-}
-
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -5192,12 +5171,15 @@  static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
+	int cpu;
+
 	memcg_wb_domain_exit(memcg);
 	/*
 	 * Flush percpu lruvec stats to guarantee the value
 	 * correctness on parent's and all ancestor levels.
 	 */
-	memcg_flush_lruvec_page_state(memcg);
+	for_each_online_cpu(cpu)
+		memcg_flush_lruvec_page_state(memcg, cpu);
 	__mem_cgroup_free(memcg);
 }