
[5/5] mm: spill memcg percpu stats and events before releasing

Message ID 20190307230033.31975-6-guro@fb.com (mailing list archive)
State New, archived
Series mm: reduce the memory footprint of dying memory cgroups

Commit Message

Roman Gushchin March 7, 2019, 11 p.m. UTC
Spill percpu stats and events data to the corresponding atomic counters
before releasing the percpu memory.

Although per-cpu stats are never exactly precise, regularly dropping
them on the floor may lead to an accumulation of error. So it's safer
to sync them before releasing the memory.

To minimize the number of atomic updates, let's sum all stats/events
on all cpus locally, and then make a single update per entry.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

Comments

Johannes Weiner March 11, 2019, 5:38 p.m. UTC | #1
On Thu, Mar 07, 2019 at 03:00:33PM -0800, Roman Gushchin wrote:
> Spill percpu stats and events data to the corresponding atomic counters
> before releasing the percpu memory.
> 
> Although per-cpu stats are never exactly precise, regularly dropping
> them on the floor may lead to an accumulation of error. So it's safer
> to sync them before releasing the memory.
> 
> To minimize the number of atomic updates, let's sum all stats/events
> on all cpus locally, and then make a single update per entry.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>
> ---
>  mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 18e863890392..b7eb6fac735e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4612,11 +4612,63 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  	return 0;
>  }
>  
> +/*
> + * Spill all per-cpu stats and events into atomics.
> + * Try to minimize the number of atomic writes by gathering data from
> + * all cpus locally, and then make one atomic update.
> + * No locking is required, because no one has access to
> + * the offlined percpu data.
> + */
> +static void mem_cgroup_spill_offlined_percpu(struct mem_cgroup *memcg)
> +{
> +	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
> +	struct lruvec_stat __percpu *lruvec_stat_cpu;
> +	struct mem_cgroup_per_node *pn;
> +	int cpu, i;
> +	long x;
> +
> +	vmstats_percpu = memcg->vmstats_percpu_offlined;
> +
> +	for (i = 0; i < MEMCG_NR_STAT; i++) {
> +		int nid;
> +
> +		x = 0;
> +		for_each_possible_cpu(cpu)
> +			x += per_cpu(vmstats_percpu->stat[i], cpu);
> +		if (x)
> +			atomic_long_add(x, &memcg->vmstats[i]);
> +
> +		if (i >= NR_VM_NODE_STAT_ITEMS)
> +			continue;
> +
> +		for_each_node(nid) {
> +			pn = mem_cgroup_nodeinfo(memcg, nid);
> +			lruvec_stat_cpu = pn->lruvec_stat_cpu_offlined;
> +
> +			x = 0;
> +			for_each_possible_cpu(cpu)
> +				x += per_cpu(lruvec_stat_cpu->count[i], cpu);
> +			if (x)
> +				atomic_long_add(x, &pn->lruvec_stat[i]);
> +		}
> +	}
> +
> +	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
> +		x = 0;
> +		for_each_possible_cpu(cpu)
> +			x += per_cpu(vmstats_percpu->events[i], cpu);
> +		if (x)
> +			atomic_long_add(x, &memcg->vmevents[i]);
> +	}

This looks good, but couldn't this be merged with the cpu offlining?
It seems to be exactly the same code, except for the nesting of the
for_each_possible_cpu() iteration here.

This could be a function that takes a CPU argument and then iterates
the cgroups and stat items to collect and spill the counters of that
specified CPU; offlining would call it once, and this spill code here
would call it for_each_possible_cpu().

We shouldn't need the atomicity of this_cpu_xchg() during hotunplug,
the scheduler isn't even active on that CPU anymore when it's called.
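
For illustration, a rough sketch of the per-CPU helper suggested above. The
name memcg_spill_percpu_stats() and its exact shape are hypothetical (this is
not code from the posted series; field names follow the earlier patches). It
reads and clears one CPU's counters, so CPU hotunplug could call it once for
the dead CPU, and the release path could call it under
for_each_possible_cpu():

static void memcg_spill_percpu_stats(struct mem_cgroup *memcg, int cpu)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i, nid;

	for (i = 0; i < MEMCG_NR_STAT; i++) {
		/* Move this CPU's contribution into the atomic counter. */
		x = per_cpu(memcg->vmstats_percpu->stat[i], cpu);
		per_cpu(memcg->vmstats_percpu->stat[i], cpu) = 0;
		if (x)
			atomic_long_add(x, &memcg->vmstats[i]);

		if (i >= NR_VM_NODE_STAT_ITEMS)
			continue;

		for_each_node(nid) {
			pn = mem_cgroup_nodeinfo(memcg, nid);
			x = per_cpu(pn->lruvec_stat_cpu->count[i], cpu);
			per_cpu(pn->lruvec_stat_cpu->count[i], cpu) = 0;
			if (x)
				atomic_long_add(x, &pn->lruvec_stat[i]);
		}
	}

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		x = per_cpu(memcg->vmstats_percpu->events[i], cpu);
		per_cpu(memcg->vmstats_percpu->events[i], cpu) = 0;
		if (x)
			atomic_long_add(x, &memcg->vmevents[i]);
	}
}
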
Roman Gushchin March 11, 2019, 7:27 p.m. UTC | #2
On Mon, Mar 11, 2019 at 01:38:25PM -0400, Johannes Weiner wrote:
> On Thu, Mar 07, 2019 at 03:00:33PM -0800, Roman Gushchin wrote:
> > Spill percpu stats and events data to the corresponding atomic counters
> > before releasing the percpu memory.
> > 
> > Although per-cpu stats are never exactly precise, regularly dropping
> > them on the floor may lead to an accumulation of error. So it's safer
> > to sync them before releasing the memory.
> > 
> > To minimize the number of atomic updates, let's sum all stats/events
> > on all cpus locally, and then make a single update per entry.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> > ---
> >  mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 52 insertions(+)
> > 
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 18e863890392..b7eb6fac735e 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4612,11 +4612,63 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Spill all per-cpu stats and events into atomics.
> > + * Try to minimize the number of atomic writes by gathering data from
> > + * all cpus locally, and then make one atomic update.
> > + * No locking is required, because no one has access to
> > + * the offlined percpu data.
> > + */
> > +static void mem_cgroup_spill_offlined_percpu(struct mem_cgroup *memcg)
> > +{
> > +	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
> > +	struct lruvec_stat __percpu *lruvec_stat_cpu;
> > +	struct mem_cgroup_per_node *pn;
> > +	int cpu, i;
> > +	long x;
> > +
> > +	vmstats_percpu = memcg->vmstats_percpu_offlined;
> > +
> > +	for (i = 0; i < MEMCG_NR_STAT; i++) {
> > +		int nid;
> > +
> > +		x = 0;
> > +		for_each_possible_cpu(cpu)
> > +			x += per_cpu(vmstats_percpu->stat[i], cpu);
> > +		if (x)
> > +			atomic_long_add(x, &memcg->vmstats[i]);
> > +
> > +		if (i >= NR_VM_NODE_STAT_ITEMS)
> > +			continue;
> > +
> > +		for_each_node(nid) {
> > +			pn = mem_cgroup_nodeinfo(memcg, nid);
> > +			lruvec_stat_cpu = pn->lruvec_stat_cpu_offlined;
> > +
> > +			x = 0;
> > +			for_each_possible_cpu(cpu)
> > +				x += per_cpu(lruvec_stat_cpu->count[i], cpu);
> > +			if (x)
> > +				atomic_long_add(x, &pn->lruvec_stat[i]);
> > +		}
> > +	}
> > +
> > +	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
> > +		x = 0;
> > +		for_each_possible_cpu(cpu)
> > +			x += per_cpu(vmstats_percpu->events[i], cpu);
> > +		if (x)
> > +			atomic_long_add(x, &memcg->vmevents[i]);
> > +	}
> 
> This looks good, but couldn't this be merged with the cpu offlining?
> It seems to be exactly the same code, except for the nesting of the
> for_each_possible_cpu() iteration here.
> 
> This could be a function that takes a CPU argument and then iterates
> the cgroups and stat items to collect and spill the counters of that
> specified CPU; offlining would call it once, and this spill code here
> would call it for_each_possible_cpu().
> 
> We shouldn't need the atomicity of this_cpu_xchg() during hotunplug,
> the scheduler isn't even active on that CPU anymore when it's called.

Good point!
I initially tried to adapt the cpu offlining code, but it didn't work
well: the code became too complex and ugly. But the opposite can be done
easily: mem_cgroup_spill_offlined_percpu() can take a cpumask,
and the cpu offlining code will look like:
	for_each_mem_cgroup(memcg)
		mem_cgroup_spill_offlined_percpu(memcg, cpumask);
I'll prepare a separate patch.

Thank you!
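
For illustration, a rough sketch of the cpumask-based variant Roman describes,
built on a per-CPU helper like the hypothetical memcg_spill_percpu_stats()
above (ignoring, for simplicity, that the posted patch reads from the
*_offlined copies in the release path; the actual follow-up patch may differ):

static void mem_cgroup_spill_offlined_percpu(struct mem_cgroup *memcg,
					     const struct cpumask *mask)
{
	int cpu;

	/* Fold the counters of every CPU in @mask into the atomic counters. */
	for_each_cpu(cpu, mask)
		memcg_spill_percpu_stats(memcg, cpu);
}

The CPU offlining path would then spill a single dead CPU for every cgroup:

	for_each_mem_cgroup(memcg)
		mem_cgroup_spill_offlined_percpu(memcg, cpumask_of(cpu));

while the release path would spill all possible CPUs of the dying cgroup
before freeing its percpu data:

	mem_cgroup_spill_offlined_percpu(memcg, cpu_possible_mask);
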

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 18e863890392..b7eb6fac735e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4612,11 +4612,63 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return 0;
 }
 
+/*
+ * Spill all per-cpu stats and events into atomics.
+ * Try to minimize the number of atomic writes by gathering data from
+ * all cpus locally, and then make one atomic update.
+ * No locking is required, because no one has access to
+ * the offlined percpu data.
+ */
+static void mem_cgroup_spill_offlined_percpu(struct mem_cgroup *memcg)
+{
+	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
+	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	struct mem_cgroup_per_node *pn;
+	int cpu, i;
+	long x;
+
+	vmstats_percpu = memcg->vmstats_percpu_offlined;
+
+	for (i = 0; i < MEMCG_NR_STAT; i++) {
+		int nid;
+
+		x = 0;
+		for_each_possible_cpu(cpu)
+			x += per_cpu(vmstats_percpu->stat[i], cpu);
+		if (x)
+			atomic_long_add(x, &memcg->vmstats[i]);
+
+		if (i >= NR_VM_NODE_STAT_ITEMS)
+			continue;
+
+		for_each_node(nid) {
+			pn = mem_cgroup_nodeinfo(memcg, nid);
+			lruvec_stat_cpu = pn->lruvec_stat_cpu_offlined;
+
+			x = 0;
+			for_each_possible_cpu(cpu)
+				x += per_cpu(lruvec_stat_cpu->count[i], cpu);
+			if (x)
+				atomic_long_add(x, &pn->lruvec_stat[i]);
+		}
+	}
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		x = 0;
+		for_each_possible_cpu(cpu)
+			x += per_cpu(vmstats_percpu->events[i], cpu);
+		if (x)
+			atomic_long_add(x, &memcg->vmevents[i]);
+	}
+}
+
 static void mem_cgroup_free_percpu(struct rcu_head *rcu)
 {
 	struct mem_cgroup *memcg = container_of(rcu, struct mem_cgroup, rcu);
 	int node;
 
+	mem_cgroup_spill_offlined_percpu(memcg);
+
 	for_each_node(node) {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];