
[2/2] mm/memcontrol: split local and nested atomic vmstats/vmevents counters

Message ID 156336655979.2828.15196553724473875230.stgit@buzz (mailing list archive)
State New, archived
Series [1/2] mm/memcontrol: fix flushing per-cpu counters in memcg_hotplug_cpu_dead

Commit Message

Konstantin Khlebnikov July 17, 2019, 12:29 p.m. UTC
This is an alternative solution to the problem addressed in commit 815744d75152
("mm: memcontrol: don't batch updates of local VM stats and events").

Instead of adding a second set of percpu counters, which wastes memory and
slows down reading statistics in cgroup-v1, this patch uses two arrays of
atomic counters: local and nested statistics.

An update then performs the same number of atomic operations: one local
update plus one nested update for each parent cgroup. Readers of
hierarchical statistics have to sum two atomics, which is not a big deal.

All updates are still batched using a single set of percpu counters.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 include/linux/memcontrol.h |   19 +++++++----------
 mm/memcontrol.c            |   48 +++++++++++++++++++-------------------------
 2 files changed, 29 insertions(+), 38 deletions(-)
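
To make the scheme concrete, here is a minimal sketch (illustrative only;
the struct and function names below are not taken from the patch) of how
the two arrays cooperate: flushing a batched delta does one add to the
cgroup's own "local" counter plus one add to the "nested" counter of each
ancestor, and a hierarchical read is simply the sum of the two.

#include <linux/atomic.h>

struct counter_pair {
	struct counter_pair *parent;
	atomic_long_t local;	/* deltas flushed by this cgroup itself */
	atomic_long_t nested;	/* deltas flushed up from descendants */
};

/* Flush a batched delta: one local add plus one nested add per ancestor. */
static void flush_delta(struct counter_pair *c, long delta)
{
	struct counter_pair *p;

	atomic_long_add(delta, &c->local);
	for (p = c->parent; p; p = p->parent)
		atomic_long_add(delta, &p->nested);
}

/* Hierarchical value: own updates plus everything flushed from below. */
static long read_hierarchical(struct counter_pair *c)
{
	return atomic_long_read(&c->local) + atomic_long_read(&c->nested);
}

/* Local (cgroup-v1 "local" files) value: own updates only. */
static long read_local(struct counter_pair *c)
{
	return atomic_long_read(&c->local);
}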

Comments

Johannes Weiner July 17, 2019, 5:53 p.m. UTC | #1
On Wed, Jul 17, 2019 at 03:29:19PM +0300, Konstantin Khlebnikov wrote:
> This is an alternative solution to the problem addressed in commit 815744d75152
> ("mm: memcontrol: don't batch updates of local VM stats and events").
> 
> Instead of adding a second set of percpu counters, which wastes memory and
> slows down reading statistics in cgroup-v1, this patch uses two arrays of
> atomic counters: local and nested statistics.
> 
> An update then performs the same number of atomic operations: one local
> update plus one nested update for each parent cgroup. Readers of
> hierarchical statistics have to sum two atomics, which is not a big deal.
> 
> All updates are still batched using a single set of percpu counters.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Yeah that looks better. Note that it was never about the atomics,
though, but rather the number of cachelines dirtied. Your patch should
solve this problem as well, but it might be a good idea to run
will-it-scale on it to make sure the struct layout is still fine.
Konstantin Khlebnikov July 18, 2019, 3:08 p.m. UTC | #2
On 17.07.2019 20:53, Johannes Weiner wrote:
> On Wed, Jul 17, 2019 at 03:29:19PM +0300, Konstantin Khlebnikov wrote:
>> This is an alternative solution to the problem addressed in commit 815744d75152
>> ("mm: memcontrol: don't batch updates of local VM stats and events").
>>
>> Instead of adding a second set of percpu counters, which wastes memory and
>> slows down reading statistics in cgroup-v1, this patch uses two arrays of
>> atomic counters: local and nested statistics.
>>
>> An update then performs the same number of atomic operations: one local
>> update plus one nested update for each parent cgroup. Readers of
>> hierarchical statistics have to sum two atomics, which is not a big deal.
>>
>> All updates are still batched using a single set of percpu counters.
>>
>> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> 
> Yeah that looks better. Note that it was never about the atomics,
> though, but rather the number of cachelines dirtied. Your patch should
> solve this problem as well, but it might be a good idea to run
> will-it-scale on it to make sure the struct layout is still fine.
> 

Looks like this patch shows a 2% regression on the 24-core, two-NUMA-node
machine I have, while completely removing these counters gives a 2% boost.
Also, I cannot reproduce the regression fixed by commit 815744d75152 -
reverting it has no effect.

So, feel free to ignore the second patch. I'll play with this a little more.

Maybe atomic per-NUMA counters could strike a nice balance between
scalability and overhead. Ideally this memory could be mapped in a per-CPU
manner to give atomic access via fs/gs.
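
For illustration only, a rough sketch of what such atomic per-node counters
could look like (these type and helper names are hypothetical and not part
of any posted patch; the per-CPU fs/gs mapping idea is not covered here):
writers touch only the counter of their local NUMA node, which avoids
cross-node cacheline bouncing, while readers sum over all nodes.

#include <linux/atomic.h>
#include <linux/nodemask.h>
#include <linux/topology.h>

struct pernode_counter {
	atomic_long_t count[MAX_NUMNODES];
};

static inline void pernode_counter_add(struct pernode_counter *c, long delta)
{
	/* Contention is limited to CPUs on the same NUMA node. */
	atomic_long_add(delta, &c->count[numa_node_id()]);
}

static inline long pernode_counter_read(struct pernode_counter *c)
{
	long sum = 0;
	int node;

	for_each_node(node)
		sum += atomic_long_read(&c->count[node]);
	return sum;
}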

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44c41462be33..4dd75d50c200 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -269,16 +269,16 @@  struct mem_cgroup {
 	atomic_t		moving_account;
 	struct task_struct	*move_lock_task;
 
-	/* Legacy local VM stats and events */
-	struct memcg_vmstats_percpu __percpu *vmstats_local;
-
 	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 	MEMCG_PADDING(_pad2_);
 
-	atomic_long_t		vmstats[MEMCG_NR_STAT];
-	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
+	atomic_long_t		vmstats_local[MEMCG_NR_STAT];
+	atomic_long_t		vmstats_nested[MEMCG_NR_STAT];
+
+	atomic_long_t		vmevents_local[NR_VM_EVENT_ITEMS];
+	atomic_long_t		vmevents_nested[NR_VM_EVENT_ITEMS];
 
 	/* memory.events */
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -557,7 +557,8 @@  void unlock_page_memcg(struct page *page);
  */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = atomic_long_read(&memcg->vmstats_local[idx]) +
+		 atomic_long_read(&memcg->vmstats_nested[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -572,11 +573,7 @@  static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 						   int idx)
 {
-	long x = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
+	long x = atomic_long_read(&memcg->vmstats_local[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06d33dfc4ec4..97debc8e4120 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,14 +695,13 @@  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
-	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmstats[idx]);
+		atomic_long_add(x, &memcg->vmstats_local[idx]);
+		for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+			atomic_long_add(x, &mi->vmstats_nested[idx]);
 		x = 0;
 	}
 	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
@@ -777,14 +776,13 @@  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;
 
-	__this_cpu_add(memcg->vmstats_local->events[idx], count);
-
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmevents[idx]);
+		atomic_long_add(x, &memcg->vmevents_local[idx]);
+		for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+			atomic_long_add(x, &mi->vmevents_nested[idx]);
 		x = 0;
 	}
 	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
@@ -792,17 +790,13 @@  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents[event]);
+	return atomic_long_read(&memcg->vmevents_local[event]) +
+	       atomic_long_read(&memcg->vmevents_nested[event]);
 }
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-	long x = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->events[event], cpu);
-	return x;
+	return atomic_long_read(&memcg->vmevents_local[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -2257,9 +2251,11 @@  static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-			if (x)
-				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-					atomic_long_add(x, &mi->vmstats[i]);
+			if (x) {
+				atomic_long_add(x, &memcg->vmstats_local[i]);
+				for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+					atomic_long_add(x, &mi->vmstats_nested[i]);
+			}
 
 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2280,9 +2276,11 @@  static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-			if (x)
-				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-					atomic_long_add(x, &mi->vmevents[i]);
+			if (x) {
+				atomic_long_add(x, &memcg->vmevents_local[i]);
+				for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+					atomic_long_add(x, &mi->vmevents_nested[i]);
+			}
 		}
 	}
 
@@ -4085,7 +4083,8 @@  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
  */
 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = atomic_long_read(&memcg->vmstats_local[idx]) +
+		 atomic_long_read(&memcg->vmstats_nested[idx]);
 	int cpu;
 
 	for_each_online_cpu(cpu)
@@ -4638,7 +4637,6 @@  static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
-	free_percpu(memcg->vmstats_local);
 	kfree(memcg);
 }
 
@@ -4667,10 +4665,6 @@  static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
-	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
-	if (!memcg->vmstats_local)
-		goto fail;
-
 	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
 	if (!memcg->vmstats_percpu)
 		goto fail;