[1/4] numa: introduce per-cgroup numa balancing locality, statistic

Message ID 3ac9b43a-cc80-01be-0079-df008a71ce4b@linux.alibaba.com (mailing list archive)
State New, archived
Series per cpu cgroup numa suite

Commit Message

王贇 July 3, 2019, 3:28 a.m. UTC
This patch introduces a numa locality statistic, which tries to reflect
the numa balancing efficiency per memory cgroup.

By doing 'cat /sys/fs/cgroup/memory/CGROUP_PATH/memory.numa_stat', we
see a new output line headed with 'locality', in the format:

  locality 0%~29% 30%~39% 40%~49% 50%~59% 60%~69% 70%~79% 80%~89% 90%~100%

Each interval stands for the percentage of local page accesses during a
task's last numa balancing scan, which we call the numa balancing locality.

And each number shows how many milliseconds tasks with that locality
have been running inside the cgroup, for example:

  locality 15393 21259 13023 44461 21247 17012 28496 145402

the first number means that tasks in this cgroup have run for 15393 ms
with 0%~29% locality.

By monitoring the increments, we can check whether the workload of a
particular cgroup is doing well with numa: when most of the tasks are
running with locality 0%~29%, something is wrong with your numa policy.

Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 include/linux/memcontrol.h | 36 +++++++++++++++++++++++++++++++
 include/linux/sched.h      |  8 ++++++-
 kernel/sched/debug.c       |  7 ++++++
 kernel/sched/fair.c        |  9 ++++++++
 mm/memcontrol.c            | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 112 insertions(+), 1 deletion(-)
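
[Editor's note: for illustration, a minimal userspace sketch of how the
'locality' line could be consumed by a monitor. CGROUP_PATH is a
placeholder, and the interface is the one proposed in this series, not
anything in mainline.]

#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define NR_BUCKETS 8

static int read_locality(const char *path, unsigned long long v[NR_BUCKETS])
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "locality", 8))
			continue;
		sscanf(line + 8, "%llu %llu %llu %llu %llu %llu %llu %llu",
		       &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7]);
		fclose(f);
		return 0;
	}
	fclose(f);
	return -1;
}

int main(void)
{
	const char *p = "/sys/fs/cgroup/memory/CGROUP_PATH/memory.numa_stat";
	unsigned long long a[NR_BUCKETS], b[NR_BUCKETS];
	int i;

	if (read_locality(p, a))
		return 1;
	sleep(10);				/* sampling interval */
	if (read_locality(p, b))
		return 1;
	for (i = 0; i < NR_BUCKETS; i++)	/* ms of runtime per bucket */
		printf("bucket %d: +%llu ms\n", i, b[i] - a[i]);
	return 0;
}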

Comments

Peter Zijlstra July 11, 2019, 1:43 p.m. UTC | #1
On Wed, Jul 03, 2019 at 11:28:10AM +0800, 王贇 wrote:
> +#ifdef CONFIG_NUMA_BALANCING
> +
> +enum memcg_numa_locality_interval {
> +	PERCENT_0_29,
> +	PERCENT_30_39,
> +	PERCENT_40_49,
> +	PERCENT_50_59,
> +	PERCENT_60_69,
> +	PERCENT_70_79,
> +	PERCENT_80_89,
> +	PERCENT_90_100,
> +	NR_NL_INTERVAL,
> +};

That's just daft; why not make 8 equal sized buckets.

> +struct memcg_stat_numa {
> +	u64 locality[NR_NL_INTERVAL];
> +};

> +	if (remote || local) {
> +		idx = ((local * 10) / (remote + local)) - 2;

		idx = (NR_NL_INTERVAL * local) / (remote + local);

> +	}
> +
> +	rcu_read_lock();
> +	memcg = mem_cgroup_from_task(p);
> +	if (idx != -1)
> +		this_cpu_inc(memcg->stat_numa->locality[idx]);
> +	rcu_read_unlock();
> +}
> +#endif
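
[Editor's note: a minimal sketch of the equal-bucket mapping suggested
above; locality_bucket() is a hypothetical helper, with NR_NL_INTERVAL
as defined in the patch. Note that 100% locality yields
idx == NR_NL_INTERVAL, so a clamp is still needed. Also note that with
the posted unsigned 'idx', ((local * 10) / (remote + local)) - 2 wraps
for locality below 20%, and the 'idx < PERCENT_0_29' test can never be
true.]

static inline unsigned long locality_bucket(unsigned long local,
					    unsigned long remote)
{
	unsigned long idx = (NR_NL_INTERVAL * local) / (remote + local);

	/* remote == 0 gives idx == NR_NL_INTERVAL; clamp to last bucket */
	return idx >= NR_NL_INTERVAL ? NR_NL_INTERVAL - 1 : idx;
}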
Peter Zijlstra July 11, 2019, 1:47 p.m. UTC | #2
On Wed, Jul 03, 2019 at 11:28:10AM +0800, 王贇 wrote:

> @@ -3562,10 +3563,53 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
>  		seq_putc(m, '\n');
>  	}
> 
> +#ifdef CONFIG_NUMA_BALANCING
> +	seq_puts(m, "locality");
> +	for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
> +		int cpu;
> +		u64 sum = 0;
> +
> +		for_each_possible_cpu(cpu)
> +			sum += per_cpu(memcg->stat_numa->locality[nr], cpu);
> +
> +		seq_printf(m, " %u", jiffies_to_msecs(sum));
> +	}
> +	seq_putc(m, '\n');
> +#endif
> +
>  	return 0;
>  }
>  #endif /* CONFIG_NUMA */
> 
> +#ifdef CONFIG_NUMA_BALANCING
> +
> +void memcg_stat_numa_update(struct task_struct *p)
> +{
> +	struct mem_cgroup *memcg;
> +	unsigned long remote = p->numa_faults_locality[3];
> +	unsigned long local = p->numa_faults_locality[4];
> +	unsigned long idx = -1;
> +
> +	if (mem_cgroup_disabled())
> +		return;
> +
> +	if (remote || local) {
> +		idx = ((local * 10) / (remote + local)) - 2;
> +		/* 0~29% in one slot for cache align */
> +		if (idx < PERCENT_0_29)
> +			idx = PERCENT_0_29;
> +		else if (idx >= NR_NL_INTERVAL)
> +			idx = NR_NL_INTERVAL - 1;
> +	}
> +
> +	rcu_read_lock();
> +	memcg = mem_cgroup_from_task(p);
> +	if (idx != -1)
> +		this_cpu_inc(memcg->stat_numa->locality[idx]);

I thought cgroups were supposed to be hierarchical. That is, if we have:

          R
	 / \
	 A
	/\
	  B
	  \
	   t1

Then our task t1 should be accounted to B (as you do), but also to A and
R.

> +	rcu_read_unlock();
> +}
> +#endif
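
[Editor's note: a hypothetical sketch of the hierarchical accounting
Peter describes, replacing the rcu section of memcg_stat_numa_update()
above. It assumes the stat_numa percpu field from this patch and the
existing parent_mem_cgroup() helper, which returns NULL above the root.]

	rcu_read_lock();
	memcg = mem_cgroup_from_task(p);
	if (idx != -1) {
		struct mem_cgroup *iter;

		/* charge the task's cgroup and every ancestor up to the root */
		for (iter = memcg; iter; iter = parent_mem_cgroup(iter))
			this_cpu_inc(iter->stat_numa->locality[idx]);
	}
	rcu_read_unlock();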
王贇 July 12, 2019, 3:15 a.m. UTC | #3
On 2019/7/11 9:43 PM, Peter Zijlstra wrote:
> On Wed, Jul 03, 2019 at 11:28:10AM +0800, 王贇 wrote:
>> +#ifdef CONFIG_NUMA_BALANCING
>> +
>> +enum memcg_numa_locality_interval {
>> +	PERCENT_0_29,
>> +	PERCENT_30_39,
>> +	PERCENT_40_49,
>> +	PERCENT_50_59,
>> +	PERCENT_60_69,
>> +	PERCENT_70_79,
>> +	PERCENT_80_89,
>> +	PERCENT_90_100,
>> +	NR_NL_INTERVAL,
>> +};
> 
> That's just daft; why not make 8 equal sized buckets.
> 
>> +struct memcg_stat_numa {
>> +	u64 locality[NR_NL_INTERVAL];
>> +};
> 
>> +	if (remote || local) {
>> +		idx = ((local * 10) / (remote + local)) - 2;
> 
> 		idx = (NR_NL_INTERVAL * local) / (remote + local);

Makes sense; we actually want to observe the overall situation rather
than the exact ratio itself. Will do so in the next version.

Regards,
Michael Wang

> 
>> +	}
>> +
>> +	rcu_read_lock();
>> +	memcg = mem_cgroup_from_task(p);
>> +	if (idx != -1)
>> +		this_cpu_inc(memcg->stat_numa->locality[idx]);
>> +	rcu_read_unlock();
>> +}
>> +#endif
王贇 July 12, 2019, 3:43 a.m. UTC | #4
On 2019/7/11 9:47 PM, Peter Zijlstra wrote:
[snip]
>> +	rcu_read_lock();
>> +	memcg = mem_cgroup_from_task(p);
>> +	if (idx != -1)
>> +		this_cpu_inc(memcg->stat_numa->locality[idx]);
> 
> I thought cgroups were supposed to be hierarchical. That is, if we have:
> 
>           R
> 	 / \
> 	 A
> 	/\
> 	  B
> 	  \
> 	   t1
> 
> Then our task t1 should be accounted to B (as you do), but also to A and
> R.

I get the point, but I'm not quite sure about this...

Unlike pages, there is no hierarchical limitation on locality, and tasks
running in a particular group have no influence on others. Not to mention the
extra overhead, is it really meaningful to account this hierarchically?

Regards,
Michael Wang

> 
>> +	rcu_read_unlock();
>> +}
>> +#endif
Peter Zijlstra July 12, 2019, 7:58 a.m. UTC | #5
On Fri, Jul 12, 2019 at 11:43:17AM +0800, 王贇 wrote:
> 
> 
> On 2019/7/11 9:47 PM, Peter Zijlstra wrote:
> [snip]
> >> +	rcu_read_lock();
> >> +	memcg = mem_cgroup_from_task(p);
> >> +	if (idx != -1)
> >> +		this_cpu_inc(memcg->stat_numa->locality[idx]);
> > 
> > I thought cgroups were supposed to be hierarchical. That is, if we have:
> > 
> >           R
> > 	 / \
> > 	 A
> > 	/\
> > 	  B
> > 	  \
> > 	   t1
> > 
> > Then our task t1 should be accounted to B (as you do), but also to A and
> > R.
> 
> I get the point, but I'm not quite sure about this...
> 
> Unlike pages, there is no hierarchical limitation on locality, and tasks

You can use cpusets to affect that.

> running in a particular group have no influence on others. Not to mention the
> extra overhead, is it really meaningful to account this hierarchically?

AFAIU it's a requirement of cgroups to be hierarchical. All our other
cgroup accounting is like that.
王贇 July 12, 2019, 9:11 a.m. UTC | #6
On 2019/7/12 3:58 PM, Peter Zijlstra wrote:
[snip]
>>>
>>> Then our task t1 should be accounted to B (as you do), but also to A and
>>> R.
>>
>> I get the point, but I'm not quite sure about this...
>>
>> Unlike pages, there is no hierarchical limitation on locality, and tasks
> 
> You can use cpusets to affect that.

Could you please give more detail on this?

> 
>> running in a particular group have no influence on others. Not to mention the
>> extra overhead, is it really meaningful to account this hierarchically?
> 
> AFAIU it's a requirement of cgroups to be hierarchical. All our other
> cgroup accounting is like that.

OK, we should respect the convention :-)

Regards,
Michael Wang

>
Peter Zijlstra July 12, 2019, 9:42 a.m. UTC | #7
On Fri, Jul 12, 2019 at 05:11:25PM +0800, 王贇 wrote:
> 
> 
> On 2019/7/12 3:58 PM, Peter Zijlstra wrote:
> [snip]
> >>>
> >>> Then our task t1 should be accounted to B (as you do), but also to A and
> >>> R.
> >>
> >> I get the point, but I'm not quite sure about this...
> >>
> >> Unlike pages, there is no hierarchical limitation on locality, and tasks
> > 
> > You can use cpusets to affect that.
> 
> Could you please give more detail on this?

Documentation/cgroup-v1/cpusets.txt

Look for mems_allowed.
王贇 July 12, 2019, 10:10 a.m. UTC | #8
On 2019/7/12 5:42 PM, Peter Zijlstra wrote:
> On Fri, Jul 12, 2019 at 05:11:25PM +0800, 王贇 wrote:
>>
>>
>> On 2019/7/12 3:58 PM, Peter Zijlstra wrote:
>> [snip]
>>>>>
>>>>> Then our task t1 should be accounted to B (as you do), but also to A and
>>>>> R.
>>>>
>>>> I get the point, but I'm not quite sure about this...
>>>>
>>>> Unlike pages, there is no hierarchical limitation on locality, and tasks
>>>
>>> You can use cpusets to affect that.
>>
>> Could you please give more detail on this?
> 
> Documentation/cgroup-v1/cpusets.txt
> 
> Look for mems_allowed.

That attribute belongs to the cpuset cgroup, doesn't it?

Forgive me, but I have no idea how to combine this
with the memory cgroup's hierarchical locality update...
a parent memory cgroup has no influence on the mems_allowed
of its children, correct?

What about just accounting the locality statistics of a child
memory cgroup into its ancestors?

Regards,
Michael Wang

>
王贇 July 15, 2019, 2:09 a.m. UTC | #9
On 2019/7/12 6:10 PM, 王贇 wrote:
[snip]
>>
>> Documentation/cgroup-v1/cpusets.txt
>>
>> Look for mems_allowed.
> 
> That attribute belongs to the cpuset cgroup, doesn't it?
> 
> Forgive me, but I have no idea how to combine this
> with the memory cgroup's hierarchical locality update...
> a parent memory cgroup has no influence on the mems_allowed
> of its children, correct?
> 
> What about just accounting the locality statistics of a child
> memory cgroup into its ancestors?

We have rethought this, and found no strong reason to stay
with the memory cgroup anymore.

We used to acquire the page count, exectime and locality together
from the memory cgroup, to make things easier for our numa balancer
module. Now that we use the numa group approach, maybe we can just
move this accounting into the cpu cgroup, so all these features
stay in one subsys and can be hierarchical :-)

Regards,
Michael Wang

> 
> Regards,
> Michael Wang
> 
>>
Michal Koutný July 15, 2019, 12:10 p.m. UTC | #10
Hello Yun.

On Fri, Jul 12, 2019 at 06:10:24PM +0800, 王贇  <yun.wang@linux.alibaba.com> wrote:
> Forgive me, but I have no idea how to combine this
> with the memory cgroup's hierarchical locality update...
> a parent memory cgroup has no influence on the mems_allowed
> of its children, correct?
I'd recommend looking at v2 of the cpuset controller, which implements
the hierarchical behavior among configured memory node sets.

(My comment would fit better under
    [PATCH 3/4] numa: introduce numa group per task group
IIUC, you could use the cpuset controller to constrain memory nodes.)

For the second part (accessing numa statistics, i.e. this patch), I
wonder whether this information wouldn't be better presented under the
cpuset controller too.

HTH,
Michal
王贇 July 16, 2019, 2:41 a.m. UTC | #11
Hi Michal,

Thx for the comments :-)

On 2019/7/15 8:10 PM, Michal Koutný wrote:
> Hello Yun.
> 
> On Fri, Jul 12, 2019 at 06:10:24PM +0800, 王贇  <yun.wang@linux.alibaba.com> wrote:
>> Forgive me, but I have no idea how to combine this
>> with the memory cgroup's hierarchical locality update...
>> a parent memory cgroup has no influence on the mems_allowed
>> of its children, correct?
> I'd recommend to look at the v2 of the cpuset controller that implements
> the hierarchical behavior among configured memory node sets.

Actually, whatever the allowed memory node set or cpu set is, it takes
effect on the task's behavior regarding memory and cpu placement, while
locality only cares about the results rather than the sets.

For example, if we bind tasks to the cpus of node 0 and allow memory
only on node 1, via the cgroup controller or madvise, then they will
run on node 0 with all their memory on node 1; on each PF for numa
balancing, the task accesses a page on node 1 from node 0 remotely, so
the locality will always be 0.
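
[Editor's note: a worked instance of this scenario as a standalone
sketch, using the equal-bucket mapping agreed earlier in the thread.
With zero local hinting faults, the task always lands in the lowest
bucket.]

#include <stdio.h>

#define NR_NL_INTERVAL 8	/* eight equal-sized locality buckets */

int main(void)
{
	/* cpus bound to node 0, memory allowed only on node 1 */
	unsigned long local = 0, remote = 1024;
	unsigned long idx = (NR_NL_INTERVAL * local) / (remote + local);

	printf("bucket %lu\n", idx);	/* prints 0: the lowest bucket */
	return 0;
}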

> 
> (My comment would better fit to 
>     [PATCH 3/4] numa: introduce numa group per task group
> IIUC, you could use cpuset controller to constraint memory nodes.)
> 
> For the second part (accessing numa statistics, i.e. this patch), I
> wonder wheter this information wouldn't be better presented under the
> cpuset controller too.

Yeah, we realized the cpu cgroup could be a better place to hold these
new statistics; both locality and exectime describe a task's running
behavior, related to memory placement rather than to memory behavior
itself. Will apply this in the next version.

Regards,
Michael Wang

> 
> HTH,
> Michal
>
Michal Koutný July 19, 2019, 4:47 p.m. UTC | #12
On Tue, Jul 16, 2019 at 10:41:36AM +0800, 王贇  <yun.wang@linux.alibaba.com> wrote:
> Actually, whatever the allowed memory node set or cpu set is, it takes
> effect on the task's behavior regarding memory and cpu placement, while
> locality only cares about the results rather than the sets.
My previous response missed much of the context, so it was a bit off.

I see what you mean by locality now. Alas, I can't assess whether it's
the right thing to do for the NUMA behavior you are trying to optimize
(i.e. you need an answer from someone more familiar with NUMA
balancing).

Michal

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2cbce1fe7780..0a30d14c9f43 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -174,6 +174,25 @@  enum memcg_kmem_state {
 	KMEM_ONLINE,
 };

+#ifdef CONFIG_NUMA_BALANCING
+
+enum memcg_numa_locality_interval {
+	PERCENT_0_29,
+	PERCENT_30_39,
+	PERCENT_40_49,
+	PERCENT_50_59,
+	PERCENT_60_69,
+	PERCENT_70_79,
+	PERCENT_80_89,
+	PERCENT_90_100,
+	NR_NL_INTERVAL,
+};
+
+struct memcg_stat_numa {
+	u64 locality[NR_NL_INTERVAL];
+};
+
+#endif
 #if defined(CONFIG_SMP)
 struct memcg_padding {
 	char x[0];
@@ -313,6 +332,10 @@  struct mem_cgroup {
 	struct list_head event_list;
 	spinlock_t event_list_lock;

+#ifdef CONFIG_NUMA_BALANCING
+	struct memcg_stat_numa __percpu *stat_numa;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };
@@ -795,6 +818,14 @@  static inline void memcg_memory_event_mm(struct mm_struct *mm,
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif

+#ifdef CONFIG_NUMA_BALANCING
+extern void memcg_stat_numa_update(struct task_struct *p);
+#else
+static inline void memcg_stat_numa_update(struct task_struct *p)
+{
+}
+#endif
+
 #else /* CONFIG_MEMCG */

 #define MEM_CGROUP_ID_SHIFT	0
@@ -1131,6 +1162,11 @@  static inline
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void memcg_stat_numa_update(struct task_struct *p)
+{
+}
+
 #endif /* CONFIG_MEMCG */

 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 907808f1acc5..eb26098de6ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,8 +1117,14 @@  struct task_struct {
 	 * scan window were remote/local or failed to migrate. The task scan
 	 * period is adapted based on the locality of the faults with different
 	 * weights depending on whether they were shared or private faults
+	 *
+	 * 0 -- remote faults
+	 * 1 -- local faults
+	 * 2 -- page migration failure
+	 * 3 -- remote page accessing
+	 * 4 -- local page accessing
 	 */
-	unsigned long			numa_faults_locality[3];
+	unsigned long			numa_faults_locality[5];

 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..473e6b7a1b8d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -849,6 +849,13 @@  static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 	SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
 			task_node(p), task_numa_group_id(p));
 	show_numa_stats(p, m);
+	SEQ_printf(m, "faults_locality local=%lu remote=%lu failed=%lu ",
+			p->numa_faults_locality[1],
+			p->numa_faults_locality[0],
+			p->numa_faults_locality[2]);
+	SEQ_printf(m, "lhit=%lu rhit=%lu\n",
+			p->numa_faults_locality[4],
+			p->numa_faults_locality[3]);
 	mpol_put(pol);
 #endif
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..b32304817eeb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@ 
 #include "sched.h"

 #include <trace/events/sched.h>
+#include <linux/memcontrol.h>

 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -2449,6 +2450,12 @@  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
 	p->numa_faults_locality[local] += pages;
+	/*
+	 * We want to have the real local/remote page access statistic
+	 * here, so use 'mem_node' which is the real residential node of
+	 * page after migrate_misplaced_page().
+	 */
+	p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
 }

 static void reset_ptenuma_scan(struct task_struct *p)
@@ -2625,6 +2632,8 @@  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
 		return;

+	memcg_stat_numa_update(curr);
+
 	/*
 	 * Using runtime rather than walltime has the dual advantage that
 	 * we (mostly) drive the selection from busy threads and that the
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3f67a6b6527..2edf3f5ac4b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -58,6 +58,7 @@ 
 #include <linux/file.h>
 #include <linux/tracehook.h>
 #include <linux/seq_buf.h>
+#include <linux/cpuset.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -3562,10 +3563,53 @@  static int memcg_numa_stat_show(struct seq_file *m, void *v)
 		seq_putc(m, '\n');
 	}

+#ifdef CONFIG_NUMA_BALANCING
+	seq_puts(m, "locality");
+	for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
+		int cpu;
+		u64 sum = 0;
+
+		for_each_possible_cpu(cpu)
+			sum += per_cpu(memcg->stat_numa->locality[nr], cpu);
+
+		seq_printf(m, " %u", jiffies_to_msecs(sum));
+	}
+	seq_putc(m, '\n');
+#endif
+
 	return 0;
 }
 #endif /* CONFIG_NUMA */

+#ifdef CONFIG_NUMA_BALANCING
+
+void memcg_stat_numa_update(struct task_struct *p)
+{
+	struct mem_cgroup *memcg;
+	unsigned long remote = p->numa_faults_locality[3];
+	unsigned long local = p->numa_faults_locality[4];
+	unsigned long idx = -1;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (remote || local) {
+		idx = ((local * 10) / (remote + local)) - 2;
+		/* 0~29% in one slot for cache align */
+		if (idx < PERCENT_0_29)
+			idx = PERCENT_0_29;
+		else if (idx >= NR_NL_INTERVAL)
+			idx = NR_NL_INTERVAL - 1;
+	}
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(p);
+	if (idx != -1)
+		this_cpu_inc(memcg->stat_numa->locality[idx]);
+	rcu_read_unlock();
+}
+#endif
+
 static const unsigned int memcg1_stats[] = {
 	MEMCG_CACHE,
 	MEMCG_RSS,
@@ -4641,6 +4685,9 @@  static void __mem_cgroup_free(struct mem_cgroup *memcg)

 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
+#ifdef CONFIG_NUMA_BALANCING
+	free_percpu(memcg->stat_numa);
+#endif
 	free_percpu(memcg->vmstats_percpu);
 	free_percpu(memcg->vmstats_local);
 	kfree(memcg);
@@ -4679,6 +4726,12 @@  static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg->vmstats_percpu)
 		goto fail;

+#ifdef CONFIG_NUMA_BALANCING
+	memcg->stat_numa = alloc_percpu(struct memcg_stat_numa);
+	if (!memcg->stat_numa)
+		goto fail;
+#endif
+
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_node_info(memcg, node))
 			goto fail;