diff mbox series

[RESEND,v8,1/2] sched/numa: introduce per-cgroup NUMA locality info

Message ID cde13472-46c0-7e17-175f-4b2ba4d8148a@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series sched/numa: introduce numa locality | expand

Commit Message

王贇 Feb. 7, 2020, 3:35 a.m. UTC
Currently there are no good approach to monitoring the per-cgroup NUMA
efficiency, this could be a trouble especially when groups are sharing
CPUs, we don't know which one introduced remote-memory accessing.

Although the per-task NUMA accessing info from PMU is good for further
debuging, but not light enough for daily monitoring, especial on a box
with thousands of tasks.

Fortunately, when NUMA Balancing enabled, it will periodly trigger page
fault and try to increase the NUMA locality, by tracing the results we
will be able to estimate the NUMA efficiency.

On each page fault of NUMA Balancing, when task's executing CPU is from
the same node of pages, we call this a local page accessing, otherwise
a remote page accessing.

By updating task's accessing counter into it's cgroup on ticks, we get
the per-cgroup numa locality info.

For example the new entry 'cpu.numa_stat' show:
  page_access local=1231412 remote=53453

Here we know the workloads in hierarchy have totally been traced 1284865
times of page accessing, and 1231412 of them are local page access, which
imply a good NUMA efficiency.

By monitoring the increments, we will be able to locate the per-cgroup
workload which NUMA Balancing can't helpwith (usually caused by wrong
CPU and memory node bindings), then we got chance to fix that in time.

Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 include/linux/sched.h        | 15 +++++++++
 include/linux/sched/sysctl.h |  6 ++++
 init/Kconfig                 |  9 ++++++
 kernel/sched/core.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          | 62 ++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h         | 12 +++++++
 kernel/sysctl.c              | 11 +++++++
 7 files changed, 190 insertions(+)

Comments

王贇 Feb. 7, 2020, 3:37 a.m. UTC | #1
Forwarded:

Hi, Peter, Ingo

Could you give some comments please?

As Mel replied previously, he won't disagree the idea, so we're looking
forward the opinion from the maintainers.

Please allow me to highlight the necessary of monitoring NUMA Balancing
again, this feature is critical to the performance on NUMA platform,
it cost and benefit -- lot or less, however there are not enough
information for an admin to analysis the trade-off, while locality could
be the missing piece.

Regards,
Michael Wang

On 2020/2/7 上午11:35, 王贇 wrote:
> Currently there are no good approach to monitoring the per-cgroup NUMA
> efficiency, this could be a trouble especially when groups are sharing
> CPUs, we don't know which one introduced remote-memory accessing.
> 
> Although the per-task NUMA accessing info from PMU is good for further
> debuging, but not light enough for daily monitoring, especial on a box
> with thousands of tasks.
> 
> Fortunately, when NUMA Balancing enabled, it will periodly trigger page
> fault and try to increase the NUMA locality, by tracing the results we
> will be able to estimate the NUMA efficiency.
> 
> On each page fault of NUMA Balancing, when task's executing CPU is from
> the same node of pages, we call this a local page accessing, otherwise
> a remote page accessing.
> 
> By updating task's accessing counter into it's cgroup on ticks, we get
> the per-cgroup numa locality info.
> 
> For example the new entry 'cpu.numa_stat' show:
>   page_access local=1231412 remote=53453
> 
> Here we know the workloads in hierarchy have totally been traced 1284865
> times of page accessing, and 1231412 of them are local page access, which
> imply a good NUMA efficiency.
> 
> By monitoring the increments, we will be able to locate the per-cgroup
> workload which NUMA Balancing can't helpwith (usually caused by wrong
> CPU and memory node bindings), then we got chance to fix that in time.
> 
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Michal Koutný <mkoutny@suse.com>
> Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
> ---
>  include/linux/sched.h        | 15 +++++++++
>  include/linux/sched/sysctl.h |  6 ++++
>  init/Kconfig                 |  9 ++++++
>  kernel/sched/core.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c          | 62 ++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h         | 12 +++++++
>  kernel/sysctl.c              | 11 +++++++
>  7 files changed, 190 insertions(+)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a6c924fa1c77..74bf234bae53 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1128,6 +1128,21 @@ struct task_struct {
>  	unsigned long			numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
> 
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	/*
> +	 * Counter index stand for:
> +	 * 0 -- remote page accessing
> +	 * 1 -- local page accessing
> +	 * 2 -- remote page accessing updated to cgroup
> +	 * 3 -- local page accessing updated to cgroup
> +	 *
> +	 * We record the counter before the end of task_numa_fault(), this
> +	 * is based on the fact that after page fault is handled, the task
> +	 * will access the page on the CPU where it triggered the PF.
> +	 */
> +	unsigned long			numa_page_access[4];
> +#endif
> +
>  #ifdef CONFIG_RSEQ
>  	struct rseq __user *rseq;
>  	u32 rseq_sig;
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index d4f6215ee03f..bb3721cf48e0 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -101,4 +101,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
>  				 loff_t *ppos);
>  #endif
> 
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +extern int sysctl_numa_locality(struct ctl_table *table, int write,
> +				 void __user *buffer, size_t *lenp,
> +				 loff_t *ppos);
> +#endif
> +
>  #endif /* _LINUX_SCHED_SYSCTL_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 322fd2c65304..63c6b90a515d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -813,6 +813,15 @@ config NUMA_BALANCING_DEFAULT_ENABLED
>  	  If set, automatic NUMA balancing will be enabled if running on a NUMA
>  	  machine.
> 
> +config CGROUP_NUMA_LOCALITY
> +	bool "per-cgroup NUMA Locality"
> +	default n
> +	depends on CGROUP_SCHED && NUMA_BALANCING
> +	help
> +	  This option enables the collection of per-cgroup NUMA locality info,
> +	  to tell whether NUMA Balancing is working well for a particular
> +	  workload, also imply the NUMA efficiency.
> +
>  menuconfig CGROUPS
>  	bool "Control Group support"
>  	select KERNFS
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e7b08d52db93..40dd6b221eef 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7657,6 +7657,68 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>  }
>  #endif /* CONFIG_RT_GROUP_SCHED */
> 
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +DEFINE_STATIC_KEY_FALSE(sched_numa_locality);
> +
> +#ifdef CONFIG_PROC_SYSCTL
> +int sysctl_numa_locality(struct ctl_table *table, int write,
> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	int err;
> +	int state = static_branch_likely(&sched_numa_locality);
> +
> +	if (write && !capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	t = *table;
> +	t.data = &state;
> +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0 || !write)
> +		return err;
> +
> +	if (state)
> +		static_branch_enable(&sched_numa_locality);
> +	else
> +		static_branch_disable(&sched_numa_locality);
> +
> +	return err;
> +}
> +#endif
> +
> +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
> +{
> +	return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
> +}
> +
> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
> +{
> +	int cpu;
> +	u64 local = 0, remote = 0;
> +	struct task_group *tg = css_tg(seq_css(sf));
> +
> +	if (!static_branch_likely(&sched_numa_locality))
> +		return 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		local += tg_cfs_rq(tg, cpu)->local_page_access;
> +		remote += tg_cfs_rq(tg, cpu)->remote_page_access;
> +	}
> +
> +	seq_printf(sf, "page_access local=%llu remote=%llu\n", local, remote);
> +
> +	return 0;
> +}
> +
> +static __init int numa_locality_setup(char *opt)
> +{
> +	static_branch_enable(&sched_numa_locality);
> +
> +	return 0;
> +}
> +__setup("numa_locality", numa_locality_setup);
> +#endif
> +
>  static struct cftype cpu_legacy_files[] = {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	{
> @@ -7706,6 +7768,12 @@ static struct cftype cpu_legacy_files[] = {
>  		.seq_show = cpu_uclamp_max_show,
>  		.write = cpu_uclamp_max_write,
>  	},
> +#endif
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	{
> +		.name = "numa_stat",
> +		.seq_show = cpu_numa_stat_show,
> +	},
>  #endif
>  	{ }	/* Terminate */
>  };
> @@ -7887,6 +7955,13 @@ static struct cftype cpu_files[] = {
>  		.seq_show = cpu_uclamp_max_show,
>  		.write = cpu_uclamp_max_write,
>  	},
> +#endif
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	{
> +		.name = "numa_stat",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.seq_show = cpu_numa_stat_show,
> +	},
>  #endif
>  	{ }	/* terminate */
>  };
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 2d170b5da0e3..eb838557bae2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1049,7 +1049,63 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   * Scheduling class queueing methods:
>   */
> 
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +/*
> + * We want to record the real local/remote page access statistic
> + * here, so 'pnid' should be pages's real residential node after
> + * migrate_misplaced_page(), and 'cnid' should be the node of CPU
> + * where triggered the PF.
> + */
> +static inline void
> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
> +{
> +	if (!static_branch_unlikely(&sched_numa_locality))
> +		return;
> +
> +	/*
> +	 * pnid != cnid --> remote idx 0
> +	 * pnid == cnid --> local idx 1
> +	 */
> +	p->numa_page_access[!!(pnid == cnid)] += pages;
> +}
> +
> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
> +{
> +	unsigned long ldiff, rdiff;
> +
> +	if (!static_branch_unlikely(&sched_numa_locality))
> +		return;
> +
> +	rdiff = current->numa_page_access[0] - current->numa_page_access[2];
> +	ldiff = current->numa_page_access[1] - current->numa_page_access[3];
> +	if (!ldiff && !rdiff)
> +		return;
> +
> +	cfs_rq->local_page_access += ldiff;
> +	cfs_rq->remote_page_access += rdiff;
> +
> +	/*
> +	 * Consider updated when reach root cfs_rq, no NUMA Balancing PF
> +	 * should happen on current task during the hierarchical updating.
> +	 */
> +	if (&cfs_rq->rq->cfs == cfs_rq) {
> +		current->numa_page_access[2] = current->numa_page_access[0];
> +		current->numa_page_access[3] = current->numa_page_access[1];
> +	}
> +}
> +#else
> +static inline void
> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
> +{
> +}
> +
> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
> +{
> +}
> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
> +
>  #ifdef CONFIG_NUMA_BALANCING
> +
>  /*
>   * Approximate time to scan a full NUMA task in ms. The task scan period is
>   * calculated based on the tasks virtual memory size and
> @@ -2465,6 +2521,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>  	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>  	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>  	p->numa_faults_locality[local] += pages;
> +
> +	update_task_locality(p, mem_node, numa_node_id(), pages);
>  }
> 
>  static void reset_ptenuma_scan(struct task_struct *p)
> @@ -2650,6 +2708,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
>  	p->last_sum_exec_runtime	= 0;
> 
>  	init_task_work(&p->numa_work, task_numa_work);
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	memset(p->numa_page_access, 0, sizeof(p->numa_page_access));
> +#endif
> 
>  	/* New address space, reset the preferred nid */
>  	if (!(clone_flags & CLONE_VM)) {
> @@ -4313,6 +4374,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	 */
>  	update_load_avg(cfs_rq, curr, UPDATE_TG);
>  	update_cfs_group(curr);
> +	update_group_locality(cfs_rq);
> 
>  #ifdef CONFIG_SCHED_HRTICK
>  	/*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1a88dc8ad11b..66b4e581b6ed 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -575,6 +575,14 @@ struct cfs_rq {
>  	struct list_head	throttled_list;
>  #endif /* CONFIG_CFS_BANDWIDTH */
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	/*
> +	 * The local/remote page access info collected from all
> +	 * the tasks in hierarchy.
> +	 */
> +	u64			local_page_access;
> +	u64			remote_page_access;
> +#endif
>  };
> 
>  static inline int rt_bandwidth_enabled(void)
> @@ -1601,6 +1609,10 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
>  extern struct static_key_false sched_numa_balancing;
>  extern struct static_key_false sched_schedstats;
> 
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +extern struct static_key_false sched_numa_locality;
> +#endif
> +
>  static inline u64 global_rt_period(void)
>  {
>  	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index d396aaaf19a3..a8f5951f92b3 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -428,6 +428,17 @@ static struct ctl_table kern_table[] = {
>  		.extra2		= SYSCTL_ONE,
>  	},
>  #endif /* CONFIG_NUMA_BALANCING */
> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
> +	{
> +		.procname	= "numa_locality",
> +		.data		= NULL, /* filled in by handler */
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= sysctl_numa_locality,
> +		.extra1		= SYSCTL_ZERO,
> +		.extra2		= SYSCTL_ONE,
> +	},
> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>  #endif /* CONFIG_SCHED_DEBUG */
>  	{
>  		.procname	= "sched_rt_period_us",
>
王贇 Feb. 13, 2020, 2:35 a.m. UTC | #2
Hi, Peter

Could you please give some comments on this one?

Regards,
Michael Wang

On 2020/2/7 上午11:37, 王贇 wrote:
> Forwarded:
> 
> Hi, Peter, Ingo
> 
> Could you give some comments please?
> 
> As Mel replied previously, he won't disagree the idea, so we're looking
> forward the opinion from the maintainers.
> 
> Please allow me to highlight the necessary of monitoring NUMA Balancing
> again, this feature is critical to the performance on NUMA platform,
> it cost and benefit -- lot or less, however there are not enough
> information for an admin to analysis the trade-off, while locality could
> be the missing piece.
> 
> Regards,
> Michael Wang
> 
> On 2020/2/7 上午11:35, 王贇 wrote:
>> Currently there are no good approach to monitoring the per-cgroup NUMA
>> efficiency, this could be a trouble especially when groups are sharing
>> CPUs, we don't know which one introduced remote-memory accessing.
>>
>> Although the per-task NUMA accessing info from PMU is good for further
>> debuging, but not light enough for daily monitoring, especial on a box
>> with thousands of tasks.
>>
>> Fortunately, when NUMA Balancing enabled, it will periodly trigger page
>> fault and try to increase the NUMA locality, by tracing the results we
>> will be able to estimate the NUMA efficiency.
>>
>> On each page fault of NUMA Balancing, when task's executing CPU is from
>> the same node of pages, we call this a local page accessing, otherwise
>> a remote page accessing.
>>
>> By updating task's accessing counter into it's cgroup on ticks, we get
>> the per-cgroup numa locality info.
>>
>> For example the new entry 'cpu.numa_stat' show:
>>   page_access local=1231412 remote=53453
>>
>> Here we know the workloads in hierarchy have totally been traced 1284865
>> times of page accessing, and 1231412 of them are local page access, which
>> imply a good NUMA efficiency.
>>
>> By monitoring the increments, we will be able to locate the per-cgroup
>> workload which NUMA Balancing can't helpwith (usually caused by wrong
>> CPU and memory node bindings), then we got chance to fix that in time.
>>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Michal Koutný <mkoutny@suse.com>
>> Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
>> ---
>>  include/linux/sched.h        | 15 +++++++++
>>  include/linux/sched/sysctl.h |  6 ++++
>>  init/Kconfig                 |  9 ++++++
>>  kernel/sched/core.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/sched/fair.c          | 62 ++++++++++++++++++++++++++++++++++++
>>  kernel/sched/sched.h         | 12 +++++++
>>  kernel/sysctl.c              | 11 +++++++
>>  7 files changed, 190 insertions(+)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index a6c924fa1c77..74bf234bae53 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1128,6 +1128,21 @@ struct task_struct {
>>  	unsigned long			numa_pages_migrated;
>>  #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	/*
>> +	 * Counter index stand for:
>> +	 * 0 -- remote page accessing
>> +	 * 1 -- local page accessing
>> +	 * 2 -- remote page accessing updated to cgroup
>> +	 * 3 -- local page accessing updated to cgroup
>> +	 *
>> +	 * We record the counter before the end of task_numa_fault(), this
>> +	 * is based on the fact that after page fault is handled, the task
>> +	 * will access the page on the CPU where it triggered the PF.
>> +	 */
>> +	unsigned long			numa_page_access[4];
>> +#endif
>> +
>>  #ifdef CONFIG_RSEQ
>>  	struct rseq __user *rseq;
>>  	u32 rseq_sig;
>> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
>> index d4f6215ee03f..bb3721cf48e0 100644
>> --- a/include/linux/sched/sysctl.h
>> +++ b/include/linux/sched/sysctl.h
>> @@ -101,4 +101,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
>>  				 loff_t *ppos);
>>  #endif
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern int sysctl_numa_locality(struct ctl_table *table, int write,
>> +				 void __user *buffer, size_t *lenp,
>> +				 loff_t *ppos);
>> +#endif
>> +
>>  #endif /* _LINUX_SCHED_SYSCTL_H */
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 322fd2c65304..63c6b90a515d 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -813,6 +813,15 @@ config NUMA_BALANCING_DEFAULT_ENABLED
>>  	  If set, automatic NUMA balancing will be enabled if running on a NUMA
>>  	  machine.
>>
>> +config CGROUP_NUMA_LOCALITY
>> +	bool "per-cgroup NUMA Locality"
>> +	default n
>> +	depends on CGROUP_SCHED && NUMA_BALANCING
>> +	help
>> +	  This option enables the collection of per-cgroup NUMA locality info,
>> +	  to tell whether NUMA Balancing is working well for a particular
>> +	  workload, also imply the NUMA efficiency.
>> +
>>  menuconfig CGROUPS
>>  	bool "Control Group support"
>>  	select KERNFS
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index e7b08d52db93..40dd6b221eef 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -7657,6 +7657,68 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>>  }
>>  #endif /* CONFIG_RT_GROUP_SCHED */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +DEFINE_STATIC_KEY_FALSE(sched_numa_locality);
>> +
>> +#ifdef CONFIG_PROC_SYSCTL
>> +int sysctl_numa_locality(struct ctl_table *table, int write,
>> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> +	struct ctl_table t;
>> +	int err;
>> +	int state = static_branch_likely(&sched_numa_locality);
>> +
>> +	if (write && !capable(CAP_SYS_ADMIN))
>> +		return -EPERM;
>> +
>> +	t = *table;
>> +	t.data = &state;
>> +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
>> +	if (err < 0 || !write)
>> +		return err;
>> +
>> +	if (state)
>> +		static_branch_enable(&sched_numa_locality);
>> +	else
>> +		static_branch_disable(&sched_numa_locality);
>> +
>> +	return err;
>> +}
>> +#endif
>> +
>> +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
>> +{
>> +	return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
>> +}
>> +
>> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
>> +{
>> +	int cpu;
>> +	u64 local = 0, remote = 0;
>> +	struct task_group *tg = css_tg(seq_css(sf));
>> +
>> +	if (!static_branch_likely(&sched_numa_locality))
>> +		return 0;
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		local += tg_cfs_rq(tg, cpu)->local_page_access;
>> +		remote += tg_cfs_rq(tg, cpu)->remote_page_access;
>> +	}
>> +
>> +	seq_printf(sf, "page_access local=%llu remote=%llu\n", local, remote);
>> +
>> +	return 0;
>> +}
>> +
>> +static __init int numa_locality_setup(char *opt)
>> +{
>> +	static_branch_enable(&sched_numa_locality);
>> +
>> +	return 0;
>> +}
>> +__setup("numa_locality", numa_locality_setup);
>> +#endif
>> +
>>  static struct cftype cpu_legacy_files[] = {
>>  #ifdef CONFIG_FAIR_GROUP_SCHED
>>  	{
>> @@ -7706,6 +7768,12 @@ static struct cftype cpu_legacy_files[] = {
>>  		.seq_show = cpu_uclamp_max_show,
>>  		.write = cpu_uclamp_max_write,
>>  	},
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.name = "numa_stat",
>> +		.seq_show = cpu_numa_stat_show,
>> +	},
>>  #endif
>>  	{ }	/* Terminate */
>>  };
>> @@ -7887,6 +7955,13 @@ static struct cftype cpu_files[] = {
>>  		.seq_show = cpu_uclamp_max_show,
>>  		.write = cpu_uclamp_max_write,
>>  	},
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.name = "numa_stat",
>> +		.flags = CFTYPE_NOT_ON_ROOT,
>> +		.seq_show = cpu_numa_stat_show,
>> +	},
>>  #endif
>>  	{ }	/* terminate */
>>  };
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 2d170b5da0e3..eb838557bae2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1049,7 +1049,63 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>   * Scheduling class queueing methods:
>>   */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +/*
>> + * We want to record the real local/remote page access statistic
>> + * here, so 'pnid' should be pages's real residential node after
>> + * migrate_misplaced_page(), and 'cnid' should be the node of CPU
>> + * where triggered the PF.
>> + */
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> +	if (!static_branch_unlikely(&sched_numa_locality))
>> +		return;
>> +
>> +	/*
>> +	 * pnid != cnid --> remote idx 0
>> +	 * pnid == cnid --> local idx 1
>> +	 */
>> +	p->numa_page_access[!!(pnid == cnid)] += pages;
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> +	unsigned long ldiff, rdiff;
>> +
>> +	if (!static_branch_unlikely(&sched_numa_locality))
>> +		return;
>> +
>> +	rdiff = current->numa_page_access[0] - current->numa_page_access[2];
>> +	ldiff = current->numa_page_access[1] - current->numa_page_access[3];
>> +	if (!ldiff && !rdiff)
>> +		return;
>> +
>> +	cfs_rq->local_page_access += ldiff;
>> +	cfs_rq->remote_page_access += rdiff;
>> +
>> +	/*
>> +	 * Consider updated when reach root cfs_rq, no NUMA Balancing PF
>> +	 * should happen on current task during the hierarchical updating.
>> +	 */
>> +	if (&cfs_rq->rq->cfs == cfs_rq) {
>> +		current->numa_page_access[2] = current->numa_page_access[0];
>> +		current->numa_page_access[3] = current->numa_page_access[1];
>> +	}
>> +}
>> +#else
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>> +
>>  #ifdef CONFIG_NUMA_BALANCING
>> +
>>  /*
>>   * Approximate time to scan a full NUMA task in ms. The task scan period is
>>   * calculated based on the tasks virtual memory size and
>> @@ -2465,6 +2521,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>>  	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>>  	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>>  	p->numa_faults_locality[local] += pages;
>> +
>> +	update_task_locality(p, mem_node, numa_node_id(), pages);
>>  }
>>
>>  static void reset_ptenuma_scan(struct task_struct *p)
>> @@ -2650,6 +2708,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
>>  	p->last_sum_exec_runtime	= 0;
>>
>>  	init_task_work(&p->numa_work, task_numa_work);
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	memset(p->numa_page_access, 0, sizeof(p->numa_page_access));
>> +#endif
>>
>>  	/* New address space, reset the preferred nid */
>>  	if (!(clone_flags & CLONE_VM)) {
>> @@ -4313,6 +4374,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>>  	 */
>>  	update_load_avg(cfs_rq, curr, UPDATE_TG);
>>  	update_cfs_group(curr);
>> +	update_group_locality(cfs_rq);
>>
>>  #ifdef CONFIG_SCHED_HRTICK
>>  	/*
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 1a88dc8ad11b..66b4e581b6ed 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -575,6 +575,14 @@ struct cfs_rq {
>>  	struct list_head	throttled_list;
>>  #endif /* CONFIG_CFS_BANDWIDTH */
>>  #endif /* CONFIG_FAIR_GROUP_SCHED */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	/*
>> +	 * The local/remote page access info collected from all
>> +	 * the tasks in hierarchy.
>> +	 */
>> +	u64			local_page_access;
>> +	u64			remote_page_access;
>> +#endif
>>  };
>>
>>  static inline int rt_bandwidth_enabled(void)
>> @@ -1601,6 +1609,10 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
>>  extern struct static_key_false sched_numa_balancing;
>>  extern struct static_key_false sched_schedstats;
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern struct static_key_false sched_numa_locality;
>> +#endif
>> +
>>  static inline u64 global_rt_period(void)
>>  {
>>  	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index d396aaaf19a3..a8f5951f92b3 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -428,6 +428,17 @@ static struct ctl_table kern_table[] = {
>>  		.extra2		= SYSCTL_ONE,
>>  	},
>>  #endif /* CONFIG_NUMA_BALANCING */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.procname	= "numa_locality",
>> +		.data		= NULL, /* filled in by handler */
>> +		.maxlen		= sizeof(unsigned int),
>> +		.mode		= 0644,
>> +		.proc_handler	= sysctl_numa_locality,
>> +		.extra1		= SYSCTL_ZERO,
>> +		.extra2		= SYSCTL_ONE,
>> +	},
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>>  #endif /* CONFIG_SCHED_DEBUG */
>>  	{
>>  		.procname	= "sched_rt_period_us",
>>
Peter Zijlstra Feb. 14, 2020, 3:10 p.m. UTC | #3
On Fri, Feb 07, 2020 at 11:35:30AM +0800, 王贇 wrote:
> Currently there are no good approach to monitoring the per-cgroup NUMA
> efficiency, this could be a trouble especially when groups are sharing
> CPUs, we don't know which one introduced remote-memory accessing.
> 
> Although the per-task NUMA accessing info from PMU is good for further
> debuging, but not light enough for daily monitoring, especial on a box
> with thousands of tasks.
> 
> Fortunately, when NUMA Balancing enabled, it will periodly trigger page
> fault and try to increase the NUMA locality, by tracing the results we
> will be able to estimate the NUMA efficiency.
> 
> On each page fault of NUMA Balancing, when task's executing CPU is from
> the same node of pages, we call this a local page accessing, otherwise
> a remote page accessing.
> 
> By updating task's accessing counter into it's cgroup on ticks, we get
> the per-cgroup numa locality info.
> 
> For example the new entry 'cpu.numa_stat' show:
>   page_access local=1231412 remote=53453
> 
> Here we know the workloads in hierarchy have totally been traced 1284865
> times of page accessing, and 1231412 of them are local page access, which
> imply a good NUMA efficiency.
> 
> By monitoring the increments, we will be able to locate the per-cgroup
> workload which NUMA Balancing can't helpwith (usually caused by wrong
> CPU and memory node bindings), then we got chance to fix that in time.
> 
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Michal Koutný <mkoutny@suse.com>
> Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>

So here:

  https://lkml.kernel.org/r/20191127101932.GN28938@suse.de

Mel argues that the information exposed is fairly implementation
specific and hard to use without understanding how NUMA balancing works.

By exposing it to userspace, we tie ourselves to these particulars. We
can no longer change these NUMA balancing details if we wanted to, due
to UAPI concerns.

Mel, I suspect you still feel that way, right?

In the document (patch 2/2) you write:

> +However, there are no hardware counters for per-task local/remote accessing
> +info, we don't know how many remote page accesses have occurred for a
> +particular task.

We can of course 'fix' that by adding a tracepoint.

Mel, would you feel better by having a tracepoint in task_numa_fault() ?

Now I'm not really a fan of tracepoints myself, since they also
establish a UAPI, but perhaps it is a lesser evil in this case.
Mel Gorman Feb. 17, 2020, 11:58 a.m. UTC | #4
On Fri, Feb 14, 2020 at 04:10:48PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2020 at 11:35:30AM +0800, ?????? wrote:
> > By monitoring the increments, we will be able to locate the per-cgroup
> > workload which NUMA Balancing can't helpwith (usually caused by wrong
> > CPU and memory node bindings), then we got chance to fix that in time.
> > 
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Michal Koutný <mkoutny@suse.com>
> > Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
> 
> So here:
> 
>   https://lkml.kernel.org/r/20191127101932.GN28938@suse.de
> 
> Mel argues that the information exposed is fairly implementation
> specific and hard to use without understanding how NUMA balancing works.
> 
> By exposing it to userspace, we tie ourselves to these particulars. We
> can no longer change these NUMA balancing details if we wanted to, due
> to UAPI concerns.
> 
> Mel, I suspect you still feel that way, right?
> 

Yes, I still think it would be a struggle to interpret the data
meaningfully without very specific knowledge of the implementation. If
the scan rate was constant, it would be easier but that would make NUMA
balancing worse overall. Similarly, the stat might get very difficult to
interpret when NUMA balancing is failing because of a load imbalance,
pages are shared and being interleaved or NUMA groups span multiple
active nodes.

For example, the series that reconciles NUMA and CPU balancers may look
worse in these stats even though the overall performance may be better.

> In the document (patch 2/2) you write:
> 
> > +However, there are no hardware counters for per-task local/remote accessing
> > +info, we don't know how many remote page accesses have occurred for a
> > +particular task.
> 
> We can of course 'fix' that by adding a tracepoint.
> 
> Mel, would you feel better by having a tracepoint in task_numa_fault() ?
> 

A bit, although interpreting the data would still be difficult and the
tracepoint would have to include information about the cgroup. While
I've never tried, this seems like the type of thing that would be suited
to a BPF script that probes task_numa_fault and extract the information
it needs.
王贇 Feb. 17, 2020, 1:23 p.m. UTC | #5
On 2020/2/17 下午7:58, Mel Gorman wrote:
[snip]
>> Mel, I suspect you still feel that way, right?
>>
> 
> Yes, I still think it would be a struggle to interpret the data
> meaningfully without very specific knowledge of the implementation. If
> the scan rate was constant, it would be easier but that would make NUMA
> balancing worse overall. Similarly, the stat might get very difficult to
> interpret when NUMA balancing is failing because of a load imbalance,
> pages are shared and being interleaved or NUMA groups span multiple
> active nodes.

Hi, Mel, appreciated to have you back on the table :-)

IMHO the scan period changing should not be a problem now, since the
maximum period is defined by user, so monitoring at maximum period
on the accumulated page accessing counters is always meaningful, correct?

FYI, by monitoring locality, we found that the kvm vcpu thread is not
covered by NUMA Balancing, whatever how many maximum period passed, the
counters are not increasing, or very slowly, although inside guest we are
copying memory.

Later we found such task rarely exit to user space to trigger task
work callbacks, and NUMA Balancing scan depends on that, which help us
realize the importance to enable NUMA Balancing inside guest, with the
correct NUMA topo, a big performance risk I'll say :-P

Maybe not a good example, but we just try to highlight that NUMA Balancing
could have issue in some cases, and we want them to be exposed, somehow,
maybe by the locality.

Regards,
Michael Wang

> 
> For example, the series that reconciles NUMA and CPU balancers may look
> worse in these stats even though the overall performance may be better.
> 
>> In the document (patch 2/2) you write:
>>
>>> +However, there are no hardware counters for per-task local/remote accessing
>>> +info, we don't know how many remote page accesses have occurred for a
>>> +particular task.
>>
>> We can of course 'fix' that by adding a tracepoint.
>>
>> Mel, would you feel better by having a tracepoint in task_numa_fault() ?
>>
> 
> A bit, although interpreting the data would still be difficult and the
> tracepoint would have to include information about the cgroup. While
> I've never tried, this seems like the type of thing that would be suited
> to a BPF script that probes task_numa_fault and extract the information
> it needs.

>
Mel Gorman Feb. 17, 2020, 2:16 p.m. UTC | #6
On Mon, Feb 17, 2020 at 09:23:52PM +0800, ?????? wrote:
> 
> 
> On 2020/2/17 ??????7:58, Mel Gorman wrote:
> [snip]
> >> Mel, I suspect you still feel that way, right?
> >>
> > 
> > Yes, I still think it would be a struggle to interpret the data
> > meaningfully without very specific knowledge of the implementation. If
> > the scan rate was constant, it would be easier but that would make NUMA
> > balancing worse overall. Similarly, the stat might get very difficult to
> > interpret when NUMA balancing is failing because of a load imbalance,
> > pages are shared and being interleaved or NUMA groups span multiple
> > active nodes.
> 
> Hi, Mel, appreciated to have you back on the table :-)
> 
> IMHO the scan period changing should not be a problem now, since the
> maximum period is defined by user, so monitoring at maximum period
> on the accumulated page accessing counters is always meaningful, correct?
> 

It has meaning but the scan rate drives the fault rate which is the basis
for the stats you accumulate. If the scan rate is high when accesses
are local, the stats can be skewed making it appear the task is much
more local than it may really is at a later point in time. The scan rate
affects the accuracy of the information. The counters have meaning but
they needs careful interpretation.

> FYI, by monitoring locality, we found that the kvm vcpu thread is not
> covered by NUMA Balancing, whatever how many maximum period passed, the
> counters are not increasing, or very slowly, although inside guest we are
> copying memory.
> 
> Later we found such task rarely exit to user space to trigger task
> work callbacks, and NUMA Balancing scan depends on that, which help us
> realize the importance to enable NUMA Balancing inside guest, with the
> correct NUMA topo, a big performance risk I'll say :-P
> 

Which is a very interesting corner case in itself but also one that
could have potentially have been inferred from monitoring /proc/vmstat
numa_pte_updates or on a per-task basis by monitoring /proc/PID/sched and
watching numa_scan_seq and total_numa_faults. Accumulating the information
on a per-cgroup basis would require a bit more legwork.

> Maybe not a good example, but we just try to highlight that NUMA Balancing
> could have issue in some cases, and we want them to be exposed, somehow,
> maybe by the locality.
> 

Again, I'm somewhat neutral on the patch simply because I would not use
the information for debugging problems with NUMA balancing. I would try
using tracepoints and if the tracepoints were not good enough, I'd add or
fix them -- similar to what I had to do with sched_stick_numa recently.
The caveat is that I mostly look at this sort of problem as a developer.
Sysadmins have very different requirements, especially simplicity even
if the simplicity in this case is an illusion.
王贇 Feb. 18, 2020, 1:39 a.m. UTC | #7
On 2020/2/17 下午10:16, Mel Gorman wrote:
> On Mon, Feb 17, 2020 at 09:23:52PM +0800, ?????? wrote:
[snip]
>>
>> IMHO the scan period changing should not be a problem now, since the
>> maximum period is defined by user, so monitoring at maximum period
>> on the accumulated page accessing counters is always meaningful, correct?
>>
> 
> It has meaning but the scan rate drives the fault rate which is the basis
> for the stats you accumulate. If the scan rate is high when accesses
> are local, the stats can be skewed making it appear the task is much
> more local than it may really is at a later point in time. The scan rate
> affects the accuracy of the information. The counters have meaning but
> they needs careful interpretation.

Yeah, to zip so many information from NUMA Balancing to some statistics
is a challenge itself, the locality still not so easy to be understood by
NUMA newbie :-P

> 
>> FYI, by monitoring locality, we found that the kvm vcpu thread is not
>> covered by NUMA Balancing, whatever how many maximum period passed, the
>> counters are not increasing, or very slowly, although inside guest we are
>> copying memory.
>>
>> Later we found such task rarely exit to user space to trigger task
>> work callbacks, and NUMA Balancing scan depends on that, which help us
>> realize the importance to enable NUMA Balancing inside guest, with the
>> correct NUMA topo, a big performance risk I'll say :-P
>>
> 
> Which is a very interesting corner case in itself but also one that
> could have potentially have been inferred from monitoring /proc/vmstat
> numa_pte_updates or on a per-task basis by monitoring /proc/PID/sched and
> watching numa_scan_seq and total_numa_faults. Accumulating the information
> on a per-cgroup basis would require a bit more legwork.

That's not working for daily monitoring...

Besides, compared with locality, this require much more deeper understand
on the implementation, which could even be tough for NUMA developers to
assemble all these statistics together.

> 
>> Maybe not a good example, but we just try to highlight that NUMA Balancing
>> could have issue in some cases, and we want them to be exposed, somehow,
>> maybe by the locality.
>>
> 
> Again, I'm somewhat neutral on the patch simply because I would not use
> the information for debugging problems with NUMA balancing. I would try
> using tracepoints and if the tracepoints were not good enough, I'd add or
> fix them -- similar to what I had to do with sched_stick_numa recently.
> The caveat is that I mostly look at this sort of problem as a developer.
> Sysadmins have very different requirements, especially simplicity even
> if the simplicity in this case is an illusion.

Fair enough, but I guess PeterZ still want your Ack, so neutral means
refuse in this case :-(

BTW, how do you think about the documentation in second patch?

Do you think it's necessary to have a doc to explain NUMA related statistics?

Regards,
Michael Wang

>
Mel Gorman Feb. 21, 2020, 2:20 p.m. UTC | #8
On Tue, Feb 18, 2020 at 09:39:35AM +0800, ?????? wrote:
> On 2020/2/17 ??????10:16, Mel Gorman wrote:
> > On Mon, Feb 17, 2020 at 09:23:52PM +0800, ?????? wrote:
> [snip]
> >>
> >> IMHO the scan period changing should not be a problem now, since the
> >> maximum period is defined by user, so monitoring at maximum period
> >> on the accumulated page accessing counters is always meaningful, correct?
> >>
> > 
> > It has meaning but the scan rate drives the fault rate which is the basis
> > for the stats you accumulate. If the scan rate is high when accesses
> > are local, the stats can be skewed making it appear the task is much
> > more local than it may really is at a later point in time. The scan rate
> > affects the accuracy of the information. The counters have meaning but
> > they needs careful interpretation.
> 
> Yeah, to zip so many information from NUMA Balancing to some statistics
> is a challenge itself, the locality still not so easy to be understood by
> NUMA newbie :-P
> 

Indeed and if they do not take into account historical skew into
account, they still might not understand.

> > 
> >> FYI, by monitoring locality, we found that the kvm vcpu thread is not
> >> covered by NUMA Balancing, whatever how many maximum period passed, the
> >> counters are not increasing, or very slowly, although inside guest we are
> >> copying memory.
> >>
> >> Later we found such task rarely exit to user space to trigger task
> >> work callbacks, and NUMA Balancing scan depends on that, which help us
> >> realize the importance to enable NUMA Balancing inside guest, with the
> >> correct NUMA topo, a big performance risk I'll say :-P
> >>
> > 
> > Which is a very interesting corner case in itself but also one that
> > could have potentially have been inferred from monitoring /proc/vmstat
> > numa_pte_updates or on a per-task basis by monitoring /proc/PID/sched and
> > watching numa_scan_seq and total_numa_faults. Accumulating the information
> > on a per-cgroup basis would require a bit more legwork.
> 
> That's not working for daily monitoring...
> 

Indeed although at least /proc/vmstat is cheap to monitor and it could
at least be tracked if the number of NUMA faults are abnormally low or
the ratio of remote to local hints are problematic.

> Besides, compared with locality, this require much more deeper understand
> on the implementation, which could even be tough for NUMA developers to
> assemble all these statistics together.
> 

My point is that even with the patch, the definition of locality is
subtle. At a single point in time, the locality might appear to be low
but it's due to an event that happened far in the past.

> > 
> >> Maybe not a good example, but we just try to highlight that NUMA Balancing
> >> could have issue in some cases, and we want them to be exposed, somehow,
> >> maybe by the locality.
> >>
> > 
> > Again, I'm somewhat neutral on the patch simply because I would not use
> > the information for debugging problems with NUMA balancing. I would try
> > using tracepoints and if the tracepoints were not good enough, I'd add or
> > fix them -- similar to what I had to do with sched_stick_numa recently.
> > The caveat is that I mostly look at this sort of problem as a developer.
> > Sysadmins have very different requirements, especially simplicity even
> > if the simplicity in this case is an illusion.
> 
> Fair enough, but I guess PeterZ still want your Ack, so neutral means
> refuse in this case :-(
> 

I think the patch is functionally harmless and can be disabled but I also
would be wary of dealing with a bug report that was based on the numbers
provided by the locality metric. The bulk of the work related to the bug
would likely be spent on trying to explain the metric and I've dealt with
quite a few bugs that were essentially "We don't like this number and think
something is wrong because of it -- fix it". Even then, I would want the
workload isolated and then vmstat recorded over time to determine it's
a persistent problem or not. That's the reason why I'm relucant to ack it.

I fully acknowledge that this may have value for sysadmins and may be a
good enough reason to merge it for environments that typically build and
configure their own kernels. I doubt that general distributions would
enable it but that's a guess.

> BTW, how do you think about the documentation in second patch?
> 

I think the documentation is great, it's clear and explains itself well.

> Do you think it's necessary to have a doc to explain NUMA related statistics?
> 

It would be nice but AFAIK, the stats in vmstats are not documented.
They are there because recording them over time can be very useful when
dealing with user bug reports.
Peter Zijlstra Feb. 21, 2020, 3:28 p.m. UTC | #9
On Mon, Feb 17, 2020 at 09:23:52PM +0800, 王贇 wrote:
> FYI, by monitoring locality, we found that the kvm vcpu thread is not
> covered by NUMA Balancing, whatever how many maximum period passed, the
> counters are not increasing, or very slowly, although inside guest we are
> copying memory.
> 
> Later we found such task rarely exit to user space to trigger task
> work callbacks, and NUMA Balancing scan depends on that, which help us
> realize the importance to enable NUMA Balancing inside guest, with the
> correct NUMA topo, a big performance risk I'll say :-P

That's a bug in KVM, see:

  https://lkml.kernel.org/r/20190801143657.785902257@linutronix.de
  https://lkml.kernel.org/r/20190801143657.887648487@linutronix.de

ISTR there being newer versions of that patch-set, but I can't seem to
find them in a hurry.
Peter Zijlstra Feb. 21, 2020, 3:47 p.m. UTC | #10
On Fri, Feb 21, 2020 at 02:20:10PM +0000, Mel Gorman wrote:
> I fully acknowledge that this may have value for sysadmins and may be a
> good enough reason to merge it for environments that typically build and
> configure their own kernels. I doubt that general distributions would
> enable it but that's a guess.

OTOH, many sysadmins seem to 'rely' on BPF scripts and other such fancy
things these days.

 ( of course, we have the open question on what happens when we break
   one of those BPF 'important' scripts ... )

My main reservation with this patch is that it exposes, to userspace, an
ABI that is very hard to interpret and subject to implementation
details.

So while it can be disabled; people who have it enabled might suddenly
complain when we change the meaning/interpretation/whatever of these
magic numbers.

Michael; you seem to have ignored the tracepoint / BPF angle earlier in
this discussion; that is not something that could/would work for you?
王贇 Feb. 24, 2020, 3:05 a.m. UTC | #11
On 2020/2/21 下午10:20, Mel Gorman wrote:
[snip]
>>>
>>> Which is a very interesting corner case in itself but also one that
>>> could have potentially have been inferred from monitoring /proc/vmstat
>>> numa_pte_updates or on a per-task basis by monitoring /proc/PID/sched and
>>> watching numa_scan_seq and total_numa_faults. Accumulating the information
>>> on a per-cgroup basis would require a bit more legwork.
>>
>> That's not working for daily monitoring...
>>
> 
> Indeed although at least /proc/vmstat is cheap to monitor and it could
> at least be tracked if the number of NUMA faults are abnormally low or
> the ratio of remote to local hints are problematic.
> 
>> Besides, compared with locality, this require much more deeper understand
>> on the implementation, which could even be tough for NUMA developers to
>> assemble all these statistics together.
>>
> 
> My point is that even with the patch, the definition of locality is
> subtle. At a single point in time, the locality might appear to be low
> but it's due to an event that happened far in the past.

Agree, the locality's meaning just keep changing... only those who
understand the implementation can figure out the useful information.

> 
>>>
>>>> Maybe not a good example, but we just try to highlight that NUMA Balancing
>>>> could have issue in some cases, and we want them to be exposed, somehow,
>>>> maybe by the locality.
>>>>
>>>
>>> Again, I'm somewhat neutral on the patch simply because I would not use
>>> the information for debugging problems with NUMA balancing. I would try
>>> using tracepoints and if the tracepoints were not good enough, I'd add or
>>> fix them -- similar to what I had to do with sched_stick_numa recently.
>>> The caveat is that I mostly look at this sort of problem as a developer.
>>> Sysadmins have very different requirements, especially simplicity even
>>> if the simplicity in this case is an illusion.
>>
>> Fair enough, but I guess PeterZ still want your Ack, so neutral means
>> refuse in this case :-(
>>
> 
> I think the patch is functionally harmless and can be disabled but I also
> would be wary of dealing with a bug report that was based on the numbers
> provided by the locality metric. The bulk of the work related to the bug
> would likely be spent on trying to explain the metric and I've dealt with
> quite a few bugs that were essentially "We don't like this number and think
> something is wrong because of it -- fix it". Even then, I would want the
> workload isolated and then vmstat recorded over time to determine it's
> a persistent problem or not. That's the reason why I'm relucant to ack it.
> 
> I fully acknowledge that this may have value for sysadmins and may be a
> good enough reason to merge it for environments that typically build and
> configure their own kernels. I doubt that general distributions would
> enable it but that's a guess.

Thanks for the kindly explain, I get the point.

False alarm maybe fine to admin, but could be nightmare if the user keep
asking why, I suppose those who want to do some improvement on NUMA may be
interested :-P

Anyway, I understand there is a gap between general requirement and this
locality idea, and it's really hard to be fulfill...

> 
>> BTW, how do you think about the documentation in second patch?
>>
> 
> I think the documentation is great, it's clear and explains itself well.
> 
>> Do you think it's necessary to have a doc to explain NUMA related statistics?
>>
> 
> It would be nice but AFAIK, the stats in vmstats are not documented.
> They are there because recording them over time can be very useful when
> dealing with user bug reports.

Another TODO then :-)

Regards,
Michael Wang

>
王贇 Feb. 24, 2020, 3:09 a.m. UTC | #12
On 2020/2/21 下午11:28, Peter Zijlstra wrote:
> On Mon, Feb 17, 2020 at 09:23:52PM +0800, 王贇 wrote:
>> FYI, by monitoring locality, we found that the kvm vcpu thread is not
>> covered by NUMA Balancing, whatever how many maximum period passed, the
>> counters are not increasing, or very slowly, although inside guest we are
>> copying memory.
>>
>> Later we found such task rarely exit to user space to trigger task
>> work callbacks, and NUMA Balancing scan depends on that, which help us
>> realize the importance to enable NUMA Balancing inside guest, with the
>> correct NUMA topo, a big performance risk I'll say :-P
> 
> That's a bug in KVM, see:
> 
>   https://lkml.kernel.org/r/20190801143657.785902257@linutronix.de
>   https://lkml.kernel.org/r/20190801143657.887648487@linutronix.de
> 
> ISTR there being newer versions of that patch-set, but I can't seem to
> find them in a hurry.

Aha, that's exactly the problem we saw, will check~

Regards,
Michael Wang

>
王贇 Feb. 24, 2020, 3:13 a.m. UTC | #13
On 2020/2/21 下午11:47, Peter Zijlstra wrote:
> On Fri, Feb 21, 2020 at 02:20:10PM +0000, Mel Gorman wrote:
>> I fully acknowledge that this may have value for sysadmins and may be a
>> good enough reason to merge it for environments that typically build and
>> configure their own kernels. I doubt that general distributions would
>> enable it but that's a guess.
> 
> OTOH, many sysadmins seem to 'rely' on BPF scripts and other such fancy
> things these days.
> 
>  ( of course, we have the open question on what happens when we break
>    one of those BPF 'important' scripts ... )
> 
> My main reservation with this patch is that it exposes, to userspace, an
> ABI that is very hard to interpret and subject to implementation
> details.
> 
> So while it can be disabled; people who have it enabled might suddenly
> complain when we change the meaning/interpretation/whatever of these
> magic numbers.
> 
> Michael; you seem to have ignored the tracepoint / BPF angle earlier in
> this discussion; that is not something that could/would work for you?

At very beginning I think these fancy stuff may consume too much resources
them selves, so just as you said, ignored the possibility :-P

But now I understand there is a big gap here, which require a much more general
way to evaluate the NUMA platform, I'll try to follow this way see if there
are any practical approach instead~

Regards,
Michael Wang

>
diff mbox series

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a6c924fa1c77..74bf234bae53 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1128,6 +1128,21 @@  struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */

+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	/*
+	 * Counter index stand for:
+	 * 0 -- remote page accessing
+	 * 1 -- local page accessing
+	 * 2 -- remote page accessing updated to cgroup
+	 * 3 -- local page accessing updated to cgroup
+	 *
+	 * We record the counter before the end of task_numa_fault(), this
+	 * is based on the fact that after page fault is handled, the task
+	 * will access the page on the CPU where it triggered the PF.
+	 */
+	unsigned long			numa_page_access[4];
+#endif
+
 #ifdef CONFIG_RSEQ
 	struct rseq __user *rseq;
 	u32 rseq_sig;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d4f6215ee03f..bb3721cf48e0 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -101,4 +101,10 @@  extern int sched_energy_aware_handler(struct ctl_table *table, int write,
 				 loff_t *ppos);
 #endif

+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+extern int sysctl_numa_locality(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index 322fd2c65304..63c6b90a515d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -813,6 +813,15 @@  config NUMA_BALANCING_DEFAULT_ENABLED
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.

+config CGROUP_NUMA_LOCALITY
+	bool "per-cgroup NUMA Locality"
+	default n
+	depends on CGROUP_SCHED && NUMA_BALANCING
+	help
+	  This option enables the collection of per-cgroup NUMA locality info,
+	  to tell whether NUMA Balancing is working well for a particular
+	  workload, also imply the NUMA efficiency.
+
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7b08d52db93..40dd6b221eef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7657,6 +7657,68 @@  static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */

+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+DEFINE_STATIC_KEY_FALSE(sched_numa_locality);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_locality(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = static_branch_likely(&sched_numa_locality);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0 || !write)
+		return err;
+
+	if (state)
+		static_branch_enable(&sched_numa_locality);
+	else
+		static_branch_disable(&sched_numa_locality);
+
+	return err;
+}
+#endif
+
+static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
+{
+	return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
+}
+
+static int cpu_numa_stat_show(struct seq_file *sf, void *v)
+{
+	int cpu;
+	u64 local = 0, remote = 0;
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	if (!static_branch_likely(&sched_numa_locality))
+		return 0;
+
+	for_each_possible_cpu(cpu) {
+		local += tg_cfs_rq(tg, cpu)->local_page_access;
+		remote += tg_cfs_rq(tg, cpu)->remote_page_access;
+	}
+
+	seq_printf(sf, "page_access local=%llu remote=%llu\n", local, remote);
+
+	return 0;
+}
+
+static __init int numa_locality_setup(char *opt)
+{
+	static_branch_enable(&sched_numa_locality);
+
+	return 0;
+}
+__setup("numa_locality", numa_locality_setup);
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -7706,6 +7768,12 @@  static struct cftype cpu_legacy_files[] = {
 		.seq_show = cpu_uclamp_max_show,
 		.write = cpu_uclamp_max_write,
 	},
+#endif
+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	{
+		.name = "numa_stat",
+		.seq_show = cpu_numa_stat_show,
+	},
 #endif
 	{ }	/* Terminate */
 };
@@ -7887,6 +7955,13 @@  static struct cftype cpu_files[] = {
 		.seq_show = cpu_uclamp_max_show,
 		.write = cpu_uclamp_max_write,
 	},
+#endif
+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	{
+		.name = "numa_stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_numa_stat_show,
+	},
 #endif
 	{ }	/* terminate */
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d170b5da0e3..eb838557bae2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1049,7 +1049,63 @@  update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */

+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+/*
+ * We want to record the real local/remote page access statistic
+ * here, so 'pnid' should be pages's real residential node after
+ * migrate_misplaced_page(), and 'cnid' should be the node of CPU
+ * where triggered the PF.
+ */
+static inline void
+update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
+{
+	if (!static_branch_unlikely(&sched_numa_locality))
+		return;
+
+	/*
+	 * pnid != cnid --> remote idx 0
+	 * pnid == cnid --> local idx 1
+	 */
+	p->numa_page_access[!!(pnid == cnid)] += pages;
+}
+
+static inline void update_group_locality(struct cfs_rq *cfs_rq)
+{
+	unsigned long ldiff, rdiff;
+
+	if (!static_branch_unlikely(&sched_numa_locality))
+		return;
+
+	rdiff = current->numa_page_access[0] - current->numa_page_access[2];
+	ldiff = current->numa_page_access[1] - current->numa_page_access[3];
+	if (!ldiff && !rdiff)
+		return;
+
+	cfs_rq->local_page_access += ldiff;
+	cfs_rq->remote_page_access += rdiff;
+
+	/*
+	 * Consider updated when reach root cfs_rq, no NUMA Balancing PF
+	 * should happen on current task during the hierarchical updating.
+	 */
+	if (&cfs_rq->rq->cfs == cfs_rq) {
+		current->numa_page_access[2] = current->numa_page_access[0];
+		current->numa_page_access[3] = current->numa_page_access[1];
+	}
+}
+#else
+static inline void
+update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
+{
+}
+
+static inline void update_group_locality(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
+
 #ifdef CONFIG_NUMA_BALANCING
+
 /*
  * Approximate time to scan a full NUMA task in ms. The task scan period is
  * calculated based on the tasks virtual memory size and
@@ -2465,6 +2521,8 @@  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
 	p->numa_faults_locality[local] += pages;
+
+	update_task_locality(p, mem_node, numa_node_id(), pages);
 }

 static void reset_ptenuma_scan(struct task_struct *p)
@@ -2650,6 +2708,9 @@  void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	p->last_sum_exec_runtime	= 0;

 	init_task_work(&p->numa_work, task_numa_work);
+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	memset(p->numa_page_access, 0, sizeof(p->numa_page_access));
+#endif

 	/* New address space, reset the preferred nid */
 	if (!(clone_flags & CLONE_VM)) {
@@ -4313,6 +4374,7 @@  entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_group(curr);
+	update_group_locality(cfs_rq);

 #ifdef CONFIG_SCHED_HRTICK
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a88dc8ad11b..66b4e581b6ed 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -575,6 +575,14 @@  struct cfs_rq {
 	struct list_head	throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	/*
+	 * The local/remote page access info collected from all
+	 * the tasks in hierarchy.
+	 */
+	u64			local_page_access;
+	u64			remote_page_access;
+#endif
 };

 static inline int rt_bandwidth_enabled(void)
@@ -1601,6 +1609,10 @@  static const_debug __maybe_unused unsigned int sysctl_sched_features =
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;

+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+extern struct static_key_false sched_numa_locality;
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d396aaaf19a3..a8f5951f92b3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -428,6 +428,17 @@  static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_CGROUP_NUMA_LOCALITY
+	{
+		.procname	= "numa_locality",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_numa_locality,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",