
[v2,1/3] sched/numa: advanced per-cgroup numa statistic

Message ID 9354ffe8-81ba-9e76-e0b3-222bc942b3fc@linux.alibaba.com (mailing list archive)
State New, archived
Series sched/numa: introduce advanced numa statistic

Commit Message

王贇 Nov. 27, 2019, 1:49 a.m. UTC
Currently there is no good approach to monitoring per-cgroup
NUMA efficiency, which can be a trouble especially when groups
are sharing CPUs: it's impossible to tell which one caused the
remote-memory accesses by reading hardware counters, since multiple
workloads could be sharing the same CPU, which makes it painful when
one wants to find out the root cause and fix the issue.

In order to address this, we introduce new per-cgroup statistics
for NUMA:
  * the numa locality, to indicate the numa balancing efficiency
  * the numa execution time on each node

The task locality is the local page-access ratio traced on NUMA
balancing page faults, and the group locality is the distribution of
task execution time, sectioned by locality into 7 regions.

For example, the new entry 'cpu.numa_stat' shows:
  locality 39541 60962 36842 72519 118605 721778 946553
  exectime 1220127 1458684

Here we know the workloads in the hierarchy executed 1220127ms on
node_0 and 1458684ms on node_1 in total; tasks with locality around
0~13% executed for 39541ms, and tasks with locality around 86~100%
executed for 946553ms, which implies most of the memory accesses
are local.
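
To illustrate the bucketing (a minimal userspace sketch only, not part
of the patch), the same arithmetic as update_numa_statistics() below
maps a task's recent remote/local access counts to a bucket index:

  #include <stdio.h>

  #define NR_NL_INTERVAL 7	/* bucket count used by this patch */

  /*
   * Mirror of the kernel-side computation; callers must ensure
   * remote + local != 0, as update_numa_statistics() does.
   */
  static int locality_idx(unsigned long remote, unsigned long local)
  {
  	return (NR_NL_INTERVAL - 1) * local / (remote + local);
  }

  int main(void)
  {
  	printf("%d\n", locality_idx(10, 90));	/* 90% local -> bucket 5 */
  	return 0;
  }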

By monitoring the new statistics, we will be able to know the NUMA
efficiency of each per-cgroup workload on the machine, whether or not
they share CPUs, and to find out which one introduced most of the
remote accesses.

Besides, the per-node memory topology from 'memory.numa_stat' becomes
more useful when we have the per-node execution time; a workload
always executing on node_0 while its memory is all on node_1 is
usually a bad case.

Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 include/linux/sched.h        | 18 ++++++++-
 include/linux/sched/sysctl.h |  6 +++
 init/Kconfig                 |  9 +++++
 kernel/sched/core.c          | 91 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          | 33 ++++++++++++++++
 kernel/sched/sched.h         | 17 +++++++++
 kernel/sysctl.c              | 11 ++++++
 7 files changed, 184 insertions(+), 1 deletion(-)

Comments

Mel Gorman Nov. 27, 2019, 10:19 a.m. UTC | #1
On Wed, Nov 27, 2019 at 09:49:34AM +0800, 王贇 wrote:
> Currently there is no good approach to monitoring per-cgroup
> NUMA efficiency, which can be a trouble especially when groups
> are sharing CPUs: it's impossible to tell which one caused the
> remote-memory accesses by reading hardware counters, since multiple
> workloads could be sharing the same CPU, which makes it painful when
> one wants to find out the root cause and fix the issue.
> 

It's already possible to identify specific tasks triggering PMU events
so this is not exactly true.

> In order to address this, we introduce new per-cgroup statistics
> for NUMA:
>   * the numa locality, to indicate the numa balancing efficiency
>   * the numa execution time on each node
> 
> The task locality is the local page-access ratio traced on NUMA
> balancing page faults, and the group locality is the distribution of
> task execution time, sectioned by locality into 7 regions.
> 
> For example, the new entry 'cpu.numa_stat' shows:
>   locality 39541 60962 36842 72519 118605 721778 946553
>   exectime 1220127 1458684
> 
> Here we know the workloads in the hierarchy executed 1220127ms on
> node_0 and 1458684ms on node_1 in total; tasks with locality around
> 0~13% executed for 39541ms, and tasks with locality around 86~100%
> executed for 946553ms, which implies most of the memory accesses
> are local.
> 
> By monitoring the new statistics, we will be able to know the NUMA
> efficiency of each per-cgroup workload on the machine, whether or not
> they share CPUs, and to find out which one introduced most of the
> remote accesses.
> 
> Besides, the per-node memory topology from 'memory.numa_stat' becomes
> more useful when we have the per-node execution time; a workload
> always executing on node_0 while its memory is all on node_1 is
> usually a bad case.
> 
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Michal Koutný <mkoutny@suse.com>
> Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
> ---
>  include/linux/sched.h        | 18 ++++++++-
>  include/linux/sched/sysctl.h |  6 +++
>  init/Kconfig                 |  9 +++++
>  kernel/sched/core.c          | 91 ++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c          | 33 ++++++++++++++++
>  kernel/sched/sched.h         | 17 +++++++++
>  kernel/sysctl.c              | 11 ++++++
>  7 files changed, 184 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8f6607cd40ac..505b041594ef 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1118,9 +1118,25 @@ struct task_struct {
>  	 * numa_faults_locality tracks if faults recorded during the last
>  	 * scan window were remote/local or failed to migrate. The task scan
>  	 * period is adapted based on the locality of the faults with different
> -	 * weights depending on whether they were shared or private faults
> +	 * weights depending on whether they were shared or private faults.
> +	 *
> +	 * Counter id stand for:
> +	 * 0 -- remote faults
> +	 * 1 -- local faults
> +	 * 2 -- page migration failure
> +	 *
> +	 * Extra counters when CONFIG_CGROUP_NUMA_STAT enabled:
> +	 * 3 -- remote page accessing
> +	 * 4 -- local page accessing
> +	 *
> +	 * The 'remote/local faults' records the cpu-page relationship before
> +	 * page migration, while the 'remote/local page accessing' is after.
>  	 */
> +#ifndef CONFIG_CGROUP_NUMA_STAT
>  	unsigned long			numa_faults_locality[3];
> +#else
> +	unsigned long			numa_faults_locality[5];
> +#endif
> 
>  	unsigned long			numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 89f55e914673..2d6a515df544 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -102,4 +102,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
>  				 loff_t *ppos);
>  #endif
> 
> +#ifdef CONFIG_CGROUP_NUMA_STAT
> +extern int sysctl_cg_numa_stat(struct ctl_table *table, int write,
> +				 void __user *buffer, size_t *lenp,
> +				 loff_t *ppos);
> +#endif
> +
>  #endif /* _LINUX_SCHED_SYSCTL_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 4d8d145c41d2..b31d2b560493 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -817,6 +817,15 @@ config NUMA_BALANCING_DEFAULT_ENABLED
>  	  If set, automatic NUMA balancing will be enabled if running on a NUMA
>  	  machine.
> 
> +config CGROUP_NUMA_STAT
> +	bool "Advanced per-cgroup NUMA statistics"
> +	default n
> +	depends on CGROUP_SCHED && NUMA_BALANCING
> +	help
> +	  This option adds support for per-cgroup NUMA locality/execution
> +	  statistics, for monitoring NUMA efficiency of per-cgroup workloads
> +	  on NUMA platforms with NUMA Balancing enabled.
> +
>  menuconfig CGROUPS
>  	bool "Control Group support"
>  	select KERNFS
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index aaa1740e6497..eabcab25be50 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7657,6 +7657,84 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>  }
>  #endif /* CONFIG_RT_GROUP_SCHED */
> 
> +#ifdef CONFIG_CGROUP_NUMA_STAT
> +DEFINE_STATIC_KEY_FALSE(sched_cg_numa_stat);
> +
> +#ifdef CONFIG_PROC_SYSCTL
> +int sysctl_cg_numa_stat(struct ctl_table *table, int write,
> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	int err;
> +	int state = static_branch_likely(&sched_cg_numa_stat);
> +
> +	if (write && !capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	t = *table;
> +	t.data = &state;
> +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0 || !write)
> +		return err;
> +
> +	if (state)
> +		static_branch_enable(&sched_cg_numa_stat);
> +	else
> +		static_branch_disable(&sched_cg_numa_stat);
> +
> +	return err;
> +}
> +#endif
> +

Why is this implemented as a toggle? I'm finding it hard to make sense
of this. The numa_stat should not even exist if the feature is disabled.

Assuming that is fixed then the runtime overhead is fine, but the same
issues with the quality of the information relying on NUMA balancing
limit the usefulness of this. Disabling NUMA balancing or the scan rate
dropping to a very low frequency would lead to misleading conclusions,
as well as false positives if the CPU and memory policies force remote
memory usage. Similarly, the timing of the information available is
variable due to how numa_faults_locality gets reset, so sometimes the
information is fine-grained and sometimes it's coarse-grained. It will
also pretend to display useful information even if NUMA balancing is
disabled.

I find it hard to believe it would be useful in practice and I think
users would have real trouble interpreting the data given how much it
relies on internal implementation details of NUMA balancing. I cannot
be certain, as clearly something motivated the creation of this patch,
although it's unclear if it has ever been used to debug and fix an
actual problem in the field. Hence, I'm neutral on the patch and will
neither ack nor nack it and will defer to the scheduler maintainers,
but if I were pushed on it, I would be disinclined to merge the patch
due to the potential confusion caused by users who believe it provides
accurate information when at best it gives a rough approximation with
variable granularity.
王贇 Nov. 28, 2019, 2:09 a.m. UTC | #2
On 2019/11/27 6:19 PM, Mel Gorman wrote:
> On Wed, Nov 27, 2019 at 09:49:34AM +0800, 王贇 wrote:
>> Currently there is no good approach to monitoring per-cgroup
>> NUMA efficiency, which can be a trouble especially when groups
>> are sharing CPUs: it's impossible to tell which one caused the
>> remote-memory accesses by reading hardware counters, since multiple
>> workloads could be sharing the same CPU, which makes it painful when
>> one wants to find out the root cause and fix the issue.
>>
> 
> It's already possible to identify specific tasks triggering PMU events
> so this is not exactly true.

I should fix the description regarding this...

I think you mean tools like numatop, which show per-task local/remote
access info from the PMU, correct?

It's a good one for debugging, but when we're talking about monitoring
a cluster shared by multiple users, it's still not very practical...
compared to the classified historical data of the workloads.

I'm not sure about the overhead and limitations of this PMU approach,
or whether there are any platforms where it's not yet supported; worth
a survey.

> 
>> In order to address this, we introduce new per-cgroup statistics
>> for NUMA:
>>   * the numa locality, to indicate the numa balancing efficiency
>>   * the numa execution time on each node
>>
[snip]
>> +#ifdef CONFIG_PROC_SYSCTL
>> +int sysctl_cg_numa_stat(struct ctl_table *table, int write,
>> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> +	struct ctl_table t;
>> +	int err;
>> +	int state = static_branch_likely(&sched_cg_numa_stat);
>> +
>> +	if (write && !capable(CAP_SYS_ADMIN))
>> +		return -EPERM;
>> +
>> +	t = *table;
>> +	t.data = &state;
>> +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
>> +	if (err < 0 || !write)
>> +		return err;
>> +
>> +	if (state)
>> +		static_branch_enable(&sched_cg_numa_stat);
>> +	else
>> +		static_branch_disable(&sched_cg_numa_stat);
>> +
>> +	return err;
>> +}
>> +#endif
>> +
> 
> Why is this implemented as a toggle? I'm finding it hard to make sense
> of this. The numa_stat should not even exist if the feature is disabled.

numa_stat will not exist if the CONFIG is not enabled; do you mean it
should also disappear when dynamically turned off?

> 
> Assuming that is fixed then the runtime overhead is fine, but the same
> issues with the quality of the information relying on NUMA balancing
> limit the usefulness of this. Disabling NUMA balancing or the scan rate
> dropping to a very low frequency would lead to misleading conclusions,
> as well as false positives if the CPU and memory policies force remote
> memory usage. Similarly, the timing of the information available is
> variable due to how numa_faults_locality gets reset, so sometimes the
> information is fine-grained and sometimes it's coarse-grained. It will
> also pretend to display useful information even if NUMA balancing is
> disabled.

The data just represent what we traced on NUMA balancing page faults,
so yes, folks need some understanding of NUMA balancing to figure out
the real meaning behind locality.

We want it to tell the real story: if NUMA balancing is disabled or the
scan rate drops very low, the locality increments should be very small;
when it keeps failing for memory-policy or CPU-binding reasons, we want
it to tell how bad it is. Locality just shows us how NUMA balancing is
performing; the data can contain a lot of information, since how the OS
deals with NUMA can be complicated...

> 
> I find it hard to believe it would be useful in practice and I think
> users would have real trouble interpreting the data given how much it
> relies on internal implementation details of NUMA balancing. I cannot
> be certain, as clearly something motivated the creation of this patch,
> although it's unclear if it has ever been used to debug and fix an
> actual problem in the field. Hence, I'm neutral on the patch and will
> neither ack nor nack it and will defer to the scheduler maintainers,
> but if I were pushed on it, I would be disinclined to merge the patch
> due to the potential confusion caused by users who believe it provides
> accurate information when at best it gives a rough approximation with
> variable granularity.

We have already enabled this feature on our cluster (an old version,
but still helpful); when we want to debug NUMA issues, it gives good
hints.

Consider it like load_1/5/15, which is not accurate but tells the trend
of system behavior; locality gives the trend of NUMA balancing as long
as it's working. When it increases slowly, it means locality is already
good enough or there is no more memory to adjust, and that's fine.
Those who disable NUMA balancing do their own NUMA optimization and
find their own ways to estimate the results.

Anyway, thanks for all the good inputs from your side :-)

Regards,
Michael Wang
Michal Koutný Nov. 28, 2019, 12:39 p.m. UTC | #3
Hello.

My primary concern is still the measuring of per-NUMA node execution
time.

First, I think exposing the aggregated data into the numa_stat file is
loss of information. The data are collected per-CPU and then summed over
NUMA nodes -- this could be easily done by the userspace consumer of the
data, keeping the per-CPU data available.

Second, comparing with the cpuacct implementation, yours has only jiffy
granularity (I may have overlooked something or I miss some context,
then it's a non-concern).

IOW, to me it sounds like duplicating cpuacct job and if that is deemed
useful for cgroup v2, I think it should be done (only once) and at
proper place (i.e. how cputime is measured in the default hierarchy).

The previous two are design/theoretical remarks; however, your patch
misses measuring tasks of scheduling classes other than
fair_sched_class. Is that intentional?

My last two comments are to locality measurement but are based on no
experience or specific knowledge.

The seven percentile groups seem quite arbitrary to me; I find it
strange that the ratio of cache-line size to u64 leaks into, and is
fixed in, the generally visible file. Wouldn't such a form be better
hidden under a _DEBUG config option?


On Thu, Nov 28, 2019 at 10:09:13AM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
> Consider it like load_1/5/15, which is not accurate but tells the trend of system
I understood your patchset provides cumulative data over time, i.e. if
a user wants to see an immediate trend, they have to calculate
differences. Have I overlooked some back-off or regular zeroing?

Michal
王贇 Nov. 28, 2019, 1:41 p.m. UTC | #4
On 2019/11/28 8:39 PM, Michal Koutný wrote:
> Hello.
> 
> My primary concern is still the measuring of per-NUMA node execution
> time.
> 
> First, I think exposing the aggregated data into the numa_stat file is
> loss of information. The data are collected per-CPU and then summed over
> NUMA nodes -- this could be easily done by the userspace consumer of the
> data, keeping the per-CPU data available.
> 
> Second, comparing with the cpuacct implementation, yours has only jiffy
> granularity (I may have overlooked something or I miss some context,
> then it's a non-concern).

There used to be a discussion on this; Peter mentioned we no longer
expose raw ticks to userspace, and microseconds could be fine.

Basically we use this to calculate percentages, for which jiffies could
be accurate enough :-)

> 
> IOW, to me it sounds like duplicating cpuacct job and if that is deemed
> useful for cgroup v2, I think it should be done (only once) and at
> proper place (i.e. how cputime is measured in the default hierarchy).

But still, what if folks don't use v2... any good suggestions?

> 
> The previous two are design/theoretical remarks; however, your patch
> misses measuring tasks of scheduling classes other than
> fair_sched_class. Is that intentional?

Yes, since they don't have NUMA balancing to do the optimization, and
generally there are not that many of them.

> 
> My last two comments are to locality measurement but are based on no
> experience or specific knowledge.
> 
> The seven percentile groups seem quite arbitrary to me; I find it
> strange that the ratio of cache-line size to u64 leaks into, and is
> fixed in, the generally visible file. Wouldn't such a form be better
> hidden under a _DEBUG config option?

Sorry, but I don't get it... at first it was 10 regions; as Peter
suggested we picked 8, but now, to insert the member 'jiffies', it
became 7. The address of 'jiffies' is cache-aligned, so we pick
u64 * 8 == 64 bytes to make sure the whole thing can be loaded into
cache in one go, or did I misunderstand something?
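
For illustration (a standalone sketch, assuming a 64-byte cache line
and mirroring the struct from the patch):

  #include <assert.h>
  #include <stdint.h>

  #define NR_NL_INTERVAL 7

  struct numa_statistics {
  	uint64_t jiffies;
  	uint64_t locality[NR_NL_INTERVAL];
  };

  /* 1 + 7 u64 members == 64 bytes, one typical cache line */
  static_assert(sizeof(struct numa_statistics) == 64,
  	      "numa_statistics should occupy exactly one cache line");

  int main(void)
  {
  	return 0;
  }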

> 
> 
> On Thu, Nov 28, 2019 at 10:09:13AM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
>> Consider it like load_1/5/15, which is not accurate but tells the trend of system
> I understood your patchset provides cumulative data over time, i.e. if
> a user wants to see an immediate trend, they have to calculate
> differences. Have I overlooked some back-off or regular zeroing?

Yes, what I try to highlight here is the similar usage, not the way of
monitoring ;-) as the docs say, we monitor increments.
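
For example, a monitoring agent could condense two samples of the
'locality' counters into a single trend number; a rough sketch with
hypothetical names, assuming the counters only grow:

  #define NR_NL_INTERVAL 7

  /*
   * Weight each bucket's increment by its position (bucket 0 is mostly
   * remote, bucket NR_NL_INTERVAL - 1 mostly local) and normalize to a
   * 0..100 "locality trend" for the sampling interval.
   */
  static unsigned int locality_trend(const unsigned long long prev[NR_NL_INTERVAL],
  				   const unsigned long long curr[NR_NL_INTERVAL])
  {
  	unsigned long long weighted = 0, total = 0;
  	int i;

  	for (i = 0; i < NR_NL_INTERVAL; i++) {
  		unsigned long long d = curr[i] - prev[i];

  		weighted += d * i;
  		total += d;
  	}

  	return total ? 100 * weighted / (total * (NR_NL_INTERVAL - 1)) : 0;
  }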

Regards,
Michael Wang

> 
> Michal
>
Michal Koutný Nov. 28, 2019, 3:58 p.m. UTC | #5
On Thu, Nov 28, 2019 at 09:41:37PM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
> There used to be a discussion on this; Peter mentioned we no longer
> expose raw ticks to userspace, and microseconds could be fine.
I don't mean the unit presented but the precision.

> Basically we use this to calculate percentages, for which jiffies could
> be accurate enough :-)
You also report the raw times.

Ad percentages (or raw times precision), on average, it should be fine
but can't there be any "aliasing" artifacts when only an unimportant
task is regularly sampled, hence not capturing the real pattern on the
CPU? (Again, I'm not confident I'm not missing anything that prevents
that behavior.)

> But still, what if folks don't use v2... any good suggestions?
(Note this applies to exectimes not locality.) On v1, they can add up
per CPU values from cpuacct. (So it's v2 that's missing the records.)


> Yes, since they don't have NUMA balancing to do the optimization, and
> generally there are not that many of them.
Aha, I didn't realize that.

> Sorry, but I don't get it... at first it was 10 regions; as Peter
> suggested we picked 8, but now, to insert the member 'jiffies', it became 7,
See, there are various arguments for different values :-)

I meant that the currently chosen one is imprinted into the API file.
That is IMO fixable by documenting (e.g. the number of bands may change,
assume uniform division) or making all this just a debug API. Or, see
below.

> Yes, what I try to highlight here is the similar usage, not the way of
> monitoring ;-) as the docs say, we monitor increments.
I see, the docs give me an idea what's the supposed use case.

What about exposing only the counters for local, remote and letting the
user do their monitoring based on Δlocal/(Δlocal + Δremote)?

That would avoid the partitioning question completely, exposed values
would be simple numbers and provided information should be equal. A
drawback is that such a sampling would be slower (but sufficient for the
illustrating example).
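
For illustration, the consumer-side computation could be as small as
the sketch below (hypothetical names; the counters are cumulative, so
the user differentiates two samples):

  /* Δlocal / (Δlocal + Δremote), as a 0..100 percentage */
  static unsigned int locality_pct(unsigned long long local_prev,
  				 unsigned long long local_curr,
  				 unsigned long long remote_prev,
  				 unsigned long long remote_curr)
  {
  	unsigned long long dl = local_curr - local_prev;
  	unsigned long long dr = remote_curr - remote_prev;

  	return (dl + dr) ? 100 * dl / (dl + dr) : 0;
  }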

Michal
王贇 Nov. 29, 2019, 1:52 a.m. UTC | #6
On 2019/11/28 11:58 PM, Michal Koutný wrote:
> On Thu, Nov 28, 2019 at 09:41:37PM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
>> There used to be a discussion on this; Peter mentioned we no longer
>> expose raw ticks to userspace, and microseconds could be fine.
> I don't mean the unit presented but the precision.
> 
>> Basically we use this to calculate percentages, for which jiffies could
>> be accurate enough :-)
> You also report the raw times.
>
> Ad percentages (or raw times precision), on average, it should be fine
> but can't there be any "aliasing" artifacts when only an unimportant
> task is regularly sampled, hence not capturing the real pattern on the
> CPU? (Again, I'm not confident I'm not missing anything that prevents
> that behavior.)

Hmm... I think I get your point now; so the concern is about what gets
missed between ticks, correct?

It could be that one tick hits task A running, then A switches to B,
and B switches back to A before the next tick; then we miss the
exectime of B, since the next tick hits A again.

Actually we have the same issue for the data in /proc/stat too, don't
we? The user, sys, and iowait times are sampled in a similar way.

So if we have to pick a precision, I may still pick jiffies, since the
exectime is something similar to user/sys time IMHO.

> 
>> But still, what if folks don't use v2... any good suggestions?
> (Note this applies to exectimes not locality.) On v1, they can add up
> per CPU values from cpuacct. (So it's v2 that's missing the records.)

What about moving the whole thing into the cpuacct cgroup?

I'm not sure, but maybe we could use some data there to save the
sampling of jiffies; those v1 users who need these statistics should
have cpuacct enabled.

> 
> 
>> Yes, since they don't have NUMA balancing to do the optimization, and
>> generally there are not that many of them.
> Aha, I didn't realize that.
> 
>> Sorry, but I don't get it... at first it was 10 regions; as Peter
>> suggested we picked 8, but now, to insert the member 'jiffies', it became 7,
> See, there are various arguments for different values :-)
> 
> I meant that the currently chosen one is imprinted into the API file.
> That is IMO fixable by documenting (e.g. the number of bands may change,
> assume uniform division) or making all this just a debug API. Or, see
> below.
> 
>> Yes, what I try to highlight here is the similar usage, not the way of
>> monitoring ;-) as the docs say, we monitor increments.
> I see, the docs give me an idea what's the supposed use case.
> 
> What about exposing only the counters for local, remote and letting the
> user do their monitoring based on Δlocal/(Δlocal + Δremote)?
> 
> That would avoid the partitioning question completely, exposed values
> would be simple numbers and provided information should be equal. A
> drawback is that such a sampling would be slower (but sufficient for the
> illustrating example).

You mean the cgroup numa stat would just give the accumulated
local/remote accesses?

As long as the counters won't overflow, maybe... sounds easier to
explain too.

So a user tracing locality would then get just one percentage
(calculated on their own) from a cgroup, but one should be enough to
represent the situation.

Sounds like a good idea to me :-) will try to do that in the next
version.

Regards,
Michael Wang

> 
> Michal
>
王贇 Nov. 29, 2019, 5:19 a.m. UTC | #7
On 2019/11/29 9:52 AM, 王贇 wrote:
[snip]
>> That would avoid the partitioning question completely, exposed values
>> would be simple numbers and provided information should be equal. A
>> drawback is that such a sampling would be slower (but sufficient for the
>> illustrating example).
> 
> You mean the cgroup numa stat would just give the accumulated
> local/remote accesses?
> 
> As long as the counters won't overflow, maybe... sounds easier to
> explain too.
> 
> So a user tracing locality would then get just one percentage
> (calculated on their own) from a cgroup, but one should be enough to
> represent the situation.
> 
> Sounds like a good idea to me :-) will try to do that in the next
> version.

I did some research regarding cpuacct, and found that cpuacct_charge()
is a good place to do the hierarchical update; however, what we get
there is the execution-time delta since the last update_curr().

I'm afraid we can't just do local/remote accumulation, since the sample
period is now changing; we still have to accumulate the execution time
into locality regions.
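
A rough, untested sketch of what that could look like (reusing the
struct and bucket math from this patch; delta_exec would be the value
passed to cpuacct_charge(), types as in the kernel):

  static void account_locality_delta(struct numa_statistics *ns,
  				     unsigned long remote,
  				     unsigned long local,
  				     u64 delta_exec)
  {
  	int idx;

  	/* same guard and bucket math as update_numa_statistics() */
  	if (!remote && !local)
  		return;

  	idx = (NR_NL_INTERVAL - 1) * local / (remote + local);
  	ns->locality[idx] += delta_exec;
  }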

But at least we should be able to address your concern regarding the
exectime collection :-)

Regards,
Michael Wang

> 
> Regards,
> Michael Wang
> 
>>
>> Michal
>>
Michal Koutný Nov. 29, 2019, 10:06 a.m. UTC | #8
On Fri, Nov 29, 2019 at 01:19:33PM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
> I did some research regarding cpuacct, and found that cpuacct_charge()
> is a good place to do the hierarchical update; however, what we get
> there is the execution-time delta since the last update_curr().
I wouldn't extend cpuacct, I'd like to look into using the rstat
mechanism for per-CPU runtime collection. (Most certainly I won't get
down to this until mid December though.)

> I'm afraid we can't just do local/remote accumulation, since the sample
> period is now changing; we still have to accumulate the execution time
> into locality regions.
My idea was to decouple time from the locality counters completely. It'd
be up to the monitoring application to normalize differences wrt
sampling rate (and handle wrap arounds).


Michal
王贇 Dec. 2, 2019, 2:11 a.m. UTC | #9
On 2019/11/29 6:06 PM, Michal Koutný wrote:
> On Fri, Nov 29, 2019 at 01:19:33PM +0800, 王贇 <yun.wang@linux.alibaba.com> wrote:
>> I did some research regarding cpuacct, and found that cpuacct_charge()
>> is a good place to do the hierarchical update; however, what we get
>> there is the execution-time delta since the last update_curr().
> I wouldn't extend cpuacct, I'd like to look into using the rstat
> mechanism for per-CPU runtime collection. (Most certainly I won't get
> down to this until mid December though.)
> 
>> I'm afraid we can't just do local/remote accumulation, since the sample
>> period is now changing; we still have to accumulate the execution time
>> into locality regions.
> My idea was to decouple time from the locality counters completely. It'd
> be up to the monitoring application to normalize differences wrt
> sampling rate (and handle wrap arounds).

I see; basically I understand your proposal as utilizing cpuacct's
runtime and only exposing per-cgroup local/remote counters. I'm not
sure if the locality is still helpful after decoupling the time factor
from it; both need some investigation. Anyway, once I can convince
myself it's working, I'll be happy to make things simple ;-)

Regards,
Michael Wang

> 
> 
> Michal
>

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8f6607cd40ac..505b041594ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1118,9 +1118,25 @@  struct task_struct {
 	 * numa_faults_locality tracks if faults recorded during the last
 	 * scan window were remote/local or failed to migrate. The task scan
 	 * period is adapted based on the locality of the faults with different
-	 * weights depending on whether they were shared or private faults
+	 * weights depending on whether they were shared or private faults.
+	 *
+	 * Counter id stand for:
+	 * 0 -- remote faults
+	 * 1 -- local faults
+	 * 2 -- page migration failure
+	 *
+	 * Extra counters when CONFIG_CGROUP_NUMA_STAT enabled:
+	 * 3 -- remote page accessing
+	 * 4 -- local page accessing
+	 *
+	 * The 'remote/local faults' records the cpu-page relationship before
+	 * page migration, while the 'remote/local page accessing' is after.
 	 */
+#ifndef CONFIG_CGROUP_NUMA_STAT
 	unsigned long			numa_faults_locality[3];
+#else
+	unsigned long			numa_faults_locality[5];
+#endif

 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 89f55e914673..2d6a515df544 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -102,4 +102,10 @@  extern int sched_energy_aware_handler(struct ctl_table *table, int write,
 				 loff_t *ppos);
 #endif

+#ifdef CONFIG_CGROUP_NUMA_STAT
+extern int sysctl_cg_numa_stat(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index 4d8d145c41d2..b31d2b560493 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -817,6 +817,15 @@  config NUMA_BALANCING_DEFAULT_ENABLED
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.

+config CGROUP_NUMA_STAT
+	bool "Advanced per-cgroup NUMA statistics"
+	default n
+	depends on CGROUP_SCHED && NUMA_BALANCING
+	help
+	  This option adds support for per-cgroup NUMA locality/execution
+	  statistics, for monitoring NUMA efficiency of per-cgroup workloads
+	  on NUMA platforms with NUMA Balancing enabled.
+
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aaa1740e6497..eabcab25be50 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7657,6 +7657,84 @@  static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */

+#ifdef CONFIG_CGROUP_NUMA_STAT
+DEFINE_STATIC_KEY_FALSE(sched_cg_numa_stat);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_cg_numa_stat(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = static_branch_likely(&sched_cg_numa_stat);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0 || !write)
+		return err;
+
+	if (state)
+		static_branch_enable(&sched_cg_numa_stat);
+	else
+		static_branch_disable(&sched_cg_numa_stat);
+
+	return err;
+}
+#endif
+
+static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
+{
+	return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
+}
+
+static int cpu_numa_stat_show(struct seq_file *sf, void *v)
+{
+	int nr;
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	if (!static_branch_likely(&sched_cg_numa_stat))
+		return 0;
+
+	seq_puts(sf, "locality");
+	for (nr = 0; nr < NR_NL_INTERVAL; nr++) {
+		int cpu;
+		u64 sum = 0;
+
+		for_each_possible_cpu(cpu)
+			sum += tg_cfs_rq(tg, cpu)->nstat.locality[nr];
+
+		seq_printf(sf, " %u", jiffies_to_msecs(sum));
+	}
+	seq_putc(sf, '\n');
+
+	seq_puts(sf, "exectime");
+	for_each_online_node(nr) {
+		int cpu;
+		u64 sum = 0;
+
+		for_each_cpu(cpu, cpumask_of_node(nr))
+			sum += tg_cfs_rq(tg, cpu)->nstat.jiffies;
+
+		seq_printf(sf, " %u", jiffies_to_msecs(sum));
+	}
+	seq_putc(sf, '\n');
+
+	return 0;
+}
+
+static __init int cg_numa_stat_setup(char *opt)
+{
+	static_branch_enable(&sched_cg_numa_stat);
+
+	return 0;
+}
+__setup("cg_numa_stat", cg_numa_stat_setup);
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -7706,6 +7784,12 @@  static struct cftype cpu_legacy_files[] = {
 		.seq_show = cpu_uclamp_max_show,
 		.write = cpu_uclamp_max_write,
 	},
+#endif
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	{
+		.name = "numa_stat",
+		.seq_show = cpu_numa_stat_show,
+	},
 #endif
 	{ }	/* Terminate */
 };
@@ -7887,6 +7971,13 @@  static struct cftype cpu_files[] = {
 		.seq_show = cpu_uclamp_max_show,
 		.write = cpu_uclamp_max_write,
 	},
+#endif
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	{
+		.name = "numa_stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_numa_stat_show,
+	},
 #endif
 	{ }	/* terminate */
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81eba554db8d..d5fa038d07d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2465,6 +2465,18 @@  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
 	p->numa_faults_locality[local] += pages;
+
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	if (!static_branch_unlikely(&sched_cg_numa_stat))
+		return;
+
+	/*
+	 * We want to record the real local/remote page access statistic
+	 * here, so use 'mem_node' which is the real residential node of
+	 * page after migrate_misplaced_page().
+	 */
+	p->numa_faults_locality[3 + !!(mem_node == numa_node_id())] += pages;
+#endif
 }

 static void reset_ptenuma_scan(struct task_struct *p)
@@ -4285,6 +4297,23 @@  static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	cfs_rq->curr = NULL;
 }

+#ifdef CONFIG_CGROUP_NUMA_STAT
+static void update_numa_statistics(struct cfs_rq *cfs_rq)
+{
+	int idx;
+	unsigned long remote = current->numa_faults_locality[3];
+	unsigned long local = current->numa_faults_locality[4];
+
+	cfs_rq->nstat.jiffies++;
+
+	if (!remote && !local)
+		return;
+
+	idx = (NR_NL_INTERVAL - 1) * local / (remote + local);
+	cfs_rq->nstat.locality[idx]++;
+}
+#endif
+
 static void
 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
@@ -4298,6 +4327,10 @@  entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_group(curr);
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	if (static_branch_unlikely(&sched_cg_numa_stat))
+		update_numa_statistics(cfs_rq);
+#endif

 #ifdef CONFIG_SCHED_HRTICK
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 05c282775f21..87080c7164ac 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -486,6 +486,16 @@  struct cfs_bandwidth { };

 #endif	/* CONFIG_CGROUP_SCHED */

+#ifdef CONFIG_CGROUP_NUMA_STAT
+/* NUMA Locality Interval, 7 buckets for cache align */
+#define NR_NL_INTERVAL	7
+
+struct numa_statistics {
+	u64 jiffies;
+	u64 locality[NR_NL_INTERVAL];
+};
+#endif
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight	load;
@@ -575,6 +585,9 @@  struct cfs_rq {
 	struct list_head	throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	struct numa_statistics	nstat ____cacheline_aligned;
+#endif
 };

 static inline int rt_bandwidth_enabled(void)
@@ -1601,6 +1614,10 @@  static const_debug __maybe_unused unsigned int sysctl_sched_features =
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;

+#ifdef CONFIG_CGROUP_NUMA_STAT
+extern struct static_key_false sched_cg_numa_stat;
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50373984a5e2..2df9a5e7f875 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -428,6 +428,17 @@  static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_CGROUP_NUMA_STAT
+	{
+		.procname	= "cg_numa_stat",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_cg_numa_stat,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_CGROUP_NUMA_STAT */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",