
[RFC,2/3] psi: cgroup v1 support

Message ID: 20190604015745.78972-3-joseph.qi@linux.alibaba.com
State: New, archived
Series: psi: support cgroup v1

Commit Message

Joseph Qi June 4, 2019, 1:57 a.m. UTC
Implements pressure stall tracking for cgroup v1.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 9 deletions(-)

Comments

Johannes Weiner June 4, 2019, 11:55 a.m. UTC | #1
On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
> Implements pressure stall tracking for cgroup v1.
> 
> Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
> ---
>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 7acc632c3b82..909083c828d5 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>  	return state_mask;
>  }
>  
> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
> +{
> +	switch (res) {
> +	case NR_PSI_RESOURCES:
> +		return task_dfl_cgroup(task);
> +	case PSI_IO:
> +		return task_cgroup(task, io_cgrp_subsys.id);
> +	case PSI_MEM:
> +		return task_cgroup(task, memory_cgrp_subsys.id);
> +	case PSI_CPU:
> +		return task_cgroup(task, cpu_cgrp_subsys.id);
> +	default:  /* won't reach here */
> +		return NULL;
> +	}
> +}
> +
> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
> +					enum psi_res res)
>  {
>  #ifdef CONFIG_CGROUPS
>  	struct cgroup *cgroup = NULL;
>  
>  	if (!*iter)
> -		cgroup = task->cgroups->dfl_cgrp;
> +		cgroup = psi_task_cgroup(task, res);
>  	else if (*iter == &psi_system)
>  		return NULL;
>  	else
> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  		     wq_worker_last_func(task) == psi_avgs_work))
>  		wake_clock = false;
>  
> -	while ((group = iterate_groups(task, &iter))) {
> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>  
> -		if (state_mask & group->poll_states)
> -			psi_schedule_poll_work(group, 1);
> +			if (state_mask & group->poll_states)
> +				psi_schedule_poll_work(group, 1);
>  
> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +		}
> +	} else {
> +		enum psi_task_count i;
> +		enum psi_res res;
> +		int psi_flags = clear | set;
> +
> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
> +				res = PSI_IO;
> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
> +				res = PSI_MEM;
> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
> +				res = PSI_CPU;
> +			else
> +				continue;
> +
> +			while ((group = iterate_groups(task, &iter, res))) {
> +				u32 state_mask = psi_group_change(group, cpu, clear, set);

This doesn't work. Each resource state is composed of all possible
task states:

static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}

So the IO controller's groups need to know about NR_RUNNING to tell
some from full, the memory controller's groups need to know about
NR_IOWAIT to tell nonidle, and so on.
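
For example (an illustration with made-up counts, not from the patch):
with the per-resource walk in this patch, a group in the io hierarchy
only ever receives TSK_IOWAIT transitions, so its other counters stay
zero:

/* io-hierarchy group that never sees TSK_RUNNING transitions */
unsigned int tasks[NR_PSI_TASK_COUNTS] = {
	[NR_IOWAIT]   = 1,	/* one task waiting on IO */
	[NR_MEMSTALL] = 0,
	[NR_RUNNING]  = 0,	/* never updated in this hierarchy */
};

test_state(tasks, PSI_IO_SOME);	/* true */
test_state(tasks, PSI_IO_FULL);	/* also true, even while other tasks
				   in the group are runnable */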

You need to run the full psi task tracking and aggregation machinery
separately for each of the different cgroups a task can be in under v1.
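
Very roughly, it would have to look something like this (an untested
sketch; psi_task_change_v1 is a made-up name, and the poll/avgs work
scheduling that psi_task_change() does is left out):

static void psi_task_change_v1(struct task_struct *task, int clear, int set)
{
	/* one full ancestor walk per v1 hierarchy */
	const struct cgroup_subsys *subsys[] = {
		&cpu_cgrp_subsys, &memory_cgrp_subsys, &io_cgrp_subsys,
	};
	int cpu = task_cpu(task);
	int i;

	for (i = 0; i < ARRAY_SIZE(subsys); i++) {
		struct cgroup *cgroup = task_cgroup(task, subsys[i]->id);

		/*
		 * Apply the complete clear/set mask to every group, so
		 * each hierarchy tracks all of NR_IOWAIT, NR_MEMSTALL
		 * and NR_RUNNING, not just its controller's counter.
		 */
		while (cgroup && cgroup_parent(cgroup)) {
			psi_group_change(cgroup_psi(cgroup), cpu, clear, set);
			cgroup = cgroup_parent(cgroup);
		}
	}
	psi_group_change(&psi_system, cpu, clear, set);
}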

Needless to say, that is expensive. For cpu, memory and io, it's triple
the scheduling overhead, with three ancestor walks and three times the
cache footprint; three times as many aggregation workers every two
seconds... We could never turn this on by default.

Have you considered just co-mounting cgroup2, if for nothing else, to
get the pressure numbers?
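
Mechanically that is just an extra cgroup2 mount next to the existing
v1 hierarchies, with no controllers enabled in it. A hypothetical
illustration using mount(2) (the mount point is only an example and
must already exist; v2 group membership would still have to be managed
to get per-cgroup numbers):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* co-mount the v2 hierarchy purely for the pressure files;
	   the controllers all stay on their v1 hierarchies */
	if (mount("none", "/sys/fs/cgroup/unified", "cgroup2", 0, NULL)) {
		perror("mount cgroup2");
		return 1;
	}
	return 0;
}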
Joseph Qi June 5, 2019, 1:15 a.m. UTC | #2
Hi Johannes,

Thanks for the quick comments.

On 19/6/4 19:55, Johannes Weiner wrote:
> On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
>> Implements pressure stall tracking for cgroup v1.
>>
>> Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
>> ---
>>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 56 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 7acc632c3b82..909083c828d5 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>>  	return state_mask;
>>  }
>>  
>> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
>> +{
>> +	switch (res) {
>> +	case NR_PSI_RESOURCES:
>> +		return task_dfl_cgroup(task);
>> +	case PSI_IO:
>> +		return task_cgroup(task, io_cgrp_subsys.id);
>> +	case PSI_MEM:
>> +		return task_cgroup(task, memory_cgrp_subsys.id);
>> +	case PSI_CPU:
>> +		return task_cgroup(task, cpu_cgrp_subsys.id);
>> +	default:  /* won't reach here */
>> +		return NULL;
>> +	}
>> +}
>> +
>> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
>> +					enum psi_res res)
>>  {
>>  #ifdef CONFIG_CGROUPS
>>  	struct cgroup *cgroup = NULL;
>>  
>>  	if (!*iter)
>> -		cgroup = task->cgroups->dfl_cgrp;
>> +		cgroup = psi_task_cgroup(task, res);
>>  	else if (*iter == &psi_system)
>>  		return NULL;
>>  	else
>> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>>  		     wq_worker_last_func(task) == psi_avgs_work))
>>  		wake_clock = false;
>>  
>> -	while ((group = iterate_groups(task, &iter))) {
>> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
>> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
>> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
>> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>>  
>> -		if (state_mask & group->poll_states)
>> -			psi_schedule_poll_work(group, 1);
>> +			if (state_mask & group->poll_states)
>> +				psi_schedule_poll_work(group, 1);
>>  
>> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +		}
>> +	} else {
>> +		enum psi_task_count i;
>> +		enum psi_res res;
>> +		int psi_flags = clear | set;
>> +
>> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
>> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
>> +				res = PSI_IO;
>> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
>> +				res = PSI_MEM;
>> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
>> +				res = PSI_CPU;
>> +			else
>> +				continue;
>> +
>> +			while ((group = iterate_groups(task, &iter, res))) {
>> +				u32 state_mask = psi_group_change(group, cpu, clear, set);
> 
> This doesn't work. Each resource state is composed of all possible
> task states:
> 
> static bool test_state(unsigned int *tasks, enum psi_states state)
> {
> 	switch (state) {
> 	case PSI_IO_SOME:
> 		return tasks[NR_IOWAIT];
> 	case PSI_IO_FULL:
> 		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
> 	case PSI_MEM_SOME:
> 		return tasks[NR_MEMSTALL];
> 	case PSI_MEM_FULL:
> 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
> 	case PSI_CPU_SOME:
> 		return tasks[NR_RUNNING] > 1;
> 	case PSI_NONIDLE:
> 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
> 			tasks[NR_RUNNING];
> 	default:
> 		return false;
> 	}
> }
> 
> So the IO controller's groups need to know about NR_RUNNING to tell
> some from full, the memory controller's groups need to know about
> NR_IOWAIT to tell nonidle, and so on.
> 
> You need to run the full psi task tracking and aggregation machinery
> separately for each of the different cgroups a task can be in under v1.
> 
Yes, since in v1 each controller has its own hierarchy.

> Needless to say, that is expensive. For cpu, memory and io, it's triple
> the scheduling overhead, with three ancestor walks and three times the
> cache footprint; three times as many aggregation workers every two
> seconds... We could never turn this on by default.
> 
I see. But even on cgroup v2, would it still be expensive if we have
many cgroups?

> Have you considered just co-mounting cgroup2, if for nothing else, to
> get the pressure numbers?
> 
Do you mean mounting cgroup v1 and cgroup v2 at the same time?
IIUC, this may not work, since a lot of cgroup code has xxx_on_dfl checks.

Thanks,
Joseph

Patch

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..909083c828d5 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -719,13 +719,30 @@  static u32 psi_group_change(struct psi_group *group, int cpu,
 	return state_mask;
 }
 
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
+{
+	switch (res) {
+	case NR_PSI_RESOURCES:
+		return task_dfl_cgroup(task);
+	case PSI_IO:
+		return task_cgroup(task, io_cgrp_subsys.id);
+	case PSI_MEM:
+		return task_cgroup(task, memory_cgrp_subsys.id);
+	case PSI_CPU:
+		return task_cgroup(task, cpu_cgrp_subsys.id);
+	default:  /* won't reach here */
+		return NULL;
+	}
+}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
+					enum psi_res res)
 {
 #ifdef CONFIG_CGROUPS
 	struct cgroup *cgroup = NULL;
 
 	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
+		cgroup = psi_task_cgroup(task, res);
 	else if (*iter == &psi_system)
 		return NULL;
 	else
@@ -776,15 +793,45 @@  void psi_task_change(struct task_struct *task, int clear, int set)
 		     wq_worker_last_func(task) == psi_avgs_work))
 		wake_clock = false;
 
-	while ((group = iterate_groups(task, &iter))) {
-		u32 state_mask = psi_group_change(group, cpu, clear, set);
+	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
+	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
+			u32 state_mask = psi_group_change(group, cpu, clear, set);
 
-		if (state_mask & group->poll_states)
-			psi_schedule_poll_work(group, 1);
+			if (state_mask & group->poll_states)
+				psi_schedule_poll_work(group, 1);
 
-		if (wake_clock && !delayed_work_pending(&group->avgs_work))
-			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+			if (wake_clock && !delayed_work_pending(&group->avgs_work))
+				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+		}
+	} else {
+		enum psi_task_count i;
+		enum psi_res res;
+		int psi_flags = clear | set;
+
+		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
+			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
+				res = PSI_IO;
+			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
+				res = PSI_MEM;
+			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
+				res = PSI_CPU;
+			else
+				continue;
+
+			while ((group = iterate_groups(task, &iter, res))) {
+				u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+				if (state_mask & group->poll_states)
+					psi_schedule_poll_work(group, 1);
+
+				if (wake_clock && !delayed_work_pending(&group->avgs_work))
+					schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+			}
+		}
 	}
+
 }
 
 void psi_memstall_tick(struct task_struct *task, int cpu)
@@ -792,7 +839,7 @@  void psi_memstall_tick(struct task_struct *task, int cpu)
 	struct psi_group *group;
 	void *iter = NULL;
 
-	while ((group = iterate_groups(task, &iter))) {
+	while ((group = iterate_groups(task, &iter, PSI_MEM))) {
 		struct psi_group_cpu *groupc;
 
 		groupc = per_cpu_ptr(group->pcpu, cpu);