diff mbox

[RFCv6,09/10] sched: deadline: use deadline bandwidth in scale_rt_capacity

Message ID 1449641971-20827-10-git-send-email-smuckle@linaro.org (mailing list archive)
State RFC, archived
Headers show

Commit Message

Steve Muckle Dec. 9, 2015, 6:19 a.m. UTC
From: Vincent Guittot <vincent.guittot@linaro.org>

Instead of monitoring the exec time of deadline tasks to evaluate the
CPU capacity consumed by deadline scheduler class, we can directly
calculate it thanks to the sum of utilization of deadline tasks on the
CPU.  We can remove deadline tasks from rt_avg metric and directly use
the average bandwidth of deadline scheduler in scale_rt_capacity.

Based in part on a similar patch from Luca Abeni <luca.abeni@unitn.it>.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Steve Muckle <smuckle@linaro.org>
---
 kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++--
 kernel/sched/fair.c     |  8 ++++++++
 kernel/sched/sched.h    |  2 ++
 3 files changed, 41 insertions(+), 2 deletions(-)

Comments

Vincent Guittot Dec. 9, 2015, 8:50 a.m. UTC | #1
adding Lucas

On 9 December 2015 at 07:19, Steve Muckle <steve.muckle@linaro.org> wrote:
> From: Vincent Guittot <vincent.guittot@linaro.org>
>
> Instead of monitoring the exec time of deadline tasks to evaluate the
> CPU capacity consumed by deadline scheduler class, we can directly
> calculate it thanks to the sum of utilization of deadline tasks on the
> CPU.  We can remove deadline tasks from rt_avg metric and directly use
> the average bandwidth of deadline scheduler in scale_rt_capacity.
>
> Based in part on a similar patch from Luca Abeni <luca.abeni@unitn.it>.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Steve Muckle <smuckle@linaro.org>
> ---
>  kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++--
>  kernel/sched/fair.c     |  8 ++++++++
>  kernel/sched/sched.h    |  2 ++
>  3 files changed, 41 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 8b0a15e..9d9eb50 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
>         return !RB_EMPTY_NODE(&dl_se->rb_node);
>  }
>
> +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> +{
> +       u64 se_bw = dl_se->dl_bw;
> +
> +       dl_rq->avg_bw += se_bw;
> +}
> +
> +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> +{
> +       u64 se_bw = dl_se->dl_bw;
> +
> +       dl_rq->avg_bw -= se_bw;
> +       if (dl_rq->avg_bw < 0) {
> +               WARN_ON(1);
> +               dl_rq->avg_bw = 0;
> +       }
> +}
> +
>  static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
>  {
>         struct sched_dl_entity *dl_se = &p->dl;
> @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>         struct rq *rq = rq_of_dl_rq(dl_rq);
>
> +       if (dl_se->dl_new)
> +               add_average_bw(dl_se, dl_rq);
> +
>         /*
>          * The arrival of a new instance needs special treatment, i.e.,
>          * the actual scheduling parameters have to be "renewed".
> @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq)
>         curr->se.exec_start = rq_clock_task(rq);
>         cpuacct_charge(curr, delta_exec);
>
> -       sched_rt_avg_update(rq, delta_exec);
> -
>         dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
>         if (dl_runtime_exceeded(dl_se)) {
>                 dl_se->dl_throttled = 1;
> @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p)
>  static void task_dead_dl(struct task_struct *p)
>  {
>         struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
> +       struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
> +       struct rq *rq = rq_of_dl_rq(dl_rq);
>
>         /*
>          * Since we are TASK_DEAD we won't slip out of the domain!
> @@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p)
>         /* XXX we should retain the bw until 0-lag */
>         dl_b->total_bw -= p->dl.dl_bw;
>         raw_spin_unlock_irq(&dl_b->lock);
> +
> +       clear_average_bw(&p->dl, &rq->dl);
>  }
>
>  static void set_curr_task_dl(struct rq *rq)
> @@ -1556,7 +1579,9 @@ retry:
>         }
>
>         deactivate_task(rq, next_task, 0);
> +       clear_average_bw(&next_task->dl, &rq->dl);
>         set_task_cpu(next_task, later_rq->cpu);
> +       add_average_bw(&next_task->dl, &later_rq->dl);
>         activate_task(later_rq, next_task, 0);
>         ret = 1;
>
> @@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq)
>                         resched = true;
>
>                         deactivate_task(src_rq, p, 0);
> +                       clear_average_bw(&p->dl, &src_rq->dl);
>                         set_task_cpu(p, this_cpu);
> +                       add_average_bw(&p->dl, &this_rq->dl);
>                         activate_task(this_rq, p, 0);
>                         dmin = p->dl.deadline;
>
> @@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
>         if (!start_dl_timer(p))
>                 __dl_clear_params(p);
>
> +       clear_average_bw(&p->dl, &rq->dl);
> +
>         /*
>          * Since this might be the only -deadline task on the rq,
>          * this is the right place to try to pull some other one
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4c49f76..ce05f61 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6203,6 +6203,14 @@ static unsigned long scale_rt_capacity(int cpu)
>
>         used = div_u64(avg, total);
>
> +       /*
> +        * deadline bandwidth is defined at system level so we must
> +        * weight this bandwidth with the max capacity of the system.
> +        * As a reminder, avg_bw is 20bits width and
> +        * scale_cpu_capacity is 10 bits width
> +        */
> +       used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
> +
>         if (likely(used < SCHED_CAPACITY_SCALE))
>                 return SCHED_CAPACITY_SCALE - used;
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 08858d1..e44c6be 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -519,6 +519,8 @@ struct dl_rq {
>  #else
>         struct dl_bw dl_bw;
>  #endif
> +       /* This is the "average utilization" for this runqueue */
> +       s64 avg_bw;
>  };
>
>  #ifdef CONFIG_SMP
> --
> 2.4.10
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 10, 2015, 1:27 p.m. UTC | #2
Hi Vincent,

first of all, thanks for adding me in the discussion.

On 12/09/2015 09:50 AM, Vincent Guittot wrote:
> adding Lucas
>
> On 9 December 2015 at 07:19, Steve Muckle <steve.muckle@linaro.org> wrote:
>> From: Vincent Guittot <vincent.guittot@linaro.org>
>>
>> Instead of monitoring the exec time of deadline tasks to evaluate the
>> CPU capacity consumed by deadline scheduler class, we can directly
>> calculate it thanks to the sum of utilization of deadline tasks on the
>> CPU.  We can remove deadline tasks from rt_avg metric and directly use
>> the average bandwidth of deadline scheduler in scale_rt_capacity.
>>
>> Based in part on a similar patch from Luca Abeni <luca.abeni@unitn.it>.
Just to check if my understanding of your patch is correct: what you do is
to track the total utilisation of the tasks that are assigned to a CPU/core,
independently from their state (active or inactive). The difference with my
patch is that I try to track the "active utilisation" (eliminating the utilisation
of the tasks that are blocked).

Is this understanding correct?
If yes, I think your approach is safe (and easier to implement - modulo a small
issue when a task terminates of switches to other scheduling policies; I think
there already are some "XXX" comments in the current code). However, it allows to
save less energy (or reclaim less CPU time). For example, if I create a SCHED_DEADLINE
task with runtime 90ms and period 100ms it will not allow to scale the CPU frequency
even if it never executes (because is always blocked).


[...]
>> +       /* This is the "average utilization" for this runqueue */
>> +       s64 avg_bw;
>>   };
Small nit: why "average" utilization? I think a better name would be "runqueue utilization"
or "local utilization", or something similar... If I understand correctly (sorry if I
missed something), this is not an average, but the sum of the utilisations of the tasks
on this runqueue... No?



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 10, 2015, 4:11 p.m. UTC | #3
On 10 December 2015 at 14:27, Luca Abeni <luca.abeni@unitn.it> wrote:
> Hi Vincent,
>
> first of all, thanks for adding me in the discussion.
>
> On 12/09/2015 09:50 AM, Vincent Guittot wrote:
>>
>> adding Lucas
>>
>> On 9 December 2015 at 07:19, Steve Muckle <steve.muckle@linaro.org> wrote:
>>>
>>> From: Vincent Guittot <vincent.guittot@linaro.org>
>>>
>>> Instead of monitoring the exec time of deadline tasks to evaluate the
>>> CPU capacity consumed by deadline scheduler class, we can directly
>>> calculate it thanks to the sum of utilization of deadline tasks on the
>>> CPU.  We can remove deadline tasks from rt_avg metric and directly use
>>> the average bandwidth of deadline scheduler in scale_rt_capacity.
>>>
>>> Based in part on a similar patch from Luca Abeni <luca.abeni@unitn.it>.
>
> Just to check if my understanding of your patch is correct: what you do is
> to track the total utilisation of the tasks that are assigned to a CPU/core,
> independently from their state (active or inactive). The difference with my
> patch is that I try to track the "active utilisation" (eliminating the
> utilisation
> of the tasks that are blocked).
>
> Is this understanding correct?

yes, i want to know the average utilization of the CPU/core by
deadline scheduler.

> If yes, I think your approach is safe (and easier to implement - modulo a
> small
> issue when a task terminates of switches to other scheduling policies; I
> think
> there already are some "XXX" comments in the current code). However, it
> allows to
> save less energy (or reclaim less CPU time). For example, if I create a
> SCHED_DEADLINE
> task with runtime 90ms and period 100ms it will not allow to scale the CPU
> frequency
> even if it never executes (because is always blocked).

Yes, i agree. If the task behavior is not aligned with its deadline
parameters, we will over provisioned CPU capacity to the deadline
scheduler.
This metric is not used to select the OPP but to provisioned some CPU
capacity to the deadline scheduler and to inform the CFS scheduler of
the remaining capacity.
So the main side effect of an always blocked deadline task will be to
decrease the rq->cpu_capacity that is then used by the CFS scheduler.

>
>
> [...]
>>>
>>> +       /* This is the "average utilization" for this runqueue */
>>> +       s64 avg_bw;
>>>   };
>
> Small nit: why "average" utilization? I think a better name would be
> "runqueue utilization"
> or "local utilization", or something similar... If I understand correctly
> (sorry if I
> missed something), this is not an average, but the sum of the utilisations
> of the tasks
> on this runqueue... No?

I have used "average" because it doesn't reflect the active/actual
utilization of the run-queue but the forecasted average bandwidth of
the CPU that will be used by deadline task. I'm open to change the
name if another one makes more sense

Regards,
Vincent

>
>
>
>                                 Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 11, 2015, 7:48 a.m. UTC | #4
Hi Vincent,

On 12/10/2015 05:11 PM, Vincent Guittot wrote:
[...]
>> If yes, I think your approach is safe (and easier to implement - modulo a
>> small
>> issue when a task terminates of switches to other scheduling policies; I
>> think
>> there already are some "XXX" comments in the current code). However, it
>> allows to
>> save less energy (or reclaim less CPU time). For example, if I create a
>> SCHED_DEADLINE
>> task with runtime 90ms and period 100ms it will not allow to scale the CPU
>> frequency
>> even if it never executes (because is always blocked).
>
> Yes, i agree. If the task behavior is not aligned with its deadline
> parameters, we will over provisioned CPU capacity to the deadline
> scheduler.
> This metric is not used to select the OPP but to provisioned some CPU
> capacity to the deadline scheduler and to inform the CFS scheduler of
> the remaining capacity.
Ah, sorry; I missed this point.


>>>> +       /* This is the "average utilization" for this runqueue */
>>>> +       s64 avg_bw;
>>>>    };
>>
>> Small nit: why "average" utilization? I think a better name would be
>> "runqueue utilization"
>> or "local utilization", or something similar... If I understand correctly
>> (sorry if I
>> missed something), this is not an average, but the sum of the utilisations
>> of the tasks
>> on this runqueue... No?
>
> I have used "average" because it doesn't reflect the active/actual
> utilization of the run-queue but the forecasted average bandwidth of
> the CPU that will be used by deadline task.
Well, this is just nitpicking, so feel free to ignore (I just mentioned
this point because I was initially confused by the "average" name). But I
think this is "maximum", or "worst-case", not "average", because (as far
as I can understand) this field indicates that SCHED_DEADLINE tasks will
not be able to consume more than this fraction of CPU (if they try to
consume more, the scheduler throttles them).

> I'm open to change the name if another one makes more sense
In real-time literature this is often called simply "utilization" (or "worst-case
utilization" by someone): when a task can have a variable execution time, its
utilization is defined as WCET (maximum execution time) / period.



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 14, 2015, 2:02 p.m. UTC | #5
On 11 December 2015 at 08:48, Luca Abeni <luca.abeni@unitn.it> wrote:
> Hi Vincent,
>
> On 12/10/2015 05:11 PM, Vincent Guittot wrote:
> [...]
>>>
>>> If yes, I think your approach is safe (and easier to implement - modulo a
>>> small
>>> issue when a task terminates of switches to other scheduling policies; I
>>> think
>>> there already are some "XXX" comments in the current code). However, it
>>> allows to
>>> save less energy (or reclaim less CPU time). For example, if I create a
>>> SCHED_DEADLINE
>>> task with runtime 90ms and period 100ms it will not allow to scale the
>>> CPU
>>> frequency
>>> even if it never executes (because is always blocked).
>>
>>
>> Yes, i agree. If the task behavior is not aligned with its deadline
>> parameters, we will over provisioned CPU capacity to the deadline
>> scheduler.
>> This metric is not used to select the OPP but to provisioned some CPU
>> capacity to the deadline scheduler and to inform the CFS scheduler of
>> the remaining capacity.
>
> Ah, sorry; I missed this point.
>
>
>>>>> +       /* This is the "average utilization" for this runqueue */
>>>>> +       s64 avg_bw;
>>>>>    };
>>>
>>>
>>> Small nit: why "average" utilization? I think a better name would be
>>> "runqueue utilization"
>>> or "local utilization", or something similar... If I understand correctly
>>> (sorry if I
>>> missed something), this is not an average, but the sum of the
>>> utilisations
>>> of the tasks
>>> on this runqueue... No?
>>
>>
>> I have used "average" because it doesn't reflect the active/actual
>> utilization of the run-queue but the forecasted average bandwidth of
>> the CPU that will be used by deadline task.
>
> Well, this is just nitpicking, so feel free to ignore (I just mentioned
> this point because I was initially confused by the "average" name). But I
> think this is "maximum", or "worst-case", not "average", because (as far
> as I can understand) this field indicates that SCHED_DEADLINE tasks will
> not be able to consume more than this fraction of CPU (if they try to
> consume more, the scheduler throttles them).
>
>> I'm open to change the name if another one makes more sense
>
> In real-time literature this is often called simply "utilization" (or
> "worst-case
> utilization" by someone): when a task can have a variable execution time,
> its
> utilization is defined as WCET (maximum execution time) / period.

ok. Let follow real-time literature wording and remove "average" to
keep only utilization.
so the variable will be named:

s64 util_bw;

Thanks

>
>
>
>                                 Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 14, 2015, 2:38 p.m. UTC | #6
On 12/14/2015 03:02 PM, Vincent Guittot wrote:
[...]
>>>> Small nit: why "average" utilization? I think a better name would be
>>>> "runqueue utilization"
>>>> or "local utilization", or something similar... If I understand correctly
>>>> (sorry if I
>>>> missed something), this is not an average, but the sum of the
>>>> utilisations
>>>> of the tasks
>>>> on this runqueue... No?
>>>
>>>
>>> I have used "average" because it doesn't reflect the active/actual
>>> utilization of the run-queue but the forecasted average bandwidth of
>>> the CPU that will be used by deadline task.
>>
>> Well, this is just nitpicking, so feel free to ignore (I just mentioned
>> this point because I was initially confused by the "average" name). But I
>> think this is "maximum", or "worst-case", not "average", because (as far
>> as I can understand) this field indicates that SCHED_DEADLINE tasks will
>> not be able to consume more than this fraction of CPU (if they try to
>> consume more, the scheduler throttles them).
>>
>>> I'm open to change the name if another one makes more sense
>>
>> In real-time literature this is often called simply "utilization" (or
>> "worst-case
>> utilization" by someone): when a task can have a variable execution time,
>> its
>> utilization is defined as WCET (maximum execution time) / period.
>
> ok. Let follow real-time literature wording and remove "average" to
> keep only utilization.
> so the variable will be named:
>
> s64 util_bw;
Well, "utilization" and "bandwidth" are often used to indicate the same
quantity, so "util_bw" sounds strange. I'd call it simply "utilization" or
"bandwidth" (otherwise, just leave the name as it is... I said this is just
nitpicking).



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 14, 2015, 3:17 p.m. UTC | #7
On Tue, Dec 08, 2015 at 10:19:30PM -0800, Steve Muckle wrote:
> From: Vincent Guittot <vincent.guittot@linaro.org>

> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 8b0a15e..9d9eb50 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
>  	return !RB_EMPTY_NODE(&dl_se->rb_node);
>  }
>  
> +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> +{
> +	u64 se_bw = dl_se->dl_bw;
> +
> +	dl_rq->avg_bw += se_bw;
> +}
> +
> +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> +{
> +	u64 se_bw = dl_se->dl_bw;
> +
> +	dl_rq->avg_bw -= se_bw;
> +	if (dl_rq->avg_bw < 0) {
> +		WARN_ON(1);
> +		dl_rq->avg_bw = 0;
> +	}
> +}


> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4c49f76..ce05f61 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6203,6 +6203,14 @@ static unsigned long scale_rt_capacity(int cpu)
>  
>  	used = div_u64(avg, total);
>  
> +	/*
> +	 * deadline bandwidth is defined at system level so we must
> +	 * weight this bandwidth with the max capacity of the system.
> +	 * As a reminder, avg_bw is 20bits width and
> +	 * scale_cpu_capacity is 10 bits width
> +	 */
> +	used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
> +
>  	if (likely(used < SCHED_CAPACITY_SCALE))
>  		return SCHED_CAPACITY_SCALE - used;
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 08858d1..e44c6be 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -519,6 +519,8 @@ struct dl_rq {
>  #else
>  	struct dl_bw dl_bw;
>  #endif
> +	/* This is the "average utilization" for this runqueue */
> +	s64 avg_bw;
>  };

So I don't think this is right. AFAICT this projects the WCET as the
amount of time actually used by DL. This will, under many circumstances,
vastly overestimate the amount of time actually spend on it. Therefore
unduly pessimisme the fair capacity of this CPU.


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 14, 2015, 3:56 p.m. UTC | #8
On 14 December 2015 at 16:17, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Dec 08, 2015 at 10:19:30PM -0800, Steve Muckle wrote:
>> From: Vincent Guittot <vincent.guittot@linaro.org>
>
>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>> index 8b0a15e..9d9eb50 100644
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>> @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
>>       return !RB_EMPTY_NODE(&dl_se->rb_node);
>>  }
>>
>> +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>> +{
>> +     u64 se_bw = dl_se->dl_bw;
>> +
>> +     dl_rq->avg_bw += se_bw;
>> +}
>> +
>> +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>> +{
>> +     u64 se_bw = dl_se->dl_bw;
>> +
>> +     dl_rq->avg_bw -= se_bw;
>> +     if (dl_rq->avg_bw < 0) {
>> +             WARN_ON(1);
>> +             dl_rq->avg_bw = 0;
>> +     }
>> +}
>
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 4c49f76..ce05f61 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6203,6 +6203,14 @@ static unsigned long scale_rt_capacity(int cpu)
>>
>>       used = div_u64(avg, total);
>>
>> +     /*
>> +      * deadline bandwidth is defined at system level so we must
>> +      * weight this bandwidth with the max capacity of the system.
>> +      * As a reminder, avg_bw is 20bits width and
>> +      * scale_cpu_capacity is 10 bits width
>> +      */
>> +     used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
>> +
>>       if (likely(used < SCHED_CAPACITY_SCALE))
>>               return SCHED_CAPACITY_SCALE - used;
>>
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 08858d1..e44c6be 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -519,6 +519,8 @@ struct dl_rq {
>>  #else
>>       struct dl_bw dl_bw;
>>  #endif
>> +     /* This is the "average utilization" for this runqueue */
>> +     s64 avg_bw;
>>  };
>
> So I don't think this is right. AFAICT this projects the WCET as the
> amount of time actually used by DL. This will, under many circumstances,
> vastly overestimate the amount of time actually spend on it. Therefore
> unduly pessimisme the fair capacity of this CPU.

I agree that if the WCET is far from reality, we will underestimate
available capacity for CFS. Have you got some use case in mind which
overestimates the WCET ?
If we can't rely on this parameters to evaluate the amount of capacity
used by deadline scheduler on a core, this will imply that we can't
also use it for requesting capacity to cpufreq and we should fallback
on a monitoring mechanism which reacts to a change instead of
anticipating it.

>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Dec. 14, 2015, 4:07 p.m. UTC | #9
On 14/12/15 16:56, Vincent Guittot wrote:
> On 14 December 2015 at 16:17, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Dec 08, 2015 at 10:19:30PM -0800, Steve Muckle wrote:
> >> From: Vincent Guittot <vincent.guittot@linaro.org>
> >
> >> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> >> index 8b0a15e..9d9eb50 100644
> >> --- a/kernel/sched/deadline.c
> >> +++ b/kernel/sched/deadline.c
> >> @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
> >>       return !RB_EMPTY_NODE(&dl_se->rb_node);
> >>  }
> >>
> >> +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> >> +{
> >> +     u64 se_bw = dl_se->dl_bw;
> >> +
> >> +     dl_rq->avg_bw += se_bw;
> >> +}
> >> +
> >> +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
> >> +{
> >> +     u64 se_bw = dl_se->dl_bw;
> >> +
> >> +     dl_rq->avg_bw -= se_bw;
> >> +     if (dl_rq->avg_bw < 0) {
> >> +             WARN_ON(1);
> >> +             dl_rq->avg_bw = 0;
> >> +     }
> >> +}
> >
> >
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 4c49f76..ce05f61 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -6203,6 +6203,14 @@ static unsigned long scale_rt_capacity(int cpu)
> >>
> >>       used = div_u64(avg, total);
> >>
> >> +     /*
> >> +      * deadline bandwidth is defined at system level so we must
> >> +      * weight this bandwidth with the max capacity of the system.
> >> +      * As a reminder, avg_bw is 20bits width and
> >> +      * scale_cpu_capacity is 10 bits width
> >> +      */
> >> +     used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
> >> +
> >>       if (likely(used < SCHED_CAPACITY_SCALE))
> >>               return SCHED_CAPACITY_SCALE - used;
> >>
> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >> index 08858d1..e44c6be 100644
> >> --- a/kernel/sched/sched.h
> >> +++ b/kernel/sched/sched.h
> >> @@ -519,6 +519,8 @@ struct dl_rq {
> >>  #else
> >>       struct dl_bw dl_bw;
> >>  #endif
> >> +     /* This is the "average utilization" for this runqueue */
> >> +     s64 avg_bw;
> >>  };
> >
> > So I don't think this is right. AFAICT this projects the WCET as the
> > amount of time actually used by DL. This will, under many circumstances,
> > vastly overestimate the amount of time actually spend on it. Therefore
> > unduly pessimisme the fair capacity of this CPU.
> 
> I agree that if the WCET is far from reality, we will underestimate
> available capacity for CFS. Have you got some use case in mind which
> overestimates the WCET ?

I guess simply the fact that one task can be admitted to the system, but
then in practice sleep, waiting from some event to happen.

> If we can't rely on this parameters to evaluate the amount of capacity
> used by deadline scheduler on a core, this will imply that we can't
> also use it for requesting capacity to cpufreq and we should fallback
> on a monitoring mechanism which reacts to a change instead of
> anticipating it.
> 

There is at least one way in the middle: use utilization of active
servers (as I think Luca was already mentioning). This solution should
remove some of the pessimism, but still be safe for our needs. I should
be able to play with this alternative in the (hopefully) near future.

Thanks,

- Juri
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 14, 2015, 4:51 p.m. UTC | #10
On Mon, Dec 14, 2015 at 04:56:17PM +0100, Vincent Guittot wrote:
> I agree that if the WCET is far from reality, we will underestimate
> available capacity for CFS. Have you got some use case in mind which
> overestimates the WCET ?

Pretty much any 'correct' WCET is pessimistic. There's heaps of smart
people working on improving WCET bounds, but they're still out there.
This is mostly because of the .00001% tail cases that 'never' happen but
would make your tokamak burn a hole just when you're outside.

> If we can't rely on this parameters to evaluate the amount of capacity
> used by deadline scheduler on a core, this will imply that we can't
> also use it for requesting capacity to cpufreq and we should fallback
> on a monitoring mechanism which reacts to a change instead of
> anticipating it.

No, since the WCET can and _will_ happen, its the best you can do with
cpufreq. If you were to set it lower you could not be able to execute
correctly in your 'never' tail cases.

There 'might' be smart pants ways around this, where you run part of the
execution at lower speed and switch to a higher speed to 'catch' up if
you exceed some boundary, such that, on average, you run at the same
speed the WCET mandates, but I'm not sure that's worth it. Juri/Luca
might know.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 14, 2015, 9:12 p.m. UTC | #11
On Mon, 14 Dec 2015 16:56:17 +0100
Vincent Guittot <vincent.guittot@linaro.org> wrote:
[...]
> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >> index 08858d1..e44c6be 100644
> >> --- a/kernel/sched/sched.h
> >> +++ b/kernel/sched/sched.h
> >> @@ -519,6 +519,8 @@ struct dl_rq {
> >>  #else
> >>       struct dl_bw dl_bw;
> >>  #endif
> >> +     /* This is the "average utilization" for this runqueue */
> >> +     s64 avg_bw;
> >>  };
> >
> > So I don't think this is right. AFAICT this projects the WCET as the
> > amount of time actually used by DL. This will, under many
> > circumstances, vastly overestimate the amount of time actually
> > spend on it. Therefore unduly pessimisme the fair capacity of this
> > CPU.
> 
> I agree that if the WCET is far from reality, we will underestimate
> available capacity for CFS. Have you got some use case in mind which
> overestimates the WCET ?
> If we can't rely on this parameters to evaluate the amount of capacity
> used by deadline scheduler on a core, this will imply that we can't
> also use it for requesting capacity to cpufreq and we should fallback
> on a monitoring mechanism which reacts to a change instead of
> anticipating it.
I think a more "theoretically sound" approach would be to track the
_active_ utilisation (informally speaking, the sum of the utilisations
of the tasks that are actually active on a core - the exact definition
of "active" is the trick here).
As done, for example, here:
https://github.com/lucabe72/linux-reclaiming/tree/track-utilisation-v2
(in particular, see
https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b
)
I understand this approach might look too complex... But I think it is
much less pessimistic while still being "safe".
If there is something that I can do to make that code more acceptable,
let me know.


			Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 14, 2015, 9:19 p.m. UTC | #12
On Mon, 14 Dec 2015 16:07:59 +0000
Juri Lelli <Juri.Lelli@arm.com> wrote:
[...]
> > I agree that if the WCET is far from reality, we will underestimate
> > available capacity for CFS. Have you got some use case in mind which
> > overestimates the WCET ?
> 
> I guess simply the fact that one task can be admitted to the system,
> but then in practice sleep, waiting from some event to happen.
My favourite example (since 1998 :) is a video player (but every task
processing compressed video should work as an example): there is a
noticeable difference between the time needed to process large I frames
with a lot of movement (that is about the WCET) and the time needed to
process small B frames with not much movement. And if we want to avoid
too much jitter in the video playback we have to allocate the runtime
based on the maximum time needed to process a video frame.


> > If we can't rely on this parameters to evaluate the amount of
> > capacity used by deadline scheduler on a core, this will imply that
> > we can't also use it for requesting capacity to cpufreq and we
> > should fallback on a monitoring mechanism which reacts to a change
> > instead of anticipating it.
> > 
> 
> There is at least one way in the middle: use utilization of active
> servers (as I think Luca was already mentioning). This solution should
> remove some of the pessimism, but still be safe for our needs.
If you track the active utilisation as done by the GRUB algorithm
( http://retis.sssup.it/~lipari/papers/lipariBaruah2000.pdf ) and by my
patches, you can remove _all_ the pessimism :)


			Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 14, 2015, 9:31 p.m. UTC | #13
On Mon, 14 Dec 2015 17:51:28 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Dec 14, 2015 at 04:56:17PM +0100, Vincent Guittot wrote:
> > I agree that if the WCET is far from reality, we will underestimate
> > available capacity for CFS. Have you got some use case in mind which
> > overestimates the WCET ?
> 
> Pretty much any 'correct' WCET is pessimistic. There's heaps of smart
> people working on improving WCET bounds, but they're still out there.
> This is mostly because of the .00001% tail cases that 'never' happen
> but would make your tokamak burn a hole just when you're outside.
As I mentioned in a previous email, you do not even need to consider
these extreme cases... If a task has a highly variable execution time
(I always mention video players and compressed video processing, but
collegues working on computer vision told me that some video tracking
algorithms have similar characteristics) you might want to allocate the
runtime based on the maximum execution time (or a time near to the
maximum)... But the task will consume less than that a lot of times.


> > If we can't rely on this parameters to evaluate the amount of
> > capacity used by deadline scheduler on a core, this will imply that
> > we can't also use it for requesting capacity to cpufreq and we
> > should fallback on a monitoring mechanism which reacts to a change
> > instead of anticipating it.
> 
> No, since the WCET can and _will_ happen, its the best you can do with
> cpufreq. If you were to set it lower you could not be able to execute
> correctly in your 'never' tail cases.
> 
> There 'might' be smart pants ways around this, where you run part of
> the execution at lower speed and switch to a higher speed to 'catch'
> up if you exceed some boundary, such that, on average, you run at the
> same speed the WCET mandates, but I'm not sure that's worth it.
> Juri/Luca might know.
Some previous works (see for example
https://www.researchgate.net/profile/Giuseppe_Lipari/publication/220800940_Using_resource_reservation_techniques_for_power-aware_scheduling/links/09e41513639b2703fc000000.pdf
) investigated the usage of the "active utilisation" for switching the
CPU frequency. This "active utilisation tracking" mechanism is the same
I mentioned in the previous email, and implemented here:
https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b .

I suspect the "inactive timer" I used to decrease the utilisation at
the so called 0-lag time might be problematic, but I did not find any
way to implement (or approximate) the active utilisation tracking
without this timer... Anyway, if there is interest I am willing to
adapt/rework/modify my patches as needed.


				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 4:43 a.m. UTC | #14
On 14 December 2015 at 17:51, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Dec 14, 2015 at 04:56:17PM +0100, Vincent Guittot wrote:
>> I agree that if the WCET is far from reality, we will underestimate
>> available capacity for CFS. Have you got some use case in mind which
>> overestimates the WCET ?
>
> Pretty much any 'correct' WCET is pessimistic. There's heaps of smart
> people working on improving WCET bounds, but they're still out there.
> This is mostly because of the .00001% tail cases that 'never' happen but
> would make your tokamak burn a hole just when you're outside.
>
>> If we can't rely on this parameters to evaluate the amount of capacity
>> used by deadline scheduler on a core, this will imply that we can't
>> also use it for requesting capacity to cpufreq and we should fallback
>> on a monitoring mechanism which reacts to a change instead of
>> anticipating it.
>
> No, since the WCET can and _will_ happen, its the best you can do with
> cpufreq. If you were to set it lower you could not be able to execute
> correctly in your 'never' tail cases.

In the context of frequency scaling, This mean that we will never
reach low frequency


>
> There 'might' be smart pants ways around this, where you run part of the
> execution at lower speed and switch to a higher speed to 'catch' up if
> you exceed some boundary, such that, on average, you run at the same
> speed the WCET mandates, but I'm not sure that's worth it. Juri/Luca
> might know.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 4:59 a.m. UTC | #15
On 14 December 2015 at 22:12, Luca Abeni <luca.abeni@unitn.it> wrote:
> On Mon, 14 Dec 2015 16:56:17 +0100
> Vincent Guittot <vincent.guittot@linaro.org> wrote:
> [...]
>> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> >> index 08858d1..e44c6be 100644
>> >> --- a/kernel/sched/sched.h
>> >> +++ b/kernel/sched/sched.h
>> >> @@ -519,6 +519,8 @@ struct dl_rq {
>> >>  #else
>> >>       struct dl_bw dl_bw;
>> >>  #endif
>> >> +     /* This is the "average utilization" for this runqueue */
>> >> +     s64 avg_bw;
>> >>  };
>> >
>> > So I don't think this is right. AFAICT this projects the WCET as the
>> > amount of time actually used by DL. This will, under many
>> > circumstances, vastly overestimate the amount of time actually
>> > spend on it. Therefore unduly pessimisme the fair capacity of this
>> > CPU.
>>
>> I agree that if the WCET is far from reality, we will underestimate
>> available capacity for CFS. Have you got some use case in mind which
>> overestimates the WCET ?
>> If we can't rely on this parameters to evaluate the amount of capacity
>> used by deadline scheduler on a core, this will imply that we can't
>> also use it for requesting capacity to cpufreq and we should fallback
>> on a monitoring mechanism which reacts to a change instead of
>> anticipating it.
> I think a more "theoretically sound" approach would be to track the
> _active_ utilisation (informally speaking, the sum of the utilisations
> of the tasks that are actually active on a core - the exact definition
> of "active" is the trick here).

The point is that we probably need 2 definitions of "active" tasks.
The 1st one would be used to scale the frequency. From a power saving
point of view, it have to reflect the minimum frequency needed at the
current time to handle all works without missing deadline. This one
should be updated quite often with the wake up and the sleep of tasks
as well as the throttling.
The 2nd definition is used to compute the remaining capacity for the
CFS scheduler. This one doesn't need to be updated at each wake/sleep
of a deadline task but should reflect the capacity used by deadline in
a larger time scale. The latter will be used by the CFS scheduler  at
the periodic load balance pace

> As done, for example, here:
> https://github.com/lucabe72/linux-reclaiming/tree/track-utilisation-v2
> (in particular, see
> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b
> )
> I understand this approach might look too complex... But I think it is
> much less pessimistic while still being "safe".
> If there is something that I can do to make that code more acceptable,
> let me know.
>
>
>                         Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 8:50 a.m. UTC | #16
On 12/15/2015 05:59 AM, Vincent Guittot wrote:
[...]
>>>> So I don't think this is right. AFAICT this projects the WCET as the
>>>> amount of time actually used by DL. This will, under many
>>>> circumstances, vastly overestimate the amount of time actually
>>>> spend on it. Therefore unduly pessimisme the fair capacity of this
>>>> CPU.
>>>
>>> I agree that if the WCET is far from reality, we will underestimate
>>> available capacity for CFS. Have you got some use case in mind which
>>> overestimates the WCET ?
>>> If we can't rely on this parameters to evaluate the amount of capacity
>>> used by deadline scheduler on a core, this will imply that we can't
>>> also use it for requesting capacity to cpufreq and we should fallback
>>> on a monitoring mechanism which reacts to a change instead of
>>> anticipating it.
>> I think a more "theoretically sound" approach would be to track the
>> _active_ utilisation (informally speaking, the sum of the utilisations
>> of the tasks that are actually active on a core - the exact definition
>> of "active" is the trick here).
>
> The point is that we probably need 2 definitions of "active" tasks.
Ok; thanks for clarifying. I do not know much about the remaining capacity
used by CFS; however, from what you write I guess CFS really need an "average"
utilisation (while frequency scaling needs the active utilisation).
So, I suspect you really need to track 2 different things.
 From a quick look at the code that is currently in mainline, it seems to
me that it does a reasonable thing for tracking the remaining capacity
used by CFS...

> The 1st one would be used to scale the frequency. From a power saving
> point of view, it have to reflect the minimum frequency needed at the
> current time to handle all works without missing deadline.
Right. And it can be computed as shown in the GRUB-PA paper I mentioned
in a previous mail (that is, by tracking the active utilisation, as done
by my patches).

> This one
> should be updated quite often with the wake up and the sleep of tasks
> as well as the throttling.
Strictly speaking, the active utilisation must be updated when a task
wakes up and when a task sleeps/terminates (but when a task sleeps/terminates
you cannot decrease the active utilisation immediately: you have to wait
some time because the task might already have used part of its "future
utilisation").
The active utilisation must not be updated when a task is throttled: a
task is throttled when its current runtime is 0, so it already used all
of its utilisation for the current period (think about two tasks with
runtime=50ms and period 100ms: they consume 100% of the time on a CPU,
and when the first task consumed all of its runtime, you cannot decrease
the active utilisation).

> The 2nd definition is used to compute the remaining capacity for the
> CFS scheduler. This one doesn't need to be updated at each wake/sleep
> of a deadline task but should reflect the capacity used by deadline in
> a larger time scale. The latter will be used by the CFS scheduler  at
> the periodic load balance pace
Ok, so as I wrote above this really looks like an average utilisation.
My impression (but I do not know the CFS code too much) is that the mainline
kernel is currently doing the right thing to compute it, so maybe there is no
need to change the current code in this regard.
If the current code is not acceptable for some reason, an alternative would
be to measure the active utilisation for frequency scaling, and then apply a
low-pass filter to it for CFS.


				Luca

>
>> As done, for example, here:
>> https://github.com/lucabe72/linux-reclaiming/tree/track-utilisation-v2
>> (in particular, see
>> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b
>> )
>> I understand this approach might look too complex... But I think it is
>> much less pessimistic while still being "safe".
>> If there is something that I can do to make that code more acceptable,
>> let me know.
>>
>>
>>                          Luca

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 15, 2015, 12:20 p.m. UTC | #17
On Tue, Dec 15, 2015 at 09:50:14AM +0100, Luca Abeni wrote:
> On 12/15/2015 05:59 AM, Vincent Guittot wrote:

> >The 2nd definition is used to compute the remaining capacity for the
> >CFS scheduler. This one doesn't need to be updated at each wake/sleep
> >of a deadline task but should reflect the capacity used by deadline in
> >a larger time scale. The latter will be used by the CFS scheduler  at
> >the periodic load balance pace

> Ok, so as I wrote above this really looks like an average utilisation.
> My impression (but I do not know the CFS code too much) is that the mainline
> kernel is currently doing the right thing to compute it, so maybe there is no
> need to change the current code in this regard.
> If the current code is not acceptable for some reason, an alternative would
> be to measure the active utilisation for frequency scaling, and then apply a
> low-pass filter to it for CFS.

So CFS really only needs a 'vague' average idea on how much time it will
not get. Its best effort etc., so being a little wrong isn't a problem.

The current code suffices, but I think the reason its been changed in
this series is that they want/need separate tracking for fifo/rr and
deadline in the next patch, and taking out deadline like proposed was
the easiest way of achieving that.


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 15, 2015, 12:23 p.m. UTC | #18
On Tue, Dec 15, 2015 at 09:50:14AM +0100, Luca Abeni wrote:
> Strictly speaking, the active utilisation must be updated when a task
> wakes up and when a task sleeps/terminates (but when a task sleeps/terminates
> you cannot decrease the active utilisation immediately: you have to wait
> some time because the task might already have used part of its "future
> utilisation").
> The active utilisation must not be updated when a task is throttled: a
> task is throttled when its current runtime is 0, so it already used all
> of its utilisation for the current period (think about two tasks with
> runtime=50ms and period 100ms: they consume 100% of the time on a CPU,
> and when the first task consumed all of its runtime, you cannot decrease
> the active utilisation).

Hehe, this reminds me of the lag tracking in EEVDF/WF2Q/BFQ etc., that
had similar issues.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 15, 2015, 12:38 p.m. UTC | #19
On Mon, Dec 14, 2015 at 10:31:13PM +0100, Luca Abeni wrote:

> > There 'might' be smart pants ways around this, where you run part of
> > the execution at lower speed and switch to a higher speed to 'catch'
> > up if you exceed some boundary, such that, on average, you run at the
> > same speed the WCET mandates, but I'm not sure that's worth it.
> > Juri/Luca might know.

> Some previous works (see for example
> https://www.researchgate.net/profile/Giuseppe_Lipari/publication/220800940_Using_resource_reservation_techniques_for_power-aware_scheduling/links/09e41513639b2703fc000000.pdf
> ) investigated the usage of the "active utilisation" for switching the
> CPU frequency. This "active utilisation tracking" mechanism is the same
> I mentioned in the previous email, and implemented here:
> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b .

I have stuck the various PDFs and commits you've linked into my todo
list ;-) Thanks!

> I suspect the "inactive timer" I used to decrease the utilisation at
> the so called 0-lag time might be problematic, but I did not find any
> way to implement (or approximate) the active utilisation tracking
> without this timer... Anyway, if there is interest I am willing to
> adapt/rework/modify my patches as needed.

So I remember something else from the BFQ code, which also had to track
entries for the 0-lag stuff, and I just had a quick peek at that code
again. And what they appear to do is keep inactive entries with a lag
deficit in a separate tree (the idle tree).

And every time they update the vtime, they also push fwd the idle tree
and expire entries on that.

Or that is what I can make of it in a quick few minutes staring at that
code -- look for bfq_forget_idle().
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 15, 2015, 12:41 p.m. UTC | #20
On Tue, Dec 15, 2015 at 05:43:44AM +0100, Vincent Guittot wrote:
> On 14 December 2015 at 17:51, Peter Zijlstra <peterz@infradead.org> wrote:

> > No, since the WCET can and _will_ happen, its the best you can do with
> > cpufreq. If you were to set it lower you could not be able to execute
> > correctly in your 'never' tail cases.
> 
> In the context of frequency scaling, This mean that we will never
> reach low frequency

Only if you've stuffed your machine full of deadline tasks, if you take
Luca's example of the I/B frame decoder thingy, then even the WCET for
the I frames should not be very much (albeit significantly more than B
frames).

So while the WCET is pessimistic compared to the avg case, most CPUs can
do video decoding without much effort at all, so even the WCET for the
I-frames might allow us to drop to the lowest cpufreq.

Now, if you were to decode 10 streams at the same time, different story
of course ;-)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 12:43 p.m. UTC | #21
On 15 December 2015 at 09:50, Luca Abeni <luca.abeni@unitn.it> wrote:
> On 12/15/2015 05:59 AM, Vincent Guittot wrote:
> [...]
>>>>>
>>>>> So I don't think this is right. AFAICT this projects the WCET as the
>>>>> amount of time actually used by DL. This will, under many
>>>>> circumstances, vastly overestimate the amount of time actually
>>>>> spend on it. Therefore unduly pessimisme the fair capacity of this
>>>>> CPU.
>>>>
>>>>
>>>> I agree that if the WCET is far from reality, we will underestimate
>>>> available capacity for CFS. Have you got some use case in mind which
>>>> overestimates the WCET ?
>>>> If we can't rely on this parameters to evaluate the amount of capacity
>>>> used by deadline scheduler on a core, this will imply that we can't
>>>> also use it for requesting capacity to cpufreq and we should fallback
>>>> on a monitoring mechanism which reacts to a change instead of
>>>> anticipating it.
>>>
>>> I think a more "theoretically sound" approach would be to track the
>>> _active_ utilisation (informally speaking, the sum of the utilisations
>>> of the tasks that are actually active on a core - the exact definition
>>> of "active" is the trick here).
>>
>>
>> The point is that we probably need 2 definitions of "active" tasks.
>
> Ok; thanks for clarifying. I do not know much about the remaining capacity
> used by CFS; however, from what you write I guess CFS really need an
> "average"
> utilisation (while frequency scaling needs the active utilisation).

yes. this patch is only about the "average" utilization

> So, I suspect you really need to track 2 different things.
> From a quick look at the code that is currently in mainline, it seems to
> me that it does a reasonable thing for tracking the remaining capacity
> used by CFS...
>
>> The 1st one would be used to scale the frequency. From a power saving
>> point of view, it have to reflect the minimum frequency needed at the
>> current time to handle all works without missing deadline.
>
> Right. And it can be computed as shown in the GRUB-PA paper I mentioned
> in a previous mail (that is, by tracking the active utilisation, as done
> by my patches).

I fully trust you on that part.
>
>> This one
>> should be updated quite often with the wake up and the sleep of tasks
>> as well as the throttling.
>
> Strictly speaking, the active utilisation must be updated when a task
> wakes up and when a task sleeps/terminates (but when a task
> sleeps/terminates
> you cannot decrease the active utilisation immediately: you have to wait
> some time because the task might already have used part of its "future
> utilisation").
> The active utilisation must not be updated when a task is throttled: a
> task is throttled when its current runtime is 0, so it already used all
> of its utilisation for the current period (think about two tasks with
> runtime=50ms and period 100ms: they consume 100% of the time on a CPU,
> and when the first task consumed all of its runtime, you cannot decrease
> the active utilisation).

 I haven't read the paper you pointed in the previous email but it's
on my todo list. Does the GRUB-PA take into account the frequency
transition when selecting the best frequency ?

>
>> The 2nd definition is used to compute the remaining capacity for the
>> CFS scheduler. This one doesn't need to be updated at each wake/sleep
>> of a deadline task but should reflect the capacity used by deadline in
>> a larger time scale. The latter will be used by the CFS scheduler  at
>> the periodic load balance pace
>
> Ok, so as I wrote above this really looks like an average utilisation.
> My impression (but I do not know the CFS code too much) is that the mainline
> kernel is currently doing the right thing to compute it, so maybe there is
> no
> need to change the current code in this regard.
> If the current code is not acceptable for some reason, an alternative would
> be to measure the active utilisation for frequency scaling, and then apply a
> low-pass filter to it for CFS.
>
>
>                                 Luca
>
>
>>
>>> As done, for example, here:
>>> https://github.com/lucabe72/linux-reclaiming/tree/track-utilisation-v2
>>> (in particular, see
>>>
>>> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b
>>> )
>>> I understand this approach might look too complex... But I think it is
>>> much less pessimistic while still being "safe".
>>> If there is something that I can do to make that code more acceptable,
>>> let me know.
>>>
>>>
>>>                          Luca
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 12:46 p.m. UTC | #22
On 15 December 2015 at 13:20, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Dec 15, 2015 at 09:50:14AM +0100, Luca Abeni wrote:
>> On 12/15/2015 05:59 AM, Vincent Guittot wrote:
>
>> >The 2nd definition is used to compute the remaining capacity for the
>> >CFS scheduler. This one doesn't need to be updated at each wake/sleep
>> >of a deadline task but should reflect the capacity used by deadline in
>> >a larger time scale. The latter will be used by the CFS scheduler  at
>> >the periodic load balance pace
>
>> Ok, so as I wrote above this really looks like an average utilisation.
>> My impression (but I do not know the CFS code too much) is that the mainline
>> kernel is currently doing the right thing to compute it, so maybe there is no
>> need to change the current code in this regard.
>> If the current code is not acceptable for some reason, an alternative would
>> be to measure the active utilisation for frequency scaling, and then apply a
>> low-pass filter to it for CFS.
>
> So CFS really only needs a 'vague' average idea on how much time it will
> not get. Its best effort etc., so being a little wrong isn't a problem.
>
> The current code suffices, but I think the reason its been changed in
> this series is that they want/need separate tracking for fifo/rr and
> deadline in the next patch, and taking out deadline like proposed was
> the easiest way of achieving that.

yes. you're right. The goal was to minimize the overhead for tracking
separately  fifo/rr and deadline.


>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 12:56 p.m. UTC | #23
On 15 December 2015 at 13:41, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Dec 15, 2015 at 05:43:44AM +0100, Vincent Guittot wrote:
>> On 14 December 2015 at 17:51, Peter Zijlstra <peterz@infradead.org> wrote:
>
>> > No, since the WCET can and _will_ happen, its the best you can do with
>> > cpufreq. If you were to set it lower you could not be able to execute
>> > correctly in your 'never' tail cases.
>>
>> In the context of frequency scaling, This mean that we will never
>> reach low frequency
>
> Only if you've stuffed your machine full of deadline tasks, if you take
> Luca's example of the I/B frame decoder thingy, then even the WCET for
> the I frames should not be very much (albeit significantly more than B
> frames).

But in this case, the impact of deadline scheduler on the remaining
capacity for CFS should not be that much as well. This will not
prevent a CFS task but only will only make it a bit smaller than the
other

>
> So while the WCET is pessimistic compared to the avg case, most CPUs can
> do video decoding without much effort at all, so even the WCET for the
> I-frames might allow us to drop to the lowest cpufreq.
>
> Now, if you were to decode 10 streams at the same time, different story
> of course ;-)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Dec. 15, 2015, 12:58 p.m. UTC | #24
On 15 December 2015 at 09:50, Luca Abeni <luca.abeni@unitn.it> wrote:
> On 12/15/2015 05:59 AM, Vincent Guittot wrote:
> [...]
>>>>>
>>>>> So I don't think this is right. AFAICT this projects the WCET as the
>>>>> amount of time actually used by DL. This will, under many
>>>>> circumstances, vastly overestimate the amount of time actually
>>>>> spend on it. Therefore unduly pessimisme the fair capacity of this
>>>>> CPU.
>>>>
>>>>

[snip]

>> The 2nd definition is used to compute the remaining capacity for the
>> CFS scheduler. This one doesn't need to be updated at each wake/sleep
>> of a deadline task but should reflect the capacity used by deadline in
>> a larger time scale. The latter will be used by the CFS scheduler  at
>> the periodic load balance pace
>
> Ok, so as I wrote above this really looks like an average utilisation.
> My impression (but I do not know the CFS code too much) is that the mainline
> kernel is currently doing the right thing to compute it, so maybe there is
> no
> need to change the current code in this regard.
> If the current code is not acceptable for some reason, an alternative would
> be to measure the active utilisation for frequency scaling, and then apply a
> low-pass filter to it for CFS.

In this case, it's probably easier to split what is already done into
a rt_avg metric  and a dl_avg metric

Vincent
>
>
>                                 Luca
>
>
>>
>>> As done, for example, here:
>>> https://github.com/lucabe72/linux-reclaiming/tree/track-utilisation-v2
>>> (in particular, see
>>>
>>> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b
>>> )
>>> I understand this approach might look too complex... But I think it is
>>> much less pessimistic while still being "safe".
>>> If there is something that I can do to make that code more acceptable,
>>> let me know.
>>>
>>>
>>>                          Luca
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 1:18 p.m. UTC | #25
On 12/15/2015 01:20 PM, Peter Zijlstra wrote:
> On Tue, Dec 15, 2015 at 09:50:14AM +0100, Luca Abeni wrote:
>> On 12/15/2015 05:59 AM, Vincent Guittot wrote:
>
>>> The 2nd definition is used to compute the remaining capacity for the
>>> CFS scheduler. This one doesn't need to be updated at each wake/sleep
>>> of a deadline task but should reflect the capacity used by deadline in
>>> a larger time scale. The latter will be used by the CFS scheduler  at
>>> the periodic load balance pace
>
>> Ok, so as I wrote above this really looks like an average utilisation.
>> My impression (but I do not know the CFS code too much) is that the mainline
>> kernel is currently doing the right thing to compute it, so maybe there is no
>> need to change the current code in this regard.
>> If the current code is not acceptable for some reason, an alternative would
>> be to measure the active utilisation for frequency scaling, and then apply a
>> low-pass filter to it for CFS.
>
> So CFS really only needs a 'vague' average idea on how much time it will
> not get. Its best effort etc., so being a little wrong isn't a problem.
>
> The current code suffices, but I think the reason its been changed in
> this series is that they want/need separate tracking for fifo/rr and
> deadline in the next patch, and taking out deadline like proposed was
> the easiest way of achieving that.
Ah, ok. Thanks for explaining.
So, I agree that this patch is not a good idea for estimating the average
utilisation needed by CFS.



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 1:21 p.m. UTC | #26
On 12/15/2015 01:23 PM, Peter Zijlstra wrote:
> On Tue, Dec 15, 2015 at 09:50:14AM +0100, Luca Abeni wrote:
>> Strictly speaking, the active utilisation must be updated when a task
>> wakes up and when a task sleeps/terminates (but when a task sleeps/terminates
>> you cannot decrease the active utilisation immediately: you have to wait
>> some time because the task might already have used part of its "future
>> utilisation").
>> The active utilisation must not be updated when a task is throttled: a
>> task is throttled when its current runtime is 0, so it already used all
>> of its utilisation for the current period (think about two tasks with
>> runtime=50ms and period 100ms: they consume 100% of the time on a CPU,
>> and when the first task consumed all of its runtime, you cannot decrease
>> the active utilisation).
>
> Hehe, this reminds me of the lag tracking in EEVDF/WF2Q/BFQ etc., that
> had similar issues.
Yes, I remember EEVDF and similar algorithms also needed to keep track of
the current lag (and to reset it at a later time respect to the "task is
blocking" event). I do not remember the exact details, but I "borrowed" the
"0-lag time" name from one of those papers :)


				Luca

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 1:30 p.m. UTC | #27
On 12/15/2015 01:38 PM, Peter Zijlstra wrote:
> On Mon, Dec 14, 2015 at 10:31:13PM +0100, Luca Abeni wrote:
>
>>> There 'might' be smart pants ways around this, where you run part of
>>> the execution at lower speed and switch to a higher speed to 'catch'
>>> up if you exceed some boundary, such that, on average, you run at the
>>> same speed the WCET mandates, but I'm not sure that's worth it.
>>> Juri/Luca might know.
>
>> Some previous works (see for example
>> https://www.researchgate.net/profile/Giuseppe_Lipari/publication/220800940_Using_resource_reservation_techniques_for_power-aware_scheduling/links/09e41513639b2703fc000000.pdf
>> ) investigated the usage of the "active utilisation" for switching the
>> CPU frequency. This "active utilisation tracking" mechanism is the same
>> I mentioned in the previous email, and implemented here:
>> https://github.com/lucabe72/linux-reclaiming/commit/49fc786a1c453148625f064fa38ea538470df55b .
>
> I have stuck the various PDFs and commits you've linked into my todo
> list ;-) Thanks!
You are welcome :)


>> I suspect the "inactive timer" I used to decrease the utilisation at
>> the so called 0-lag time might be problematic, but I did not find any
>> way to implement (or approximate) the active utilisation tracking
>> without this timer... Anyway, if there is interest I am willing to
>> adapt/rework/modify my patches as needed.
>
> So I remember something else from the BFQ code, which also had to track
> entries for the 0-lag stuff, and I just had a quick peek at that code
> again. And what they appear to do is keep inactive entries with a lag
> deficit in a separate tree (the idle tree).
>
> And every time they update the vtime, they also push fwd the idle tree
> and expire entries on that.
I am not sure if I understand correctly the idea (I do not know the BFQ
code; I'll have a look), but I think I tried something similar:
- When a task blocks, instead of arming the inactive timer I can insert
   the task in an "active non contending" tree (to use GRUB terminology)
- So, when some sched deadline function is invoked, I check the "0-lag
   time" of the first task in the "active non contending" tree, and if
   that time is passed I remove the task from the tree and adjust the
   active utilisation

The resulting code ended up being more complex (basically, I needed to
handle the "active non contending" tree and to check it in task_tick_dl()
and update_curr_dl()). But maybe I did it wrong... I'll try this approach
again, after looking ad the BFQ code.


			Thanks,
				Luca
>
> Or that is what I can make of it in a quick few minutes staring at that
> code -- look for bfq_forget_idle().
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 1:39 p.m. UTC | #28
On 12/15/2015 01:43 PM, Vincent Guittot wrote:
[...]
>>>>> I agree that if the WCET is far from reality, we will underestimate
>>>>> available capacity for CFS. Have you got some use case in mind which
>>>>> overestimates the WCET ?
>>>>> If we can't rely on this parameters to evaluate the amount of capacity
>>>>> used by deadline scheduler on a core, this will imply that we can't
>>>>> also use it for requesting capacity to cpufreq and we should fallback
>>>>> on a monitoring mechanism which reacts to a change instead of
>>>>> anticipating it.
>>>>
>>>> I think a more "theoretically sound" approach would be to track the
>>>> _active_ utilisation (informally speaking, the sum of the utilisations
>>>> of the tasks that are actually active on a core - the exact definition
>>>> of "active" is the trick here).
>>>
>>>
>>> The point is that we probably need 2 definitions of "active" tasks.
>>
>> Ok; thanks for clarifying. I do not know much about the remaining capacity
>> used by CFS; however, from what you write I guess CFS really need an
>> "average"
>> utilisation (while frequency scaling needs the active utilisation).
>
> yes. this patch is only about the "average" utilization
Ok; so, I think that the approach of this patch is too pessimistic (it uses
the "worst-case" utilisation as an estimation of the average one).

>>> This one
>>> should be updated quite often with the wake up and the sleep of tasks
>>> as well as the throttling.
>>
>> Strictly speaking, the active utilisation must be updated when a task
>> wakes up and when a task sleeps/terminates (but when a task
>> sleeps/terminates
>> you cannot decrease the active utilisation immediately: you have to wait
>> some time because the task might already have used part of its "future
>> utilisation").
>> The active utilisation must not be updated when a task is throttled: a
>> task is throttled when its current runtime is 0, so it already used all
>> of its utilisation for the current period (think about two tasks with
>> runtime=50ms and period 100ms: they consume 100% of the time on a CPU,
>> and when the first task consumed all of its runtime, you cannot decrease
>> the active utilisation).
>
>   I haven't read the paper you pointed in the previous email but it's
> on my todo list. Does the GRUB-PA take into account the frequency
> transition when selecting the best frequency ?
I do not know...
As far as I understand, the GRUB-PA approach is simple: if the active
utilisation of SCHED_DEADLINE tasks is Ua, then the CPU frequency can be
reduced to the maximum possible frequency multiplied by Ua (of course,
this must be adjusted a little bit, because the original GRUB-PA paper
only considered real-time/SCHED_DEADLINE tasks... To leave some CPU time
for other tasks, you have to increase Ua a little bit).

Some time ago one of the authors of the GRUB-PA paper told me that they
evaluated the performance of the algorithm by simulating the behaviour of
a real CPU, but I do not know the details, and I do not know if they took
the frequency transition into account.



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 1:41 p.m. UTC | #29
On 12/15/2015 01:58 PM, Vincent Guittot wrote:
> On 15 December 2015 at 09:50, Luca Abeni <luca.abeni@unitn.it> wrote:
>> On 12/15/2015 05:59 AM, Vincent Guittot wrote:
>> [...]
>>>>>>
>>>>>> So I don't think this is right. AFAICT this projects the WCET as the
>>>>>> amount of time actually used by DL. This will, under many
>>>>>> circumstances, vastly overestimate the amount of time actually
>>>>>> spend on it. Therefore unduly pessimisme the fair capacity of this
>>>>>> CPU.
>>>>>
>>>>>
>
> [snip]
>
>>> The 2nd definition is used to compute the remaining capacity for the
>>> CFS scheduler. This one doesn't need to be updated at each wake/sleep
>>> of a deadline task but should reflect the capacity used by deadline in
>>> a larger time scale. The latter will be used by the CFS scheduler  at
>>> the periodic load balance pace
>>
>> Ok, so as I wrote above this really looks like an average utilisation.
>> My impression (but I do not know the CFS code too much) is that the mainline
>> kernel is currently doing the right thing to compute it, so maybe there is
>> no
>> need to change the current code in this regard.
>> If the current code is not acceptable for some reason, an alternative would
>> be to measure the active utilisation for frequency scaling, and then apply a
>> low-pass filter to it for CFS.
>
> In this case, it's probably easier to split what is already done into
> a rt_avg metric  and a dl_avg metric
Yes, I think this could be the best approach for what concerns the average
utilisation used by CFS.



				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Dec. 15, 2015, 1:42 p.m. UTC | #30
On Tue, Dec 15, 2015 at 02:30:07PM +0100, Luca Abeni wrote:

> >So I remember something else from the BFQ code, which also had to track
> >entries for the 0-lag stuff, and I just had a quick peek at that code
> >again. And what they appear to do is keep inactive entries with a lag
> >deficit in a separate tree (the idle tree).
> >
> >And every time they update the vtime, they also push fwd the idle tree
> >and expire entries on that.
> I am not sure if I understand correctly the idea (I do not know the BFQ
> code; I'll have a look), but I think I tried something similar:
> - When a task blocks, instead of arming the inactive timer I can insert
>   the task in an "active non contending" tree (to use GRUB terminology)
> - So, when some sched deadline function is invoked, I check the "0-lag
>   time" of the first task in the "active non contending" tree, and if
>   that time is passed I remove the task from the tree and adjust the
>   active utilisation
> 
> The resulting code ended up being more complex (basically, I needed to
> handle the "active non contending" tree and to check it in task_tick_dl()
> and update_curr_dl()). But maybe I did it wrong... I'll try this approach
> again, after looking ad the BFQ code.

That sounds about right.

I've no idea if its more or less work. I just had vague memories on an
alternative approach to the timer.

Feel free to stick with the timer if that works better, just wanted to
mention there are indeed alternative solutions.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luca Abeni Dec. 15, 2015, 9:24 p.m. UTC | #31
On Tue, 15 Dec 2015 14:42:29 +0100
Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Dec 15, 2015 at 02:30:07PM +0100, Luca Abeni wrote:
> 
> > >So I remember something else from the BFQ code, which also had to
> > >track entries for the 0-lag stuff, and I just had a quick peek at
> > >that code again. And what they appear to do is keep inactive
> > >entries with a lag deficit in a separate tree (the idle tree).
> > >
> > >And every time they update the vtime, they also push fwd the idle
> > >tree and expire entries on that.
> > I am not sure if I understand correctly the idea (I do not know the
> > BFQ code; I'll have a look), but I think I tried something similar:
> > - When a task blocks, instead of arming the inactive timer I can
> > insert the task in an "active non contending" tree (to use GRUB
> > terminology)
> > - So, when some sched deadline function is invoked, I check the
> > "0-lag time" of the first task in the "active non contending" tree,
> > and if that time is passed I remove the task from the tree and
> > adjust the active utilisation
> > 
> > The resulting code ended up being more complex (basically, I needed
> > to handle the "active non contending" tree and to check it in
> > task_tick_dl() and update_curr_dl()). But maybe I did it wrong...
> > I'll try this approach again, after looking ad the BFQ code.
> 
> That sounds about right.
> 
> I've no idea if its more or less work. I just had vague memories on an
> alternative approach to the timer.
> 
> Feel free to stick with the timer if that works better, just wanted to
> mention there are indeed alternative solutions.
Ok; I'll try to implement this alternative approach again, after
looking at BFQ, to see if it turns out to be simpler or more complex
than the timer-based approach.

If there is interest, I'll send an RFC with these patches after some
testing.



			Thanks,
				Luca
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Dec. 16, 2015, 9:28 a.m. UTC | #32
Hi Luca,

On 15/12/15 22:24, Luca Abeni wrote:
> On Tue, 15 Dec 2015 14:42:29 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> > On Tue, Dec 15, 2015 at 02:30:07PM +0100, Luca Abeni wrote:
> > 
> > > >So I remember something else from the BFQ code, which also had to
> > > >track entries for the 0-lag stuff, and I just had a quick peek at
> > > >that code again. And what they appear to do is keep inactive
> > > >entries with a lag deficit in a separate tree (the idle tree).
> > > >
> > > >And every time they update the vtime, they also push fwd the idle
> > > >tree and expire entries on that.
> > > I am not sure if I understand correctly the idea (I do not know the
> > > BFQ code; I'll have a look), but I think I tried something similar:
> > > - When a task blocks, instead of arming the inactive timer I can
> > > insert the task in an "active non contending" tree (to use GRUB
> > > terminology)
> > > - So, when some sched deadline function is invoked, I check the
> > > "0-lag time" of the first task in the "active non contending" tree,
> > > and if that time is passed I remove the task from the tree and
> > > adjust the active utilisation
> > > 
> > > The resulting code ended up being more complex (basically, I needed
> > > to handle the "active non contending" tree and to check it in
> > > task_tick_dl() and update_curr_dl()). But maybe I did it wrong...
> > > I'll try this approach again, after looking ad the BFQ code.
> > 
> > That sounds about right.
> > 
> > I've no idea if its more or less work. I just had vague memories on an
> > alternative approach to the timer.
> > 
> > Feel free to stick with the timer if that works better, just wanted to
> > mention there are indeed alternative solutions.
> Ok; I'll try to implement this alternative approach again, after
> looking at BFQ, to see if it turns out to be simpler or more complex
> than the timer-based approach.
> 
> If there is interest, I'll send an RFC with these patches after some
> testing.
> 

I think there's definitely interest, as next step will be to start using
the new API for freq selection from DL as well.

Thanks a lot for your time and efforts!

Best,

- Juri
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e..9d9eb50 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,24 @@  static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
 }
 
+static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	u64 se_bw = dl_se->dl_bw;
+
+	dl_rq->avg_bw += se_bw;
+}
+
+static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	u64 se_bw = dl_se->dl_bw;
+
+	dl_rq->avg_bw -= se_bw;
+	if (dl_rq->avg_bw < 0) {
+		WARN_ON(1);
+		dl_rq->avg_bw = 0;
+	}
+}
+
 static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
@@ -494,6 +512,9 @@  static void update_dl_entity(struct sched_dl_entity *dl_se,
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
+	if (dl_se->dl_new)
+		add_average_bw(dl_se, dl_rq);
+
 	/*
 	 * The arrival of a new instance needs special treatment, i.e.,
 	 * the actual scheduling parameters have to be "renewed".
@@ -741,8 +762,6 @@  static void update_curr_dl(struct rq *rq)
 	curr->se.exec_start = rq_clock_task(rq);
 	cpuacct_charge(curr, delta_exec);
 
-	sched_rt_avg_update(rq, delta_exec);
-
 	dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
 	if (dl_runtime_exceeded(dl_se)) {
 		dl_se->dl_throttled = 1;
@@ -1241,6 +1260,8 @@  static void task_fork_dl(struct task_struct *p)
 static void task_dead_dl(struct task_struct *p)
 {
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 
 	/*
 	 * Since we are TASK_DEAD we won't slip out of the domain!
@@ -1249,6 +1270,8 @@  static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
+
+	clear_average_bw(&p->dl, &rq->dl);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1556,7 +1579,9 @@  retry:
 	}
 
 	deactivate_task(rq, next_task, 0);
+	clear_average_bw(&next_task->dl, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
+	add_average_bw(&next_task->dl, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
@@ -1644,7 +1669,9 @@  static void pull_dl_task(struct rq *this_rq)
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
+			clear_average_bw(&p->dl, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
+			add_average_bw(&p->dl, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
@@ -1750,6 +1777,8 @@  static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!start_dl_timer(p))
 		__dl_clear_params(p);
 
+	clear_average_bw(&p->dl, &rq->dl);
+
 	/*
 	 * Since this might be the only -deadline task on the rq,
 	 * this is the right place to try to pull some other one
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c49f76..ce05f61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6203,6 +6203,14 @@  static unsigned long scale_rt_capacity(int cpu)
 
 	used = div_u64(avg, total);
 
+	/*
+	 * deadline bandwidth is defined at system level so we must
+	 * weight this bandwidth with the max capacity of the system.
+	 * As a reminder, avg_bw is 20bits width and
+	 * scale_cpu_capacity is 10 bits width
+	 */
+	used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
 	if (likely(used < SCHED_CAPACITY_SCALE))
 		return SCHED_CAPACITY_SCALE - used;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08858d1..e44c6be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -519,6 +519,8 @@  struct dl_rq {
 #else
 	struct dl_bw dl_bw;
 #endif
+	/* This is the "average utilization" for this runqueue */
+	s64 avg_bw;
 };
 
 #ifdef CONFIG_SMP