
[v2,10/11] sched: move cfs task on a CPU with higher capacity

Message ID 20140529095024.GF11074@laptop.programming.kicks-ass.net (mailing list archive)
State New, archived

Commit Message

Peter Zijlstra May 29, 2014, 9:50 a.m. UTC
On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
> If the CPU is used for handling a lot of IRQs, trigger a load balance to
> check whether it's worth moving its tasks to another CPU that has more
> capacity
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 13 +++++++++++++
>  1 file changed, 13 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e8a30f9..2501e49 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  	if (sgs->sum_nr_running > sgs->group_capacity)
>  		return true;
>  
> +	/*
> +	 * The group capacity is reduced probably because of activity from other
> +	 * sched class or interrupts which use part of the available capacity
> +	 */
> +	if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
> +		return true;
> +
>  	if (sgs->group_imb)
>  		return true;
>  

But we should already do this because the load numbers are scaled with
the power/capacity figures. If one CPU gets significantly less time to run
fair tasks, its effective load would spike and it'd get to be selected
here anyway.

Or am I missing something?
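
For reference, the scaling being referred to here is (roughly) the avg_load
computation in update_sg_lb_stats(): avg_load = group_load *
SCHED_POWER_SCALE / group_power. A stand-alone sketch with invented numbers
shows why a group whose power is eaten by IRQ/rt time already reports an
inflated load:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL

int main(void)
{
	unsigned long group_load = 800;		/* same fair load in both cases */
	unsigned long full_power = 1024;	/* no rt/IRQ pressure */
	unsigned long reduced_power = 512;	/* half the time stolen by IRQs */

	/* avg_load = group_load * SCHED_POWER_SCALE / group_power */
	printf("avg_load, full power:    %lu\n",
	       group_load * SCHED_POWER_SCALE / full_power);	/* 800 */
	printf("avg_load, reduced power: %lu\n",
	       group_load * SCHED_POWER_SCALE / reduced_power);	/* 1600 */
	return 0;
}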

> @@ -7282,6 +7289,12 @@ static inline int nohz_kick_needed(struct rq *rq)
>  
>  		if (nr_busy > 1)
>  			goto need_kick_unlock;
> +
> +		if ((rq->cfs.h_nr_running >= 1)
> +		 && ((rq->cpu_power * sd->imbalance_pct) <
> +					(rq->cpu_power_orig * 100)))
> +			goto need_kick_unlock;
> +
>  	}
>  
>  	sd = rcu_dereference(per_cpu(sd_asym, cpu));

OK, so there you're kicking the idle balancer to try and get another CPU
to pull some load? That makes sense I suppose.

That function is pretty horrible though; how about something like this
first?

---
 kernel/sched/fair.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

Comments

Vincent Guittot May 29, 2014, 7:37 p.m. UTC | #1
On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> If the CPU is used for handling a lot of IRQs, trigger a load balance to
>> check whether it's worth moving its tasks to another CPU that has more
>> capacity
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 13 +++++++++++++
>>  1 file changed, 13 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e8a30f9..2501e49 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>       if (sgs->sum_nr_running > sgs->group_capacity)
>>               return true;
>>
>> +     /*
>> +      * The group capacity is reduced probably because of activity from other
>> +      * sched class or interrupts which use part of the available capacity
>> +      */
>> +     if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> +             return true;
>> +
>>       if (sgs->group_imb)
>>               return true;
>>
>
> But we should already do this because the load numbers are scaled with
> the power/capacity figures. If one CPU gets significantly less time to run
> fair tasks, its effective load would spike and it'd get to be selected
> here anyway.
>
> Or am I missing something?

The CPU could have been picked when the capacity becomes null (which
occurs when the cpu_power goes below half the default
SCHED_POWER_SCALE). And even after that, there were some conditions in
find_busiest_group that were bypassing this busiest group.
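
For context, the group capacity being talked about here is (roughly, in the
fair.c of that era) the group power rounded to the nearest multiple of
SCHED_POWER_SCALE, which is why a single CPU ends up with a capacity of 0 as
soon as its cpu_power drops below SCHED_POWER_SCALE / 2 = 512. A minimal
stand-alone sketch of that rounding:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL
/* same rounding as the kernel's DIV_ROUND_CLOSEST() for unsigned values */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long powers[] = { 1024, 538, 513, 511, 330 };
	unsigned int i;

	for (i = 0; i < sizeof(powers) / sizeof(powers[0]); i++)
		printf("cpu_power %4lu -> capacity %lu\n", powers[i],
		       DIV_ROUND_CLOSEST(powers[i], SCHED_POWER_SCALE));
	/* prints 1, 1, 1, 0, 0: anything below 512 rounds down to zero */
	return 0;
}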

>
>> @@ -7282,6 +7289,12 @@ static inline int nohz_kick_needed(struct rq *rq)
>>
>>               if (nr_busy > 1)
>>                       goto need_kick_unlock;
>> +
>> +             if ((rq->cfs.h_nr_running >= 1)
>> +              && ((rq->cpu_power * sd->imbalance_pct) <
>> +                                     (rq->cpu_power_orig * 100)))
>> +                     goto need_kick_unlock;
>> +
>>       }
>>
>>       sd = rcu_dereference(per_cpu(sd_asym, cpu));
>
> OK, so there you're kicking the idle balancer to try and get another CPU
> to pull some load? That makes sense I suppose.

and especially if we have idle CPUs and one task on the CPU with
reduced capacity

>
> That function is pretty horrible though; how about something like this
> first?

OK, I will integrate this modification in the next version

>
> ---
>  kernel/sched/fair.c | 29 +++++++++++++----------------
>  1 file changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c9617b73bcc0..47fb96e6fa83 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7215,15 +7215,16 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>   *     domain span are idle.
>   */
> -static inline int nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq)
>  {
>         unsigned long now = jiffies;
>         struct sched_domain *sd;
>         struct sched_group_power *sgp;
>         int nr_busy, cpu = rq->cpu;
> +       bool kick = false;
>
>         if (unlikely(rq->idle_balance))
> -               return 0;
> +               return false;
>
>         /*
>         * We may be recently in ticked or tickless idle mode. At the first
> @@ -7237,38 +7238,34 @@ static inline int nohz_kick_needed(struct rq *rq)
>          * balancing.
>          */
>         if (likely(!atomic_read(&nohz.nr_cpus)))
> -               return 0;
> +               return false;
>
>         if (time_before(now, nohz.next_balance))
> -               return 0;
> +               return false;
>
>         if (rq->nr_running >= 2)
> -               goto need_kick;
> +               return true;
>
>         rcu_read_lock();
>         sd = rcu_dereference(per_cpu(sd_busy, cpu));
> -
>         if (sd) {
>                 sgp = sd->groups->sgp;
>                 nr_busy = atomic_read(&sgp->nr_busy_cpus);
>
> -               if (nr_busy > 1)
> -                       goto need_kick_unlock;
> +               if (nr_busy > 1) {
> +                       kick = true;
> +                       goto unlock;
> +               }
>         }
>
>         sd = rcu_dereference(per_cpu(sd_asym, cpu));
> -
>         if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>                                   sched_domain_span(sd)) < cpu))
> -               goto need_kick_unlock;
> -
> -       rcu_read_unlock();
> -       return 0;
> +               kick = true;
>
> -need_kick_unlock:
> +unlock:
>         rcu_read_unlock();
> -need_kick:
> -       return 1;
> +       return kick;
>  }
>  #else
>  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Peter Zijlstra May 30, 2014, 6:29 a.m. UTC | #2
On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
> >> If the CPU is used for handling a lot of IRQs, trigger a load balance to
> >> check whether it's worth moving its tasks to another CPU that has more
> >> capacity
> >>
> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> >> ---
> >>  kernel/sched/fair.c | 13 +++++++++++++
> >>  1 file changed, 13 insertions(+)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index e8a30f9..2501e49 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> >>       if (sgs->sum_nr_running > sgs->group_capacity)
> >>               return true;
> >>
> >> +     /*
> >> +      * The group capacity is reduced probably because of activity from other
> >> +      * sched class or interrupts which use part of the available capacity
> >> +      */
> >> +     if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
> >> +             return true;
> >> +
> >>       if (sgs->group_imb)
> >>               return true;
> >>
> >
> > But we should already do this because the load numbers are scaled with
> > the power/capacity figures. If one CPU gets significantly less time to run
> > fair tasks, its effective load would spike and it'd get to be selected
> > here anyway.
> >
> > Or am I missing something?
> 
> The CPU could have been picked when the capacity becomes null (which
> occurs when the cpu_power goes below half the default
> SCHED_POWER_SCALE). And even after that, there were some conditions in
> find_busiest_group that were bypassing this busiest group.

Could you detail those conditions? FWIW those make excellent Changelog
material.
Vincent Guittot May 30, 2014, 8:05 p.m. UTC | #3
On 30 May 2014 08:29, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
>> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> >> If the CPU is used for handling a lot of IRQs, trigger a load balance to
>> >> check whether it's worth moving its tasks to another CPU that has more
>> >> capacity
>> >>
>> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> >> ---
>> >>  kernel/sched/fair.c | 13 +++++++++++++
>> >>  1 file changed, 13 insertions(+)
>> >>
>> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> >> index e8a30f9..2501e49 100644
>> >> --- a/kernel/sched/fair.c
>> >> +++ b/kernel/sched/fair.c
>> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >>       if (sgs->sum_nr_running > sgs->group_capacity)
>> >>               return true;
>> >>
>> >> +     /*
>> >> +      * The group capacity is reduced probably because of activity from other
>> >> +      * sched class or interrupts which use part of the available capacity
>> >> +      */
>> >> +     if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> >> +             return true;
>> >> +
>> >>       if (sgs->group_imb)
>> >>               return true;
>> >>
>> >
>> > But we should already do this because the load numbers are scaled with
>> > the power/capacity figures. If one CPU gets significantly less time to run
>> > fair tasks, its effective load would spike and it'd get to be selected
>> > here anyway.
>> >
>> > Or am I missing something?
>>
>> The CPU could have been picked when the capacity becomes null (which
>> occurs when the cpu_power goes below half the default
>> SCHED_POWER_SCALE). And even after that, there were some conditions in
>> find_busiest_group that were bypassing this busiest group.
>
> Could you detail those conditions? FWIW those make excellent Changelog
> material.

I need to go back to my traces and use case to be sure that I point at
the right culprit, but IIRC the imbalance was null. I will come back
with more details once I'm back in front of the boards.
Vincent Guittot June 2, 2014, 5:06 p.m. UTC | #4
On 30 May 2014 08:29, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
>> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> >> If the CPU is used for handling a lot of IRQs, trigger a load balance to
>> >> check whether it's worth moving its tasks to another CPU that has more
>> >> capacity
>> >>
>> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> >> ---
>> >>  kernel/sched/fair.c | 13 +++++++++++++
>> >>  1 file changed, 13 insertions(+)
>> >>
>> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> >> index e8a30f9..2501e49 100644
>> >> --- a/kernel/sched/fair.c
>> >> +++ b/kernel/sched/fair.c
>> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >>       if (sgs->sum_nr_running > sgs->group_capacity)
>> >>               return true;
>> >>
>> >> +     /*
>> >> +      * The group capacity is reduced probably because of activity from other
>> >> +      * sched class or interrupts which use part of the available capacity
>> >> +      */
>> >> +     if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> >> +             return true;
>> >> +
>> >>       if (sgs->group_imb)
>> >>               return true;
>> >>
>> >
>> > But we should already do this because the load numbers are scaled with
>> > the power/capacity figures. If one CPU gets significantly less time to run
>> > fair tasks, its effective load would spike and it'd get to be selected
>> > here anyway.
>> >
>> > Or am I missing something?
>>
>> The CPU could have been picked when the capacity becomes null (which
>> occurs when the cpu_power goes below half the default
>> SCHED_POWER_SCALE). And even after that, there were some conditions in
>> find_busiest_group that were bypassing this busiest group.
>
> Could you detail those conditions? FWIW those make excellent Changelog
> material.

I have looked back into my tests and traces:

In a 1st test, the capacity of the CPU was still above half the default
value (power=538), unlike what I remembered. So it's somewhat "normal"
to keep the task on CPU0, which also handles the IRQs, because sg_capacity
still returns 1.

In a 2nd test, the main task runs (most of the time) on CPU0, whereas
the max power of the latter is only 623 and the cpu_power goes below
512 (power=330) during the use case. So the sg_capacity of CPU0 is
null, but the main task still stays on CPU0.
The use case (scp transfer) is made of a long-running task (ssh) and a
periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
CPU1. The newly idle load balance on CPU1 doesn't pull the long-running
task, although sg_capacity is null, because sd->nr_balance_failed is
never incremented and load_balance doesn't trigger an active
load_balance. When an idle balance occurs in the middle of the newly
idle balances, the long-running ssh task migrates to CPU1, but as soon
as it sleeps and wakes up it goes back to CPU0 because of wake affine,
which migrates it back to CPU0 (issue solved by patch 09).
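
To make the nr_balance_failed point concrete, here is a toy userspace model
of the decision structure described above (a paraphrase built on the
assumption that load_balance() only bumps the failure counter for
non-newly-idle balancing and that need_active_balance() gates on that
counter, as in the fair.c of that era; none of this is kernel code):

#include <stdbool.h>
#include <stdio.h>

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct toy_sd {
	unsigned int nr_balance_failed;
	unsigned int cache_nice_tries;
};

/* active migration is only allowed once enough balance failures pile up */
static bool need_active_balance(struct toy_sd *sd)
{
	return sd->nr_balance_failed > sd->cache_nice_tries + 2;
}

/* the failure counter is not bumped for newly idle balancing */
static void balance_attempt(struct toy_sd *sd, enum cpu_idle_type idle,
			    bool ld_moved)
{
	if (!ld_moved && idle != CPU_NEWLY_IDLE)
		sd->nr_balance_failed++;
}

int main(void)
{
	struct toy_sd sd = { .nr_balance_failed = 0, .cache_nice_tries = 1 };
	int i;

	/* scp wakes every 6ms: CPU1 goes newly idle again and again, fails
	 * to pull the running ssh task, and never records the failure */
	for (i = 0; i < 100; i++)
		balance_attempt(&sd, CPU_NEWLY_IDLE, false);

	printf("active balance allowed? %s\n",
	       need_active_balance(&sd) ? "yes" : "no");	/* prints "no" */
	return 0;
}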

Vincent
Peter Zijlstra June 3, 2014, 11:15 a.m. UTC | #5
On Mon, Jun 02, 2014 at 07:06:44PM +0200, Vincent Guittot wrote:
> > Could you detail those conditions? FWIW those make excellent Changelog
> > material.
> 
> I have looked back into my tests and traces:
> 
> In a 1st test, the capacity of the CPU was still above half the default
> value (power=538), unlike what I remembered. So it's somewhat "normal"
> to keep the task on CPU0, which also handles the IRQs, because sg_capacity
> still returns 1.

OK, so I suspect that once we move to utilization-based capacity stuff
we'll do the migration IF the task indeed requires more CPU than can be
provided by the reduced one, right?

> In a 2nd test, the main task runs (most of the time) on CPU0, whereas
> the max power of the latter is only 623 and the cpu_power goes below
> 512 (power=330) during the use case. So the sg_capacity of CPU0 is
> null, but the main task still stays on CPU0.
> The use case (scp transfer) is made of a long-running task (ssh) and a
> periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
> CPU1. The newly idle load balance on CPU1 doesn't pull the long-running
> task, although sg_capacity is null, because sd->nr_balance_failed is
> never incremented and load_balance doesn't trigger an active
> load_balance. When an idle balance occurs in the middle of the newly
> idle balances, the long-running ssh task migrates to CPU1, but as soon
> as it sleeps and wakes up it goes back to CPU0 because of wake affine,
> which migrates it back to CPU0 (issue solved by patch 09).

OK, so there are two problems here, right?
 1) we don't migrate away from cpu0
 2) if we do, we get pulled back.

And patch 9 solves 2, so maybe enhance its changelog to mention this
slightly more explicitly.

Which leaves us with 1... an interesting problem. I'm just not sure
endlessly kicking a low-capacity CPU is the right fix for that.
Vincent Guittot June 3, 2014, 12:31 p.m. UTC | #6
On 3 June 2014 13:15, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jun 02, 2014 at 07:06:44PM +0200, Vincent Guittot wrote:
>> > Could you detail those conditions? FWIW those make excellent Changelog
>> > material.
>>
>> I have looked back into my tests and traces:
>>
>> In a 1st test, the capacity of the CPU was still above half the default
>> value (power=538), unlike what I remembered. So it's somewhat "normal"
>> to keep the task on CPU0, which also handles the IRQs, because sg_capacity
>> still returns 1.
>
> OK, so I suspect that once we move to utilization-based capacity stuff
> we'll do the migration IF the task indeed requires more CPU than can be
> provided by the reduced one, right?

The current version of the patchset only checks whether the capacity of
a CPU has been significantly reduced, in which case we should look for
another CPU. But we could effectively also compare the remaining
capacity with the task load.
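
For illustration, one shape such a comparison could take next to the capacity
test this patch adds to nohz_kick_needed() (a sketch only, not part of the
posted series; task_h_load() of the current task is used as a stand-in for
"the task load", and treating rq->curr as a fair task is an assumption):

	/*
	 * Hypothetical extra condition: besides noticing that cpu_power
	 * shrank, only kick if the running task plausibly needs more than
	 * what is left (imbalance_pct adds a margin of hysteresis).
	 */
	if (rq->cfs.h_nr_running >= 1 &&
	    (task_h_load(rq->curr) * sd->imbalance_pct) >
					(rq->cpu_power * 100))
		goto need_kick_unlock;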

>
>> In a 2nd test, the main task runs (most of the time) on CPU0, whereas
>> the max power of the latter is only 623 and the cpu_power goes below
>> 512 (power=330) during the use case. So the sg_capacity of CPU0 is
>> null, but the main task still stays on CPU0.
>> The use case (scp transfer) is made of a long-running task (ssh) and a
>> periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
>> CPU1. The newly idle load balance on CPU1 doesn't pull the long-running
>> task, although sg_capacity is null, because sd->nr_balance_failed is
>> never incremented and load_balance doesn't trigger an active
>> load_balance. When an idle balance occurs in the middle of the newly
>> idle balances, the long-running ssh task migrates to CPU1, but as soon
>> as it sleeps and wakes up it goes back to CPU0 because of wake affine,
>> which migrates it back to CPU0 (issue solved by patch 09).
>
> OK, so there are two problems here, right?
>  1) we don't migrate away from cpu0
>  2) if we do, we get pulled back.
>
> And patch 9 solves 2, so maybe enhance its changelog to mention this
> slightly more explicitly.
>
> Which leaves us with 1... an interesting problem. I'm just not sure
> endlessly kicking a low-capacity CPU is the right fix for that.

What prevents us from migrating the task directly is the fact that
nr_balance_failed is not incremented for newly idle balancing, and it's
the only condition for active migration (except the asym packing
feature).

We could add an additional test in need_active_balance for newly idle
load balance. Something like:

	if ((sd->flags & SD_SHARE_PKG_RESOURCES) &&
	    (env->src_rq->cpu_power_orig * 100) >
				(env->src_rq->cpu_power * env->sd->imbalance_pct))
		return 1;
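
Spelled out a little more, that test might sit in need_active_balance()
roughly like this (the surrounding function body is paraphrased from the
fair.c of that era and is therefore an assumption; only the
SD_SHARE_PKG_RESOURCES branch is the new part):

static int need_active_balance(struct lb_env *env)
{
	struct sched_domain *sd = env->sd;

	if (env->idle == CPU_NEWLY_IDLE) {
		/* existing SD_ASYM_PACKING special case */
		if ((sd->flags & SD_ASYM_PACKING) &&
		    env->src_cpu > env->dst_cpu)
			return 1;

		/*
		 * New: most of the source CPU's capacity is eaten by rt/IRQ
		 * activity, so pull its fair task even though no balance
		 * failures have accumulated yet.
		 */
		if ((sd->flags & SD_SHARE_PKG_RESOURCES) &&
		    (env->src_rq->cpu_power_orig * 100) >
				(env->src_rq->cpu_power * sd->imbalance_pct))
			return 1;
	}

	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + 2);
}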

>
>

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c9617b73bcc0..47fb96e6fa83 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7215,15 +7215,16 @@  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_power *sgp;
 	int nr_busy, cpu = rq->cpu;
+	bool kick = false;
 
 	if (unlikely(rq->idle_balance))
-		return 0;
+		return false;
 
        /*
 	* We may be recently in ticked or tickless idle mode. At the first
@@ -7237,38 +7238,34 @@  static inline int nohz_kick_needed(struct rq *rq)
 	 * balancing.
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return 0;
+		return false;
 
 	if (time_before(now, nohz.next_balance))
-		return 0;
+		return false;
 
 	if (rq->nr_running >= 2)
-		goto need_kick;
+		return true;
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
 	if (sd) {
 		sgp = sd->groups->sgp;
 		nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-		if (nr_busy > 1)
-			goto need_kick_unlock;
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
 	}
 
 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-
 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
 				  sched_domain_span(sd)) < cpu))
-		goto need_kick_unlock;
-
-	rcu_read_unlock();
-	return 0;
+		kick = true;
 
-need_kick_unlock:
+unlock:
 	rcu_read_unlock();
-need_kick:
-	return 1;
+	return kick;
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }