diff mbox

[RFC,v1,3/8] sched/cpufreq_schedutil: make worker kthread be SCHED_DEADLINE

Message ID 20170705085905.6558-4-juri.lelli@arm.com (mailing list archive)
State RFC, archived
Headers show

Commit Message

Juri Lelli July 5, 2017, 8:59 a.m. UTC
Worker kthread needs to be able to change frequency for all other
threads.

Make it special, just under STOP class.

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Luca Abeni <luca.abeni@santannapisa.it>
Cc: Claudio Scordino <claudio@evidence.eu.com>
---
Changes from RFCv0:

 - use a high bit of sched_flags and don't expose the new flag to
   userspace (also add derisory comments) (Peter)
---
 include/linux/sched.h            |  1 +
 kernel/sched/core.c              | 15 +++++++++++++--
 kernel/sched/cpufreq_schedutil.c | 15 ++++++++++++---
 kernel/sched/deadline.c          | 13 +++++++++++++
 kernel/sched/sched.h             | 20 +++++++++++++++++++-
 5 files changed, 58 insertions(+), 6 deletions(-)

Comments

Joel Fernandes July 7, 2017, 3:56 a.m. UTC | #1
Hi Juri,

On Wed, Jul 5, 2017 at 1:59 AM, Juri Lelli <juri.lelli@arm.com> wrote:
> Worker kthread needs to be able to change frequency for all other
> threads.
>
> Make it special, just under STOP class.
>
> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Cc: Luca Abeni <luca.abeni@santannapisa.it>
> Cc: Claudio Scordino <claudio@evidence.eu.com>
> ---
> Changes from RFCv0:
>
>  - use a high bit of sched_flags and don't expose the new flag to
>    userspace (also add derisory comments) (Peter)
> ---
>  include/linux/sched.h            |  1 +
>  kernel/sched/core.c              | 15 +++++++++++++--
>  kernel/sched/cpufreq_schedutil.c | 15 ++++++++++++---
>  kernel/sched/deadline.c          | 13 +++++++++++++
>  kernel/sched/sched.h             | 20 +++++++++++++++++++-
>  5 files changed, 58 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1f0f427e0292..0ba767fdedae 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1359,6 +1359,7 @@ extern int idle_cpu(int cpu);
>  extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
>  extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
>  extern int sched_setattr(struct task_struct *, const struct sched_attr *);
> +extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
>  extern struct task_struct *idle_task(int cpu);
>
>  /**
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5186797908dc..0e40c7ff2bf4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3998,7 +3998,9 @@ static int __sched_setscheduler(struct task_struct *p,
>         }
>
>         if (attr->sched_flags &
> -               ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
> +               ~(SCHED_FLAG_RESET_ON_FORK |
> +                 SCHED_FLAG_RECLAIM |
> +                 SCHED_FLAG_SPECIAL))
>                 return -EINVAL;

I was thinking if its better to name SCHED_FLAG_SPECIAL something more
descriptive than "special", since it will not be clear looking at
other parts of the code that this is used for the frequency scaling
thread or that its a DL task. I was thinking since this flag is
specifically setup for the sugov thread frequency scaling purpose, a
better flag name than SPECIAL might be SCHED_FLAG_DL_FREQ_SCALE or
SCHED_FLAG_FREQ_SCALE. But I am Ok with whatever makes sense to do
here. Thanks,

Regards,

-Joel
Viresh Kumar July 7, 2017, 7:21 a.m. UTC | #2
On 05-07-17, 09:59, Juri Lelli wrote:
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index f2494d1fc8ef..ba6227625f24 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -424,7 +424,16 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
>  static int sugov_kthread_create(struct sugov_policy *sg_policy)
>  {
>  	struct task_struct *thread;
> -	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
> +	struct sched_attr attr = {
> +		.size = sizeof(struct sched_attr),
> +		.sched_policy = SCHED_DEADLINE,
> +		.sched_flags = SCHED_FLAG_SPECIAL,
> +		.sched_nice = 0,
> +		.sched_priority = 0,
> +		.sched_runtime = 0,
> +		.sched_deadline = 0,
> +		.sched_period = 0,
> +	};
>  	struct cpufreq_policy *policy = sg_policy->policy;
>  	int ret;
>  
> @@ -442,10 +451,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
>  		return PTR_ERR(thread);
>  	}
>  
> -	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
> +	ret = sched_setattr_nocheck(thread, &attr);
>  	if (ret) {
>  		kthread_stop(thread);
> -		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
> +		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
>  		return ret;
>  	}

Acked-by: Viresh Kumar <viresh.kumar@linaro.org> (schedutil)
Juri Lelli July 7, 2017, 10:43 a.m. UTC | #3
Hi,

On 06/07/17 20:56, Joel Fernandes wrote:
> Hi Juri,
> 
> On Wed, Jul 5, 2017 at 1:59 AM, Juri Lelli <juri.lelli@arm.com> wrote:

[...]

> >         if (attr->sched_flags &
> > -               ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
> > +               ~(SCHED_FLAG_RESET_ON_FORK |
> > +                 SCHED_FLAG_RECLAIM |
> > +                 SCHED_FLAG_SPECIAL))
> >                 return -EINVAL;
> 
> I was thinking if its better to name SCHED_FLAG_SPECIAL something more
> descriptive than "special", since it will not be clear looking at
> other parts of the code that this is used for the frequency scaling
> thread or that its a DL task. I was thinking since this flag is
> specifically setup for the sugov thread frequency scaling purpose, a
> better flag name than SPECIAL might be SCHED_FLAG_DL_FREQ_SCALE or
> SCHED_FLAG_FREQ_SCALE. But I am Ok with whatever makes sense to do
> here. Thanks,
> 

Sure.

How about SCHED_FLAG_SUGOV then?

Thanks,

- Juri
Thomas Gleixner July 7, 2017, 10:46 a.m. UTC | #4
On Fri, 7 Jul 2017, Juri Lelli wrote:
> How about SCHED_FLAG_SUGOV then?

I know sugo della carne, but what's sugo V?

Thanks,

	tglx
Juri Lelli July 7, 2017, 10:53 a.m. UTC | #5
On 07/07/17 12:46, Thomas Gleixner wrote:
> On Fri, 7 Jul 2017, Juri Lelli wrote:
> > How about SCHED_FLAG_SUGOV then?
> 
> I know sugo della carne, but what's sugo V?
> 

Right.. can't really help not thinking about the same (especially close
to lunch) :)

But the abbreviation stands for SchedUtil GOVernor (AFAIK). We already
have a SUGOV_KTHREAD_PRIORITY (that I just noticed I need to remove BTW)
in sched/cpufreq_schedutil.c.

Thanks,

- Juri
Rafael J. Wysocki July 7, 2017, 1:11 p.m. UTC | #6
On Friday, July 07, 2017 11:53:16 AM Juri Lelli wrote:
> On 07/07/17 12:46, Thomas Gleixner wrote:
> > On Fri, 7 Jul 2017, Juri Lelli wrote:
> > > How about SCHED_FLAG_SUGOV then?
> > 
> > I know sugo della carne, but what's sugo V?
> > 
> 
> Right.. can't really help not thinking about the same (especially close
> to lunch) :)
> 
> But the abbreviation stands for SchedUtil GOVernor (AFAIK). We already
> have a SUGOV_KTHREAD_PRIORITY (that I just noticed I need to remove BTW)
> in sched/cpufreq_schedutil.c.

SCHED_FLAG_CPUFREQ_WORKER comes to mind, but it's somewhat long.

Thanks,
Rafael
Steven Rostedt July 7, 2017, 9:58 p.m. UTC | #7
On Fri, 07 Jul 2017 15:11:45 +0200
"Rafael J. Wysocki" <rjw@rjwysocki.net> wrote:

> On Friday, July 07, 2017 11:53:16 AM Juri Lelli wrote:
> > On 07/07/17 12:46, Thomas Gleixner wrote:  
> > > On Fri, 7 Jul 2017, Juri Lelli wrote:  
> > > > How about SCHED_FLAG_SUGOV then?  
> > > 
> > > I know sugo della carne, but what's sugo V?
> > >   
> > 
> > Right.. can't really help not thinking about the same (especially close
> > to lunch) :)
> > 
> > But the abbreviation stands for SchedUtil GOVernor (AFAIK). We already
> > have a SUGOV_KTHREAD_PRIORITY (that I just noticed I need to remove BTW)
> > in sched/cpufreq_schedutil.c.  
> 
> SCHED_FLAG_CPUFREQ_WORKER comes to mind, but it's somewhat long.
> 

It is rather long. Although I actually hate the SUGOV, it is easily
grepable. Just comment what it stands for. We can always change it
later.

-- Steve
Joel Fernandes July 7, 2017, 10:07 p.m. UTC | #8
Hi,

On Fri, Jul 7, 2017 at 2:58 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Fri, 07 Jul 2017 15:11:45 +0200
> "Rafael J. Wysocki" <rjw@rjwysocki.net> wrote:
>
>> On Friday, July 07, 2017 11:53:16 AM Juri Lelli wrote:
>> > On 07/07/17 12:46, Thomas Gleixner wrote:
>> > > On Fri, 7 Jul 2017, Juri Lelli wrote:
>> > > > How about SCHED_FLAG_SUGOV then?
>> > >
>> > > I know sugo della carne, but what's sugo V?
>> > >
>> >
>> > Right.. can't really help not thinking about the same (especially close
>> > to lunch) :)
>> >
>> > But the abbreviation stands for SchedUtil GOVernor (AFAIK). We already
>> > have a SUGOV_KTHREAD_PRIORITY (that I just noticed I need to remove BTW)
>> > in sched/cpufreq_schedutil.c.
>>
>> SCHED_FLAG_CPUFREQ_WORKER comes to mind, but it's somewhat long.
>>
>
> It is rather long. Although I actually hate the SUGOV, it is easily
> grepable. Just comment what it stands for. We can always change it
> later.

I was thinking why not just SCHED_FLAG_CPUFREQ. That says its for
cpufreq purposes, and is a bit self-documenting. "WORKER" is a bit
redundant and can be dropped in my opinion.

thanks,

-Joel
Steven Rostedt July 7, 2017, 10:15 p.m. UTC | #9
On Fri, 7 Jul 2017 15:07:09 -0700
Joel Fernandes <joelaf@google.com> wrote:


> > It is rather long. Although I actually hate the SUGOV, it is easily
> > grepable. Just comment what it stands for. We can always change it
> > later.  
> 
> I was thinking why not just SCHED_FLAG_CPUFREQ. That says its for
> cpufreq purposes, and is a bit self-documenting. "WORKER" is a bit
> redundant and can be dropped in my opinion.

I was thinking that too, but was wondering how tightly coupled is this
with SCHED_DEADLINE? I like the searchability of SUGOV, where as
CPUFREQ is still quite broad.

-- Steve
Joel Fernandes July 7, 2017, 10:57 p.m. UTC | #10
On Fri, Jul 7, 2017 at 3:15 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Fri, 7 Jul 2017 15:07:09 -0700
> Joel Fernandes <joelaf@google.com> wrote:
>
>
>> > It is rather long. Although I actually hate the SUGOV, it is easily
>> > grepable. Just comment what it stands for. We can always change it
>> > later.
>>
>> I was thinking why not just SCHED_FLAG_CPUFREQ. That says its for
>> cpufreq purposes, and is a bit self-documenting. "WORKER" is a bit
>> redundant and can be dropped in my opinion.
>
> I was thinking that too, but was wondering how tightly coupled is this
> with SCHED_DEADLINE? I like the searchability of SUGOV, where as
> CPUFREQ is still quite broad.

Yes this is tightly coupled with DL so in that case probably a more
specific name is better as you mentioned.

thanks,

-Joel
Peter Zijlstra July 11, 2017, 12:37 p.m. UTC | #11
On Thu, Jul 06, 2017 at 08:56:32PM -0700, Joel Fernandes wrote:

> > @@ -3998,7 +3998,9 @@ static int __sched_setscheduler(struct task_struct *p,
> >         }
> >
> >         if (attr->sched_flags &
> > -               ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
> > +               ~(SCHED_FLAG_RESET_ON_FORK |
> > +                 SCHED_FLAG_RECLAIM |
> > +                 SCHED_FLAG_SPECIAL))
> >                 return -EINVAL;
> 
> I was thinking if its better to name SCHED_FLAG_SPECIAL something more
> descriptive than "special", since it will not be clear looking at
> other parts of the code that this is used for the frequency scaling
> thread or that its a DL task. I was thinking since this flag is
> specifically setup for the sugov thread frequency scaling purpose, a
> better flag name than SPECIAL might be SCHED_FLAG_DL_FREQ_SCALE or
> SCHED_FLAG_FREQ_SCALE. But I am Ok with whatever makes sense to do
> here. Thanks,

SCHED_FLAG_IM_DOING_IT_WRONG ;-)
Peter Zijlstra July 11, 2017, 4:18 p.m. UTC | #12
On Wed, Jul 05, 2017 at 09:59:00AM +0100, Juri Lelli wrote:
> @@ -4065,6 +4067,9 @@ static int __sched_setscheduler(struct task_struct *p,
>  	}
>  
>  	if (user) {
> +		if (attr->sched_flags & SCHED_FLAG_SPECIAL)
> +			return -EPERM;

Should be -EINVAL I think, as if the bit didn't exist at all (it
doesn't, from a userspace perspective).

> +
>  		retval = security_task_setscheduler(p);
>  		if (retval)
>  			return retval;
Juri Lelli July 11, 2017, 5:02 p.m. UTC | #13
On 11/07/17 18:18, Peter Zijlstra wrote:
> On Wed, Jul 05, 2017 at 09:59:00AM +0100, Juri Lelli wrote:
> > @@ -4065,6 +4067,9 @@ static int __sched_setscheduler(struct task_struct *p,
> >  	}
> >  
> >  	if (user) {
> > +		if (attr->sched_flags & SCHED_FLAG_SPECIAL)
> > +			return -EPERM;
> 
> Should be -EINVAL I think, as if the bit didn't exist at all (it
> doesn't, from a userspace perspective).
> 

Makes sense. I'll change it.

Thanks,

- Juri
diff mbox

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f0f427e0292..0ba767fdedae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1359,6 +1359,7 @@  extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
+extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5186797908dc..0e40c7ff2bf4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3998,7 +3998,9 @@  static int __sched_setscheduler(struct task_struct *p,
 	}
 
 	if (attr->sched_flags &
-		~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+		~(SCHED_FLAG_RESET_ON_FORK |
+		  SCHED_FLAG_RECLAIM |
+		  SCHED_FLAG_SPECIAL))
 		return -EINVAL;
 
 	/*
@@ -4065,6 +4067,9 @@  static int __sched_setscheduler(struct task_struct *p,
 	}
 
 	if (user) {
+		if (attr->sched_flags & SCHED_FLAG_SPECIAL)
+			return -EPERM;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -4120,7 +4125,8 @@  static int __sched_setscheduler(struct task_struct *p,
 		}
 #endif
 #ifdef CONFIG_SMP
-		if (dl_bandwidth_enabled() && dl_policy(policy)) {
+		if (dl_bandwidth_enabled() && dl_policy(policy) &&
+				!(attr->sched_flags & SCHED_FLAG_SPECIAL)) {
 			cpumask_t *span = rq->rd->span;
 
 			/*
@@ -4250,6 +4256,11 @@  int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, false, true);
+}
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f2494d1fc8ef..ba6227625f24 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -424,7 +424,16 @@  static void sugov_policy_free(struct sugov_policy *sg_policy)
 static int sugov_kthread_create(struct sugov_policy *sg_policy)
 {
 	struct task_struct *thread;
-	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+		.sched_policy = SCHED_DEADLINE,
+		.sched_flags = SCHED_FLAG_SPECIAL,
+		.sched_nice = 0,
+		.sched_priority = 0,
+		.sched_runtime = 0,
+		.sched_deadline = 0,
+		.sched_period = 0,
+	};
 	struct cpufreq_policy *policy = sg_policy->policy;
 	int ret;
 
@@ -442,10 +451,10 @@  static int sugov_kthread_create(struct sugov_policy *sg_policy)
 		return PTR_ERR(thread);
 	}
 
-	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	ret = sched_setattr_nocheck(thread, &attr);
 	if (ret) {
 		kthread_stop(thread);
-		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
 		return ret;
 	}
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6912f7f35f9b..4ec82fa56b04 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -220,6 +220,9 @@  static void task_non_contending(struct task_struct *p)
 	if (dl_se->dl_runtime == 0)
 		return;
 
+	if (unlikely(dl_entity_is_special(dl_se)))
+		return;
+
 	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
 	WARN_ON(dl_se->dl_non_contending);
 
@@ -1150,6 +1153,9 @@  static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
+	if (unlikely(dl_entity_is_special(dl_se)))
+		return;
+
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
 		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
 	dl_se->runtime -= delta_exec;
@@ -2447,6 +2453,9 @@  int sched_dl_overflow(struct task_struct *p, int policy,
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
 
+	if (attr->sched_flags & SCHED_FLAG_SPECIAL)
+		return 0;
+
 	/* !deadline task may carry old deadline bandwidth */
 	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
 		return 0;
@@ -2533,6 +2542,10 @@  void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  */
 bool __checkparam_dl(const struct sched_attr *attr)
 {
+	/* special dl tasks don't actually use any parameter */
+	if (attr->sched_flags & SCHED_FLAG_SPECIAL)
+		return true;
+
 	/* deadline != 0 */
 	if (attr->sched_deadline == 0)
 		return false;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d8798bb54ace..1be5dac728a4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -156,12 +156,30 @@  static inline int task_has_dl_policy(struct task_struct *p)
 }
 
 /*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ */
+#define SCHED_FLAG_SPECIAL	0x10000000
+
+static inline int dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+	return dl_se->flags & SCHED_FLAG_SPECIAL;
+}
+
+/*
  * Tells if entity @a should preempt entity @b.
  */
 static inline bool
 dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 {
-	return dl_time_before(a->deadline, b->deadline);
+	return dl_entity_is_special(a) ||
+	       dl_time_before(a->deadline, b->deadline);
 }
 
 /*