diff mbox series

[v8,04/16] sched/core: uclamp: Add system default clamps

Message ID 20190402104153.25404-5-patrick.bellasi@arm.com (mailing list archive)
State Not Applicable, archived
Headers show
Series Add utilization clamping support | expand

Commit Message

Patrick Bellasi April 2, 2019, 10:41 a.m. UTC
Tasks without a user-defined clamp value are considered not clamped
and by default their utilization can have any value in the
[0..SCHED_CAPACITY_SCALE] range.

Tasks with a user-defined clamp value are allowed to request any value
in that range, and the required clamp is unconditionally enforced.
However, a "System Management Software" could be interested in limiting
the range of clamp values allowed for all tasks.

Add a privileged interface to define a system default configuration via:

  /proc/sys/kernel/sched_uclamp_util_{min,max}

which works as an unconditional clamp range restriction for all tasks.

With the default configuration, the full SCHED_CAPACITY_SCALE range of
values is allowed for each clamp index. Otherwise, the task-specific
clamp is capped by the corresponding system default value.

Do that by tracking, for each task, the "effective" clamp value and
bucket the task has been refcounted in at enqueue time. This
allows to lazy aggregate "requested" and "system default" values at
enqueue time and simplifies refcounting updates at dequeue time.

The cached bucket ids are used to avoid (relatively) more expensive
integer divisions every time a task is enqueued.

An active flag is used to report when the "effective" value is valid and
thus the task is actually refcounted in the corresponding rq's bucket.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>

---
Changes in v8:
 Message-ID: <20190313201010.GU2482@worktop.programming.kicks-ass.net>
 - add "requested" values uclamp_se instance beside the existing
   "effective" values instance
 - rename uclamp_effective_{get,assign}() into uclamp_eff_{get,set}()
 - make uclamp_eff_get() return the new "effective" values by copy
Message-ID: <20190318125844.ajhjpaqlcgxn7qkq@e110439-lin>
 - run uclamp_fork() code independently from the class being supported.
   Resetting active flag is not harmful and following patches will add
   other code which still needs to be executed independently from class
   support.
Message-ID: <20190313201342.GV2482@worktop.programming.kicks-ass.net>
 - add sysctl_sched_uclamp_handler()'s internal mutex to serialize
   concurrent usages
---
 include/linux/sched.h        |  10 +++
 include/linux/sched/sysctl.h |  11 +++
 kernel/sched/core.c          | 131 ++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c              |  16 +++++
 4 files changed, 167 insertions(+), 1 deletion(-)

Comments

Suren Baghdasaryan April 18, 2019, 12:51 a.m. UTC | #1
On Tue, Apr 2, 2019 at 3:42 AM Patrick Bellasi <patrick.bellasi@arm.com> wrote:
>
> Tasks without a user-defined clamp value are considered not clamped
> and by default their utilization can have any value in the
> [0..SCHED_CAPACITY_SCALE] range.
>
> Tasks with a user-defined clamp value are allowed to request any value
> in that range, and the required clamp is unconditionally enforced.
> However, a "System Management Software" could be interested in limiting
> the range of clamp values allowed for all tasks.
>
> Add a privileged interface to define a system default configuration via:
>
>   /proc/sys/kernel/sched_uclamp_util_{min,max}
>
> which works as an unconditional clamp range restriction for all tasks.
>
> With the default configuration, the full SCHED_CAPACITY_SCALE range of
> values is allowed for each clamp index. Otherwise, the task-specific
> clamp is capped by the corresponding system default value.
>
> Do that by tracking, for each task, the "effective" clamp value and
> bucket the task has been refcounted in at enqueue time. This
> allows to lazy aggregate "requested" and "system default" values at
> enqueue time and simplifies refcounting updates at dequeue time.
>
> The cached bucket ids are used to avoid (relatively) more expensive
> integer divisions every time a task is enqueued.
>
> An active flag is used to report when the "effective" value is valid and
> thus the task is actually refcounted in the corresponding rq's bucket.
>
> Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
>
> ---
> Changes in v8:
>  Message-ID: <20190313201010.GU2482@worktop.programming.kicks-ass.net>
>  - add "requested" values uclamp_se instance beside the existing
>    "effective" values instance
>  - rename uclamp_effective_{get,assign}() into uclamp_eff_{get,set}()
>  - make uclamp_eff_get() return the new "effective" values by copy
> Message-ID: <20190318125844.ajhjpaqlcgxn7qkq@e110439-lin>
>  - run uclamp_fork() code independently from the class being supported.
>    Resetting active flag is not harmful and following patches will add
>    other code which still needs to be executed independently from class
>    support.
> Message-ID: <20190313201342.GV2482@worktop.programming.kicks-ass.net>
>  - add sysctl_sched_uclamp_handler()'s internal mutex to serialize
>    concurrent usages
> ---
>  include/linux/sched.h        |  10 +++
>  include/linux/sched/sysctl.h |  11 +++
>  kernel/sched/core.c          | 131 ++++++++++++++++++++++++++++++++++-
>  kernel/sysctl.c              |  16 +++++
>  4 files changed, 167 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c0dd7aac8e9..d8491954e2e1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -584,14 +584,21 @@ struct sched_dl_entity {
>   * Utilization clamp for a scheduling entity
>   * @value:             clamp value "assigned" to a se
>   * @bucket_id:         bucket index corresponding to the "assigned" value
> + * @active:            the se is currently refcounted in a rq's bucket
>   *
>   * The bucket_id is the index of the clamp bucket matching the clamp value
>   * which is pre-computed and stored to avoid expensive integer divisions from
>   * the fast path.
> + *
> + * The active bit is set whenever a task has got an "effective" value assigned,
> + * which can be different from the clamp value "requested" from user-space.
> + * This allows to know a task is refcounted in the rq's bucket corresponding
> + * to the "effective" bucket_id.
>   */
>  struct uclamp_se {
>         unsigned int value              : bits_per(SCHED_CAPACITY_SCALE);
>         unsigned int bucket_id          : bits_per(UCLAMP_BUCKETS);
> +       unsigned int active             : 1;
>  };
>  #endif /* CONFIG_UCLAMP_TASK */
>
> @@ -676,6 +683,9 @@ struct task_struct {
>         struct sched_dl_entity          dl;
>
>  #ifdef CONFIG_UCLAMP_TASK
> +       /* Clamp values requested for a scheduling entity */
> +       struct uclamp_se                uclamp_req[UCLAMP_CNT];
> +       /* Effective clamp values used for a scheduling entity */
>         struct uclamp_se                uclamp[UCLAMP_CNT];
>  #endif
>
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 99ce6d728df7..d4f6215ee03f 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
>  extern unsigned int sysctl_sched_rt_period;
>  extern int sysctl_sched_rt_runtime;
>
> +#ifdef CONFIG_UCLAMP_TASK
> +extern unsigned int sysctl_sched_uclamp_util_min;
> +extern unsigned int sysctl_sched_uclamp_util_max;
> +#endif
> +
>  #ifdef CONFIG_CFS_BANDWIDTH
>  extern unsigned int sysctl_sched_cfs_bandwidth_slice;
>  #endif
> @@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
>                 void __user *buffer, size_t *lenp,
>                 loff_t *ppos);
>
> +#ifdef CONFIG_UCLAMP_TASK
> +extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
> +                                      void __user *buffer, size_t *lenp,
> +                                      loff_t *ppos);
> +#endif
> +
>  extern int sysctl_numa_balancing(struct ctl_table *table, int write,
>                                  void __user *buffer, size_t *lenp,
>                                  loff_t *ppos);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 046f61d33f00..d368ac26b8aa 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -733,6 +733,14 @@ static void set_load_weight(struct task_struct *p, bool update_load)
>  }
>
>  #ifdef CONFIG_UCLAMP_TASK
> +/* Max allowed minimum utilization */
> +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
> +
> +/* Max allowed maximum utilization */
> +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
> +
> +/* All clamps are required to be less or equal than these values */
> +static struct uclamp_se uclamp_default[UCLAMP_CNT];
>
>  /* Integer rounded range for each bucket */
>  #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
> @@ -801,6 +809,52 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
>         return uclamp_idle_value(rq, clamp_id, clamp_value);
>  }
>
> +/*
> + * The effective clamp bucket index of a task depends on, by increasing
> + * priority:
> + * - the task specific clamp value, when explicitly requested from userspace
> + * - the system default clamp value, defined by the sysadmin
> + */
> +static inline struct uclamp_se
> +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> +{
> +       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> +       struct uclamp_se uc_max = uclamp_default[clamp_id];
> +
> +       /* System default restrictions always apply */
> +       if (unlikely(uc_req.value > uc_max.value))
> +               return uc_max;
> +
> +       return uc_req;
> +}
> +
> +static inline unsigned int
> +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)

This function is not used anywhere AFAIKT. uclamp_eff_bucket_id() and
uclamp_eff_value() look very similar, maybe they can be combined into
one function returning struct uclamp_se?

> +{
> +       struct uclamp_se uc_eff;
> +
> +       /* Task currently refcounted: use back-annotated (effective) bucket */
> +       if (p->uclamp[clamp_id].active)
> +               return p->uclamp[clamp_id].bucket_id;
> +
> +       uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +       return uc_eff.bucket_id;
> +}
> +
> +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> +{
> +       struct uclamp_se uc_eff;
> +
> +       /* Task currently refcounted: use back-annotated (effective) value */
> +       if (p->uclamp[clamp_id].active)
> +               return p->uclamp[clamp_id].value;
> +
> +       uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +       return uc_eff.value;
> +}
> +
>  /*
>   * When a task is enqueued on a rq, the clamp bucket currently defined by the
>   * task's uclamp::bucket_id is refcounted on that rq. This also immediately
> @@ -818,8 +872,12 @@ static inline void uclamp_rq_inc_id(struct task_struct *p, struct rq *rq,
>         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
>         struct uclamp_bucket *bucket;
>
> +       /* Update task effective clamp */
> +       p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
> +
>         bucket = &uc_rq->bucket[uc_se->bucket_id];
>         bucket->tasks++;
> +       uc_se->active = true;
>
>         uclamp_idle_reset(rq, clamp_id, uc_se->value);
>
> @@ -856,6 +914,7 @@ static inline void uclamp_rq_dec_id(struct task_struct *p, struct rq *rq,
>         SCHED_WARN_ON(!bucket->tasks);
>         if (likely(bucket->tasks))
>                 bucket->tasks--;
> +       uc_se->active = false;
>
>         /*
>          * Keep "local max aggregation" simple and accept to (possibly)
> @@ -909,8 +968,69 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>                 uclamp_rq_dec_id(p, rq, clamp_id);
>  }
>
> +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
> +                               void __user *buffer, size_t *lenp,
> +                               loff_t *ppos)
> +{
> +       int old_min, old_max;
> +       static DEFINE_MUTEX(mutex);
> +       int result;
> +
> +       mutex_lock(&mutex);
> +       old_min = sysctl_sched_uclamp_util_min;
> +       old_max = sysctl_sched_uclamp_util_max;
> +
> +       result = proc_dointvec(table, write, buffer, lenp, ppos);
> +       if (result)
> +               goto undo;
> +       if (!write)
> +               goto done;
> +
> +       if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
> +           sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
> +               result = -EINVAL;
> +               goto undo;
> +       }
> +
> +       if (old_min != sysctl_sched_uclamp_util_min) {
> +               uclamp_default[UCLAMP_MIN].value =
> +                       sysctl_sched_uclamp_util_min;
> +               uclamp_default[UCLAMP_MIN].bucket_id =
> +                       uclamp_bucket_id(sysctl_sched_uclamp_util_min);
> +       }
> +       if (old_max != sysctl_sched_uclamp_util_max) {
> +               uclamp_default[UCLAMP_MAX].value =
> +                       sysctl_sched_uclamp_util_max;
> +               uclamp_default[UCLAMP_MAX].bucket_id =
> +                       uclamp_bucket_id(sysctl_sched_uclamp_util_max);
> +       }
> +
> +       /*
> +        * Updating all the RUNNABLE task is expensive, keep it simple and do
> +        * just a lazy update at each next enqueue time.
> +        */
> +       goto done;
> +
> +undo:
> +       sysctl_sched_uclamp_util_min = old_min;
> +       sysctl_sched_uclamp_util_max = old_max;
> +done:
> +       mutex_unlock(&mutex);
> +
> +       return result;
> +}
> +
> +static void uclamp_fork(struct task_struct *p)
> +{
> +       unsigned int clamp_id;
> +
> +       for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +               p->uclamp[clamp_id].active = false;
> +}
> +
>  static void __init init_uclamp(void)
>  {
> +       struct uclamp_se uc_max = {};
>         unsigned int clamp_id;
>         int cpu;
>
> @@ -920,16 +1040,23 @@ static void __init init_uclamp(void)
>         }
>
>         for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
> -               struct uclamp_se *uc_se = &init_task.uclamp[clamp_id];
> +               struct uclamp_se *uc_se = &init_task.uclamp_req[clamp_id];
>
>                 uc_se->value = uclamp_none(clamp_id);
>                 uc_se->bucket_id = uclamp_bucket_id(uc_se->value);
>         }
> +
> +       /* System defaults allow max clamp values for both indexes */
> +       uc_max.value = uclamp_none(UCLAMP_MAX);
> +       uc_max.bucket_id = uclamp_bucket_id(uc_max.value);
> +       for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +               uclamp_default[clamp_id] = uc_max;
>  }
>
>  #else /* CONFIG_UCLAMP_TASK */
>  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
>  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
> +static inline void uclamp_fork(struct task_struct *p) { }
>  static inline void init_uclamp(void) { }
>  #endif /* CONFIG_UCLAMP_TASK */
>
> @@ -2530,6 +2657,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>          */
>         p->prio = current->normal_prio;
>
> +       uclamp_fork(p);
> +
>         /*
>          * Revert to default priority/policy on fork if requested.
>          */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 987ae08147bf..72277f09887d 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -446,6 +446,22 @@ static struct ctl_table kern_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = sched_rr_handler,
>         },
> +#ifdef CONFIG_UCLAMP_TASK
> +       {
> +               .procname       = "sched_uclamp_util_min",
> +               .data           = &sysctl_sched_uclamp_util_min,
> +               .maxlen         = sizeof(unsigned int),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_sched_uclamp_handler,
> +       },
> +       {
> +               .procname       = "sched_uclamp_util_max",
> +               .data           = &sysctl_sched_uclamp_util_max,
> +               .maxlen         = sizeof(unsigned int),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_sched_uclamp_handler,
> +       },
> +#endif
>  #ifdef CONFIG_SCHED_AUTOGROUP
>         {
>                 .procname       = "sched_autogroup_enabled",
> --
> 2.20.1
>
Patrick Bellasi May 7, 2019, 10:38 a.m. UTC | #2
On 17-Apr 17:51, Suren Baghdasaryan wrote:
> On Tue, Apr 2, 2019 at 3:42 AM Patrick Bellasi <patrick.bellasi@arm.com> wrote:

[...]

> > +/*
> > + * The effective clamp bucket index of a task depends on, by increasing
> > + * priority:
> > + * - the task specific clamp value, when explicitly requested from userspace
> > + * - the system default clamp value, defined by the sysadmin
> > + */
> > +static inline struct uclamp_se
> > +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> > +       struct uclamp_se uc_max = uclamp_default[clamp_id];
> > +
> > +       /* System default restrictions always apply */
> > +       if (unlikely(uc_req.value > uc_max.value))
> > +               return uc_max;
> > +
> > +       return uc_req;
> > +}
> > +
> > +static inline unsigned int
> > +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> 
> This function is not used anywhere AFAIKT.

Right, this is the dual of uclamp_eff_value() but, since we don't
actually use it in the corrent code, let's remove it and keep only the
latter.

> uclamp_eff_bucket_id() and
> uclamp_eff_value() look very similar, maybe they can be combined into
> one function returning struct uclamp_se?

I would prefer not since at the callsites of uclamp_eff_value() we
actually need just the value.

> > +{
> > +       struct uclamp_se uc_eff;
> > +
> > +       /* Task currently refcounted: use back-annotated (effective) bucket */
> > +       if (p->uclamp[clamp_id].active)
> > +               return p->uclamp[clamp_id].bucket_id;
> > +
> > +       uc_eff = uclamp_eff_get(p, clamp_id);
> > +
> > +       return uc_eff.bucket_id;
> > +}
> > +
> > +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +       struct uclamp_se uc_eff;
> > +
> > +       /* Task currently refcounted: use back-annotated (effective) value */
> > +       if (p->uclamp[clamp_id].active)
> > +               return p->uclamp[clamp_id].value;
> > +
> > +       uc_eff = uclamp_eff_get(p, clamp_id);
> > +
> > +       return uc_eff.value;
> > +}
> > +
Peter Zijlstra May 8, 2019, 6:42 p.m. UTC | #3
On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> Add a privileged interface to define a system default configuration via:
> 
>   /proc/sys/kernel/sched_uclamp_util_{min,max}

Isn't the 'u' in "uclamp" already for util?
Peter Zijlstra May 8, 2019, 7 p.m. UTC | #4
There was a bunch of repetition that seemed fragile; does something like
the below make sense?


Index: linux-2.6/kernel/sched/core.c
===================================================================
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -770,6 +770,9 @@ unsigned int sysctl_sched_uclamp_util_ma
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
+#define for_each_clamp_id(clamp_id)	\
+	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
 
@@ -790,6 +793,12 @@ static inline unsigned int uclamp_none(i
 	return SCHED_CAPACITY_SCALE;
 }
 
+static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
+{
+	uc_se->value = value;
+	uc_se->bucket_id = uclamp_bucket_id(value);
+}
+
 static inline unsigned int
 uclamp_idle_value(struct rq *rq, unsigned int clamp_id, unsigned int clamp_value)
 {
@@ -977,7 +986,7 @@ static inline void uclamp_rq_inc(struct
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
 
-	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+	for_each_clamp_id(clamp_id)
 		uclamp_rq_inc_id(p, rq, clamp_id);
 
 	/* Reset clamp idle holding when there is one RUNNABLE task */
@@ -992,7 +1001,7 @@ static inline void uclamp_rq_dec(struct
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
 
-	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+	for_each_clamp_id(clamp_id)
 		uclamp_rq_dec_id(p, rq, clamp_id);
 }
 
@@ -1021,16 +1030,13 @@ int sysctl_sched_uclamp_handler(struct c
 	}
 
 	if (old_min != sysctl_sched_uclamp_util_min) {
-		uclamp_default[UCLAMP_MIN].value =
-			sysctl_sched_uclamp_util_min;
-		uclamp_default[UCLAMP_MIN].bucket_id =
-			uclamp_bucket_id(sysctl_sched_uclamp_util_min);
+		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+			      sysctl_sched_uclamp_util_min);
 	}
+
 	if (old_max != sysctl_sched_uclamp_util_max) {
-		uclamp_default[UCLAMP_MAX].value =
-			sysctl_sched_uclamp_util_max;
-		uclamp_default[UCLAMP_MAX].bucket_id =
-			uclamp_bucket_id(sysctl_sched_uclamp_util_max);
+		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+			      sysctl_sched_uclamp_util_max);
 	}
 
 	/*
@@ -1052,7 +1058,7 @@ static void uclamp_fork(struct task_stru
 {
 	unsigned int clamp_id;
 
-	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
 }
 
@@ -1067,17 +1073,12 @@ static void __init init_uclamp(void)
 		cpu_rq(cpu)->uclamp_flags = 0;
 	}
 
-	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
-		struct uclamp_se *uc_se = &init_task.uclamp_req[clamp_id];
-
-		uc_se->value = uclamp_none(clamp_id);
-		uc_se->bucket_id = uclamp_bucket_id(uc_se->value);
-	}
+	for_each_clamp_id(clamp_id)
+		uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id));
 
 	/* System defaults allow max clamp values for both indexes */
-	uc_max.value = uclamp_none(UCLAMP_MAX);
-	uc_max.bucket_id = uclamp_bucket_id(uc_max.value);
-	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX));
+	for_each_clamp_id(clamp_id)
 		uclamp_default[clamp_id] = uc_max;
 }
Peter Zijlstra May 8, 2019, 7:07 p.m. UTC | #5
On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> +static inline struct uclamp_se
> +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> +{
> +	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> +	struct uclamp_se uc_max = uclamp_default[clamp_id];
> +
> +	/* System default restrictions always apply */
> +	if (unlikely(uc_req.value > uc_max.value))
> +		return uc_max;
> +
> +	return uc_req;
> +}
> +
> +static inline unsigned int
> +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> +{
> +	struct uclamp_se uc_eff;
> +
> +	/* Task currently refcounted: use back-annotated (effective) bucket */
> +	if (p->uclamp[clamp_id].active)
> +		return p->uclamp[clamp_id].bucket_id;
> +
> +	uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +	return uc_eff.bucket_id;
> +}
> +
> +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> +{
> +	struct uclamp_se uc_eff;
> +
> +	/* Task currently refcounted: use back-annotated (effective) value */
> +	if (p->uclamp[clamp_id].active)
> +		return p->uclamp[clamp_id].value;
> +
> +	uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +	return uc_eff.value;
> +}

This is 'wrong' because:

  uclamp_eff_value(p,id) := uclamp_eff(p,id).value

Which seems to suggest the uclamp_eff_*() functions want another name.

Also, suppose the above would be true; does GCC really generate better
code for the LHS compared to the RHS?
Peter Zijlstra May 8, 2019, 7:15 p.m. UTC | #6
On Wed, May 08, 2019 at 09:07:33PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> > +static inline struct uclamp_se
> > +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> > +	struct uclamp_se uc_max = uclamp_default[clamp_id];
> > +
> > +	/* System default restrictions always apply */
> > +	if (unlikely(uc_req.value > uc_max.value))
> > +		return uc_max;
> > +
> > +	return uc_req;
> > +}
> > +
> > +static inline unsigned int
> > +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +	struct uclamp_se uc_eff;
> > +
> > +	/* Task currently refcounted: use back-annotated (effective) bucket */
> > +	if (p->uclamp[clamp_id].active)
> > +		return p->uclamp[clamp_id].bucket_id;
> > +
> > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > +
> > +	return uc_eff.bucket_id;
> > +}
> > +
> > +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +	struct uclamp_se uc_eff;
> > +
> > +	/* Task currently refcounted: use back-annotated (effective) value */
> > +	if (p->uclamp[clamp_id].active)
> > +		return p->uclamp[clamp_id].value;
> > +
> > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > +
> > +	return uc_eff.value;
> > +}
> 
> This is 'wrong' because:
> 
>   uclamp_eff_value(p,id) := uclamp_eff(p,id).value

Clearly I means to say the above does not hold with the given
implementation, while the naming would suggest it does.

> Which seems to suggest the uclamp_eff_*() functions want another name.
> 
> Also, suppose the above would be true; does GCC really generate better
> code for the LHS compared to the RHS?
Patrick Bellasi May 9, 2019, 8:43 a.m. UTC | #7
On 08-May 20:42, Peter Zijlstra wrote:
> On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> > Add a privileged interface to define a system default configuration via:
> > 
> >   /proc/sys/kernel/sched_uclamp_util_{min,max}
> 
> Isn't the 'u' in "uclamp" already for util?

Yes, right... I've just wanted to keep the same "uclamp" prefix used
by all related kernel symbols. But, since that's user-space API we can
certainly drop it and go for either:

      /proc/sys/kernel/sched_clamp_util_{min,max}
      /proc/sys/kernel/sched_util_clamp_{min,max}

Preference?
Patrick Bellasi May 9, 2019, 8:45 a.m. UTC | #8
On 08-May 21:00, Peter Zijlstra wrote:
> 
> There was a bunch of repetition that seemed fragile; does something like
> the below make sense?

Absolutely yes... will add to v9, thanks.
 
> Index: linux-2.6/kernel/sched/core.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched/core.c
> +++ linux-2.6/kernel/sched/core.c
> @@ -770,6 +770,9 @@ unsigned int sysctl_sched_uclamp_util_ma
>  /* All clamps are required to be less or equal than these values */
>  static struct uclamp_se uclamp_default[UCLAMP_CNT];
>  
> +#define for_each_clamp_id(clamp_id)	\
> +	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
> +
>  /* Integer rounded range for each bucket */
>  #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
>  
> @@ -790,6 +793,12 @@ static inline unsigned int uclamp_none(i
>  	return SCHED_CAPACITY_SCALE;
>  }
>  
> +static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
> +{
> +	uc_se->value = value;
> +	uc_se->bucket_id = uclamp_bucket_id(value);
> +}
> +
>  static inline unsigned int
>  uclamp_idle_value(struct rq *rq, unsigned int clamp_id, unsigned int clamp_value)
>  {
> @@ -977,7 +986,7 @@ static inline void uclamp_rq_inc(struct
>  	if (unlikely(!p->sched_class->uclamp_enabled))
>  		return;
>  
> -	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +	for_each_clamp_id(clamp_id)
>  		uclamp_rq_inc_id(p, rq, clamp_id);
>  
>  	/* Reset clamp idle holding when there is one RUNNABLE task */
> @@ -992,7 +1001,7 @@ static inline void uclamp_rq_dec(struct
>  	if (unlikely(!p->sched_class->uclamp_enabled))
>  		return;
>  
> -	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +	for_each_clamp_id(clamp_id)
>  		uclamp_rq_dec_id(p, rq, clamp_id);
>  }
>  
> @@ -1021,16 +1030,13 @@ int sysctl_sched_uclamp_handler(struct c
>  	}
>  
>  	if (old_min != sysctl_sched_uclamp_util_min) {
> -		uclamp_default[UCLAMP_MIN].value =
> -			sysctl_sched_uclamp_util_min;
> -		uclamp_default[UCLAMP_MIN].bucket_id =
> -			uclamp_bucket_id(sysctl_sched_uclamp_util_min);
> +		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
> +			      sysctl_sched_uclamp_util_min);
>  	}
> +
>  	if (old_max != sysctl_sched_uclamp_util_max) {
> -		uclamp_default[UCLAMP_MAX].value =
> -			sysctl_sched_uclamp_util_max;
> -		uclamp_default[UCLAMP_MAX].bucket_id =
> -			uclamp_bucket_id(sysctl_sched_uclamp_util_max);
> +		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
> +			      sysctl_sched_uclamp_util_max);
>  	}
>  
>  	/*
> @@ -1052,7 +1058,7 @@ static void uclamp_fork(struct task_stru
>  {
>  	unsigned int clamp_id;
>  
> -	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +	for_each_clamp_id(clamp_id)
>  		p->uclamp[clamp_id].active = false;
>  }
>  
> @@ -1067,17 +1073,12 @@ static void __init init_uclamp(void)
>  		cpu_rq(cpu)->uclamp_flags = 0;
>  	}
>  
> -	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
> -		struct uclamp_se *uc_se = &init_task.uclamp_req[clamp_id];
> -
> -		uc_se->value = uclamp_none(clamp_id);
> -		uc_se->bucket_id = uclamp_bucket_id(uc_se->value);
> -	}
> +	for_each_clamp_id(clamp_id)
> +		uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id));
>  
>  	/* System defaults allow max clamp values for both indexes */
> -	uc_max.value = uclamp_none(UCLAMP_MAX);
> -	uc_max.bucket_id = uclamp_bucket_id(uc_max.value);
> -	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX));
> +	for_each_clamp_id(clamp_id)
>  		uclamp_default[clamp_id] = uc_max;
>  }
>  
>
Patrick Bellasi May 9, 2019, 9:10 a.m. UTC | #9
On 08-May 21:15, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 09:07:33PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> > > +static inline struct uclamp_se
> > > +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> > > +{
> > > +	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> > > +	struct uclamp_se uc_max = uclamp_default[clamp_id];
> > > +
> > > +	/* System default restrictions always apply */
> > > +	if (unlikely(uc_req.value > uc_max.value))
> > > +		return uc_max;
> > > +
> > > +	return uc_req;
> > > +}
> > > +
> > > +static inline unsigned int
> > > +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> > > +{
> > > +	struct uclamp_se uc_eff;
> > > +
> > > +	/* Task currently refcounted: use back-annotated (effective) bucket */
> > > +	if (p->uclamp[clamp_id].active)
> > > +		return p->uclamp[clamp_id].bucket_id;
> > > +
> > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > +
> > > +	return uc_eff.bucket_id;
> > > +}
> > > +
> > > +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> > > +{
> > > +	struct uclamp_se uc_eff;
> > > +
> > > +	/* Task currently refcounted: use back-annotated (effective) value */
> > > +	if (p->uclamp[clamp_id].active)
> > > +		return p->uclamp[clamp_id].value;
> > > +
> > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > +
> > > +	return uc_eff.value;
> > > +}
> > 
> > This is 'wrong' because:
> > 
> >   uclamp_eff_value(p,id) := uclamp_eff(p,id).value
> 
> Clearly I means to say the above does not hold with the given
> implementation, while the naming would suggest it does.

Not sure to completely get your point...

AFAIU, what you call uclamp_eff(p, id).value is the "value" member of
the struct returned by uclamp_eff_get(p,id), which is back annotate
by uclamp_rq_inc_id(p, rq, id) in:

   p->uclamp[clamp_id].value

when a task becomes RUNNABLE.

> > Which seems to suggest the uclamp_eff_*() functions want another name.

That function returns the effective value of a task, which is either:
 1. the back annotated value for a RUNNABLE task
or
 2. the aggregation of task-specific, system-default and cgroup values
    for a non RUNNABLE task.

> > Also, suppose the above would be true; does GCC really generate better
> > code for the LHS compared to the RHS?

It generate "sane" code which implements the above logic and allows
to know that whenever we call uclamp_eff_value(p,id) we get the most
updated effective value for a task, independently from its {!}RUNNABLE
state.

I would keep the function but, since Suren also complained also about
the name... perhaps I should come up with a better name? Proposals?
Peter Zijlstra May 9, 2019, 11:53 a.m. UTC | #10
On Thu, May 09, 2019 at 10:10:57AM +0100, Patrick Bellasi wrote:
> On 08-May 21:15, Peter Zijlstra wrote:
> > On Wed, May 08, 2019 at 09:07:33PM +0200, Peter Zijlstra wrote:
> > > On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> > > > +static inline struct uclamp_se
> > > > +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> > > > +{
> > > > +	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> > > > +	struct uclamp_se uc_max = uclamp_default[clamp_id];
> > > > +
> > > > +	/* System default restrictions always apply */
> > > > +	if (unlikely(uc_req.value > uc_max.value))
> > > > +		return uc_max;
> > > > +
> > > > +	return uc_req;
> > > > +}
> > > > +
> > > > +static inline unsigned int
> > > > +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> > > > +{
> > > > +	struct uclamp_se uc_eff;
> > > > +
> > > > +	/* Task currently refcounted: use back-annotated (effective) bucket */
> > > > +	if (p->uclamp[clamp_id].active)
> > > > +		return p->uclamp[clamp_id].bucket_id;
> > > > +
> > > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > > +
> > > > +	return uc_eff.bucket_id;
> > > > +}
> > > > +
> > > > +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> > > > +{
> > > > +	struct uclamp_se uc_eff;
> > > > +
> > > > +	/* Task currently refcounted: use back-annotated (effective) value */
> > > > +	if (p->uclamp[clamp_id].active)
> > > > +		return p->uclamp[clamp_id].value;
> > > > +
> > > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > > +
> > > > +	return uc_eff.value;
> > > > +}
> > > 
> > > This is 'wrong' because:
> > > 
> > >   uclamp_eff_value(p,id) := uclamp_eff(p,id).value
> > 
> > Clearly I means to say the above does not hold with the given
> > implementation, while the naming would suggest it does.
> 
> Not sure to completely get your point...

the point is that uclamp_eff_get() doesn't do the back annotate thing
and therefore returns something entirely different from
uclamp_eff_{bucket_id,value}(), where the naming would suggest it in
fact returns the same thing.

> > > Which seems to suggest the uclamp_eff_*() functions want another name.
> 
> That function returns the effective value of a task, which is either:
>  1. the back annotated value for a RUNNABLE task
> or
>  2. the aggregation of task-specific, system-default and cgroup values
>     for a non RUNNABLE task.

Right, but uclamp_eff_get() doesn't do 1, while the other two do do it.
And that is confusing.

> > > Also, suppose the above would be true; does GCC really generate better
> > > code for the LHS compared to the RHS?
> 
> It generate "sane" code which implements the above logic and allows
> to know that whenever we call uclamp_eff_value(p,id) we get the most
> updated effective value for a task, independently from its {!}RUNNABLE
> state.
> 
> I would keep the function but, since Suren also complained also about
> the name... perhaps I should come up with a better name? Proposals?

Right, so they should move to the patch where they're needed, but I was
wondering why you'd not written something like:

static inline
struct uclamp_se uclamp_active(struct task_struct *p, unsigned int clamp_id)
{
	if (p->uclamp[clamp_id].active)
		return p->uclamp[clamp_id];

	return uclamp_eff(p, clamp_id);
}

And then used:

	uclamp_active(p, id).{value,bucket_id}

- OR -

have uclamp_eff() include the active thing, afaict the callsite in
uclamp_rq_inc_id() guarantees !active.

In any case, I'm thinking the foo().member notation saves us from having
to have two almost identical functions and the 'inline' part should get
GCC to generate sane code.
Patrick Bellasi May 9, 2019, 1:04 p.m. UTC | #11
On 09-May 13:53, Peter Zijlstra wrote:
> On Thu, May 09, 2019 at 10:10:57AM +0100, Patrick Bellasi wrote:
> > On 08-May 21:15, Peter Zijlstra wrote:
> > > On Wed, May 08, 2019 at 09:07:33PM +0200, Peter Zijlstra wrote:
> > > > On Tue, Apr 02, 2019 at 11:41:40AM +0100, Patrick Bellasi wrote:
> > > > > +static inline struct uclamp_se
> > > > > +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> > > > > +{
> > > > > +	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> > > > > +	struct uclamp_se uc_max = uclamp_default[clamp_id];
> > > > > +
> > > > > +	/* System default restrictions always apply */
> > > > > +	if (unlikely(uc_req.value > uc_max.value))
> > > > > +		return uc_max;
> > > > > +
> > > > > +	return uc_req;
> > > > > +}
> > > > > +
> > > > > +static inline unsigned int
> > > > > +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
> > > > > +{
> > > > > +	struct uclamp_se uc_eff;
> > > > > +
> > > > > +	/* Task currently refcounted: use back-annotated (effective) bucket */
> > > > > +	if (p->uclamp[clamp_id].active)
> > > > > +		return p->uclamp[clamp_id].bucket_id;
> > > > > +
> > > > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > > > +
> > > > > +	return uc_eff.bucket_id;
> > > > > +}
> > > > > +
> > > > > +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> > > > > +{
> > > > > +	struct uclamp_se uc_eff;
> > > > > +
> > > > > +	/* Task currently refcounted: use back-annotated (effective) value */
> > > > > +	if (p->uclamp[clamp_id].active)
> > > > > +		return p->uclamp[clamp_id].value;
> > > > > +
> > > > > +	uc_eff = uclamp_eff_get(p, clamp_id);
> > > > > +
> > > > > +	return uc_eff.value;
> > > > > +}
> > > > 
> > > > This is 'wrong' because:
> > > > 
> > > >   uclamp_eff_value(p,id) := uclamp_eff(p,id).value
> > > 
> > > Clearly I means to say the above does not hold with the given
> > > implementation, while the naming would suggest it does.
> > 
> > Not sure to completely get your point...
> 
> the point is that uclamp_eff_get() doesn't do the back annotate thing
> and therefore returns something entirely different from
> uclamp_eff_{bucket_id,value}(), where the naming would suggest it in
> fact returns the same thing.
> 
> > > > Which seems to suggest the uclamp_eff_*() functions want another name.
> > 
> > That function returns the effective value of a task, which is either:
> >  1. the back annotated value for a RUNNABLE task
> > or
> >  2. the aggregation of task-specific, system-default and cgroup values
> >     for a non RUNNABLE task.
> 
> Right, but uclamp_eff_get() doesn't do 1, while the other two do do it.
> And that is confusing.

I see, right.

> > > > Also, suppose the above would be true; does GCC really generate better
> > > > code for the LHS compared to the RHS?
> > 
> > It generate "sane" code which implements the above logic and allows
> > to know that whenever we call uclamp_eff_value(p,id) we get the most
> > updated effective value for a task, independently from its {!}RUNNABLE
> > state.
> > 
> > I would keep the function but, since Suren also complained also about
> > the name... perhaps I should come up with a better name? Proposals?
> 
> Right, so they should move to the patch where they're needed, but I was

Yes, I'll move _value() to 10/16:

   sched/core: uclamp: Add uclamp_util_with()

where we actually need to access the clamp value and...

> wondering why you'd not written something like:
> 
> static inline
> struct uclamp_se uclamp_active(struct task_struct *p, unsigned int clamp_id)
> {
> 	if (p->uclamp[clamp_id].active)
> 		return p->uclamp[clamp_id];
> 
> 	return uclamp_eff(p, clamp_id);
> }
> 
> And then used:
> 
> 	uclamp_active(p, id).{value,bucket_id}
> 
> - OR -
> 
> have uclamp_eff() include the active thing, afaict the callsite in
> uclamp_rq_inc_id() guarantees !active.
> 
> In any case, I'm thinking the foo().member notation saves us from having
> to have two almost identical functions and the 'inline' part should get
> GCC to generate sane code.

... look into this approach, seems reasonable and actually better to read.

Thanks
diff mbox series

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c0dd7aac8e9..d8491954e2e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -584,14 +584,21 @@  struct sched_dl_entity {
  * Utilization clamp for a scheduling entity
  * @value:		clamp value "assigned" to a se
  * @bucket_id:		bucket index corresponding to the "assigned" value
+ * @active:		the se is currently refcounted in a rq's bucket
  *
  * The bucket_id is the index of the clamp bucket matching the clamp value
  * which is pre-computed and stored to avoid expensive integer divisions from
  * the fast path.
+ *
+ * The active bit is set whenever a task has got an "effective" value assigned,
+ * which can be different from the clamp value "requested" from user-space.
+ * This allows to know a task is refcounted in the rq's bucket corresponding
+ * to the "effective" bucket_id.
  */
 struct uclamp_se {
 	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
 	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+	unsigned int active		: 1;
 };
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -676,6 +683,9 @@  struct task_struct {
 	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
+	/* Clamp values requested for a scheduling entity */
+	struct uclamp_se		uclamp_req[UCLAMP_CNT];
+	/* Effective clamp values used for a scheduling entity */
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d728df7..d4f6215ee03f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -56,6 +56,11 @@  int sched_proc_update_handler(struct ctl_table *table, int write,
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
+#ifdef CONFIG_UCLAMP_TASK
+extern unsigned int sysctl_sched_uclamp_util_min;
+extern unsigned int sysctl_sched_uclamp_util_max;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
@@ -75,6 +80,12 @@  extern int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+#ifdef CONFIG_UCLAMP_TASK
+extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos);
+#endif
+
 extern int sysctl_numa_balancing(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 046f61d33f00..d368ac26b8aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -733,6 +733,14 @@  static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/* Max allowed minimum utilization */
+unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less or equal than these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -801,6 +809,52 @@  unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+{
+	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+	struct uclamp_se uc_max = uclamp_default[clamp_id];
+
+	/* System default restrictions always apply */
+	if (unlikely(uc_req.value > uc_max.value))
+		return uc_max;
+
+	return uc_req;
+}
+
+static inline unsigned int
+uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)
+{
+	struct uclamp_se uc_eff;
+
+	/* Task currently refcounted: use back-annotated (effective) bucket */
+	if (p->uclamp[clamp_id].active)
+		return p->uclamp[clamp_id].bucket_id;
+
+	uc_eff = uclamp_eff_get(p, clamp_id);
+
+	return uc_eff.bucket_id;
+}
+
+unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+{
+	struct uclamp_se uc_eff;
+
+	/* Task currently refcounted: use back-annotated (effective) value */
+	if (p->uclamp[clamp_id].active)
+		return p->uclamp[clamp_id].value;
+
+	uc_eff = uclamp_eff_get(p, clamp_id);
+
+	return uc_eff.value;
+}
+
 /*
  * When a task is enqueued on a rq, the clamp bucket currently defined by the
  * task's uclamp::bucket_id is refcounted on that rq. This also immediately
@@ -818,8 +872,12 @@  static inline void uclamp_rq_inc_id(struct task_struct *p, struct rq *rq,
 	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
 	struct uclamp_bucket *bucket;
 
+	/* Update task effective clamp */
+	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
 	bucket = &uc_rq->bucket[uc_se->bucket_id];
 	bucket->tasks++;
+	uc_se->active = true;
 
 	uclamp_idle_reset(rq, clamp_id, uc_se->value);
 
@@ -856,6 +914,7 @@  static inline void uclamp_rq_dec_id(struct task_struct *p, struct rq *rq,
 	SCHED_WARN_ON(!bucket->tasks);
 	if (likely(bucket->tasks))
 		bucket->tasks--;
+	uc_se->active = false;
 
 	/*
 	 * Keep "local max aggregation" simple and accept to (possibly)
@@ -909,8 +968,69 @@  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(p, rq, clamp_id);
 }
 
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int old_min, old_max;
+	static DEFINE_MUTEX(mutex);
+	int result;
+
+	mutex_lock(&mutex);
+	old_min = sysctl_sched_uclamp_util_min;
+	old_max = sysctl_sched_uclamp_util_max;
+
+	result = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (result)
+		goto undo;
+	if (!write)
+		goto done;
+
+	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+		result = -EINVAL;
+		goto undo;
+	}
+
+	if (old_min != sysctl_sched_uclamp_util_min) {
+		uclamp_default[UCLAMP_MIN].value =
+			sysctl_sched_uclamp_util_min;
+		uclamp_default[UCLAMP_MIN].bucket_id =
+			uclamp_bucket_id(sysctl_sched_uclamp_util_min);
+	}
+	if (old_max != sysctl_sched_uclamp_util_max) {
+		uclamp_default[UCLAMP_MAX].value =
+			sysctl_sched_uclamp_util_max;
+		uclamp_default[UCLAMP_MAX].bucket_id =
+			uclamp_bucket_id(sysctl_sched_uclamp_util_max);
+	}
+
+	/*
+	 * Updating all the RUNNABLE task is expensive, keep it simple and do
+	 * just a lazy update at each next enqueue time.
+	 */
+	goto done;
+
+undo:
+	sysctl_sched_uclamp_util_min = old_min;
+	sysctl_sched_uclamp_util_max = old_max;
+done:
+	mutex_unlock(&mutex);
+
+	return result;
+}
+
+static void uclamp_fork(struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+		p->uclamp[clamp_id].active = false;
+}
+
 static void __init init_uclamp(void)
 {
+	struct uclamp_se uc_max = {};
 	unsigned int clamp_id;
 	int cpu;
 
@@ -920,16 +1040,23 @@  static void __init init_uclamp(void)
 	}
 
 	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
-		struct uclamp_se *uc_se = &init_task.uclamp[clamp_id];
+		struct uclamp_se *uc_se = &init_task.uclamp_req[clamp_id];
 
 		uc_se->value = uclamp_none(clamp_id);
 		uc_se->bucket_id = uclamp_bucket_id(uc_se->value);
 	}
+
+	/* System defaults allow max clamp values for both indexes */
+	uc_max.value = uclamp_none(UCLAMP_MAX);
+	uc_max.bucket_id = uclamp_bucket_id(uc_max.value);
+	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
+		uclamp_default[clamp_id] = uc_max;
 }
 
 #else /* CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -2530,6 +2657,8 @@  int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 */
 	p->prio = current->normal_prio;
 
+	uclamp_fork(p);
+
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 987ae08147bf..72277f09887d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -446,6 +446,22 @@  static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#ifdef CONFIG_UCLAMP_TASK
+	{
+		.procname	= "sched_uclamp_util_min",
+		.data		= &sysctl_sched_uclamp_util_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
+	{
+		.procname	= "sched_uclamp_util_max",
+		.data		= &sysctl_sched_uclamp_util_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",