
[RFCv5,38/46] sched: scheduler-driven cpu frequency selection

Message ID 1436293469-25707-39-git-send-email-morten.rasmussen@arm.com (mailing list archive)
State RFC

Commit Message

Morten Rasmussen July 7, 2015, 6:24 p.m. UTC
From: Michael Turquette <mturquette@baylibre.com>

Scheduler-driven cpu frequency selection is desirable as part of the
on-going effort to make the scheduler better aware of energy
consumption.  No piece of the Linux kernel has a better view of the
factors that affect a cpu frequency selection policy than the
scheduler[0], and this patch is an attempt to converge on an initial
solution.

This patch implements a simple shim layer between the Linux scheduler
and the cpufreq subsystem. This interface accepts a capacity request
from the Completely Fair Scheduler and honors the max request from all
cpus in the same frequency domain.

The policy magic comes from choosing the cpu capacity request from cfs
and is not contained in this cpufreq governor. This code is
intentionally dumb.

Note that this "governor" is event-driven. There is no polling loop to
check cpu idle time nor any other method which is unsynchronized with
the scheduler.
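
For reference, the scheduler-side hook that feeds this governor is wired up by
a later patch in the series. A rough sketch of what such a call site could look
like is below; the cpu_util() helper, the headroom margin and the hook point
are illustrative assumptions, not part of this patch:

/*
 * Illustrative fragment only -- not part of this patch.
 * cpu_util() is a hypothetical helper returning the PELT-based
 * utilization of @cpu in the range [0 .. SCHED_CAPACITY_SCALE].
 */
static void example_cfs_capacity_request(int cpu)
{
	unsigned long util = cpu_util(cpu);
	unsigned long req;

	/* leave ~25% headroom above the current utilization */
	req = util + (util >> 2);
	if (req > SCHED_CAPACITY_SCALE)
		req = SCHED_CAPACITY_SCALE;

	/*
	 * The governor turns this into a frequency with
	 * freq = req * policy->max >> SCHED_CAPACITY_SHIFT and honors
	 * the max request across the frequency domain.
	 */
	cpufreq_sched_set_cap(cpu, req);
}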

Thanks to Juri Lelli <juri.lelli@arm.com> for contributing design ideas,
code and test results.

[0] http://article.gmane.org/gmane.linux.kernel/1499836

Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
---
 drivers/cpufreq/Kconfig      |  24 ++++
 include/linux/cpufreq.h      |   3 +
 kernel/sched/Makefile        |   1 +
 kernel/sched/cpufreq_sched.c | 308 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h         |   8 ++
 5 files changed, 344 insertions(+)
 create mode 100644 kernel/sched/cpufreq_sched.c

Comments

Michael Turquette July 8, 2015, 3:09 p.m. UTC | #1
Quoting Morten Rasmussen (2015-07-07 11:24:21)
> From: Michael Turquette <mturquette@baylibre.com>
> 
> Scheduler-driven cpu frequency selection is desirable as part of the
> on-going effort to make the scheduler better aware of energy
> consumption.  No piece of the Linux kernel has a better view of the
> factors that affect a cpu frequency selection policy than the
> scheduler[0], and this patch is an attempt to converge on an initial
> solution.
> 
> This patch implements a simple shim layer between the Linux scheduler
> and the cpufreq subsystem. This interface accepts a capacity request
> from the Completely Fair Scheduler and honors the max request from all
> cpus in the same frequency domain.
> 
> The policy magic comes from choosing the cpu capacity request from cfs
> and is not contained in this cpufreq governor. This code is
> intentionally dumb.
> 
> Note that this "governor" is event-driven. There is no polling loop to
> check cpu idle time nor any other method which is unsynchronized with
> the scheduler.
> 
> Thanks to Juri Lelli <juri.lelli@arm.com> for contributing design ideas,
> code and test results.
> 
> [0] http://article.gmane.org/gmane.linux.kernel/1499836
> 
> Signed-off-by: Michael Turquette <mturquette@baylibre.com>
> Signed-off-by: Juri Lelli <juri.lelli@arm.com>

Hi Morten,

I believe your sign-off is needed here as well.

Regards,
Mike

> ---
>  drivers/cpufreq/Kconfig      |  24 ++++
>  include/linux/cpufreq.h      |   3 +
>  kernel/sched/Makefile        |   1 +
>  kernel/sched/cpufreq_sched.c | 308 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h         |   8 ++
>  5 files changed, 344 insertions(+)
>  create mode 100644 kernel/sched/cpufreq_sched.c
> 
> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> index 659879a..9bbf44c 100644
> --- a/drivers/cpufreq/Kconfig
> +++ b/drivers/cpufreq/Kconfig
> @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
>           Be aware that not all cpufreq drivers support the conservative
>           governor. If unsure have a look at the help section of the
>           driver. Fallback governor will be the performance governor.
> +
> +config CPU_FREQ_DEFAULT_GOV_SCHED
> +       bool "sched"
> +       select CPU_FREQ_GOV_SCHED
> +       select CPU_FREQ_GOV_PERFORMANCE
> +       help
> +         Use the CPUfreq governor 'sched' as default. This scales
> +         cpu frequency from the scheduler as per-entity load tracking
> +         statistics are updated.
>  endchoice
>  
>  config CPU_FREQ_GOV_PERFORMANCE
> @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
>  
>           If in doubt, say N.
>  
> +config CPU_FREQ_GOV_SCHED
> +       tristate "'sched' cpufreq governor"
> +       depends on CPU_FREQ
> +       select CPU_FREQ_GOV_COMMON
> +       help
> +         'sched' - this governor scales cpu frequency from the
> +         scheduler as a function of cpu capacity utilization. It does
> +         not evaluate utilization on a periodic basis (as ondemand
> +         does) but instead is invoked from the completely fair
> +         scheduler when updating per-entity load tracking statistics.
> +         Latency to respond to changes in load is improved over polling
> +         governors due to its event-driven design.
> +
> +         If in doubt, say N.
> +
>  comment "CPU frequency scaling drivers"
>  
>  config CPUFREQ_DT
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 1f2c9a1..30241c9 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>  extern struct cpufreq_governor cpufreq_gov_conservative;
>  #define CPUFREQ_DEFAULT_GOVERNOR       (&cpufreq_gov_conservative)
> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
> +extern struct cpufreq_governor cpufreq_gov_sched_gov;
> +#define CPUFREQ_DEFAULT_GOVERNOR       (&cpufreq_gov_sched)
>  #endif
>  
>  /*********************************************************************
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index 6768797..90ed832 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
>  obj-$(CONFIG_SCHEDSTATS) += stats.o
>  obj-$(CONFIG_SCHED_DEBUG) += debug.o
>  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
> +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
> diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
> new file mode 100644
> index 0000000..5020f24
> --- /dev/null
> +++ b/kernel/sched/cpufreq_sched.c
> @@ -0,0 +1,308 @@
> +/*
> + *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/module.h>
> +#include <linux/kthread.h>
> +#include <linux/percpu.h>
> +#include <linux/irq_work.h>
> +
> +#include "sched.h"
> +
> +#define THROTTLE_NSEC          50000000 /* 50ms default */
> +
> +static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
> +static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
> +
> +/**
> + * gov_data - per-policy data internal to the governor
> + * @throttle: next throttling period expiry. Derived from throttle_nsec
> + * @throttle_nsec: throttle period length in nanoseconds
> + * @task: worker thread for dvfs transition that may block/sleep
> + * @irq_work: callback used to wake up worker thread
> + * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
> + *
> + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
> + * per-policy instance of it is created when the cpufreq_sched governor receives
> + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
> + * member of struct cpufreq_policy.
> + *
> + * Readers of this data must call down_read(policy->rwsem). Writers must
> + * call down_write(policy->rwsem).
> + */
> +struct gov_data {
> +       ktime_t throttle;
> +       unsigned int throttle_nsec;
> +       struct task_struct *task;
> +       struct irq_work irq_work;
> +       struct cpufreq_policy *policy;
> +       unsigned int freq;
> +};
> +
> +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
> +{
> +       struct gov_data *gd = policy->governor_data;
> +
> +       /* avoid race with cpufreq_sched_stop */
> +       if (!down_write_trylock(&policy->rwsem))
> +               return;
> +
> +       __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
> +
> +       gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
> +       up_write(&policy->rwsem);
> +}
> +
> +/*
> + * we pass in struct cpufreq_policy. This is safe because changing out the
> + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
> + * which tears down all of the data structures and __cpufreq_governor(policy,
> + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
> + * new policy pointer
> + */
> +static int cpufreq_sched_thread(void *data)
> +{
> +       struct sched_param param;
> +       struct cpufreq_policy *policy;
> +       struct gov_data *gd;
> +       int ret;
> +
> +       policy = (struct cpufreq_policy *) data;
> +       if (!policy) {
> +               pr_warn("%s: missing policy\n", __func__);
> +               do_exit(-EINVAL);
> +       }
> +
> +       gd = policy->governor_data;
> +       if (!gd) {
> +               pr_warn("%s: missing governor data\n", __func__);
> +               do_exit(-EINVAL);
> +       }
> +
> +       param.sched_priority = 50;
> +       ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
> +       if (ret) {
> +               pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
> +               do_exit(-EINVAL);
> +       } else {
> +               pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
> +                               __func__, gd->task->pid);
> +       }
> +
> +       ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
> +       if (ret) {
> +               pr_warn("%s: failed to set allowed ptr\n", __func__);
> +               do_exit(-EINVAL);
> +       }
> +
> +       /* main loop of the per-policy kthread */
> +       do {
> +               set_current_state(TASK_INTERRUPTIBLE);
> +               schedule();
> +               if (kthread_should_stop())
> +                       break;
> +
> +               cpufreq_sched_try_driver_target(policy, gd->freq);
> +       } while (!kthread_should_stop());
> +
> +       do_exit(0);
> +}
> +
> +static void cpufreq_sched_irq_work(struct irq_work *irq_work)
> +{
> +       struct gov_data *gd;
> +
> +       gd = container_of(irq_work, struct gov_data, irq_work);
> +       if (!gd) {
> +               return;
> +       }
> +
> +       wake_up_process(gd->task);
> +}
> +
> +/**
> + * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values
> + * @cpu: cpu whose capacity utilization has recently changed
> + * @capacity: the new capacity requested by cpu
> + *
> + * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
> + * that the scheduler may inform the governor of updates to capacity
> + * utilization and make changes to cpu frequency. Currently this interface is
> + * designed around PELT values in CFS. It can be expanded to other scheduling
> + * classes in the future if needed.
> + *
> + * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
> + * wakes up the thread that does the actual work, cpufreq_sched_thread.
> + *
> + * This functions bails out early if either condition is true:
> + * 1) this cpu did not request the new maximum capacity for its frequency domain
> + * 2) no change in cpu frequency is necessary to meet the new capacity request
> + */
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +       unsigned int freq_new, cpu_tmp;
> +       struct cpufreq_policy *policy;
> +       struct gov_data *gd;
> +       unsigned long capacity_max = 0;
> +
> +       /* update per-cpu capacity request */
> +       __this_cpu_write(pcpu_capacity, capacity);
> +
> +       policy = cpufreq_cpu_get(cpu);
> +       if (IS_ERR_OR_NULL(policy)) {
> +               return;
> +       }
> +
> +       if (!policy->governor_data)
> +               goto out;
> +
> +       gd = policy->governor_data;
> +
> +       /* bail early if we are throttled */
> +       if (ktime_before(ktime_get(), gd->throttle))
> +               goto out;
> +
> +       /* find max capacity requested by cpus in this policy */
> +       for_each_cpu(cpu_tmp, policy->cpus)
> +               capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
> +
> +       /*
> +        * We only change frequency if this cpu's capacity request represents a
> +        * new max. If another cpu has requested a capacity greater than the
> +        * previous max then we rely on that cpu to hit this code path and make
> +        * the change. IOW, the cpu with the new max capacity is responsible
> +        * for setting the new capacity/frequency.
> +        *
> +        * If this cpu is not the new maximum then bail
> +        */
> +       if (capacity_max > capacity)
> +               goto out;
> +
> +       /* Convert the new maximum capacity request into a cpu frequency */
> +       freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
> +
> +       /* No change in frequency? Bail and return current capacity. */
> +       if (freq_new == policy->cur)
> +               goto out;
> +
> +       /* store the new frequency and perform the transition */
> +       gd->freq = freq_new;
> +
> +       if (cpufreq_driver_might_sleep())
> +               irq_work_queue_on(&gd->irq_work, cpu);
> +       else
> +               cpufreq_sched_try_driver_target(policy, freq_new);
> +
> +out:
> +       cpufreq_cpu_put(policy);
> +       return;
> +}
> +
> +static int cpufreq_sched_start(struct cpufreq_policy *policy)
> +{
> +       struct gov_data *gd;
> +       int cpu;
> +
> +       /* prepare per-policy private data */
> +       gd = kzalloc(sizeof(*gd), GFP_KERNEL);
> +       if (!gd) {
> +               pr_debug("%s: failed to allocate private data\n", __func__);
> +               return -ENOMEM;
> +       }
> +
> +       /* initialize per-cpu data */
> +       for_each_cpu(cpu, policy->cpus) {
> +               per_cpu(pcpu_capacity, cpu) = 0;
> +               per_cpu(pcpu_policy, cpu) = policy;
> +       }
> +
> +       /*
> +        * Don't ask for freq changes at a higher rate than what
> +        * the driver advertises as transition latency.
> +        */
> +       gd->throttle_nsec = policy->cpuinfo.transition_latency ?
> +                           policy->cpuinfo.transition_latency :
> +                           THROTTLE_NSEC;
> +       pr_debug("%s: throttle threshold = %u [ns]\n",
> +                 __func__, gd->throttle_nsec);
> +
> +       if (cpufreq_driver_might_sleep()) {
> +               /* init per-policy kthread */
> +               gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
> +               if (IS_ERR_OR_NULL(gd->task)) {
> +                       pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
> +                       goto err;
> +               }
> +               init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
> +       }
> +
> +       policy->governor_data = gd;
> +       gd->policy = policy;
> +       return 0;
> +
> +err:
> +       kfree(gd);
> +       return -ENOMEM;
> +}
> +
> +static int cpufreq_sched_stop(struct cpufreq_policy *policy)
> +{
> +       struct gov_data *gd = policy->governor_data;
> +
> +       if (cpufreq_driver_might_sleep()) {
> +               kthread_stop(gd->task);
> +       }
> +
> +       policy->governor_data = NULL;
> +
> +       /* FIXME replace with devm counterparts? */
> +       kfree(gd);
> +       return 0;
> +}
> +
> +static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
> +{
> +       switch (event) {
> +               case CPUFREQ_GOV_START:
> +                       /* Start managing the frequency */
> +                       return cpufreq_sched_start(policy);
> +
> +               case CPUFREQ_GOV_STOP:
> +                       return cpufreq_sched_stop(policy);
> +
> +               case CPUFREQ_GOV_LIMITS:        /* unused */
> +               case CPUFREQ_GOV_POLICY_INIT:   /* unused */
> +               case CPUFREQ_GOV_POLICY_EXIT:   /* unused */
> +                       break;
> +       }
> +       return 0;
> +}
> +
> +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
> +static
> +#endif
> +struct cpufreq_governor cpufreq_gov_sched = {
> +       .name                   = "sched",
> +       .governor               = cpufreq_sched_setup,
> +       .owner                  = THIS_MODULE,
> +};
> +
> +static int __init cpufreq_sched_init(void)
> +{
> +       return cpufreq_register_governor(&cpufreq_gov_sched);
> +}
> +
> +static void __exit cpufreq_sched_exit(void)
> +{
> +       cpufreq_unregister_governor(&cpufreq_gov_sched);
> +}
> +
> +/* Try to make this the default governor */
> +fs_initcall(cpufreq_sched_init);
> +
> +MODULE_LICENSE("GPL v2");
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c395559..30aa0c4 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1476,6 +1476,13 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
>  }
>  #endif
>  
> +#ifdef CONFIG_CPU_FREQ_GOV_SCHED
> +void cpufreq_sched_set_cap(int cpu, unsigned long util);
> +#else
> +static inline void cpufreq_sched_set_cap(int cpu, unsigned long util)
> +{ }
> +#endif
> +
>  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
>  {
>         rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
> @@ -1484,6 +1491,7 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
>  #else
>  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
>  static inline void sched_avg_update(struct rq *rq) { }
> +static inline void gov_cfs_update_cpu(int cpu) {}
>  #endif
>  
>  extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
> -- 
> 1.9.1
> 
Leo Yan Aug. 11, 2015, 2:14 a.m. UTC | #2
On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
> From: Michael Turquette <mturquette@baylibre.com>
> 
> [...]
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 1f2c9a1..30241c9 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>  extern struct cpufreq_governor cpufreq_gov_conservative;
>  #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)

s/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED/

> +extern struct cpufreq_governor cpufreq_gov_sched_gov;

s/cpufreq_gov_sched_gov/cpufreq_gov_sched/

> +#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
>  #endif
> [...]
Juri Lelli Aug. 11, 2015, 8:59 a.m. UTC | #3
Hi,

On 11/08/15 03:14, Leo Yan wrote:
> On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
>> From: Michael Turquette <mturquette@baylibre.com>
>> [...]
>> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
>> index 1f2c9a1..30241c9 100644
>> --- a/include/linux/cpufreq.h
>> +++ b/include/linux/cpufreq.h
>> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>>  extern struct cpufreq_governor cpufreq_gov_conservative;
>>  #define CPUFREQ_DEFAULT_GOVERNOR     (&cpufreq_gov_conservative)
>> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
> 
> s/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED/
> 
>> +extern struct cpufreq_governor cpufreq_gov_sched_gov;
> 
> s/cpufreq_gov_sched_gov/cpufreq_gov_sched/
> 

Yes, right. Dietmar pointed out the same problem in reply to Mike's
original posting. I guess Mike is going to squash the fix in his
next posting.

Thanks a lot anyway! :)

Best,

- Juri

>> [...]
Peter Zijlstra Aug. 15, 2015, 12:35 p.m. UTC | #4
On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
> diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
> new file mode 100644
> index 0000000..5020f24
> --- /dev/null
> +++ b/kernel/sched/cpufreq_sched.c
> @@ -0,0 +1,308 @@
> +/*
> + *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/module.h>
> +#include <linux/kthread.h>
> +#include <linux/percpu.h>
> +#include <linux/irq_work.h>
> +
> +#include "sched.h"
> +
> +#define THROTTLE_NSEC		50000000 /* 50ms default */
> +
> +static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
> +static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
> +
> +/**
> + * gov_data - per-policy data internal to the governor
> + * @throttle: next throttling period expiry. Derived from throttle_nsec
> + * @throttle_nsec: throttle period length in nanoseconds
> + * @task: worker thread for dvfs transition that may block/sleep
> + * @irq_work: callback used to wake up worker thread
> + * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
> + *
> + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
> + * per-policy instance of it is created when the cpufreq_sched governor receives
> + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
> + * member of struct cpufreq_policy.
> + *
> + * Readers of this data must call down_read(policy->rwsem). Writers must
> + * call down_write(policy->rwsem).
> + */
> +struct gov_data {
> +	ktime_t throttle;
> +	unsigned int throttle_nsec;
> +	struct task_struct *task;
> +	struct irq_work irq_work;
> +	struct cpufreq_policy *policy;
> +	unsigned int freq;
> +};
> +
> +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
> +{
> +	struct gov_data *gd = policy->governor_data;
> +
> +	/* avoid race with cpufreq_sched_stop */
> +	if (!down_write_trylock(&policy->rwsem))
> +		return;
> +
> +	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
> +
> +	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
> +	up_write(&policy->rwsem);
> +}

That locking truly is disgusting.. why can't we change that?

> +static int cpufreq_sched_thread(void *data)
> +{

> +
> +	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);

That's not sufficient, you really want to have called kthread_bind() on
these threads, otherwise userspace can change affinity on you.
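
A minimal sketch of that suggestion for cpufreq_sched_start(), assuming
kthread_bind_mask() is available in the target tree (other names as in the
patch):

	gd->task = kthread_create(cpufreq_sched_thread, policy,
				  "kcpufreq_sched_task");
	if (IS_ERR_OR_NULL(gd->task))
		goto err;

	/* binding sets PF_NO_SETAFFINITY, so userspace cannot move it */
	kthread_bind_mask(gd->task, policy->related_cpus);
	wake_up_process(gd->task);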

> +
> +	do_exit(0);

I thought kthreads only needed to return...

> +}
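
i.e. a sketch of the tail of cpufreq_sched_thread() without do_exit():

	} while (!kthread_should_stop());

	return 0;	/* kthread_stop() collects this return value */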

> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +	unsigned int freq_new, cpu_tmp;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	unsigned long capacity_max = 0;
> +
> +	/* update per-cpu capacity request */
> +	__this_cpu_write(pcpu_capacity, capacity);
> +
> +	policy = cpufreq_cpu_get(cpu);

So this does a down_read_trylock(&cpufreq_rwsem) and a
read_lock_irqsave(&cpufreq_driver_lock), all while holding scheduler
locks.

> +	if (cpufreq_driver_might_sleep())
> +		irq_work_queue_on(&gd->irq_work, cpu);
> +	else
> +		cpufreq_sched_try_driver_target(policy, freq_new);

This will then do a down_write_trylock(&policy->rwsem)

> +
> +out:
> +	cpufreq_cpu_put(policy);

> +	return;
> +}

That is just insane... surely we can replace all that with a wee bit of
RCU logic.

So something like:

DEFINE_MUTEX(cpufreq_mutex);
struct cpufreq_driver *cpufreq_driver;

struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu)
{
	struct cpufreq_driver *driver;
	struct cpufreq_policy *policy;

	rcu_read_lock();
	driver = rcu_dereference(cpufreq_driver);
	if (!driver)
		goto err;

	policy = per_cpu_ptr(driver->policy, cpu);
	if (!policy)
		goto err;

	return policy;

err:
	rcu_read_unlock();
	return NULL;
}


void cpufreq_cpu_put(struct cpufreq_policy *policy)
{
	rcu_read_unlock();
}



void cpufreq_set_driver(struct cpufreq_driver *driver)
{
	mutex_lock(&cpufreq_mutex);

	rcu_assign_pointer(cpufreq_driver, NULL);

	/*
	 * Wait for everyone to observe the lack of driver; iow. until
	 * its unused.
	 */
	synchronize_rcu();

	/*
	 * Now that ye olde driver be gone, install a new one.
	 */
	if (driver)
		rcu_assign_pointer(cpufreq_driver, driver);

	mutex_unlock(&cpufreq_mutex);
}


No need for cpufreq_rwsem or cpufreq_driver_lock..


Hmm?
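
For illustration, a hot-path caller under that proposed scheme (the API above
is a proposal, not existing code) would pair the calls like this:

	policy = cpufreq_cpu_get(cpu);	/* rcu_read_lock() held on success */
	if (!policy)
		return;

	/* ... evaluate the capacity request and kick the worker ... */

	cpufreq_cpu_put(policy);	/* rcu_read_unlock() */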
Peter Zijlstra Aug. 15, 2015, 1:05 p.m. UTC | #5
On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +	unsigned int freq_new, cpu_tmp;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	unsigned long capacity_max = 0;
> +
> +	/* update per-cpu capacity request */
> +	__this_cpu_write(pcpu_capacity, capacity);
> +
> +	policy = cpufreq_cpu_get(cpu);
> +	if (IS_ERR_OR_NULL(policy)) {
> +		return;
> +	}
> +
> +	if (!policy->governor_data)
> +		goto out;
> +
> +	gd = policy->governor_data;
> +
> +	/* bail early if we are throttled */
> +	if (ktime_before(ktime_get(), gd->throttle))
> +		goto out;

Isn't this the wrong place to throttle? Suppose you're getting multiple
new tasks placed on this CPU, the first one would trigger this callback
and start increasing freq..

While we're still changing freq. (and therefore throttled), another task
comes in which would again raise the freq.

With this scheme you lose the latter freq. change and will not
re-evaluate.

Any scheme that limits the callbacks to the actual hardware will have to
buffer requests and once the hardware returns (be it through an
interrupt or timeout) issue the latest request.
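
A minimal sketch of that buffering, assuming a 'pending_freq' field added to
struct gov_data (not part of the posted patch; locking and throttling elided):

static void gov_request_freq(struct gov_data *gd, unsigned int freq, int cpu)
{
	WRITE_ONCE(gd->pending_freq, freq);	/* always keep the newest request */
	irq_work_queue_on(&gd->irq_work, cpu);	/* no-op if one is already queued */
}

/* worker side: keep issuing until the latest request has been honoured */
static void gov_issue_latest(struct gov_data *gd)
{
	unsigned int freq;

	do {
		freq = READ_ONCE(gd->pending_freq);
		__cpufreq_driver_target(gd->policy, freq, CPUFREQ_RELATION_L);
	} while (freq != READ_ONCE(gd->pending_freq));
}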

Juri Lelli Aug. 25, 2015, 10:45 a.m. UTC | #6
Hi Peter,

On 15/08/15 14:05, Peter Zijlstra wrote:
> On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
>> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
>> +{
>> +	unsigned int freq_new, cpu_tmp;
>> +	struct cpufreq_policy *policy;
>> +	struct gov_data *gd;
>> +	unsigned long capacity_max = 0;
>> +
>> +	/* update per-cpu capacity request */
>> +	__this_cpu_write(pcpu_capacity, capacity);
>> +
>> +	policy = cpufreq_cpu_get(cpu);
>> +	if (IS_ERR_OR_NULL(policy)) {
>> +		return;
>> +	}
>> +
>> +	if (!policy->governor_data)
>> +		goto out;
>> +
>> +	gd = policy->governor_data;
>> +
>> +	/* bail early if we are throttled */
>> +	if (ktime_before(ktime_get(), gd->throttle))
>> +		goto out;
> 
> Isn't this the wrong place to throttle? Suppose you're getting multiple
> new tasks placed on this CPU, the first one would trigger this callback
> and start increasing freq..
> 
> While we're still changing freq. (and therefore throttled), another task
> comes in which would again raise the freq.
> 
> With this scheme you loose the latter freq. change and will not
> re-evaluate.
> 

The way the policy is implemented, you should not have this problem.
For new tasks you actually jump to max freq, as a new task's util gets
initialized to 1024. For load balancing migrations we wait until
all the tasks are migrated and then we trigger an update.
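
For reference, with the mapping the patch uses later
(freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT), a request of
1024 (SCHED_CAPACITY_SCALE) does resolve to the top frequency:

	freq_new = 1024 * policy->max >> 10;	/* == policy->max */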

> Any scheme that limits the callbacks to the actual hardware will have to
> buffer requests and once the hardware returns (be it through an
> interrupt or timeout) issue the latest request.
>

But, it is true that if the above events happened the other way around
(we trigger an update after load balancing and a new task arrives), we
may miss the opportunity to jump to max with the new task. In my mind
this is probably not a big deal, as we'll have a tick pretty soon that
will fix things anyway (saving us some complexity in the backend).

What do you think?

Thanks,

- Juri

Juri Lelli Sept. 4, 2015, 1:27 p.m. UTC | #7
On 15/08/15 13:35, Peter Zijlstra wrote:
> On Tue, Jul 07, 2015 at 07:24:21PM +0100, Morten Rasmussen wrote:
>> diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
>> new file mode 100644
>> index 0000000..5020f24
>> --- /dev/null
>> +++ b/kernel/sched/cpufreq_sched.c
>> @@ -0,0 +1,308 @@
>> +/*
>> + *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/cpufreq.h>
>> +#include <linux/module.h>
>> +#include <linux/kthread.h>
>> +#include <linux/percpu.h>
>> +#include <linux/irq_work.h>
>> +
>> +#include "sched.h"
>> +
>> +#define THROTTLE_NSEC		50000000 /* 50ms default */
>> +
>> +static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
>> +static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
>> +
>> +/**
>> + * gov_data - per-policy data internal to the governor
>> + * @throttle: next throttling period expiry. Derived from throttle_nsec
>> + * @throttle_nsec: throttle period length in nanoseconds
>> + * @task: worker thread for dvfs transition that may block/sleep
>> + * @irq_work: callback used to wake up worker thread
>> + * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
>> + *
>> + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
>> + * per-policy instance of it is created when the cpufreq_sched governor receives
>> + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
>> + * member of struct cpufreq_policy.
>> + *
>> + * Readers of this data must call down_read(policy->rwsem). Writers must
>> + * call down_write(policy->rwsem).
>> + */
>> +struct gov_data {
>> +	ktime_t throttle;
>> +	unsigned int throttle_nsec;
>> +	struct task_struct *task;
>> +	struct irq_work irq_work;
>> +	struct cpufreq_policy *policy;
>> +	unsigned int freq;
>> +};
>> +
>> +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
>> +{
>> +	struct gov_data *gd = policy->governor_data;
>> +
>> +	/* avoid race with cpufreq_sched_stop */
>> +	if (!down_write_trylock(&policy->rwsem))
>> +		return;
>> +
>> +	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
>> +
>> +	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
>> +	up_write(&policy->rwsem);
>> +}
> 
> That locking truly is disgusting.. why can't we change that?
> 
>> +static int cpufreq_sched_thread(void *data)
>> +{
> 
>> +
>> +	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
> 
> That's not sufficient, you really want to have called kthread_bind() on
> these threads, otherwise userspace can change affinity on you.
> 
>> +
>> +	do_exit(0);
> 
> I thought kthreads only needed to return...
> 
>> +}
> 
>> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
>> +{
>> +	unsigned int freq_new, cpu_tmp;
>> +	struct cpufreq_policy *policy;
>> +	struct gov_data *gd;
>> +	unsigned long capacity_max = 0;
>> +
>> +	/* update per-cpu capacity request */
>> +	__this_cpu_write(pcpu_capacity, capacity);
>> +
>> +	policy = cpufreq_cpu_get(cpu);
> 
> So this does a down_read_trylock(&cpufreq_rwsem) and a
> read_lock_irqsave(&cpufreq_driver_lock), all while holding scheduler
> locks.
> 
>> +	if (cpufreq_driver_might_sleep())
>> +		irq_work_queue_on(&gd->irq_work, cpu);
>> +	else
>> +		cpufreq_sched_try_driver_target(policy, freq_new);
> 
> This will then do a down_write_trylock(&policy->rwsem)
> 
>> +
>> +out:
>> +	cpufreq_cpu_put(policy);
> 
>> +	return;
>> +}
> 
> That is just insane... surely we can replace all that with a wee bit of
> RCU logic.
> 
> So something like:
> 
> DEFINE_MUTEX(cpufreq_mutex);
> struct cpufreq_driver *cpufreq_driver;
> 
> struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu)
> {
> 	struct cpufreq_driver *driver;
> 	struct cpufreq_policy *policy;
> 
> 	rcu_read_lock();
> 	driver = rcu_dereference(cpufreq_driver);
> 	if (!driver)
> 		goto err;
> 
> 	policy = per_cpu_ptr(driver->policy, cpu);
> 	if (!policy)
> 		goto err;
> 
> 	return policy;
> 
> err:
> 	rcu_read_unlock();
> 	return NULL;
> }
> 
> 
> void cpufreq_cpu_put(struct cpufreq_policy *policy)
> {
> 	rcu_read_unlock();
> }
> 
> 
> 
> void cpufreq_set_driver(struct cpufreq_driver *driver)
> {
> 	mutex_lock(&cpufreq_mutex);
> 
> 	rcu_assign_pointer(cpufreq_driver, NULL);
> 
> 	/*
> 	 * Wait for everyone to observe the lack of driver; iow. until
> 	 * its unused.
> 	 */
> 	synchronize_rcu();
> 
> 	/*
> 	 * Now that ye olde driver be gone, install a new one.
> 	 */
> 	if (driver)
> 		rcu_assign_pointer(cpufreq_driver, driver);
> 
> 	mutex_unlock(&cpufreq_mutex);
> }
> 
> 
> No need for cpufreq_rwsem or cpufreq_driver_lock..
> 
> 
> Hmm?
> 

So, just to recall what we discussed at LPC (I have Mike's slides
at hand :-)). It seems that key points are:

1- we agreed that locking in the cpufreq core has to change, as we
   have to access it from scheduler hot-paths; what Peter is
   proposing above looks viable to me; what do others (way more
   confident than me with cpufreq internals) say?

2- the interface has to be extended, as we have to let other
   scheduling classes drive freq selection too; I guess that how
   we do aggregation depends on the nature of the sched classes,
   but we didn't really reach any sort of agreement here; is
   this something we can anyway focus on after fixing the locking?

3- the interface should also support peripheral devices; this
   seems an interesting feature to have, but how about we postpone
   it until we've got the previous points right?

Did I miss anything crucial? :-)

Best,

- Juri

Juri Lelli Sept. 14, 2015, 3:57 p.m. UTC | #8
On 04/09/15 14:27, Juri Lelli wrote:
> On 15/08/15 13:35, Peter Zijlstra wrote:
>>
>> [...]
>>
> 
> So, just to recall what we discussed at LPC (I have Mike's slides
> at hand :-)). It seems that key points are:
> 
> 1- we agreed that locking in cpufreq core has to change as we
>    have to access it from scheduler hot-paths; what Peter is
>    proposing above looks viable to me, what others (way more
>    confident then me with cpufreq inners) say?
> 
> 2- the interface has to be extended as we have to let other
>    scheduling classes drive freq selection too; I guess that how
>    we do aggregation depends on the nature of sched classes,
>    but we didn't really reach any sort of agreement here; is
>    this anyway something we can focus on after fixing locking?
> 
> 3- the interface should also support peripheral devices; this
>    seems a interesting feature to have, but how about we postpone
>    it after we've got previous points right?
> 
> What did I miss of crucial? :-)
> 

Hi Mike, others, ping on above points.
Any comments on how we can move forward? :-)

Best,

- Juri

Peter Zijlstra Sept. 15, 2015, 1:45 p.m. UTC | #9
On Mon, Sep 14, 2015 at 04:57:35PM +0100, Juri Lelli wrote:
> On 04/09/15 14:27, Juri Lelli wrote:
> > So, just to recall what we discussed at LPC (I have Mike's slides
> > at hand :-)). It seems that key points are:
> > 
> > 1- we agreed that locking in cpufreq core has to change as we
> >    have to access it from scheduler hot-paths; what Peter is
> >    proposing above looks viable to me, what others (way more
> >    confident then me with cpufreq inners) say?

Rafael had some thoughts IIRC.

> > 2- the interface has to be extended as we have to let other
> >    scheduling classes drive freq selection too; I guess that how
> >    we do aggregation depends on the nature of sched classes,
> >    but we didn't really reach any sort of agreement here; is
> >    this anyway something we can focus on after fixing locking?

Right, that's going to be interesting. Reading through that SchedTune
thread has been educational (I really had no clue what it was on about
on initial reading; the on-going discussion clarified a lot).

It seems that even the fair class might want to provide minimal hints
due to that 'boost' / 'interactive' nonsense.

> > 3- the interface should also support peripheral devices; this
> >    seems a interesting feature to have, but how about we postpone
> >    it after we've got previous points right?

Agreed, that's a can of worms :-) Better start with the 'simple' things.

That said; ISTR a patch set on this topic recently.

  lkml.kernel.org/r/1441904972-5809-1-git-send-email-javi.merino@arm.com
Juri Lelli Sept. 15, 2015, 4:22 p.m. UTC | #10
On 15/09/15 14:45, Peter Zijlstra wrote:
> On Mon, Sep 14, 2015 at 04:57:35PM +0100, Juri Lelli wrote:
>> On 04/09/15 14:27, Juri Lelli wrote:
>>> So, just to recall what we discussed at LPC (I have Mike's slides
>>> at hand :-)). It seems that key points are:
>>>
>>> 1- we agreed that locking in cpufreq core has to change as we
>>>    have to access it from scheduler hot-paths; what Peter is
>>>    proposing above looks viable to me, what others (way more
>>>    confident then me with cpufreq inners) say?
> 
> Rafael had some thoughts IIRC.
>

Yep, right. Rafael, could you please refresh our memory here? Do
you foresee any problem with going the way Peter suggested?

>>> 2- the interface has to be extended as we have to let other
>>>    scheduling classes drive freq selection too; I guess that how
>>>    we do aggregation depends on the nature of sched classes,
>>>    but we didn't really reach any sort of agreement here; is
>>>    this anyway something we can focus on after fixing locking?
> 
> Right, that's going to be interesting. Reading through that SchedTune
> thread has been educational (I really had no clue what it was on about
> on initial reading, the discussion that's on-going clarified a lot).
> 

That's good stuff yes.

> It seems that even the fair class might want to provide minimal hints
> due to that 'boost' / 'interactive' nonsense.
> 

That's basically what we are doing with the SchedDVFS + SchedTune thing.
Sometimes it is helpful to be able to fake signals in order to get better
performance. The current interface already allows us to do that, but
we'll have to figure out how this need interacts with the other classes
as well (this still being best effort, while others have QoS
requirements).
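
One very rough sketch of what per-class aggregation could look like (the
struct and the sum-then-cap policy are hypothetical, just to illustrate the
open question; nothing like this exists in the posted patch):

struct capacity_request {
	unsigned long cfs;	/* best effort, PELT based */
	unsigned long rt;
	unsigned long dl;	/* e.g. derived from the DL bandwidth reservation */
};

static unsigned long aggregate_capacity(const struct capacity_request *req)
{
	/* simplest possible policy: honour the sum, capped at full capacity */
	return min(req->cfs + req->rt + req->dl,
		   (unsigned long)SCHED_CAPACITY_SCALE);
}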

>>> 3- the interface should also support peripheral devices; this
>>>    seems a interesting feature to have, but how about we postpone
>>>    it after we've got previous points right?
> 
> Agreed, that's a can of worms :-) Better start with the 'simple' things.
> 

Good. :-)

> That said; ISTR a patch set on this topic recently.
> 
>   lkml.kernel.org/r/1441904972-5809-1-git-send-email-javi.merino@arm.com
> 

Yep, that has to be factored in as well eventually; we are actually
trying to keep that in mind as we proceed further with development
(Javi sits just a desk away from me ;-)), but, as you said, simpler things first.

Thanks,

- Juri

Punit Agrawal Sept. 28, 2015, 4:48 p.m. UTC | #11
Hi Mike,

I ran into an issue when using this patch. Posting it here as this is
the latest posting I can find.

Morten Rasmussen <morten.rasmussen@arm.com> writes:

> From: Michael Turquette <mturquette@baylibre.com>
>
> Scheduler-driven cpu frequency selection is desirable as part of the
> on-going effort to make the scheduler better aware of energy
> consumption.  No piece of the Linux kernel has a better view of the
> factors that affect a cpu frequency selection policy than the
> scheduler[0], and this patch is an attempt to converge on an initial
> solution.
>
> This patch implements a simple shim layer between the Linux scheduler
> and the cpufreq subsystem. This interface accepts a capacity request
> from the Completely Fair Scheduler and honors the max request from all
> cpus in the same frequency domain.
>
> The policy magic comes from choosing the cpu capacity request from cfs
> and is not contained in this cpufreq governor. This code is
> intentionally dumb.
>
> Note that this "governor" is event-driven. There is no polling loop to
> check cpu idle time nor any other method which is unsynchronized with
> the scheduler.
>
> Thanks to Juri Lelli <juri.lelli@arm.com> for contributing design ideas,
> code and test results.
>
> [0] http://article.gmane.org/gmane.linux.kernel/1499836
>
> Signed-off-by: Michael Turquette <mturquette@baylibre.com>
> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
> ---
>  drivers/cpufreq/Kconfig      |  24 ++++
>  include/linux/cpufreq.h      |   3 +
>  kernel/sched/Makefile        |   1 +
>  kernel/sched/cpufreq_sched.c | 308 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h         |   8 ++
>  5 files changed, 344 insertions(+)
>  create mode 100644 kernel/sched/cpufreq_sched.c
>
> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> index 659879a..9bbf44c 100644
> --- a/drivers/cpufreq/Kconfig
> +++ b/drivers/cpufreq/Kconfig
> @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
>  	  Be aware that not all cpufreq drivers support the conservative
>  	  governor. If unsure have a look at the help section of the
>  	  driver. Fallback governor will be the performance governor.
> +
> +config CPU_FREQ_DEFAULT_GOV_SCHED
> +	bool "sched"
> +	select CPU_FREQ_GOV_SCHED
> +	select CPU_FREQ_GOV_PERFORMANCE
> +	help
> +	  Use the CPUfreq governor 'sched' as default. This scales
> +	  cpu frequency from the scheduler as per-entity load tracking
> +	  statistics are updated.
>  endchoice
>  
>  config CPU_FREQ_GOV_PERFORMANCE
> @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
>  
>  	  If in doubt, say N.
>  
> +config CPU_FREQ_GOV_SCHED
> +	tristate "'sched' cpufreq governor"
> +	depends on CPU_FREQ
> +	select CPU_FREQ_GOV_COMMON
> +	help
> +	  'sched' - this governor scales cpu frequency from the
> +	  scheduler as a function of cpu capacity utilization. It does
> +	  not evaluate utilization on a periodic basis (as ondemand
> +	  does) but instead is invoked from the completely fair
> +	  scheduler when updating per-entity load tracking statistics.
> +	  Latency to respond to changes in load is improved over polling
> +	  governors due to its event-driven design.
> +
> +	  If in doubt, say N.
> +
>  comment "CPU frequency scaling drivers"
>  
>  config CPUFREQ_DT
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 1f2c9a1..30241c9 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>  extern struct cpufreq_governor cpufreq_gov_conservative;
>  #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
> +extern struct cpufreq_governor cpufreq_gov_sched_gov;
> +#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
>  #endif

You have an extra 'gov' in CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV and
cpufreq_gov_sched_gov above.


[...]

Steve Muckle Sept. 29, 2015, 12:26 a.m. UTC | #12
Hi Punit,

On 09/28/2015 09:48 AM, Punit Agrawal wrote:
> Hi Mike,
> 
> I ran into an issue when using this patch. Posting it here as this is
> the latest posting I can find.
> 
> Morten Rasmussen <morten.rasmussen@arm.com> writes:
> 
>> From: Michael Turquette <mturquette@baylibre.com>
>>
...
>> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
>> index 1f2c9a1..30241c9 100644
>> --- a/include/linux/cpufreq.h
>> +++ b/include/linux/cpufreq.h
>> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>>  extern struct cpufreq_governor cpufreq_gov_conservative;
>>  #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
>> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
>> +extern struct cpufreq_governor cpufreq_gov_sched_gov;
>> +#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
>>  #endif
> 
> You have extra 'gov' in CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV and
> cpufreq_gov_sched_gov above.

Thanks, this was pointed out earlier and will be fixed in subsequent
revisions of the patchset.
Steve Muckle Oct. 8, 2015, 12:14 a.m. UTC | #13
On 08/25/2015 03:45 AM, Juri Lelli wrote:
> But, it is true that if the above events happened the other way around
> (we trigger an update after load balancing and a new task arrives), we
> may miss the opportunity to jump to max with the new task. In my mind
> this is probably not a big deal, as we'll have a tick pretty soon that
> will fix things anyway (saving us some complexity in the backend).
> 
> What you think?

I fear that waiting up to a full tick to resolve a shortfall in CPU
bandwidth will cause complaints.

Thinking about how this would be implemented raises a couple questions
for me though.

1. To avoid issuing a frequency change request while one is already in
flight, the current code uses the stated cpufreq driver transition
latency to throttle. Wouldn't it be more accurate to block further
requests until the CPUFREQ_POSTCHANGE notifier has run (see the sketch
after these questions)? In addition to removing the requirement of
supplying a latency value, frequency transitions may take different
amounts of time depending on system state, so a single latency value
may often be incorrect.

2. The decision of whether or not to call into the low level cpufreq
driver in the scheduler hot paths currently hinges on whether or not the
low level cpufreq driver will sleep. Even if the cpufreq driver does not
sleep however, the latency to enqueue a frequency change (and complete
it if the low level driver is not asynchronous) may still be high,
making it unsuitable to run in a scheduler hot path. Should the
semantics of the flag be changed to indicate whether a cpufreq driver is
fast enough to run in this context? Sleeping would still of course mean
that it is not.
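
On question 1, a rough sketch with a transition notifier could look like
this ('trans_nb' and 'in_flight' are hypothetical additions to struct
gov_data, not in the posted patch):

static int gov_trans_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct gov_data *gd = container_of(nb, struct gov_data, trans_nb);

	if (val == CPUFREQ_POSTCHANGE)
		atomic_set(&gd->in_flight, 0);	/* allow the next request */

	return NOTIFY_OK;
}

/* registered once per policy, e.g. from cpufreq_sched_start(): */
	gd->trans_nb.notifier_call = gov_trans_notifier;
	cpufreq_register_notifier(&gd->trans_nb, CPUFREQ_TRANSITION_NOTIFIER);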

Juri Lelli Oct. 8, 2015, 9:41 a.m. UTC | #14
On 08/10/15 01:14, Steve Muckle wrote:
> On 08/25/2015 03:45 AM, Juri Lelli wrote:
>> But, it is true that if the above events happened the other way around
>> (we trigger an update after load balancing and a new task arrives), we
>> may miss the opportunity to jump to max with the new task. In my mind
>> this is probably not a big deal, as we'll have a tick pretty soon that
>> will fix things anyway (saving us some complexity in the backend).
>>
>> What you think?
> 
> I fear that waiting up to a full tick to resolve a shortfall in CPU
> bandwidth will cause complaints.
>

Right, especially now that we'll extend the thing for other classes
as well. So, I guess we'll actually need to buffer requests, as Peter
was already suggesting.

> Thinking about how this would be implemented raises a couple questions
> for me though.
> 
> 1. To avoid issuing a frequency change request while one is already in
> flight, the current code uses the stated cpufreq driver transition
> latency to throttle. Wouldn't it be more accurate to block further
> requests until the CPUFREQ_POSTCHANGE notifier has run? In addition to
> removing the requirement of supplying a latency value, frequency
> transitions may take different amounts of time depending on system state
> so a single latency value may often be incorrect.
> 

Looks good to me.

> 2. The decision of whether or not to call into the low level cpufreq
> driver in the scheduler hot paths currently hinges on whether or not the
> low level cpufreq driver will sleep. Even if the cpufreq driver does not
> sleep however, the latency to enqueue a frequency change (and complete
> it if the low level driver is not asynchronous) may still be high,
> making it unsuitable to run in a scheduler hot path. Should the
> semantics of the flag be changed to indicate whether a cpufreq driver is
> fast enough to run in this context? Sleeping would still of course mean
> that it is not.
> 

Yeah, we assumed that not sleeping means fast. I haven't really played
with this configuration, so I can't say if this is a problem or not.
But I agree with you that, if this is a problem, we could change the
semantics of the flag (maybe it just becomes more general?).

Thanks,

- Juri

diff mbox

Patch

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 659879a..9bbf44c 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@  config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED
+	bool "sched"
+	select CPU_FREQ_GOV_SCHED
+	select CPU_FREQ_GOV_PERFORMANCE
+	help
+	  Use the CPUfreq governor 'sched' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
 endchoice
 
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@  config CPU_FREQ_GOV_CONSERVATIVE
 
 	  If in doubt, say N.
 
+config CPU_FREQ_GOV_SCHED
+	tristate "'sched' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 
 config CPUFREQ_DT
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1f2c9a1..30241c9 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -494,6 +494,9 @@  extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
+extern struct cpufreq_governor cpufreq_gov_sched_gov;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
 #endif
 
 /*********************************************************************
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6768797..90ed832 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 0000000..5020f24
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,308 @@ 
+/*
+ *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+
+static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
+static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @freq: new frequency stored in *_sched_set_cap and used in *_sched_thread
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_sched governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+	unsigned int freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	/* avoid race with cpufreq_sched_stop */
+	if (!down_write_trylock(&policy->rwsem))
+		return;
+
+	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+	up_write(&policy->rwsem);
+}
+
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int cpufreq_sched_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		cpufreq_sched_try_driver_target(policy, gd->freq);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd) {
+		return;
+	}
+
+	wake_up_process(gd->task);
+}
+
+/**
+ * cpufreq_sched_set_cap - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ * @capacity: the new capacity requested by cpu
+ *
+ * cpufreq_sched_set_cap is an interface exposed to the scheduler so
+ * that the scheduler may inform the governor of updates to capacity
+ * utilization and make changes to cpu frequency. Currently this interface is
+ * designed around PELT values in CFS. It can be expanded to other scheduling
+ * classes in the future if needed.
+ *
+ * cpufreq_sched_set_cap raises an IPI. The irq_work handler for that IPI
+ * wakes up the thread that does the actual work, cpufreq_sched_thread.
+ *
+ * This function bails out early if either condition is true:
+ * 1) this cpu did not request the new maximum capacity for its frequency domain
+ * 2) no change in cpu frequency is necessary to meet the new capacity request
+ */
+void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
+{
+	unsigned int freq_new, cpu_tmp;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long capacity_max = 0;
+
+	/* update per-cpu capacity request */
+	__this_cpu_write(pcpu_capacity, capacity);
+
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	/* find max capacity requested by cpus in this policy */
+	for_each_cpu(cpu_tmp, policy->cpus)
+		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
+
+	/*
+	 * We only change frequency if this cpu's capacity request represents a
+	 * new max. If another cpu has requested a capacity greater than the
+	 * previous max then we rely on that cpu to hit this code path and make
+	 * the change. IOW, the cpu with the new max capacity is responsible
+	 * for setting the new capacity/frequency.
+	 *
+	 * If this cpu is not the new maximum then bail
+	 */
+	if (capacity_max > capacity)
+		goto out;
+
+	/* Convert the new maximum capacity request into a cpu frequency */
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+
+	/* No change in frequency? Bail and return current capacity. */
+	if (freq_new == policy->cur)
+		goto out;
+
+	/* store the new frequency and perform the transition */
+	gd->freq = freq_new;
+
+	if (cpufreq_driver_might_sleep())
+		irq_work_queue_on(&gd->irq_work, cpu);
+	else
+		cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+	int cpu;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* initialize per-cpu data */
+	for_each_cpu(cpu, policy->cpus) {
+		per_cpu(pcpu_capacity, cpu) = 0;
+		per_cpu(pcpu_policy, cpu) = policy;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		  __func__, gd->throttle_nsec);
+
+	if (cpufreq_driver_might_sleep()) {
+		/* init per-policy kthread */
+		gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
+		if (IS_ERR_OR_NULL(gd->task)) {
+			pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
+			goto err;
+		}
+		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+	}
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+	return 0;
+
+err:
+	kfree(gd);
+	return -ENOMEM;
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	if (cpufreq_driver_might_sleep()) {
+		kthread_stop(gd->task);
+	}
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+	return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+		case CPUFREQ_GOV_START:
+			/* Start managing the frequency */
+			return cpufreq_sched_start(policy);
+
+		case CPUFREQ_GOV_STOP:
+			return cpufreq_sched_stop(policy);
+
+		case CPUFREQ_GOV_LIMITS:	/* unused */
+		case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+		case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+			break;
+	}
+	return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+	.name			= "sched",
+	.governor		= cpufreq_sched_setup,
+	.owner			= THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+static void __exit cpufreq_sched_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_sched);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
+
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c395559..30aa0c4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1476,6 +1476,13 @@  unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+void cpufreq_sched_set_cap(int cpu, unsigned long util);
+#else
+static inline void cpufreq_sched_set_cap(int cpu, unsigned long util)
+{ }
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1484,6 +1491,7 @@  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 #else
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
+static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);