diff mbox

[v3,3/4] sched: scheduler-driven cpu frequency selection

Message ID 1435362824-26734-4-git-send-email-mturquette@linaro.org (mailing list archive)
State Not Applicable, archived
Delegated to: Rafael Wysocki
Headers show

Commit Message

Michael Turquette June 26, 2015, 11:53 p.m. UTC
From: Michael Turquette <mturquette@baylibre.com>

Scheduler-driven cpu frequency selection is desirable as part of the
on-going effort to make the scheduler better aware of energy
consumption.  No piece of the Linux kernel has a better view of the
factors that affect a cpu frequency selection policy than the
scheduler[0], and this patch is an attempt to converge on an initial
solution.

This patch implements a simple shim layer between the Linux scheduler
and the cpufreq subsystem. This interface accepts a capacity request
from the Completely Fair Scheduler and honors the max request from all
cpus in the same frequency domain.

The policy magic comes from choosing the cpu capacity request from cfs
and is not contained in this cpufreq governor. This code is
intentionally dumb.

Note that this "governor" is event-driven. There is no polling loop to
check cpu idle time nor any other method which is unsynchronized with
the scheduler.

Thanks to Juri Lelli <juri.lelli@arm.com> for contributing design ideas,
code and test results.

[0] http://article.gmane.org/gmane.linux.kernel/1499836

Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
---
Changes in v3:
Removed all policy bits
Renamed from cpufreq_cfs to cpufreq_sched
Bug fixes
Support non-blocking frequency transitions
License fix (thanks Paul)

 drivers/cpufreq/Kconfig      |  24 ++++
 include/linux/cpufreq.h      |   3 +
 kernel/sched/Makefile        |   1 +
 kernel/sched/cpufreq_sched.c | 308 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h         |   8 ++
 5 files changed, 344 insertions(+)
 create mode 100644 kernel/sched/cpufreq_sched.c

Comments

Felipe Balbi June 27, 2015, 12:47 a.m. UTC | #1
Hi,

On Fri, Jun 26, 2015 at 04:53:43PM -0700, Michael Turquette wrote:
> diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
> new file mode 100644
> index 0000000..5020f24
> --- /dev/null
> +++ b/kernel/sched/cpufreq_sched.c
> @@ -0,0 +1,308 @@
> +/*
> + *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/module.h>
> +#include <linux/kthread.h>
> +#include <linux/percpu.h>
> +#include <linux/irq_work.h>
> +
> +#include "sched.h"
> +
> +#define THROTTLE_NSEC		50000000 /* 50ms default */
> +
> +static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
> +static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
> +
> +/**
> + * gov_data - per-policy data internal to the governor
> + * @throttle: next throttling period expiry. Derived from throttle_nsec
> + * @throttle_nsec: throttle period length in nanoseconds
> + * @task: worker thread for dvfs transition that may block/sleep
> + * @irq_work: callback used to wake up worker thread
> + * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
> + *
> + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
> + * per-policy instance of it is created when the cpufreq_sched governor receives
> + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
> + * member of struct cpufreq_policy.
> + *
> + * Readers of this data must call down_read(policy->rwsem). Writers must
> + * call down_write(policy->rwsem).
> + */
> +struct gov_data {
> +	ktime_t throttle;
> +	unsigned int throttle_nsec;
> +	struct task_struct *task;
> +	struct irq_work irq_work;
> +	struct cpufreq_policy *policy;
> +	unsigned int freq;
> +};
> +
> +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
> +{
> +	struct gov_data *gd = policy->governor_data;
> +
> +	/* avoid race with cpufreq_sched_stop */
> +	if (!down_write_trylock(&policy->rwsem))
> +		return;
> +
> +	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
> +
> +	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
> +	up_write(&policy->rwsem);
> +}
> +
> +/*
> + * we pass in struct cpufreq_policy. This is safe because changing out the
> + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
> + * which tears down all of the data structures and __cpufreq_governor(policy,
> + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
> + * new policy pointer
> + */
> +static int cpufreq_sched_thread(void *data)
> +{
> +	struct sched_param param;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	int ret;
> +
> +	policy = (struct cpufreq_policy *) data;

unnecessary cast.

> +	if (!policy) {

Is this even possible ? I'd just let it oops since it would be a really
odd case.

> +		pr_warn("%s: missing policy\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	gd = policy->governor_data;
> +	if (!gd) {

likewise.

> +		pr_warn("%s: missing governor data\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	param.sched_priority = 50;
> +	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
> +	if (ret) {
> +		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
> +		do_exit(-EINVAL);
> +	} else {

else is unnecessary here, but no strong feelings.

> +		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
> +				__func__, gd->task->pid);
> +	}
> +
> +	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
> +	if (ret) {
> +		pr_warn("%s: failed to set allowed ptr\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	/* main loop of the per-policy kthread */
> +	do {
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		schedule();
> +		if (kthread_should_stop())
> +			break;
> +
> +		cpufreq_sched_try_driver_target(policy, gd->freq);
> +	} while (!kthread_should_stop());

looks like this would be simpler with a plain while() instead of do
{} while:

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		cpufreq_sched_try_driver_target(policy, gd->freq);
	}

> +	do_exit(0);
> +}
> +
> +static void cpufreq_sched_irq_work(struct irq_work *irq_work)
> +{
> +	struct gov_data *gd;
> +
> +	gd = container_of(irq_work, struct gov_data, irq_work);

if irq_work is the first member in struct gov_data, this gets optimized
to a cast.

> +	if (!gd) {

unnecessary curly braces.

> +		return;
> +	}
> +
> +	wake_up_process(gd->task);
> +}
> +
> +/**
> + * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values
> + * @cpu: cpu whose capacity utilization has recently changed
> + * @capacity: the new capacity requested by cpu
> + *
> + * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
> + * that the scheduler may inform the governor of updates to capacity
> + * utilization and make changes to cpu frequency. Currently this interface is
> + * designed around PELT values in CFS. It can be expanded to other scheduling
> + * classes in the future if needed.
> + *
> + * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
> + * wakes up the thread that does the actual work, cpufreq_sched_thread.
> + *
> + * This functions bails out early if either condition is true:
> + * 1) this cpu did not the new maximum capacity for its frequency domain
> + * 2) no change in cpu frequency is necessary to meet the new capacity request
> + */
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +	unsigned int freq_new, cpu_tmp;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	unsigned long capacity_max = 0;
> +
> +	/* update per-cpu capacity request */
> +	__this_cpu_write(pcpu_capacity, capacity);
> +
> +	policy = cpufreq_cpu_get(cpu);
> +	if (IS_ERR_OR_NULL(policy)) {

Can this really be an ERR_PTR? Also, unnecessary curly braces.

> +		return;
> +	}
> +
> +	if (!policy->governor_data)
> +		goto out;
> +
> +	gd = policy->governor_data;
> +
> +	/* bail early if we are throttled */
> +	if (ktime_before(ktime_get(), gd->throttle))
> +		goto out;
> +
> +	/* find max capacity requested by cpus in this policy */
> +	for_each_cpu(cpu_tmp, policy->cpus)
> +		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
> +
> +	/*
> +	 * We only change frequency if this cpu's capacity request represents a
> +	 * new max. If another cpu has requested a capacity greater than the
> +	 * previous max then we rely on that cpu to hit this code path and make
> +	 * the change. IOW, the cpu with the new max capacity is responsible
> +	 * for setting the new capacity/frequency.
> +	 *
> +	 * If this cpu is not the new maximum then bail
> +	 */
> +	if (capacity_max > capacity)
> +		goto out;
> +
> +	/* Convert the new maximum capacity request into a cpu frequency */
> +	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
> +
> +	/* No change in frequency? Bail and return current capacity. */
> +	if (freq_new == policy->cur)
> +		goto out;
> +
> +	/* store the new frequency and perform the transition */
> +	gd->freq = freq_new;
> +
> +	if (cpufreq_driver_might_sleep())
> +		irq_work_queue_on(&gd->irq_work, cpu);
> +	else
> +		cpufreq_sched_try_driver_target(policy, freq_new);
> +
> +out:
> +	cpufreq_cpu_put(policy);
> +	return;

unnecessary return

> +}
> +
> +static int cpufreq_sched_start(struct cpufreq_policy *policy)
> +{
> +	struct gov_data *gd;
> +	int cpu;
> +
> +	/* prepare per-policy private data */
> +	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
> +	if (!gd) {
> +		pr_debug("%s: failed to allocate private data\n", __func__);

unnecessary OOM message, that will render curly braces unnecessary too.

> +		return -ENOMEM;
> +	}
> +
> +	/* initialize per-cpu data */
> +	for_each_cpu(cpu, policy->cpus) {
> +		per_cpu(pcpu_capacity, cpu) = 0;
> +		per_cpu(pcpu_policy, cpu) = policy;
> +	}
> +
> +	/*
> +	 * Don't ask for freq changes at an higher rate than what

s/an higher/a higher

> +	 * the driver advertises as transition latency.
> +	 */
> +	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
> +			    policy->cpuinfo.transition_latency :
> +			    THROTTLE_NSEC;
> +	pr_debug("%s: throttle threshold = %u [ns]\n",
> +		  __func__, gd->throttle_nsec);
> +
> +	if (cpufreq_driver_might_sleep()) {
> +		/* init per-policy kthread */
> +		gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
> +		if (IS_ERR_OR_NULL(gd->task)) {

kthread_run() doesn't return NULL.

> +			pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
> +			goto err;
> +		}
> +		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
> +	}
> +
> +	policy->governor_data = gd;
> +	gd->policy = policy;
> +	return 0;
> +
> +err:
> +	kfree(gd);
> +	return -ENOMEM;

why don't you pass along errors returned by any other function you call ?

> +}
> +
> +static int cpufreq_sched_stop(struct cpufreq_policy *policy)
> +{
> +	struct gov_data *gd = policy->governor_data;
> +
> +	if (cpufreq_driver_might_sleep()) {

unnecessary curly braces.

> +		kthread_stop(gd->task);

should you switch back to some default OPP when this is removed ? Some
SoCs can't run at certain OPPs forever (thermal limitations, or whatever
else), might be good to switch to something considered safe.

> +	}
> +
> +	policy->governor_data = NULL;
> +
> +	/* FIXME replace with devm counterparts? */
> +	kfree(gd);
> +	return 0;
> +}
> +
> +static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
> +{
> +	switch (event) {
> +		case CPUFREQ_GOV_START:
> +			/* Start managing the frequency */
> +			return cpufreq_sched_start(policy);
> +
> +		case CPUFREQ_GOV_STOP:
> +			return cpufreq_sched_stop(policy);
> +
> +		case CPUFREQ_GOV_LIMITS:	/* unused */
> +		case CPUFREQ_GOV_POLICY_INIT:	/* unused */
> +		case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
> +			break;

indentation

> +	}
> +	return 0;
> +}
> +
> +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
> +static
> +#endif
> +struct cpufreq_governor cpufreq_gov_sched = {
> +	.name			= "sched",
> +	.governor		= cpufreq_sched_setup,
> +	.owner			= THIS_MODULE,
> +};
> +
> +static int __init cpufreq_sched_init(void)
> +{
> +	return cpufreq_register_governor(&cpufreq_gov_sched);
> +}
> +
> +static void __exit cpufreq_sched_exit(void)
> +{
> +	cpufreq_unregister_governor(&cpufreq_gov_sched);
> +}
> +
> +/* Try to make this the default governor */
> +fs_initcall(cpufreq_sched_init);

why fs_initcall() ? Why can't this be in module_init() ?
Felipe Balbi June 29, 2015, 4:55 p.m. UTC | #2
Hi,

On Mon, Jun 29, 2015 at 09:49:43AM -0700, Michael Turquette wrote:

<snip>

> > > +static int cpufreq_sched_stop(struct cpufreq_policy *policy)
> > > +{
> > > +     struct gov_data *gd = policy->governor_data;
> > > +
> > > +     if (cpufreq_driver_might_sleep()) {
> > 
> > unnecessary curly braces.
> > 
> > > +             kthread_stop(gd->task);
> > 
> 
> Thanks for the review. I'll take into account everything above.
> 
> > should you switch back to some default OPP when this is removed ? Some
> > SoCs can't run at certain OPPs forever (thermal limitations, or whatever
> > else), might be good to switch to something considered safe.
> 
> The above only happens when we unload the module or switch governors,
> and every governor has this characteristic.
> 
> I do not think that open-coding a return to some default opp in every
> governor is a good solution. This sounds like something the cpufreq core
> should take care of.

indeed.

> Also, how do we know which opp is safe?

no idea, that needs to be described somehow.
Dietmar Eggemann July 6, 2015, 8:06 p.m. UTC | #3
Hi Mike,

On 27/06/15 00:53, Michael Turquette wrote:
> From: Michael Turquette <mturquette@baylibre.com>
> 

[...]

>  comment "CPU frequency scaling drivers"
> 
>  config CPUFREQ_DT
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 1f2c9a1..30241c9 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>  extern struct cpufreq_governor cpufreq_gov_conservative;
>  #define CPUFREQ_DEFAULT_GOVERNOR       (&cpufreq_gov_conservative)
> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
> +extern struct cpufreq_governor cpufreq_gov_sched_gov;

The following change is needed to get it building with CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED=y:

-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
-extern struct cpufreq_governor cpufreq_gov_sched_gov;
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;

> +#define CPUFREQ_DEFAULT_GOVERNOR       (&cpufreq_gov_sched)
>  #endif

> +
> +/**
> + * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values

[...]

minor nit:

s/cpufreq_sched_set_capacity/cpufreq_sched_set_cap

> + * @cpu: cpu whose capacity utilization has recently changed
> + * @capacity: the new capacity requested by cpu
> + *
> + * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
> + * that the scheduler may inform the governor of updates to capacity
> + * utilization and make changes to cpu frequency. Currently this interface is
> + * designed around PELT values in CFS. It can be expanded to other scheduling
> + * classes in the future if needed.
> + *
> + * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
> + * wakes up the thread that does the actual work, cpufreq_sched_thread.
> + *
> + * This functions bails out early if either condition is true:
> + * 1) this cpu did not the new maximum capacity for its frequency domain
> + * 2) no change in cpu frequency is necessary to meet the new capacity request
> + */
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +       unsigned int freq_new, cpu_tmp;
> +       struct cpufreq_policy *policy;
> +       struct gov_data *gd;
> +       unsigned long capacity_max = 0;

[...]

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index a171fef..0206889 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@  config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED
+	bool "sched"
+	select CPU_FREQ_GOV_SCHED
+	select CPU_FREQ_GOV_PERFORMANCE
+	help
+	  Use the CPUfreq governor 'sched' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
 endchoice
 
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@  config CPU_FREQ_GOV_CONSERVATIVE
 
 	  If in doubt, say N.
 
+config CPU_FREQ_GOV_SCHED
+	tristate "'sched' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 
 config CPUFREQ_DT
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1f2c9a1..30241c9 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -494,6 +494,9 @@  extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)	/* must match the Kconfig symbol */
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
 #endif
 
 /*********************************************************************
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be870..f04386c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 0000000..5020f24
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,308 @@ 
+/*
+ *  Copyright (C)  2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+
+static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
+static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
+
+/**
+ * struct gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @policy: the cpufreq policy this data belongs to
+ * @freq: new frequency; written by cpufreq_sched_set_cap, read by the kthread
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. An
+ * instance is created when the governor receives CPUFREQ_GOV_START, and a
+ * pointer to it is stored in the governor_data member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+	unsigned int freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	/* avoid race with cpufreq_sched_stop; if the rwsem is contended, drop this request */
+	if (!down_write_trylock(&policy->rwsem))
+		return;
+
+	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);	/* arm the next throttle period */
+	up_write(&policy->rwsem);
+}
+
+/*
+ * Per-policy worker kthread; @data is the struct cpufreq_policy. Holding the
+ * pointer is safe: changing out the policy requires a call to
+ * __cpufreq_governor(policy, CPUFREQ_GOV_STOP), which tears down all of the
+ * data structures, and __cpufreq_governor(policy, CPUFREQ_GOV_START) does a
+ * full rebuild, including this kthread with the new policy pointer.
+ */
+static int cpufreq_sched_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop: sleep until woken, then apply the most recently stored gd->freq */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		cpufreq_sched_try_driver_target(policy, gd->freq);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);	/* enclosing per-policy data */
+	if (!gd) {	/* NOTE(review): looks unreachable; container_of() only offsets a non-NULL pointer */
+		return;
+	}
+
+	wake_up_process(gd->task);	/* kick the per-policy worker thread */
+}
+
+/**
+ * cpufreq_sched_set_cap - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ * @capacity: the new capacity requested by cpu
+ *
+ * cpufreq_sched_set_cap is an interface exposed to the scheduler so
+ * that the scheduler may inform the governor of updates to capacity
+ * utilization and make changes to cpu frequency. Currently this interface is
+ * designed around PELT values in CFS. It can be expanded to other scheduling
+ * classes in the future if needed.
+ *
+ * cpufreq_sched_set_cap raises an IPI. The irq_work handler for that IPI
+ * wakes up the thread that does the actual work, cpufreq_sched_thread.
+ *
+ * This function bails out early if either condition is true:
+ * 1) this cpu did not request the new maximum capacity for its frequency domain
+ * 2) no change in cpu frequency is necessary to meet the new capacity request
+ */
+void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
+{
+	unsigned int freq_new, cpu_tmp;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long capacity_max = 0;
+
+	/* update per-cpu capacity request; NOTE(review): writes the local cpu's slot, not @cpu's */
+	__this_cpu_write(pcpu_capacity, capacity);
+
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	/* find max capacity requested by cpus in this policy */
+	for_each_cpu(cpu_tmp, policy->cpus)
+		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
+
+	/*
+	 * We only change frequency if this cpu's capacity request represents a
+	 * new max. If another cpu has requested a capacity greater than the
+	 * previous max then we rely on that cpu to hit this code path and make
+	 * the change. IOW, the cpu with the new max capacity is responsible
+	 * for setting the new capacity/frequency.
+	 *
+	 * If this cpu is not the new maximum then bail
+	 */
+	if (capacity_max > capacity)
+		goto out;
+
+	/* Convert the new maximum capacity request into a cpu frequency */
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+
+	/* No change in frequency? Bail. */
+	if (freq_new == policy->cur)
+		goto out;
+
+	/* store the new frequency and perform the transition */
+	gd->freq = freq_new;
+
+	if (cpufreq_driver_might_sleep())
+		irq_work_queue_on(&gd->irq_work, cpu);
+	else
+		cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+	int cpu;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* initialize per-cpu data */
+	for_each_cpu(cpu, policy->cpus) {
+		per_cpu(pcpu_capacity, cpu) = 0;
+		per_cpu(pcpu_policy, cpu) = policy;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		  __func__, gd->throttle_nsec);
+
+	if (cpufreq_driver_might_sleep()) {
+		/* init per-policy kthread. NOTE(review): it reads policy->governor_data, set only below */
+		gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
+		if (IS_ERR_OR_NULL(gd->task)) {
+			pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
+			goto err;
+		}
+		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+	}
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+	return 0;
+
+err:
+	kfree(gd);
+	return -ENOMEM;	/* NOTE(review): the error code from kthread_run() is discarded here */
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	if (cpufreq_driver_might_sleep()) {
+		kthread_stop(gd->task);	/* the worker is only created when the driver might sleep */
+	}
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+	return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+		case CPUFREQ_GOV_START:
+			/* Start managing the frequency */
+			return cpufreq_sched_start(policy);
+
+		case CPUFREQ_GOV_STOP:
+			return cpufreq_sched_stop(policy);
+
+		case CPUFREQ_GOV_LIMITS:	/* unused */
+		case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+		case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+			break;
+	}
+	return 0;	/* unused events are reported as success */
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED	/* non-static only when selected as the default governor */
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+	.name			= "sched",
+	.governor		= cpufreq_sched_setup,
+	.owner			= THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+static void __exit cpufreq_sched_exit(void)	/* NOTE(review): no module_exit() references this in the visible patch */
+{
+	cpufreq_unregister_governor(&cpufreq_gov_sched);
+}
+
+/* Registered via fs_initcall (not module_init) to try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
+
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..25a1b85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,6 +1396,13 @@  unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+void cpufreq_sched_set_cap(int cpu, unsigned long util);
+#else
+static inline void cpufreq_sched_set_cap(int cpu, unsigned long util)
+{ }
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1404,6 +1411,7 @@  static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 #else
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
+static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);