| Message ID | 1435362824-26734-4-git-send-email-mturquette@linaro.org (mailing list archive) |
|---|---|
| State | Not Applicable, archived |
| Delegated to | Rafael Wysocki |
Hi,

On Fri, Jun 26, 2015 at 04:53:43PM -0700, Michael Turquette wrote:
> diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
> new file mode 100644
> index 0000000..5020f24
> --- /dev/null
> +++ b/kernel/sched/cpufreq_sched.c
> @@ -0,0 +1,308 @@
> +/*
> + * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpufreq.h>
> +#include <linux/module.h>
> +#include <linux/kthread.h>
> +#include <linux/percpu.h>
> +#include <linux/irq_work.h>
> +
> +#include "sched.h"
> +
> +#define THROTTLE_NSEC		50000000 /* 50ms default */
> +
> +static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
> +static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
> +
> +/**
> + * gov_data - per-policy data internal to the governor
> + * @throttle: next throttling period expiry. Derived from throttle_nsec
> + * @throttle_nsec: throttle period length in nanoseconds
> + * @task: worker thread for dvfs transition that may block/sleep
> + * @irq_work: callback used to wake up worker thread
> + * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
> + *
> + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
> + * per-policy instance of it is created when the cpufreq_sched governor receives
> + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
> + * member of struct cpufreq_policy.
> + *
> + * Readers of this data must call down_read(policy->rwsem). Writers must
> + * call down_write(policy->rwsem).
> + */
> +struct gov_data {
> +	ktime_t throttle;
> +	unsigned int throttle_nsec;
> +	struct task_struct *task;
> +	struct irq_work irq_work;
> +	struct cpufreq_policy *policy;
> +	unsigned int freq;
> +};
> +
> +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
> +{
> +	struct gov_data *gd = policy->governor_data;
> +
> +	/* avoid race with cpufreq_sched_stop */
> +	if (!down_write_trylock(&policy->rwsem))
> +		return;
> +
> +	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
> +
> +	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
> +	up_write(&policy->rwsem);
> +}
> +
> +/*
> + * we pass in struct cpufreq_policy. This is safe because changing out the
> + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
> + * which tears down all of the data structures and __cpufreq_governor(policy,
> + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
> + * new policy pointer
> + */
> +static int cpufreq_sched_thread(void *data)
> +{
> +	struct sched_param param;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	int ret;
> +
> +	policy = (struct cpufreq_policy *) data;

unnecessary cast.

> +	if (!policy) {

Is this even possible ? I'd just let it oops since it would be a really odd
case.

> +		pr_warn("%s: missing policy\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	gd = policy->governor_data;
> +	if (!gd) {

likewise.

> +		pr_warn("%s: missing governor data\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	param.sched_priority = 50;
> +	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
> +	if (ret) {
> +		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
> +		do_exit(-EINVAL);
> +	} else {

else is unnecessary here, but no strong feelings.
> +		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
> +				__func__, gd->task->pid);
> +	}
> +
> +	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
> +	if (ret) {
> +		pr_warn("%s: failed to set allowed ptr\n", __func__);
> +		do_exit(-EINVAL);
> +	}
> +
> +	/* main loop of the per-policy kthread */
> +	do {
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		schedule();
> +		if (kthread_should_stop())
> +			break;
> +
> +		cpufreq_sched_try_driver_target(policy, gd->freq);
> +	} while (!kthread_should_stop());

looks like this would be simpler with a plain while() instead of do {} while:

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		cpufreq_sched_try_driver_target(policy, gd->freq);
	}

> +	do_exit(0);
> +}
> +
> +static void cpufreq_sched_irq_work(struct irq_work *irq_work)
> +{
> +	struct gov_data *gd;
> +
> +	gd = container_of(irq_work, struct gov_data, irq_work);

if irq_work is the first member in struct gov_data, this gets optimized to a
cast.

> +	if (!gd) {

unnecessary parens.

> +		return;
> +	}
> +
> +	wake_up_process(gd->task);
> +}
> +
> +/**
> + * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values
> + * @cpu: cpu whose capacity utilization has recently changed
> + * @capacity: the new capacity requested by cpu
> + *
> + * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
> + * that the scheduler may inform the governor of updates to capacity
> + * utilization and make changes to cpu frequency. Currently this interface is
> + * designed around PELT values in CFS. It can be expanded to other scheduling
> + * classes in the future if needed.
> + *
> + * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
> + * wakes up the thread that does the actual work, cpufreq_sched_thread.
> + *
> + * This functions bails out early if either condition is true:
> + * 1) this cpu did not the new maximum capacity for its frequency domain
> + * 2) no change in cpu frequency is necessary to meet the new capacity request
> + */
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +	unsigned int freq_new, cpu_tmp;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	unsigned long capacity_max = 0;
> +
> +	/* update per-cpu capacity request */
> +	__this_cpu_write(pcpu_capacity, capacity);
> +
> +	policy = cpufreq_cpu_get(cpu);
> +	if (IS_ERR_OR_NULL(policy)) {

can this really be ERR_PTR ? Also, unnecessary parens

> +		return;
> +	}
> +
> +	if (!policy->governor_data)
> +		goto out;
> +
> +	gd = policy->governor_data;
> +
> +	/* bail early if we are throttled */
> +	if (ktime_before(ktime_get(), gd->throttle))
> +		goto out;
> +
> +	/* find max capacity requested by cpus in this policy */
> +	for_each_cpu(cpu_tmp, policy->cpus)
> +		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
> +
> +	/*
> +	 * We only change frequency if this cpu's capacity request represents a
> +	 * new max. If another cpu has requested a capacity greater than the
> +	 * previous max then we rely on that cpu to hit this code path and make
> +	 * the change. IOW, the cpu with the new max capacity is responsible
> +	 * for setting the new capacity/frequency.
> +	 *
> +	 * If this cpu is not the new maximum then bail
> +	 */
> +	if (capacity_max > capacity)
> +		goto out;
> +
> +	/* Convert the new maximum capacity request into a cpu frequency */
> +	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
> +
> +	/* No change in frequency? Bail and return current capacity. */
> +	if (freq_new == policy->cur)
> +		goto out;
> +
> +	/* store the new frequency and perform the transition */
> +	gd->freq = freq_new;
> +
> +	if (cpufreq_driver_might_sleep())
> +		irq_work_queue_on(&gd->irq_work, cpu);
> +	else
> +		cpufreq_sched_try_driver_target(policy, freq_new);
> +
> +out:
> +	cpufreq_cpu_put(policy);
> +	return;

unnecessary return

> +}
> +
> +static int cpufreq_sched_start(struct cpufreq_policy *policy)
> +{
> +	struct gov_data *gd;
> +	int cpu;
> +
> +	/* prepare per-policy private data */
> +	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
> +	if (!gd) {
> +		pr_debug("%s: failed to allocate private data\n", __func__);

unnecessary OOM message, that will render curly braces unnecessary too.

> +		return -ENOMEM;
> +	}
> +
> +	/* initialize per-cpu data */
> +	for_each_cpu(cpu, policy->cpus) {
> +		per_cpu(pcpu_capacity, cpu) = 0;
> +		per_cpu(pcpu_policy, cpu) = policy;
> +	}
> +
> +	/*
> +	 * Don't ask for freq changes at an higher rate than what

s/an higher/a higher

> +	 * the driver advertises as transition latency.
> +	 */
> +	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
> +			    policy->cpuinfo.transition_latency :
> +			    THROTTLE_NSEC;
> +	pr_debug("%s: throttle threshold = %u [ns]\n",
> +		  __func__, gd->throttle_nsec);
> +
> +	if (cpufreq_driver_might_sleep()) {
> +		/* init per-policy kthread */
> +		gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
> +		if (IS_ERR_OR_NULL(gd->task)) {

kthread_run() doesn't return NULL.

> +			pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
> +			goto err;
> +		}
> +		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
> +	}
> +
> +	policy->governor_data = gd;
> +	gd->policy = policy;
> +	return 0;
> +
> +err:
> +	kfree(gd);
> +	return -ENOMEM;

why don't you pass along errors returned by any other function you call ?

> +}
> +
> +static int cpufreq_sched_stop(struct cpufreq_policy *policy)
> +{
> +	struct gov_data *gd = policy->governor_data;
> +
> +	if (cpufreq_driver_might_sleep()) {

unnecessary curly braces.

> +		kthread_stop(gd->task);

should you switch back to some default OPP when this is removed ? Some SoCs
can't run at certain OPPs forever (thermal limitations, or whatever else),
might be good to switch to something considered safe.

> +	}
> +
> +	policy->governor_data = NULL;
> +
> +	/* FIXME replace with devm counterparts? */
> +	kfree(gd);
> +	return 0;
> +}
> +
> +static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
> +{
> +	switch (event) {
> +		case CPUFREQ_GOV_START:
> +			/* Start managing the frequency */
> +			return cpufreq_sched_start(policy);
> +
> +		case CPUFREQ_GOV_STOP:
> +			return cpufreq_sched_stop(policy);
> +
> +		case CPUFREQ_GOV_LIMITS:	/* unused */
> +		case CPUFREQ_GOV_POLICY_INIT:	/* unused */
> +		case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
> +			break;

indentation

> +	}
> +	return 0;
> +}
> +
> +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
> +static
> +#endif
> +struct cpufreq_governor cpufreq_gov_sched = {
> +	.name			= "sched",
> +	.governor		= cpufreq_sched_setup,
> +	.owner			= THIS_MODULE,
> +};
> +
> +static int __init cpufreq_sched_init(void)
> +{
> +	return cpufreq_register_governor(&cpufreq_gov_sched);
> +}
> +
> +static void __exit cpufreq_sched_exit(void)
> +{
> +	cpufreq_unregister_governor(&cpufreq_gov_sched);
> +}
> +
> +/* Try to make this the default governor */
> +fs_initcall(cpufreq_sched_init);

why fs_initcall() ? Why can't this be in module_init() ?
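Taken together, the loop-related suggestions above would reduce the kthread body to roughly the sketch below. This is illustrative only: it keeps the kthread_should_stop() re-check after schedule() from the posted code so that a stop request wakes the thread without triggering one final frequency write, and it drops the cast and NULL checks the review calls unnecessary.

	static int cpufreq_sched_thread(void *data)
	{
		struct cpufreq_policy *policy = data;	/* no cast needed from void * */
		struct gov_data *gd = policy->governor_data;

		/* SCHED_FIFO priority and related_cpus affinity set up as in the posted code */

		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			if (kthread_should_stop())
				break;
			cpufreq_sched_try_driver_target(policy, gd->freq);
		}

		do_exit(0);
	}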
Hi,

On Mon, Jun 29, 2015 at 09:49:43AM -0700, Michael Turquette wrote:

<snip>

> > > +static int cpufreq_sched_stop(struct cpufreq_policy *policy)
> > > +{
> > > +	struct gov_data *gd = policy->governor_data;
> > > +
> > > +	if (cpufreq_driver_might_sleep()) {
> >
> > unnecessary curly braces.
> >
> > > +		kthread_stop(gd->task);
>
> Thanks for the review. I'll take into account everything above.
>
> > should you switch back to some default OPP when this is removed ? Some
> > SoCs can't run at certain OPPs forever (thermal limitations, or whatever
> > else), might be good to switch to something considered safe.
>
> The above only happens when we unload the module or switch governors,
> and every governor has this characteristic.
>
> I do not think that open-coding a return to some default opp in every
> governor is a good solution. This sounds like something the cpufreq core
> should take care of.

indeed.

> Also, how do we know which opp is safe?

no idea, that needs to be described somehow.
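For illustration only: if a platform did describe a safe operating point somewhere, the idea under discussion could be prototyped in the governor's stop path roughly as below. The policy->safe_freq field is hypothetical and stands in for that missing description, and the thread's conclusion is that the cpufreq core, not each governor, would be the right place for this.

	static int cpufreq_sched_stop(struct cpufreq_policy *policy)
	{
		struct gov_data *gd = policy->governor_data;

		if (cpufreq_driver_might_sleep())
			kthread_stop(gd->task);

		/*
		 * Hypothetical: fall back to a frequency the platform has
		 * declared safe for indefinite operation before the governor
		 * stops managing this policy. No such field exists today.
		 */
		if (policy->safe_freq)
			__cpufreq_driver_target(policy, policy->safe_freq,
						CPUFREQ_RELATION_L);

		policy->governor_data = NULL;
		kfree(gd);
		return 0;
	}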
Hi Mike,

On 27/06/15 00:53, Michael Turquette wrote:
> From: Michael Turquette <mturquette@baylibre.com>
>

[...]

>  comment "CPU frequency scaling drivers"
>
>  config CPUFREQ_DT
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 1f2c9a1..30241c9 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
>  #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
>  extern struct cpufreq_governor cpufreq_gov_conservative;
>  #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
> +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
> +extern struct cpufreq_governor cpufreq_gov_sched_gov;

To get it building with CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED=y:

-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
-extern struct cpufreq_governor cpufreq_gov_sched_gov;
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;

> +#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
>  #endif
> +

[...]

> +/**
> + * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values

minor nit: s/cpufreq_sched_set_capacity/cpufreq_sched_set_cap

> + * @cpu: cpu whose capacity utilization has recently changed
> + * @capacity: the new capacity requested by cpu
> + *
> + * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
> + * that the scheduler may inform the governor of updates to capacity
> + * utilization and make changes to cpu frequency. Currently this interface is
> + * designed around PELT values in CFS. It can be expanded to other scheduling
> + * classes in the future if needed.
> + *
> + * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
> + * wakes up the thread that does the actual work, cpufreq_sched_thread.
> + *
> + * This functions bails out early if either condition is true:
> + * 1) this cpu did not the new maximum capacity for its frequency domain
> + * 2) no change in cpu frequency is necessary to meet the new capacity request
> + */
> +void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
> +{
> +	unsigned int freq_new, cpu_tmp;
> +	struct cpufreq_policy *policy;
> +	struct gov_data *gd;
> +	unsigned long capacity_max = 0;

[...]
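For reference, with that rename applied the tail of the governor-selection block in include/linux/cpufreq.h would read:

	#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
	extern struct cpufreq_governor cpufreq_gov_sched;
	#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
	#endif

which matches both the CPU_FREQ_DEFAULT_GOV_SCHED symbol added in drivers/cpufreq/Kconfig and the cpufreq_gov_sched definition in kernel/sched/cpufreq_sched.c.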
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index a171fef..0206889 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED
+	bool "sched"
+	select CPU_FREQ_GOV_SCHED
+	select CPU_FREQ_GOV_PERFORMANCE
+	help
+	  Use the CPUfreq governor 'sched' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
 endchoice
 
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
 
 	  If in doubt, say N.
 
+config CPU_FREQ_GOV_SCHED
+	tristate "'sched' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 
 config CPUFREQ_DT
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1f2c9a1..30241c9 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -494,6 +494,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_GOV)
+extern struct cpufreq_governor cpufreq_gov_sched_gov;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
 #endif
 
 /*********************************************************************
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be870..f04386c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 0000000..5020f24
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+
+static DEFINE_PER_CPU(unsigned long, pcpu_capacity);
+static DEFINE_PER_CPU(struct cpufreq_policy *, pcpu_policy);
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @freq: new frequency stored in *_sched_update_cpu and used in *_sched_thread
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_sched governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+	unsigned int freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, unsigned int freq)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	/* avoid race with cpufreq_sched_stop */
+	if (!down_write_trylock(&policy->rwsem))
+		return;
+
+	__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+	up_write(&policy->rwsem);
+}
+
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int cpufreq_sched_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		cpufreq_sched_try_driver_target(policy, gd->freq);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd) {
+		return;
+	}
+
+	wake_up_process(gd->task);
+}
+
+/**
+ * cpufreq_sched_set_capacity - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ * @capacity: the new capacity requested by cpu
+ *
+ * cpufreq_sched_sched_capacity is an interface exposed to the scheduler so
+ * that the scheduler may inform the governor of updates to capacity
+ * utilization and make changes to cpu frequency. Currently this interface is
+ * designed around PELT values in CFS. It can be expanded to other scheduling
+ * classes in the future if needed.
+ *
+ * cpufreq_sched_set_capacity raises an IPI. The irq_work handler for that IPI
+ * wakes up the thread that does the actual work, cpufreq_sched_thread.
+ *
+ * This functions bails out early if either condition is true:
+ * 1) this cpu did not the new maximum capacity for its frequency domain
+ * 2) no change in cpu frequency is necessary to meet the new capacity request
+ */
+void cpufreq_sched_set_cap(int cpu, unsigned long capacity)
+{
+	unsigned int freq_new, cpu_tmp;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long capacity_max = 0;
+
+	/* update per-cpu capacity request */
+	__this_cpu_write(pcpu_capacity, capacity);
+
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	/* find max capacity requested by cpus in this policy */
+	for_each_cpu(cpu_tmp, policy->cpus)
+		capacity_max = max(capacity_max, per_cpu(pcpu_capacity, cpu_tmp));
+
+	/*
+	 * We only change frequency if this cpu's capacity request represents a
+	 * new max. If another cpu has requested a capacity greater than the
+	 * previous max then we rely on that cpu to hit this code path and make
+	 * the change. IOW, the cpu with the new max capacity is responsible
+	 * for setting the new capacity/frequency.
+	 *
+	 * If this cpu is not the new maximum then bail
+	 */
+	if (capacity_max > capacity)
+		goto out;
+
+	/* Convert the new maximum capacity request into a cpu frequency */
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+
+	/* No change in frequency? Bail and return current capacity. */
+	if (freq_new == policy->cur)
+		goto out;
+
+	/* store the new frequency and perform the transition */
+	gd->freq = freq_new;
+
+	if (cpufreq_driver_might_sleep())
+		irq_work_queue_on(&gd->irq_work, cpu);
+	else
+		cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+	int cpu;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* initialize per-cpu data */
+	for_each_cpu(cpu, policy->cpus) {
+		per_cpu(pcpu_capacity, cpu) = 0;
+		per_cpu(pcpu_policy, cpu) = policy;
+	}
+
+	/*
+	 * Don't ask for freq changes at an higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		  __func__, gd->throttle_nsec);
+
+	if (cpufreq_driver_might_sleep()) {
+		/* init per-policy kthread */
+		gd->task = kthread_run(cpufreq_sched_thread, policy, "kcpufreq_sched_task");
+		if (IS_ERR_OR_NULL(gd->task)) {
+			pr_err("%s: failed to create kcpufreq_sched_task thread\n", __func__);
+			goto err;
+		}
+		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+	}
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+	return 0;
+
+err:
+	kfree(gd);
+	return -ENOMEM;
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd = policy->governor_data;
+
+	if (cpufreq_driver_might_sleep()) {
+		kthread_stop(gd->task);
+	}
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+	return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+		case CPUFREQ_GOV_START:
+			/* Start managing the frequency */
+			return cpufreq_sched_start(policy);
+
+		case CPUFREQ_GOV_STOP:
+			return cpufreq_sched_stop(policy);
+
+		case CPUFREQ_GOV_LIMITS:	/* unused */
+		case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+		case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+			break;
+	}
+	return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+	.name			= "sched",
+	.governor		= cpufreq_sched_setup,
+	.owner			= THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+static void __exit cpufreq_sched_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_sched);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
+
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..25a1b85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,6 +1396,13 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+void cpufreq_sched_set_cap(int cpu, unsigned long util);
+#else
+static inline void cpufreq_sched_set_cap(int cpu, unsigned long util)
+{ }
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1404,6 +1411,7 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 #else
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
+static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
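To make the capacity-to-frequency conversion in cpufreq_sched_set_cap() concrete, here is a worked example; the numbers are chosen only for illustration (SCHED_CAPACITY_SHIFT is 10, so capacity requests are expressed on a 0..1024 scale):

	unsigned long capacity = 768;		/* example request, ~75% of SCHED_CAPACITY_SCALE */
	unsigned int max_khz = 2000000;		/* example policy->max, i.e. 2 GHz */
	unsigned int freq_new = capacity * max_khz >> SCHED_CAPACITY_SHIFT;
	/* freq_new == 1500000 kHz */

__cpufreq_driver_target() is then called with CPUFREQ_RELATION_L, which picks the lowest supported frequency at or above the 1,500,000 kHz target, and further requests on this policy are ignored until throttle_nsec has elapsed.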