Message ID | 85bf45982709e06f7f42e1b8f8315945e9d9b6d0.1478858983.git.viresh.kumar@linaro.org (mailing list archive) |
---|---|
State | Changes Requested, archived |
Headers | show |
Hi, On 11/11/2016 11:22, Viresh Kumar wrote: > If slow path frequency changes are conducted in a SCHED_OTHER context > then they may be delayed for some amount of time, including > indefinitely, when real time or deadline activity is taking place. > > Move the slow path to a real time kernel thread. In the future the > thread should be made SCHED_DEADLINE. would you have an insight, as to what runtime/deadline/period to set, and/or what specific timing constraints you'd like to set, just for this cpufreq slowpath work? > The RT priority is arbitrarily set > to 50 for now. [...] > + struct sched_param param = { .sched_priority = 50 }; won't you have a tunable here? (sysctl?) Thanks, T.
On Fri, Nov 11, 2016 at 03:32:04PM +0100, Tommaso Cucinotta wrote: > >+ struct sched_param param = { .sched_priority = 50 }; > > won't you have a tunable here? (sysctl?) You can use the regular userspace tools, like schedtool and chrt to set priorities. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 11, 2016 at 11:22 AM, Viresh Kumar <viresh.kumar@linaro.org> wrote: > If slow path frequency changes are conducted in a SCHED_OTHER context > then they may be delayed for some amount of time, including > indefinitely, when real time or deadline activity is taking place. > > Move the slow path to a real time kernel thread. In the future the > thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set > to 50 for now. > > Hackbench results on ARM Exynos, dual core A15 platform for 10 > iterations: > > $ hackbench -s 100 -l 100 -g 10 -f 20 > > Before After > --------------------------------- > 1.808 1.603 > 1.847 1.251 > 2.229 1.590 > 1.952 1.600 > 1.947 1.257 > 1.925 1.627 > 2.694 1.620 > 1.258 1.621 > 1.919 1.632 > 1.250 1.240 > > Average: > > 1.8829 1.5041 > > Based on initial work by Steve Muckle. > > Signed-off-by: Steve Muckle <smuckle.linux@gmail.com> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > kernel/sched/cpufreq_schedutil.c | 62 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 50 insertions(+), 12 deletions(-) > > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c > index ccb2ab89affb..045ce0a4e6d1 100644 > --- a/kernel/sched/cpufreq_schedutil.c > +++ b/kernel/sched/cpufreq_schedutil.c > @@ -12,6 +12,7 @@ > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > > #include <linux/cpufreq.h> > +#include <linux/kthread.h> > #include <linux/slab.h> > #include <trace/events/power.h> > > @@ -35,8 +36,10 @@ struct sugov_policy { > > /* The next fields are only needed if fast switch cannot be used. 
*/ > struct irq_work irq_work; > - struct work_struct work; > + struct kthread_work work; > struct mutex work_lock; > + struct kthread_worker worker; > + struct task_struct *thread; > bool work_in_progress; > > bool need_freq_update; > @@ -291,9 +294,10 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, > raw_spin_unlock(&sg_policy->update_lock); > } > > -static void sugov_work(struct work_struct *work) > +static void sugov_work(struct kthread_work *work) > { > - struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); > + struct sugov_policy *sg_policy = > + container_of(work, struct sugov_policy, work); Why this change? > > mutex_lock(&sg_policy->work_lock); > __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, > @@ -308,7 +312,7 @@ static void sugov_irq_work(struct irq_work *irq_work) > struct sugov_policy *sg_policy; > > sg_policy = container_of(irq_work, struct sugov_policy, irq_work); > - schedule_work_on(smp_processor_id(), &sg_policy->work); > + kthread_queue_work(&sg_policy->worker, &sg_policy->work); > } > > /************************** sysfs interface ************************/ > @@ -362,9 +366,23 @@ static struct kobj_type sugov_tunables_ktype = { > > static struct cpufreq_governor schedutil_gov; > > +static void sugov_policy_free(struct sugov_policy *sg_policy) > +{ > + if (!sg_policy->policy->fast_switch_enabled) { > + kthread_flush_worker(&sg_policy->worker); > + kthread_stop(sg_policy->thread); > + } > + > + mutex_destroy(&sg_policy->work_lock); > + kfree(sg_policy); > +} > + > static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) > { > struct sugov_policy *sg_policy; > + struct task_struct *thread; > + struct sched_param param = { .sched_priority = 50 }; I'd define a symbol for the 50. It's just one extra line of code ... 
Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/11/2016 02:16 PM, Rafael J. Wysocki wrote: > On Fri, Nov 11, 2016 at 11:22 AM, Viresh Kumar <viresh.kumar@linaro.org> wrote: >> If slow path frequency changes are conducted in a SCHED_OTHER context >> then they may be delayed for some amount of time, including >> indefinitely, when real time or deadline activity is taking place. >> >> Move the slow path to a real time kernel thread. In the future the >> thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set >> to 50 for now. >> >> Hackbench results on ARM Exynos, dual core A15 platform for 10 >> iterations: >> >> $ hackbench -s 100 -l 100 -g 10 -f 20 >> >> Before After >> --------------------------------- >> 1.808 1.603 >> 1.847 1.251 >> 2.229 1.590 >> 1.952 1.600 >> 1.947 1.257 >> 1.925 1.627 >> 2.694 1.620 >> 1.258 1.621 >> 1.919 1.632 >> 1.250 1.240 >> >> Average: >> >> 1.8829 1.5041 >> >> Based on initial work by Steve Muckle. >> >> Signed-off-by: Steve Muckle <smuckle.linux@gmail.com> >> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> >> --- >> kernel/sched/cpufreq_schedutil.c | 62 ++++++++++++++++++++++++++++++++-------- >> 1 file changed, 50 insertions(+), 12 deletions(-) >> >> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c >> index ccb2ab89affb..045ce0a4e6d1 100644 >> --- a/kernel/sched/cpufreq_schedutil.c >> +++ b/kernel/sched/cpufreq_schedutil.c >> @@ -12,6 +12,7 @@ >> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> >> #include <linux/cpufreq.h> >> +#include <linux/kthread.h> >> #include <linux/slab.h> >> #include <trace/events/power.h> >> >> @@ -35,8 +36,10 @@ struct sugov_policy { >> >> /* The next fields are only needed if fast switch cannot be used. 
*/ >> struct irq_work irq_work; >> - struct work_struct work; >> + struct kthread_work work; >> struct mutex work_lock; >> + struct kthread_worker worker; >> + struct task_struct *thread; >> bool work_in_progress; >> >> bool need_freq_update; >> @@ -291,9 +294,10 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, >> raw_spin_unlock(&sg_policy->update_lock); >> } >> >> -static void sugov_work(struct work_struct *work) >> +static void sugov_work(struct kthread_work *work) >> { >> - struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); >> + struct sugov_policy *sg_policy = >> + container_of(work, struct sugov_policy, work); > > Why this change? > >> >> mutex_lock(&sg_policy->work_lock); >> __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, >> @@ -308,7 +312,7 @@ static void sugov_irq_work(struct irq_work *irq_work) >> struct sugov_policy *sg_policy; >> >> sg_policy = container_of(irq_work, struct sugov_policy, irq_work); >> - schedule_work_on(smp_processor_id(), &sg_policy->work); >> + kthread_queue_work(&sg_policy->worker, &sg_policy->work); >> } >> >> /************************** sysfs interface ************************/ >> @@ -362,9 +366,23 @@ static struct kobj_type sugov_tunables_ktype = { >> >> static struct cpufreq_governor schedutil_gov; >> >> +static void sugov_policy_free(struct sugov_policy *sg_policy) >> +{ >> + if (!sg_policy->policy->fast_switch_enabled) { >> + kthread_flush_worker(&sg_policy->worker); >> + kthread_stop(sg_policy->thread); >> + } >> + >> + mutex_destroy(&sg_policy->work_lock); >> + kfree(sg_policy); >> +} >> + >> static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) >> { >> struct sugov_policy *sg_policy; >> + struct task_struct *thread; >> + struct sched_param param = { .sched_priority = 50 }; > > I'd define a symbol for the 50. It's just one extra line of code ... > Hold on a sec. I thought during LPC someone (Peter?) 
made a point that when RT thread run, we should bump the frequency to max? So, schedutil is going to trigger schedutil to bump up the frequency to max, right? -Saravana
On 11 November 2016 at 20:02, Tommaso Cucinotta <tommaso.cucinotta@sssup.it> wrote: > would you have an insight, as to what runtime/deadline/period to set, and/or > what specific timing constraints you'd like to set, just for this cpufreq > slowpath work? I don't have any such figures for now at least :( -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11 November 2016 at 20:09, Peter Zijlstra <peterz@infradead.org> wrote: > On Fri, Nov 11, 2016 at 03:32:04PM +0100, Tommaso Cucinotta wrote: >> >+ struct sched_param param = { .sched_priority = 50 }; >> >> won't you have a tunable here? (sysctl?) > > You can use the regular userspace tools, like schedtool and chrt to set > priorities. I wanted to get some help from you on this Peter. The out of tree Interactive governor has always used MAX_RT_PRIORITY - 1 here instead of 50. But Steve started with 50. What do you think should the value be ? -- viresh -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 12 November 2016 at 03:46, Rafael J. Wysocki <rafael@kernel.org> wrote: >> +static void sugov_work(struct kthread_work *work) >> { >> - struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); >> + struct sugov_policy *sg_policy = >> + container_of(work, struct sugov_policy, work); > > Why this change? Mistake .. >> static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) >> { >> struct sugov_policy *sg_policy; >> + struct task_struct *thread; >> + struct sched_param param = { .sched_priority = 50 }; > > I'd define a symbol for the 50. It's just one extra line of code ... Sure. As I asked in the cover letter, will you be fine if I send the same patch for ondemand/conservative governors ? -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 12 November 2016 at 07:01, Saravana Kannan <skannan@codeaurora.org> wrote: > Hold on a sec. I thought during LPC someone (Peter?) made a point that when > RT thread run, we should bump the frequency to max? I wasn't there but AFAIU, this is the case we have currently for the schedutil governor. And we (mobile world, Linaro) want to change that, as it doesn't work that well for us. So perhaps it is just the opposite of what you stated. > So, schedutil is going > to trigger schedutil to bump up the frequency to max, right? How is that question related to this patch ? -- viresh -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Nov 12, 2016 at 2:31 AM, Saravana Kannan <skannan@codeaurora.org> wrote: > On 11/11/2016 02:16 PM, Rafael J. Wysocki wrote: >> >> On Fri, Nov 11, 2016 at 11:22 AM, Viresh Kumar <viresh.kumar@linaro.org> >> wrote: >>> >>> If slow path frequency changes are conducted in a SCHED_OTHER context >>> then they may be delayed for some amount of time, including >>> indefinitely, when real time or deadline activity is taking place. >>> >>> Move the slow path to a real time kernel thread. In the future the >>> thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set >>> to 50 for now. >>> >>> Hackbench results on ARM Exynos, dual core A15 platform for 10 >>> iterations: >>> >>> $ hackbench -s 100 -l 100 -g 10 -f 20 >>> >>> Before After >>> --------------------------------- >>> 1.808 1.603 >>> 1.847 1.251 >>> 2.229 1.590 >>> 1.952 1.600 >>> 1.947 1.257 >>> 1.925 1.627 >>> 2.694 1.620 >>> 1.258 1.621 >>> 1.919 1.632 >>> 1.250 1.240 >>> >>> Average: >>> >>> 1.8829 1.5041 >>> >>> Based on initial work by Steve Muckle. >>> >>> Signed-off-by: Steve Muckle <smuckle.linux@gmail.com> >>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> >>> --- >>> kernel/sched/cpufreq_schedutil.c | 62 >>> ++++++++++++++++++++++++++++++++-------- >>> 1 file changed, 50 insertions(+), 12 deletions(-) >>> >>> diff --git a/kernel/sched/cpufreq_schedutil.c >>> b/kernel/sched/cpufreq_schedutil.c >>> index ccb2ab89affb..045ce0a4e6d1 100644 >>> --- a/kernel/sched/cpufreq_schedutil.c >>> +++ b/kernel/sched/cpufreq_schedutil.c >>> @@ -12,6 +12,7 @@ >>> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>> >>> #include <linux/cpufreq.h> >>> +#include <linux/kthread.h> >>> #include <linux/slab.h> >>> #include <trace/events/power.h> >>> >>> @@ -35,8 +36,10 @@ struct sugov_policy { >>> >>> /* The next fields are only needed if fast switch cannot be >>> used. 
*/ >>> struct irq_work irq_work; >>> - struct work_struct work; >>> + struct kthread_work work; >>> struct mutex work_lock; >>> + struct kthread_worker worker; >>> + struct task_struct *thread; >>> bool work_in_progress; >>> >>> bool need_freq_update; >>> @@ -291,9 +294,10 @@ static void sugov_update_shared(struct >>> update_util_data *hook, u64 time, >>> raw_spin_unlock(&sg_policy->update_lock); >>> } >>> >>> -static void sugov_work(struct work_struct *work) >>> +static void sugov_work(struct kthread_work *work) >>> { >>> - struct sugov_policy *sg_policy = container_of(work, struct >>> sugov_policy, work); >>> + struct sugov_policy *sg_policy = >>> + container_of(work, struct sugov_policy, work); >> >> >> Why this change? >> >>> >>> mutex_lock(&sg_policy->work_lock); >>> __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, >>> @@ -308,7 +312,7 @@ static void sugov_irq_work(struct irq_work *irq_work) >>> struct sugov_policy *sg_policy; >>> >>> sg_policy = container_of(irq_work, struct sugov_policy, >>> irq_work); >>> - schedule_work_on(smp_processor_id(), &sg_policy->work); >>> + kthread_queue_work(&sg_policy->worker, &sg_policy->work); >>> } >>> >>> /************************** sysfs interface ************************/ >>> @@ -362,9 +366,23 @@ static struct kobj_type sugov_tunables_ktype = { >>> >>> static struct cpufreq_governor schedutil_gov; >>> >>> +static void sugov_policy_free(struct sugov_policy *sg_policy) >>> +{ >>> + if (!sg_policy->policy->fast_switch_enabled) { >>> + kthread_flush_worker(&sg_policy->worker); >>> + kthread_stop(sg_policy->thread); >>> + } >>> + >>> + mutex_destroy(&sg_policy->work_lock); >>> + kfree(sg_policy); >>> +} >>> + >>> static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy >>> *policy) >>> { >>> struct sugov_policy *sg_policy; >>> + struct task_struct *thread; >>> + struct sched_param param = { .sched_priority = 50 }; >> >> >> I'd define a symbol for the 50. It's just one extra line of code ... 
>> > > Hold on a sec. I thought during LPC someone (Peter?) made a point that when > RT thread run, we should bump the frequency to max? So, schedutil is going > to trigger schedutil to bump up the frequency to max, right? No, it isn't, or at least that is unlikely. sugov_update_commit() sets sg_policy->work_in_progress before queuing the IRQ work and it is not cleared until the frequency changes in sugov_work(). OTOH, sugov_should_update_freq() checks sg_policy->work_in_progress upfront and returns false when it is set, so the governor won't see its own worker threads run, unless I'm overlooking something highly non-obvious. Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 11, 2016 at 11:16:59PM +0100, Rafael J. Wysocki wrote: > > + struct sched_param param = { .sched_priority = 50 }; > > I'd define a symbol for the 50. It's just one extra line of code ... A minor point for sure, but in general what's the motivation for defining symbols for things which are only used once? It makes it harder to untangle and learn a piece of code IMO, having to jump to the definitions of them to see what they are. thanks, Steve -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Nov 13, 2016 at 03:37:18PM +0100, Rafael J. Wysocki wrote: > > Hold on a sec. I thought during LPC someone (Peter?) made a point that when > > RT thread run, we should bump the frequency to max? So, schedutil is going > > to trigger schedutil to bump up the frequency to max, right? > > No, it isn't, or at least that is unlikely. > > sugov_update_commit() sets sg_policy->work_in_progress before queuing > the IRQ work and it is not cleared until the frequency changes in > sugov_work(). > > OTOH, sugov_should_update_freq() checks sg_policy->work_in_progress > upfront and returns false when it is set, so the governor won't see > its own worker threads run, unless I'm overlooking something highly > non-obvious. FWIW my intention with the original version of this patch (which I neglected to communicate to Viresh) was that it would depend on changing the frequency policy for RT. I had been using rt_avg. It sounds like during LPC there were talks of using another metric. It does appear things would work okay without that but it also seems a bit fragile. There's the window between when the work_in_progress gets cleared and the RT kthread yields. I have not thought through the various scenarios there, what is possible and tested to see if it is significant enough to impact power-sensitive platforms. thanks, Steve -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Nov 13, 2016 at 8:47 PM, Steve Muckle <smuckle.linux@gmail.com> wrote: > On Sun, Nov 13, 2016 at 03:37:18PM +0100, Rafael J. Wysocki wrote: >> > Hold on a sec. I thought during LPC someone (Peter?) made a point that when >> > RT thread run, we should bump the frequency to max? So, schedutil is going >> > to trigger schedutil to bump up the frequency to max, right? >> >> No, it isn't, or at least that is unlikely. >> >> sugov_update_commit() sets sg_policy->work_in_progress before queuing >> the IRQ work and it is not cleared until the frequency changes in >> sugov_work(). >> >> OTOH, sugov_should_update_freq() checks sg_policy->work_in_progress >> upfront and returns false when it is set, so the governor won't see >> its own worker threads run, unless I'm overlooking something highly >> non-obvious. > > FWIW my intention with the original version of this patch (which I > neglected to communicate to Viresh) was that it would depend on changing > the frequency policy for RT. I had been using rt_avg. It sounds like > during LPC there were talks of using another metric. > > It does appear things would work okay without that but it also seems > a bit fragile. Yes, it does. To a minimum, there should be a comment regarding that in the patches. > There's the window between when the work_in_progress > gets cleared and the RT kthread yields. I have not thought through the > various scenarios there, what is possible and tested to see if it is > significant enough to impact power-sensitive platforms. Well, me neither, to be entirely honest. :-) That said, there is a limited number of call sites for update_curr_rt(), where SCHED_CPUFREQ_RT is passed to cpufreq governors: dequeue_task_rt(), put_prev_task_rt(), pick_next_task_rt(), and task_tick_rt(). I'm not sure how pick_next_task_rt() can be relevant here at all, though, and task_tick_rt() would need to be running exactly during the window mentioned above, so it probably is negligible either, at least on the average. 
From the quick look at the scheduler core, put_prev_task() is mostly called for running tasks, so that case doesn't look like something to worry about too, although it would need to be looked through in detail. The dequeue part I'm totally unsure about. In any case, the clearing of work_in_progress might still be deferred by queuing a regular (non-RT) work item to do that from the kthread work (that will guarantee "hiding" the kthread work from the governor), but admittedly that would be a sledgehammer of sorts (and it might defeat the purpose of the whole exercise) ... Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 12-11-16, 10:57, Viresh Kumar wrote: > On 12 November 2016 at 07:01, Saravana Kannan <skannan@codeaurora.org> wrote: > > Hold on a sec. I thought during LPC someone (Peter?) made a point that when > > RT thread run, we should bump the frequency to max? > > I wasn't there but AFAIU, this is the case we have currently for the schedutil > governor. And we (mobile world, Linaro) want to change that it doesn't work > that well for us. So perhaps it is just the opposite of what you stated. > > > So, schedutil is going > > to trigger schedutil to bump up the frequency to max, right? > > How is that question related to this patch ? Trash my last email, I failed to read yours :(
On Sat, Nov 12, 2016 at 10:52:35AM +0530, Viresh Kumar wrote: > On 11 November 2016 at 20:09, Peter Zijlstra <peterz@infradead.org> wrote: > > On Fri, Nov 11, 2016 at 03:32:04PM +0100, Tommaso Cucinotta wrote: > >> >+ struct sched_param param = { .sched_priority = 50 }; > >> > >> won't you have a tunable here? (sysctl?) > > > > You can use the regular userspace tools, like schedtool and chrt to set > > priorities. > > I wanted to get some help from you on this Peter. The out of tree Interactive > governor has always used MAX_RT_PRIORITY - 1 here instead of 50. > > But Steve started with 50. What do you think should the value be ? Any static prio value is wrong (static prio assignment requires system knowledge that the kernel doesn't and cannot have), 50 is what threaded IRQs default too as well IIRC, so it would at least be consistent with that. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 14-11-16, 10:22, Peter Zijlstra wrote: > Any static prio value is wrong (static prio assignment requires system > knowledge that the kernel doesn't and cannot have), 50 is what threaded > IRQs default too as well IIRC, so it would at least be consistent with > that. Yes you are correct and I have found a better way of defining the priority in this case using that code instead of magic figure 50. MAX_USER_RT_PRIO/2 :)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ccb2ab89affb..045ce0a4e6d1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> @@ -35,8 +36,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,9 +294,10 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { - struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + struct sugov_policy *sg_policy = + container_of(work, struct sugov_policy, work); mutex_lock(&sg_policy->work_lock); __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, @@ -308,7 +312,7 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -362,9 +366,23 @@ static struct kobj_type sugov_tunables_ktype = { static struct cpufreq_governor schedutil_gov; +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + if (!sg_policy->policy->fast_switch_enabled) { + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); + } + + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + static struct sugov_policy *sugov_policy_alloc(struct 
cpufreq_policy *policy) { struct sugov_policy *sg_policy; + struct task_struct *thread; + struct sched_param param = { .sched_priority = 50 }; + int ret; sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); if (!sg_policy) @@ -372,16 +390,36 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); - return sg_policy; -} -static void sugov_policy_free(struct sugov_policy *sg_policy) -{ - mutex_destroy(&sg_policy->work_lock); - kfree(sg_policy); + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return sg_policy; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return NULL; + } + sg_policy->thread = thread; + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + sugov_policy_free(sg_policy); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return NULL; + } + + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return sg_policy; } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -541,7 +579,7 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); } static void sugov_limits(struct cpufreq_policy *policy)