diff mbox

[v5,2/3] PM / DEVFREQ: add basic governors

Message ID 1312794188-9823-3-git-send-email-myungjoo.ham@samsung.com (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

MyungJoo Ham Aug. 8, 2011, 9:03 a.m. UTC
Four CPUFREQ-like governors are provided as examples.

powersave: use the lowest frequency possible. The user (device) should
set the polling_ms as 0 because polling is useless for this governor.

performance: use the highest frequency possible. The user (device)
should set the polling_ms as 0 because polling is useless for this
governor.

userspace: use the user specified frequency stored at
devfreq.user_set_freq. With sysfs support in the following patch, a user
may set the value with the sysfs interface.

simple_ondemand: simplified version of CPUFREQ's ONDEMAND governor.

When a user updates OPP entries (enable/disable/add), OPP framework
automatically notifies DEVFREQ to update operating frequency
accordingly. Thus, DEVFREQ users (device drivers) do not need to update
DEVFREQ manually with OPP entry updates or set polling_ms for powersave,
performance, userspace, or any other "static" governors.

Note that these are given only as basic examples for governors and any
devices with DEVFREQ may implement their own governors with the drivers
and use them.

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>

---
Changes from v4:
- Added userspace governor

Changes from v3:
- Bugfixes on simple-ondemand governor (divide by zero / overflow)
- Style fixes
- Give names to governors
---
 drivers/base/power/devfreq.c |  100 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/devfreq.h      |    8 +++
 2 files changed, 108 insertions(+), 0 deletions(-)

Comments

Mike Turquette Aug. 11, 2011, 1:35 a.m. UTC | #1
On Mon, Aug 8, 2011 at 2:03 AM, MyungJoo Ham <myungjoo.ham@samsung.com> wrote:
> Four CPUFREQ-like governors are provided as examples.
>
> powersave: use the lowest frequency possible. The user (device) should
> set the polling_ms as 0 because polling is useless for this governor.
>
> performance: use the highest freqeuncy possible. The user (device)
> should set the polling_ms as 0 because polling is useless for this
> governor.

Polling is the only practical use for devfreq, assuming a QoS API
exists for DVFS.  As such powersave and performance governors should
be removed.

> userspace: use the user specified frequency stored at
> devfreq.user_set_freq. With sysfs support in the following patch, a user
> may set the value with the sysfs interface.
>
> simple_ondemand: simplified version of CPUFREQ's ONDEMAND governor.

I won't repeat everything from patch 1 of this series, but the
governors should implement the queue/loop logic in the same way that
CPUfreq does, and the individual devices should have their own
delayed_work.

> When a user updates OPP entries (enable/disable/add), OPP framework
> automatically notifies DEVFREQ to update operating frequency
> accordingly. Thus, DEVFREQ users (device drivers) do not need to update

It would be nice if OPP library "notified" devfreq but it does not
today.  OPP library needs notifiers and devfreq can provide handlers
for them.

> DEVFREQ manually with OPP entry updates or set polling_ms for powersave
> , performance, userspace, or any other "static" governors.
>
> Note that these are given only as basic examples for governors and any
> devices with DEVFREQ may implement their own governors with the drivers
> and use them.
>
> Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
> Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
>
> ---
> Changes from v4:
> - Added userspace governor
>
> Changes from v3:
> - Bugfixes on simple-ondemand governor (divide by zero / overflow)
> - Style fixes
> - Give names to governors
> ---
>  drivers/base/power/devfreq.c |  100 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/devfreq.h      |    8 +++
>  2 files changed, 108 insertions(+), 0 deletions(-)

Governors should be split out into their own file, especially since
they need to grow to include polling/queueing logic.

>
> diff --git a/drivers/base/power/devfreq.c b/drivers/base/power/devfreq.c
> index 6f4bd3a..c53bca9 100644
> --- a/drivers/base/power/devfreq.c
> +++ b/drivers/base/power/devfreq.c
> @@ -301,3 +301,103 @@ static int __init devfreq_init(void)
>        return 0;
>  }
>  late_initcall(devfreq_init);
> +
> +static int devfreq_powersave_func(struct devfreq *df,
> +                                 unsigned long *freq)
> +{
> +       *freq = 0; /* devfreq_do will run "ceiling" to 0 */
> +       return 0;
> +}
> +
> +struct devfreq_governor devfreq_powersave = {
> +       .name = "powersave",
> +       .get_target_freq = devfreq_powersave_func,
> +};
> +
> +static int devfreq_performance_func(struct devfreq *df,
> +                                   unsigned long *freq)
> +{
> +       *freq = UINT_MAX; /* devfreq_do will run "floor" */
> +       return 0;
> +}
> +
> +struct devfreq_governor devfreq_performance = {
> +       .name = "performance",
> +       .get_target_freq = devfreq_performance_func,
> +};
> +
> +static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq)
> +{
> +       if (df->user_set_freq == 0)
> +               *freq = df->previous_freq; /* No user freq specified yet */
> +       else
> +               *freq = df->user_set_freq;
> +
> +       return 0;
> +}
> +
> +struct devfreq_governor devfreq_userspace = {
> +       .name = "userspace",
> +       .get_target_freq = devfreq_userspace_func,
> +};
> +
> +/* Constants for DevFreq-Simple-Ondemand (DFSO) */
> +#define DFSO_UPTHRESHOLD       (90)
> +#define DFSO_DOWNDIFFERENCTIAL (5)
> +static int devfreq_simple_ondemand_func(struct devfreq *df,
> +                                       unsigned long *freq)
> +{
> +       struct devfreq_dev_status stat;
> +       int err = df->profile->get_dev_status(df->dev, &stat);
> +       unsigned long long a, b;
> +
> +       if (err)
> +               return err;
> +
> +       /* Assume MAX if it is going to be divided by zero */
> +       if (stat.total_time == 0) {
> +               *freq = UINT_MAX;
> +               return 0;
> +       }
> +
> +       /* Prevent overflow */
> +       if (stat.busy_time >= (1 << 24) || stat.total_time >= (1 << 24)) {
> +               stat.busy_time >>= 7;
> +               stat.total_time >>= 7;
> +       }
> +
> +       /* Set MAX if it's busy enough */
> +       if (stat.busy_time * 100 >
> +           stat.total_time * DFSO_UPTHRESHOLD) {

Thresholds should not be constants, but should be tuneable parameters,
per-device.  This is yet another reason for revising the existing
relationship between devfreq core code, governors and devices.

> +               *freq = UINT_MAX;
> +               return 0;
> +       }
> +
> +       /* Set MAX if we do not know the initial frequency */
> +       if (stat.current_frequency == 0) {
> +               *freq = UINT_MAX;
> +               return 0;
> +       }
> +
> +       /* Keep the current frequency */
> +       if (stat.busy_time * 100 >
> +           stat.total_time * (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL)) {

Same as above.

> +               *freq = stat.current_frequency;
> +               return 0;
> +       }
> +
> +       /* Set the desired frequency based on the load */
> +       a = stat.busy_time;
> +       a *= stat.current_frequency;
> +       b = div_u64(a, stat.total_time);
> +       b *= 100;
> +       b = div_u64(b, (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL / 2));

Same as above.

Regards,
Mike

> +       *freq = (unsigned long) b;
> +
> +       return 0;
> +}
> +
> +struct devfreq_governor devfreq_simple_ondemand = {
> +       .name = "simple_ondemand",
> +       .get_target_freq = devfreq_simple_ondemand_func,
> +};
> diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
> index 6ec630b..7131d2a 100644
> --- a/include/linux/devfreq.h
> +++ b/include/linux/devfreq.h
> @@ -57,6 +57,8 @@ struct devfreq_governor {
>  * @next_polling       the number of remaining "devfreq_monitor" executions to
>  *                     reevaluate frequency/voltage of the device. Set by
>  *                     profile's polling_ms interval.
> + * @user_set_freq      User specified adequete frequency value (thru sysfs
> + *             interface). Governors may and may not use this value.
>  * @data       Private data of the governor. The devfreq framework does not
>  *             touch this.
>  *
> @@ -72,6 +74,7 @@ struct devfreq {
>        unsigned long previous_freq;
>        unsigned int next_polling;
>
> +       unsigned long user_set_freq; /* governors may ignore this. */
>        void *data; /* private data for governors */
>  };
>
> @@ -81,6 +84,11 @@ extern int devfreq_add_device(struct device *dev,
>                           struct devfreq_governor *governor);
>  extern int devfreq_remove_device(struct device *dev);
>  extern int devfreq_update(struct device *dev);
> +
> +extern struct devfreq_governor devfreq_powersave;
> +extern struct devfreq_governor devfreq_performance;
> +extern struct devfreq_governor devfreq_userspace;
> +extern struct devfreq_governor devfreq_simple_ondemand;
>  #else /* !CONFIG_PM_DEVFREQ */
>  static int devfreq_add_device(struct device *dev,
>                           struct devfreq_dev_profile *profile,
> --
> 1.7.4.1
>
>
MyungJoo Ham Aug. 16, 2011, 8:52 a.m. UTC | #2
On Thu, Aug 11, 2011 at 10:35 AM, Turquette, Mike <mturquette@ti.com> wrote:
[]
>
> Polling is the only practical use for devfreq, assuming a QoS API
> exists for DVFS.  As such powersave and performance governors should
> be removed.

Although powersave/performance governors may seem useless, they are
used as a basis for measuring the usefulness of the DVFS mechanism of
specific devices. If a device is going to use DVFS, we can test the
device with them to find out the potential power saving (compare
powersave to performance) and the performance deterioration (compared
to performance). Often, in the testing phase, QA teams use performance to
find out any issues with DVFS features (in CPUFREQ). Users may simply
want to use the performance governor in some cases (power is not an issue
sometimes).

Using QoS APIs simply to set "minimum" or "maximum" is possible.
However, they are not that straightforward; e.g., how should we set
"DMA latency" to be fixed at the minimum frequency regardless of
others, or how should we set "Network latency" to be fixed at the
maximum frequency? especially without knowing the specifications of
each DVFS-capable device (such as available frequencies, valid latency
values, ...).

>
>> userspace: use the user specified frequency stored at
>> devfreq.user_set_freq. With sysfs support in the following patch, a user
>> may set the value with the sysfs interface.
>>
>> simple_ondemand: simplified version of CPUFREQ's ONDEMAND governor.
>
> I won't repeate everything from patch 1 of this series, but the
> governors should implement the queue/loop logic in the same way that
> CPUfreq does, and the individual devices should have their own
> delayed_work.

First, in the case where we want each DVFS-capable device to have an
exact polling frequency (up to jiffy resolution), we only need to set
polling_interval = jiffies_to_msecs(1);.

In the case of CPUFREQ, there would be at most one polling loop for
each core. However, in the case of DEVFREQ, there could be multiple
polling loops on a core if CPUFREQ-like looping logic is introduced.
Why don't we reduce that overhead, given that the function is the same,
it is easily doable, and it reduces redundancy?

>
>> When a user updates OPP entries (enable/disable/add), OPP framework
>> automatically notifies DEVFREQ to update operating frequency
>> accordingly. Thus, DEVFREQ users (device drivers) do not need to update
>
> It would be nice if OPP library "notified" devfreq but it does not
> today.  OPP library needs notifiers and devfreq can provide handlers
> for them.

That's why devfreq_update() is added in the patch. While DEVFREQ is
the only one requiring notifications from OPP, do you think we may
incur the overhead of notifier at OPP by replacing devfreq_update with
notifier? If we somehow add another module that requires notifications
from OPP for frequency availability changes, we will need to implement
notifier at OPP side, but not just yet, I guess: (discussed before at
https://lists.linux-foundation.org/pipermail/linux-pm/2011-July/032053.html
)

>
[]
>> ---
>>  drivers/base/power/devfreq.c |  100 ++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/devfreq.h      |    8 +++
>>  2 files changed, 108 insertions(+), 0 deletions(-)
>
> Governors should be split out into their own file, especially since
> they need to grow to include polling/queueing logic.

We will need to decide where to settle devfreq core, drivers, and
governors first. Would /drivers/devfreq/ be appropriate?

[]
>> +
>> +       /* Set MAX if it's busy enough */
>> +       if (stat.busy_time * 100 >
>> +           stat.total_time * DFSO_UPTHRESHOLD) {
>
> Thresholds should not be constants, but should be tuneable parameters,
> per-device.  This is yet another reason for revising the existing
> relationship between devfreq core code, governors and devices.
>

I agree. I think I should add a governor-specific "setup" value to
devfreq_add_device(), modifying the interface from

extern int devfreq_add_device(struct device *dev, struct
devfreq_dev_profile *profile, struct devfreq_governor *governor);
==>
extern int devfreq_add_device(struct device *dev, struct
devfreq_dev_profile *profile, struct devfreq_governor *governor, void
*gov_data);

where gov_data is fed to struct devfreq's data field.


>> +               *freq = UINT_MAX;
>> +               return 0;
>> +       }
>> +
>> +       /* Set MAX if we do not know the initial frequency */
>> +       if (stat.current_frequency == 0) {
>> +               *freq = UINT_MAX;
>> +               return 0;
>> +       }
>> +
>> +       /* Keep the current frequency */
>> +       if (stat.busy_time * 100 >
>> +           stat.total_time * (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL)) {
>
> Same as above.
Yes.

>
>> +               *freq = stat.current_frequency;
>> +               return 0;
>> +       }
>> +
>> +       /* Set the desired frequency based on the load */
>> +       a = stat.busy_time;
>> +       a *= stat.current_frequency;
>> +       b = div_u64(a, stat.total_time);
>> +       b *= 100;
>> +       b = div_u64(b, (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL / 2));
>
> Same as above.
Yes.

>
> Regards,
> Mike
>


Cheers!
MyungJoo
Mike Turquette Aug. 16, 2011, 6:11 p.m. UTC | #3
On Tue, Aug 16, 2011 at 1:52 AM, MyungJoo Ham <myungjoo.ham@samsung.com> wrote:
> On Thu, Aug 11, 2011 at 10:35 AM, Turquette, Mike <mturquette@ti.com> wrote:
> []
>>
>> Polling is the only practical use for devfreq, assuming a QoS API
>> exists for DVFS.  As such powersave and performance governors should
>> be removed.
>
> Although powersave/performance governors may seem useless, they are
> used as basis on measuring the usefulness of DVFS mechanism of
> specific devices. If a device is going to use DVFS, we can test the
> device with them to find out the potential power save (compare
> powersave to performance) and the performance deterioration (compared
> to performance). Often, in testing phase, QA teams use performance to
> find out any issues with DVFS features (in CPUFREQ). Users may simply
> want to use performance governor in some cases (power is not an issue
> sometimes).

Fair enough.  Keeping them around for testing is sensible.

> Using QoS APIs simply to set "minimum" or "maximum" is possible.
> However, they are not that straightforward; e.g., how should we set
> "DMA latency" to be fixed at the minimum frequency regardless of
> others, or how should we set "Network latency" to be fixed at the
> maximum frequency? especially without knowing the specifications of
> each DVFS-capable device (such as available frequencies, valid latency
> values, ...).
>
>>
>>> userspace: use the user specified frequency stored at
>>> devfreq.user_set_freq. With sysfs support in the following patch, a user
>>> may set the value with the sysfs interface.
>>>
>>> simple_ondemand: simplified version of CPUFREQ's ONDEMAND governor.
>>
>> I won't repeate everything from patch 1 of this series, but the
>> governors should implement the queue/loop logic in the same way that
>> CPUfreq does, and the individual devices should have their own
>> delayed_work.
>
> First, in case where we want to let each DVFS-capable device have
> exact polling frequency (up to jiffy resolution), we only need to set
> polling_interval = jiffies_to_msecs(1);.

That requires a source code change for anyone that wants to do it.  My
main complaint with this method is that it is restrictive to begin
with and the whole method for determining the next_polling time
reproduces what workqueues already give us.

> In case of CPUFREQ, there would be only one polling loop at most for
> each core. However, in case of DEVFREQ, there could be multiple
> polling loops at a core if CPUFREQ-like looping logic is introduced.
> Why don't we reduce that overhead while their function is same, it is
> easily doable, and it reduces redundancy?

I'm afraid I don't follow.  I was thinking of having a single wq loop
for each device.  Under what conditions would a single device have
multiple wq loops operating against it?

>>> When a user updates OPP entries (enable/disable/add), OPP framework
>>> automatically notifies DEVFREQ to update operating frequency
>>> accordingly. Thus, DEVFREQ users (device drivers) do not need to update
>>
>> It would be nice if OPP library "notified" devfreq but it does not
>> today.  OPP library needs notifiers and devfreq can provide handlers
>> for them.
>
> That's why devfreq_update() is added in the patch. While DEVFREQ is
> the only one requiring notifications from OPP, do you think we may
> incur the overhead of notifier at OPP by replacing devfreq_update with
> notifier? If we somehow add another module that requires notifications
> from OPP for frequency availability changes, we will need to implement
> notifier at OPP side, but not just yet, I guess: (discussed before at
> https://lists.linux-foundation.org/pipermail/linux-pm/2011-July/032053.html
> )

Reading that thread makes me think that we really should implement
notifiers in the OPP library.  An obvious user of OPP notifiers would
be CPUfreq.  I think it is safe to say that there may be
implementations of devfreq and CPUfreq that live side-by-side in the
near future; OPPs might be enabled/disabled dynamically, which means
both of them need callbacks.  Better to abstract it out early, I
think.

>>
> []
>>> ---
>>>  drivers/base/power/devfreq.c |  100 ++++++++++++++++++++++++++++++++++++++++++
>>>  include/linux/devfreq.h      |    8 +++
>>>  2 files changed, 108 insertions(+), 0 deletions(-)
>>
>> Governors should be split out into their own file, especially since
>> they need to grow to include polling/queueing logic.
>
> We will need to decide where to settle devfreq core, drivers, and
> governors first. Would /drivers/devfreq/ be appropriate?

I think GKH already ACK'd drivers/devfreq in a previous thread:
https://lists.linux-foundation.org/pipermail/linux-pm/2011-August/032537.html

> []
>>> +
>>> +       /* Set MAX if it's busy enough */
>>> +       if (stat.busy_time * 100 >
>>> +           stat.total_time * DFSO_UPTHRESHOLD) {
>>
>> Thresholds should not be constants, but should be tuneable parameters,
>> per-device.  This is yet another reason for revising the existing
>> relationship between devfreq core code, governors and devices.
>>
>
> I agree. I think I should add governor specific "setup" value at
> devfreq_add_device(); modifying the interface from
>
> extern int devfreq_add_device(struct device *dev, struct
> devfreq_dev_profile *profile, struct devfreq_governor *governor);
> ==>
> extern int devfreq_add_device(struct device *dev, struct
> devfreq_dev_profile *profile, struct devfreq_governor *governor, void
> *gov_data);
>
> where gov_data is fed to struct devfreq's data field.

It would be nice for the threshold values to be run-time tunable via
sysfs.  CPUfreq does this well today for ondemand/conservative
governors and it really helps when doing power/performance tuning.

Regards,
Mike

>>> +               *freq = UINT_MAX;
>>> +               return 0;
>>> +       }
>>> +
>>> +       /* Set MAX if we do not know the initial frequency */
>>> +       if (stat.current_frequency == 0) {
>>> +               *freq = UINT_MAX;
>>> +               return 0;
>>> +       }
>>> +
>>> +       /* Keep the current frequency */
>>> +       if (stat.busy_time * 100 >
>>> +           stat.total_time * (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL)) {
>>
>> Same as above.
> Yes.
>
>>
>>> +               *freq = stat.current_frequency;
>>> +               return 0;
>>> +       }
>>> +
>>> +       /* Set the desired frequency based on the load */
>>> +       a = stat.busy_time;
>>> +       a *= stat.current_frequency;
>>> +       b = div_u64(a, stat.total_time);
>>> +       b *= 100;
>>> +       b = div_u64(b, (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL / 2));
>>
>> Same as above.
> Yes.
>
>>
>> Regards,
>> Mike
>>
>
>
> Cheers!
> MyungJoo
>
> --
> MyungJoo Ham (???), Ph.D.
> Mobile Software Platform Lab,
> Digital Media and Communications (DMC) Business
> Samsung Electronics
> cell: 82-10-6714-2858
>
MyungJoo Ham Aug. 17, 2011, 7:32 a.m. UTC | #4
On Wed, Aug 17, 2011 at 3:11 AM, Turquette, Mike <mturquette@ti.com> wrote:
> On Tue, Aug 16, 2011 at 1:52 AM, MyungJoo Ham <myungjoo.ham@samsung.com> wrote:
>>
>> First, in case where we want to let each DVFS-capable device have
>> exact polling frequency (up to jiffy resolution), we only need to set
>> polling_interval = jiffies_to_msecs(1);.
>
> That requires a source code change for anyone that wants to do it.  My
> main complaint with this method is that it is restrictive to begin
> with and the whole method for determining the next_polling time
> reproduces what workqueues already give us.
>
>> In case of CPUFREQ, there would be only one polling loop at most for
>> each core. However, in case of DEVFREQ, there could be multiple
>> polling loops at a core if CPUFREQ-like looping logic is introduced.
>> Why don't we reduce that overhead while their function is same, it is
>> easily doable, and it reduces redundancy?
>
> I'm afraid I don't follow.  I was thinking of having a single wq loop
> for each device.  Under what conditions would a single device have
> multiple wq loops operating against it?

I meant a single wq loop for each device and multiple DEVFREQ devices
for a system.

I aggregated multiple instances of DEVFREQ polling into one polling loop in order to
1. remove redundant polling loops,
2. simplify governor implementations; I'm presuming that some devices
(especially GPUs and MMC hosts) might want their own custom governors
although the governors won't be complex, and
3. reduce overhead.

I don't see any benefit of looping at each DEVFREQ device, yet.

The case of CPUFREQ is a bit different because each CPUFREQ device
(CPU) is capable of looping itself and each loop should represent
one CPU. Each looping device (CPU) will have at most one CPUFREQ
polling loop, and each instance of CPUFREQ should be executed on the
corresponding CPU in order to avoid being put to sleep while the
represented CPU is running.

>
>>>> When a user updates OPP entries (enable/disable/add), OPP framework
>>>> automatically notifies DEVFREQ to update operating frequency
>>>> accordingly. Thus, DEVFREQ users (device drivers) do not need to update
>>>
>>> It would be nice if OPP library "notified" devfreq but it does not
>>> today.  OPP library needs notifiers and devfreq can provide handlers
>>> for them.
>>
>> That's why devfreq_update() is added in the patch. While DEVFREQ is
>> the only one requiring notifications from OPP, do you think we may
>> incur the overhead of notifier at OPP by replacing devfreq_update with
>> notifier? If we somehow add another module that requires notifications
>> from OPP for frequency availability changes, we will need to implement
>> notifier at OPP side, but not just yet, I guess: (discussed before at
>> https://lists.linux-foundation.org/pipermail/linux-pm/2011-July/032053.html
>> )
>
> Reading that thread makes me think that we really should implement
> notifiers in the OPP library.  An obvious user of OPP notifiers would
> be CPUfreq.  I think it is safe to say that there may be
> implementations of devfreq and CPUfreq that live side-by-side in the
> near future; OPPs might be enabled/disabled dynamically, which means
> both of them need callbacks.  Better to abstract it out early, I
> think.

Ok, assuming that there would be another user requesting notifications from
OPP, I've added "opp_get_notifier()", which returns a struct
srcu_notifier_head *, and removed devfreq_update() in the patchset v6
candidate.

>
>>>
>> []
>>>> ---
>>>>  drivers/base/power/devfreq.c |  100 ++++++++++++++++++++++++++++++++++++++++++
>>>>  include/linux/devfreq.h      |    8 +++
>>>>  2 files changed, 108 insertions(+), 0 deletions(-)
>>>
>>> Governors should be split out into their own file, especially since
>>> they need to grow to include polling/queueing logic.
>>
>> We will need to decide where to settle devfreq core, drivers, and
>> governors first. Would /drivers/devfreq/ be appropriate?
>
> I think GKH already ACK'd drivers/devfreq in a previous thread:
> https://lists.linux-foundation.org/pipermail/linux-pm/2011-August/032537.html

Yup. I'm moving them to /drivers/devfreq/

>
>> []
>>>> +
>>>> +       /* Set MAX if it's busy enough */
>>>> +       if (stat.busy_time * 100 >
>>>> +           stat.total_time * DFSO_UPTHRESHOLD) {
>>>
>>> Thresholds should not be constants, but should be tuneable parameters,
>>> per-device.  This is yet another reason for revising the existing
>>> relationship between devfreq core code, governors and devices.
>>>
>>
>> I agree. I think I should add governor specific "setup" value at
>> devfreq_add_device(); modifying the interface from
>>
>> extern int devfreq_add_device(struct device *dev, struct
>> devfreq_dev_profile *profile, struct devfreq_governor *governor);
>> ==>
>> extern int devfreq_add_device(struct device *dev, struct
>> devfreq_dev_profile *profile, struct devfreq_governor *governor, void
>> *gov_data);
>>
>> where gov_data is fed to struct devfreq's data field.
>
> It would be nice for the threshold values to be run-time tunable via
> sysfs.  CPUfreq does this well today for ondemand/conservative
> governors and it really helps when doing power/performance tuning.
>
> Regards,
> Mike
>

This will be done with the next revision.

Thank you.

MyungJoo

>>>> +               *freq = UINT_MAX;
>>>> +               return 0;
>>>> +       }
>>>> +
>>>> +       /* Set MAX if we do not know the initial frequency */
>>>> +       if (stat.current_frequency == 0) {
>>>> +               *freq = UINT_MAX;
>>>> +               return 0;
>>>> +       }
>>>> +
>>>> +       /* Keep the current frequency */
>>>> +       if (stat.busy_time * 100 >
>>>> +           stat.total_time * (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL)) {
>>>
>>> Same as above.
>> Yes.
>>
>>>
>>>> +               *freq = stat.current_frequency;
>>>> +               return 0;
>>>> +       }
>>>> +
>>>> +       /* Set the desired frequency based on the load */
>>>> +       a = stat.busy_time;
>>>> +       a *= stat.current_frequency;
>>>> +       b = div_u64(a, stat.total_time);
>>>> +       b *= 100;
>>>> +       b = div_u64(b, (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL / 2));
>>>
>>> Same as above.
>> Yes.
>>
>>>
>>> Regards,
>>> Mike
>>>
>>
>>
>> Cheers!
>> MyungJoo
>>
>> --
>> MyungJoo Ham (???), Ph.D.
>> Mobile Software Platform Lab,
>> Digital Media and Communications (DMC) Business
>> Samsung Electronics
>> cell: 82-10-6714-2858
>>
>
diff mbox

Patch

diff --git a/drivers/base/power/devfreq.c b/drivers/base/power/devfreq.c
index 6f4bd3a..c53bca9 100644
--- a/drivers/base/power/devfreq.c
+++ b/drivers/base/power/devfreq.c
@@ -301,3 +301,103 @@  static int __init devfreq_init(void)
 	return 0;
 }
 late_initcall(devfreq_init);
+
+static int devfreq_powersave_func(struct devfreq *df,
+				  unsigned long *freq)
+{
+	*freq = 0; /* devfreq_do will run "ceiling" to 0 */
+	return 0;
+}
+
+struct devfreq_governor devfreq_powersave = {
+	.name = "powersave",
+	.get_target_freq = devfreq_powersave_func,
+};
+
+static int devfreq_performance_func(struct devfreq *df,
+				    unsigned long *freq)
+{
+	*freq = UINT_MAX; /* devfreq_do will run "floor" */
+	return 0;
+}
+
+struct devfreq_governor devfreq_performance = {
+	.name = "performance",
+	.get_target_freq = devfreq_performance_func,
+};
+
+static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq)
+{
+	if (df->user_set_freq == 0)
+		*freq = df->previous_freq; /* No user freq specified yet */
+	else
+		*freq = df->user_set_freq;
+
+	return 0;
+}
+
+struct devfreq_governor devfreq_userspace = {
+	.name = "userspace",
+	.get_target_freq = devfreq_userspace_func,
+};
+
+/* Constants for DevFreq-Simple-Ondemand (DFSO) */
+#define DFSO_UPTHRESHOLD	(90)
+#define DFSO_DOWNDIFFERENCTIAL	(5)
+static int devfreq_simple_ondemand_func(struct devfreq *df,
+					unsigned long *freq)
+{
+	struct devfreq_dev_status stat;
+	int err = df->profile->get_dev_status(df->dev, &stat);
+	unsigned long long a, b;
+
+	if (err)
+		return err;
+
+	/* Assume MAX if it is going to be divided by zero */
+	if (stat.total_time == 0) {
+		*freq = UINT_MAX;
+		return 0;
+	}
+
+	/* Prevent overflow */
+	if (stat.busy_time >= (1 << 24) || stat.total_time >= (1 << 24)) {
+		stat.busy_time >>= 7;
+		stat.total_time >>= 7;
+	}
+
+	/* Set MAX if it's busy enough */
+	if (stat.busy_time * 100 >
+	    stat.total_time * DFSO_UPTHRESHOLD) {
+		*freq = UINT_MAX;
+		return 0;
+	}
+
+	/* Set MAX if we do not know the initial frequency */
+	if (stat.current_frequency == 0) {
+		*freq = UINT_MAX;
+		return 0;
+	}
+
+	/* Keep the current frequency */
+	if (stat.busy_time * 100 >
+	    stat.total_time * (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL)) {
+		*freq = stat.current_frequency;
+		return 0;
+	}
+
+	/* Set the desired frequency based on the load */
+	a = stat.busy_time;
+	a *= stat.current_frequency;
+	b = div_u64(a, stat.total_time);
+	b *= 100;
+	b = div_u64(b, (DFSO_UPTHRESHOLD - DFSO_DOWNDIFFERENCTIAL / 2));
+	*freq = (unsigned long) b;
+
+	return 0;
+}
+
+struct devfreq_governor devfreq_simple_ondemand = {
+	.name = "simple_ondemand",
+	.get_target_freq = devfreq_simple_ondemand_func,
+};
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 6ec630b..7131d2a 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -57,6 +57,8 @@  struct devfreq_governor {
  * @next_polling	the number of remaining "devfreq_monitor" executions to
  *			reevaluate frequency/voltage of the device. Set by
  *			profile's polling_ms interval.
+ * @user_set_freq	User specified adequate frequency value (thru sysfs
+ *		interface). Governors may or may not use this value.
  * @data	Private data of the governor. The devfreq framework does not
  *		touch this.
  *
@@ -72,6 +74,7 @@  struct devfreq {
 	unsigned long previous_freq;
 	unsigned int next_polling;
 
+	unsigned long user_set_freq; /* governors may ignore this. */
 	void *data; /* private data for governors */
 };
 
@@ -81,6 +84,11 @@  extern int devfreq_add_device(struct device *dev,
 			   struct devfreq_governor *governor);
 extern int devfreq_remove_device(struct device *dev);
 extern int devfreq_update(struct device *dev);
+
+extern struct devfreq_governor devfreq_powersave;
+extern struct devfreq_governor devfreq_performance;
+extern struct devfreq_governor devfreq_userspace;
+extern struct devfreq_governor devfreq_simple_ondemand;
 #else /* !CONFIG_PM_DEVFREQ */
 static int devfreq_add_device(struct device *dev,
 			   struct devfreq_dev_profile *profile,