diff mbox

cpufreq: ondemand: Change the calculation of target frequency

Message ID 51A7BFA6.3070800@semaphore.gr (mailing list archive)
State RFC, archived
Headers show

Commit Message

Stratos Karafotis May 30, 2013, 9:07 p.m. UTC
Ondemand calculates load in terms of frequency and increases it only
if the load_freq is greater than up_threshold multiplied by current
or average frequency. This seems to produce oscillations of frequency
between min and max because, for example, a relatively small load can
easily saturate minimum frequency and lead the CPU to max. Then, the
CPU will decrease back to min due to a small load_freq.

This patch changes the calculation method of load and target frequency
considering 2 points:
- Load computation should be independent from current or average
measured frequency. For example an absolute load 80% at 100MHz is not
necessarily equivalent to 8% at 1000MHz in the next sampling interval.
- Target frequency should be increased to any value of frequency table
proportional to absolute load, instead to only the max.

The patch also removes the unused code as a result of the above changes.

Tested on Intel i7-3770 CPU @ 3.40GHz and on Quad core 1500MHz Krait.
Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
increase ~1.5% in performance. cpufreq_stats (time_in_state) shows
that middle frequencies are used more, with this patch. Highest
and lowest frequencies were used less by ~9%

Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
---
 arch/x86/include/asm/processor.h   | 29 ----------------------
 drivers/cpufreq/Makefile           |  2 +-
 drivers/cpufreq/acpi-cpufreq.c     |  5 ----
 drivers/cpufreq/cpufreq.c          | 21 ----------------
 drivers/cpufreq/cpufreq_governor.c | 10 +-------
 drivers/cpufreq/cpufreq_governor.h |  1 -
 drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++-----------------------
 drivers/cpufreq/mperf.c            | 51 --------------------------------------
 drivers/cpufreq/mperf.h            |  9 -------
 include/linux/cpufreq.h            |  6 -----
 10 files changed, 9 insertions(+), 164 deletions(-)
 delete mode 100644 drivers/cpufreq/mperf.c
 delete mode 100644 drivers/cpufreq/mperf.h

Comments

Viresh Kumar May 31, 2013, 8:51 a.m. UTC | #1
Sorry for joining the party so late.. I was running away from reviewing it :(

On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote:
> Ondemand calculates load in terms of frequency and increases it only
> if the load_freq is greater than up_threshold multiplied by current
> or average frequency. This seems to produce oscillations of frequency
> between min and max because, for example, a relatively small load can
> easily saturate minimum frequency and lead the CPU to max. Then, the
> CPU will decrease back to min due to a small load_freq.
>
> This patch changes the calculation method of load and target frequency
> considering 2 points:
> - Load computation should be independent from current or average
> measured frequency. For example an absolute load 80% at 100MHz is not
> necessarily equivalent to 8% at 1000MHz in the next sampling interval.
> - Target frequency should be increased to any value of frequency table
> proportional to absolute load, instead to only the max.
>
> The patch also removes the unused code as a result of the above changes.
>
> Tested on Intel i7-3770 CPU @ 3.40GHz and on Quad core 1500MHz Krait.
> Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an
> increase ~1.5% in performance. cpufreq_stats (time_in_state) shows
> that middle frequencies are used more, with this patch. Highest
> and lowest frequencies were used less by ~9%
>
> Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
> ---
>  arch/x86/include/asm/processor.h   | 29 ----------------------
>  drivers/cpufreq/Makefile           |  2 +-
>  drivers/cpufreq/acpi-cpufreq.c     |  5 ----
>  drivers/cpufreq/cpufreq.c          | 21 ----------------
>  drivers/cpufreq/cpufreq_governor.c | 10 +-------
>  drivers/cpufreq/cpufreq_governor.h |  1 -
>  drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++-----------------------
>  drivers/cpufreq/mperf.c            | 51 --------------------------------------
>  drivers/cpufreq/mperf.h            |  9 -------
>  include/linux/cpufreq.h            |  6 -----
>  10 files changed, 9 insertions(+), 164 deletions(-)
>  delete mode 100644 drivers/cpufreq/mperf.c
>  delete mode 100644 drivers/cpufreq/mperf.h

I believe you should have removed other users of getavg() in a separate
patch and also cc'd relevant people so that you can some review comments
from  them.

> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 22224b3..2874a3b 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -941,35 +941,6 @@ extern int set_tsc_mode(unsigned int val);
>
>  extern u16 amd_get_nb_id(int cpu);
>
> -struct aperfmperf {
> -       u64 aperf, mperf;
> -};
> -
> -static inline void get_aperfmperf(struct aperfmperf *am)
> -{
> -       WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
> -
> -       rdmsrl(MSR_IA32_APERF, am->aperf);
> -       rdmsrl(MSR_IA32_MPERF, am->mperf);
> -}
> -
> -#define APERFMPERF_SHIFT 10
> -
> -static inline
> -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
> -                                   struct aperfmperf *new)
> -{
> -       u64 aperf = new->aperf - old->aperf;
> -       u64 mperf = new->mperf - old->mperf;
> -       unsigned long ratio = aperf;
> -
> -       mperf >>= APERFMPERF_SHIFT;
> -       if (mperf)
> -               ratio = div64_u64(aperf, mperf);
> -
> -       return ratio;
> -}
> -
>  extern unsigned long arch_align_stack(unsigned long sp);
>  extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
>
> diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> index 315b923..4519fb1 100644
> --- a/drivers/cpufreq/Makefile
> +++ b/drivers/cpufreq/Makefile
> @@ -23,7 +23,7 @@ obj-$(CONFIG_GENERIC_CPUFREQ_CPU0)    += cpufreq-cpu0.o
>  # powernow-k8 can load then. ACPI is preferred to all other hardware-specific drivers.
>  # speedstep-* is preferred over p4-clockmod.
>
> -obj-$(CONFIG_X86_ACPI_CPUFREQ)         += acpi-cpufreq.o mperf.o
> +obj-$(CONFIG_X86_ACPI_CPUFREQ)         += acpi-cpufreq.o
>  obj-$(CONFIG_X86_POWERNOW_K8)          += powernow-k8.o
>  obj-$(CONFIG_X86_PCC_CPUFREQ)          += pcc-cpufreq.o
>  obj-$(CONFIG_X86_POWERNOW_K6)          += powernow-k6.o
> diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
> index 11b8b4b..0025cdd 100644
> --- a/drivers/cpufreq/acpi-cpufreq.c
> +++ b/drivers/cpufreq/acpi-cpufreq.c
> @@ -45,7 +45,6 @@
>  #include <asm/msr.h>
>  #include <asm/processor.h>
>  #include <asm/cpufeature.h>
> -#include "mperf.h"
>
>  MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
>  MODULE_DESCRIPTION("ACPI Processor P-States Driver");
> @@ -842,10 +841,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
>         /* notify BIOS that we exist */
>         acpi_processor_notify_smm(THIS_MODULE);
>
> -       /* Check for APERF/MPERF support in hardware */
> -       if (boot_cpu_has(X86_FEATURE_APERFMPERF))
> -               acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
> -
>         pr_debug("CPU%u - ACPI performance management activated.\n", cpu);
>         for (i = 0; i < perf->state_count; i++)
>                 pr_debug("     %cP%d: %d MHz, %d mW, %d uS\n",
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 2d53f47..04ceddc 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -1502,27 +1502,6 @@ no_policy:
>  }
>  EXPORT_SYMBOL_GPL(cpufreq_driver_target);
>
> -int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu)
> -{
> -       int ret = 0;
> -
> -       if (cpufreq_disabled())
> -               return ret;
> -
> -       if (!cpufreq_driver->getavg)
> -               return 0;
> -
> -       policy = cpufreq_cpu_get(policy->cpu);
> -       if (!policy)
> -               return -EINVAL;
> -
> -       ret = cpufreq_driver->getavg(policy, cpu);
> -
> -       cpufreq_cpu_put(policy);
> -       return ret;
> -}
> -EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg);
> -
>  /*
>   * when "event" is CPUFREQ_GOV_LIMITS
>   */
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad..eeab30a 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -97,7 +97,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
>
>         policy = cdbs->cur_policy;
>
> -       /* Get Absolute Load (in terms of freq for ondemand gov) */
> +       /* Get Absolute Load */
>         for_each_cpu(j, policy->cpus) {
>                 struct cpu_dbs_common_info *j_cdbs;
>                 u64 cur_wall_time, cur_idle_time;
> @@ -148,14 +148,6 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
>
>                 load = 100 * (wall_time - idle_time) / wall_time;
>
> -               if (dbs_data->cdata->governor == GOV_ONDEMAND) {
> -                       int freq_avg = __cpufreq_driver_getavg(policy, j);
> -                       if (freq_avg <= 0)
> -                               freq_avg = policy->cur;
> -
> -                       load *= freq_avg;
> -               }
> -
>                 if (load > max_load)
>                         max_load = load;
>         }
> diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
> index e16a961..e899c11 100644
> --- a/drivers/cpufreq/cpufreq_governor.h
> +++ b/drivers/cpufreq/cpufreq_governor.h
> @@ -169,7 +169,6 @@ struct od_dbs_tuners {
>         unsigned int sampling_rate;
>         unsigned int sampling_down_factor;
>         unsigned int up_threshold;
> -       unsigned int adj_up_threshold;
>         unsigned int powersave_bias;
>         unsigned int io_is_busy;
>  };
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 4b9bb5d..c1e6d3e 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -29,11 +29,9 @@
>  #include "cpufreq_governor.h"
>
>  /* On-demand governor macros */
> -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL                (10)
>  #define DEF_FREQUENCY_UP_THRESHOLD             (80)
>  #define DEF_SAMPLING_DOWN_FACTOR               (1)
>  #define MAX_SAMPLING_DOWN_FACTOR               (100000)
> -#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL      (3)
>  #define MICRO_FREQUENCY_UP_THRESHOLD           (95)
>  #define MICRO_FREQUENCY_MIN_SAMPLE_RATE                (10000)
>  #define MIN_FREQUENCY_UP_THRESHOLD             (11)
> @@ -159,14 +157,10 @@ static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
>
>  /*
>   * Every sampling_rate, we check, if current idle time is less than 20%
> - * (default), then we try to increase frequency. Every sampling_rate, we look
> - * for the lowest frequency which can sustain the load while keeping idle time
> - * over 30%. If such a frequency exist, we try to decrease to this frequency.
> - *
> - * Any frequency increase takes it to the maximum frequency. Frequency reduction
> - * happens at minimum steps of 5% (default) of current frequency
> + * (default), then we try to increase frequency. Else, we adjust the frequency
> + * proportional to load.
>   */
> -static void od_check_cpu(int cpu, unsigned int load_freq)
> +static void od_check_cpu(int cpu, unsigned int load)
>  {
>         struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>         struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;
> @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
>         dbs_info->freq_lo = 0;
>
>         /* Check for frequency increase */
> -       if (load_freq > od_tuners->up_threshold * policy->cur) {
> +       if (load > od_tuners->up_threshold) {
>                 /* If switching to max speed, apply sampling_down_factor */
>                 if (policy->cur < policy->max)
>                         dbs_info->rate_mult =
>                                 od_tuners->sampling_down_factor;
>                 dbs_freq_increase(policy, policy->max);
>                 return;
> -       }
> -
> -       /* Check for frequency decrease */
> -       /* if we cannot reduce the frequency anymore, break out early */
> -       if (policy->cur == policy->min)
> -               return;
> -
> -       /*
> -        * The optimal frequency is the frequency that is the lowest that can
> -        * support the current CPU usage without triggering the up policy. To be
> -        * safe, we focus 10 points under the threshold.
> -        */
> -       if (load_freq < od_tuners->adj_up_threshold
> -                       * policy->cur) {
> +       } else {
> +               /* Calculate the next frequency proportional to load */
>                 unsigned int freq_next;
> -               freq_next = load_freq / od_tuners->adj_up_threshold;
> +               freq_next = load * policy->max / 100;
>
>                 /* No longer fully busy, reset rate_mult */
>                 dbs_info->rate_mult = 1;
> @@ -372,9 +354,6 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
>                         input < MIN_FREQUENCY_UP_THRESHOLD) {
>                 return -EINVAL;
>         }
> -       /* Calculate the new adj_up_threshold */
> -       od_tuners->adj_up_threshold += input;
> -       od_tuners->adj_up_threshold -= od_tuners->up_threshold;
>
>         od_tuners->up_threshold = input;
>         return count;
> @@ -523,8 +502,6 @@ static int od_init(struct dbs_data *dbs_data)
>         if (idle_time != -1ULL) {
>                 /* Idle micro accounting is supported. Use finer thresholds */
>                 tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
> -               tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD -
> -                       MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
>                 /*
>                  * In nohz/micro accounting case we set the minimum frequency
>                  * not depending on HZ, but fixed (very low). The deferred
> @@ -533,8 +510,6 @@ static int od_init(struct dbs_data *dbs_data)
>                 dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
>         } else {
>                 tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
> -               tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD -
> -                       DEF_FREQUENCY_DOWN_DIFFERENTIAL;
>
>                 /* For correct statistics, we need 10 ticks for each measure */
>                 dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
> diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c
> deleted file mode 100644
> index 911e193..0000000
> --- a/drivers/cpufreq/mperf.c
> +++ /dev/null
> @@ -1,51 +0,0 @@
> -#include <linux/kernel.h>
> -#include <linux/smp.h>
> -#include <linux/module.h>
> -#include <linux/init.h>
> -#include <linux/cpufreq.h>
> -#include <linux/slab.h>
> -
> -#include "mperf.h"
> -
> -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
> -
> -/* Called via smp_call_function_single(), on the target CPU */
> -static void read_measured_perf_ctrs(void *_cur)
> -{
> -       struct aperfmperf *am = _cur;
> -
> -       get_aperfmperf(am);
> -}
> -
> -/*
> - * Return the measured active (C0) frequency on this CPU since last call
> - * to this function.
> - * Input: cpu number
> - * Return: Average CPU frequency in terms of max frequency (zero on error)
> - *
> - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
> - * over a period of time, while CPU is in C0 state.
> - * IA32_MPERF counts at the rate of max advertised frequency
> - * IA32_APERF counts at the rate of actual CPU frequency
> - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
> - * no meaning should be associated with absolute values of these MSRs.
> - */
> -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
> -                                       unsigned int cpu)
> -{
> -       struct aperfmperf perf;
> -       unsigned long ratio;
> -       unsigned int retval;
> -
> -       if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
> -               return 0;
> -
> -       ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
> -       per_cpu(acfreq_old_perf, cpu) = perf;
> -
> -       retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
> -
> -       return retval;
> -}
> -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
> -MODULE_LICENSE("GPL");
> diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h
> deleted file mode 100644
> index 5dbf295..0000000
> --- a/drivers/cpufreq/mperf.h
> +++ /dev/null
> @@ -1,9 +0,0 @@
> -/*
> - *  (c) 2010 Advanced Micro Devices, Inc.
> - *  Your use of this code is subject to the terms and conditions of the
> - *  GNU general public license version 2. See "COPYING" or
> - *  http://www.gnu.org/licenses/gpl.html
> - */
> -
> -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
> -                                       unsigned int cpu);
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 037d36a..60fcb42 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -211,10 +211,6 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
>                                    unsigned int target_freq,
>                                    unsigned int relation);
>
> -
> -extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy,
> -                                  unsigned int cpu);
> -
>  int cpufreq_register_governor(struct cpufreq_governor *governor);
>  void cpufreq_unregister_governor(struct cpufreq_governor *governor);
>
> @@ -254,8 +250,6 @@ struct cpufreq_driver {
>         unsigned int    (*get)  (unsigned int cpu);
>
>         /* optional */
> -       unsigned int (*getavg)  (struct cpufreq_policy *policy,
> -                                unsigned int cpu);
>         int     (*bios_limit)   (int cpu, unsigned int *limit);
>
>         int     (*exit)         (struct cpufreq_policy *policy);
> --
> 1.8.1.4
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar May 31, 2013, 8:54 a.m. UTC | #2
On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote:

Ahh.. earlier mail got sent without me doing complete review :(

> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> +static void od_check_cpu(int cpu, unsigned int load)
>  {
>         struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>         struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;
> @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
>         dbs_info->freq_lo = 0;
>
>         /* Check for frequency increase */
> -       if (load_freq > od_tuners->up_threshold * policy->cur) {
> +       if (load > od_tuners->up_threshold) {

Chances of this getting hit are minimal now.. I don't know if keeping
this will change anything now :)

>                 /* If switching to max speed, apply sampling_down_factor */
>                 if (policy->cur < policy->max)
>                         dbs_info->rate_mult =
>                                 od_tuners->sampling_down_factor;
>                 dbs_freq_increase(policy, policy->max);
>                 return;
> -       }
> -
> -       /* Check for frequency decrease */
> -       /* if we cannot reduce the frequency anymore, break out early */
> -       if (policy->cur == policy->min)
> -               return;
> -
> -       /*
> -        * The optimal frequency is the frequency that is the lowest that can
> -        * support the current CPU usage without triggering the up policy. To be
> -        * safe, we focus 10 points under the threshold.
> -        */
> -       if (load_freq < od_tuners->adj_up_threshold
> -                       * policy->cur) {
> +       } else {
> +               /* Calculate the next frequency proportional to load */
>                 unsigned int freq_next;
> -               freq_next = load_freq / od_tuners->adj_up_threshold;
> +               freq_next = load * policy->max / 100;

Rafael asked why you believe this is the right formula and I really couldn't
find an appropriate answer to that, sorry :(
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki May 31, 2013, 12:42 p.m. UTC | #3
On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote:
> On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote:
> 
> Ahh.. earlier mail got sent without me doing complete review :(
> 
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > +static void od_check_cpu(int cpu, unsigned int load)
> >  {
> >         struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
> >         struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;
> > @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq)
> >         dbs_info->freq_lo = 0;
> >
> >         /* Check for frequency increase */
> > -       if (load_freq > od_tuners->up_threshold * policy->cur) {
> > +       if (load > od_tuners->up_threshold) {
> 
> Chances of this getting hit are minimal now.. I don't know if keeping
> this will change anything now :)
> 
> >                 /* If switching to max speed, apply sampling_down_factor */
> >                 if (policy->cur < policy->max)
> >                         dbs_info->rate_mult =
> >                                 od_tuners->sampling_down_factor;
> >                 dbs_freq_increase(policy, policy->max);
> >                 return;
> > -       }
> > -
> > -       /* Check for frequency decrease */
> > -       /* if we cannot reduce the frequency anymore, break out early */
> > -       if (policy->cur == policy->min)
> > -               return;
> > -
> > -       /*
> > -        * The optimal frequency is the frequency that is the lowest that can
> > -        * support the current CPU usage without triggering the up policy. To be
> > -        * safe, we focus 10 points under the threshold.
> > -        */
> > -       if (load_freq < od_tuners->adj_up_threshold
> > -                       * policy->cur) {
> > +       } else {
> > +               /* Calculate the next frequency proportional to load */
> >                 unsigned int freq_next;
> > -               freq_next = load_freq / od_tuners->adj_up_threshold;
> > +               freq_next = load * policy->max / 100;
> 
> Rafael asked why you believe this is the right formula and I really couldn't
> find an appropriate answer to that, sorry :(

Right, it would be good to explain that.

"Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?

Rafael
Stratos Karafotis May 31, 2013, 4:33 p.m. UTC | #4
On 05/31/2013 11:51 AM, Viresh Kumar wrote:
>> ---
>>   arch/x86/include/asm/processor.h   | 29 ----------------------
>>   drivers/cpufreq/Makefile           |  2 +-
>>   drivers/cpufreq/acpi-cpufreq.c     |  5 ----
>>   drivers/cpufreq/cpufreq.c          | 21 ----------------
>>   drivers/cpufreq/cpufreq_governor.c | 10 +-------
>>   drivers/cpufreq/cpufreq_governor.h |  1 -
>>   drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++-----------------------
>>   drivers/cpufreq/mperf.c            | 51 --------------------------------------
>>   drivers/cpufreq/mperf.h            |  9 -------
>>   include/linux/cpufreq.h            |  6 -----
>>   10 files changed, 9 insertions(+), 164 deletions(-)
>>   delete mode 100644 drivers/cpufreq/mperf.c
>>   delete mode 100644 drivers/cpufreq/mperf.h
> 
> I believe you should have removed other users of getavg() in a separate
> patch and also cc'd relevant people so that you can some review comments
> from  them.

I will split the patch in two. If it's OK, I will keep the removal of 
__cpufreq_driver_getavg in the original patch and move the clean up of
APERF/MPERF support in a second patch. I will also cc relevant people.


>>          /* Check for frequency increase */
>> -       if (load_freq > od_tuners->up_threshold * policy->cur) {
>> +       if (load > od_tuners->up_threshold) {
> 
> Chances of this getting hit are minimal now.. I don't know if keeping
> this will change anything now :)

Actually, no. This getting hit pretty often.
Please find attached the cpufreq statistics - trans_table during build
of 3.4 kernel. With default up_threshold (95), the transition to max
happened many times because of load was greater than up_threshold.
I also thought to keep this code to leave up_threshold functionality unaffected.
 
On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote:
> On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote:
>>> +       } else {
>>> +               /* Calculate the next frequency proportional to load */
>>>                  unsigned int freq_next;
>>> -               freq_next = load_freq / od_tuners->adj_up_threshold;
>>> +               freq_next = load * policy->max / 100;
>>
>> Rafael asked why you believe this is the right formula and I really couldn't
>> find an appropriate answer to that, sorry :(
> 
> Right, it would be good to explain that.
> 
> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
> 

I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
This way the target frequency will be proportional to load and the calculation will
"map" the load to CPU freq table.

I will update the patch according to your observations and suggestions.

Thanks,
Stratos
From  :    To
         :   3401000   3400000   3300000   3100000   3000000   2900000   2800000   2600000   2500000   2400000   2200000   2100000   2000000   1900000   1700000   1600000 
  3401000:         0         0         4         2         4         2         3         0         2         1         1         1         1         4         0        29 
  3400000:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0 
  3300000:         4         0         0         0         1         0         0         0         0         0         0         1         0         0         0         7 
  3100000:         2         0         0         0         1         0         0         0         0         0         0         0         0         0         0         0 
  3000000:         4         0         0         0         0         0         0         0         0         1         0         0         0         0         0         4 
  2900000:         1         0         0         0         1         0         0         0         0         0         0         0         0         0         0         7 
  2800000:         3         0         0         0         0         1         0         0         0         1         0         0         0         0         0         3 
  2600000:         0         0         0         0         0         1         0         0         0         1         0         0         0         0         0         4 
  2500000:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         4 
  2400000:         3         0         0         0         0         0         1         0         0         0         0         0         0         0         0         7 
  2200000:         1         0         0         0         0         0         0         0         0         1         0         0         0         0         0         3 
  2100000:         1         0         0         0         0         0         0         1         0         0         0         0         0         1         0         4 
  2000000:         1         0         0         0         0         0         0         0         0         0         0         0         0         0         0         1 
  1900000:         0         0         2         0         0         0         0         0         0         1         0         0         0         0         0         8 
  1700000:         0         0         0         0         0         0         0         0         0         0         0         0         0         0         0         2 
  1600000:        33         0         7         1         2         5         4         5         2         5         4         5         1         6         2         0
Rafael Wysocki June 1, 2013, 12:27 p.m. UTC | #5
On Friday, May 31, 2013 07:33:06 PM Stratos Karafotis wrote:
> On 05/31/2013 11:51 AM, Viresh Kumar wrote:
> >> ---
> >>   arch/x86/include/asm/processor.h   | 29 ----------------------
> >>   drivers/cpufreq/Makefile           |  2 +-
> >>   drivers/cpufreq/acpi-cpufreq.c     |  5 ----
> >>   drivers/cpufreq/cpufreq.c          | 21 ----------------
> >>   drivers/cpufreq/cpufreq_governor.c | 10 +-------
> >>   drivers/cpufreq/cpufreq_governor.h |  1 -
> >>   drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++-----------------------
> >>   drivers/cpufreq/mperf.c            | 51 --------------------------------------
> >>   drivers/cpufreq/mperf.h            |  9 -------
> >>   include/linux/cpufreq.h            |  6 -----
> >>   10 files changed, 9 insertions(+), 164 deletions(-)
> >>   delete mode 100644 drivers/cpufreq/mperf.c
> >>   delete mode 100644 drivers/cpufreq/mperf.h
> > 
> > I believe you should have removed other users of getavg() in a separate
> > patch and also cc'd relevant people so that you can some review comments
> > from  them.
> 
> I will split the patch in two. If it's OK, I will keep the removal of 
> __cpufreq_driver_getavg in the original patch and move the clean up of
> APERF/MPERF support in a second patch. I will also cc relevant people.
> 
> 
> >>          /* Check for frequency increase */
> >> -       if (load_freq > od_tuners->up_threshold * policy->cur) {
> >> +       if (load > od_tuners->up_threshold) {
> > 
> > Chances of this getting hit are minimal now.. I don't know if keeping
> > this will change anything now :)
> 
> Actually, no. This getting hit pretty often.
> Please find attached the cpufreq statistics - trans_table during build
> of 3.4 kernel. With default up_threshold (95), the transition to max
> happened many times because of load was greater than up_threshold.
> I also thought to keep this code to leave up_threshold functionality unaffected.
>  
> On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote:
> > On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote:
> >>> +       } else {
> >>> +               /* Calculate the next frequency proportional to load */
> >>>                  unsigned int freq_next;
> >>> -               freq_next = load_freq / od_tuners->adj_up_threshold;
> >>> +               freq_next = load * policy->max / 100;
> >>
> >> Rafael asked why you believe this is the right formula and I really couldn't
> >> find an appropriate answer to that, sorry :(
> > 
> > Right, it would be good to explain that.
> > 
> > "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
> > 
> 
> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
> This way the target frequency will be proportional to load and the calculation will
> "map" the load to CPU freq table.

That seems to mean "take the percentage of policy->cpuinfo.max_freq proportional
to the current load and use the available frequency closest to that".  Is that
correct?

Rafael
Stratos Karafotis June 1, 2013, 12:50 p.m. UTC | #6
On 06/01/2013 03:27 PM, Rafael J. Wysocki wrote:
> On Friday, May 31, 2013 07:33:06 PM Stratos Karafotis wrote:
>> On 05/31/2013 11:51 AM, Viresh Kumar wrote:
>>>> ---
>>>>    arch/x86/include/asm/processor.h   | 29 ----------------------
>>>>    drivers/cpufreq/Makefile           |  2 +-
>>>>    drivers/cpufreq/acpi-cpufreq.c     |  5 ----
>>>>    drivers/cpufreq/cpufreq.c          | 21 ----------------
>>>>    drivers/cpufreq/cpufreq_governor.c | 10 +-------
>>>>    drivers/cpufreq/cpufreq_governor.h |  1 -
>>>>    drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++-----------------------
>>>>    drivers/cpufreq/mperf.c            | 51 --------------------------------------
>>>>    drivers/cpufreq/mperf.h            |  9 -------
>>>>    include/linux/cpufreq.h            |  6 -----
>>>>    10 files changed, 9 insertions(+), 164 deletions(-)
>>>>    delete mode 100644 drivers/cpufreq/mperf.c
>>>>    delete mode 100644 drivers/cpufreq/mperf.h
>>>
>>> I believe you should have removed other users of getavg() in a separate
>>> patch and also cc'd relevant people so that you can some review comments
>>> from  them.
>>
>> I will split the patch in two. If it's OK, I will keep the removal of
>> __cpufreq_driver_getavg in the original patch and move the clean up of
>> APERF/MPERF support in a second patch. I will also cc relevant people.
>>
>>
>>>>           /* Check for frequency increase */
>>>> -       if (load_freq > od_tuners->up_threshold * policy->cur) {
>>>> +       if (load > od_tuners->up_threshold) {
>>>
>>> Chances of this getting hit are minimal now.. I don't know if keeping
>>> this will change anything now :)
>>
>> Actually, no. This getting hit pretty often.
>> Please find attached the cpufreq statistics - trans_table during build
>> of 3.4 kernel. With default up_threshold (95), the transition to max
>> happened many times because of load was greater than up_threshold.
>> I also thought to keep this code to leave up_threshold functionality unaffected.
>>   
>> On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote:
>>> On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote:
>>>>> +       } else {
>>>>> +               /* Calculate the next frequency proportional to load */
>>>>>                   unsigned int freq_next;
>>>>> -               freq_next = load_freq / od_tuners->adj_up_threshold;
>>>>> +               freq_next = load * policy->max / 100;
>>>>
>>>> Rafael asked why you believe this is the right formula and I really couldn't
>>>> find an appropriate answer to that, sorry :(
>>>
>>> Right, it would be good to explain that.
>>>
>>> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
>>>
>>
>> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
>> This way the target frequency will be proportional to load and the calculation will
>> "map" the load to CPU freq table.
> 
> That seems to mean "take the percentage of policy->cpuinfo.max_freq proportional
> to the current load and use the available frequency closest to that".  Is that
> correct?
> 
> Rafael
> 
> 

In my opinion, yes. I thought, yesterday, after your question, to normalize load
to policy->min - policy->max. But I think it's a more correct approach to take 
the percentage of cpuinfo.max, as you said.
Actually, I did my tests on the percentage of policy->max that was equal to 
cpuinfo.max.

Unless, I miss something here. :)

Thanks,
Stratos
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 1, 2013, 2:56 p.m. UTC | #7
On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote:
> On 05/31/2013 11:51 AM, Viresh Kumar wrote:
>> I believe you should have removed other users of getavg() in a separate
>> patch and also cc'd relevant people so that you can some review comments
>> from  them.
>
> I will split the patch in two. If it's OK, I will keep the removal of
> __cpufreq_driver_getavg in the original patch and move the clean up of
> APERF/MPERF support in a second patch. I will also cc relevant people.

Even removal of __cpufreq_driver_getavg() should be done in a separate
patch, so that it can be reverted easily if required later.

>> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
>
> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".

Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
initialized.. but user may request a lower max freq for a governor or policy.
Which is actually reflected in policy->max I believe.

Over that why keeping following check is useful anymore?

if (load_freq > od_tuners->up_threshold)
    goto max.

As, if load is over 95, then even policy->max * 95 / 100 will even give almost
the same freq.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stratos Karafotis June 1, 2013, 4:06 p.m. UTC | #8
On 06/01/2013 05:56 PM, Viresh Kumar wrote:
> On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote:
>> On 05/31/2013 11:51 AM, Viresh Kumar wrote:
>>> I believe you should have removed other users of getavg() in a separate
>>> patch and also cc'd relevant people so that you can some review comments
>>> from  them.
>>
>> I will split the patch in two. If it's OK, I will keep the removal of
>> __cpufreq_driver_getavg in the original patch and move the clean up of
>> APERF/MPERF support in a second patch. I will also cc relevant people.
> 
> Even removal of __cpufreq_driver_getavg() should be done in a separate
> patch, so that it can be reverted easily if required later.

Thanks, Viresh. I will do the removal of that function in a seperate patch.
Should I use a third patch for it? Or should I include it in the patch which
will remove APERF/MPERF support?

>>> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
>>
>> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
> 
> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
> initialized.. but user may request a lower max freq for a governor or policy.
> Which is actually reflected in policy->max I believe.

My initial thought is to have a linear function to calculate the target freq 
proportional to load: (I will use 'C' as the function's slope as Rafael used it) 

freq_target = C * load

For simplicity, let's assume that load is between 0 and 1 as initially is calculated
in governor.
Ideally,  for a load = 0, we should have freq_target = 0 and for load = 1,
freq_target = cpuinfo.max

So, the slope C = cpuinfo.max 

I think, it's matter of definition about what policy->min and policy->max can do.
Should they change the slope C? Or only limit freq_target?
I don't think that the policy->max (or min) should affect HOW (slope C) governor
calculates freq_target but only limit the calculated result.

Maybe, we could have separate tunables to a affect the slope C.

If I'm wrong about the definition of policy->min, policy->max, I would change
the code accordingly.


> Over that why keeping following check is useful anymore?
> 
> if (load_freq > od_tuners->up_threshold)
>      goto max.
> 
> As, if load is over 95, then even policy->max * 95 / 100 will even give almost
> the same freq.
> 

I thought that too. But maybe user selects a lower value for up_threshold.
(For example, up_threshold = 60). In my opinion, we have to keep up_theshold,
to give the possibility to user to have max freq with small loads.


Thanks for your comments!
Stratos 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki June 1, 2013, 7:37 p.m. UTC | #9
On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote:
> On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote:
> > On 05/31/2013 11:51 AM, Viresh Kumar wrote:
> >> I believe you should have removed other users of getavg() in a separate
> >> patch and also cc'd relevant people so that you can some review comments
> >> from  them.
> >
> > I will split the patch in two. If it's OK, I will keep the removal of
> > __cpufreq_driver_getavg in the original patch and move the clean up of
> > APERF/MPERF support in a second patch. I will also cc relevant people.
> 
> Even removal of __cpufreq_driver_getavg() should be done in a separate
> patch, so that it can be reverted easily if required later.

Why would you want to revert it separately?

> >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
> >
> > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
> 
> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
> initialized.. but user may request a lower max freq for a governor or policy.
> Which is actually reflected in policy->max I believe.

Which doesn't matter.  The formula should provide the same results regardless
of the user settings except that the selected frequency should be capped by
policy->max (instead of being proportional to it).  I think using
cpuinfo.max_freq here is correct.

> Over that why keeping following check is useful anymore?
> 
> if (load_freq > od_tuners->up_threshold)
>     goto max.
> 
> As, if load is over 95, then even policy->max * 95 / 100 will even give almost
> the same freq.

Yes, in the majority of cases.

Thanks,
Rafael
Viresh Kumar June 3, 2013, 6:11 a.m. UTC | #10
On 1 June 2013 21:36, Stratos Karafotis <stratosk@semaphore.gr> wrote:
> On 06/01/2013 05:56 PM, Viresh Kumar wrote:

>> Even removal of __cpufreq_driver_getavg() should be done in a separate
>> patch, so that it can be reverted easily if required later.
>
> Thanks, Viresh. I will do the removal of that function in a seperate patch.
> Should I use a third patch for it? Or should I include it in the patch which
> will remove APERF/MPERF support?

Maybe a third patch would be more cleaner.

>> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
>> initialized.. but user may request a lower max freq for a governor or policy.
>> Which is actually reflected in policy->max I believe.
>
> My initial thought is to have a linear function to calculate the target freq
> proportional to load: (I will use 'C' as the function's slope as Rafael used it)
>
> freq_target = C * load
>
> For simplicity, let's assume that load is between 0 and 1 as initially is calculated
> in governor.
> Ideally,  for a load = 0, we should have freq_target = 0 and for load = 1,
> freq_target = cpuinfo.max
>
> So, the slope C = cpuinfo.max
>
> I think, it's matter of definition about what policy->min and policy->max can do.
> Should they change the slope C? Or only limit freq_target?
> I don't think that the policy->max (or min) should affect HOW (slope C) governor
> calculates freq_target but only limit the calculated result.
>
> Maybe, we could have separate tunables to a affect the slope C.
>
> If I'm wrong about the definition of policy->min, policy->max, I would change
> the code accordingly.

Lets discuss that in reply to Rafael's mail.

>> As, if load is over 95, then even policy->max * 95 / 100 will even give almost
>> the same freq.
>>
>
> I thought that too. But maybe user selects a lower value for up_threshold.
> (For example, up_threshold = 60). In my opinion, we have to keep up_theshold,
> to give the possibility to user to have max freq with small loads.

Yes... good point.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 3, 2013, 6:51 a.m. UTC | #11
On 2 June 2013 01:07, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote:

>> Even removal of __cpufreq_driver_getavg() should be done in a separate
>> patch, so that it can be reverted easily if required later.
>
> Why would you want to revert it separately?

We might not need to revert all the changes that Stratos is doing.
Only a revert of getavg() and its users + a small fix in governor would
be enough. Stratos patch isn't only about removing getavg() but how
ondemand works and so breaking stuff might be more useful.

>> >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
>> >
>> > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
>>
>> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
>> initialized.. but user may request a lower max freq for a governor or policy.
>> Which is actually reflected in policy->max I believe.
>
> Which doesn't matter.  The formula should provide the same results regardless
> of the user settings except that the selected frequency should be capped by
> policy->max (instead of being proportional to it).  I think using
> cpuinfo.max_freq here is correct.

I am confused now about what to use.. This is how I read it:

Assumption: CPU supports following freq range.. 500 MHz, 600 MHz, 700 MHz,
800 MHz, 900 MHz, 1 GHz

cpuinfo.max_freq = 1 GHz
policy->max is set to 600 MHz

Case 1: Use policy->max:

We need load to be over 500/600 (i.e. .834) to move to 600 MHz.

Case 2: Use cpuinfo.max_freq..

We need load to be over 500/1000 (i.e. .5) to move to 600 MHz.

So, obviously the calculations aren't the same..

Now, this is how I understood these two different variables:
- cpuinfo.max_freq:

>> Over that why keeping following check is useful anymore?
>>
>> if (load_freq > od_tuners->up_threshold)
>>     goto max.
>>
>> As, if load is over 95, then even policy->max * 95 / 100 will even give almost
>> the same freq.
>
> Yes, in the majority of cases.
>
> Thanks,
> Rafael
>
>
> --
> I speak only for myself.
> Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar June 3, 2013, 6:55 a.m. UTC | #12
Sent half written mail.. sorry.. will continue from where I left.

On 3 June 2013 12:21, Viresh Kumar <viresh.kumar@linaro.org> wrote:

> So, obviously the calculations aren't the same..

Now, this is how I understood these two different variables:
- cpuinfo.max_freq: maximum frequency (in kHz) which is
supported by this CPU
- policy->max: Maximum frequency forced by user or governor

When somebody sets policy->max, he expects cpufreq core to
use the range between min and max as the slope cpufreq core
has and adjust its frequencies accordingly..

Don't know if I am right or wrong :(
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki June 3, 2013, 10:32 a.m. UTC | #13
On Monday, June 03, 2013 12:21:47 PM Viresh Kumar wrote:
> On 2 June 2013 01:07, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote:
> 
> >> Even removal of __cpufreq_driver_getavg() should be done in a separate
> >> patch, so that it can be reverted easily if required later.
> >
> > Why would you want to revert it separately?
> 
> We might not need to revert all the changes that Stratos is doing.
> Only a revert of getavg() and its users + a small fix in governor would
> be enough. Stratos patch isn't only about removing getavg() but how
> ondemand works and so breaking stuff might be more useful.

Yes, it's probably better to do the getavg() removal as a separate patch.

> >> >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C?
> >> >
> >> > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100".
> >>
> >> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is
> >> initialized.. but user may request a lower max freq for a governor or policy.
> >> Which is actually reflected in policy->max I believe.
> >
> > Which doesn't matter.  The formula should provide the same results regardless
> > of the user settings except that the selected frequency should be capped by
> > policy->max (instead of being proportional to it).  I think using
> > cpuinfo.max_freq here is correct.
> 
> I am confused now about what to use.. This is how I read it:
> 
> Assumption: CPU supports following freq range.. 500 MHz, 600 MHz, 700 MHz,
> 800 MHz, 900 MHz, 1 GHz
> 
> cpuinfo.max_freq = 1 GHz
> policy->max is set to 600 MHz
> 
> Case 1: Use policy->max:
> 
> We need load to be over 500/600 (i.e. .834) to move to 600 MHz.
> 
> Case 2: Use cpuinfo.max_freq..
> 
> We need load to be over 500/1000 (i.e. .5) to move to 600 MHz.
> 
> So, obviously the calculations aren't the same..

No, they aren't.

My point is that the set of frequencies to choose from doesn't change with
the changes of user settings, so the computation should use things that
don't with the user settings either.

Thanks,
Rafael
Rafael Wysocki June 3, 2013, 10:57 a.m. UTC | #14
On Monday, June 03, 2013 12:25:02 PM Viresh Kumar wrote:
> Sent half written mail.. sorry.. will continue from where I left.
> 
> On 3 June 2013 12:21, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> 
> > So, obviously the calculations aren't the same..
> 
> Now, this is how I understood these two different variables:
> - cpuinfo.max_freq: maximum frequency (in kHz) which is
> supported by this CPU
> - policy->max: Maximum frequency forced by user or governor
> 
> When somebody sets policy->max, he expects cpufreq core to
> use the range between min and max as the slope cpufreq core
> has and adjust its frequencies accordingly..
> 
> Don't know if I am right or wrong :(

The reality is that the set of frequencies to use is constant and changing
policy->max doesn't change that set.  There are just fewer frequencies the
governor can choose from if policy->max is below cpuinfo.max_freq.

The question is if we want policy->max to re-scale them effectively (i.e. to
change weights so that the maximum load maps to the highest frequency available
at the moment) or if we want policy->max to work as a cap (i.e. to map all
loads above certain value to the maximum frequency available at the moment, so
that the criteria for selecting the lower frequencies don't change).  In my
opinion the second option is better, because it means "OK, we can't use some
high frequencies, but let's not hurt performance for the loads that wouldn't
require them anyway".  Otherwise, we'll effectively throttle all loads and
that not only causes performance to drop, but also causes more energy to be
used overall.

Thanks,
Rafael
Viresh Kumar June 3, 2013, 11:24 a.m. UTC | #15
On 3 June 2013 16:27, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> The question is if we want policy->max to re-scale them effectively (i.e. to
> change weights so that the maximum load maps to the highest frequency available
> at the moment) or if we want policy->max to work as a cap (i.e. to map all
> loads above certain value to the maximum frequency available at the moment, so
> that the criteria for selecting the lower frequencies don't change).  In my
> opinion the second option is better, because it means "OK, we can't use some
> high frequencies, but let's not hurt performance for the loads that wouldn't
> require them anyway".  Otherwise, we'll effectively throttle all loads and
> that not only causes performance to drop, but also causes more energy to be
> used overall.

I wouldn't say that I don't agree with you as I do to some extent.

But the question that comes to my mind now is: Why is policy->max reduced
in the first place? User doesn't have control over which freqs to expose, so
available_frequencies will stay the same. The only thing he is capable
of doing is: reduce policy->max.. Which in a way means that.. "I don't want to
use higher frequencies (power, thermal reasons) and I know performance will
go down with it and I don't care for it now."

And this way I feel policy->max isn't a bad choice either.

BUT as you said: policy->max isn't there to say don't be so aggressive now in
increasing frequencies but just only to say, don't go over this frequency..

So, probably we can use cpuinfo.max_freq :)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stratos Karafotis June 3, 2013, 4:12 p.m. UTC | #16
On 06/03/2013 02:24 PM, Viresh Kumar wrote:
> On 3 June 2013 16:27, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> The question is if we want policy->max to re-scale them effectively (i.e. to
>> change weights so that the maximum load maps to the highest frequency available
>> at the moment) or if we want policy->max to work as a cap (i.e. to map all
>> loads above certain value to the maximum frequency available at the moment, so
>> that the criteria for selecting the lower frequencies don't change).  In my
>> opinion the second option is better, because it means "OK, we can't use some
>> high frequencies, but let's not hurt performance for the loads that wouldn't
>> require them anyway".  Otherwise, we'll effectively throttle all loads and
>> that not only causes performance to drop, but also causes more energy to be
>> used overall.
> 
> I wouldn't say that I don't agree with you as I do to some extent.
> 
> But the question that comes to my mind now is: Why is policy->max reduced
> in the first place? User doesn't have control over which freqs to expose, so
> available_frequencies will stay the same. The only thing he is capable
> of doing is: reduce policy->max.. Which in a way means that.. "I don't want to
> use higher frequencies (power, thermal reasons) and I know performance will
> go down with it and I don't care for it now."
> 
> And this way I feel policy->max isn't a bad choice either.
> 
> BUT as you said: policy->max isn't there to say don't be so aggressive now in
> increasing frequencies but just only to say, don't go over this frequency..
> 
> So, probably we can use cpuinfo.max_freq :)
> 

Both of you know better than me, but I also believe that cpuinfo.max is more
appropriate. Although, the decision was taken, let me share a spreadsheet to show
the 2 cases.
https://docs.google.com/spreadsheet/ccc?key=0AnMfNYUV1k0ddHh1OUFXa0kxcGZJaXd4am1sdmVWT0E&usp=sharing

I will prepare the v2 of this patch that uses cpuinfo.max_freq instead of policy-max.
Also, I will split the patch into 3 parts as suggested.


Thank you for your comments and suggestions!
Stratos
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 22224b3..2874a3b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -941,35 +941,6 @@  extern int set_tsc_mode(unsigned int val);
 
 extern u16 amd_get_nb_id(int cpu);
 
-struct aperfmperf {
-	u64 aperf, mperf;
-};
-
-static inline void get_aperfmperf(struct aperfmperf *am)
-{
-	WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
-
-	rdmsrl(MSR_IA32_APERF, am->aperf);
-	rdmsrl(MSR_IA32_MPERF, am->mperf);
-}
-
-#define APERFMPERF_SHIFT 10
-
-static inline
-unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
-				    struct aperfmperf *new)
-{
-	u64 aperf = new->aperf - old->aperf;
-	u64 mperf = new->mperf - old->mperf;
-	unsigned long ratio = aperf;
-
-	mperf >>= APERFMPERF_SHIFT;
-	if (mperf)
-		ratio = div64_u64(aperf, mperf);
-
-	return ratio;
-}
-
 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 315b923..4519fb1 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -23,7 +23,7 @@  obj-$(CONFIG_GENERIC_CPUFREQ_CPU0)	+= cpufreq-cpu0.o
 # powernow-k8 can load then. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 11b8b4b..0025cdd 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -45,7 +45,6 @@ 
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
-#include "mperf.h"
 
 MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
 MODULE_DESCRIPTION("ACPI Processor P-States Driver");
@@ -842,10 +841,6 @@  static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
 
-	/* Check for APERF/MPERF support in hardware */
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
-
 	pr_debug("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
 		pr_debug("     %cP%d: %d MHz, %d mW, %d uS\n",
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 2d53f47..04ceddc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1502,27 +1502,6 @@  no_policy:
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_target);
 
-int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu)
-{
-	int ret = 0;
-
-	if (cpufreq_disabled())
-		return ret;
-
-	if (!cpufreq_driver->getavg)
-		return 0;
-
-	policy = cpufreq_cpu_get(policy->cpu);
-	if (!policy)
-		return -EINVAL;
-
-	ret = cpufreq_driver->getavg(policy, cpu);
-
-	cpufreq_cpu_put(policy);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg);
-
 /*
  * when "event" is CPUFREQ_GOV_LIMITS
  */
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 5af40ad..eeab30a 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -97,7 +97,7 @@  void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 
 	policy = cdbs->cur_policy;
 
-	/* Get Absolute Load (in terms of freq for ondemand gov) */
+	/* Get Absolute Load */
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_common_info *j_cdbs;
 		u64 cur_wall_time, cur_idle_time;
@@ -148,14 +148,6 @@  void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 
 		load = 100 * (wall_time - idle_time) / wall_time;
 
-		if (dbs_data->cdata->governor == GOV_ONDEMAND) {
-			int freq_avg = __cpufreq_driver_getavg(policy, j);
-			if (freq_avg <= 0)
-				freq_avg = policy->cur;
-
-			load *= freq_avg;
-		}
-
 		if (load > max_load)
 			max_load = load;
 	}
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index e16a961..e899c11 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -169,7 +169,6 @@  struct od_dbs_tuners {
 	unsigned int sampling_rate;
 	unsigned int sampling_down_factor;
 	unsigned int up_threshold;
-	unsigned int adj_up_threshold;
 	unsigned int powersave_bias;
 	unsigned int io_is_busy;
 };
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 4b9bb5d..c1e6d3e 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -29,11 +29,9 @@ 
 #include "cpufreq_governor.h"
 
 /* On-demand governor macros */
-#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
 #define DEF_FREQUENCY_UP_THRESHOLD		(80)
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(100000)
-#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL	(3)
 #define MICRO_FREQUENCY_UP_THRESHOLD		(95)
 #define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
 #define MIN_FREQUENCY_UP_THRESHOLD		(11)
@@ -159,14 +157,10 @@  static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq)
 
 /*
  * Every sampling_rate, we check, if current idle time is less than 20%
- * (default), then we try to increase frequency. Every sampling_rate, we look
- * for the lowest frequency which can sustain the load while keeping idle time
- * over 30%. If such a frequency exist, we try to decrease to this frequency.
- *
- * Any frequency increase takes it to the maximum frequency. Frequency reduction
- * happens at minimum steps of 5% (default) of current frequency
+ * (default), then we try to increase frequency. Else, we adjust the frequency
+ * proportional to load.
  */
-static void od_check_cpu(int cpu, unsigned int load_freq)
+static void od_check_cpu(int cpu, unsigned int load)
 {
 	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
 	struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;
@@ -176,29 +170,17 @@  static void od_check_cpu(int cpu, unsigned int load_freq)
 	dbs_info->freq_lo = 0;
 
 	/* Check for frequency increase */
-	if (load_freq > od_tuners->up_threshold * policy->cur) {
+	if (load > od_tuners->up_threshold) {
 		/* If switching to max speed, apply sampling_down_factor */
 		if (policy->cur < policy->max)
 			dbs_info->rate_mult =
 				od_tuners->sampling_down_factor;
 		dbs_freq_increase(policy, policy->max);
 		return;
-	}
-
-	/* Check for frequency decrease */
-	/* if we cannot reduce the frequency anymore, break out early */
-	if (policy->cur == policy->min)
-		return;
-
-	/*
-	 * The optimal frequency is the frequency that is the lowest that can
-	 * support the current CPU usage without triggering the up policy. To be
-	 * safe, we focus 10 points under the threshold.
-	 */
-	if (load_freq < od_tuners->adj_up_threshold
-			* policy->cur) {
+	} else {
+		/* Calculate the next frequency proportional to load */
 		unsigned int freq_next;
-		freq_next = load_freq / od_tuners->adj_up_threshold;
+		freq_next = load * policy->max / 100;
 
 		/* No longer fully busy, reset rate_mult */
 		dbs_info->rate_mult = 1;
@@ -372,9 +354,6 @@  static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
 			input < MIN_FREQUENCY_UP_THRESHOLD) {
 		return -EINVAL;
 	}
-	/* Calculate the new adj_up_threshold */
-	od_tuners->adj_up_threshold += input;
-	od_tuners->adj_up_threshold -= od_tuners->up_threshold;
 
 	od_tuners->up_threshold = input;
 	return count;
@@ -523,8 +502,6 @@  static int od_init(struct dbs_data *dbs_data)
 	if (idle_time != -1ULL) {
 		/* Idle micro accounting is supported. Use finer thresholds */
 		tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
-		tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD -
-			MICRO_FREQUENCY_DOWN_DIFFERENTIAL;
 		/*
 		 * In nohz/micro accounting case we set the minimum frequency
 		 * not depending on HZ, but fixed (very low). The deferred
@@ -533,8 +510,6 @@  static int od_init(struct dbs_data *dbs_data)
 		dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
 	} else {
 		tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
-		tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD -
-			DEF_FREQUENCY_DOWN_DIFFERENTIAL;
 
 		/* For correct statistics, we need 10 ticks for each measure */
 		dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c
deleted file mode 100644
index 911e193..0000000
--- a/drivers/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@ 
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include "mperf.h"
-
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
-					unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
-MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h
deleted file mode 100644
index 5dbf295..0000000
--- a/drivers/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@ 
-/*
- *  (c) 2010 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- */
-
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
-					unsigned int cpu);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 037d36a..60fcb42 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -211,10 +211,6 @@  extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
 				   unsigned int target_freq,
 				   unsigned int relation);
 
-
-extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy,
-				   unsigned int cpu);
-
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
@@ -254,8 +250,6 @@  struct cpufreq_driver {
 	unsigned int	(*get)	(unsigned int cpu);
 
 	/* optional */
-	unsigned int (*getavg)	(struct cpufreq_policy *policy,
-				 unsigned int cpu);
 	int	(*bios_limit)	(int cpu, unsigned int *limit);
 
 	int	(*exit)		(struct cpufreq_policy *policy);