Message ID | 51A7BFA6.3070800@semaphore.gr (mailing list archive) |
---|---|
State | RFC, archived |
Headers | show |
Sorry for joining the party so late.. I was running away from reviewing it :( On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote: > Ondemand calculates load in terms of frequency and increases it only > if the load_freq is greater than up_threshold multiplied by current > or average frequency. This seems to produce oscillations of frequency > between min and max because, for example, a relatively small load can > easily saturate minimum frequency and lead the CPU to max. Then, the > CPU will decrease back to min due to a small load_freq. > > This patch changes the calculation method of load and target frequency > considering 2 points: > - Load computation should be independent from current or average > measured frequency. For example an absolute load 80% at 100MHz is not > necessarily equivalent to 8% at 1000MHz in the next sampling interval. > - Target frequency should be increased to any value of frequency table > proportional to absolute load, instead to only the max. > > The patch also removes the unused code as a result of the above changes. > > Tested on Intel i7-3770 CPU @ 3.40GHz and on Quad core 1500MHz Krait. > Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an > increase ~1.5% in performance. cpufreq_stats (time_in_state) shows > that middle frequencies are used more, with this patch. Highest > and lowest frequencies were used less by ~9% > > Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr> > --- > arch/x86/include/asm/processor.h | 29 ---------------------- > drivers/cpufreq/Makefile | 2 +- > drivers/cpufreq/acpi-cpufreq.c | 5 ---- > drivers/cpufreq/cpufreq.c | 21 ---------------- > drivers/cpufreq/cpufreq_governor.c | 10 +------- > drivers/cpufreq/cpufreq_governor.h | 1 - > drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++----------------------- > drivers/cpufreq/mperf.c | 51 -------------------------------------- > drivers/cpufreq/mperf.h | 9 ------- > include/linux/cpufreq.h | 6 ----- > 10 files changed, 9 insertions(+), 164 deletions(-) > delete mode 100644 drivers/cpufreq/mperf.c > delete mode 100644 drivers/cpufreq/mperf.h I believe you should have removed other users of getavg() in a separate patch and also cc'd relevant people so that you can some review comments from them. > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h > index 22224b3..2874a3b 100644 > --- a/arch/x86/include/asm/processor.h > +++ b/arch/x86/include/asm/processor.h > @@ -941,35 +941,6 @@ extern int set_tsc_mode(unsigned int val); > > extern u16 amd_get_nb_id(int cpu); > > -struct aperfmperf { > - u64 aperf, mperf; > -}; > - > -static inline void get_aperfmperf(struct aperfmperf *am) > -{ > - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); > - > - rdmsrl(MSR_IA32_APERF, am->aperf); > - rdmsrl(MSR_IA32_MPERF, am->mperf); > -} > - > -#define APERFMPERF_SHIFT 10 > - > -static inline > -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, > - struct aperfmperf *new) > -{ > - u64 aperf = new->aperf - old->aperf; > - u64 mperf = new->mperf - old->mperf; > - unsigned long ratio = aperf; > - > - mperf >>= APERFMPERF_SHIFT; > - if (mperf) > - ratio = div64_u64(aperf, mperf); > - > - return ratio; > -} > - > extern unsigned long arch_align_stack(unsigned long sp); > extern void free_init_pages(char *what, unsigned long begin, unsigned long end); > > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile > index 315b923..4519fb1 100644 > --- a/drivers/cpufreq/Makefile > +++ b/drivers/cpufreq/Makefile > @@ -23,7 +23,7 @@ obj-$(CONFIG_GENERIC_CPUFREQ_CPU0) += cpufreq-cpu0.o > # powernow-k8 can load then. ACPI is preferred to all other hardware-specific drivers. > # speedstep-* is preferred over p4-clockmod. > > -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o > +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o > obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o > obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o > obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o > diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c > index 11b8b4b..0025cdd 100644 > --- a/drivers/cpufreq/acpi-cpufreq.c > +++ b/drivers/cpufreq/acpi-cpufreq.c > @@ -45,7 +45,6 @@ > #include <asm/msr.h> > #include <asm/processor.h> > #include <asm/cpufeature.h> > -#include "mperf.h" > > MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); > MODULE_DESCRIPTION("ACPI Processor P-States Driver"); > @@ -842,10 +841,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) > /* notify BIOS that we exist */ > acpi_processor_notify_smm(THIS_MODULE); > > - /* Check for APERF/MPERF support in hardware */ > - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) > - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; > - > pr_debug("CPU%u - ACPI performance management activated.\n", cpu); > for (i = 0; i < perf->state_count; i++) > pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > index 2d53f47..04ceddc 100644 > --- a/drivers/cpufreq/cpufreq.c > +++ b/drivers/cpufreq/cpufreq.c > @@ -1502,27 +1502,6 @@ no_policy: > } > EXPORT_SYMBOL_GPL(cpufreq_driver_target); > > -int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) > -{ > - int ret = 0; > - > - if (cpufreq_disabled()) > - return ret; > - > - if (!cpufreq_driver->getavg) > - return 0; > - > - policy = cpufreq_cpu_get(policy->cpu); > - if (!policy) > - return -EINVAL; > - > - ret = cpufreq_driver->getavg(policy, cpu); > - > - cpufreq_cpu_put(policy); > - return ret; > -} > -EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg); > - > /* > * when "event" is CPUFREQ_GOV_LIMITS > */ > diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c > index 5af40ad..eeab30a 100644 > --- a/drivers/cpufreq/cpufreq_governor.c > +++ b/drivers/cpufreq/cpufreq_governor.c > @@ -97,7 +97,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) > > policy = cdbs->cur_policy; > > - /* Get Absolute Load (in terms of freq for ondemand gov) */ > + /* Get Absolute Load */ > for_each_cpu(j, policy->cpus) { > struct cpu_dbs_common_info *j_cdbs; > u64 cur_wall_time, cur_idle_time; > @@ -148,14 +148,6 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) > > load = 100 * (wall_time - idle_time) / wall_time; > > - if (dbs_data->cdata->governor == GOV_ONDEMAND) { > - int freq_avg = __cpufreq_driver_getavg(policy, j); > - if (freq_avg <= 0) > - freq_avg = policy->cur; > - > - load *= freq_avg; > - } > - > if (load > max_load) > max_load = load; > } > diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h > index e16a961..e899c11 100644 > --- a/drivers/cpufreq/cpufreq_governor.h > +++ b/drivers/cpufreq/cpufreq_governor.h > @@ -169,7 +169,6 @@ struct od_dbs_tuners { > unsigned int sampling_rate; > unsigned int sampling_down_factor; > unsigned int up_threshold; > - unsigned int adj_up_threshold; > unsigned int powersave_bias; > unsigned int io_is_busy; > }; > diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c > index 4b9bb5d..c1e6d3e 100644 > --- a/drivers/cpufreq/cpufreq_ondemand.c > +++ b/drivers/cpufreq/cpufreq_ondemand.c > @@ -29,11 +29,9 @@ > #include "cpufreq_governor.h" > > /* On-demand governor macros */ > -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) > #define DEF_FREQUENCY_UP_THRESHOLD (80) > #define DEF_SAMPLING_DOWN_FACTOR (1) > #define MAX_SAMPLING_DOWN_FACTOR (100000) > -#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) > #define MICRO_FREQUENCY_UP_THRESHOLD (95) > #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) > #define MIN_FREQUENCY_UP_THRESHOLD (11) > @@ -159,14 +157,10 @@ static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) > > /* > * Every sampling_rate, we check, if current idle time is less than 20% > - * (default), then we try to increase frequency. Every sampling_rate, we look > - * for the lowest frequency which can sustain the load while keeping idle time > - * over 30%. If such a frequency exist, we try to decrease to this frequency. > - * > - * Any frequency increase takes it to the maximum frequency. Frequency reduction > - * happens at minimum steps of 5% (default) of current frequency > + * (default), then we try to increase frequency. Else, we adjust the frequency > + * proportional to load. > */ > -static void od_check_cpu(int cpu, unsigned int load_freq) > +static void od_check_cpu(int cpu, unsigned int load) > { > struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); > struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; > @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) > dbs_info->freq_lo = 0; > > /* Check for frequency increase */ > - if (load_freq > od_tuners->up_threshold * policy->cur) { > + if (load > od_tuners->up_threshold) { > /* If switching to max speed, apply sampling_down_factor */ > if (policy->cur < policy->max) > dbs_info->rate_mult = > od_tuners->sampling_down_factor; > dbs_freq_increase(policy, policy->max); > return; > - } > - > - /* Check for frequency decrease */ > - /* if we cannot reduce the frequency anymore, break out early */ > - if (policy->cur == policy->min) > - return; > - > - /* > - * The optimal frequency is the frequency that is the lowest that can > - * support the current CPU usage without triggering the up policy. To be > - * safe, we focus 10 points under the threshold. > - */ > - if (load_freq < od_tuners->adj_up_threshold > - * policy->cur) { > + } else { > + /* Calculate the next frequency proportional to load */ > unsigned int freq_next; > - freq_next = load_freq / od_tuners->adj_up_threshold; > + freq_next = load * policy->max / 100; > > /* No longer fully busy, reset rate_mult */ > dbs_info->rate_mult = 1; > @@ -372,9 +354,6 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, > input < MIN_FREQUENCY_UP_THRESHOLD) { > return -EINVAL; > } > - /* Calculate the new adj_up_threshold */ > - od_tuners->adj_up_threshold += input; > - od_tuners->adj_up_threshold -= od_tuners->up_threshold; > > od_tuners->up_threshold = input; > return count; > @@ -523,8 +502,6 @@ static int od_init(struct dbs_data *dbs_data) > if (idle_time != -1ULL) { > /* Idle micro accounting is supported. Use finer thresholds */ > tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; > - tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD - > - MICRO_FREQUENCY_DOWN_DIFFERENTIAL; > /* > * In nohz/micro accounting case we set the minimum frequency > * not depending on HZ, but fixed (very low). The deferred > @@ -533,8 +510,6 @@ static int od_init(struct dbs_data *dbs_data) > dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; > } else { > tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; > - tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD - > - DEF_FREQUENCY_DOWN_DIFFERENTIAL; > > /* For correct statistics, we need 10 ticks for each measure */ > dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * > diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c > deleted file mode 100644 > index 911e193..0000000 > --- a/drivers/cpufreq/mperf.c > +++ /dev/null > @@ -1,51 +0,0 @@ > -#include <linux/kernel.h> > -#include <linux/smp.h> > -#include <linux/module.h> > -#include <linux/init.h> > -#include <linux/cpufreq.h> > -#include <linux/slab.h> > - > -#include "mperf.h" > - > -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); > - > -/* Called via smp_call_function_single(), on the target CPU */ > -static void read_measured_perf_ctrs(void *_cur) > -{ > - struct aperfmperf *am = _cur; > - > - get_aperfmperf(am); > -} > - > -/* > - * Return the measured active (C0) frequency on this CPU since last call > - * to this function. > - * Input: cpu number > - * Return: Average CPU frequency in terms of max frequency (zero on error) > - * > - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance > - * over a period of time, while CPU is in C0 state. > - * IA32_MPERF counts at the rate of max advertised frequency > - * IA32_APERF counts at the rate of actual CPU frequency > - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and > - * no meaning should be associated with absolute values of these MSRs. > - */ > -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, > - unsigned int cpu) > -{ > - struct aperfmperf perf; > - unsigned long ratio; > - unsigned int retval; > - > - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) > - return 0; > - > - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); > - per_cpu(acfreq_old_perf, cpu) = perf; > - > - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; > - > - return retval; > -} > -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); > -MODULE_LICENSE("GPL"); > diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h > deleted file mode 100644 > index 5dbf295..0000000 > --- a/drivers/cpufreq/mperf.h > +++ /dev/null > @@ -1,9 +0,0 @@ > -/* > - * (c) 2010 Advanced Micro Devices, Inc. > - * Your use of this code is subject to the terms and conditions of the > - * GNU general public license version 2. See "COPYING" or > - * http://www.gnu.org/licenses/gpl.html > - */ > - > -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, > - unsigned int cpu); > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h > index 037d36a..60fcb42 100644 > --- a/include/linux/cpufreq.h > +++ b/include/linux/cpufreq.h > @@ -211,10 +211,6 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, > unsigned int target_freq, > unsigned int relation); > > - > -extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy, > - unsigned int cpu); > - > int cpufreq_register_governor(struct cpufreq_governor *governor); > void cpufreq_unregister_governor(struct cpufreq_governor *governor); > > @@ -254,8 +250,6 @@ struct cpufreq_driver { > unsigned int (*get) (unsigned int cpu); > > /* optional */ > - unsigned int (*getavg) (struct cpufreq_policy *policy, > - unsigned int cpu); > int (*bios_limit) (int cpu, unsigned int *limit); > > int (*exit) (struct cpufreq_policy *policy); > -- > 1.8.1.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote: Ahh.. earlier mail got sent without me doing complete review :( > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > +static void od_check_cpu(int cpu, unsigned int load) > { > struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); > struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; > @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) > dbs_info->freq_lo = 0; > > /* Check for frequency increase */ > - if (load_freq > od_tuners->up_threshold * policy->cur) { > + if (load > od_tuners->up_threshold) { Chances of this getting hit are minimal now.. I don't know if keeping this will change anything now :) > /* If switching to max speed, apply sampling_down_factor */ > if (policy->cur < policy->max) > dbs_info->rate_mult = > od_tuners->sampling_down_factor; > dbs_freq_increase(policy, policy->max); > return; > - } > - > - /* Check for frequency decrease */ > - /* if we cannot reduce the frequency anymore, break out early */ > - if (policy->cur == policy->min) > - return; > - > - /* > - * The optimal frequency is the frequency that is the lowest that can > - * support the current CPU usage without triggering the up policy. To be > - * safe, we focus 10 points under the threshold. > - */ > - if (load_freq < od_tuners->adj_up_threshold > - * policy->cur) { > + } else { > + /* Calculate the next frequency proportional to load */ > unsigned int freq_next; > - freq_next = load_freq / od_tuners->adj_up_threshold; > + freq_next = load * policy->max / 100; Rafael asked why you believe this is the right formula and I really couldn't find an appropriate answer to that, sorry :( -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote: > On 31 May 2013 02:37, Stratos Karafotis <stratosk@semaphore.gr> wrote: > > Ahh.. earlier mail got sent without me doing complete review :( > > > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > > +static void od_check_cpu(int cpu, unsigned int load) > > { > > struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); > > struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; > > @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) > > dbs_info->freq_lo = 0; > > > > /* Check for frequency increase */ > > - if (load_freq > od_tuners->up_threshold * policy->cur) { > > + if (load > od_tuners->up_threshold) { > > Chances of this getting hit are minimal now.. I don't know if keeping > this will change anything now :) > > > /* If switching to max speed, apply sampling_down_factor */ > > if (policy->cur < policy->max) > > dbs_info->rate_mult = > > od_tuners->sampling_down_factor; > > dbs_freq_increase(policy, policy->max); > > return; > > - } > > - > > - /* Check for frequency decrease */ > > - /* if we cannot reduce the frequency anymore, break out early */ > > - if (policy->cur == policy->min) > > - return; > > - > > - /* > > - * The optimal frequency is the frequency that is the lowest that can > > - * support the current CPU usage without triggering the up policy. To be > > - * safe, we focus 10 points under the threshold. > > - */ > > - if (load_freq < od_tuners->adj_up_threshold > > - * policy->cur) { > > + } else { > > + /* Calculate the next frequency proportional to load */ > > unsigned int freq_next; > > - freq_next = load_freq / od_tuners->adj_up_threshold; > > + freq_next = load * policy->max / 100; > > Rafael asked why you believe this is the right formula and I really couldn't > find an appropriate answer to that, sorry :( Right, it would be good to explain that. "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? Rafael
On 05/31/2013 11:51 AM, Viresh Kumar wrote: >> --- >> arch/x86/include/asm/processor.h | 29 ---------------------- >> drivers/cpufreq/Makefile | 2 +- >> drivers/cpufreq/acpi-cpufreq.c | 5 ---- >> drivers/cpufreq/cpufreq.c | 21 ---------------- >> drivers/cpufreq/cpufreq_governor.c | 10 +------- >> drivers/cpufreq/cpufreq_governor.h | 1 - >> drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++----------------------- >> drivers/cpufreq/mperf.c | 51 -------------------------------------- >> drivers/cpufreq/mperf.h | 9 ------- >> include/linux/cpufreq.h | 6 ----- >> 10 files changed, 9 insertions(+), 164 deletions(-) >> delete mode 100644 drivers/cpufreq/mperf.c >> delete mode 100644 drivers/cpufreq/mperf.h > > I believe you should have removed other users of getavg() in a separate > patch and also cc'd relevant people so that you can some review comments > from them. I will split the patch in two. If it's OK, I will keep the removal of __cpufreq_driver_getavg in the original patch and move the clean up of APERF/MPERF support in a second patch. I will also cc relevant people. >> /* Check for frequency increase */ >> - if (load_freq > od_tuners->up_threshold * policy->cur) { >> + if (load > od_tuners->up_threshold) { > > Chances of this getting hit are minimal now.. I don't know if keeping > this will change anything now :) Actually, no. This getting hit pretty often. Please find attached the cpufreq statistics - trans_table during build of 3.4 kernel. With default up_threshold (95), the transition to max happened many times because of load was greater than up_threshold. I also thought to keep this code to leave up_threshold functionality unaffected. On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote: > On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote: >>> + } else { >>> + /* Calculate the next frequency proportional to load */ >>> unsigned int freq_next; >>> - freq_next = load_freq / od_tuners->adj_up_threshold; >>> + freq_next = load * policy->max / 100; >> >> Rafael asked why you believe this is the right formula and I really couldn't >> find an appropriate answer to that, sorry :( > > Right, it would be good to explain that. > > "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". This way the target frequency will be proportional to load and the calculation will "map" the load to CPU freq table. I will update the patch according to your observations and suggestions. Thanks, Stratos From : To : 3401000 3400000 3300000 3100000 3000000 2900000 2800000 2600000 2500000 2400000 2200000 2100000 2000000 1900000 1700000 1600000 3401000: 0 0 4 2 4 2 3 0 2 1 1 1 1 4 0 29 3400000: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3300000: 4 0 0 0 1 0 0 0 0 0 0 1 0 0 0 7 3100000: 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 3000000: 4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 4 2900000: 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 7 2800000: 3 0 0 0 0 1 0 0 0 1 0 0 0 0 0 3 2600000: 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 4 2500000: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 2400000: 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 7 2200000: 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 3 2100000: 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 4 2000000: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1900000: 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 8 1700000: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1600000: 33 0 7 1 2 5 4 5 2 5 4 5 1 6 2 0
On Friday, May 31, 2013 07:33:06 PM Stratos Karafotis wrote: > On 05/31/2013 11:51 AM, Viresh Kumar wrote: > >> --- > >> arch/x86/include/asm/processor.h | 29 ---------------------- > >> drivers/cpufreq/Makefile | 2 +- > >> drivers/cpufreq/acpi-cpufreq.c | 5 ---- > >> drivers/cpufreq/cpufreq.c | 21 ---------------- > >> drivers/cpufreq/cpufreq_governor.c | 10 +------- > >> drivers/cpufreq/cpufreq_governor.h | 1 - > >> drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++----------------------- > >> drivers/cpufreq/mperf.c | 51 -------------------------------------- > >> drivers/cpufreq/mperf.h | 9 ------- > >> include/linux/cpufreq.h | 6 ----- > >> 10 files changed, 9 insertions(+), 164 deletions(-) > >> delete mode 100644 drivers/cpufreq/mperf.c > >> delete mode 100644 drivers/cpufreq/mperf.h > > > > I believe you should have removed other users of getavg() in a separate > > patch and also cc'd relevant people so that you can some review comments > > from them. > > I will split the patch in two. If it's OK, I will keep the removal of > __cpufreq_driver_getavg in the original patch and move the clean up of > APERF/MPERF support in a second patch. I will also cc relevant people. > > > >> /* Check for frequency increase */ > >> - if (load_freq > od_tuners->up_threshold * policy->cur) { > >> + if (load > od_tuners->up_threshold) { > > > > Chances of this getting hit are minimal now.. I don't know if keeping > > this will change anything now :) > > Actually, no. This getting hit pretty often. > Please find attached the cpufreq statistics - trans_table during build > of 3.4 kernel. With default up_threshold (95), the transition to max > happened many times because of load was greater than up_threshold. > I also thought to keep this code to leave up_threshold functionality unaffected. > > On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote: > > On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote: > >>> + } else { > >>> + /* Calculate the next frequency proportional to load */ > >>> unsigned int freq_next; > >>> - freq_next = load_freq / od_tuners->adj_up_threshold; > >>> + freq_next = load * policy->max / 100; > >> > >> Rafael asked why you believe this is the right formula and I really couldn't > >> find an appropriate answer to that, sorry :( > > > > Right, it would be good to explain that. > > > > "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? > > > > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". > This way the target frequency will be proportional to load and the calculation will > "map" the load to CPU freq table. That seems to mean "take the percentage of policy->cpuinfo.max_freq proportional to the current load and use the available frequency closest to that". Is that correct? Rafael
On 06/01/2013 03:27 PM, Rafael J. Wysocki wrote: > On Friday, May 31, 2013 07:33:06 PM Stratos Karafotis wrote: >> On 05/31/2013 11:51 AM, Viresh Kumar wrote: >>>> --- >>>> arch/x86/include/asm/processor.h | 29 ---------------------- >>>> drivers/cpufreq/Makefile | 2 +- >>>> drivers/cpufreq/acpi-cpufreq.c | 5 ---- >>>> drivers/cpufreq/cpufreq.c | 21 ---------------- >>>> drivers/cpufreq/cpufreq_governor.c | 10 +------- >>>> drivers/cpufreq/cpufreq_governor.h | 1 - >>>> drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++----------------------- >>>> drivers/cpufreq/mperf.c | 51 -------------------------------------- >>>> drivers/cpufreq/mperf.h | 9 ------- >>>> include/linux/cpufreq.h | 6 ----- >>>> 10 files changed, 9 insertions(+), 164 deletions(-) >>>> delete mode 100644 drivers/cpufreq/mperf.c >>>> delete mode 100644 drivers/cpufreq/mperf.h >>> >>> I believe you should have removed other users of getavg() in a separate >>> patch and also cc'd relevant people so that you can some review comments >>> from them. >> >> I will split the patch in two. If it's OK, I will keep the removal of >> __cpufreq_driver_getavg in the original patch and move the clean up of >> APERF/MPERF support in a second patch. I will also cc relevant people. >> >> >>>> /* Check for frequency increase */ >>>> - if (load_freq > od_tuners->up_threshold * policy->cur) { >>>> + if (load > od_tuners->up_threshold) { >>> >>> Chances of this getting hit are minimal now.. I don't know if keeping >>> this will change anything now :) >> >> Actually, no. This getting hit pretty often. >> Please find attached the cpufreq statistics - trans_table during build >> of 3.4 kernel. With default up_threshold (95), the transition to max >> happened many times because of load was greater than up_threshold. >> I also thought to keep this code to leave up_threshold functionality unaffected. >> >> On 05/31/2013 03:42 PM, Rafael J. Wysocki wrote: >>> On Friday, May 31, 2013 02:24:59 PM Viresh Kumar wrote: >>>>> + } else { >>>>> + /* Calculate the next frequency proportional to load */ >>>>> unsigned int freq_next; >>>>> - freq_next = load_freq / od_tuners->adj_up_threshold; >>>>> + freq_next = load * policy->max / 100; >>>> >>>> Rafael asked why you believe this is the right formula and I really couldn't >>>> find an appropriate answer to that, sorry :( >>> >>> Right, it would be good to explain that. >>> >>> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? >>> >> >> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". >> This way the target frequency will be proportional to load and the calculation will >> "map" the load to CPU freq table. > > That seems to mean "take the percentage of policy->cpuinfo.max_freq proportional > to the current load and use the available frequency closest to that". Is that > correct? > > Rafael > > In my opinion, yes. I thought, yesterday, after your question, to normalize load to policy->min - policy->max. But I think it's a more correct approach to take the percentage of cpuinfo.max, as you said. Actually, I did my tests on the percentage of policy->max that was equal to cpuinfo.max. Unless, I miss something here. :) Thanks, Stratos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote: > On 05/31/2013 11:51 AM, Viresh Kumar wrote: >> I believe you should have removed other users of getavg() in a separate >> patch and also cc'd relevant people so that you can some review comments >> from them. > > I will split the patch in two. If it's OK, I will keep the removal of > __cpufreq_driver_getavg in the original patch and move the clean up of > APERF/MPERF support in a second patch. I will also cc relevant people. Even removal of __cpufreq_driver_getavg() should be done in a separate patch, so that it can be reverted easily if required later. >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? > > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is initialized.. but user may request a lower max freq for a governor or policy. Which is actually reflected in policy->max I believe. Over that why keeping following check is useful anymore? if (load_freq > od_tuners->up_threshold) goto max. As, if load is over 95, then even policy->max * 95 / 100 will even give almost the same freq. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/01/2013 05:56 PM, Viresh Kumar wrote: > On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote: >> On 05/31/2013 11:51 AM, Viresh Kumar wrote: >>> I believe you should have removed other users of getavg() in a separate >>> patch and also cc'd relevant people so that you can some review comments >>> from them. >> >> I will split the patch in two. If it's OK, I will keep the removal of >> __cpufreq_driver_getavg in the original patch and move the clean up of >> APERF/MPERF support in a second patch. I will also cc relevant people. > > Even removal of __cpufreq_driver_getavg() should be done in a separate > patch, so that it can be reverted easily if required later. Thanks, Viresh. I will do the removal of that function in a seperate patch. Should I use a third patch for it? Or should I include it in the patch which will remove APERF/MPERF support? >>> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? >> >> I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". > > Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is > initialized.. but user may request a lower max freq for a governor or policy. > Which is actually reflected in policy->max I believe. My initial thought is to have a linear function to calculate the target freq proportional to load: (I will use 'C' as the function's slope as Rafael used it) freq_target = C * load For simplicity, let's assume that load is between 0 and 1 as initially is calculated in governor. Ideally, for a load = 0, we should have freq_target = 0 and for load = 1, freq_target = cpuinfo.max So, the slope C = cpuinfo.max I think, it's matter of definition about what policy->min and policy->max can do. Should they change the slope C? Or only limit freq_target? I don't think that the policy->max (or min) should affect HOW (slope C) governor calculates freq_target but only limit the calculated result. Maybe, we could have separate tunables to a affect the slope C. If I'm wrong about the definition of policy->min, policy->max, I would change the code accordingly. > Over that why keeping following check is useful anymore? > > if (load_freq > od_tuners->up_threshold) > goto max. > > As, if load is over 95, then even policy->max * 95 / 100 will even give almost > the same freq. > I thought that too. But maybe user selects a lower value for up_threshold. (For example, up_threshold = 60). In my opinion, we have to keep up_theshold, to give the possibility to user to have max freq with small loads. Thanks for your comments! Stratos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote: > On 31 May 2013 22:03, Stratos Karafotis <stratosk@semaphore.gr> wrote: > > On 05/31/2013 11:51 AM, Viresh Kumar wrote: > >> I believe you should have removed other users of getavg() in a separate > >> patch and also cc'd relevant people so that you can some review comments > >> from them. > > > > I will split the patch in two. If it's OK, I will keep the removal of > > __cpufreq_driver_getavg in the original patch and move the clean up of > > APERF/MPERF support in a second patch. I will also cc relevant people. > > Even removal of __cpufreq_driver_getavg() should be done in a separate > patch, so that it can be reverted easily if required later. Why would you want to revert it separately? > >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? > > > > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". > > Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is > initialized.. but user may request a lower max freq for a governor or policy. > Which is actually reflected in policy->max I believe. Which doesn't matter. The formula should provide the same results regardless of the user settings except that the selected frequency should be capped by policy->max (instead of being proportional to it). I think using cpuinfo.max_freq here is correct. > Over that why keeping following check is useful anymore? > > if (load_freq > od_tuners->up_threshold) > goto max. > > As, if load is over 95, then even policy->max * 95 / 100 will even give almost > the same freq. Yes, in the majority of cases. Thanks, Rafael
On 1 June 2013 21:36, Stratos Karafotis <stratosk@semaphore.gr> wrote: > On 06/01/2013 05:56 PM, Viresh Kumar wrote: >> Even removal of __cpufreq_driver_getavg() should be done in a separate >> patch, so that it can be reverted easily if required later. > > Thanks, Viresh. I will do the removal of that function in a seperate patch. > Should I use a third patch for it? Or should I include it in the patch which > will remove APERF/MPERF support? Maybe a third patch would be more cleaner. >> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is >> initialized.. but user may request a lower max freq for a governor or policy. >> Which is actually reflected in policy->max I believe. > > My initial thought is to have a linear function to calculate the target freq > proportional to load: (I will use 'C' as the function's slope as Rafael used it) > > freq_target = C * load > > For simplicity, let's assume that load is between 0 and 1 as initially is calculated > in governor. > Ideally, for a load = 0, we should have freq_target = 0 and for load = 1, > freq_target = cpuinfo.max > > So, the slope C = cpuinfo.max > > I think, it's matter of definition about what policy->min and policy->max can do. > Should they change the slope C? Or only limit freq_target? > I don't think that the policy->max (or min) should affect HOW (slope C) governor > calculates freq_target but only limit the calculated result. > > Maybe, we could have separate tunables to a affect the slope C. > > If I'm wrong about the definition of policy->min, policy->max, I would change > the code accordingly. Lets discuss that in reply to Rafael's mail. >> As, if load is over 95, then even policy->max * 95 / 100 will even give almost >> the same freq. >> > > I thought that too. But maybe user selects a lower value for up_threshold. > (For example, up_threshold = 60). In my opinion, we have to keep up_theshold, > to give the possibility to user to have max freq with small loads. Yes... good point. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2 June 2013 01:07, Rafael J. Wysocki <rjw@sisk.pl> wrote: > On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote: >> Even removal of __cpufreq_driver_getavg() should be done in a separate >> patch, so that it can be reverted easily if required later. > > Why would you want to revert it separately? We might not need to revert all the changes that Stratos is doing. Only a revert of getavg() and its users + a small fix in governor would be enough. Stratos patch isn't only about removing getavg() but how ondemand works and so breaking stuff might be more useful. >> >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? >> > >> > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". >> >> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is >> initialized.. but user may request a lower max freq for a governor or policy. >> Which is actually reflected in policy->max I believe. > > Which doesn't matter. The formula should provide the same results regardless > of the user settings except that the selected frequency should be capped by > policy->max (instead of being proportional to it). I think using > cpuinfo.max_freq here is correct. I am confused now about what to use.. This is how I read it: Assumption: CPU supports following freq range.. 500 MHz, 600 MHz, 700 MHz, 800 MHz, 900 MHz, 1 GHz cpuinfo.max_freq = 1 GHz policy->max is set to 600 MHz Case 1: Use policy->max: We need load to be over 500/600 (i.e. .834) to move to 600 MHz. Case 2: Use cpuinfo.max_freq.. We need load to be over 500/1000 (i.e. .5) to move to 600 MHz. So, obviously the calculations aren't the same.. Now, this is how I understood these two different variables: - cpuinfo.max_freq: >> Over that why keeping following check is useful anymore? >> >> if (load_freq > od_tuners->up_threshold) >> goto max. >> >> As, if load is over 95, then even policy->max * 95 / 100 will even give almost >> the same freq. > > Yes, in the majority of cases. > > Thanks, > Rafael > > > -- > I speak only for myself. > Rafael J. Wysocki, Intel Open Source Technology Center. -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Sent half written mail.. sorry.. will continue from where I left.
On 3 June 2013 12:21, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> So, obviously the calculations aren't the same..
Now, this is how I understood these two different variables:
- cpuinfo.max_freq: maximum frequency (in kHz) which is
supported by this CPU
- policy->max: Maximum frequency forced by user or governor
When somebody sets policy->max, he expects cpufreq core to
use the range between min and max as the slope cpufreq core
has and adjust its frequencies accordingly..
Don't know if I am right or wrong :(
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
On Monday, June 03, 2013 12:21:47 PM Viresh Kumar wrote: > On 2 June 2013 01:07, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > On Saturday, June 01, 2013 08:26:47 PM Viresh Kumar wrote: > > >> Even removal of __cpufreq_driver_getavg() should be done in a separate > >> patch, so that it can be reverted easily if required later. > > > > Why would you want to revert it separately? > > We might not need to revert all the changes that Stratos is doing. > Only a revert of getavg() and its users + a small fix in governor would > be enough. Stratos patch isn't only about removing getavg() but how > ondemand works and so breaking stuff might be more useful. Yes, it's probably better to do the getavg() removal as a separate patch. > >> >> "Proportional to load" means C * load, so why is "policy->max / 100" *the* right C? > >> > > >> > I think, finally(?) I see your point. The right C should be "policy->cpuinfo.max_freq / 100". > >> > >> Why are you changing it to cpuinfo.max_freq?? This is fixed once a driver is > >> initialized.. but user may request a lower max freq for a governor or policy. > >> Which is actually reflected in policy->max I believe. > > > > Which doesn't matter. The formula should provide the same results regardless > > of the user settings except that the selected frequency should be capped by > > policy->max (instead of being proportional to it). I think using > > cpuinfo.max_freq here is correct. > > I am confused now about what to use.. This is how I read it: > > Assumption: CPU supports following freq range.. 500 MHz, 600 MHz, 700 MHz, > 800 MHz, 900 MHz, 1 GHz > > cpuinfo.max_freq = 1 GHz > policy->max is set to 600 MHz > > Case 1: Use policy->max: > > We need load to be over 500/600 (i.e. .834) to move to 600 MHz. > > Case 2: Use cpuinfo.max_freq.. > > We need load to be over 500/1000 (i.e. .5) to move to 600 MHz. > > So, obviously the calculations aren't the same.. No, they aren't. My point is that the set of frequencies to choose from doesn't change with the changes of user settings, so the computation should use things that don't with the user settings either. Thanks, Rafael
On Monday, June 03, 2013 12:25:02 PM Viresh Kumar wrote: > Sent half written mail.. sorry.. will continue from where I left. > > On 3 June 2013 12:21, Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > So, obviously the calculations aren't the same.. > > Now, this is how I understood these two different variables: > - cpuinfo.max_freq: maximum frequency (in kHz) which is > supported by this CPU > - policy->max: Maximum frequency forced by user or governor > > When somebody sets policy->max, he expects cpufreq core to > use the range between min and max as the slope cpufreq core > has and adjust its frequencies accordingly.. > > Don't know if I am right or wrong :( The reality is that the set of frequencies to use is constant and changing policy->max doesn't change that set. There are just fewer frequencies the governor can choose from if policy->max is below cpuinfo.max_freq. The question is if we want policy->max to re-scale them effectively (i.e. to change weights so that the maximum load maps to the highest frequency available at the moment) or if we want policy->max to work as a cap (i.e. to map all loads above certain value to the maximum frequency available at the moment, so that the criteria for selecting the lower frequencies don't change). In my opinion the second option is better, because it means "OK, we can't use some high frequencies, but let's not hurt performance for the loads that wouldn't require them anyway". Otherwise, we'll effectively throttle all loads and that not only causes performance to drop, but also causes more energy to be used overall. Thanks, Rafael
On 3 June 2013 16:27, Rafael J. Wysocki <rjw@sisk.pl> wrote: > The question is if we want policy->max to re-scale them effectively (i.e. to > change weights so that the maximum load maps to the highest frequency available > at the moment) or if we want policy->max to work as a cap (i.e. to map all > loads above certain value to the maximum frequency available at the moment, so > that the criteria for selecting the lower frequencies don't change). In my > opinion the second option is better, because it means "OK, we can't use some > high frequencies, but let's not hurt performance for the loads that wouldn't > require them anyway". Otherwise, we'll effectively throttle all loads and > that not only causes performance to drop, but also causes more energy to be > used overall. I wouldn't say that I don't agree with you as I do to some extent. But the question that comes to my mind now is: Why is policy->max reduced in the first place? User doesn't have control over which freqs to expose, so available_frequencies will stay the same. The only thing he is capable of doing is: reduce policy->max.. Which in a way means that.. "I don't want to use higher frequencies (power, thermal reasons) and I know performance will go down with it and I don't care for it now." And this way I feel policy->max isn't a bad choice either. BUT as you said: policy->max isn't there to say don't be so aggressive now in increasing frequencies but just only to say, don't go over this frequency.. So, probably we can use cpuinfo.max_freq :) -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/03/2013 02:24 PM, Viresh Kumar wrote: > On 3 June 2013 16:27, Rafael J. Wysocki <rjw@sisk.pl> wrote: >> The question is if we want policy->max to re-scale them effectively (i.e. to >> change weights so that the maximum load maps to the highest frequency available >> at the moment) or if we want policy->max to work as a cap (i.e. to map all >> loads above certain value to the maximum frequency available at the moment, so >> that the criteria for selecting the lower frequencies don't change). In my >> opinion the second option is better, because it means "OK, we can't use some >> high frequencies, but let's not hurt performance for the loads that wouldn't >> require them anyway". Otherwise, we'll effectively throttle all loads and >> that not only causes performance to drop, but also causes more energy to be >> used overall. > > I wouldn't say that I don't agree with you as I do to some extent. > > But the question that comes to my mind now is: Why is policy->max reduced > in the first place? User doesn't have control over which freqs to expose, so > available_frequencies will stay the same. The only thing he is capable > of doing is: reduce policy->max.. Which in a way means that.. "I don't want to > use higher frequencies (power, thermal reasons) and I know performance will > go down with it and I don't care for it now." > > And this way I feel policy->max isn't a bad choice either. > > BUT as you said: policy->max isn't there to say don't be so aggressive now in > increasing frequencies but just only to say, don't go over this frequency.. > > So, probably we can use cpuinfo.max_freq :) > Both of you know better than me, but I also believe that cpuinfo.max is more appropriate. Although, the decision was taken, let me share a spreadsheet to show the 2 cases. https://docs.google.com/spreadsheet/ccc?key=0AnMfNYUV1k0ddHh1OUFXa0kxcGZJaXd4am1sdmVWT0E&usp=sharing I will prepare the v2 of this patch that uses cpuinfo.max_freq instead of policy-max. Also, I will split the patch into 3 parts as suggested. Thank you for your comments and suggestions! Stratos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 22224b3..2874a3b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -941,35 +941,6 @@ extern int set_tsc_mode(unsigned int val); extern u16 amd_get_nb_id(int cpu); -struct aperfmperf { - u64 aperf, mperf; -}; - -static inline void get_aperfmperf(struct aperfmperf *am) -{ - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); - - rdmsrl(MSR_IA32_APERF, am->aperf); - rdmsrl(MSR_IA32_MPERF, am->mperf); -} - -#define APERFMPERF_SHIFT 10 - -static inline -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, - struct aperfmperf *new) -{ - u64 aperf = new->aperf - old->aperf; - u64 mperf = new->mperf - old->mperf; - unsigned long ratio = aperf; - - mperf >>= APERFMPERF_SHIFT; - if (mperf) - ratio = div64_u64(aperf, mperf); - - return ratio; -} - extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 315b923..4519fb1 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_GENERIC_CPUFREQ_CPU0) += cpufreq-cpu0.o # powernow-k8 can load then. ACPI is preferred to all other hardware-specific drivers. # speedstep-* is preferred over p4-clockmod. -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 11b8b4b..0025cdd 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -45,7 +45,6 @@ #include <asm/msr.h> #include <asm/processor.h> #include <asm/cpufeature.h> -#include "mperf.h" MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); MODULE_DESCRIPTION("ACPI Processor P-States Driver"); @@ -842,10 +841,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - /* Check for APERF/MPERF support in hardware */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - pr_debug("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 2d53f47..04ceddc 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1502,27 +1502,6 @@ no_policy: } EXPORT_SYMBOL_GPL(cpufreq_driver_target); -int __cpufreq_driver_getavg(struct cpufreq_policy *policy, unsigned int cpu) -{ - int ret = 0; - - if (cpufreq_disabled()) - return ret; - - if (!cpufreq_driver->getavg) - return 0; - - policy = cpufreq_cpu_get(policy->cpu); - if (!policy) - return -EINVAL; - - ret = cpufreq_driver->getavg(policy, cpu); - - cpufreq_cpu_put(policy); - return ret; -} -EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg); - /* * when "event" is CPUFREQ_GOV_LIMITS */ diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 5af40ad..eeab30a 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -97,7 +97,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) policy = cdbs->cur_policy; - /* Get Absolute Load (in terms of freq for ondemand gov) */ + /* Get Absolute Load */ for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs; u64 cur_wall_time, cur_idle_time; @@ -148,14 +148,6 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) load = 100 * (wall_time - idle_time) / wall_time; - if (dbs_data->cdata->governor == GOV_ONDEMAND) { - int freq_avg = __cpufreq_driver_getavg(policy, j); - if (freq_avg <= 0) - freq_avg = policy->cur; - - load *= freq_avg; - } - if (load > max_load) max_load = load; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index e16a961..e899c11 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -169,7 +169,6 @@ struct od_dbs_tuners { unsigned int sampling_rate; unsigned int sampling_down_factor; unsigned int up_threshold; - unsigned int adj_up_threshold; unsigned int powersave_bias; unsigned int io_is_busy; }; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 4b9bb5d..c1e6d3e 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -29,11 +29,9 @@ #include "cpufreq_governor.h" /* On-demand governor macros */ -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) #define DEF_FREQUENCY_UP_THRESHOLD (80) #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) -#define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (11) @@ -159,14 +157,10 @@ static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) /* * Every sampling_rate, we check, if current idle time is less than 20% - * (default), then we try to increase frequency. Every sampling_rate, we look - * for the lowest frequency which can sustain the load while keeping idle time - * over 30%. If such a frequency exist, we try to decrease to this frequency. - * - * Any frequency increase takes it to the maximum frequency. Frequency reduction - * happens at minimum steps of 5% (default) of current frequency + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. */ -static void od_check_cpu(int cpu, unsigned int load_freq) +static void od_check_cpu(int cpu, unsigned int load) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; @@ -176,29 +170,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) dbs_info->freq_lo = 0; /* Check for frequency increase */ - if (load_freq > od_tuners->up_threshold * policy->cur) { + if (load > od_tuners->up_threshold) { /* If switching to max speed, apply sampling_down_factor */ if (policy->cur < policy->max) dbs_info->rate_mult = od_tuners->sampling_down_factor; dbs_freq_increase(policy, policy->max); return; - } - - /* Check for frequency decrease */ - /* if we cannot reduce the frequency anymore, break out early */ - if (policy->cur == policy->min) - return; - - /* - * The optimal frequency is the frequency that is the lowest that can - * support the current CPU usage without triggering the up policy. To be - * safe, we focus 10 points under the threshold. - */ - if (load_freq < od_tuners->adj_up_threshold - * policy->cur) { + } else { + /* Calculate the next frequency proportional to load */ unsigned int freq_next; - freq_next = load_freq / od_tuners->adj_up_threshold; + freq_next = load * policy->max / 100; /* No longer fully busy, reset rate_mult */ dbs_info->rate_mult = 1; @@ -372,9 +354,6 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, input < MIN_FREQUENCY_UP_THRESHOLD) { return -EINVAL; } - /* Calculate the new adj_up_threshold */ - od_tuners->adj_up_threshold += input; - od_tuners->adj_up_threshold -= od_tuners->up_threshold; od_tuners->up_threshold = input; return count; @@ -523,8 +502,6 @@ static int od_init(struct dbs_data *dbs_data) if (idle_time != -1ULL) { /* Idle micro accounting is supported. Use finer thresholds */ tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; - tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD - - MICRO_FREQUENCY_DOWN_DIFFERENTIAL; /* * In nohz/micro accounting case we set the minimum frequency * not depending on HZ, but fixed (very low). The deferred @@ -533,8 +510,6 @@ static int od_init(struct dbs_data *dbs_data) dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; - tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD - - DEF_FREQUENCY_DOWN_DIFFERENTIAL; /* For correct statistics, we need 10 ticks for each measure */ dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c deleted file mode 100644 index 911e193..0000000 --- a/drivers/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h deleted file mode 100644 index 5dbf295..0000000 --- a/drivers/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 037d36a..60fcb42 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -211,10 +211,6 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); - -extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy, - unsigned int cpu); - int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); @@ -254,8 +250,6 @@ struct cpufreq_driver { unsigned int (*get) (unsigned int cpu); /* optional */ - unsigned int (*getavg) (struct cpufreq_policy *policy, - unsigned int cpu); int (*bios_limit) (int cpu, unsigned int *limit); int (*exit) (struct cpufreq_policy *policy);
Ondemand calculates load in terms of frequency and increases it only if the load_freq is greater than up_threshold multiplied by current or average frequency. This seems to produce oscillations of frequency between min and max because, for example, a relatively small load can easily saturate minimum frequency and lead the CPU to max. Then, the CPU will decrease back to min due to a small load_freq. This patch changes the calculation method of load and target frequency considering 2 points: - Load computation should be independent from current or average measured frequency. For example an absolute load 80% at 100MHz is not necessarily equivalent to 8% at 1000MHz in the next sampling interval. - Target frequency should be increased to any value of frequency table proportional to absolute load, instead to only the max. The patch also removes the unused code as a result of the above changes. Tested on Intel i7-3770 CPU @ 3.40GHz and on Quad core 1500MHz Krait. Phoronix benchmark of Linux Kernel Compilation 3.1 test shows an increase ~1.5% in performance. cpufreq_stats (time_in_state) shows that middle frequencies are used more, with this patch. Highest and lowest frequencies were used less by ~9% Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr> --- arch/x86/include/asm/processor.h | 29 ---------------------- drivers/cpufreq/Makefile | 2 +- drivers/cpufreq/acpi-cpufreq.c | 5 ---- drivers/cpufreq/cpufreq.c | 21 ---------------- drivers/cpufreq/cpufreq_governor.c | 10 +------- drivers/cpufreq/cpufreq_governor.h | 1 - drivers/cpufreq/cpufreq_ondemand.c | 39 ++++++----------------------- drivers/cpufreq/mperf.c | 51 -------------------------------------- drivers/cpufreq/mperf.h | 9 ------- include/linux/cpufreq.h | 6 ----- 10 files changed, 9 insertions(+), 164 deletions(-) delete mode 100644 drivers/cpufreq/mperf.c delete mode 100644 drivers/cpufreq/mperf.h