Message ID | 3169564.ZRsPWhXyMD@kreacher (mailing list archive) |
---|---|
State | RFC, archived |
Headers | show |
Series | [RFC/RFT] cpufreq: intel_pstate: Work in passive mode with HWP enabled | expand |
"Rafael J. Wysocki" <rjw@rjwysocki.net> writes: > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > > Allow intel_pstate to work in the passive mode with HWP enabled and > make it translate the target frequency supplied by the cpufreq > governor in use into an EPP value to be written to the HWP request > MSR (high frequencies are mapped to low EPP values that mean more > performance-oriented HWP operation) as a hint for the HWP algorithm > in the processor, so as to prevent it and the CPU scheduler from > working against each other at least when the schedutil governor is > in use. > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > --- > > This is a prototype not intended for production use (based on linux-next). > > Please test it if you can (on HWP systems, of course) and let me know the > results. > > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > may turn out to be either too high or too low for the general use, which is one > reason why getting as much testing coverage as possible is key here. > > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > please do so and let me know the conclusions. 
> > Cheers, > Rafael > > --- > drivers/cpufreq/intel_pstate.c | 169 +++++++++++++++++++++++++++++++---------- > 1 file changed, 131 insertions(+), 38 deletions(-) > > Index: linux-pm/drivers/cpufreq/intel_pstate.c > =================================================================== > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c > +++ linux-pm/drivers/cpufreq/intel_pstate.c > @@ -36,6 +36,7 @@ > #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) > > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 > #define INTEL_CPUFREQ_TRANSITION_DELAY 500 > > #ifdef CONFIG_ACPI > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int > return div_ext_fp(percent, 100); > } > > +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) > + > /** > * struct sample - Store performance sample > * @core_avg_perf: Ratio of APERF/MPERF which is the actual average > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st > > static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) > { > - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > + if (hwp_active) > + intel_pstate_hwp_force_min_perf(policy->cpu); > + else > + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > } > > static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct > pr_debug("CPU %d exiting\n", policy->cpu); > > intel_pstate_clear_update_util_hook(policy->cpu); > - if (hwp_active) { > + if (hwp_active) > intel_pstate_hwp_save_state(policy); > - intel_pstate_hwp_force_min_perf(policy->cpu); > - } else { > - intel_cpufreq_stop_cpu(policy); > - } > + > + intel_cpufreq_stop_cpu(policy); > } > > static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s > #define INTEL_PSTATE_TRACE_TARGET 10 > #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 > > -static void intel_cpufreq_trace(struct cpudata *cpu, 
unsigned int trace_type, int old_pstate) > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, > + int from, int to) > { > struct sample *sample; > > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c > sample = &cpu->sample; > trace_pstate_sample(trace_type, > 0, > - old_pstate, > - cpu->pstate.current_pstate, > + from, > + to, > sample->mperf, > sample->aperf, > sample->tsc, > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c > fp_toint(cpu->iowait_boost * 100)); > } > > -static int intel_cpufreq_target(struct cpufreq_policy *policy, > - unsigned int target_freq, > - unsigned int relation) > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) > { > - struct cpudata *cpu = all_cpu_data[policy->cpu]; > - struct cpufreq_freqs freqs; > - int target_pstate, old_pstate; > + u64 value, prev; > > - update_turbo_state(); > + prev = READ_ONCE(cpu->hwp_req_cached); > + value = prev; > > - freqs.old = policy->cur; > - freqs.new = target_freq; > + /* > + * The entire MSR needs to be updated in order to update the EPP field > + * in it, so opportunistically update the min and max too if needed. > + */ > + value &= ~HWP_MIN_PERF(~0L); > + value |= HWP_MIN_PERF(cpu->min_perf_ratio); > + > + value &= ~HWP_MAX_PERF(~0L); > + value |= HWP_MAX_PERF(cpu->max_perf_ratio); > + > + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, > + HWP_EPP_TO_BYTE(prev), new_epp); > + > + value &= ~GENMASK_ULL(31, 24); > + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); > + } > + > + if (value != prev) { > + WRITE_ONCE(cpu->hwp_req_cached, value); > + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); > + } > +} > + > +/** > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. > + * @cpu: Target CPU. > + * @max_freq: Maximum frequency to consider. > + * @target_freq: Target frequency selected by the governor. 
> + * > + * Translate the target frequency into a new EPP value to be written into the > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. > + * > + * The purpose of this is to avoid situations in which the kernel and the HWP > + * algorithm work against each other by giving a hint about the expectations of > + * the former to the latter. > + * > + * The mapping betweeen the target frequencies and the hint values need not be > + * exact, but it must be monotonic, so that higher target frequencies always > + * indicate more performance-oriented P-state selection. > + */ > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, > + unsigned int target_freq) > +{ > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); > + > + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); > +} > + Hey Rafael, I'm building a kernel with this in order to give it a try on my system, but I'm skeptical that translating the target frequency to an EPP value will work reliably. AFAIA the EPP value only has an indirect influence on the processor's performance by adjusting the trade-off between its responsiveness (rather than the actual clock frequency which it will sustain in the long run) and its energy usage, in a largely unspecified and non-linear way (non-linear like the effect of switching CPU energy optimization features on and off, or like its effect on the energy balancing behavior of the processor which can have a paradoxical effect on performance). I doubt that the scheduling-based CPU utilization is a good predictor for the optimal trade-off between those two variables. Consider the following scenarios where this would deviate from the typical default EPP of 0x80: - The application has low CPU utilization (say 10%) but it's latency-bound, the current default EPP value allows the CPU to ramp up during each burst and deliver reasonable average performance. 
With this code in effect you may end up at an EPP > 0xE0 which will severely limit the responsiveness of the HWP during each burst of work, possibly reducing performance (since the application was latency-bound), which will cause the CPU utilization of the process to further decrease, leading to a further increase in EPP and further decrease in performance (IOW this failure mode would have positive feedback rather than being somehow self-correcting). - The application has medium to high CPU utilization (say 60%), but it's IO-bound (e.g. a database server with a hard-drive bottleneck). This would cause you to program an EPP value around 0x60 which will cause the HWP to respond more aggressively than currently to each burst of work, increasing energy usage without a corresponding increase in performance (since the application was IO-bound). - Same as the last point, but the application is TDP-bound instead (e.g. a computer game running on an integrated GPU). Not only will CPU energy usage increase, but performance will decrease with it, due to the TDP budget being stolen from the GPU. - The application has medium to low CPU utilization and frequently migrates between CPU cores. This would end up at an EPP >= 0x80 which won't give the application any CPU responsiveness advantage during migration over the current behavior (but possibly quite the opposite), even though the scheduler might know exactly what CPU frequency the application was able to average, and could have saved it some significant amount of ramp-up time. It seems to me that hooking this up to the HWP_MIN_PERF control instead of EPP would have a more deterministic effect and avoid the failure scenarios described above. E.g. upon migration you would be able to instantly set up the new CPU core to the same P-state you knew the application was averaging before migration. No performance is sacrificed in latency-bound applications since you wouldn't be limiting the HWP's responsiveness. 
No energy efficiency is sacrificed in the steady state since you would be programming the most energy-efficient (minimum constant) P-state which is able to deliver the observed performance of the application -- It might make sense to program that value *minus* some small margin though, in order to give the HWP the chance to clock down in cases where the load fluctuates discontinuously before the next update of the HWP request. Of course it would also be nice to give schedutil control of HWP_MAX_PERF eventually, e.g. by implementing the energy efficiency optimization I've been proposing [1] in schedutil too. [1] https://lwn.net/Articles/818827/ > +static int intel_cpufreq_adjust_pstate(struct cpudata *cpu, > + unsigned int target_freq, > + unsigned int relation) > +{ > + int old_pstate = cpu->pstate.current_pstate; > + int target_pstate; > > - cpufreq_freq_transition_begin(policy, &freqs); > switch (relation) { > case CPUFREQ_RELATION_L: > - target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); > + target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); > break; > case CPUFREQ_RELATION_H: > - target_pstate = freqs.new / cpu->pstate.scaling; > + target_pstate = target_freq / cpu->pstate.scaling; > break; > default: > - target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); > + target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling); > break; > } > target_pstate = intel_pstate_prepare_request(cpu, target_pstate); > - old_pstate = cpu->pstate.current_pstate; > - if (target_pstate != cpu->pstate.current_pstate) { > + if (target_pstate != old_pstate) { > cpu->pstate.current_pstate = target_pstate; > - wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, > + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, > pstate_funcs.get_val(cpu, target_pstate)); > } > - freqs.new = target_pstate * cpu->pstate.scaling; > - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate); > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate, 
target_pstate); > + return target_pstate * cpu->pstate.scaling; > +} > + > +static int intel_cpufreq_target(struct cpufreq_policy *policy, > + unsigned int target_freq, > + unsigned int relation) > +{ > + struct cpudata *cpu = all_cpu_data[policy->cpu]; > + struct cpufreq_freqs freqs; > + > + update_turbo_state(); > + > + freqs.old = policy->cur; > + freqs.new = target_freq; > + > + cpufreq_freq_transition_begin(policy, &freqs); > + > + if (hwp_active) > + intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq, > + target_freq); > + else > + freqs.new = intel_cpufreq_adjust_pstate(cpu, target_freq, relation); > + > cpufreq_freq_transition_end(policy, &freqs, false); > > return 0; > @@ -2365,11 +2440,18 @@ static unsigned int intel_cpufreq_fast_s > > update_turbo_state(); > > + if (hwp_active) { > + intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq, > + target_freq); > + return target_freq; > + } > + > target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); > target_pstate = intel_pstate_prepare_request(cpu, target_pstate); > old_pstate = cpu->pstate.current_pstate; > intel_pstate_update_pstate(cpu, target_pstate); > - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate, > + target_pstate); > return target_pstate * cpu->pstate.scaling; > } > > @@ -2389,7 +2471,6 @@ static int intel_cpufreq_cpu_init(struct > return ret; > > policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; > - policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; > /* This reflects the intel_pstate_get_cpu_pstates() setting. 
*/ > policy->cur = policy->cpuinfo.min_freq; > > @@ -2401,10 +2482,19 @@ static int intel_cpufreq_cpu_init(struct > > cpu = all_cpu_data[policy->cpu]; > > - if (hwp_active) > + if (hwp_active) { > + u64 value; > + > + rdmsrl_on_cpu(policy->cpu, MSR_HWP_REQUEST, &value); > + WRITE_ONCE(cpu->hwp_req_cached, value); > + cpu->epp_saved = HWP_EPP_TO_BYTE(value); > + > intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state); > - else > + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; > + } else { > turbo_max = cpu->pstate.turbo_pstate; > + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; > + } > > min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); > min_freq *= cpu->pstate.scaling; > @@ -2449,6 +2539,13 @@ static int intel_cpufreq_cpu_exit(struct > freq_qos_remove_request(req); > kfree(req); > > + if (hwp_active) { > + struct cpudata *cpu = all_cpu_data[policy->cpu]; > + > + /* Restore the original HWP EPP value. */ > + intel_cpufreq_update_hwp_request(cpu, cpu->epp_saved); > + } > + > return intel_pstate_cpu_exit(policy); > } > > @@ -2505,9 +2602,6 @@ static int intel_pstate_register_driver( > > static int intel_pstate_unregister_driver(void) > { > - if (hwp_active) > - return -EBUSY; > - > cpufreq_unregister_driver(intel_pstate_driver); > intel_pstate_driver_cleanup(); > > @@ -2815,12 +2909,11 @@ static int __init intel_pstate_setup(cha > if (!str) > return -EINVAL; > > - if (!strcmp(str, "disable")) { > + if (!strcmp(str, "disable")) > no_load = 1; > - } else if (!strcmp(str, "passive")) { > + else if (!strcmp(str, "passive")) > default_driver = &intel_cpufreq; > - no_hwp = 1; > - } > + > if (!strcmp(str, "no_hwp")) { > pr_info("HWP disabled\n"); > no_hwp = 1;
On Mon, May 25, 2020 at 3:39 AM Francisco Jerez <francisco.jerez.plata@intel.com> wrote: > > "Rafael J. Wysocki" <rjw@rjwysocki.net> writes: > > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > > > > Allow intel_pstate to work in the passive mode with HWP enabled and > > make it translate the target frequency supplied by the cpufreq > > governor in use into an EPP value to be written to the HWP request > > MSR (high frequencies are mapped to low EPP values that mean more > > performance-oriented HWP operation) as a hint for the HWP algorithm > > in the processor, so as to prevent it and the CPU scheduler from > > working against each other at least when the schedutil governor is > > in use. > > > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > > --- > > > > This is a prototype not intended for production use (based on linux-next). > > > > Please test it if you can (on HWP systems, of course) and let me know the > > results. > > > > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > > may turn out to be either too high or too low for the general use, which is one > > reason why getting as much testing coverage as possible is key here. > > > > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > > please do so and let me know the conclusions. 
> > > > Cheers, > > Rafael > > > > --- > > drivers/cpufreq/intel_pstate.c | 169 +++++++++++++++++++++++++++++++---------- > > 1 file changed, 131 insertions(+), 38 deletions(-) > > > > Index: linux-pm/drivers/cpufreq/intel_pstate.c > > =================================================================== > > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c > > +++ linux-pm/drivers/cpufreq/intel_pstate.c > > @@ -36,6 +36,7 @@ > > #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) > > > > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 > > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 > > #define INTEL_CPUFREQ_TRANSITION_DELAY 500 > > > > #ifdef CONFIG_ACPI > > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int > > return div_ext_fp(percent, 100); > > } > > > > +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) > > + > > /** > > * struct sample - Store performance sample > > * @core_avg_perf: Ratio of APERF/MPERF which is the actual average > > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st > > > > static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) > > { > > - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > > + if (hwp_active) > > + intel_pstate_hwp_force_min_perf(policy->cpu); > > + else > > + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > > } > > > > static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) > > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct > > pr_debug("CPU %d exiting\n", policy->cpu); > > > > intel_pstate_clear_update_util_hook(policy->cpu); > > - if (hwp_active) { > > + if (hwp_active) > > intel_pstate_hwp_save_state(policy); > > - intel_pstate_hwp_force_min_perf(policy->cpu); > > - } else { > > - intel_cpufreq_stop_cpu(policy); > > - } > > + > > + intel_cpufreq_stop_cpu(policy); > > } > > > > static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) > > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s > > #define 
INTEL_PSTATE_TRACE_TARGET 10 > > #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 > > > > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) > > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, > > + int from, int to) > > { > > struct sample *sample; > > > > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c > > sample = &cpu->sample; > > trace_pstate_sample(trace_type, > > 0, > > - old_pstate, > > - cpu->pstate.current_pstate, > > + from, > > + to, > > sample->mperf, > > sample->aperf, > > sample->tsc, > > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c > > fp_toint(cpu->iowait_boost * 100)); > > } > > > > -static int intel_cpufreq_target(struct cpufreq_policy *policy, > > - unsigned int target_freq, > > - unsigned int relation) > > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) > > { > > - struct cpudata *cpu = all_cpu_data[policy->cpu]; > > - struct cpufreq_freqs freqs; > > - int target_pstate, old_pstate; > > + u64 value, prev; > > > > - update_turbo_state(); > > + prev = READ_ONCE(cpu->hwp_req_cached); > > + value = prev; > > > > - freqs.old = policy->cur; > > - freqs.new = target_freq; > > + /* > > + * The entire MSR needs to be updated in order to update the EPP field > > + * in it, so opportunistically update the min and max too if needed. 
> > + */ > > + value &= ~HWP_MIN_PERF(~0L); > > + value |= HWP_MIN_PERF(cpu->min_perf_ratio); > > + > > + value &= ~HWP_MAX_PERF(~0L); > > + value |= HWP_MAX_PERF(cpu->max_perf_ratio); > > + > > + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { > > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, > > + HWP_EPP_TO_BYTE(prev), new_epp); > > + > > + value &= ~GENMASK_ULL(31, 24); > > + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); > > + } > > + > > + if (value != prev) { > > + WRITE_ONCE(cpu->hwp_req_cached, value); > > + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); > > + } > > +} > > + > > +/** > > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. > > + * @cpu: Target CPU. > > + * @max_freq: Maximum frequency to consider. > > + * @target_freq: Target frequency selected by the governor. > > + * > > + * Translate the target frequency into a new EPP value to be written into the > > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. > > + * > > + * The purpose of this is to avoid situations in which the kernel and the HWP > > + * algorithm work against each other by giving a hint about the expectations of > > + * the former to the latter. > > + * > > + * The mapping betweeen the target frequencies and the hint values need not be > > + * exact, but it must be monotonic, so that higher target frequencies always > > + * indicate more performance-oriented P-state selection. > > + */ > > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, > > + unsigned int target_freq) > > +{ > > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); > > + > > + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); > > +} > > + > > Hey Rafael, I'm building a kernel with this in order to give it a try on > my system, but I'm skeptical that translating the target frequency to an > EPP value will work reliably. 
AFAIA the EPP value only has an indirect > influence on the processor's performance by adjusting the trade-off > between its responsiveness (rather than the actual clock frequency which > it will sustain in the long run) and its energy usage, in a largely > unspecified and non-linear way (non-linear like the effect of switching > CPU energy optimization features on and off, or like its effect on the > energy balancing behavior of the processor which can have a paradoxical > effect on performance). > > I doubt that the scheduling-based CPU utilization is a good predictor > for the optimal trade-off between those two variables. While I agree that this is not perfect, there barely is anything else that can be used for this purpose. Using the desired field or trying to adjust the limits relatively often would basically cause the P-state selection to be driven entirely by the kernel which simply doesn't know certain things only known to the P-unit, so it is reasonable to leave some level of control to the latter, so as to allow it to use the information known to it only. However, if it is allowed to do whatever it likes without any hints from the kernel, it may very well go against the scheduler's decisions which is not going to be optimal. I'm simply not sure if there is any other way to give such hints to it than through the EPP. Thanks!
On 2020.05.21 10:16 Rafael J. Wysocki wrote: > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > > Allow intel_pstate to work in the passive mode with HWP enabled and > make it translate the target frequency supplied by the cpufreq > governor in use into an EPP value to be written to the HWP request > MSR (high frequencies are mapped to low EPP values that mean more > performance-oriented HWP operation) as a hint for the HWP algorithm > in the processor, so as to prevent it and the CPU scheduler from > working against each other at least when the schedutil governor is > in use. > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > --- > > This is a prototype not intended for production use (based on linux-next). > > Please test it if you can (on HWP systems, of course) and let me know the > results. > > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > may turn out to be either too high or too low for the general use, which is one > reason why getting as much testing coverage as possible is key here. > > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > please do so and let me know the conclusions. Hi all, It is way too early for me to reply to this, as my tests are incomplete. However, I saw the other e-mails and will chime in now. Note before: I have been having difficulties with this test computer, results may be suspect. The default INTEL_CPUFREQ_TRANSITION_DELAY_HWP is definitely too short. I do not know what an optimal value is, but I have made it the same as INTEL_CPUFREQ_TRANSITION_LATENCY for now, and because it makes for great test comparisons. I have only done the simplest of tests so far, single thread, forced CPU affinity load ramp up/down. Load: fixed work packet per work period, i.e. the amount of work to do is independent of CPU frequency. 347 hertz work / sleep frequency. Why? No reason. Because it is what I used last time. To reveal any hysteresis (i.e. 
with conservative governor) load ramps up from none to 100% and then back down to none at 3 seconds per step (step size is uncalibrated). Processor: Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz Modes: intel_cpufreq (i_) intel_pstate (p_) acpi-cpufreq (a_) Kernel: b955f2f4a425 (HEAD -> k57-hwp) cpufreq: intel_pstate: Work in passive mode with HWP enabled faf93d5c9da1 cpufreq: intel_pstate: Use passive mode by default without HWP b9bbe6ed63b2 (tag: v5.7-rc6) Linux 5.7-rc6 Graph notes: The ramp up and ramp down are folded back on the graphs. These tests were launched manually in two terminals, so the timing was not exact. I'll fix this moving forward. Legend - ondemand graph [1] (minor annotation added): i_onde_stock : intel-cpufreq, ondemand, stock kernel (5.7-rc6), hwp disabled. i_onde_hwp : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 5000. i_onde_hwp2 : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 20000. a_onde: acpi-cpufreq, stock kernel (5.7-rc6), intel_pstate disabled. Conclusion: DELAY_HWP 20000 matches the pre-patch response almost exactly. DELAY_HWP 5000 goes too far (my opinion), but does look promising. Legend - intel_pstate - powersave graph [2]. What? Why is there such a graph, unrelated to this patch? Well, because there is a not yet understood effect. p_powe_stock : intel_pstate, powersave, stock kernel (5.7-rc6), hwp disabled. p_powe_hwp : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 5000. p_powe_hwp2 : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 20000. Conclusion: ?? Note: That I merely made a stupid mistake is a real possibility. Repeat test pending. Legend - schedutil graph [3]: i_sche_stock : intel-cpufreq, schedutil, stock kernel (5.7-rc6), hwp disabled. i_sche_hwp : intel-cpufreq, schedutil, patched kernel (5.7-rc6), DELAY_HWP 5000. i_sche_hwp2 : intel-cpufreq, schedutil, patched kernel (5.7-rc6), DELAY_HWP 20000. a_sche: acpi-cpufreq, stock kernel (5.7-rc6), intel_pstate disabled. 
Conclusion: ?? I don't know, always had schedutil issues on this computer. Expected more like typical from my older test server (i7-2600K) [4]: [1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-ondemand.png [2] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png [3] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-schedutil.png [4] http://www.smythies.com/~doug/linux/intel_pstate/cpu-freq-scalers/fixed-work-packet-folded-schedutil.png > > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 For now, while trying to prove nothing is busted, suggest 20000. > +/** > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. request. > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, > + unsigned int target_freq) > +{ > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); > + I would have thought this: s64 epp_fp = div_fp(255 * (max_freq - target_freq), (max_freq - min_freq)); but tested the original only, so far. ... Doug
"Rafael J. Wysocki" <rafael@kernel.org> writes: > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez > <francisco.jerez.plata@intel.com> wrote: >> >> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes: >> >> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> >> > >> > Allow intel_pstate to work in the passive mode with HWP enabled and >> > make it translate the target frequency supplied by the cpufreq >> > governor in use into an EPP value to be written to the HWP request >> > MSR (high frequencies are mapped to low EPP values that mean more >> > performance-oriented HWP operation) as a hint for the HWP algorithm >> > in the processor, so as to prevent it and the CPU scheduler from >> > working against each other at least when the schedutil governor is >> > in use. >> > >> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> >> > --- >> > >> > This is a prototype not intended for production use (based on linux-next). >> > >> > Please test it if you can (on HWP systems, of course) and let me know the >> > results. >> > >> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well >> > may turn out to be either too high or too low for the general use, which is one >> > reason why getting as much testing coverage as possible is key here. >> > >> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, >> > please do so and let me know the conclusions. 
>> > >> > Cheers, >> > Rafael >> > >> > --- >> > drivers/cpufreq/intel_pstate.c | 169 +++++++++++++++++++++++++++++++---------- >> > 1 file changed, 131 insertions(+), 38 deletions(-) >> > >> > Index: linux-pm/drivers/cpufreq/intel_pstate.c >> > =================================================================== >> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c >> > +++ linux-pm/drivers/cpufreq/intel_pstate.c >> > @@ -36,6 +36,7 @@ >> > #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) >> > >> > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 >> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 >> > #define INTEL_CPUFREQ_TRANSITION_DELAY 500 >> > >> > #ifdef CONFIG_ACPI >> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int >> > return div_ext_fp(percent, 100); >> > } >> > >> > +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) >> > + >> > /** >> > * struct sample - Store performance sample >> > * @core_avg_perf: Ratio of APERF/MPERF which is the actual average >> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st >> > >> > static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) >> > { >> > - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); >> > + if (hwp_active) >> > + intel_pstate_hwp_force_min_perf(policy->cpu); >> > + else >> > + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); >> > } >> > >> > static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) >> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct >> > pr_debug("CPU %d exiting\n", policy->cpu); >> > >> > intel_pstate_clear_update_util_hook(policy->cpu); >> > - if (hwp_active) { >> > + if (hwp_active) >> > intel_pstate_hwp_save_state(policy); >> > - intel_pstate_hwp_force_min_perf(policy->cpu); >> > - } else { >> > - intel_cpufreq_stop_cpu(policy); >> > - } >> > + >> > + intel_cpufreq_stop_cpu(policy); >> > } >> > >> > static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) >> > @@ -2296,7 +2300,8 @@ static int 
intel_cpufreq_verify_policy(s >> > #define INTEL_PSTATE_TRACE_TARGET 10 >> > #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 >> > >> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) >> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, >> > + int from, int to) >> > { >> > struct sample *sample; >> > >> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c >> > sample = &cpu->sample; >> > trace_pstate_sample(trace_type, >> > 0, >> > - old_pstate, >> > - cpu->pstate.current_pstate, >> > + from, >> > + to, >> > sample->mperf, >> > sample->aperf, >> > sample->tsc, >> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c >> > fp_toint(cpu->iowait_boost * 100)); >> > } >> > >> > -static int intel_cpufreq_target(struct cpufreq_policy *policy, >> > - unsigned int target_freq, >> > - unsigned int relation) >> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) >> > { >> > - struct cpudata *cpu = all_cpu_data[policy->cpu]; >> > - struct cpufreq_freqs freqs; >> > - int target_pstate, old_pstate; >> > + u64 value, prev; >> > >> > - update_turbo_state(); >> > + prev = READ_ONCE(cpu->hwp_req_cached); >> > + value = prev; >> > >> > - freqs.old = policy->cur; >> > - freqs.new = target_freq; >> > + /* >> > + * The entire MSR needs to be updated in order to update the EPP field >> > + * in it, so opportunistically update the min and max too if needed. 
>> > + */ >> > + value &= ~HWP_MIN_PERF(~0L); >> > + value |= HWP_MIN_PERF(cpu->min_perf_ratio); >> > + >> > + value &= ~HWP_MAX_PERF(~0L); >> > + value |= HWP_MAX_PERF(cpu->max_perf_ratio); >> > + >> > + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { >> > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, >> > + HWP_EPP_TO_BYTE(prev), new_epp); >> > + >> > + value &= ~GENMASK_ULL(31, 24); >> > + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); >> > + } >> > + >> > + if (value != prev) { >> > + WRITE_ONCE(cpu->hwp_req_cached, value); >> > + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); >> > + } >> > +} >> > + >> > +/** >> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. >> > + * @cpu: Target CPU. >> > + * @max_freq: Maximum frequency to consider. >> > + * @target_freq: Target frequency selected by the governor. >> > + * >> > + * Translate the target frequency into a new EPP value to be written into the >> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. >> > + * >> > + * The purpose of this is to avoid situations in which the kernel and the HWP >> > + * algorithm work against each other by giving a hint about the expectations of >> > + * the former to the latter. >> > + * >> > + * The mapping betweeen the target frequencies and the hint values need not be >> > + * exact, but it must be monotonic, so that higher target frequencies always >> > + * indicate more performance-oriented P-state selection. >> > + */ >> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, >> > + unsigned int target_freq) >> > +{ >> > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); >> > + >> > + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); >> > +} >> > + >> >> Hey Rafael, I'm building a kernel with this in order to give it a try on >> my system, but I'm skeptical that translating the target frequency to an >> EPP value will work reliably. 
AFAIA the EPP value only has an indirect >> influence on the processor's performance by adjusting the trade-off >> between its responsiveness (rather than the actual clock frequency which >> it will sustain in the long run) and its energy usage, in a largely >> unspecified and non-linear way (non-linear like the effect of switching >> CPU energy optimization features on and off, or like its effect on the >> energy balancing behavior of the processor which can have a paradoxical >> effect on performance). >> >> I doubt that the scheduling-based CPU utilization is a good predictor >> for the optimal trade-off between those two variables. > > While I agree that this is not perfect, there barely is anything else > that can be used for this purpose. > > Using the desired field or trying to adjust the limits relatively > often would basically cause the P-state selection to be driven > entirely by the kernel which simply doesn't know certain things only > known to the P-unit, so it is reasonable to leave some level of > control to the latter, so as to allow it to use the information known > to it only. > > However, if it is allowed to do whatever it likes without any hints > from the kernel, it may very well go against the scheduler's decisions > which is not going to be optimal. > > I'm simply not sure if there is any other way to give such hints to it > that through the EPP. > Why not HWP_MIN_PERF? That would leave the HWP quite some room for maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not like P-state selection would be entirely driven by the kernel), while avoiding every failure scenario I was describing in my previous reply. 
Simply adjusting the EPP doesn't really allow the governor to make any assumptions about the instantaneous performance of the processor, so it seems questionable that it is exposed as a frequency target, since its effect on frequency would be highly implementation-dependent and inconsistent with the behavior of other CPUFREQ drivers. OTOH adjusting HWP_MIN_PERF achieves precisely what the caller wants: The CPU will deliver at least the requested number of clocks per second (assuming the request is compatible with the thermal and electrical constraints of the processor), while retaining the liberty to clock up beyond that frequency if the HWP is able to react to some discontinuous event more quickly than the scheduler. > Thanks!
Hi all, The INTEL_CPUFREQ_TRANSITION_DELAY_HWP = 20000 test results from this e-mail were incorrect. The test and graphs are being re-done. On 2020.05.25 08:30 Doug Smythies wrote: > > Legend - intel_pstate - powersave graph [2]. > > What? Why is there such a graph, unrelated to this patch? > Well, because there is a not yet understood effect. > > p_powe_stock : intel_pstate, powersave, stock kernel (5.7-rc6), hwp disabled. > P_powe_hwp : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 5000. > P_powe_hwp2 : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 20000. > > Conclusion: ?? > > Note: That I merely made a stupid mistake is a real possibility. Yes, that was it. However, all DELAY_HWP 20000 tests were bad, not just this one. ... Doug
to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez <francisco.jerez.plata@intel.com> wrote: > > "Rafael J. Wysocki" <rafael@kernel.org> writes: > > > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez > > <francisco.jerez.plata@intel.com> wrote: > >> > >> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes: > >> > >> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > >> > > >> > Allow intel_pstate to work in the passive mode with HWP enabled and > >> > make it translate the target frequency supplied by the cpufreq > >> > governor in use into an EPP value to be written to the HWP request > >> > MSR (high frequencies are mapped to low EPP values that mean more > >> > performance-oriented HWP operation) as a hint for the HWP algorithm > >> > in the processor, so as to prevent it and the CPU scheduler from > >> > working against each other at least when the schedutil governor is > >> > in use. > >> > > >> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > >> > --- > >> > > >> > This is a prototype not intended for production use (based on linux-next). > >> > > >> > Please test it if you can (on HWP systems, of course) and let me know the > >> > results. > >> > > >> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > >> > may turn out to be either too high or too low for the general use, which is one > >> > reason why getting as much testing coverage as possible is key here. > >> > > >> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > >> > please do so and let me know the conclusions. 
> >> > > >> > Cheers, > >> > Rafael > >> > > >> > --- > >> > drivers/cpufreq/intel_pstate.c | 169 +++++++++++++++++++++++++++++++---------- > >> > 1 file changed, 131 insertions(+), 38 deletions(-) > >> > > >> > Index: linux-pm/drivers/cpufreq/intel_pstate.c > >> > =================================================================== > >> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c > >> > +++ linux-pm/drivers/cpufreq/intel_pstate.c > >> > @@ -36,6 +36,7 @@ > >> > #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) > >> > > >> > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 > >> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 > >> > #define INTEL_CPUFREQ_TRANSITION_DELAY 500 > >> > > >> > #ifdef CONFIG_ACPI > >> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int > >> > return div_ext_fp(percent, 100); > >> > } > >> > > >> > +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) > >> > + > >> > /** > >> > * struct sample - Store performance sample > >> > * @core_avg_perf: Ratio of APERF/MPERF which is the actual average > >> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st > >> > > >> > static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) > >> > { > >> > - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > >> > + if (hwp_active) > >> > + intel_pstate_hwp_force_min_perf(policy->cpu); > >> > + else > >> > + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); > >> > } > >> > > >> > static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) > >> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct > >> > pr_debug("CPU %d exiting\n", policy->cpu); > >> > > >> > intel_pstate_clear_update_util_hook(policy->cpu); > >> > - if (hwp_active) { > >> > + if (hwp_active) > >> > intel_pstate_hwp_save_state(policy); > >> > - intel_pstate_hwp_force_min_perf(policy->cpu); > >> > - } else { > >> > - intel_cpufreq_stop_cpu(policy); > >> > - } > >> > + > >> > + intel_cpufreq_stop_cpu(policy); > >> > } > 
>> > > >> > static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) > >> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s > >> > #define INTEL_PSTATE_TRACE_TARGET 10 > >> > #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 > >> > > >> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) > >> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, > >> > + int from, int to) > >> > { > >> > struct sample *sample; > >> > > >> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c > >> > sample = &cpu->sample; > >> > trace_pstate_sample(trace_type, > >> > 0, > >> > - old_pstate, > >> > - cpu->pstate.current_pstate, > >> > + from, > >> > + to, > >> > sample->mperf, > >> > sample->aperf, > >> > sample->tsc, > >> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c > >> > fp_toint(cpu->iowait_boost * 100)); > >> > } > >> > > >> > -static int intel_cpufreq_target(struct cpufreq_policy *policy, > >> > - unsigned int target_freq, > >> > - unsigned int relation) > >> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) > >> > { > >> > - struct cpudata *cpu = all_cpu_data[policy->cpu]; > >> > - struct cpufreq_freqs freqs; > >> > - int target_pstate, old_pstate; > >> > + u64 value, prev; > >> > > >> > - update_turbo_state(); > >> > + prev = READ_ONCE(cpu->hwp_req_cached); > >> > + value = prev; > >> > > >> > - freqs.old = policy->cur; > >> > - freqs.new = target_freq; > >> > + /* > >> > + * The entire MSR needs to be updated in order to update the EPP field > >> > + * in it, so opportunistically update the min and max too if needed. 
> >> > + */ > >> > + value &= ~HWP_MIN_PERF(~0L); > >> > + value |= HWP_MIN_PERF(cpu->min_perf_ratio); > >> > + > >> > + value &= ~HWP_MAX_PERF(~0L); > >> > + value |= HWP_MAX_PERF(cpu->max_perf_ratio); > >> > + > >> > + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { > >> > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, > >> > + HWP_EPP_TO_BYTE(prev), new_epp); > >> > + > >> > + value &= ~GENMASK_ULL(31, 24); > >> > + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); > >> > + } > >> > + > >> > + if (value != prev) { > >> > + WRITE_ONCE(cpu->hwp_req_cached, value); > >> > + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); > >> > + } > >> > +} > >> > + > >> > +/** > >> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. > >> > + * @cpu: Target CPU. > >> > + * @max_freq: Maximum frequency to consider. > >> > + * @target_freq: Target frequency selected by the governor. > >> > + * > >> > + * Translate the target frequency into a new EPP value to be written into the > >> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. > >> > + * > >> > + * The purpose of this is to avoid situations in which the kernel and the HWP > >> > + * algorithm work against each other by giving a hint about the expectations of > >> > + * the former to the latter. > >> > + * > >> > + * The mapping betweeen the target frequencies and the hint values need not be > >> > + * exact, but it must be monotonic, so that higher target frequencies always > >> > + * indicate more performance-oriented P-state selection. 
> >> > + */ > >> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, > >> > + unsigned int target_freq) > >> > +{ > >> > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); > >> > + > >> > + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); > >> > +} > >> > + > >> > >> Hey Rafael, I'm building a kernel with this in order to give it a try on > >> my system, but I'm skeptical that translating the target frequency to an > >> EPP value will work reliably. AFAIA the EPP value only has an indirect > >> influence on the processor's performance by adjusting the trade-off > >> between its responsiveness (rather than the actual clock frequency which > >> it will sustain in the long run) and its energy usage, in a largely > >> unspecified and non-linear way (non-linear like the effect of switching > >> CPU energy optimization features on and off, or like its effect on the > >> energy balancing behavior of the processor which can have a paradoxical > >> effect on performance). > >> > >> I doubt that the scheduling-based CPU utilization is a good predictor > >> for the optimal trade-off between those two variables. > > > > While I agree that this is not perfect, there barely is anything else > > that can be used for this purpose. > > > > Using the desired field or trying to adjust the limits relatively > > often would basically cause the P-state selection to be driven > > entirely by the kernel which simply doesn't know certain things only > > known to the P-unit, so it is reasonable to leave some level of > > control to the latter, so as to allow it to use the information known > > to it only. > > > > However, if it is allowed to do whatever it likes without any hints > > from the kernel, it may very well go against the scheduler's decisions > > which is not going to be optimal. > > > > I'm simply not sure if there is any other way to give such hints to it > > that through the EPP. > > > > Why not HWP_MIN_PERF? 
That would leave the HWP quite some room for > maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not > like P-state selection would be entirely driven by the kernel), while > avoiding every failure scenario I was describing in my previous reply. Actually, I have been thinking about the HWP min as an alternative that may be worth evaluating. However, I would rather set the HWP min to something like 80% of the cpufreq request. Let me cut an alternative patch.
On 2020.05.26 01:19 Rafael J. Wysocki wrote: > to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez > > "Rafael J. Wysocki" <rafael@kernel.org> writes: > > > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez > > > > Why not HWP_MIN_PERF? That would leave the HWP quite some room for > > maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not > > like P-state selection would be entirely driven by the kernel), while > > avoiding every failure scenario I was describing in my previous reply. I have re-done my tests. The problem that I observe seems specific to hwp itself and not this patch and its use in passive mode. I see the exact same thing with intel_pstate/powersave. [1] detail A. Test: still simple single-threaded load sweep, at 347 hertz work/sleep frequency. What do I see? Unexpected frequency drops at around 70% load. Example, from trace: First, the thing has been going for a while at 4.8 GHz. Old epp ; new epp ; freq GHz; load % ; duration mS 80 ; 82 ; 4.57 ; 61.94 ; 20.001 82 ; 80 ; 4.57 ; 62.47 ; 40.003 80 ; 44 ; 3.73 ; 68.63 ; 62.009 <<<< What? Why freq down? Why long duration? 44 ; 0 ; 1.96 ; 100.23 ; 19.995 <<<< Even lower freq. load overruns. 0 ; 73 ; 4.56 ; 82.93 ; 40.07 <<<< O.K. recovered, but damage done. 73 ; 46 ; 2.36 ; 79.19 ; 20.94 <<< now things oscillate a little. 46 ; 0 ; 1.9884 ; 100.24 ; 20.99 0 ; 75 ; 4.5624 ; 82.1 ; 41.002 <<< Event ends. Next event in 487 milliseconds. Observation: Events are often, but not always, preceded by a longer than normal duration. However, long durations are also not out of the ordinary in passive mode. And yes, the above trace was with DELAY_HWP 20,000, but I do have trace examples with it at 5,000. This was just a particularly good example. Observation (from looking at a lot of trace data): There are phase delays between the two systems, intel_cpufreq and hwp, and sometimes they seem to oscillate a little and fight each other. 
There may be some problematic work/sleep frequencies where the oscillation builds into a full-blown resonance. Why does hwp drop the frequency? This system is otherwise fairly idle, so maybe because the pll drops down during the non work periods. Maybe HWP thinks the system is idle and drops the frequency. I can eliminate the overruns by disabling deep idle states such that the PLL vote is never relinquished, but it's not a fair test. Note that the above response can be "tuned". If we take the conversion algorithm from target frequency to EPP and introduce an offset, the above can be improved. At what cost? More sluggishness, for a large positive offset. So, the overruns just move from the steady state side of the task to when the task starts. I did not find if there is a "sweet spot" between offset and system response, and I do not think there is value added in trying. Note: With original settings, I rarely observe a problem with the step function response to a new task. > > Actually, I have been thinking about the HWP min as an alternative > that may be worth evaluating. > > However, I would rather set the HWP min to something like 80% if the > cpufreq request. Yes, this is a good idea and should not suffer from the two servo systems fighting each other. I got 0 overruns, versus 2240 overruns with no min limitation (100 second test). As for INTEL_CPUFREQ_TRANSITION_DELAY_HWP, I'll probably use 10 milliseconds moving forward, because that is what I am most familiar with from years ago work on this driver. But, I did not observe any issue with 5 milliseconds. [1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png Other replaced graphs: http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-ondemand.png http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-schedutil.png ... Doug
On Tue, May 26, 2020 at 5:51 PM Doug Smythies <dsmythies@telus.net> wrote: > > On 2020.05.26 01:19 Rafael J. Wysocki wrote: > > to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez > > > "Rafael J. Wysocki" <rafael@kernel.org> writes: > > > > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez > > > > > > Why not HWP_MIN_PERF? That would leave the HWP quite some room for > > > maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not > > > like P-state selection would be entirely driven by the kernel), while > > > avoiding every failure scenario I was describing in my previous reply. > > I have re-done my tests. > The problem that I observe seems specific to hwp itself > and not this patch and it's use in passive mode. > I see the exact same thing with intel_pstate/powersave. > [1] detail A. > > > Test: still simple single threaded load sweep, > at 347 hertz work/sleep frequency. > What do I see? > > Unexpected frequency drops at around 70% load. > Example, from trace: > > First, the thing has been going for awhile at 4.8 GHz. > > Old epp ; new epp ; freq GHz; load % ; duration mS > 80 ; 82 ; 4.57 ; 61.94 ; 20.001 > 82 ; 80 ; 4.57 ; 62.47 ; 40.003 > 80 ; 44 ; 3.73 ; 68.63 ; 62.009 <<<< What? Why freq down? Why long duration? > 44 ; 0 ; 1.96 ; 100.23 ; 19.995 <<<< Even lower freq. load overruns. > 0 ; 73 ; 4.56 ; 82.93 ; 40.07 <<<< O.K. recovered, but damage done. > 73 ; 46 ; 2.36 ; 79.19 ; 20.94 <<< now things oscillate a little. > 46 ; 0 ; 1.9884 ; 100.24 ; 20.99 > 0 ; 75 ; 4.5624 ; 82.1 ; 41.002 <<< Event ends. Next event in 487 milliseconds. > > Observation: Events are often, but not always, preceded by a longer than normal duration. > However, long durations are also not out of the ordinary in passive mode. > > And yes, the above trace was with DELAY_HWP 20,000, but I do have trace examples > with it at 5,000. This was just a particularly good example. 
> > Observation (from looking at a lot of trace data): There are phase delays > between the two systems, intel_cpufreq and hwp, and sometimes they seem to > oscillate a little and fight each other. There maybe some problematic > work/sleep frequencies where the oscillation builds into a full blown > resonance. > > Why does hwp drop the frequency? > > This system is otherwise fairly idle, > so maybe because the pll drops down during the non work periods. > > Maybe HWP thinks the system is idle and drops the frequency. > I can eliminate the overruns by disabling deep idle states such > that the PLL vote is never relinquished, but it's not a fair test. > > Note that the above response can be "tuned". > If we take the conversation algorithm from target frequency to EPP > and introduce and offset, the above can be improved. > > At what cost? More sluggishness, for a large positive offset. > So, the overruns just move from the steady state side of the task to > when the task starts. I did not find if there is a "sweet spot" > between offset and system response, and I do not think there is value > added in trying. > > Note: With original settings, I rarely observe a problem with the step > function response to a new task. > > > > > Actually, I have been thinking about the HWP min as an alternative > > that may be worth evaluating. > > > > However, I would rather set the HWP min to something like 80% if the > > cpufreq request. > > Yes, this is a good idea and should not suffer from the two servo systems > fighting each other. OK, thanks for the feedback! I am about to post this patch. > I got 0 overruns, verses 2240 overruns with no min limitation (100 second test). > > As for INTEL_CPUFREQ_TRANSITION_DELAY_HWP, I'll probably use > 10 milliseconds moving forward, because that is what I am most > familiar with from years ago work on the this driver. But, I did > not observe any issue with 5 milliseconds. 
I'm going to use INTEL_CPUFREQ_TRANSITION_DELAY_HWP in the new patch for now then. > [1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png Thanks!
"Rafael J. Wysocki" <rafael@kernel.org> writes: > to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez > <francisco.jerez.plata@intel.com> wrote: >> >> "Rafael J. Wysocki" <rafael@kernel.org> writes: >> >> > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez >> > <francisco.jerez.plata@intel.com> wrote: >> >> >> >> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes: >> >> >> >> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> >> >> > >> >> > Allow intel_pstate to work in the passive mode with HWP enabled and >> >> > make it translate the target frequency supplied by the cpufreq >> >> > governor in use into an EPP value to be written to the HWP request >> >> > MSR (high frequencies are mapped to low EPP values that mean more >> >> > performance-oriented HWP operation) as a hint for the HWP algorithm >> >> > in the processor, so as to prevent it and the CPU scheduler from >> >> > working against each other at least when the schedutil governor is >> >> > in use. >> >> > >> >> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> >> >> > --- >> >> > >> >> > This is a prototype not intended for production use (based on linux-next). >> >> > >> >> > Please test it if you can (on HWP systems, of course) and let me know the >> >> > results. >> >> > >> >> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well >> >> > may turn out to be either too high or too low for the general use, which is one >> >> > reason why getting as much testing coverage as possible is key here. >> >> > >> >> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, >> >> > please do so and let me know the conclusions. 
>> >> > >> >> > Cheers, >> >> > Rafael >> >> > >> >> > --- >> >> > drivers/cpufreq/intel_pstate.c | 169 +++++++++++++++++++++++++++++++---------- >> >> > 1 file changed, 131 insertions(+), 38 deletions(-) >> >> > >> >> > Index: linux-pm/drivers/cpufreq/intel_pstate.c >> >> > =================================================================== >> >> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c >> >> > +++ linux-pm/drivers/cpufreq/intel_pstate.c >> >> > @@ -36,6 +36,7 @@ >> >> > #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) >> >> > >> >> > #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 >> >> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 >> >> > #define INTEL_CPUFREQ_TRANSITION_DELAY 500 >> >> > >> >> > #ifdef CONFIG_ACPI >> >> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int >> >> > return div_ext_fp(percent, 100); >> >> > } >> >> > >> >> > +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) >> >> > + >> >> > /** >> >> > * struct sample - Store performance sample >> >> > * @core_avg_perf: Ratio of APERF/MPERF which is the actual average >> >> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st >> >> > >> >> > static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) >> >> > { >> >> > - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); >> >> > + if (hwp_active) >> >> > + intel_pstate_hwp_force_min_perf(policy->cpu); >> >> > + else >> >> > + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); >> >> > } >> >> > >> >> > static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) >> >> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct >> >> > pr_debug("CPU %d exiting\n", policy->cpu); >> >> > >> >> > intel_pstate_clear_update_util_hook(policy->cpu); >> >> > - if (hwp_active) { >> >> > + if (hwp_active) >> >> > intel_pstate_hwp_save_state(policy); >> >> > - intel_pstate_hwp_force_min_perf(policy->cpu); >> >> > - } else { >> >> > - intel_cpufreq_stop_cpu(policy); >> >> > - } >> >> > + 
>> >> > + intel_cpufreq_stop_cpu(policy); >> >> > } >> >> > >> >> > static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) >> >> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s >> >> > #define INTEL_PSTATE_TRACE_TARGET 10 >> >> > #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 >> >> > >> >> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) >> >> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, >> >> > + int from, int to) >> >> > { >> >> > struct sample *sample; >> >> > >> >> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c >> >> > sample = &cpu->sample; >> >> > trace_pstate_sample(trace_type, >> >> > 0, >> >> > - old_pstate, >> >> > - cpu->pstate.current_pstate, >> >> > + from, >> >> > + to, >> >> > sample->mperf, >> >> > sample->aperf, >> >> > sample->tsc, >> >> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c >> >> > fp_toint(cpu->iowait_boost * 100)); >> >> > } >> >> > >> >> > -static int intel_cpufreq_target(struct cpufreq_policy *policy, >> >> > - unsigned int target_freq, >> >> > - unsigned int relation) >> >> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) >> >> > { >> >> > - struct cpudata *cpu = all_cpu_data[policy->cpu]; >> >> > - struct cpufreq_freqs freqs; >> >> > - int target_pstate, old_pstate; >> >> > + u64 value, prev; >> >> > >> >> > - update_turbo_state(); >> >> > + prev = READ_ONCE(cpu->hwp_req_cached); >> >> > + value = prev; >> >> > >> >> > - freqs.old = policy->cur; >> >> > - freqs.new = target_freq; >> >> > + /* >> >> > + * The entire MSR needs to be updated in order to update the EPP field >> >> > + * in it, so opportunistically update the min and max too if needed. 
>> >> > + */ >> >> > + value &= ~HWP_MIN_PERF(~0L); >> >> > + value |= HWP_MIN_PERF(cpu->min_perf_ratio); >> >> > + >> >> > + value &= ~HWP_MAX_PERF(~0L); >> >> > + value |= HWP_MAX_PERF(cpu->max_perf_ratio); >> >> > + >> >> > + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { >> >> > + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, >> >> > + HWP_EPP_TO_BYTE(prev), new_epp); >> >> > + >> >> > + value &= ~GENMASK_ULL(31, 24); >> >> > + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); >> >> > + } >> >> > + >> >> > + if (value != prev) { >> >> > + WRITE_ONCE(cpu->hwp_req_cached, value); >> >> > + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); >> >> > + } >> >> > +} >> >> > + >> >> > +/** >> >> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. >> >> > + * @cpu: Target CPU. >> >> > + * @max_freq: Maximum frequency to consider. >> >> > + * @target_freq: Target frequency selected by the governor. >> >> > + * >> >> > + * Translate the target frequency into a new EPP value to be written into the >> >> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. >> >> > + * >> >> > + * The purpose of this is to avoid situations in which the kernel and the HWP >> >> > + * algorithm work against each other by giving a hint about the expectations of >> >> > + * the former to the latter. >> >> > + * >> >> > + * The mapping betweeen the target frequencies and the hint values need not be >> >> > + * exact, but it must be monotonic, so that higher target frequencies always >> >> > + * indicate more performance-oriented P-state selection. 
>> >> > + */ >> >> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, >> >> > + unsigned int target_freq) >> >> > +{ >> >> > + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); >> >> > + >> >> > + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); >> >> > +} >> >> > + >> >> >> >> Hey Rafael, I'm building a kernel with this in order to give it a try on >> >> my system, but I'm skeptical that translating the target frequency to an >> >> EPP value will work reliably. AFAIA the EPP value only has an indirect >> >> influence on the processor's performance by adjusting the trade-off >> >> between its responsiveness (rather than the actual clock frequency which >> >> it will sustain in the long run) and its energy usage, in a largely >> >> unspecified and non-linear way (non-linear like the effect of switching >> >> CPU energy optimization features on and off, or like its effect on the >> >> energy balancing behavior of the processor which can have a paradoxical >> >> effect on performance). >> >> >> >> I doubt that the scheduling-based CPU utilization is a good predictor >> >> for the optimal trade-off between those two variables. >> > >> > While I agree that this is not perfect, there barely is anything else >> > that can be used for this purpose. >> > >> > Using the desired field or trying to adjust the limits relatively >> > often would basically cause the P-state selection to be driven >> > entirely by the kernel which simply doesn't know certain things only >> > known to the P-unit, so it is reasonable to leave some level of >> > control to the latter, so as to allow it to use the information known >> > to it only. >> > >> > However, if it is allowed to do whatever it likes without any hints >> > from the kernel, it may very well go against the scheduler's decisions >> > which is not going to be optimal. >> > >> > I'm simply not sure if there is any other way to give such hints to it >> > that through the EPP. 
>> > >> >> Why not HWP_MIN_PERF? That would leave the HWP quite some room for >> maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not >> like P-state selection would be entirely driven by the kernel), while >> avoiding every failure scenario I was describing in my previous reply. > > Actually, I have been thinking about the HWP min as an alternative > that may be worth evaluating. > > However, I would rather set the HWP min to something like 80% if the > cpufreq request. > > Let me cut an alternative patch. Heh... Good. While you're at it, it would be nice to make sure we have a way for the governor to control both the minimum and maximum P-state in some efficient way, rather than it simply providing a single target, with [1] in mind. [1] https://lwn.net/Articles/818827/
On 2020.05.21 10:16 Rafael J. Wysocki wrote: > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > > Allow intel_pstate to work in the passive mode with HWP enabled and > make it translate the target frequency supplied by the cpufreq > governor in use into an EPP value to be written to the HWP request > MSR (high frequencies are mapped to low EPP values that mean more > performance-oriented HWP operation) as a hint for the HWP algorithm > in the processor, so as to prevent it and the CPU scheduler from > working against each other at least when the schedutil governor is > in use. > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> > --- > > This is a prototype not intended for production use (based on linux-next). > > Please test it if you can (on HWP systems, of course) and let me know the > results. > > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > may turn out to be either too high or too low for the general use, which is one > reason why getting as much testing coverage as possible is key here. > > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > please do so and let me know the conclusions. > > Cheers, > Rafael To anyone trying this patch: You will need to monitor EPP (Energy Performance Preference) carefully. It changes as a function of passive/active, and if you booted active or passive or no-hwp and changed later. Originally, I was not specifically monitoring EPP, or paths taken since boot towards driver, intel_pstate or intel_cpufreq, and governor, and will now have to set aside test results. @Rafael: I am still having problems with my test computer and HWP. However, I can observe the energy saving potential of this "passive-yet-active HWP mode". At this point, I am actually trying to make my newer test computer simply behave and do what it is told with respect to CPU frequency scaling, because even acpi-cpufreq misbehaves for performance governor under some conditions [1]. 
[1] https://marc.info/?l=linux-pm&m=159155067328641&w=2 To my way of thinking: 1.) it is imperative that we be able to decouple the governor servo from the processor servo. At a minimum this is needed for system testing, debugging and reference baselines. At a maximum users could, perhaps, decide for themselves. Myself, I would prefer "passive" to mean "do what you have been told", and that is now what I am testing. 2.) I have always thought, indeed relied on, performance mode as being more than a hint. For my older i7-2600K it never disobeyed orders, except for the most minuscule of workloads. This newer i5-9600K seems to have a mind of its own which I would like to be able to turn off, yet still be able to use intel_pstate trace with schedutil. Recall last week I said > moving forward the typical CPU frequency scaling > configuration for my test system will be: > > driver: intel-cpufreq, forced at boot. > governor: schedutil > hwp: forced off at boot. The problem is that baseline references are still needed and performance mode is unreliable. Maybe other stuff also, I simply don't know at this point. Example of EPP changing (no need to read on) (from fresh boot): Current EPP: root@s18:/home/doug# rdmsr --bitfield 31:24 -u -a 0x774 128 128 128 128 128 128 root@s18:/home/doug# grep . 
/sys/devices/system/cpu/cpu3/cpufreq/* /sys/devices/system/cpu/cpu3/cpufreq/affected_cpus:3 /sys/devices/system/cpu/cpu3/cpufreq/base_frequency:3700000 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq:4600000 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_min_freq:800000 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_transition_latency:0 /sys/devices/system/cpu/cpu3/cpufreq/energy_performance_available_preferences:default performance balance_performance balance_power power /sys/devices/system/cpu/cpu3/cpufreq/energy_performance_preference:balance_performance /sys/devices/system/cpu/cpu3/cpufreq/related_cpus:3 /sys/devices/system/cpu/cpu3/cpufreq/scaling_available_governors:performance powersave /sys/devices/system/cpu/cpu3/cpufreq/scaling_cur_freq:800102 /sys/devices/system/cpu/cpu3/cpufreq/scaling_driver:intel_pstate /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor:powersave /sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:4600000 /sys/devices/system/cpu/cpu3/cpufreq/scaling_min_freq:800000 /sys/devices/system/cpu/cpu3/cpufreq/scaling_setspeed:<unsupported> Now, switch to passive mode: echo passive > /sys/devices/system/cpu/intel_pstate/status And observe EPP: root@s18:/home/doug# rdmsr --bitfield 31:24 -u -a 0x774 255 255 255 255 255 255 root@s18:/home/doug# grep . /sys/devices/system/cpu/cpu3/cpufreq/* /sys/devices/system/cpu/cpu3/cpufreq/affected_cpus:3 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq:4600000 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_min_freq:800000 /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_transition_latency:20000 /sys/devices/system/cpu/cpu3/cpufreq/related_cpus:3 /sys/devices/system/cpu/cpu3/cpufreq/scaling_available_governors:conservative ondemand userspace powersave performance schedutil Hey, where did the ability to adjust the energy_performance_preference setting go? 
/sys/devices/system/cpu/cpu3/cpufreq/scaling_cur_freq:3400313 /sys/devices/system/cpu/cpu3/cpufreq/scaling_driver:intel_cpufreq /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor:performance /sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:4600000 /sys/devices/system/cpu/cpu3/cpufreq/scaling_min_freq:800000 /sys/devices/system/cpu/cpu3/cpufreq/scaling_setspeed:<unsupported> Kernel is 5.7 +plus this patch: root@s18:/home/doug# uname -a Linux s18 5.7.0-hwp10 #786 SMP PREEMPT Tue Jun 9 20:15:18 PDT 2020 x86_64 x86_64 x86_64 GNU/Linux 223e5c33f927 (HEAD -> k57-doug-hwp) cpufreq: intel_pstate: Accept passive mode with HWP enabled 5d890a14763d cpufreq: intel_pstate: Use passive mode by default without HWP 3d77e6a8804a (tag: v5.7) Linux 5.7 The below is on top of this patch, and is how I am attempting to move forward: diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4ab8bc1476c9..6c28ec49b192 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2331,33 +2331,32 @@ static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u32 min_perf) value |= HWP_MIN_PERF(min_perf); /* - * The entire MSR needs to be updated in order to update the HWP min - * field in it, so opportunistically update the max too if needed. + * the max also... */ value &= ~HWP_MAX_PERF(~0L); - value |= HWP_MAX_PERF(cpu->max_perf_ratio); + value |= HWP_MAX_PERF(min_perf); if (value != prev) wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); } /** - * intel_cpufreq_adjust_hwp - Adjust the HWP reuqest register. + * intel_cpufreq_adjust_hwp - Adjust the HWP request register. * @cpu: Target CPU. * @target_pstate: P-state corresponding to the target frequency. * - * Set the HWP minimum performance limit to 75% of @target_pstate taking the + * Set the HWP minimum performance limit to @target_pstate taking the * global min and max policy limits into account. 
* - * The purpose of this is to avoid situations in which the kernel and the HWP - * algorithm work against each other by giving a hint about the expectations of - * the former to the latter. + * The purpose of this is to force the slave (passive) servo to do what + * it has been told, not what ever it wants. + * This NOT a hint. EPP (responsiveness) is managed from elsewhere. */ static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate) { u32 min_perf; - min_perf = max_t(u32, (3 * target_pstate) / 4, cpu->min_perf_ratio); + min_perf = max_t(u32, target_pstate, cpu->min_perf_ratio); min_perf = min_t(u32, min_perf, cpu->max_perf_ratio); if (min_perf != cpu->pstate.current_pstate) { cpu->pstate.current_pstate = min_perf; ... Doug ... [deleted] ...
On 2020.05.21 10:16 Rafael J. Wysocki wrote: ... > > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well > may turn out to be either too high or too low for the general use, which is one > reason why getting as much testing coverage as possible is key here. > > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values, > please do so and let me know the conclusions. ... Hi Rafael, Since my v7 reply the other day about results for 347 Hz work/sleep frequency periodic workflows [1], I have been testing at 73 Hertz work/sleep frequency. And am now testing this: -#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 10000 Which shows promise for the schedutil governor, which no longer drives prematurely into the turbo range, consuming excessive amounts of energy as compared to passive without HWP. The ondemand governor is still very bad. The only way I have found to get it to behave is to force max=min=target_pstate, in addition to the above. It will be quite some time before I have well organized feedback. In the meantime if anyone has suggestions for an optimal INTEL_CPUFREQ_TRANSITION_DELAY_HWP value, I would be grateful. Otherwise I'll attempt to find it via tests. ... Doug [1] https://marc.info/?l=linux-pm&m=159769839401767&w=2
Index: linux-pm/drivers/cpufreq/intel_pstate.c =================================================================== --- linux-pm.orig/drivers/cpufreq/intel_pstate.c +++ linux-pm/drivers/cpufreq/intel_pstate.c @@ -36,6 +36,7 @@ #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 #define INTEL_CPUFREQ_TRANSITION_DELAY 500 #ifdef CONFIG_ACPI @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int return div_ext_fp(percent, 100); } +#define HWP_EPP_TO_BYTE(x) (((u64)x >> 24) & 0xFF) + /** * struct sample - Store performance sample * @core_avg_perf: Ratio of APERF/MPERF which is the actual average @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) { - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); + if (hwp_active) + intel_pstate_hwp_force_min_perf(policy->cpu); + else + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); } static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct pr_debug("CPU %d exiting\n", policy->cpu); intel_pstate_clear_update_util_hook(policy->cpu); - if (hwp_active) { + if (hwp_active) intel_pstate_hwp_save_state(policy); - intel_pstate_hwp_force_min_perf(policy->cpu); - } else { - intel_cpufreq_stop_cpu(policy); - } + + intel_cpufreq_stop_cpu(policy); } static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s #define INTEL_PSTATE_TRACE_TARGET 10 #define INTEL_PSTATE_TRACE_FAST_SWITCH 90 -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, + int from, int to) { struct sample *sample; @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c sample = &cpu->sample; 
trace_pstate_sample(trace_type, 0, - old_pstate, - cpu->pstate.current_pstate, + from, + to, sample->mperf, sample->aperf, sample->tsc, @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c fp_toint(cpu->iowait_boost * 100)); } -static int intel_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp) { - struct cpudata *cpu = all_cpu_data[policy->cpu]; - struct cpufreq_freqs freqs; - int target_pstate, old_pstate; + u64 value, prev; - update_turbo_state(); + prev = READ_ONCE(cpu->hwp_req_cached); + value = prev; - freqs.old = policy->cur; - freqs.new = target_freq; + /* + * The entire MSR needs to be updated in order to update the EPP field + * in it, so opportunistically update the min and max too if needed. + */ + value &= ~HWP_MIN_PERF(~0L); + value |= HWP_MIN_PERF(cpu->min_perf_ratio); + + value &= ~HWP_MAX_PERF(~0L); + value |= HWP_MAX_PERF(cpu->max_perf_ratio); + + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, + HWP_EPP_TO_BYTE(prev), new_epp); + + value &= ~GENMASK_ULL(31, 24); + value |= HWP_ENERGY_PERF_PREFERENCE(new_epp); + } + + if (value != prev) { + WRITE_ONCE(cpu->hwp_req_cached, value); + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); + } +} + +/** + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register. + * @cpu: Target CPU. + * @max_freq: Maximum frequency to consider. + * @target_freq: Target frequency selected by the governor. + * + * Translate the target frequency into a new EPP value to be written into the + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection. + * + * The purpose of this is to avoid situations in which the kernel and the HWP + * algorithm work against each other by giving a hint about the expectations of + * the former to the latter. 
+ * + * The mapping betweeen the target frequencies and the hint values need not be + * exact, but it must be monotonic, so that higher target frequencies always + * indicate more performance-oriented P-state selection. + */ +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq, + unsigned int target_freq) +{ + s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq); + + intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp)); +} + +static int intel_cpufreq_adjust_pstate(struct cpudata *cpu, + unsigned int target_freq, + unsigned int relation) +{ + int old_pstate = cpu->pstate.current_pstate; + int target_pstate; - cpufreq_freq_transition_begin(policy, &freqs); switch (relation) { case CPUFREQ_RELATION_L: - target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); + target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); break; case CPUFREQ_RELATION_H: - target_pstate = freqs.new / cpu->pstate.scaling; + target_pstate = target_freq / cpu->pstate.scaling; break; default: - target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); + target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling); break; } target_pstate = intel_pstate_prepare_request(cpu, target_pstate); - old_pstate = cpu->pstate.current_pstate; - if (target_pstate != cpu->pstate.current_pstate) { + if (target_pstate != old_pstate) { cpu->pstate.current_pstate = target_pstate; - wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, target_pstate)); } - freqs.new = target_pstate * cpu->pstate.scaling; - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate); + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate, target_pstate); + return target_pstate * cpu->pstate.scaling; +} + +static int intel_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct 
cpufreq_freqs freqs; + + update_turbo_state(); + + freqs.old = policy->cur; + freqs.new = target_freq; + + cpufreq_freq_transition_begin(policy, &freqs); + + if (hwp_active) + intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq, + target_freq); + else + freqs.new = intel_cpufreq_adjust_pstate(cpu, target_freq, relation); + cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -2365,11 +2440,18 @@ static unsigned int intel_cpufreq_fast_s update_turbo_state(); + if (hwp_active) { + intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq, + target_freq); + return target_freq; + } + target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); target_pstate = intel_pstate_prepare_request(cpu, target_pstate); old_pstate = cpu->pstate.current_pstate; intel_pstate_update_pstate(cpu, target_pstate); - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate, + target_pstate); return target_pstate * cpu->pstate.scaling; } @@ -2389,7 +2471,6 @@ static int intel_cpufreq_cpu_init(struct return ret; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; - policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; /* This reflects the intel_pstate_get_cpu_pstates() setting. 
*/ policy->cur = policy->cpuinfo.min_freq; @@ -2401,10 +2482,19 @@ static int intel_cpufreq_cpu_init(struct cpu = all_cpu_data[policy->cpu]; - if (hwp_active) + if (hwp_active) { + u64 value; + + rdmsrl_on_cpu(policy->cpu, MSR_HWP_REQUEST, &value); + WRITE_ONCE(cpu->hwp_req_cached, value); + cpu->epp_saved = HWP_EPP_TO_BYTE(value); + intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state); - else + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; + } else { turbo_max = cpu->pstate.turbo_pstate; + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; + } min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); min_freq *= cpu->pstate.scaling; @@ -2449,6 +2539,13 @@ static int intel_cpufreq_cpu_exit(struct freq_qos_remove_request(req); kfree(req); + if (hwp_active) { + struct cpudata *cpu = all_cpu_data[policy->cpu]; + + /* Restore the original HWP EPP value. */ + intel_cpufreq_update_hwp_request(cpu, cpu->epp_saved); + } + return intel_pstate_cpu_exit(policy); } @@ -2505,9 +2602,6 @@ static int intel_pstate_register_driver( static int intel_pstate_unregister_driver(void) { - if (hwp_active) - return -EBUSY; - cpufreq_unregister_driver(intel_pstate_driver); intel_pstate_driver_cleanup(); @@ -2815,12 +2909,11 @@ static int __init intel_pstate_setup(cha if (!str) return -EINVAL; - if (!strcmp(str, "disable")) { + if (!strcmp(str, "disable")) no_load = 1; - } else if (!strcmp(str, "passive")) { + else if (!strcmp(str, "passive")) default_driver = &intel_cpufreq; - no_hwp = 1; - } + if (!strcmp(str, "no_hwp")) { pr_info("HWP disabled\n"); no_hwp = 1;