diff mbox series

[RFC/RFT] cpufreq: intel_pstate: Work in passive mode with HWP enabled

Message ID 3169564.ZRsPWhXyMD@kreacher (mailing list archive)
State RFC, archived
Headers show
Series [RFC/RFT] cpufreq: intel_pstate: Work in passive mode with HWP enabled | expand

Commit Message

Rafael J. Wysocki May 21, 2020, 5:15 p.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Allow intel_pstate to work in the passive mode with HWP enabled and
make it translate the target frequency supplied by the cpufreq
governor in use into an EPP value to be written to the HWP request
MSR (high frequencies are mapped to low EPP values that mean more
performance-oriented HWP operation) as a hint for the HWP algorithm
in the processor, so as to prevent it and the CPU scheduler from
working against each other at least when the schedutil governor is
in use.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---

This is a prototype not intended for production use (based on linux-next).

Please test it if you can (on HWP systems, of course) and let me know the
results.

The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
may turn out to be either too high or too low for the general use, which is one
reason why getting as much testing coverage as possible is key here.

If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
please do so and let me know the conclusions.

Cheers,
Rafael

---
 drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
 1 file changed, 131 insertions(+), 38 deletions(-)

Comments

Francisco Jerez May 25, 2020, 1:36 a.m. UTC | #1
"Rafael J. Wysocki" <rjw@rjwysocki.net> writes:

> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> Allow intel_pstate to work in the passive mode with HWP enabled and
> make it translate the target frequency supplied by the cpufreq
> governor in use into an EPP value to be written to the HWP request
> MSR (high frequencies are mapped to low EPP values that mean more
> performance-oriented HWP operation) as a hint for the HWP algorithm
> in the processor, so as to prevent it and the CPU scheduler from
> working against each other at least when the schedutil governor is
> in use.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
>
> This is a prototype not intended for production use (based on linux-next).
>
> Please test it if you can (on HWP systems, of course) and let me know the
> results.
>
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> may turn out to be either too high or too low for the general use, which is one
> reason why getting as much testing coverage as possible is key here.
>
> If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> please do so and let me know the conclusions.
>
> Cheers,
> Rafael
>
> ---
>  drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
>  1 file changed, 131 insertions(+), 38 deletions(-)
>
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -36,6 +36,7 @@
>  #define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
>  
>  #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
> +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000
>  #define INTEL_CPUFREQ_TRANSITION_DELAY		500
>  
>  #ifdef CONFIG_ACPI
> @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int
>  	return div_ext_fp(percent, 100);
>  }
>  
> +#define HWP_EPP_TO_BYTE(x)	(((u64)x >> 24) & 0xFF)
> +
>  /**
>   * struct sample -	Store performance sample
>   * @core_avg_perf:	Ratio of APERF/MPERF which is the actual average
> @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st
>  
>  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>  {
> -	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> +	if (hwp_active)
> +		intel_pstate_hwp_force_min_perf(policy->cpu);
> +	else
> +		intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>  }
>  
>  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
> @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct
>  	pr_debug("CPU %d exiting\n", policy->cpu);
>  
>  	intel_pstate_clear_update_util_hook(policy->cpu);
> -	if (hwp_active) {
> +	if (hwp_active)
>  		intel_pstate_hwp_save_state(policy);
> -		intel_pstate_hwp_force_min_perf(policy->cpu);
> -	} else {
> -		intel_cpufreq_stop_cpu(policy);
> -	}
> +
> +	intel_cpufreq_stop_cpu(policy);
>  }
>  
>  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s
>  #define	INTEL_PSTATE_TRACE_TARGET 10
>  #define	INTEL_PSTATE_TRACE_FAST_SWITCH 90
>  
> -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
> +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
> +				int from, int to)
>  {
>  	struct sample *sample;
>  
> @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c
>  	sample = &cpu->sample;
>  	trace_pstate_sample(trace_type,
>  		0,
> -		old_pstate,
> -		cpu->pstate.current_pstate,
> +		from,
> +		to,
>  		sample->mperf,
>  		sample->aperf,
>  		sample->tsc,
> @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c
>  		fp_toint(cpu->iowait_boost * 100));
>  }
>  
> -static int intel_cpufreq_target(struct cpufreq_policy *policy,
> -				unsigned int target_freq,
> -				unsigned int relation)
> +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
>  {
> -	struct cpudata *cpu = all_cpu_data[policy->cpu];
> -	struct cpufreq_freqs freqs;
> -	int target_pstate, old_pstate;
> +	u64 value, prev;
>  
> -	update_turbo_state();
> +	prev = READ_ONCE(cpu->hwp_req_cached);
> +	value = prev;
>  
> -	freqs.old = policy->cur;
> -	freqs.new = target_freq;
> +	/*
> +	 * The entire MSR needs to be updated in order to update the EPP field
> +	 * in it, so opportunistically update the min and max too if needed.
> +	 */
> +	value &= ~HWP_MIN_PERF(~0L);
> +	value |= HWP_MIN_PERF(cpu->min_perf_ratio);
> +
> +	value &= ~HWP_MAX_PERF(~0L);
> +	value |= HWP_MAX_PERF(cpu->max_perf_ratio);
> +
> +	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> +		intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
> +				    HWP_EPP_TO_BYTE(prev), new_epp);
> +
> +		value &= ~GENMASK_ULL(31, 24);
> +		value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
> +	}
> +
> +	if (value != prev) {
> +		WRITE_ONCE(cpu->hwp_req_cached, value);
> +		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
> +	}
> +}
> +
> +/**
> + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
> + * @cpu: Target CPU.
> + * @max_freq: Maximum frequency to consider.
> + * @target_freq: Target frequency selected by the governor.
> + *
> + * Translate the target frequency into a new EPP value to be written into the
> + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
> + *
> + * The purpose of this is to avoid situations in which the kernel and the HWP
> + * algorithm work against each other by giving a hint about the expectations of
> + * the former to the latter.
> + *
> + * The mapping betweeen the target frequencies and the hint values need not be
> + * exact, but it must be monotonic, so that higher target frequencies always
> + * indicate more performance-oriented P-state selection.
> + */
> +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
> +					     unsigned int target_freq)
> +{
> +	s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
> +
> +	intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
> +}
> +

Hey Rafael, I'm building a kernel with this in order to give it a try on
my system, but I'm skeptical that translating the target frequency to an
EPP value will work reliably.  AFAIA the EPP value only has an indirect
influence on the processor's performance by adjusting the trade-off
between its responsiveness (rather than the actual clock frequency which
it will sustain in the long run) and its energy usage, in a largely
unspecified and non-linear way (non-linear like the effect of switching
CPU energy optimization features on and off, or like its effect on the
energy balancing behavior of the processor which can have a paradoxical
effect on performance).

I doubt that the scheduling-based CPU utilization is a good predictor
for the optimal trade-off between those two variables.  Consider the
following scenarios where this would deviate from the typical default
EPP of 0x80:

 - The application has low CPU utilization (say 10%) but it's
   latency-bound, the current default EPP value allows the CPU to ramp
   up during each burst and deliver reasonable average performance.
   With this code in effect you may end up at an EPP > 0xE0 which will
   severely limit the responsiveness of the HWP during each burst of
   work, possibly reducing performance (since the application was
   latency-bound), which will cause the CPU utilization of the process
   to further decrease, leading to a further increase in EPP and further
   decrease in performance (IOW this failure mode would have positive
   feedback rather than being somehow self-correcting).

 - The application has medium to high CPU utilization (say 60%), but
   it's IO-bound (e.g. a database server with a hard-drive bottleneck).
   This would cause you to program an EPP value around 0x60 which will
   cause the HWP to respond more aggressively than currently to each
   burst of work, increasing energy usage without a corresponding
   increase in performance (since the application was IO-bound).

 - Same as the last point, but the application is TDP-bound instead
   (e.g. a computer game running on an integrated GPU).  Not only will
   CPU energy usage increase, but performance will decrease with it, due
   to the TDP budget being stolen from the GPU.

 - The application has medium to low CPU utilization and frequently
   migrates between CPU cores.  This would end up at an EPP >= 0x80
   which won't give the application any CPU responsiveness advantage
   during migration over the current behavior (but possibly quite the
   opposite), even though the scheduler might know exactly what CPU
   frequency the application was able to average, and could have saved
   it some significant amount of ramp-up time.

It seems to me that hooking this up to the HWP_MIN_PERF control instead
of EPP would have a more deterministic effect and avoid the failure
scenarios described above.  E.g. upon migration you would be able to
instantly set up the new CPU core to the same P-state you knew the
application was averaging before migration.  No performance is
sacrificed in latency-bound applications since you wouldn't be limiting
the HWP's responsiveness.  No energy efficiency is sacrificed in the
steady state since you would be programming the most energy-efficient
(minimum constant) P-state which is able to deliver the observed
performance of the application -- It might make sense to program that
value *minus* some small margin though, in order to give the HWP the
chance to clock down in cases where the load fluctuates discontinuously
before the next update of the HWP request.

Of course it would also be nice to give schedutil control of
HWP_MAX_PERF eventually, e.g. by implementing the energy efficiency
optimization I've been proposing [1] in schedutil too.

[1] https://lwn.net/Articles/818827/

> +static int intel_cpufreq_adjust_pstate(struct cpudata *cpu,
> +				       unsigned int target_freq,
> +				       unsigned int relation)
> +{
> +	int old_pstate = cpu->pstate.current_pstate;
> +	int target_pstate;
>  
> -	cpufreq_freq_transition_begin(policy, &freqs);
>  	switch (relation) {
>  	case CPUFREQ_RELATION_L:
> -		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
> +		target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
>  		break;
>  	case CPUFREQ_RELATION_H:
> -		target_pstate = freqs.new / cpu->pstate.scaling;
> +		target_pstate = target_freq / cpu->pstate.scaling;
>  		break;
>  	default:
> -		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
> +		target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling);
>  		break;
>  	}
>  	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
> -	old_pstate = cpu->pstate.current_pstate;
> -	if (target_pstate != cpu->pstate.current_pstate) {
> +	if (target_pstate != old_pstate) {
>  		cpu->pstate.current_pstate = target_pstate;
> -		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
> +		wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
>  			      pstate_funcs.get_val(cpu, target_pstate));
>  	}
> -	freqs.new = target_pstate * cpu->pstate.scaling;
> -	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
> +	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate, target_pstate);
> +	return target_pstate * cpu->pstate.scaling;
> +}
> +
> +static int intel_cpufreq_target(struct cpufreq_policy *policy,
> +				unsigned int target_freq,
> +				unsigned int relation)
> +{
> +	struct cpudata *cpu = all_cpu_data[policy->cpu];
> +	struct cpufreq_freqs freqs;
> +
> +	update_turbo_state();
> +
> +	freqs.old = policy->cur;
> +	freqs.new = target_freq;
> +
> +	cpufreq_freq_transition_begin(policy, &freqs);
> +
> +	if (hwp_active)
> +		intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq,
> +						 target_freq);
> +	else
> +		freqs.new = intel_cpufreq_adjust_pstate(cpu, target_freq, relation);
> +
>  	cpufreq_freq_transition_end(policy, &freqs, false);
>  
>  	return 0;
> @@ -2365,11 +2440,18 @@ static unsigned int intel_cpufreq_fast_s
>  
>  	update_turbo_state();
>  
> +	if (hwp_active) {
> +		intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq,
> +						 target_freq);
> +		return target_freq;
> +	}
> +
>  	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
>  	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
>  	old_pstate = cpu->pstate.current_pstate;
>  	intel_pstate_update_pstate(cpu, target_pstate);
> -	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
> +	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate,
> +			    target_pstate);
>  	return target_pstate * cpu->pstate.scaling;
>  }
>  
> @@ -2389,7 +2471,6 @@ static int intel_cpufreq_cpu_init(struct
>  		return ret;
>  
>  	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
> -	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
>  	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
>  	policy->cur = policy->cpuinfo.min_freq;
>  
> @@ -2401,10 +2482,19 @@ static int intel_cpufreq_cpu_init(struct
>  
>  	cpu = all_cpu_data[policy->cpu];
>  
> -	if (hwp_active)
> +	if (hwp_active) {
> +		u64 value;
> +
> +		rdmsrl_on_cpu(policy->cpu, MSR_HWP_REQUEST, &value);
> +		WRITE_ONCE(cpu->hwp_req_cached, value);
> +		cpu->epp_saved = HWP_EPP_TO_BYTE(value);
> +
>  		intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state);
> -	else
> +		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
> +	} else {
>  		turbo_max = cpu->pstate.turbo_pstate;
> +		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
> +	}
>  
>  	min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
>  	min_freq *= cpu->pstate.scaling;
> @@ -2449,6 +2539,13 @@ static int intel_cpufreq_cpu_exit(struct
>  	freq_qos_remove_request(req);
>  	kfree(req);
>  
> +	if (hwp_active) {
> +		struct cpudata *cpu = all_cpu_data[policy->cpu];
> +
> +		/* Restore the original HWP EPP value. */
> +		intel_cpufreq_update_hwp_request(cpu, cpu->epp_saved);
> +	}
> +
>  	return intel_pstate_cpu_exit(policy);
>  }
>  
> @@ -2505,9 +2602,6 @@ static int intel_pstate_register_driver(
>  
>  static int intel_pstate_unregister_driver(void)
>  {
> -	if (hwp_active)
> -		return -EBUSY;
> -
>  	cpufreq_unregister_driver(intel_pstate_driver);
>  	intel_pstate_driver_cleanup();
>  
> @@ -2815,12 +2909,11 @@ static int __init intel_pstate_setup(cha
>  	if (!str)
>  		return -EINVAL;
>  
> -	if (!strcmp(str, "disable")) {
> +	if (!strcmp(str, "disable"))
>  		no_load = 1;
> -	} else if (!strcmp(str, "passive")) {
> +	else if (!strcmp(str, "passive"))
>  		default_driver = &intel_cpufreq;
> -		no_hwp = 1;
> -	}
> +
>  	if (!strcmp(str, "no_hwp")) {
>  		pr_info("HWP disabled\n");
>  		no_hwp = 1;
Rafael J. Wysocki May 25, 2020, 1:23 p.m. UTC | #2
On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
<francisco.jerez.plata@intel.com> wrote:
>
> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes:
>
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > Allow intel_pstate to work in the passive mode with HWP enabled and
> > make it translate the target frequency supplied by the cpufreq
> > governor in use into an EPP value to be written to the HWP request
> > MSR (high frequencies are mapped to low EPP values that mean more
> > performance-oriented HWP operation) as a hint for the HWP algorithm
> > in the processor, so as to prevent it and the CPU scheduler from
> > working against each other at least when the schedutil governor is
> > in use.
> >
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
> >
> > This is a prototype not intended for production use (based on linux-next).
> >
> > Please test it if you can (on HWP systems, of course) and let me know the
> > results.
> >
> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> > may turn out to be either too high or too low for the general use, which is one
> > reason why getting as much testing coverage as possible is key here.
> >
> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> > please do so and let me know the conclusions.
> >
> > Cheers,
> > Rafael
> >
> > ---
> >  drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 131 insertions(+), 38 deletions(-)
> >
> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
> > ===================================================================
> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
> > @@ -36,6 +36,7 @@
> >  #define INTEL_PSTATE_SAMPLING_INTERVAL       (10 * NSEC_PER_MSEC)
> >
> >  #define INTEL_CPUFREQ_TRANSITION_LATENCY     20000
> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP   5000
> >  #define INTEL_CPUFREQ_TRANSITION_DELAY               500
> >
> >  #ifdef CONFIG_ACPI
> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int
> >       return div_ext_fp(percent, 100);
> >  }
> >
> > +#define HWP_EPP_TO_BYTE(x)   (((u64)x >> 24) & 0xFF)
> > +
> >  /**
> >   * struct sample -   Store performance sample
> >   * @core_avg_perf:   Ratio of APERF/MPERF which is the actual average
> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st
> >
> >  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
> >  {
> > -     intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> > +     if (hwp_active)
> > +             intel_pstate_hwp_force_min_perf(policy->cpu);
> > +     else
> > +             intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> >  }
> >
> >  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct
> >       pr_debug("CPU %d exiting\n", policy->cpu);
> >
> >       intel_pstate_clear_update_util_hook(policy->cpu);
> > -     if (hwp_active) {
> > +     if (hwp_active)
> >               intel_pstate_hwp_save_state(policy);
> > -             intel_pstate_hwp_force_min_perf(policy->cpu);
> > -     } else {
> > -             intel_cpufreq_stop_cpu(policy);
> > -     }
> > +
> > +     intel_cpufreq_stop_cpu(policy);
> >  }
> >
> >  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s
> >  #define      INTEL_PSTATE_TRACE_TARGET 10
> >  #define      INTEL_PSTATE_TRACE_FAST_SWITCH 90
> >
> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
> > +                             int from, int to)
> >  {
> >       struct sample *sample;
> >
> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c
> >       sample = &cpu->sample;
> >       trace_pstate_sample(trace_type,
> >               0,
> > -             old_pstate,
> > -             cpu->pstate.current_pstate,
> > +             from,
> > +             to,
> >               sample->mperf,
> >               sample->aperf,
> >               sample->tsc,
> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c
> >               fp_toint(cpu->iowait_boost * 100));
> >  }
> >
> > -static int intel_cpufreq_target(struct cpufreq_policy *policy,
> > -                             unsigned int target_freq,
> > -                             unsigned int relation)
> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
> >  {
> > -     struct cpudata *cpu = all_cpu_data[policy->cpu];
> > -     struct cpufreq_freqs freqs;
> > -     int target_pstate, old_pstate;
> > +     u64 value, prev;
> >
> > -     update_turbo_state();
> > +     prev = READ_ONCE(cpu->hwp_req_cached);
> > +     value = prev;
> >
> > -     freqs.old = policy->cur;
> > -     freqs.new = target_freq;
> > +     /*
> > +      * The entire MSR needs to be updated in order to update the EPP field
> > +      * in it, so opportunistically update the min and max too if needed.
> > +      */
> > +     value &= ~HWP_MIN_PERF(~0L);
> > +     value |= HWP_MIN_PERF(cpu->min_perf_ratio);
> > +
> > +     value &= ~HWP_MAX_PERF(~0L);
> > +     value |= HWP_MAX_PERF(cpu->max_perf_ratio);
> > +
> > +     if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> > +             intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
> > +                                 HWP_EPP_TO_BYTE(prev), new_epp);
> > +
> > +             value &= ~GENMASK_ULL(31, 24);
> > +             value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
> > +     }
> > +
> > +     if (value != prev) {
> > +             WRITE_ONCE(cpu->hwp_req_cached, value);
> > +             wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
> > +     }
> > +}
> > +
> > +/**
> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
> > + * @cpu: Target CPU.
> > + * @max_freq: Maximum frequency to consider.
> > + * @target_freq: Target frequency selected by the governor.
> > + *
> > + * Translate the target frequency into a new EPP value to be written into the
> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
> > + *
> > + * The purpose of this is to avoid situations in which the kernel and the HWP
> > + * algorithm work against each other by giving a hint about the expectations of
> > + * the former to the latter.
> > + *
> > + * The mapping betweeen the target frequencies and the hint values need not be
> > + * exact, but it must be monotonic, so that higher target frequencies always
> > + * indicate more performance-oriented P-state selection.
> > + */
> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
> > +                                          unsigned int target_freq)
> > +{
> > +     s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
> > +
> > +     intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
> > +}
> > +
>
> Hey Rafael, I'm building a kernel with this in order to give it a try on
> my system, but I'm skeptical that translating the target frequency to an
> EPP value will work reliably.  AFAIA the EPP value only has an indirect
> influence on the processor's performance by adjusting the trade-off
> between its responsiveness (rather than the actual clock frequency which
> it will sustain in the long run) and its energy usage, in a largely
> unspecified and non-linear way (non-linear like the effect of switching
> CPU energy optimization features on and off, or like its effect on the
> energy balancing behavior of the processor which can have a paradoxical
> effect on performance).
>
> I doubt that the scheduling-based CPU utilization is a good predictor
> for the optimal trade-off between those two variables.

While I agree that this is not perfect, there barely is anything else
that can be used for this purpose.

Using the desired field or trying to adjust the limits relatively
often would basically cause the P-state selection to be driven
entirely by the kernel which simply doesn't know certain things only
known to the P-unit, so it is reasonable to leave some level of
control to the latter, so as to allow it to use the information known
to it only.

However, if it is allowed to do whatever it likes without any hints
from the kernel, it may very well go against the scheduler's decisions
which is not going to be optimal.

I'm simply not sure if there is any other way to give such hints to it
that through the EPP.

Thanks!
Doug Smythies May 25, 2020, 3:30 p.m. UTC | #3
On 2020.05.21 10:16 Rafael J. Wysocki wrote:

> 
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Allow intel_pstate to work in the passive mode with HWP enabled and
> make it translate the target frequency supplied by the cpufreq
> governor in use into an EPP value to be written to the HWP request
> MSR (high frequencies are mapped to low EPP values that mean more
> performance-oriented HWP operation) as a hint for the HWP algorithm
> in the processor, so as to prevent it and the CPU scheduler from
> working against each other at least when the schedutil governor is
> in use.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> This is a prototype not intended for production use (based on linux-next).
> 
> Please test it if you can (on HWP systems, of course) and let me know the
> results.
> 
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> may turn out to be either too high or too low for the general use, which is one
> reason why getting as much testing coverage as possible is key here.
> 
> If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> please do so and let me know the conclusions.

Hi all,

It is way too early for me to reply to this, as my tests are incomplete.
However, I saw the other e-mails and will chime in now.

Note before: I have been having difficulties with this test computer,
results may be suspect.

The default INTEL_CPUFREQ_TRANSITION_DELAY_HWP is definitely too short.
I do not know what an optimal value is, but I have made it the same as
INTEL_CPUFREQ_TRANSITION_LATENCY for now, and because it makes for great
test comparisons.

I have only done the simplest of tests so far, single thread, forced
CPU affinity load ramp up/down.

Load:
fixed work packet per work period, i.e. the amount of work to do is independent of CPU frequency.
347 hertz work / sleep frequency. Why? No reason. Because it is what I used last time.

To reveal any hysteresis (i.e. with conservative governor) load ramps up from none
to 100% and then back down to none at 3 seconds per step (step size is uncalibrated).

Processor:
Intel(R) Core(TM) i5-9600K CPU @ 3.70GHz

Modes:
intel_cpufreq (i_)
intel_pstate (p_)
acpi-cpufreq (a_)

Kernel:

b955f2f4a425 (HEAD -> k57-hwp) cpufreq: intel_pstate: Work in passive mode with HWP enabled
faf93d5c9da1 cpufreq: intel_pstate: Use passive mode by default without HWP
b9bbe6ed63b2 (tag: v5.7-rc6) Linux 5.7-rc6

Graph notes: The ramp up and ramp down are folded back on the graphs. These tests
were launched manually in two terminals, so the timing was not exact.
I'll fix this moving forward. 

Legend - ondemand graph [1] (minor annotation added):

i_onde_stock : intel-cpufreq, ondemand, stock kernel (5.7-rc6), hwp disabled.
i_onde_hwp : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 5000.
i_onde_hwp2 : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 20000.
a_onde: acpi-cpufreq, stock kernel (5.7-rc6), intel_pstate disabled.

Conclusion:
DELAY_HWP 20000 matches the pre-patch response almost exactly.
DELAY_HWP 5000 goes too far (my opinion), but does look promising.

Legend - intel_pstate - powersave graph [2].

What? Why is there such a graph, unrelated to this patch?
Well, because there is a not yet understood effect.

p_powe_stock : intel_pstate, powersave, stock kernel (5.7-rc6), hwp disabled.
P_powe_hwp : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 5000.
P_powe_hwp2 : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 20000.

Conclusion: ??

Note: That I merely made a stupid mistake is a real possibility.
Repeat test pending.

Legend - schedutil graph [3]:

i_sche_stock : intel-cpufreq, ondemand, stock kernel (5.7-rc6), hwp disabled.
i_sche_hwp : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 5000.
i_sche_hwp2 : intel-cpufreq, ondemand, patched kernel (5.7-rc6), DELAY_HWP 20000.
a_sche: acpi-cpufreq, stock kernel (5.7-rc6), intel_pstate disabled.

Conclusion: ?? I don't know, always had schedutil issues on this computer.
Expected more like typical from my older test server (i7-2600K) [4]:

[1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-ondemand.png
[2] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png
[3] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-schedutil.png
[4] http://www.smythies.com/~doug/linux/intel_pstate/cpu-freq-scalers/fixed-work-packet-folded-schedutil.png


> 
>  #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
> +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000

For now, while trying to prove nothing is busted,
suggest 20000.

> +/**
> + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.

request.

> +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
> +					     unsigned int target_freq)
> +{
> +	s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
> +

I would have thought this:

	s64 epp_fp = div_fp(255 * (max_freq - target_freq), (max_freq - min_freq));

but tested the original only, so far.

... Doug
Francisco Jerez May 25, 2020, 8:57 p.m. UTC | #4
"Rafael J. Wysocki" <rafael@kernel.org> writes:

> On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
> <francisco.jerez.plata@intel.com> wrote:
>>
>> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes:
>>
>> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> >
>> > Allow intel_pstate to work in the passive mode with HWP enabled and
>> > make it translate the target frequency supplied by the cpufreq
>> > governor in use into an EPP value to be written to the HWP request
>> > MSR (high frequencies are mapped to low EPP values that mean more
>> > performance-oriented HWP operation) as a hint for the HWP algorithm
>> > in the processor, so as to prevent it and the CPU scheduler from
>> > working against each other at least when the schedutil governor is
>> > in use.
>> >
>> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> > ---
>> >
>> > This is a prototype not intended for production use (based on linux-next).
>> >
>> > Please test it if you can (on HWP systems, of course) and let me know the
>> > results.
>> >
>> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
>> > may turn out to be either too high or too low for the general use, which is one
>> > reason why getting as much testing coverage as possible is key here.
>> >
>> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
>> > please do so and let me know the conclusions.
>> >
>> > Cheers,
>> > Rafael
>> >
>> > ---
>> >  drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
>> >  1 file changed, 131 insertions(+), 38 deletions(-)
>> >
>> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
>> > ===================================================================
>> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
>> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
>> > @@ -36,6 +36,7 @@
>> >  #define INTEL_PSTATE_SAMPLING_INTERVAL       (10 * NSEC_PER_MSEC)
>> >
>> >  #define INTEL_CPUFREQ_TRANSITION_LATENCY     20000
>> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP   5000
>> >  #define INTEL_CPUFREQ_TRANSITION_DELAY               500
>> >
>> >  #ifdef CONFIG_ACPI
>> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int
>> >       return div_ext_fp(percent, 100);
>> >  }
>> >
>> > +#define HWP_EPP_TO_BYTE(x)   (((u64)x >> 24) & 0xFF)
>> > +
>> >  /**
>> >   * struct sample -   Store performance sample
>> >   * @core_avg_perf:   Ratio of APERF/MPERF which is the actual average
>> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st
>> >
>> >  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>> >  {
>> > -     intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>> > +     if (hwp_active)
>> > +             intel_pstate_hwp_force_min_perf(policy->cpu);
>> > +     else
>> > +             intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>> >  }
>> >
>> >  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
>> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct
>> >       pr_debug("CPU %d exiting\n", policy->cpu);
>> >
>> >       intel_pstate_clear_update_util_hook(policy->cpu);
>> > -     if (hwp_active) {
>> > +     if (hwp_active)
>> >               intel_pstate_hwp_save_state(policy);
>> > -             intel_pstate_hwp_force_min_perf(policy->cpu);
>> > -     } else {
>> > -             intel_cpufreq_stop_cpu(policy);
>> > -     }
>> > +
>> > +     intel_cpufreq_stop_cpu(policy);
>> >  }
>> >
>> >  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
>> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s
>> >  #define      INTEL_PSTATE_TRACE_TARGET 10
>> >  #define      INTEL_PSTATE_TRACE_FAST_SWITCH 90
>> >
>> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
>> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
>> > +                             int from, int to)
>> >  {
>> >       struct sample *sample;
>> >
>> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c
>> >       sample = &cpu->sample;
>> >       trace_pstate_sample(trace_type,
>> >               0,
>> > -             old_pstate,
>> > -             cpu->pstate.current_pstate,
>> > +             from,
>> > +             to,
>> >               sample->mperf,
>> >               sample->aperf,
>> >               sample->tsc,
>> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c
>> >               fp_toint(cpu->iowait_boost * 100));
>> >  }
>> >
>> > -static int intel_cpufreq_target(struct cpufreq_policy *policy,
>> > -                             unsigned int target_freq,
>> > -                             unsigned int relation)
>> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
>> >  {
>> > -     struct cpudata *cpu = all_cpu_data[policy->cpu];
>> > -     struct cpufreq_freqs freqs;
>> > -     int target_pstate, old_pstate;
>> > +     u64 value, prev;
>> >
>> > -     update_turbo_state();
>> > +     prev = READ_ONCE(cpu->hwp_req_cached);
>> > +     value = prev;
>> >
>> > -     freqs.old = policy->cur;
>> > -     freqs.new = target_freq;
>> > +     /*
>> > +      * The entire MSR needs to be updated in order to update the EPP field
>> > +      * in it, so opportunistically update the min and max too if needed.
>> > +      */
>> > +     value &= ~HWP_MIN_PERF(~0L);
>> > +     value |= HWP_MIN_PERF(cpu->min_perf_ratio);
>> > +
>> > +     value &= ~HWP_MAX_PERF(~0L);
>> > +     value |= HWP_MAX_PERF(cpu->max_perf_ratio);
>> > +
>> > +     if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
>> > +             intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
>> > +                                 HWP_EPP_TO_BYTE(prev), new_epp);
>> > +
>> > +             value &= ~GENMASK_ULL(31, 24);
>> > +             value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
>> > +     }
>> > +
>> > +     if (value != prev) {
>> > +             WRITE_ONCE(cpu->hwp_req_cached, value);
>> > +             wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
>> > +     }
>> > +}
>> > +
>> > +/**
>> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
>> > + * @cpu: Target CPU.
>> > + * @max_freq: Maximum frequency to consider.
>> > + * @target_freq: Target frequency selected by the governor.
>> > + *
>> > + * Translate the target frequency into a new EPP value to be written into the
>> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
>> > + *
>> > + * The purpose of this is to avoid situations in which the kernel and the HWP
>> > + * algorithm work against each other by giving a hint about the expectations of
>> > + * the former to the latter.
>> > + *
>> > + * The mapping betweeen the target frequencies and the hint values need not be
>> > + * exact, but it must be monotonic, so that higher target frequencies always
>> > + * indicate more performance-oriented P-state selection.
>> > + */
>> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
>> > +                                          unsigned int target_freq)
>> > +{
>> > +     s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
>> > +
>> > +     intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
>> > +}
>> > +
>>
>> Hey Rafael, I'm building a kernel with this in order to give it a try on
>> my system, but I'm skeptical that translating the target frequency to an
>> EPP value will work reliably.  AFAIA the EPP value only has an indirect
>> influence on the processor's performance by adjusting the trade-off
>> between its responsiveness (rather than the actual clock frequency which
>> it will sustain in the long run) and its energy usage, in a largely
>> unspecified and non-linear way (non-linear like the effect of switching
>> CPU energy optimization features on and off, or like its effect on the
>> energy balancing behavior of the processor which can have a paradoxical
>> effect on performance).
>>
>> I doubt that the scheduling-based CPU utilization is a good predictor
>> for the optimal trade-off between those two variables.
>
> While I agree that this is not perfect, there barely is anything else
> that can be used for this purpose.
>
> Using the desired field or trying to adjust the limits relatively
> often would basically cause the P-state selection to be driven
> entirely by the kernel which simply doesn't know certain things only
> known to the P-unit, so it is reasonable to leave some level of
> control to the latter, so as to allow it to use the information known
> to it only.
>
> However, if it is allowed to do whatever it likes without any hints
> from the kernel, it may very well go against the scheduler's decisions
> which is not going to be optimal.
>
> I'm simply not sure if there is any other way to give such hints to it
> that through the EPP.
>

Why not HWP_MIN_PERF?  That would leave the HWP quite some room for
maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not
like P-state selection would be entirely driven by the kernel), while
avoiding every failure scenario I was describing in my previous reply.

Simply adjusting the EPP doesn't really allow the governor to make any
assumptions about the instantaneous performance of the processor, so it
seems questionable that it is exposed as a frequency target, since its
effect on frequency would be highly implementation-dependent and
inconsistent with the behavior of other CPUFREQ drivers.  OTOH adjusting
HWP_MIN_PERF achieves precisely what the caller wants: The CPU will
deliver at least the requested number of clocks per second (assuming the
request is compatible with the thermal and electrical constraints of the
processor), while retaining the liberty to clock up beyond that
frequency if the HWP is able to react to some discontinuous event more
quickly than the scheduler.

> Thanks!
Doug Smythies May 25, 2020, 9:09 p.m. UTC | #5
Hi all,

The INTEL_CPUFREQ_TRANSITION_DELAY_HWP = 20000
test results from this e-mail were incorrect.
The test and graphs are being re-done.

On 2020.05.25 08:30 Doug smythies wrote:

> 
> Legend - intel_pstate - powersave graph [2].
> 
> What? Why is there such a graph, unrelated to this patch?
> Well, because there is a not yet understood effect.
> 
> p_powe_stock : intel_pstate, powersave, stock kernel (5.7-rc6), hwp disabled.
> P_powe_hwp : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 5000.
> P_powe_hwp2 : intel_pstate, powersave, patched kernel (5.7-rc6), DELAY_HWP 20000.
> 
> Conclusion: ??
> 
> Note: That I merely made a stupid mistake is a real possibility.

Yes, that was it. However all DELAY_HWP 20000 tests were bad,
Not just this one.

... Doug
Rafael J. Wysocki May 26, 2020, 8:19 a.m. UTC | #6
to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez
<francisco.jerez.plata@intel.com> wrote:
>
> "Rafael J. Wysocki" <rafael@kernel.org> writes:
>
> > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
> > <francisco.jerez.plata@intel.com> wrote:
> >>
> >> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes:
> >>
> >> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >> >
> >> > Allow intel_pstate to work in the passive mode with HWP enabled and
> >> > make it translate the target frequency supplied by the cpufreq
> >> > governor in use into an EPP value to be written to the HWP request
> >> > MSR (high frequencies are mapped to low EPP values that mean more
> >> > performance-oriented HWP operation) as a hint for the HWP algorithm
> >> > in the processor, so as to prevent it and the CPU scheduler from
> >> > working against each other at least when the schedutil governor is
> >> > in use.
> >> >
> >> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >> > ---
> >> >
> >> > This is a prototype not intended for production use (based on linux-next).
> >> >
> >> > Please test it if you can (on HWP systems, of course) and let me know the
> >> > results.
> >> >
> >> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> >> > may turn out to be either too high or too low for the general use, which is one
> >> > reason why getting as much testing coverage as possible is key here.
> >> >
> >> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> >> > please do so and let me know the conclusions.
> >> >
> >> > Cheers,
> >> > Rafael
> >> >
> >> > ---
> >> >  drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
> >> >  1 file changed, 131 insertions(+), 38 deletions(-)
> >> >
> >> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
> >> > ===================================================================
> >> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> >> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
> >> > @@ -36,6 +36,7 @@
> >> >  #define INTEL_PSTATE_SAMPLING_INTERVAL       (10 * NSEC_PER_MSEC)
> >> >
> >> >  #define INTEL_CPUFREQ_TRANSITION_LATENCY     20000
> >> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP   5000
> >> >  #define INTEL_CPUFREQ_TRANSITION_DELAY               500
> >> >
> >> >  #ifdef CONFIG_ACPI
> >> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int
> >> >       return div_ext_fp(percent, 100);
> >> >  }
> >> >
> >> > +#define HWP_EPP_TO_BYTE(x)   (((u64)x >> 24) & 0xFF)
> >> > +
> >> >  /**
> >> >   * struct sample -   Store performance sample
> >> >   * @core_avg_perf:   Ratio of APERF/MPERF which is the actual average
> >> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st
> >> >
> >> >  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
> >> >  {
> >> > -     intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> >> > +     if (hwp_active)
> >> > +             intel_pstate_hwp_force_min_perf(policy->cpu);
> >> > +     else
> >> > +             intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
> >> >  }
> >> >
> >> >  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
> >> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct
> >> >       pr_debug("CPU %d exiting\n", policy->cpu);
> >> >
> >> >       intel_pstate_clear_update_util_hook(policy->cpu);
> >> > -     if (hwp_active) {
> >> > +     if (hwp_active)
> >> >               intel_pstate_hwp_save_state(policy);
> >> > -             intel_pstate_hwp_force_min_perf(policy->cpu);
> >> > -     } else {
> >> > -             intel_cpufreq_stop_cpu(policy);
> >> > -     }
> >> > +
> >> > +     intel_cpufreq_stop_cpu(policy);
> >> >  }
> >> >
> >> >  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
> >> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s
> >> >  #define      INTEL_PSTATE_TRACE_TARGET 10
> >> >  #define      INTEL_PSTATE_TRACE_FAST_SWITCH 90
> >> >
> >> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
> >> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
> >> > +                             int from, int to)
> >> >  {
> >> >       struct sample *sample;
> >> >
> >> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c
> >> >       sample = &cpu->sample;
> >> >       trace_pstate_sample(trace_type,
> >> >               0,
> >> > -             old_pstate,
> >> > -             cpu->pstate.current_pstate,
> >> > +             from,
> >> > +             to,
> >> >               sample->mperf,
> >> >               sample->aperf,
> >> >               sample->tsc,
> >> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c
> >> >               fp_toint(cpu->iowait_boost * 100));
> >> >  }
> >> >
> >> > -static int intel_cpufreq_target(struct cpufreq_policy *policy,
> >> > -                             unsigned int target_freq,
> >> > -                             unsigned int relation)
> >> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
> >> >  {
> >> > -     struct cpudata *cpu = all_cpu_data[policy->cpu];
> >> > -     struct cpufreq_freqs freqs;
> >> > -     int target_pstate, old_pstate;
> >> > +     u64 value, prev;
> >> >
> >> > -     update_turbo_state();
> >> > +     prev = READ_ONCE(cpu->hwp_req_cached);
> >> > +     value = prev;
> >> >
> >> > -     freqs.old = policy->cur;
> >> > -     freqs.new = target_freq;
> >> > +     /*
> >> > +      * The entire MSR needs to be updated in order to update the EPP field
> >> > +      * in it, so opportunistically update the min and max too if needed.
> >> > +      */
> >> > +     value &= ~HWP_MIN_PERF(~0L);
> >> > +     value |= HWP_MIN_PERF(cpu->min_perf_ratio);
> >> > +
> >> > +     value &= ~HWP_MAX_PERF(~0L);
> >> > +     value |= HWP_MAX_PERF(cpu->max_perf_ratio);
> >> > +
> >> > +     if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
> >> > +             intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
> >> > +                                 HWP_EPP_TO_BYTE(prev), new_epp);
> >> > +
> >> > +             value &= ~GENMASK_ULL(31, 24);
> >> > +             value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
> >> > +     }
> >> > +
> >> > +     if (value != prev) {
> >> > +             WRITE_ONCE(cpu->hwp_req_cached, value);
> >> > +             wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
> >> > +     }
> >> > +}
> >> > +
> >> > +/**
> >> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
> >> > + * @cpu: Target CPU.
> >> > + * @max_freq: Maximum frequency to consider.
> >> > + * @target_freq: Target frequency selected by the governor.
> >> > + *
> >> > + * Translate the target frequency into a new EPP value to be written into the
> >> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
> >> > + *
> >> > + * The purpose of this is to avoid situations in which the kernel and the HWP
> >> > + * algorithm work against each other by giving a hint about the expectations of
> >> > + * the former to the latter.
> >> > + *
> >> > + * The mapping betweeen the target frequencies and the hint values need not be
> >> > + * exact, but it must be monotonic, so that higher target frequencies always
> >> > + * indicate more performance-oriented P-state selection.
> >> > + */
> >> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
> >> > +                                          unsigned int target_freq)
> >> > +{
> >> > +     s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
> >> > +
> >> > +     intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
> >> > +}
> >> > +
> >>
> >> Hey Rafael, I'm building a kernel with this in order to give it a try on
> >> my system, but I'm skeptical that translating the target frequency to an
> >> EPP value will work reliably.  AFAIA the EPP value only has an indirect
> >> influence on the processor's performance by adjusting the trade-off
> >> between its responsiveness (rather than the actual clock frequency which
> >> it will sustain in the long run) and its energy usage, in a largely
> >> unspecified and non-linear way (non-linear like the effect of switching
> >> CPU energy optimization features on and off, or like its effect on the
> >> energy balancing behavior of the processor which can have a paradoxical
> >> effect on performance).
> >>
> >> I doubt that the scheduling-based CPU utilization is a good predictor
> >> for the optimal trade-off between those two variables.
> >
> > While I agree that this is not perfect, there barely is anything else
> > that can be used for this purpose.
> >
> > Using the desired field or trying to adjust the limits relatively
> > often would basically cause the P-state selection to be driven
> > entirely by the kernel which simply doesn't know certain things only
> > known to the P-unit, so it is reasonable to leave some level of
> > control to the latter, so as to allow it to use the information known
> > to it only.
> >
> > However, if it is allowed to do whatever it likes without any hints
> > from the kernel, it may very well go against the scheduler's decisions
> > which is not going to be optimal.
> >
> > I'm simply not sure if there is any other way to give such hints to it
> > that through the EPP.
> >
>
> Why not HWP_MIN_PERF?  That would leave the HWP quite some room for
> maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not
> like P-state selection would be entirely driven by the kernel), while
> avoiding every failure scenario I was describing in my previous reply.

Actually, I have been thinking about the HWP min as an alternative
that may be worth evaluating.

However, I would rather set the HWP min to something like 80% if the
cpufreq request.

Let me cut an alternative patch.
Doug Smythies May 26, 2020, 3:51 p.m. UTC | #7
On 2020.05.26 01:19 Rafael J. Wysocki wrote:
>  to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez
> > "Rafael J. Wysocki" <rafael@kernel.org> writes:
> > > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
> >
> > Why not HWP_MIN_PERF?  That would leave the HWP quite some room for
> > maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not
> > like P-state selection would be entirely driven by the kernel), while
> > avoiding every failure scenario I was describing in my previous reply.

I have re-done my tests.
The problem that I observe seems specific to hwp itself
and not this patch and it's use in passive mode.
I see the exact same thing with intel_pstate/powersave.
[1] detail A.


Test: still simple single threaded load sweep,
at 347 hertz work/sleep frequency.
What do I see?

Unexpected frequency drops at around 70% load.
Example, from trace:

First, the thing has been going for awhile at 4.8 GHz.

Old epp ; new epp ; freq GHz; load % ; duration mS
80	  ; 82      ; 4.57    ; 61.94  ; 20.001
82      ; 80	; 4.57    ; 62.47  ; 40.003
80      ; 44      ; 3.73    ;	68.63  ; 62.009  <<<< What? Why freq down? Why long duration?
44      ;  0      ; 1.96    ; 100.23 ; 19.995  <<<< Even lower freq. load overruns.
 0      ; 73      ; 4.56    ; 82.93  ; 40.07   <<<< O.K. recovered, but damage done.
73      ; 46      ; 2.36    ;	79.19  ; 20.94   <<< now things oscillate a little.
46      ; 0       ; 1.9884  ;	100.24 ; 20.99
 0      ; 75      ; 4.5624  ;	82.1   ; 41.002  <<< Event ends. Next event in 487 milliseconds.

Observation: Events are often, but not always, preceded by a longer than normal duration.
However, long durations are also not out of the ordinary in passive mode.

And yes, the above trace was with DELAY_HWP 20,000, but I do have trace examples
with it at 5,000. This was just a particularly good example.

Observation (from looking at a lot of trace data): There are phase delays
between the two systems, intel_cpufreq and hwp, and sometimes they seem to
oscillate a little and fight each other. There maybe some problematic
work/sleep frequencies where the oscillation builds into a full blown
resonance. 
 
Why does hwp drop the frequency?

This system is otherwise fairly idle,
so maybe because the pll drops down during the non work periods.

Maybe HWP thinks the system is idle and drops the frequency.
I can eliminate the overruns by disabling deep idle states such
that the PLL vote is never relinquished, but it's not a fair test.

Note that the above response can be "tuned".
If we take the conversation algorithm from target frequency to EPP
and introduce and offset, the above can be improved.

At what cost? More sluggishness, for a large positive offset.
So, the overruns just move from the steady state side of the task to
when the task starts. I did not find if there is a "sweet spot"
between offset and system response, and I do not think there is value
added in trying.

Note: With original settings, I rarely observe a problem with the step
function response to a new task.

> 
> Actually, I have been thinking about the HWP min as an alternative
> that may be worth evaluating.
> 
> However, I would rather set the HWP min to something like 80% if the
> cpufreq request.

Yes, this is a good idea and should not suffer from the two servo systems
fighting each other.

I got 0 overruns, verses 2240 overruns with no min limitation (100 second test).

As for INTEL_CPUFREQ_TRANSITION_DELAY_HWP, I'll probably use
10 milliseconds moving forward, because that is what I am most
familiar with from years ago work on the this driver. But, I did
not observe any issue with 5 milliseconds.

[1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png

Other replaced graphs:

http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-ondemand.png
http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-schedutil.png
 
... Doug
Rafael J. Wysocki May 26, 2020, 5:42 p.m. UTC | #8
On Tue, May 26, 2020 at 5:51 PM Doug Smythies <dsmythies@telus.net> wrote:
>
> On 2020.05.26 01:19 Rafael J. Wysocki wrote:
> >  to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez
> > > "Rafael J. Wysocki" <rafael@kernel.org> writes:
> > > > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
> > >
> > > Why not HWP_MIN_PERF?  That would leave the HWP quite some room for
> > > maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not
> > > like P-state selection would be entirely driven by the kernel), while
> > > avoiding every failure scenario I was describing in my previous reply.
>
> I have re-done my tests.
> The problem that I observe seems specific to hwp itself
> and not this patch and it's use in passive mode.
> I see the exact same thing with intel_pstate/powersave.
> [1] detail A.
>
>
> Test: still simple single threaded load sweep,
> at 347 hertz work/sleep frequency.
> What do I see?
>
> Unexpected frequency drops at around 70% load.
> Example, from trace:
>
> First, the thing has been going for awhile at 4.8 GHz.
>
> Old epp ; new epp ; freq GHz; load % ; duration mS
> 80        ; 82      ; 4.57    ; 61.94  ; 20.001
> 82      ; 80    ; 4.57    ; 62.47  ; 40.003
> 80      ; 44      ; 3.73    ;   68.63  ; 62.009  <<<< What? Why freq down? Why long duration?
> 44      ;  0      ; 1.96    ; 100.23 ; 19.995  <<<< Even lower freq. load overruns.
>  0      ; 73      ; 4.56    ; 82.93  ; 40.07   <<<< O.K. recovered, but damage done.
> 73      ; 46      ; 2.36    ;   79.19  ; 20.94   <<< now things oscillate a little.
> 46      ; 0       ; 1.9884  ;   100.24 ; 20.99
>  0      ; 75      ; 4.5624  ;   82.1   ; 41.002  <<< Event ends. Next event in 487 milliseconds.
>
> Observation: Events are often, but not always, preceded by a longer than normal duration.
> However, long durations are also not out of the ordinary in passive mode.
>
> And yes, the above trace was with DELAY_HWP 20,000, but I do have trace examples
> with it at 5,000. This was just a particularly good example.
>
> Observation (from looking at a lot of trace data): There are phase delays
> between the two systems, intel_cpufreq and hwp, and sometimes they seem to
> oscillate a little and fight each other. There maybe some problematic
> work/sleep frequencies where the oscillation builds into a full blown
> resonance.
>
> Why does hwp drop the frequency?
>
> This system is otherwise fairly idle,
> so maybe because the pll drops down during the non work periods.
>
> Maybe HWP thinks the system is idle and drops the frequency.
> I can eliminate the overruns by disabling deep idle states such
> that the PLL vote is never relinquished, but it's not a fair test.
>
> Note that the above response can be "tuned".
> If we take the conversation algorithm from target frequency to EPP
> and introduce and offset, the above can be improved.
>
> At what cost? More sluggishness, for a large positive offset.
> So, the overruns just move from the steady state side of the task to
> when the task starts. I did not find if there is a "sweet spot"
> between offset and system response, and I do not think there is value
> added in trying.
>
> Note: With original settings, I rarely observe a problem with the step
> function response to a new task.
>
> >
> > Actually, I have been thinking about the HWP min as an alternative
> > that may be worth evaluating.
> >
> > However, I would rather set the HWP min to something like 80% if the
> > cpufreq request.
>
> Yes, this is a good idea and should not suffer from the two servo systems
> fighting each other.

OK, thanks for the feedback!

I am about to post this patch.

> I got 0 overruns, verses 2240 overruns with no min limitation (100 second test).
>
> As for INTEL_CPUFREQ_TRANSITION_DELAY_HWP, I'll probably use
> 10 milliseconds moving forward, because that is what I am most
> familiar with from years ago work on the this driver. But, I did
> not observe any issue with 5 milliseconds.

I'm going to use INTEL_CPUFREQ_TRANSITION_DELAY_HWP in the new patch
for now then.

> [1] http://www.smythies.com/~doug/linux/intel_pstate/passive-hwp/passive-hwp-but-active-powersave.png

Thanks!
Francisco Jerez May 26, 2020, 6:29 p.m. UTC | #9
"Rafael J. Wysocki" <rafael@kernel.org> writes:

>  to On Mon, May 25, 2020 at 10:57 PM Francisco Jerez
> <francisco.jerez.plata@intel.com> wrote:
>>
>> "Rafael J. Wysocki" <rafael@kernel.org> writes:
>>
>> > On Mon, May 25, 2020 at 3:39 AM Francisco Jerez
>> > <francisco.jerez.plata@intel.com> wrote:
>> >>
>> >> "Rafael J. Wysocki" <rjw@rjwysocki.net> writes:
>> >>
>> >> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> >> >
>> >> > Allow intel_pstate to work in the passive mode with HWP enabled and
>> >> > make it translate the target frequency supplied by the cpufreq
>> >> > governor in use into an EPP value to be written to the HWP request
>> >> > MSR (high frequencies are mapped to low EPP values that mean more
>> >> > performance-oriented HWP operation) as a hint for the HWP algorithm
>> >> > in the processor, so as to prevent it and the CPU scheduler from
>> >> > working against each other at least when the schedutil governor is
>> >> > in use.
>> >> >
>> >> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> >> > ---
>> >> >
>> >> > This is a prototype not intended for production use (based on linux-next).
>> >> >
>> >> > Please test it if you can (on HWP systems, of course) and let me know the
>> >> > results.
>> >> >
>> >> > The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
>> >> > may turn out to be either too high or too low for the general use, which is one
>> >> > reason why getting as much testing coverage as possible is key here.
>> >> >
>> >> > If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
>> >> > please do so and let me know the conclusions.
>> >> >
>> >> > Cheers,
>> >> > Rafael
>> >> >
>> >> > ---
>> >> >  drivers/cpufreq/intel_pstate.c |  169 +++++++++++++++++++++++++++++++----------
>> >> >  1 file changed, 131 insertions(+), 38 deletions(-)
>> >> >
>> >> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
>> >> > ===================================================================
>> >> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
>> >> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
>> >> > @@ -36,6 +36,7 @@
>> >> >  #define INTEL_PSTATE_SAMPLING_INTERVAL       (10 * NSEC_PER_MSEC)
>> >> >
>> >> >  #define INTEL_CPUFREQ_TRANSITION_LATENCY     20000
>> >> > +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP   5000
>> >> >  #define INTEL_CPUFREQ_TRANSITION_DELAY               500
>> >> >
>> >> >  #ifdef CONFIG_ACPI
>> >> > @@ -95,6 +96,8 @@ static inline int32_t percent_ext_fp(int
>> >> >       return div_ext_fp(percent, 100);
>> >> >  }
>> >> >
>> >> > +#define HWP_EPP_TO_BYTE(x)   (((u64)x >> 24) & 0xFF)
>> >> > +
>> >> >  /**
>> >> >   * struct sample -   Store performance sample
>> >> >   * @core_avg_perf:   Ratio of APERF/MPERF which is the actual average
>> >> > @@ -2175,7 +2178,10 @@ static int intel_pstate_verify_policy(st
>> >> >
>> >> >  static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>> >> >  {
>> >> > -     intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>> >> > +     if (hwp_active)
>> >> > +             intel_pstate_hwp_force_min_perf(policy->cpu);
>> >> > +     else
>> >> > +             intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
>> >> >  }
>> >> >
>> >> >  static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
>> >> > @@ -2183,12 +2189,10 @@ static void intel_pstate_stop_cpu(struct
>> >> >       pr_debug("CPU %d exiting\n", policy->cpu);
>> >> >
>> >> >       intel_pstate_clear_update_util_hook(policy->cpu);
>> >> > -     if (hwp_active) {
>> >> > +     if (hwp_active)
>> >> >               intel_pstate_hwp_save_state(policy);
>> >> > -             intel_pstate_hwp_force_min_perf(policy->cpu);
>> >> > -     } else {
>> >> > -             intel_cpufreq_stop_cpu(policy);
>> >> > -     }
>> >> > +
>> >> > +     intel_cpufreq_stop_cpu(policy);
>> >> >  }
>> >> >
>> >> >  static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
>> >> > @@ -2296,7 +2300,8 @@ static int intel_cpufreq_verify_policy(s
>> >> >  #define      INTEL_PSTATE_TRACE_TARGET 10
>> >> >  #define      INTEL_PSTATE_TRACE_FAST_SWITCH 90
>> >> >
>> >> > -static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
>> >> > +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
>> >> > +                             int from, int to)
>> >> >  {
>> >> >       struct sample *sample;
>> >> >
>> >> > @@ -2309,8 +2314,8 @@ static void intel_cpufreq_trace(struct c
>> >> >       sample = &cpu->sample;
>> >> >       trace_pstate_sample(trace_type,
>> >> >               0,
>> >> > -             old_pstate,
>> >> > -             cpu->pstate.current_pstate,
>> >> > +             from,
>> >> > +             to,
>> >> >               sample->mperf,
>> >> >               sample->aperf,
>> >> >               sample->tsc,
>> >> > @@ -2318,40 +2323,110 @@ static void intel_cpufreq_trace(struct c
>> >> >               fp_toint(cpu->iowait_boost * 100));
>> >> >  }
>> >> >
>> >> > -static int intel_cpufreq_target(struct cpufreq_policy *policy,
>> >> > -                             unsigned int target_freq,
>> >> > -                             unsigned int relation)
>> >> > +static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
>> >> >  {
>> >> > -     struct cpudata *cpu = all_cpu_data[policy->cpu];
>> >> > -     struct cpufreq_freqs freqs;
>> >> > -     int target_pstate, old_pstate;
>> >> > +     u64 value, prev;
>> >> >
>> >> > -     update_turbo_state();
>> >> > +     prev = READ_ONCE(cpu->hwp_req_cached);
>> >> > +     value = prev;
>> >> >
>> >> > -     freqs.old = policy->cur;
>> >> > -     freqs.new = target_freq;
>> >> > +     /*
>> >> > +      * The entire MSR needs to be updated in order to update the EPP field
>> >> > +      * in it, so opportunistically update the min and max too if needed.
>> >> > +      */
>> >> > +     value &= ~HWP_MIN_PERF(~0L);
>> >> > +     value |= HWP_MIN_PERF(cpu->min_perf_ratio);
>> >> > +
>> >> > +     value &= ~HWP_MAX_PERF(~0L);
>> >> > +     value |= HWP_MAX_PERF(cpu->max_perf_ratio);
>> >> > +
>> >> > +     if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
>> >> > +             intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
>> >> > +                                 HWP_EPP_TO_BYTE(prev), new_epp);
>> >> > +
>> >> > +             value &= ~GENMASK_ULL(31, 24);
>> >> > +             value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
>> >> > +     }
>> >> > +
>> >> > +     if (value != prev) {
>> >> > +             WRITE_ONCE(cpu->hwp_req_cached, value);
>> >> > +             wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
>> >> > +     }
>> >> > +}
>> >> > +
>> >> > +/**
>> >> > + * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
>> >> > + * @cpu: Target CPU.
>> >> > + * @max_freq: Maximum frequency to consider.
>> >> > + * @target_freq: Target frequency selected by the governor.
>> >> > + *
>> >> > + * Translate the target frequency into a new EPP value to be written into the
>> >> > + * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
>> >> > + *
>> >> > + * The purpose of this is to avoid situations in which the kernel and the HWP
>> >> > + * algorithm work against each other by giving a hint about the expectations of
>> >> > + * the former to the latter.
>> >> > + *
>> >> > + * The mapping betweeen the target frequencies and the hint values need not be
>> >> > + * exact, but it must be monotonic, so that higher target frequencies always
>> >> > + * indicate more performance-oriented P-state selection.
>> >> > + */
>> >> > +static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
>> >> > +                                          unsigned int target_freq)
>> >> > +{
>> >> > +     s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
>> >> > +
>> >> > +     intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
>> >> > +}
>> >> > +
>> >>
>> >> Hey Rafael, I'm building a kernel with this in order to give it a try on
>> >> my system, but I'm skeptical that translating the target frequency to an
>> >> EPP value will work reliably.  AFAIA the EPP value only has an indirect
>> >> influence on the processor's performance by adjusting the trade-off
>> >> between its responsiveness (rather than the actual clock frequency which
>> >> it will sustain in the long run) and its energy usage, in a largely
>> >> unspecified and non-linear way (non-linear like the effect of switching
>> >> CPU energy optimization features on and off, or like its effect on the
>> >> energy balancing behavior of the processor which can have a paradoxical
>> >> effect on performance).
>> >>
>> >> I doubt that the scheduling-based CPU utilization is a good predictor
>> >> for the optimal trade-off between those two variables.
>> >
>> > While I agree that this is not perfect, there barely is anything else
>> > that can be used for this purpose.
>> >
>> > Using the desired field or trying to adjust the limits relatively
>> > often would basically cause the P-state selection to be driven
>> > entirely by the kernel which simply doesn't know certain things only
>> > known to the P-unit, so it is reasonable to leave some level of
>> > control to the latter, so as to allow it to use the information known
>> > to it only.
>> >
>> > However, if it is allowed to do whatever it likes without any hints
>> > from the kernel, it may very well go against the scheduler's decisions
>> > which is not going to be optimal.
>> >
>> > I'm simply not sure if there is any other way to give such hints to it
>> > that through the EPP.
>> >
>>
>> Why not HWP_MIN_PERF?  That would leave the HWP quite some room for
>> maneuvering (the whole HWP_MIN_PERF-HWP_MAX_PERF P-state range, it's not
>> like P-state selection would be entirely driven by the kernel), while
>> avoiding every failure scenario I was describing in my previous reply.
>
> Actually, I have been thinking about the HWP min as an alternative
> that may be worth evaluating.
>
> However, I would rather set the HWP min to something like 80% if the
> cpufreq request.
>
> Let me cut an alternative patch.

Heh...  Good.  While you're at it, it would be nice to make sure we have
a way for the governor to control both the minimum and maximum P-state
in some efficient way, rather than it simply providing a single target,
with [1] in mind.

[1] https://lwn.net/Articles/818827/
Doug Smythies June 15, 2020, 6:18 a.m. UTC | #10
On 2020.05.21 10:16 Rafael J. Wysocki wrote:
> 
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Allow intel_pstate to work in the passive mode with HWP enabled and
> make it translate the target frequency supplied by the cpufreq
> governor in use into an EPP value to be written to the HWP request
> MSR (high frequencies are mapped to low EPP values that mean more
> performance-oriented HWP operation) as a hint for the HWP algorithm
> in the processor, so as to prevent it and the CPU scheduler from
> working against each other at least when the schedutil governor is
> in use.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> This is a prototype not intended for production use (based on linux-next).
> 
> Please test it if you can (on HWP systems, of course) and let me know the
> results.
> 
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> may turn out to be either too high or too low for the general use, which is one
> reason why getting as much testing coverage as possible is key here.
> 
> If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> please do so and let me know the conclusions.
> 
> Cheers,
> Rafael

To anyone trying this patch:

You will need to monitor EPP (Energy Performance Preference) carefully.
It changes as a function of passive/active, and if you booted active
or passive or no-hwp and changed later.

Originally, I was not specifically monitoring EPP, or paths taken since boot
towards driver, intel_pstate or intel_cpufreq, and governor, and will now have
to set aside test results.

@Rafael: I am still having problems with my test computer and HWP. However, I can
observe the energy saving potential of this "passive-yet-active HWP mode".

At this point, I am actually trying to make my newer test computer
simply behave and do what it is told with respect to CPU frequency scaling,
because even acpi-cpufreq misbehaves for performance governor under
some conditions [1].

[1] https://marc.info/?l=linux-pm&m=159155067328641&w=2

To my way of thinking:

1.) it is imperative that we be able to decouple the governor servo
from the processor servo. At a minimum this is needed for system testing,
debugging and reference baselines. At a maximum users could, perhaps, decide
for themselves. Myself, I would prefer "passive" to mean "do what you
have been told", and that is now what I am testing.

2.) I have always thought, indeed relied on, performance mode as being
more than a hint. For my older i7-2600K it never disobeyed orders, except
for the most minuscule of workloads.
This newer i5-9600K seems to have a mind of its own which I would like
to be able to turn off, yet still be able to use intel_pstate trace
with schedutil.

Recall last week I said

> moving forward the typical CPU frequency scaling
> configuration for my test system will be:
>
> driver: intel-cpufreq, forced at boot.
> governor: schedutil
> hwp: forced off at boot.

The problem is that baseline references are still needed
and performance mode is unreliable. Maybe other stuff also,
I simply don't know at this point.

Example of EPP changing (no need to read on) (from fresh boot):

Current EPP:

root@s18:/home/doug# rdmsr --bitfield 31:24 -u -a 0x774
128
128
128
128
128
128
root@s18:/home/doug# grep . /sys/devices/system/cpu/cpu3/cpufreq/*
/sys/devices/system/cpu/cpu3/cpufreq/affected_cpus:3
/sys/devices/system/cpu/cpu3/cpufreq/base_frequency:3700000
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq:4600000
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_min_freq:800000
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_transition_latency:0
/sys/devices/system/cpu/cpu3/cpufreq/energy_performance_available_preferences:default performance balance_performance balance_power
power
/sys/devices/system/cpu/cpu3/cpufreq/energy_performance_preference:balance_performance
/sys/devices/system/cpu/cpu3/cpufreq/related_cpus:3
/sys/devices/system/cpu/cpu3/cpufreq/scaling_available_governors:performance powersave
/sys/devices/system/cpu/cpu3/cpufreq/scaling_cur_freq:800102
/sys/devices/system/cpu/cpu3/cpufreq/scaling_driver:intel_pstate
/sys/devices/system/cpu/cpu3/cpufreq/scaling_governor:powersave
/sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:4600000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_min_freq:800000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_setspeed:<unsupported>

Now, switch to passive mode:

echo passive > /sys/devices/system/cpu/intel_pstate/status

And observe EPP:

root@s18:/home/doug# rdmsr --bitfield 31:24 -u -a 0x774
255
255
255
255
255
255
root@s18:/home/doug# grep . /sys/devices/system/cpu/cpu3/cpufreq/*
/sys/devices/system/cpu/cpu3/cpufreq/affected_cpus:3
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq:4600000
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_min_freq:800000
/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_transition_latency:20000
/sys/devices/system/cpu/cpu3/cpufreq/related_cpus:3
/sys/devices/system/cpu/cpu3/cpufreq/scaling_available_governors:conservative ondemand userspace powersave performance schedutil

Hey, where did the ability to adjust the energy_performance_preference setting go?

/sys/devices/system/cpu/cpu3/cpufreq/scaling_cur_freq:3400313
/sys/devices/system/cpu/cpu3/cpufreq/scaling_driver:intel_cpufreq
/sys/devices/system/cpu/cpu3/cpufreq/scaling_governor:performance
/sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:4600000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_min_freq:800000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_setspeed:<unsupported>

Kernel is 5.7 +plus this patch:

root@s18:/home/doug# uname -a
Linux s18 5.7.0-hwp10 #786 SMP PREEMPT Tue Jun 9 20:15:18 PDT 2020 x86_64 x86_64 x86_64 GNU/Linux

223e5c33f927 (HEAD -> k57-doug-hwp) cpufreq: intel_pstate: Accept passive mode with HWP enabled
5d890a14763d cpufreq: intel_pstate: Use passive mode by default without HWP
3d77e6a8804a (tag: v5.7) Linux 5.7

The below is on top of this patch, and is how I am attempting to move forward:

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4ab8bc1476c9..6c28ec49b192 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2331,33 +2331,32 @@ static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u32 min_perf)
        value |= HWP_MIN_PERF(min_perf);

        /*
-        * The entire MSR needs to be updated in order to update the HWP min
-        * field in it, so opportunistically update the max too if needed.
+        * the max also...
         */
        value &= ~HWP_MAX_PERF(~0L);
-       value |= HWP_MAX_PERF(cpu->max_perf_ratio);
+       value |= HWP_MAX_PERF(min_perf);

        if (value != prev)
                wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
 }

 /**
- * intel_cpufreq_adjust_hwp - Adjust the HWP reuqest register.
+ * intel_cpufreq_adjust_hwp - Adjust the HWP request register.
  * @cpu: Target CPU.
  * @target_pstate: P-state corresponding to the target frequency.
  *
- * Set the HWP minimum performance limit to 75% of @target_pstate taking the
+ * Set the HWP minimum performance limit to @target_pstate taking the
  * global min and max policy limits into account.
  *
- * The purpose of this is to avoid situations in which the kernel and the HWP
- * algorithm work against each other by giving a hint about the expectations of
- * the former to the latter.
+ * The purpose of this is to force the slave (passive) servo to do what
+ * it has been told, not what ever it wants.
+ * This NOT a hint. EPP (responsiveness) is managed from elsewhere.
  */
 static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate)
 {
        u32 min_perf;

-       min_perf = max_t(u32, (3 * target_pstate) / 4, cpu->min_perf_ratio);
+       min_perf = max_t(u32, target_pstate, cpu->min_perf_ratio);
        min_perf = min_t(u32, min_perf, cpu->max_perf_ratio);
        if (min_perf != cpu->pstate.current_pstate) {
                cpu->pstate.current_pstate = min_perf;

... Doug

... [deleted] ...
Doug Smythies Aug. 24, 2020, 2:54 p.m. UTC | #11
On 2020.05.21 10:16 Rafael J. Wysocki wrote:

...
> 
> The INTEL_CPUFREQ_TRANSITION_DELAY_HWP value has been guessed and it very well
> may turn out to be either too high or too low for the general use, which is one
> reason why getting as much testing coverage as possible is key here.
> 
> If you can play with different INTEL_CPUFREQ_TRANSITION_DELAY_HWP values,
> please do so and let me know the conclusions.
...

Hi Rafael,

Since my v7 reply the other day about results for 347 Hz work/sleep frequency
periodic workflows [1], I have been testing at 73 Hertz work/sleep frequency.
And am now testing this:

-#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP     5000
+#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP     10000

Which shows promise for the schedutil governor, which no
longer drives prematurely into the turbo range, consuming
excessive amounts of energy as compared to passive without
HWP.

The ondemand governor is still very bad. The only way I
have found to get it behave is to force max=min=target_pstate,
in addition to the above.

It will be quite some time before I have well organized feedback.
In the meantime if anyone has suggestions for an optimal
INTEL_CPUFREQ_TRANSITION_DELAY_HWP value, I would be grateful.
Otherwise I'll attempt to find it via tests.

... Doug

[1] https://marc.info/?l=linux-pm&m=159769839401767&w=2
diff mbox series

Patch

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -36,6 +36,7 @@ 
 #define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
 
 #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
+#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000
 #define INTEL_CPUFREQ_TRANSITION_DELAY		500
 
 #ifdef CONFIG_ACPI
@@ -95,6 +96,8 @@  static inline int32_t percent_ext_fp(int
 	return div_ext_fp(percent, 100);
 }
 
+#define HWP_EPP_TO_BYTE(x)	(((u64)x >> 24) & 0xFF)
+
 /**
  * struct sample -	Store performance sample
  * @core_avg_perf:	Ratio of APERF/MPERF which is the actual average
@@ -2175,7 +2178,10 @@  static int intel_pstate_verify_policy(st
 
 static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
-	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+	if (hwp_active)
+		intel_pstate_hwp_force_min_perf(policy->cpu);
+	else
+		intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
 }
 
 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
@@ -2183,12 +2189,10 @@  static void intel_pstate_stop_cpu(struct
 	pr_debug("CPU %d exiting\n", policy->cpu);
 
 	intel_pstate_clear_update_util_hook(policy->cpu);
-	if (hwp_active) {
+	if (hwp_active)
 		intel_pstate_hwp_save_state(policy);
-		intel_pstate_hwp_force_min_perf(policy->cpu);
-	} else {
-		intel_cpufreq_stop_cpu(policy);
-	}
+
+	intel_cpufreq_stop_cpu(policy);
 }
 
 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
@@ -2296,7 +2300,8 @@  static int intel_cpufreq_verify_policy(s
 #define	INTEL_PSTATE_TRACE_TARGET 10
 #define	INTEL_PSTATE_TRACE_FAST_SWITCH 90
 
-static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
+static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type,
+				int from, int to)
 {
 	struct sample *sample;
 
@@ -2309,8 +2314,8 @@  static void intel_cpufreq_trace(struct c
 	sample = &cpu->sample;
 	trace_pstate_sample(trace_type,
 		0,
-		old_pstate,
-		cpu->pstate.current_pstate,
+		from,
+		to,
 		sample->mperf,
 		sample->aperf,
 		sample->tsc,
@@ -2318,40 +2323,110 @@  static void intel_cpufreq_trace(struct c
 		fp_toint(cpu->iowait_boost * 100));
 }
 
-static int intel_cpufreq_target(struct cpufreq_policy *policy,
-				unsigned int target_freq,
-				unsigned int relation)
+static void intel_cpufreq_update_hwp_request(struct cpudata *cpu, u8 new_epp)
 {
-	struct cpudata *cpu = all_cpu_data[policy->cpu];
-	struct cpufreq_freqs freqs;
-	int target_pstate, old_pstate;
+	u64 value, prev;
 
-	update_turbo_state();
+	prev = READ_ONCE(cpu->hwp_req_cached);
+	value = prev;
 
-	freqs.old = policy->cur;
-	freqs.new = target_freq;
+	/*
+	 * The entire MSR needs to be updated in order to update the EPP field
+	 * in it, so opportunistically update the min and max too if needed.
+	 */
+	value &= ~HWP_MIN_PERF(~0L);
+	value |= HWP_MIN_PERF(cpu->min_perf_ratio);
+
+	value &= ~HWP_MAX_PERF(~0L);
+	value |= HWP_MAX_PERF(cpu->max_perf_ratio);
+
+	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
+		intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET,
+				    HWP_EPP_TO_BYTE(prev), new_epp);
+
+		value &= ~GENMASK_ULL(31, 24);
+		value |= HWP_ENERGY_PERF_PREFERENCE(new_epp);
+	}
+
+	if (value != prev) {
+		WRITE_ONCE(cpu->hwp_req_cached, value);
+		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+	}
+}
+
+/**
+ * intel_cpufreq_adjust_hwp_request - Adjust the HWP reuqest register.
+ * @cpu: Target CPU.
+ * @max_freq: Maximum frequency to consider.
+ * @target_freq: Target frequency selected by the governor.
+ *
+ * Translate the target frequency into a new EPP value to be written into the
+ * HWP request MSR of @cpu as a hint for the HW-driven P-state selection.
+ *
+ * The purpose of this is to avoid situations in which the kernel and the HWP
+ * algorithm work against each other by giving a hint about the expectations of
+ * the former to the latter.
+ *
+ * The mapping betweeen the target frequencies and the hint values need not be
+ * exact, but it must be monotonic, so that higher target frequencies always
+ * indicate more performance-oriented P-state selection.
+ */
+static void intel_cpufreq_adjust_hwp_request(struct cpudata *cpu, s64 max_freq,
+					     unsigned int target_freq)
+{
+	s64 epp_fp = div_fp(255 * (max_freq - target_freq), max_freq);
+
+	intel_cpufreq_update_hwp_request(cpu, fp_toint(epp_fp));
+}
+
+static int intel_cpufreq_adjust_pstate(struct cpudata *cpu,
+				       unsigned int target_freq,
+				       unsigned int relation)
+{
+	int old_pstate = cpu->pstate.current_pstate;
+	int target_pstate;
 
-	cpufreq_freq_transition_begin(policy, &freqs);
 	switch (relation) {
 	case CPUFREQ_RELATION_L:
-		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
+		target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
 		break;
 	case CPUFREQ_RELATION_H:
-		target_pstate = freqs.new / cpu->pstate.scaling;
+		target_pstate = target_freq / cpu->pstate.scaling;
 		break;
 	default:
-		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
+		target_pstate = DIV_ROUND_CLOSEST(target_freq, cpu->pstate.scaling);
 		break;
 	}
 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
-	old_pstate = cpu->pstate.current_pstate;
-	if (target_pstate != cpu->pstate.current_pstate) {
+	if (target_pstate != old_pstate) {
 		cpu->pstate.current_pstate = target_pstate;
-		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
+		wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
 			      pstate_funcs.get_val(cpu, target_pstate));
 	}
-	freqs.new = target_pstate * cpu->pstate.scaling;
-	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
+	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate, target_pstate);
+	return target_pstate * cpu->pstate.scaling;
+}
+
+static int intel_cpufreq_target(struct cpufreq_policy *policy,
+				unsigned int target_freq,
+				unsigned int relation)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+	struct cpufreq_freqs freqs;
+
+	update_turbo_state();
+
+	freqs.old = policy->cur;
+	freqs.new = target_freq;
+
+	cpufreq_freq_transition_begin(policy, &freqs);
+
+	if (hwp_active)
+		intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq,
+						 target_freq);
+	else
+		freqs.new = intel_cpufreq_adjust_pstate(cpu, target_freq, relation);
+
 	cpufreq_freq_transition_end(policy, &freqs, false);
 
 	return 0;
@@ -2365,11 +2440,18 @@  static unsigned int intel_cpufreq_fast_s
 
 	update_turbo_state();
 
+	if (hwp_active) {
+		intel_cpufreq_adjust_hwp_request(cpu, policy->cpuinfo.max_freq,
+						 target_freq);
+		return target_freq;
+	}
+
 	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
 	old_pstate = cpu->pstate.current_pstate;
 	intel_pstate_update_pstate(cpu, target_pstate);
-	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
+	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate,
+			    target_pstate);
 	return target_pstate * cpu->pstate.scaling;
 }
 
@@ -2389,7 +2471,6 @@  static int intel_cpufreq_cpu_init(struct
 		return ret;
 
 	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
-	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
 	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
 	policy->cur = policy->cpuinfo.min_freq;
 
@@ -2401,10 +2482,19 @@  static int intel_cpufreq_cpu_init(struct
 
 	cpu = all_cpu_data[policy->cpu];
 
-	if (hwp_active)
+	if (hwp_active) {
+		u64 value;
+
+		rdmsrl_on_cpu(policy->cpu, MSR_HWP_REQUEST, &value);
+		WRITE_ONCE(cpu->hwp_req_cached, value);
+		cpu->epp_saved = HWP_EPP_TO_BYTE(value);
+
 		intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state);
-	else
+		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
+	} else {
 		turbo_max = cpu->pstate.turbo_pstate;
+		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
+	}
 
 	min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
 	min_freq *= cpu->pstate.scaling;
@@ -2449,6 +2539,13 @@  static int intel_cpufreq_cpu_exit(struct
 	freq_qos_remove_request(req);
 	kfree(req);
 
+	if (hwp_active) {
+		struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+		/* Restore the original HWP EPP value. */
+		intel_cpufreq_update_hwp_request(cpu, cpu->epp_saved);
+	}
+
 	return intel_pstate_cpu_exit(policy);
 }
 
@@ -2505,9 +2602,6 @@  static int intel_pstate_register_driver(
 
 static int intel_pstate_unregister_driver(void)
 {
-	if (hwp_active)
-		return -EBUSY;
-
 	cpufreq_unregister_driver(intel_pstate_driver);
 	intel_pstate_driver_cleanup();
 
@@ -2815,12 +2909,11 @@  static int __init intel_pstate_setup(cha
 	if (!str)
 		return -EINVAL;
 
-	if (!strcmp(str, "disable")) {
+	if (!strcmp(str, "disable"))
 		no_load = 1;
-	} else if (!strcmp(str, "passive")) {
+	else if (!strcmp(str, "passive"))
 		default_driver = &intel_cpufreq;
-		no_hwp = 1;
-	}
+
 	if (!strcmp(str, "no_hwp")) {
 		pr_info("HWP disabled\n");
 		no_hwp = 1;