
[6/9] cpufreq/intel_pstate: Implement variably low-pass filtering controller for small core.

Message ID 20180328063845.4884-7-currojerez@riseup.net (mailing list archive)
State New, archived

Commit Message

Francisco Jerez March 28, 2018, 6:38 a.m. UTC
This introduces a controller for low-power parts that takes advantage
of the IO active time statistic introduced earlier in order to adjust
the trade-off between responsiveness and energy efficiency of the
heuristic dynamically.  This allows it to achieve lower energy
consumption when the system is far enough from the CPU-bound end of
the IO utilization statistic.  In low-latency mode the controller is
actually somewhat more aggressive than the current one due to its use
of the APERF/MPERF ratio (particularly if C0 residency is low, which by
itself partly mitigates the lower energy efficiency of the aggressive
heuristic) -- see the comments in the code below for the rationale.
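
As a rough illustration of the control flow, here is a stand-alone
floating-point sketch (not the fixed-point kernel code added below;
the constants only mirror the lp_params defaults):

/*
 * Illustrative user-space sketch only: approximates the per-update
 * decision between the low-pass-filtered response and the immediate
 * latency-minimizing response.  The real driver works in fixed point
 * and reads its tunables from struct lp_params.
 */
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

static double p_base = 12.0;	/* low-pass-filtered P-state estimate */

static double lp_update(double io_util, double p_est, double dt_s)
{
	const double io_threshold = 0.983;	/* ~io_active_threshold_pml */
	const double avg_hz = 3.0;		/* ~p_base_avg_hz */
	/* Fraction of the distance to p_est covered in dt_s seconds. */
	const double alpha = 1.0 - pow(2.0, -avg_hz * dt_s);
	const bool relax = io_util >= io_threshold;

	/* Exponentially average the unfiltered estimate p_est. */
	p_base += alpha * (p_est - p_base);

	/*
	 * Close to full IO utilization, performance is unlikely to scale
	 * further with CPU frequency, so return the filtered (more
	 * energy-efficient) response; otherwise minimize latency.
	 */
	return relax ? p_base : p_est;
}

int main(void)
{
	printf("IO-bound:      %.2f\n", lp_update(0.99, 20.0, 0.01));
	printf("latency-bound: %.2f\n", lp_update(0.50, 20.0, 0.01));
	return 0;
}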

The heuristic is tuned to roughly match the performance numbers of the
current governor (which is rather aggressive) in latency-bound
test-cases, so the energy-saving behavior won't kick in with the
current calibration except when heavily IO-bound for some time.  The
RT and DL scheduling flags could potentially provide a useful
additional variable for the heuristic to decide whether the workload
is latency-sensitive, allowing it to save power in other
(non-IO-bound) cases, but this is not attempted in this series since
there would be an increased risk of performance regressions due to
latency-sensitive tasks not marked RT or DL.
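
Purely as an illustration of that idea (again, not implemented in this
series), such a hint could be consumed in a thin wrapper around the
update_util hook added below; the force_low_latency field named here is
hypothetical and does not exist in this patch:

/*
 * NOT part of this series -- a hypothetical sketch of how the RT/DL
 * scheduling flags passed to the update_util hook might be fed into
 * the heuristic.
 */
static void intel_pstate_update_util_lp_rt(struct update_util_data *data,
					   u64 time, unsigned int flags)
{
	struct cpudata *cpu = container_of(data, struct cpudata, update_util);

	if (flags & (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL))
		cpu->lp.force_low_latency = true;	/* hypothetical field */

	intel_pstate_update_util_lp(data, time, flags);
}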

For the moment this is only enabled on BXT in order to reduce the
extent of any unexpected fallout, but it should work on other
low-power platforms if it's hooked up to the right pstate_funcs table
(at your own risk).
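
For instance, opting another low-power platform in would amount to a
pstate_funcs table along the lines of the sketch below, analogous to
the bxt_funcs change in this patch (the glm_funcs name is only an
example), together with a matching ICPU() entry in the CPU ID table:

/*
 * Hypothetical example, not included in this patch: a pstate_funcs
 * table for some other low-power platform using the LP controller.
 */
static const struct pstate_funcs glm_funcs = {
	.get_max = core_get_max_pstate,
	.get_max_physical = core_get_max_pstate_physical,
	.get_min = core_get_min_pstate,
	.get_turbo = core_get_turbo_pstate,
	.get_scaling = core_get_scaling,
	.get_val = core_get_val,
	.update_util = intel_pstate_update_util_lp,
};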

Signed-off-by: Francisco Jerez <currojerez@riseup.net>
---
 drivers/cpufreq/intel_pstate.c | 357 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 353 insertions(+), 4 deletions(-)

Patch

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ef699a3a238f..d4b5d0aaa282 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -61,6 +61,11 @@  static inline int32_t mul_fp(int32_t x, int32_t y)
 	return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
 }
 
+static inline int rnd_fp(int32_t x)
+{
+	return (x + (1 << (FRAC_BITS - 1))) >> FRAC_BITS;
+}
+
 static inline int32_t div_fp(s64 x, s64 y)
 {
 	return div64_s64((int64_t)x << FRAC_BITS, y);
@@ -171,6 +176,23 @@  struct vid_data {
 	int32_t ratio;
 };
 
+/**
+ * struct lp_data - LP controller parameters and state.
+ * @sample_interval_ns:  Update interval in ns
+ * @last_io_active_ns:   Cumulative IO active time in ns observed at the
+ *                       last sample.
+ * @setpoint:            Target CPU utilization at which the controller is
+ *                       expected to leave the current P-state untouched, as
+ *                       a fixed-point fraction.
+ * @p_base:              Low-pass filtered P-state as a fixed-point fraction.
+ */
+struct lp_data {
+	s64 sample_interval_ns;
+	uint64_t last_io_active_ns;
+	int32_t setpoint;
+	int32_t p_base;
+};
+
 /**
  * struct global_params - Global parameters, mostly tunable via sysfs.
  * @no_turbo:		Whether or not to use turbo P-states.
@@ -234,6 +256,7 @@  struct cpudata {
 
 	struct pstate_data pstate;
 	struct vid_data vid;
+	struct lp_data lp;
 
 	u64	last_update;
 	u64	last_sample_time;
@@ -258,6 +281,28 @@  struct cpudata {
 
 static struct cpudata **all_cpu_data;
 
+/**
+ * struct lp_params - LP controller static configuration
+ * @sample_interval_ms:      Update interval in ms
+ * @setpoint_pml:            Target CPU utilization at which the controller is
+ *                           expected to leave the current P-state untouched,
+ *                           as an integer per mille.
+ * @p_base_avg_hz:           Exponential averaging frequency of the P-state
+ *                           low-pass filter as an integer in Hz.
+ * @io_active_threshold_pml: IO utilization threshold at which the controller
+ *                           should transition to a higher latency low-pass
+ *                           filtering mode, as an integer per mille.
+ * @io_active_avg_hz:        Exponential averaging frequency of the IO
+ *                           utilization statistic as an integer in Hz.
+ */
+struct lp_params {
+	int sample_interval_ms;
+	int setpoint_pml;
+	int p_base_avg_hz;
+	int io_active_threshold_pml;
+	int io_active_avg_hz;
+};
+
 /**
  * struct pstate_funcs - Per CPU model specific callbacks
  * @get_max:		Callback to get maximum non turbo effective P state
@@ -286,6 +331,13 @@  struct pstate_funcs {
 };
 
 static struct pstate_funcs pstate_funcs __read_mostly;
+static struct lp_params lp_params __read_mostly = {
+	.sample_interval_ms = 10,
+	.setpoint_pml = 700,
+	.p_base_avg_hz = 3,
+	.io_active_threshold_pml = 983,
+	.io_active_avg_hz = 3
+};
 
 static int hwp_active __read_mostly;
 static bool per_cpu_limits __read_mostly;
@@ -1483,6 +1535,285 @@  static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
 	return target;
 }
 
+/**
+ * Initialize the struct lp_data of the specified CPU to the defaults
+ * calculated from @lp_params.
+ */
+static void intel_pstate_lp_reset(struct cpudata *cpu)
+{
+	struct lp_data *lp = &cpu->lp;
+
+	lp->sample_interval_ns = lp_params.sample_interval_ms * NSEC_PER_MSEC;
+	lp->setpoint = div_fp(lp_params.setpoint_pml, 1000);
+	lp->p_base = int_tofp(cpu->pstate.current_pstate);
+}
+
+/**
+ * Unit ramp function used as building block for more complex
+ * piecewise linear functions.
+ */
+static int32_t ramp(int32_t x0, int32_t x1, int32_t x)
+{
+	return x <= x0 ? 0 :
+	       x >= x1 ? int_tofp(1) :
+	       div_fp(x - x0, x1 - x0);
+}
+
+/**
+ * Fixed point representation with twice the usual number of
+ * fractional bits.
+ */
+#define DFRAC_BITS 16
+#define DFRAC_ONE (1 << DFRAC_BITS)
+#define DFRAC_MAX_INT (0u - (uint32_t)DFRAC_ONE)
+
+/**
+ * Fast but rather inaccurate piecewise-linear approximation of a
+ * fixed-point product by an inverse exponential:
+ *
+ *  decay(a, p) = a * 2 ^ (-p / DFRAC_ONE) + O(a)
+ *
+ * The error term should be lower in magnitude than 0.044 * a.
+ */
+static int32_t decay(int32_t a, uint32_t p)
+{
+	if (a < 0) {
+		/*
+		 * Avoid implementation-defined behavior in signed
+		 * right shift of negative integer.
+		 */
+		return -decay(-a, p);
+
+	} else if (p < 32 * DFRAC_ONE) {
+		/* Interpolate between 2^-floor(p) and 2^-ceil(p). */
+		const uint32_t floor_p = p >> DFRAC_BITS;
+		const uint32_t ceil_p = (p + DFRAC_ONE - 1) >> DFRAC_BITS;
+		const uint64_t frac_p = p - (floor_p << DFRAC_BITS);
+
+		return ((a >> floor_p) * (DFRAC_ONE - frac_p) +
+			(ceil_p >= 32 ? 0 : a >> ceil_p) * frac_p) >>
+		       DFRAC_BITS;
+	}
+
+	/* Short-circuit to avoid overflow. */
+	return 0;
+}
+
+/**
+ * Calculate the target P-state for the next update period.  Uses a
+ * (variably) low-pass-filtering controller intended to improve energy
+ * efficiency under some conditions controlled heuristically.
+ */
+static int32_t get_target_pstate_lp(struct cpudata *cpu)
+{
+	struct lp_data *lp = &cpu->lp;
+	/*
+	 * Estimate the average IO utilization over the sampling
+	 * interval.
+	 */
+	const uint64_t delta_ns = cpu->sample.time - cpu->last_sample_time;
+	const uint64_t io_active_ns = cpufreq_io_active_time_ns();
+	const uint32_t io_active_pml = div_fp(
+		(io_active_ns - lp->last_io_active_ns) * 1000,
+		delta_ns);
+	/*
+	 * Approximate, but saves two 64-bit integer divisions below
+	 * and should be fully evaluated at compile-time.  Causes the
+	 * exponential averaging to have an effective base of
+	 * 1.90702343749, which has little functional implications as
+	 * long as the io_active_avg_hz and p_base_avg_hz parameters
+	 * are scaled accordingly.
+	 */
+	const uint32_t ns_per_s_shift = order_base_2(NSEC_PER_SEC);
+	/*
+	 * Exponentially average the IO utilization observed during
+	 * the last interval in order to obtain a more long-term
+	 * statistic.  The exponent p is the ratio between the elapsed
+	 * time and an exponential averaging time constant:
+	 *
+	 *   T0 = s / io_active_avg_hz
+	 *
+	 * This time constant should typically be of the order of
+	 * magnitude of the time constant T1 of the low-pass filter in
+	 * order for the IO utilization statistic to contain a
+	 * non-negligible contribution from the behavior of the system
+	 * in the window of time in which the low-pass filter will be
+	 * rearranging the workload.  A longer time constant provides
+	 * a more stable statistic at the cost of making it less
+	 * responsive to changes in the behavior of the system.
+	 */
+	const uint32_t p = min((uint64_t)DFRAC_MAX_INT,
+			       (lp_params.io_active_avg_hz * delta_ns) >>
+			       (ns_per_s_shift - DFRAC_BITS));
+	const uint32_t io_active_avg_pml = io_active_pml +
+		decay(cpu->iowait_boost - io_active_pml, p);
+	/*
+	 * Whether the system is under close to full IO utilization,
+	 * which causes the controller to make a more conservative
+	 * trade-off between latency and energy usage, since the
+	 * system is close to IO-bound so performance isn't guaranteed
+	 * to scale further with increasing CPU frequency.
+	 *
+	 * If the workload *is* latency-bound despite the nearly full
+	 * IO utilization, the more conservative behavior of the
+	 * controller will result in negative feed-back causing the IO
+	 * utilization to drop under io_active_threshold_pml, at which
+	 * point the controller will switch back to the more
+	 * aggressively latency-minimizing mode, which will cause the
+	 * IO utilization to increase.
+	 *
+	 * This means that io_active_threshold_pml acts as a point of
+	 * stable equilibrium of the system for latency-bound IO
+	 * workloads, so it should be rather high in order to avoid
+	 * underutilizing IO devices around equilibrium, which may
+	 * hurt performance for such (rather pathological) workloads.
+	 * Applications that pipeline CPU and IO work shouldn't be
+	 * appreciably latency-bound and will be able to achieve full
+	 * IO utilization regardless of io_active_threshold_pml.  In
+	 * addition they will obtain comparatively lower energy usage
+	 * since the controller should hardly ever have to switch to
+	 * latency-minimizing mode in the steady state.
+	 */
+	const bool relax = rnd_fp(io_active_avg_pml) >=
+			   lp_params.io_active_threshold_pml;
+	/*
+	 * P-state limits in fixed-point as allowed by the policy.
+	 */
+	const int32_t p_min = int_tofp(max(cpu->pstate.min_pstate,
+					   cpu->min_perf_ratio));
+	const int32_t p_max = int_tofp(cpu->max_perf_ratio);
+	const int32_t p_cur = int_tofp(cpu->pstate.current_pstate);
+	/*
+	 * Observed average P-state during (a part of) the sampling
+	 * period.  The conservative path uses the TSC increment as
+	 * denominator which will give the minimum (arguably most
+	 * energy-efficient) P-state able to accomplish the observed
+	 * amount of work during the sampling period.
+	 *
+	 * The downside of that somewhat optimistic estimate is that
+	 * it can give a rather biased result for intermittent
+	 * latency-sensitive workloads, which may have to be completed
+	 * in a short window of time for the system to achieve maximum
+	 * performance, even though the average CPU utilization is
+	 * low.  For that reason the latency-minimizing heuristic uses
+	 * the MPERF increment as denominator instead, which will give
+	 * the P-state able to accomplish the observed amount of work
+	 * during the time that the processor was actually awake (in
+	 * C0 state specifically), which is approximately optimal
+	 * under the rather pessimistic assumption that the CPU work
+	 * cannot be parallelized with any other dependent IO work
+	 * that subsequently keeps the CPU idle (arguably in C1+
+	 * states), so MPERF provides an estimate of the time the CPU
+	 * actually had available to accomplish the observed work.
+	 */
+	const s64 n_obs = cpu->sample.aperf << cpu->aperf_mperf_shift;
+	const s64 n_max = (relax ? cpu->sample.tsc :
+			   cpu->sample.mperf << cpu->aperf_mperf_shift);
+	const int32_t p_obs = min(p_cur, div_fp(
+				     n_obs * cpu->pstate.max_pstate_physical,
+				     n_max));
+	/*
+	 * Average P-state that would have been observed at the target
+	 * CPU utilization.  A lower setpoint fraction gives the
+	 * controller a stronger upward bias and a larger room for the
+	 * system load to fluctuate between update periods, at the
+	 * cost of increasing energy usage.
+	 */
+	const int32_t p_tgt = mul_fp(lp->setpoint, p_cur);
+	/*
+	 * Unfiltered controller response for the observed average
+	 * performance during the last sampling period.	 This is the
+	 * simplest piecewise-linear function of p_obs that satisfies
+	 * the following properties:
+	 *
+	 *   - p_est(0) = p_min
+	 *   - p_est(p_tgt) = p_cur
+	 *   - p_est(p_cur) = p_max
+	 *
+	 * which ensure that the P-state range specified by the policy
+	 * is honored and that the controller has a fixed point at the
+	 * target utilization.
+	 */
+	const int32_t p_est = max(p_min,
+				  mul_fp(p_cur, ramp(0, p_tgt, p_obs)) +
+				  mul_fp(p_max - p_cur,
+					 ramp(p_tgt, p_cur, p_obs)));
+	/*
+	 * Low-pass filter the P-state estimate above by exponential
+	 * averaging.  For an oscillating workload (e.g. submitting
+	 * work repeatedly to a device like a soundcard or GPU) this
+	 * will approximate the minimum P-state that would be able to
+	 * accomplish the observed amount of work during the averaging
+	 * period, which is also the optimally energy-efficient one,
+	 * under the assumptions that:
+	 *
+	 *  - The power curve of the system is convex throughout the
+	 *    range of P-states allowed by the policy.	I.e. energy
+	 *    efficiency is steadily decreasing with frequency past
+	 *    p_min (which is typically close to the
+	 *    maximum-efficiency ratio).  In practice for the lower
+	 *    range of P-states this may only be approximately true
+	 *    due to the interaction between different components of
+	 *    the system.
+	 *
+	 *  - Parallelism constraints of the workload don't prevent it
+	 *    from achieving the same throughput at the lower P-state.
+	 *    This will happen in cases where the application is
+	 *    designed in a way that doesn't allow for dependent CPU
+	 *    and IO jobs to be pipelined, leading to alternating full
+	 *    and zero utilization of the CPU and IO device.  This
+	 *    will give an average IO device utilization lower than
+	 *    100% regardless of the CPU frequency, which is expected
+	 *    to cause the controller to transition to low-latency
+	 *    mode once the IO utilization drops below
+	 *    io_active_threshold_pml, at which point p_base will no
+	 *    longer have an influence on the controller response.
+	 *
+	 *  - The period of the oscillating workload is significantly
+	 *    shorter than the time constant of the exponential
+	 *    average:
+	 *
+	 *	T1 = s / p_base_avg_hz
+	 *
+	 *    Otherwise for more slowly oscillating workloads the
+	 *    controller response will roughly follow the oscillation,
+	 *    leading to decreased energy efficiency.
+	 *
+	 *  - The behavior of the workload doesn't change
+	 *    qualitatively during the next update interval.  This is
+	 *    only true in the steady state, and could possibly lead
+	 *    to a transitory period in which the controller response
+	 *    deviates from the most energy-efficient ratio until the
+	 *    workload reaches a steady state again.  This could be
+	 *    mitigated to a certain extent with some form of
+	 *    per-entity load tracking.
+	 */
+	const uint32_t q = min((uint64_t)DFRAC_MAX_INT,
+			       (lp_params.p_base_avg_hz * delta_ns) >>
+			       (ns_per_s_shift - DFRAC_BITS));
+	lp->p_base = p_est + decay(lp->p_base - p_est, q);
+	/*
+	 * Update busy_scaled with the utilization fraction relative
+	 * to the TSC base frequency.  Used for tracing.  100%
+	 * indicates full utilization at the maximum non-turbo
+	 * frequency.
+	 */
+	cpu->sample.busy_scaled = div_fp(100 * n_obs, cpu->sample.tsc);
+	/*
+	 * Track the IO utilization statistic as iowait_boost for it
+	 * to show up in traces instead of the iowait_boost fraction.
+	 */
+	cpu->iowait_boost = io_active_avg_pml;
+	lp->last_io_active_ns = io_active_ns;
+	/*
+	 * Use the low-pass-filtered controller response for better
+	 * energy efficiency unless we have reasons to believe that
+	 * some of the optimality assumptions discussed above may not
+	 * hold.
+	 */
+	return rnd_fp(relax ? lp->p_base : p_est);
+}
+
 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
 {
 	int max_pstate = intel_pstate_get_base_pstate(cpu);
@@ -1566,6 +1897,22 @@  static void intel_pstate_update_util(struct update_util_data *data, u64 time,
 	}
 }
 
+/**
+ * Implementation of the cpufreq update_util hook based on the LP
+ * controller (see get_target_pstate_lp()).
+ */
+static void intel_pstate_update_util_lp(struct update_util_data *data,
+					u64 time, unsigned int flags)
+{
+	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+	const u64 delta_ns = time - cpu->sample.time;
+
+	if (smp_processor_id() == cpu->cpu &&
+	    delta_ns >= cpu->lp.sample_interval_ns &&
+	    intel_pstate_sample(cpu, time))
+		intel_pstate_adjust_pstate(cpu, get_target_pstate_lp(cpu));
+}
+
 static struct pstate_funcs core_funcs = {
 	.get_max = core_get_max_pstate,
 	.get_max_physical = core_get_max_pstate_physical,
@@ -1616,7 +1963,7 @@  static const struct pstate_funcs bxt_funcs = {
 	.get_turbo = core_get_turbo_pstate,
 	.get_scaling = core_get_scaling,
 	.get_val = core_get_val,
-	.update_util = intel_pstate_update_util,
+	.update_util = intel_pstate_update_util_lp,
 };
 
 #define ICPU(model, policy) \
@@ -1695,6 +2042,10 @@  static int intel_pstate_init_cpu(unsigned int cpunum)
 
 	intel_pstate_get_cpu_pstates(cpu);
 
+	if (!hwp_active &&
+	    pstate_funcs.update_util == intel_pstate_update_util_lp)
+		intel_pstate_lp_reset(cpu);
+
 	pr_debug("controlling: cpu %d\n", cpunum);
 
 	return 0;
@@ -2287,9 +2638,7 @@  static int __init intel_pstate_init(void)
 
 	if (x86_match_cpu(hwp_support_ids)) {
 		copy_cpu_funcs(&core_funcs);
-		if (no_hwp) {
-			pstate_funcs.update_util = intel_pstate_update_util;
-		} else {
+		if (!no_hwp) {
 			hwp_active++;
 			intel_pstate.attr = hwp_cpufreq_attrs;
 			goto hwp_cpu_matched;