[1/9] cpufreq: Implement infrastructure keeping track of aggregated IO active time.

Message ID 20180328063845.4884-2-currojerez@riseup.net (mailing list archive)
State New, archived

Commit Message

Francisco Jerez March 28, 2018, 6:38 a.m. UTC
This provides an IO activity statistic to cpufreq governors,
complementary to the IO wait time currently available to them.  An IO
utilization estimate derived from this statistic that is significantly
below 100% can be interpreted as an indication that no IO device is
being utilized to its full throughput yet, and that overall system
performance has a good chance of scaling with increasing CPU
frequency.  An IO utilization close to 100% indicates that at all
times there was at least one active IO device; in that case the
system is not guaranteed to accomplish more work per unit of time
even if the CPU frequency could be increased further, which gives the
cpufreq governor an opportunity to save energy.

This patch uses a fairly minimal lockless approach to keep track of
the IO active time, relying only on an atomic counter of the number
of IO jobs in flight and another atomic variable that accumulates the
total amount of time spent with at least one active IO job since
system boot.  The cpufreq governor can estimate IO utilization by
periodically sampling increments of the IO active time and dividing
them by the corresponding increments of the system monotonic clock.
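
As a rough illustration, the sampling could look as follows from a
periodic governor callback (a minimal sketch only; prev_io_ns and
prev_time_ns stand for hypothetical fields of the governor's private
state, they are not part of this patch):

	u64 io_ns = cpufreq_io_active_time_ns();
	u64 time_ns = ktime_get_ns();
	/*
	 * Percentage of wall-clock time spent with at least one IO
	 * job in flight since the previous sample (assumes a
	 * non-zero sampling interval).
	 */
	u64 io_util_pct = div64_u64(100 * (io_ns - prev_io_ns),
				    time_ns - prev_time_ns);

	prev_io_ns = io_ns;
	prev_time_ns = time_ns;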

Under some circumstances it may be more accurate to estimate IO
utilization as the maximum IO utilization across all IO devices,
which could be achieved with a somewhat more complex tree-like data
structure.  The present approach is roughly equivalent for IO jobs
that are executed simultaneously, but IO jobs that don't overlap in
time will show up as the sum of the individual utilizations of each
IO device, which might be biased towards 100% if the non-overlapping
jobs were actually parallelizable.  On the other hand, in cases where
the tasks performed by different IO devices are interdependent, the
present approach provides a more accurate estimate, while the
alternative approach would be biased towards 0% and would likely
result in less energy-efficient behavior of the cpufreq governor.
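
As a concrete example, consider two IO devices that are each busy for
5 ms of a 10 ms sampling interval: if the two jobs don't overlap in
time, the present approach reports 10 ms of aggregated IO active time
(100% utilization) while a per-device maximum would report 5 ms
(50%); if the two jobs fully overlap, both approaches report 50%.
Which estimate is more accurate depends on whether the two jobs could
actually have been executed in parallel.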

Signed-off-by: Francisco Jerez <currojerez@riseup.net>
---
 drivers/cpufreq/cpufreq.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |  20 +++++
 2 files changed, 224 insertions(+)

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index de33ebf008ad..892709d0722e 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2444,6 +2444,210 @@  int cpufreq_boost_enabled(void)
 }
 EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
 
+/*********************************************************************
+ *               IO ACTIVE TIME ACCOUNTING                           *
+ *********************************************************************/
+
+/**
+ * Number of times cpufreq_io_active_begin() has been called so far without a
+ * matching cpufreq_io_active_end(), or IOW, the approximate number of IO jobs
+ * currently in flight.
+ */
+static atomic_t io_active_count;
+
+/**
+ * Total aggregated time that io_active_count has been greater than zero since
+ * system boot.  Negative values (in two's complement) represent a duration
+ * relative to the current time (typically used to implement the section
+ * between matching cpufreq_io_active_begin() and cpufreq_io_active_end()
+ * calls).  Positive values represent absolute durations and are smaller than
+ * IO_ACTIVE_TIME_M in magnitude.  In order to prevent the reduced integer
+ * range from introducing more frequent time wraparounds than in the rest of
+ * the kernel, time is represented with slightly lower precision than a
+ * ktime_t, in units of 4 ns.
+ */
+static atomic64_t io_active_time;
+
+/**
+ * Time of system boot, or one plus the maximum encoding of an absolute time
+ * duration.  Values greater or equal to this constant in magnitude are used to
+ * represent points in time rather than time durations; this guarantees that
+ * the maximum representable time duration can be subtracted from any point in
+ * time and still give a positive number as result, which is important due to
+ * the somewhat special semantics of the sign of io_active_time.
+ */
+#define IO_ACTIVE_TIME_M ((uint64_t)1 << 62)
+
+/**
+ * Return true if @t is a negative io_active_time value encoding a time
+ * duration relative to the current time.
+ */
+static bool io_active_time_is_relative(uint64_t t)
+{
+	return t >> 63;
+}
+
+/**
+ * Convert a duration or point in time into a scalar value in nanoseconds.
+ */
+static uint64_t io_active_time_to_ns(uint64_t t)
+{
+	return (t & (IO_ACTIVE_TIME_M - 1)) << 2;
+}
+
+/**
+ * Convert a scalar time value in nanoseconds into a point in time.
+ */
+static uint64_t io_active_time_from_ns(uint64_t ns)
+{
+	return IO_ACTIVE_TIME_M + (ns >> 2);
+}
+
+/**
+ * Mark the beginning of the processing of an IO job.  Each call of
+ * cpufreq_io_active_begin() must be accompanied by a corresponding call of
+ * cpufreq_io_active_end() after the IO job completes.
+ */
+void cpufreq_io_active_begin(void)
+{
+	/*
+	 * The body of the conditional below is executed only for the first of
+	 * any number of concurrent calls of cpufreq_io_active_begin(), it is
+	 * ordered after any io_active_time updates done by previous
+	 * invocations of cpufreq_io_active_end() (since those updates are
+	 * ordered before the atomic_cmpxchg_release operation that caused the
+	 * counter to drop to zero in the first place), and it is ordered with
+	 * respect to io_active_time updates done by concurrent invocations of
+	 * cpufreq_io_active_end() (since they won't modify io_active_time
+	 * until io_active_count has dropped to one, which implies that the
+	 * present cpufreq_io_active_begin() call and its matching
+	 * cpufreq_io_active_end() have completed).  Therefore code in this
+	 * block is effectively single-threaded.
+	 */
+	if (atomic_fetch_inc_acquire(&io_active_count) == 0) {
+		/*
+		 * Subtract the current time from io_active_time, which is
+		 * guaranteed to give a negative value (i.e. relative to the
+		 * current time) assuming that the precondition of
+		 * io_active_time being non-negative and lower than
+		 * IO_ACTIVE_TIME_M (i.e. an absolute time duration) is met.
+		 */
+		const uint64_t now = io_active_time_from_ns(ktime_get_ns());
+
+		atomic64_sub(now, &io_active_time);
+		/*
+		 * The barrier is provided for the io_active_time update above
+		 * to be correctly ordered with respect to subsequent memory
+		 * operations, in particular the ones leading to the eventual
+		 * execution of a matching cpufreq_io_active_end() call, which
+		 * must have the io_active_time update above visible.
+		 */
+		smp_wmb();
+	}
+}
+EXPORT_SYMBOL(cpufreq_io_active_begin);
+
+/**
+ * Mark the end of the processing of an IO job.  This must be called after the
+ * completion of the corresponding call of cpufreq_io_active_begin(), but there
+ * is no requirement for the two functions to be called from the same thread.
+ */
+void cpufreq_io_active_end(void)
+{
+	const uint64_t now = io_active_time_from_ns(ktime_get_ns());
+	uint64_t old_active, new_active = atomic_read(&io_active_count);
+	uint64_t begin;
+
+	do {
+		old_active = new_active;
+
+		/*
+		 * The body of the conditional below is ordered after a point
+		 * in which the present thread was executing the only active
+		 * begin/end section remaining in the system.  This implies
+		 * that no concurrent begin/end section can possibly have
+		 * started before the present begin/end section, since
+		 * otherwise the last read of io_active_count would have
+		 * returned a value greater than one.  Therefore all
+		 * concurrent begin/end sections are ordered with respect to
+		 * this section's begin, and cannot possibly have observed an
+		 * io_active_count value lower than two at any point of their
+		 * execution.  That makes code in this block (and in the other
+		 * old_active == 1 conditional further down) effectively
+		 * single-threaded until the atomic cmpxchg operation below
+		 * succeeds.
+		 */
+		if (old_active == 1) {
+			/*
+			 * Update io_active_time to reflect the time spent
+			 * between this (potentially last) call of
+			 * cpufreq_io_active_end() and the matching call of
+			 * cpufreq_io_active_begin().  This doesn't use an
+			 * atomic add in order to prevent overflow which could
+			 * lead to a sign inconsistency every ~584 years.
+			 *
+			 * If now is lower than -begin because the system's
+			 * monotonic clock has wrapped around since the
+			 * matching call of cpufreq_io_active_begin(), the sum
+			 * below will give a time duration result off by
+			 * IO_ACTIVE_TIME_M, which is taken care of with a
+			 * little bitwise arithmetic.
+			 */
+			begin = atomic64_read(&io_active_time);
+			WARN_ON(!io_active_time_is_relative(begin));
+			atomic64_set(&io_active_time,
+				     (begin + now) & (IO_ACTIVE_TIME_M - 1));
+		}
+
+		/*
+		 * If old_active is one at this point we have the guarantee
+		 * that there will be no concurrent update of io_active_time
+		 * between our last update above and the atomic cmpxchg
+		 * operation below, so io_active_time is guaranteed to be
+		 * non-negative as a postcondition of this function when
+		 * io_active_count is successfully decremented to zero by the
+		 * following cmpxchg operation.
+		 */
+		new_active = atomic_cmpxchg_release(
+			&io_active_count, old_active, old_active - 1);
+
+		/*
+		 * Roll back to the original io_active_time value if somebody
+		 * called cpufreq_io_active_begin() concurrently after our
+		 * previous read of io_active_count, which means that this
+		 * maybe wasn't the last call of cpufreq_io_active_end() after
+		 * all.
+		 */
+		if (old_active == 1 && old_active != new_active)
+			atomic64_set(&io_active_time, begin);
+
+		/*
+		 * Retry if another thread modified the counter in parallel
+		 * preventing the atomic cmpxchg operation above from
+		 * committing our updated value.
+		 */
+	} while (old_active != new_active);
+}
+EXPORT_SYMBOL(cpufreq_io_active_end);
+
+/**
+ * Return the total accumulated time in nanoseconds that the system has spent
+ * doing IO processing since boot, defined as the total time spent between
+ * matching calls of cpufreq_io_active_begin() and cpufreq_io_active_end().
+ */
+uint64_t cpufreq_io_active_time_ns(void)
+{
+	const uint64_t time = atomic64_read(&io_active_time);
+	const uint64_t now = io_active_time_from_ns(ktime_get_ns());
+
+	if (io_active_time_is_relative(time))
+		return io_active_time_to_ns(time + now);
+	else
+		return io_active_time_to_ns(time);
+}
+EXPORT_SYMBOL(cpufreq_io_active_time_ns);
+
+
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
  *********************************************************************/
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 21e8d248d956..2107a1169cce 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -950,7 +950,27 @@  static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 }
 #endif
 
+#ifdef CONFIG_CPU_FREQ
+void cpufreq_io_active_begin(void);
+void cpufreq_io_active_end(void);
+uint64_t cpufreq_io_active_time_ns(void);
+#else
+static inline void cpufreq_io_active_begin(void)
+{
+}
+
+static inline void cpufreq_io_active_end(void)
+{
+}
+
+static inline uint64_t cpufreq_io_active_time_ns(void)
+{
+	return 0;
+}
+#endif
+
 extern void arch_freq_prepare_all(void);
+
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
 extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
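
For reference, this is how the producer side of the API could be used
from a hypothetical driver's IO submission and completion paths
(illustration only, not part of this patch; the foo_* names are made
up):

	/* IO submission path of a hypothetical driver. */
	static void foo_submit_job(struct foo_device *dev,
				   struct foo_job *job)
	{
		cpufreq_io_active_begin();
		foo_hw_submit(dev, job);
	}

	/*
	 * IO completion path, e.g. an interrupt handler.  Note that
	 * the end call isn't required to happen on the same thread or
	 * CPU as the matching begin call.
	 */
	static void foo_complete_job(struct foo_device *dev,
				     struct foo_job *job)
	{
		foo_hw_retire(dev, job);
		cpufreq_io_active_end();
	}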