diff mbox

[RFC,2/2] sched: idle: IRQ based next prediction for idle period

Message ID 1452093774-17831-3-git-send-email-daniel.lezcano@linaro.org (mailing list archive)
State RFC, archived
Headers show

Commit Message

Daniel Lezcano Jan. 6, 2016, 3:22 p.m. UTC
Many IRQs are quiet most of the time, or they tend to come in bursts of
fairly equal time intervals within each burst. It is therefore possible
to detect those IRQs with stable intervals and guestimate when the next
IRQ event is most likely to happen.

Examples of such IRQs may include audio related IRQs where the FIFO size
and/or DMA descriptor size with the sample rate create stable intervals,
block devices during large data transfers, etc.  Even network streaming
of multimedia content creates patterns of periodic network interface IRQs
in some cases.

This patch adds code to track the mean interval and variance for each IRQ
over a window of time intervals between IRQ events. Those statistics can
be used to assist cpuidle in selecting the most appropriate sleep state
by predicting the most likely time for the next interrupt.

Because the stats are gathered in interrupt context, the core computation
is as light as possible.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/Kconfig   |   5 +
 kernel/irq/Kconfig        |   1 -
 kernel/sched/Makefile     |   1 +
 kernel/sched/idle-sched.c | 549 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 555 insertions(+), 1 deletion(-)
 create mode 100644 kernel/sched/idle-sched.c

Comments

Nicolas Pitre Jan. 6, 2016, 5:40 p.m. UTC | #1
On Wed, 6 Jan 2016, Daniel Lezcano wrote:

> Many IRQs are quiet most of the time, or they tend to come in bursts of
> fairly equal time intervals within each burst. It is therefore possible
> to detect those IRQs with stable intervals and guestimate when the next
> IRQ event is most likely to happen.
> 
> Examples of such IRQs may include audio related IRQs where the FIFO size
> and/or DMA descriptor size with the sample rate create stable intervals,
> block devices during large data transfers, etc.  Even network streaming
> of multimedia content creates patterns of periodic network interface IRQs
> in some cases.
> 
> This patch adds code to track the mean interval and variance for each IRQ
> over a window of time intervals between IRQ events. Those statistics can
> be used to assist cpuidle in selecting the most appropriate sleep state
> by predicting the most likely time for the next interrupt.
> 
> Because the stats are gathered in interrupt context, the core computation
> is as light as possible.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

There are still a few problems with this patch.

Please see comments below.

> ---
>  drivers/cpuidle/Kconfig   |   5 +
>  kernel/irq/Kconfig        |   1 -
>  kernel/sched/Makefile     |   1 +
>  kernel/sched/idle-sched.c | 549 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 555 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/sched/idle-sched.c
> 
> diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
> index 8c7930b..dd17215 100644
> --- a/drivers/cpuidle/Kconfig
> +++ b/drivers/cpuidle/Kconfig
> @@ -25,6 +25,11 @@ config CPU_IDLE_GOV_MENU
>  	bool "Menu governor (for tickless system)"
>  	default y
>  
> +config CPU_IDLE_GOV_SCHED
> +	bool "Sched idle governor"
> +	select IRQ_TIMINGS
> +	default y
> +
>  config DT_IDLE_STATES
>  	bool
>  
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index 1275fd1..81557ae 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -75,7 +75,6 @@ config HANDLE_DOMAIN_IRQ
>  
>  config IRQ_TIMINGS
>          bool
> -	default y
>  
>  config IRQ_DOMAIN_DEBUG
>  	bool "Expose hardware/virtual IRQ mapping via debugfs"
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index 6768797..f7d5a35 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
>  obj-$(CONFIG_SCHEDSTATS) += stats.o
>  obj-$(CONFIG_SCHED_DEBUG) += debug.o
>  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
> +obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o
> diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c
> new file mode 100644
> index 0000000..a8e7557
> --- /dev/null
> +++ b/kernel/sched/idle-sched.c
> @@ -0,0 +1,549 @@
> +/*
> + *  Copyright (C) 2016 Linaro Ltd, Daniel Lezcano <daniel.lezcano@linaro.org>
> + *                                 Nicolas Pitre <nicolas.pitre@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +#include <linux/cpuidle.h>
> +#include <linux/interrupt.h>
> +#include <linux/irqdesc.h>
> +#include <linux/ktime.h>
> +#include <linux/slab.h>
> +#include <linux/tick.h>
> +#include <linux/time64.h>
> +
> +/*
> + * Define the number of samples over which the average and variance
> + * are computed. A power of 2 is preferred so to let the compiler
> + * optimize divisions by that number with simple arithmetic shifts.
> + */
> +#define STATS_NR_VALUES 4
> +
> +/**
> + * struct stats - internal structure to encapsulate stats information
> + *
> + * @sum: sum of the values
> + * @values: array of values to do stats on
> + * @n: current number of values
> + * @w_ptr: current buffer pointer
> + */
> +struct stats {
> +	u64          sum;                     /* sum of values */
> +	u32          values[STATS_NR_VALUES]; /* array of values */
> +	unsigned int w_ptr;                   /* current window pointer */
> +};
> +
> +/**
> + * struct wakeup - internal structure describing a source of wakeup
> + *
> + * @stats: the stats structure on the different event intervals
> + * @timestamp: latest update timestamp
> + */
> +struct wakeup {
> +	struct stats stats;
> +	ktime_t timestamp;
> +};
> +
> +/*
> + * Per cpu and irq statistics. Each cpu receives interrupts and those
> + * ones can be distributed following an irq chip specific
> + * algorithm. Random irq distribution is the worst case to predict
> + * interruption behavior but usually that does not happen or could be
> + * fixed from userspace by setting the irq affinity.
> + */
> +static DEFINE_PER_CPU(struct wakeup, *wakeups[NR_IRQS]);
> +
> +/**
> + * stats_add - add a new value in the statistic structure
> + *
> + * @s: the statistic structure
> + * @value: the new value to be added
> + *
> + * Adds the value to the array, if the array is full, the oldest value
> + * is replaced.
> + */
> +static void stats_add(int irq, struct stats *s, u32 value)
> +{
> +	/*
> +	 * This is a circular buffer, so the oldest value is
> +	 * the next one in the buffer. Let's compute the next
> +	 * pointer to retrieve the oldest value and re-use it
> +	 * to update the w_ptr after adding the new value.
> +	 */
> +	unsigned int w_ptr = s->w_ptr;
> +
> +	/*
> +	 * Remove the oldest value from the summing. If this is the
> +	 * first time we go through this array slot, the previous
> +	 * value will be zero and we won't subtract anything from the
> +	 * current sum.  Hence this code relies on a zero-ed stat
> +	 * structure at init time via memset or kzalloc.
> +	 */
> +	s->sum -= s->values[w_ptr];
> +	s->values[w_ptr] = value;
> +	s->w_ptr = (w_ptr + 1) % STATS_NR_VALUES;
> +
> +        /*
> +	 * In order to reduce the overhead and to prevent value
> +	 * derivation due to the integer computation, we just sum the
> +	 * value and do the division when the average and the variance
> +	 * are requested.
> +         */
> +	s->sum += value;
> +}
> +
> +/**
> + * stats_reset - reset the stats
> + *
> + * @s: the statistic structure
> + *
> + * Reset the statistics and reset the values
> + */
> +static inline void stats_reset(struct stats *s)
> +{
> +	memset(s, 0, sizeof(*s));
> +}
> +
> +/**
> + * stats_mean - compute the average
> + *
> + * @s: the statistics structure
> + *
> + * Returns an u32 corresponding to the mean value, or zero if there is
> + * no data
> + */
> +static inline u32 stats_mean(struct stats *s)
> +{
> +	/*
> +	 * gcc is smart enough to convert to a bits shift when the
> +	 * divisor is constant and multiple of 2^x.
> +	 *
> +	 * The number of values could have not reached STATS_NR_VALUES
> +	 * yet, but we can consider it acceptable as the situation is
> +	 * only at the very first beginning in the system life cycle.
> +	 */
> +	return s->sum / STATS_NR_VALUES;
> +}
> +
> +/**
> + * stats_variance - compute the variance
> + *
> + * @s: the statistic structure
> + *
> + * Returns an u64 corresponding to the variance, or zero if there is
> + * no data
> + */
> +static u64 stats_variance(struct stats *s, u32 mean)
> +{
> +	int i;
> +	u64 variance = 0;
> +
> +	/*
> +	 * The variance is the sum of the squared difference to the
> +	 * average divided by the number of elements.
> +	 */
> +	for (i = 0; i < STATS_NR_VALUES; i++) {
> +		s32 diff = s->values[i] - mean;
> +		variance += (u64)diff * diff;

Strictly speaking, diff should be cast to s64 as it is a signed value 
that may actually be negative.  Because of the strange C type promotion 
rules, the generated code appears correct (at least on ARM), but it 
would be clearer to use s64 anyway.

The product will end up being positive in all cases of course so 
variance may remain as a u64.

> +	}
> +
> +	return variance / STATS_NR_VALUES;
> +}
> +
> +/**
> + * sched_idle_irq - irq timestamp callback
> + *
> + * @irq: the irq number
> + * @timestamp: when the interrupt occurred
> + * @dev_id: device id for shared interrupt (not yet used)
> + * @data: private data given when registering this callback
> + *
> + * Interrupt callback called when an interrupt happens. This function
> + * is critical as it is called under an interrupt section: minimum
> + * operations as possible are done here:
> + */
> +static void sched_idle_irq(unsigned int irq, ktime_t timestamp,
> +			   void *dev_id, void *data)
> +{
> +	u32 diff;
> +	unsigned int cpu = raw_smp_processor_id();
> +	struct wakeup *w = per_cpu(wakeups[irq], cpu);
> +
> +	if (!w)
> +		return;
> +
> +	/*
> +	 * It is the first time the interrupt occurs of the series, we
> +	 * can't do any stats as we don't have an interval, just store
> +	 * the timestamp and exit.
> +	 */
> +	if (ktime_equal(w->timestamp, ktime_set(0, 0))) {
> +		w->timestamp = timestamp;
> +		return;
> +	}
> +
> +	/*
> +	 * Microsec resolution is enough for our purpose.
> +	 */
> +	diff = ktime_us_delta(timestamp, w->timestamp);
> +	w->timestamp = timestamp;
> +	stats_add(irq, &w->stats, diff);
> +}
> +
> +/*
> + * Callback to be called when an interrupt happens.
> + */
> +static struct irqtimings irq_timings = {
> +	.handler = sched_idle_irq,
> +};
> +
> +static ktime_t next_irq_event(void)
> +{
> +	unsigned int irq, cpu = raw_smp_processor_id();
> +	ktime_t diff, next, min = { .tv64 = KTIME_MAX };

To avoid exposing the ktime_t definition, you should use 
ktime_set(KTIME_SEC_MAX, 0) instead of { .tv64 = KTIME_MAX }.

> +	struct wakeup *w;
> +	u32 interval, mean;
> +	u64 variance;
> +
> +	/*
> +	 * Lookup the interrupt array for this cpu and search for the
> +	 * earlier expected interruption.
> +	 */
> +        for (irq = 0; irq < NR_IRQS; irq++) {
> +
> +		ktime_t now = ktime_get();

ktime_get() is potentially non-trivial. It should be plenty good enough 
to call it only once outside the loop.

> +		w = per_cpu(wakeups[irq], cpu);
> +
> +		/*
> +		 * The interrupt was not setup as a source of a wakeup
> +		 * or the wakeup source is not considered at this
> +		 * moment stable enough to do a prediction.
> +		 */
> +		if (!w)
> +			continue;
> +
> +		/*
> +		 * No statistics available yet.
> +		 */
> +		if (ktime_equal(w->timestamp, ktime_set(0, 0)))
> +			continue;
> +
> +		diff = ktime_sub(now, w->timestamp);
> +
> +		/*
> +		 * There is no point attempting predictions on interrupts more
> +		 * than 1 second apart. This has no benefit for sleep state
> +		 * selection and increases the risk of overflowing our variance
> +		 * computation. Reset all stats in that case.
> +		 */
> +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
> +			stats_reset(&w->stats);
> +			continue;
> +		}

The above is wrong. It is not computing the interval between successive 
interrupts but rather the interval between the last interrupt occurrence 
and the present time (i.e. when we're about to go idle).  This won't 
prevent interrupt intervals greater than one second from being summed 
and potentially overflowing the variance if this code is executed less 
than a second after one such IRQ interval.  This test should rather be 
performed in sched_idle_irq().

> +		 * If the mean value is null, just ignore this wakeup
> +		 * source.
> +		 */
> +		mean = stats_mean(&w->stats);
> +		if (!mean)
> +			continue;
> +
> +		variance = stats_variance(&w->stats, mean);
> +		/*
> +		 * We want to check the last interval is:
> +		 *
> +		 *  mean - stddev < interval < mean + stddev
> +		 *
> +		 * That simplifies to:
> +		 *
> +		 * -stddev < interval - mean < stddev
> +		 *
> +		 * abs(interval - mean) < stddev
> +		 *
> +		 * The standard deviation is the sqrt of the variance:
> +		 *
> +		 * abs(interval - mean) < sqrt(variance)
> +		 *
> +		 * and we want to prevent to do an sqrt, so we square
> +		 * the equation:
> +		 *
> +		 * (interval - mean)^2 < variance
> +		 *
> +		 * So if the latest value of the stats complies with
> +		 * this condition, then the wakeup source is
> +		 * considered predictable and can be used to predict
> +		 * the next event.
> +		 */
> +		interval = w->stats.values[(w->stats.w_ptr + 1) % STATS_NR_VALUES];

But here (w->stats.w_ptr + 1) % STATS_NR_VALUES does not point at the 
last interval. It rather points at the second oldest.

To make things simpler, you should rather pre-increment the pointer 
before updating the array in stats_add(), and here the value you want 
will directly be accessible with w->stats.values[w->stats.w_ptr].

> +		if ((u64)((interval - mean) * (interval - mean)) > variance)
> +			continue;

Same comment as in stats_variance(): this should be s64.

> +		/*
> +		 * Let's compute the next event: the wakeup source is
> +		 * considered predictable, we add the average interval
> +		 * time added to the latest interruption event time.
> +		 */
> +		next = ktime_add_us(w->timestamp, stats_mean(&w->stats));
> +
> +		/*
> +		 * If the interrupt is supposed to happen before the
> +		 * minimum time, then it becomes the minimum.
> +		 */
> +		if (ktime_before(next, min))
> +			min = next;
> +	}
> +
> +	/*
> +	 * At this point, we have our prediction but the caller is
> +	 * expecting the remaining time before the next event, so
> +	 * compute the expected sleep length.
> +	 */
> +	diff = ktime_sub(min, ktime_get());

You should use the variable 'now' rather than asking for the current 
time again.

> +	/*
> +	 * The result could be negative for different reasons:
> +	 *  - the prediction is incorrect
> +	 *  - the prediction was too near now and expired while we were
> +	 *    in this function
> +	 *
> +	 * In both cases, we return KTIME_MAX as a failure to do a
> +	 * prediction
> +	 */
> +	if (ktime_compare(diff, ktime_set(0, 0)) <= 0)
> +		return (ktime_t){ .tv64 = KTIME_MAX };

See my comment about ktime_t internals at the beginning of this 
function.

> +
> +	return diff;
> +}
> +
> +/**
> + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
> + *
> + * The next event on the cpu is based on a statistic approach of the
> + * interrupt events and the timer deterministic value. From the timer
> + * or the irqs, we return the one expected to occur first.
> + *
> + * Returns the expected remaining idle time before being woken up by
> + * an interruption.
> + */
> +s64 sched_idle_next_wakeup(void)
> +{
> +	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
> +	s64 next_irq = ktime_to_us(next_irq_event());
> +
> +	return min(next_irq, next_timer);
> +}
> +
> +/**
> + * sched_idle - go to idle for a specified amount of time
> + *
> + * @duration: the idle duration time
> + * @latency: the latency constraint
> + *
> + * Returns 0 on success, < 0 otherwise.
> + */
> +int sched_idle(s64 duration, unsigned int latency)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +	struct cpuidle_state_usage *su;
> +	struct cpuidle_state *s;
> +	int i, ret = 0, index = -1;
> +
> +	rcu_idle_enter();
> +
> +	/*
> +	 * No cpuidle driver is available, let's use the default arch
> +	 * idle function.
> +	 */
> +	if (cpuidle_not_available(drv, dev))
> +		goto default_idle;
> +
> +	/*
> +	 * Find the idle state with the lowest power while satisfying
> +	 * our constraints. We will save energy if the duration of the
> +	 * idle time is bigger than the target residency which is the
> +	 * break even point. The choice will be modulated by the
> +	 * latency.
> +	 */
> +	for (i = 0; i < drv->state_count; i++) {
> +
> +		s = &drv->states[i];
> +
> +		su = &dev->states_usage[i];
> +
> +		if (s->disabled || su->disable)
> +			continue;
> +		if (s->target_residency > duration)
> +			continue;
> +		if (s->exit_latency > latency)
> +			continue;
> +
> +		index = i;
> +	}
> +
> +	/*
> +	 * The idle task must be scheduled, it is pointless to go to
> +	 * idle, just re-enable the interrupt and return.
> +	 */
> +	if (current_clr_polling_and_test()) {
> +		local_irq_enable();
> +		goto out;
> +	}
> +
> +	if (index < 0) {
> +		/*
> +		 * No idle callbacks fulfilled the constraints, jump
> +		 * to the default function like there wasn't any
> +		 * cpuidle driver.
> +		 */
> +		goto default_idle;
> +	} else {
> +		/*
> +		 * Enter the idle state previously returned by the
> +		 * governor decision.  This function will block until
> +		 * an interrupt occurs and will take care of
> +		 * re-enabling the local interrupts
> +		 */
> +		return cpuidle_enter(drv, dev, index);
> +	}
> +
> +default_idle:
> +	default_idle_call();
> +out:
> +	rcu_idle_exit();
> +	return ret;
> +}
> +
> +/**
> + * sched_irq_timing_free - free_irq callback
> + *
> + * @irq: the irq number to stop tracking
> + * @dev_id: not used at the moment
> + *
> + * This function will remove from the wakeup source prediction table.
> + */
> +static void sched_irq_timing_free(unsigned int irq, void *dev_id)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);
> +	struct wakeup *w;
> +	unsigned int cpu;
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	desc->timings = NULL;
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +
> +	for_each_possible_cpu(cpu) {
> +		w = per_cpu(wakeups[irq], cpu);
> +		if (!w)
> +			continue;
> +
> +		per_cpu(wakeups[irq], cpu) = NULL;
> +		kfree(w);
> +	}
> +}
> +
> +/**
> + * sched_irq_timing_setup - setup_irq callback
> + *
> + * @irq: the interrupt number to be tracked
> + * @act: the new irq action to be set to this interrupt
> + *
> + * Update the irq table to be tracked in order to predict the next event.
> + *
> + * Returns zero on success. On error it returns -ENOMEM.
> + */
> +static int sched_irq_timing_setup(unsigned int irq, struct irqaction *act)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);
> +	struct wakeup *w;
> +	unsigned int cpu;
> +	unsigned long flags;
> +	int ret = -ENOMEM;
> +
> +	/*
> +	 * No interrupt set for this descriptor or related to a timer.
> +	 * Timers are deterministic, so no need to try to do any
> +	 * prediction on them. No error for both cases, we are just not
> +	 * interested.
> +	 */
> +	if (!act || (act->flags & __IRQF_TIMER))
> +		return 0;
> +
> +	/*
> +	 * Allocates the wakeup structure and the stats structure. As
> +	 * the interrupt can occur on any cpu, allocate the wakeup
> +	 * structure per cpu basis.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +
> +		w = kzalloc(sizeof(*w), GFP_KERNEL);
> +		if (!w)
> +			goto undo;
> +
> +		per_cpu(wakeups[irq], cpu) = w;
> +	}
> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	desc->timings = &irq_timings;
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +
> +	ret = 0;
> +undo:
> +	/*
> +	 * Rollback all allocations on failure
> +	 */
> +	if (ret)
> +		sched_irq_timing_free(irq, act->dev_id);
> +
> +	return ret;
> +}
> +
> +/*
> + * Setup/free irq callbacks
> + */
> +static struct irqtimings_ops irqt_ops = {
> +	.setup = sched_irq_timing_setup,
> +	.free = sched_irq_timing_free,
> +};
> +
> +/**
> + * sched_idle_init - setup the interrupt tracking table
> + *
> + * At init time, some interrupts could have been setup and in the
> + * system life time, some devices could be setup. In order to track
> + * all interrupts we are interested in, we first register a couple of
> + * callback to keep up-to-date the interrupt tracking table and then
> + * we setup the table with the interrupt which were already set up.
> + */
> +int __init sched_idle_init(void)
> +{
> +	struct irq_desc *desc;
> +	unsigned int irq;
> +	int ret;
> +
> +	/*
> +	 * Register the setup/free irq callbacks, so new interrupt or
> +	 * freed interrupt will update their tracking.
> +	 */
> +	ret = register_irq_timings(&irqt_ops);
> +	if (ret) {
> +		pr_err("Failed to register timings ops\n");
> +		return ret;
> +	}
> +
> +	/*
> +	 * For all the irq already setup, assign the timing callback.
> +	 * All interrupts with their desc NULL will be discarded.
> +	 */
> +	for_each_irq_desc(irq, desc)
> +		sched_irq_timing_setup(irq, desc->action);
> +
> +	return 0;
> +}
> +late_initcall(sched_idle_init);
> -- 
> 1.9.1
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 7, 2016, 3:42 p.m. UTC | #2
On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
> On Wed, 6 Jan 2016, Daniel Lezcano wrote:
>
>> Many IRQs are quiet most of the time, or they tend to come in bursts of
>> fairly equal time intervals within each burst. It is therefore possible
>> to detect those IRQs with stable intervals and guestimate when the next
>> IRQ event is most likely to happen.
>>
>> Examples of such IRQs may include audio related IRQs where the FIFO size
>> and/or DMA descriptor size with the sample rate create stable intervals,
>> block devices during large data transfers, etc.  Even network streaming
>> of multimedia content creates patterns of periodic network interface IRQs
>> in some cases.
>>
>> This patch adds code to track the mean interval and variance for each IRQ
>> over a window of time intervals between IRQ events. Those statistics can
>> be used to assist cpuidle in selecting the most appropriate sleep state
>> by predicting the most likely time for the next interrupt.
>>
>> Because the stats are gathered in interrupt context, the core computation
>> is as light as possible.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>
> There are still a few problems with this patch.
>
> Please see comments below.

[ ... ]

>> +/**
>> + * stats_variance - compute the variance
>> + *
>> + * @s: the statistic structure
>> + *
>> + * Returns an u64 corresponding to the variance, or zero if there is
>> + * no data
>> + */
>> +static u64 stats_variance(struct stats *s, u32 mean)
>> +{
>> +	int i;
>> +	u64 variance = 0;
>> +
>> +	/*
>> +	 * The variance is the sum of the squared difference to the
>> +	 * average divided by the number of elements.
>> +	 */
>> +	for (i = 0; i < STATS_NR_VALUES; i++) {
>> +		s32 diff = s->values[i] - mean;
>> +		variance += (u64)diff * diff;
>
> Strictly speaking, diff should be cast to s64 as it is a signed value
> that may actually be negative.  Because of the strange C type promotion
> rules, the generated code appears correct (at least on ARM), but it
> would be clearer to use s64 anyway.

I don't get the connection in your explanation of why it should be a 
s64. It is already a signed s32, s->values[] are s32 and mean is u32.

What would be the benefit to convert diff to s64 ?

> The product will end up being positive in all cases of course so
> variance may remain as a u64.
>

[ ... ]

>> +static ktime_t next_irq_event(void)
>> +{
>> +	unsigned int irq, cpu = raw_smp_processor_id();
>> +	ktime_t diff, next, min = { .tv64 = KTIME_MAX };
>
> To avoid exposing the ktime_t definition, you should use
> ktime_set(KTIME_SEC_MAX, 0) instead of { .tv64 = KTIME_MAX }.

Ok.

>> +	struct wakeup *w;
>> +	u32 interval, mean;
>> +	u64 variance;
>> +
>> +	/*
>> +	 * Lookup the interrupt array for this cpu and search for the
>> +	 * earlier expected interruption.
>> +	 */
>> +        for (irq = 0; irq < NR_IRQS; irq++) {
>> +
>> +		ktime_t now = ktime_get();
>
> ktime_get() is potentially non-trivial. It should be plenty good enough
> to call it only once outside the loop.

Ok.

>> +		w = per_cpu(wakeups[irq], cpu);
>> +
>> +		/*
>> +		 * The interrupt was not setup as a source of a wakeup
>> +		 * or the wakeup source is not considered at this
>> +		 * moment stable enough to do a prediction.
>> +		 */
>> +		if (!w)
>> +			continue;
>> +
>> +		/*
>> +		 * No statistics available yet.
>> +		 */
>> +		if (ktime_equal(w->timestamp, ktime_set(0, 0)))
>> +			continue;
>> +
>> +		diff = ktime_sub(now, w->timestamp);
>> +
>> +		/*
>> +		 * There is no point attempting predictions on interrupts more
>> +		 * than 1 second apart. This has no benefit for sleep state
>> +		 * selection and increases the risk of overflowing our variance
>> +		 * computation. Reset all stats in that case.
>> +		 */
>> +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
>> +			stats_reset(&w->stats);
>> +			continue;
>> +		}
>
> The above is wrong. It is not computing the interval between successive
> interrupts but rather the interval between the last interrupt occurrence
> and the present time (i.e. when we're about to go idle).  This won't
> prevent interrupt intervals greater than one second from being summed
> and potentially overflowing the variance if this code is executed less
> than a second after one such IRQ interval.  This test should rather be
> performed in sched_idle_irq().

Ok, I will move it.

>> +		 * If the mean value is null, just ignore this wakeup
>> +		 * source.
>> +		 */
>> +		mean = stats_mean(&w->stats);
>> +		if (!mean)
>> +			continue;
>> +
>> +		variance = stats_variance(&w->stats, mean);
>> +		/*
>> +		 * We want to check the last interval is:
>> +		 *
>> +		 *  mean - stddev < interval < mean + stddev
>> +		 *
>> +		 * That simplifies to:
>> +		 *
>> +		 * -stddev < interval - mean < stddev
>> +		 *
>> +		 * abs(interval - mean) < stddev
>> +		 *
>> +		 * The standard deviation is the sqrt of the variance:
>> +		 *
>> +		 * abs(interval - mean) < sqrt(variance)
>> +		 *
>> +		 * and we want to prevent to do an sqrt, so we square
>> +		 * the equation:
>> +		 *
>> +		 * (interval - mean)^2 < variance
>> +		 *
>> +		 * So if the latest value of the stats complies with
>> +		 * this condition, then the wakeup source is
>> +		 * considered predictable and can be used to predict
>> +		 * the next event.
>> +		 */
>> +		interval = w->stats.values[(w->stats.w_ptr + 1) % STATS_NR_VALUES];
>
> But here (w->stats.w_ptr + 1) % STATS_NR_VALUES does not point at the
> last interval. It rather points at the second oldest.
> To make things simpler, you should rather pre-increment the pointer
> before updating the array in stats_add(), and here the value you want
> will directly be accessible with w->stats.values[w->stats.w_ptr].

Yeah, there is a glitch here. I will look at it.

>> +		if ((u64)((interval - mean) * (interval - mean)) > variance)
>> +			continue;
>
> Same comment as in stats_variance(): this should be s64.
>
>> +		/*
>> +		 * Let's compute the next event: the wakeup source is
>> +		 * considered predictable, we add the average interval
>> +		 * time added to the latest interruption event time.
>> +		 */
>> +		next = ktime_add_us(w->timestamp, stats_mean(&w->stats));
>> +
>> +		/*
>> +		 * If the interrupt is supposed to happen before the
>> +		 * minimum time, then it becomes the minimum.
>> +		 */
>> +		if (ktime_before(next, min))
>> +			min = next;
>> +	}
>> +
>> +	/*
>> +	 * At this point, we have our prediction but the caller is
>> +	 * expecting the remaining time before the next event, so
>> +	 * compute the expected sleep length.
>> +	 */
>> +	diff = ktime_sub(min, ktime_get());
>
> You should use the variable 'now' rather than asking for the current
> time again.

Yep.

>> +	/*
>> +	 * The result could be negative for different reasons:
>> +	 *  - the prediction is incorrect
>> +	 *  - the prediction was too near now and expired while we were
>> +	 *    in this function
>> +	 *
>> +	 * In both cases, we return KTIME_MAX as a failure to do a
>> +	 * prediction
>> +	 */
>> +	if (ktime_compare(diff, ktime_set(0, 0)) <= 0)
>> +		return (ktime_t){ .tv64 = KTIME_MAX };
>
> See my comment about ktime_t internals at the beginning of this
> function.

Ok.

Thanks for the review.

   -- Daniel
Thomas Gleixner Jan. 8, 2016, 3:43 p.m. UTC | #3
On Wed, 6 Jan 2016, Daniel Lezcano wrote:
> +/**
> + * sched_irq_timing_free - free_irq callback
> + *
> + * @irq: the irq number to stop tracking
> + * @dev_id: not used at the moment
> + *
> + * This function will remove from the wakeup source prediction table.
> + */
> +static void sched_irq_timing_free(unsigned int irq, void *dev_id)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);

What has this code to fiddle with irq_desc? Nothing!

> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	desc->timings = &irq_timings;
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);

Bah, no. This belongs into the core code. Add that handler - and yes, that's
all you need - to your timing_ops and let the core code do it in a proper way.

> +/**
> + * sched_idle_init - setup the interrupt tracking table
> + *
> + * At init time, some interrupts could have been setup and in the
> + * system life time, some devices could be setup. In order to track
> + * all interrupts we are interested in, we first register a couple of
> + * callback to keep up-to-date the interrupt tracking table and then
> + * we setup the table with the interrupt which were already set up.
> + */
> +int __init sched_idle_init(void)
> +{
> +	struct irq_desc *desc;
> +	unsigned int irq;
> +	int ret;
> +
> +	/*
> +	 * Register the setup/free irq callbacks, so new interrupt or
> +	 * freed interrupt will update their tracking.
> +	 */
> +	ret = register_irq_timings(&irqt_ops);
> +	if (ret) {
> +		pr_err("Failed to register timings ops\n");
> +		return ret;
> +	}

So that stuff is installed unconditionally. Is it used unconditionally as
well?

> +	/*
> +	 * For all the irq already setup, assign the timing callback.
> +	 * All interrupts with their desc NULL will be discarded.
> +	 */
> +	for_each_irq_desc(irq, desc)
> +		sched_irq_timing_setup(irq, desc->action);

No, no, no. This belongs into the core code register_irq_timings() function
which installs the handler into the irq descs with the proper protections and
once it has done that enables the static key.

The above is completely unprotected against interrupts being setup or even
freed concurrently.

Aside of that, you call that setup function in setup_irq for each action() and
here you call it only for the first one.

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 10, 2016, 10:37 p.m. UTC | #4
On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
> On Wed, 6 Jan 2016, Daniel Lezcano wrote:
>
>> Many IRQs are quiet most of the time, or they tend to come in bursts of
>> fairly equal time intervals within each burst. It is therefore possible
>> to detect those IRQs with stable intervals and guestimate when the next
>> IRQ event is most likely to happen.
>>
>> Examples of such IRQs may include audio related IRQs where the FIFO size
>> and/or DMA descriptor size with the sample rate create stable intervals,
>> block devices during large data transfers, etc.  Even network streaming
>> of multimedia content creates patterns of periodic network interface IRQs
>> in some cases.
>>
>> This patch adds code to track the mean interval and variance for each IRQ
>> over a window of time intervals between IRQ events. Those statistics can
>> be used to assist cpuidle in selecting the most appropriate sleep state
>> by predicting the most likely time for the next interrupt.
>>
>> Because the stats are gathered in interrupt context, the core computation
>> is as light as possible.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

[ ... ]

>> +
>> +		diff = ktime_sub(now, w->timestamp);
>> +
>> +		/*
>> +		 * There is no point attempting predictions on interrupts more
>> +		 * than 1 second apart. This has no benefit for sleep state
>> +		 * selection and increases the risk of overflowing our variance
>> +		 * computation. Reset all stats in that case.
>> +		 */
>> +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
>> +			stats_reset(&w->stats);
>> +			continue;
>> +		}
>
> The above is wrong. It is not computing the interval between successive
> > interrupts but rather the interval between the last interrupt occurrence
> and the present time (i.e. when we're about to go idle).  This won't
> prevent interrupt intervals greater than one second from being summed
> and potentially overflowing the variance if this code is executed less
> than a second after one such IRQ interval.  This test should rather be
> performed in sched_idle_irq().

Hi Nico,

I have been through here again and think we should duplicate the test 
because there are two cases:

1. We did not go idle and the interval measured in sched_idle_irq is 
more than one second, then the stats are reset. I suggest to use an 
approximation of one second: (diff < (1 << 20)) as we are in the fast
path.

2. We are going idle and the latest interrupt happened one second apart 
from now. So we keep the current test.

   -- Daniel
Nicolas Pitre Jan. 10, 2016, 10:46 p.m. UTC | #5
On Sun, 10 Jan 2016, Daniel Lezcano wrote:

> On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
> > On Wed, 6 Jan 2016, Daniel Lezcano wrote:
> >
> > > Many IRQs are quiet most of the time, or they tend to come in bursts of
> > > fairly equal time intervals within each burst. It is therefore possible
> > > to detect those IRQs with stable intervals and guestimate when the next
> > > IRQ event is most likely to happen.
> > >
> > > Examples of such IRQs may include audio related IRQs where the FIFO size
> > > and/or DMA descriptor size with the sample rate create stable intervals,
> > > block devices during large data transfers, etc.  Even network streaming
> > > of multimedia content creates patterns of periodic network interface IRQs
> > > in some cases.
> > >
> > > This patch adds code to track the mean interval and variance for each IRQ
> > > over a window of time intervals between IRQ events. Those statistics can
> > > be used to assist cpuidle in selecting the most appropriate sleep state
> > > by predicting the most likely time for the next interrupt.
> > >
> > > Because the stats are gathered in interrupt context, the core computation
> > > is as light as possible.
> > >
> > > Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> 
> [ ... ]
> 
> > > +
> > > +		diff = ktime_sub(now, w->timestamp);
> > > +
> > > +		/*
> > > +		 * There is no point attempting predictions on interrupts more
> > > +		 * than 1 second apart. This has no benefit for sleep state
> > > +		 * selection and increases the risk of overflowing our
> > > variance
> > > +		 * computation. Reset all stats in that case.
> > > +		 */
> > > +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
> > > +			stats_reset(&w->stats);
> > > +			continue;
> > > +		}
> >
> > The above is wrong. It is not computing the interval between successive
> > interrupts but rather the interval between the last interrupt occurrence
> > and the present time (i.e. when we're about to go idle).  This won't
> > prevent interrupt intervals greater than one second from being summed
> > and potentially overflowing the variance if this code is executed less
> > than a second after one such IRQ interval.  This test should rather be
> > performed in sched_idle_irq().
> 
> Hi Nico,
> 
> I have been through here again and think we should duplicate the test because
> there are two cases:
> 
> 1. We did not go idle and the interval measured in sched_idle_irq is more than
> one second, then the stats are reset. I suggest to use an approximation of one
> second: (diff < (1 << 20)) as we are in the fast
> path.
> 
> 2. We are going idle and the latest interrupt happened one second apart from
> now. So we keep the current test.

You don't need the current test if the interval is already limited 
earlier on.  Predictions that would otherwise trip that test will target 
a time in the past and be discarded.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 10, 2016, 10:58 p.m. UTC | #6
On 01/10/2016 11:46 PM, Nicolas Pitre wrote:
> On Sun, 10 Jan 2016, Daniel Lezcano wrote:
>
>> On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
>>> On Wed, 6 Jan 2016, Daniel Lezcano wrote:
>>>
>>>> Many IRQs are quiet most of the time, or they tend to come in bursts of
>>>> fairly equal time intervals within each burst. It is therefore possible
>>>> to detect those IRQs with stable intervals and guestimate when the next
>>>> IRQ event is most likely to happen.
>>>>
>>>> Examples of such IRQs may include audio related IRQs where the FIFO size
>>>> and/or DMA descriptor size with the sample rate create stable intervals,
>>>> block devices during large data transfers, etc.  Even network streaming
>>>> of multimedia content creates patterns of periodic network interface IRQs
>>>> in some cases.
>>>>
>>>> This patch adds code to track the mean interval and variance for each IRQ
>>>> over a window of time intervals between IRQ events. Those statistics can
>>>> be used to assist cpuidle in selecting the most appropriate sleep state
>>>> by predicting the most likely time for the next interrupt.
>>>>
>>>> Because the stats are gathered in interrupt context, the core computation
>>>> is as light as possible.
>>>>
>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>>
>> [ ... ]
>>
>>>> +
>>>> +		diff = ktime_sub(now, w->timestamp);
>>>> +
>>>> +		/*
>>>> +		 * There is no point attempting predictions on interrupts more
>>>> +		 * than 1 second apart. This has no benefit for sleep state
>>>> +		 * selection and increases the risk of overflowing our
>>>> variance
>>>> +		 * computation. Reset all stats in that case.
>>>> +		 */
>>>> +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
>>>> +			stats_reset(&w->stats);
>>>> +			continue;
>>>> +		}
>>>
>>> The above is wrong. It is not computing the interval between successive
>>> interrupts but rather the interval between the last interrupt occurrence
>>> and the present time (i.e. when we're about to go idle).  This won't
>>> prevent interrupt intervals greater than one second from being summed
>>> and potentially overflowing the variance if this code is executed less
>>> than a second after one such IRQ interval.  This test should rather be
>>> performed in sched_idle_irq().
>>
>> Hi Nico,
>>
>> I have been through here again and think we should duplicate the test because
>> there are two cases:
>>
>> 1. We did not go idle and the interval measured in sched_idle_irq is more than
>> one second, then the stats are reset. I suggest to use an approximation of one
>> second: (diff < (1 << 20)) as we are in the fast
>> path.
>>
>> 2. We are going idle and the latest interrupt happened one second apart from
>> now. So we keep the current test.
>
> You don't need the current test if the interval is already limited
> earlier on.  Predictions that would otherwise trip that test will target
> a time in the past and be discarded.

Yes, but that wake up source should be discarded in the process of the 
selection, so ignored in the loop, otherwise it can end up as the next 
event (which is obviously wrong) and discarded at the end by returning 
KTIME_MAX, instead of giving the opportunity to find another interrupt 
as the next event with a greater value.
Nicolas Pitre Jan. 10, 2016, 11:13 p.m. UTC | #7
On Sun, 10 Jan 2016, Daniel Lezcano wrote:

> On 01/10/2016 11:46 PM, Nicolas Pitre wrote:
> > On Sun, 10 Jan 2016, Daniel Lezcano wrote:
> >
> > > On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
> > > > On Wed, 6 Jan 2016, Daniel Lezcano wrote:
> > > >
> > > > > Many IRQs are quiet most of the time, or they tend to come in bursts
> > > > > of
> > > > > fairly equal time intervals within each burst. It is therefore
> > > > > possible
> > > > > to detect those IRQs with stable intervals and guestimate when the
> > > > > next
> > > > > IRQ event is most likely to happen.
> > > > >
> > > > > Examples of such IRQs may include audio related IRQs where the FIFO
> > > > > size
> > > > > and/or DMA descriptor size with the sample rate create stable
> > > > > intervals,
> > > > > block devices during large data transfers, etc.  Even network
> > > > > streaming
> > > > > of multimedia content creates patterns of periodic network interface
> > > > > IRQs
> > > > > in some cases.
> > > > >
> > > > > This patch adds code to track the mean interval and variance for each
> > > > > IRQ
> > > > > over a window of time intervals between IRQ events. Those statistics
> > > > > can
> > > > > be used to assist cpuidle in selecting the most appropriate sleep
> > > > > state
> > > > > by predicting the most likely time for the next interrupt.
> > > > >
> > > > > Because the stats are gathered in interrupt context, the core
> > > > > computation
> > > > > is as light as possible.
> > > > >
> > > > > Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> > >
> > > [ ... ]
> > >
> > > > > +
> > > > > +		diff = ktime_sub(now, w->timestamp);
> > > > > +
> > > > > +		/*
> > > > > +		 * There is no point attempting predictions on
> > > > > interrupts more
> > > > > +		 * than 1 second apart. This has no benefit for sleep
> > > > > state
> > > > > +		 * selection and increases the risk of overflowing our
> > > > > variance
> > > > > +		 * computation. Reset all stats in that case.
> > > > > +		 */
> > > > > +		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
> > > > > +			stats_reset(&w->stats);
> > > > > +			continue;
> > > > > +		}
> > > >
> > > > The above is wrong. It is not computing the interval between successive
> > > > interrupts but rather the interval between the last interrupt occurrence
> > > > and the present time (i.e. when we're about to go idle).  This won't
> > > > prevent interrupt intervals greater than one second from being summed
> > > > and potentially overflowing the variance if this code is executed less
> > > > than a second after one such IRQ interval.  This test should rather be
> > > > performed in sched_idle_irq().
> > >
> > > Hi Nico,
> > >
> > > I have been through here again and think we should duplicate the test
> > > because
> > > there are two cases:
> > >
> > > 1. We did not go idle and the interval measured in sched_idle_irq is more
> > > than
> > > one second, then the stats are reset. I suggest to use an approximation of
> > > one
> > > second: (diff < (1 << 20)) as we are in the fast
> > > path.
> > >
> > > 2. We are going idle and the latest interrupt happened one second apart
> > > from
> > > now. So we keep the current test.
> >
> > You don't need the current test if the interval is already limited
> > earlier on.  Predictions that would otherwise trip that test will target
> > a time in the past and be discarded.
> 
> Yes, but that wake up source should be discarded in the process of the
> selection, so ignored in the loop, otherwise it can end up as the next event
> (which is obviously wrong) and discarded at the end by returning KTIME_MAX,
> instead of giving the opportunity to find another interrupt as the next event
> with a greater value.

The loop should always discard any prediction for a past time and move 
to the next, not only at the end.

If interrupt stats are not gathered if an interval is greater than one 
second, that means no prediction will ever be more than one second away 
from the last IRQ occurrence.  If you ask for a prediction one second 
after the last IRQ then those predictions will all be in the past.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 12, 2016, 12:41 p.m. UTC | #8
On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
> On Wed, 6 Jan 2016, Daniel Lezcano wrote:
>> +/**
>> + * sched_irq_timing_free - free_irq callback
>> + *
>> + * @irq: the irq number to stop tracking
>> + * @dev_id: not used at the moment
>> + *
>> + * This function will remove from the wakeup source prediction table.
>> + */
>> +static void sched_irq_timing_free(unsigned int irq, void *dev_id)
>> +{
>> +	struct irq_desc *desc = irq_to_desc(irq);
>
> What has this code to fiddle with irq_desc? Nothing!
>
>> +	raw_spin_lock_irqsave(&desc->lock, flags);
>> +	desc->timings = &irq_timings;
>> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
>
> Bah, no. This belongs into the core code. Add that handler - and yes, that's
> all you need - to your timing_ops and let the core code do it in a proper way.

Ah, yes, you are right. That will be much more cleaner.

>> +/**
>> + * sched_idle_init - setup the interrupt tracking table
>> + *
>> + * At init time, some interrupts could have been setup and in the
>> + * system life time, some devices could be setup. In order to track
>> + * all interrupts we are interested in, we first register a couple of
>> + * callback to keep up-to-date the interrupt tracking table and then
>> + * we setup the table with the interrupt which were already set up.
>> + */
>> +int __init sched_idle_init(void)
>> +{
>> +	struct irq_desc *desc;
>> +	unsigned int irq;
>> +	int ret;
>> +
>> +	/*
>> +	 * Register the setup/free irq callbacks, so new interrupt or
>> +	 * freed interrupt will update their tracking.
>> +	 */
>> +	ret = register_irq_timings(&irqt_ops);
>> +	if (ret) {
>> +		pr_err("Failed to register timings ops\n");
>> +		return ret;
>> +	}
>
> So that stuff is installed unconditionally. Is it used unconditionally as
> well?

Sorry, I am not sure I understand your question. If the kernel is 
compiled with CONFIG_CPU_IDLE_GOV_SCHED=y, this code is enabled and uses 
the irq timings. The condition comes from the compilation option.

Is that your question ?

>> +	/*
>> +	 * For all the irq already setup, assign the timing callback.
>> +	 * All interrupts with their desc NULL will be discarded.
>> +	 */
>> +	for_each_irq_desc(irq, desc)
>> +		sched_irq_timing_setup(irq, desc->action);
>
> No, no, no. This belongs into the core code register_irq_timings() function
> which installs the handler into the irq descs with the proper protections and
> once it has done that enables the static key.

Ok.

> The above is completely unprotected against interrupts being setup or even
> freed concurrently.

Ok, I will do the change.

> Aside of that, you call that setup function in setup_irq for each action() and
> here you call it only for the first one.

Ah, right.

Thanks for the review.

   -- Daniel
Thomas Gleixner Jan. 12, 2016, 1:42 p.m. UTC | #9
On Tue, 12 Jan 2016, Daniel Lezcano wrote:
> On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
> > > +	/*
> > > +	 * Register the setup/free irq callbacks, so new interrupt or
> > > +	 * freed interrupt will update their tracking.
> > > +	 */
> > > +	ret = register_irq_timings(&irqt_ops);
> > > +	if (ret) {
> > > +		pr_err("Failed to register timings ops\n");
> > > +		return ret;
> > > +	}
> > 
> > So that stuff is installed unconditionally. Is it used unconditionally as
> > well?
> 
> Sorry, I am not sure to understand your question. If the kernel is compiled
> with CONFIG_CPU_IDLE_GOV_SCHED=y, this code is enabled and use the irq
> timings. The condition comes from the compilation option.

The question is whether the option also activates that thing or is there still
some /sys/whatever/idlegov magic where you can (de)select it.

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 12, 2016, 2:16 p.m. UTC | #10
On 01/12/2016 02:42 PM, Thomas Gleixner wrote:
> On Tue, 12 Jan 2016, Daniel Lezcano wrote:
>> On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
>>>> +	/*
>>>> +	 * Register the setup/free irq callbacks, so new interrupt or
>>>> +	 * freed interrupt will update their tracking.
>>>> +	 */
>>>> +	ret = register_irq_timings(&irqt_ops);
>>>> +	if (ret) {
>>>> +		pr_err("Failed to register timings ops\n");
>>>> +		return ret;
>>>> +	}
>>>
>>> So that stuff is installed unconditionally. Is it used unconditionally as
>>> well?
>>
>> Sorry, I am not sure to understand your question. If the kernel is compiled
>> with CONFIG_CPU_IDLE_GOV_SCHED=y, this code is enabled and use the irq
>> timings. The condition comes from the compilation option.
>
> The question is whether the option also activates that thing or is there still
> some /sys/whatever/idlegov magic where you can (de)select it.

Yes, in the next patches of the series I did not send, we can switch to 
the cpuidle's governor framework or idle-sched. I will look at how to 
disable it when switching to the cpuidle's governors.

Thanks

   -- Daniel
Thomas Gleixner Jan. 12, 2016, 2:26 p.m. UTC | #11
On Tue, 12 Jan 2016, Daniel Lezcano wrote:
> On 01/12/2016 02:42 PM, Thomas Gleixner wrote:
> > On Tue, 12 Jan 2016, Daniel Lezcano wrote:
> > > On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
> > > > > +	/*
> > > > > +	 * Register the setup/free irq callbacks, so new interrupt or
> > > > > +	 * freed interrupt will update their tracking.
> > > > > +	 */
> > > > > +	ret = register_irq_timings(&irqt_ops);
> > > > > +	if (ret) {
> > > > > +		pr_err("Failed to register timings ops\n");
> > > > > +		return ret;
> > > > > +	}
> > > > 
> > > > So that stuff is installed unconditionally. Is it used unconditionally
> > > > as
> > > > well?
> > > 
> > > Sorry, I am not sure to understand your question. If the kernel is
> > > compiled
> > > with CONFIG_CPU_IDLE_GOV_SCHED=y, this code is enabled and use the irq
> > > timings. The condition comes from the compilation option.
> > 
> > The question is whether the option also activates that thing or is there
> > still
> > some /sys/whatever/idlegov magic where you can (de)select it.
> 
> Yes, in the next patches of the series I did not send, we can switch to the
> cpuidle's governor framework or idle-sched. I will look at how to disable it
> when switching to the cpuidle's governors.

You better implement the switching part in the cpuidle core first, i.e. proper
callbacks when a governor is switched in/out. Then make use of this switcheroo
right away. Doing it the other way round is just wrong.

Thanks,

	tglx


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 12, 2016, 2:52 p.m. UTC | #12
On 01/12/2016 03:26 PM, Thomas Gleixner wrote:
> On Tue, 12 Jan 2016, Daniel Lezcano wrote:
>> On 01/12/2016 02:42 PM, Thomas Gleixner wrote:
>>> On Tue, 12 Jan 2016, Daniel Lezcano wrote:
>>>> On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
>>>>>> +	/*
>>>>>> +	 * Register the setup/free irq callbacks, so new interrupt or
>>>>>> +	 * freed interrupt will update their tracking.
>>>>>> +	 */
>>>>>> +	ret = register_irq_timings(&irqt_ops);
>>>>>> +	if (ret) {
>>>>>> +		pr_err("Failed to register timings ops\n");
>>>>>> +		return ret;
>>>>>> +	}
>>>>>
>>>>> So that stuff is installed unconditionally. Is it used unconditionally
>>>>> as
>>>>> well?
>>>>
>>>> Sorry, I am not sure to understand your question. If the kernel is
>>>> compiled
>>>> with CONFIG_CPU_IDLE_GOV_SCHED=y, this code is enabled and use the irq
>>>> timings. The condition comes from the compilation option.
>>>
>>> The question is whether the option also activates that thing or is there
>>> still
>>> some /sys/whatever/idlegov magic where you can (de)select it.
>>
>> Yes, in the next patches of the series I did not send, we can switch to the
>> cpuidle's governor framework or idle-sched. I will look at how to disable it
>> when switching to the cpuidle's governors.
>
> You better implement the switching part in the cpuidle core first, i.e. proper
> callbacks when a governor is switched in/out. Then make use of this switcheroo
> right away. Doing it the other way round is just wrong.

The problem is this code is not another governor but a 'predictor' where 
the scheduler will use the information to ask the cpuidle to go to a 
specific idle state without going through the governor code, so into the 
governor's callbacks. It is on top of cpuidle. The scheduler will become 
the governor.

The current straightforward code, does the switch in the cpu_idle_loop 
idle_task's function:

[ ... ]

if (cpu_idle_force_poll || tick_check_broadcast_expired())
	cpu_idle_poll();
else {
	if (sched_idle_enabled()) {
		int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
		s64 duration = sched_idle_next_wakeup();
		sched_idle(duration, latency);
	} else {
		cpuidle_idle_call();
	}
}

Due to the complexity of the code, this first step introduces a mechanism 
to predict the next event and re-uses it trivially in the idle task.

Perhaps, it would be acceptable to have cpuidle_idle_call() to be 
replaced by a callback and the switch acts at this level ?
Thomas Gleixner Jan. 12, 2016, 3:12 p.m. UTC | #13
On Tue, 12 Jan 2016, Daniel Lezcano wrote:
> On 01/12/2016 03:26 PM, Thomas Gleixner wrote:
> > You better implement the switching part in the cpuidle core first, i.e.
> > proper
> > callbacks when a governor is switched in/out. Then make use of this
> > switcheroo
> > right away. Doing it the other way round is just wrong.
> 
> The problem is this code is not another governor but a 'predictor' where the
> scheduler will use the information to ask the cpuidle to go to a specific idle
> state without going through the governor code, so into the governor's
> callbacks. It is on top of cpuidle. The scheduler will become the governor.
> 
> The current straightforward code, does the switch in the cpu_idle_loop
> idle_task's function:
> 
> [ ... ]
> 
> if (cpu_idle_force_poll || tick_check_broadcast_expired())
> 	cpu_idle_poll();
> else {
> 	if (sched_idle_enabled()) {
> 		int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
> 		s64 duration = sched_idle_next_wakeup();
> 		sched_idle(duration, latency);
> 	} else {
> 		cpuidle_idle_call();
> 	}
> }
> 
> Due to the complexity of the code, this first step introduce a mechanism to
> predict the next event and re-use it trivially in the idle task.

This looks really wrong. Why on earth don't you implement a proper governor
and just get rid of this extra hackery?

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 12, 2016, 4:04 p.m. UTC | #14
On 01/12/2016 04:12 PM, Thomas Gleixner wrote:
> On Tue, 12 Jan 2016, Daniel Lezcano wrote:
>> On 01/12/2016 03:26 PM, Thomas Gleixner wrote:
>>> You better implement the switching part in the cpuidle core first, i.e.
>>> proper
>>> callbacks when a governor is switched in/out. Then make use of this
>>> switcheroo
>>> right away. Doing it the other way round is just wrong.
>>
>> The problem is this code is not another governor but a 'predictor' where the
>> scheduler will use the information to ask the cpuidle to go to a specific idle
>> state without going through the governor code, so into the governor's
>> callbacks. It is on top of cpuidle. The scheduler will become the governor.
>>
>> The current straightforward code, does the switch in the cpu_idle_loop
>> idle_task's function:
>>
>> [ ... ]
>>
>> if (cpu_idle_force_poll || tick_check_broadcast_expired())
>> 	cpu_idle_poll();
>> else {
>> 	if (sched_idle_enabled()) {
>> 		int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
>> 		s64 duration = sched_idle_next_wakeup();
>> 		sched_idle(duration, latency);
>> 	} else {
>> 		cpuidle_idle_call();
>> 	}
>> }
>>
>> Due to the complexity of the code, this first step introduce a mechanism to
>> predict the next event and re-use it trivially in the idle task.
>
> This looks really wrong. Why on earth don't you implement a proper governor
> and just get rid of this extra hackery?

That is part of the ongoing work where we are integrating the different 
PM subsystems with the scheduler in order have them collaborating 
together as asked by Ingo [1].

The idea is to get rid of the governors and let the scheduler to tell 
the Cpuidle framework : "I expect to sleep <x> nsec and I have a <y>
nsec latency requirement" as stated by Peter Zijlstra [2].

The code above could be not nice but I think it begins to do what is 
expecting Peter. It is an embryonic code and in order to prevent too 
much different topics for a single series, I posted the two first 
patches which provide the next event API as the foundations for the next 
changes. How we integrate the 'next event' is a question I wanted to 
postpone in a different series of RFC patches.

   -- Daniel

[1] http://lwn.net/Articles/552885/
[2] https://lkml.org/lkml/2013/11/11/353 (broken here is another archive [3]
[3] http://lkml.iu.edu/hypermail/linux/kernel/1311.1/01360.html
Nicolas Pitre Jan. 12, 2016, 7:27 p.m. UTC | #15
On Thu, 7 Jan 2016, Daniel Lezcano wrote:

> On 01/06/2016 06:40 PM, Nicolas Pitre wrote:
> > On Wed, 6 Jan 2016, Daniel Lezcano wrote:
> >
> > > Many IRQs are quiet most of the time, or they tend to come in bursts of
> > > fairly equal time intervals within each burst. It is therefore possible
> > > to detect those IRQs with stable intervals and guestimate when the next
> > > IRQ event is most likely to happen.
> > >
> > > Examples of such IRQs may include audio related IRQs where the FIFO size
> > > and/or DMA descriptor size with the sample rate create stable intervals,
> > > block devices during large data transfers, etc.  Even network streaming
> > > of multimedia content creates patterns of periodic network interface IRQs
> > > in some cases.
> > >
> > > This patch adds code to track the mean interval and variance for each IRQ
> > > over a window of time intervals between IRQ events. Those statistics can
> > > be used to assist cpuidle in selecting the most appropriate sleep state
> > > by predicting the most likely time for the next interrupt.
> > >
> > > Because the stats are gathered in interrupt context, the core computation
> > > is as light as possible.
> > >
> > > Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> >
> > There are still a few problems with this patch.
> >
> > Please see comments below.
> 
> [ ... ]
> 
> > > +/**
> > > + * stats_variance - compute the variance
> > > + *
> > > + * @s: the statistic structure
> > > + *
> > > + * Returns an u64 corresponding to the variance, or zero if there is
> > > + * no data
> > > + */
> > > +static u64 stats_variance(struct stats *s, u32 mean)
> > > +{
> > > +	int i;
> > > +	u64 variance = 0;
> > > +
> > > +	/*
> > > +	 * The variance is the sum of the squared difference to the
> > > +	 * average divided by the number of elements.
> > > +	 */
> > > +	for (i = 0; i < STATS_NR_VALUES; i++) {
> > > +		s32 diff = s->values[i] - mean;
> > > +		variance += (u64)diff * diff;
> >
> > Strictly speaking, diff should be casted to s64 as it is a signed value
> > that may actually be negative.  Because of the strange C type promotion
> > rules, the generated code appears correct (at least on ARM), but it
> > would be clearer to use s64 anyway.
> 
> I don't get the connection in your explanation of why it should be a s64. It
> is already a signed s32, s->values[] are s32 and mean is u32.
> 
> What would be the benefit to convert diff to s64 ?

Suppose diff ends up being -1. Normally, -1 * -1 = 1.

Now you cast the first term to a 64-bit value so the result of the 
product becomes a 64-bit value to avoid overflows.  However you cast it 
to an _unsigned_ 64-bit value.  Therefore you could expect something 
like 0x00000000ffffffff * -1 would take place and that wouldn't produce 
1 for result at all.

Yet, the C standard is weird and completely unintuitive sometimes.  
Here is an example of such a time.  Even though you cast the first term 
to an unsigned value, the compiler is performing sign extension 
nevertheless.  So casting to u64 or s64 doesn't change anything in 
practice.  But for a human reading the code, using s64 will be more 
intuitive.

> > The product will end up being positive in all cases of course so
> > variance may remain as a u64.
> >
> 
> [ ... ]


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Gleixner Jan. 13, 2016, 9:17 a.m. UTC | #16
On Tue, 12 Jan 2016, Daniel Lezcano wrote:
> On 01/12/2016 04:12 PM, Thomas Gleixner wrote:
> > This looks really wrong. Why on earth don't you implement a proper governor
> > and just get rid of this extra hackery?
> 
> That is part of the ongoing work where we are integrating the different PM
> subsystems with the scheduler in order have them collaborating together as
> asked by Ingo [1].
> 
> The idea is to get rid of the governors and let the scheduler to tell the
> Cpuidle framework : "I expect to sleep <x> nsec and I have a <y>
> nsec latency requirement" as stated by Peter Zijlstra [2].
> 
> The code above could be not nice but I think it begins to do what is expecting
> Peter. It is an embryonic code and in order to prevent too much different
> topics for a single series, I posted the two first patches which provide the
> next event API as the foundations for the next changes. How we integrate the
> 'next event' is a question I wanted to postpone in a different series of RFC
> patches.

Fair enough.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 18, 2016, 1:21 p.m. UTC | #17
On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
> On Wed, 6 Jan 2016, Daniel Lezcano wrote:

[ ... ]

>> +	/*
>> +	 * For all the irq already setup, assign the timing callback.
>> +	 * All interrupts with their desc NULL will be discarded.
>> +	 */
>> +	for_each_irq_desc(irq, desc)
>> +		sched_irq_timing_setup(irq, desc->action);
>
> No, no, no. This belongs into the core code register_irq_timings() function
> which installs the handler into the irq descs with the proper protections and
> once it has done that enables the static key.
>
> The above is completely unprotected against interrupts being setup or even
> freed concurrently.
>
> Aside of that, you call that setup function in setup_irq for each action() and
> here you call it only for the first one.

Hi Thomas,

I went through the different comments and almost finished the changes 
but I think the 'register_ops' approach, which happens after some irq 
were setup, introduces some useless complexity and because of the desc 
lock section, the ops can't do memory allocation. Before going further, 
I am wondering if declaring the irq_timings_ops statically (read without 
'register_ops' - hence without a init time dependency) and calling the 
init/free ops from alloc_desc/free_desc wouldn't be cleaner and simpler.

What do you think ?

   -- Daniel
Thomas Gleixner Jan. 20, 2016, 3:41 p.m. UTC | #18
On Mon, 18 Jan 2016, Daniel Lezcano wrote:
> On 01/08/2016 04:43 PM, Thomas Gleixner wrote:
> > The above is completely unprotected against interrupts being setup or even
> > freed concurrently.
> > 
> > Aside of that, you call that setup function in setup_irq for each action()
> > and
> > here you call it only for the first one.
> 
> I went through the different comments and almost finished the changes but I
> think the 'register_ops' approach, which happens after some irq were setup,
> introduces some useless complexity and because of the desc lock section, the
> ops can't do memory allocation.

You can't protect that with desc_lock. You need to take the sparse_irq_lock,
which is a mutex, to protect the irq desc walk.

> Before going further, I am wondering if declaring the irq_timings_ops
> statically (read without 'register_ops' - hence without a init time
> dependency) and calling the init/free ops from alloc_desc/free_desc wouldn't
> be cleaner and simpler.

Then you don't need those ops at all. You can make it simple function calls,
which get compiled out if that stuff is not enabled.

Thanks,

	tglx


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Jan. 20, 2016, 4 p.m. UTC | #19
The current approach to select an idle state is based on the idle period
statistics computation.

Needless to say, this approach satisfied everyone as a solution to find the
best trade-off between the performances and the energy saving via the menu
governor.

However, the kernel is evolving to act pro-actively regarding the energy
constraints with the scheduler and the different power management subsystems
are not collaborating with the scheduler as the conductor of the decisions,
they all act independently.

The cpuidle governors are based on idle period statistics, without knowledge
of what woke up the cpu. In these sources of wakes up, the IPI are of course
accounted (as well as the timers irq) which results in doing statistics on
the scheduler behavior too. It makes no sense to let the scheduler take a
decision based on a prediction of its own decisions.

In order to integrate the cpuidle framework into the scheduler, we have to
radically change the approach by clearly identifying what is causing a wake
up and how it behaves.

This series inverts the logic.

Instead of tracking the idle durations and do statistics on them, these
patches track the interrupt individually and try to predict the next interrupt.

By combining the interrupts' next event on a single CPU, we can predict the
next event for the CPU, hence predict how long we will be sleeping when
entering idle.

The IPI and timer interrupts are not taken into account.

The first patch provides a callback to be registered in the irq subsystem
and to be called when an interrupt is handled with a timestamp.

The second patch uses the callback provided by the patch above to compute
the delta and store it in a circular buffer. It is per cpu, the callback
implements minimal operations as it is in an interrupt context.

When the cpu enters idle, it asks for the expected sleep time. Then the
expected minimum sleep length for all interrupts is used and compared to
the timer sleep length, again the minimum is taken and gives the expected
sleep time.

The statistics are very trivial and could be improved later but this first
step shows we have a nice overall improvement in SMP. In UP the menu governor
is a bit better which may indicate the next prediction computation could be
improved, but confirms that removing the IPI from the equation increases the
accuracy.

Changelog:
	V2:
	   - Changed the register_ops approach for the irq subsystem
           - Fixed Nicolas's comments

Daniel Lezcano (2):
  irq: Add a framework to measure interrupt timings
  sched: idle: IRQ based next prediction for idle period

 drivers/cpuidle/Kconfig    |   9 +
 include/linux/interrupt.h  |  26 +++
 include/linux/irqhandler.h |   1 +
 kernel/irq/Kconfig         |   4 +
 kernel/irq/handle.c        |   1 +
 kernel/irq/internals.h     |  43 ++++
 kernel/irq/irqdesc.c       |   6 +
 kernel/irq/manage.c        |  10 +-
 kernel/sched/Makefile      |   1 +
 kernel/sched/idle-sched.c  | 529 +++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 629 insertions(+), 1 deletion(-)
 create mode 100644 kernel/sched/idle-sched.c
Daniel Lezcano Jan. 20, 2016, 4 p.m. UTC | #20
The current approach to select an idle state is based on the idle period
statistics computation.

Useless to say this approach satisfied everyone as a solution to find the
best trade-off between the performances and the energy saving via the menu
governor.

However, the kernel is evolving to act pro-actively regarding the energy
constraints with the scheduler and the different power management subsystems
are not collaborating with the scheduler as the conductor of the decisions,
they all act independently.

The cpuidle governors are based on idle period statistics, without knowledge
of what woke up the cpu. In these sources of wakes up, the IPI are of course
accounted (as well as the timers irq) which results in doing statistics on
the scheduler behavior too. It makes no sense to let the scheduler take a
decision based on a prediction of its own decisions.

In order to integrate the cpuidle framework into the scheduler, we have to
radically change the approach by clearly identifying what is causing a wake
up and how it behaves.

This series inverts the logic.

Instead of tracking the idle durations and do statistics on them, these
patches track the interrupt individually and try to predict the next interrupt.

By combining the interrupts' next event on a single CPU, we can predict the
next event for the CPU, hence predict how long we will be sleeping when
entering idle.

The IPI and timer interrupts are not taken into account.

The first patch provides a callback to be registered in the irq subsystem
and to be called when an interrupt is handled with a timestamp.

The second patch uses the callback provided by the patch above to compute
the delta and store it in a circular buffer. It is per cpu, the callback
implements minimal operations as it is in an interrupt context.

When the cpu enters idle, it asks for the expected sleep time. Then the
expected minimum sleep length for all interrupts is used and compared to
the timer sleep length, again the minimum is taken and gives the expected
sleep time.

The statistics are very trivial and could be improved later but this first
step shows we have a nice overall improvement in SMP. In UP the menu governor
is a bit better which may indicate the next prediction computation could be
improved, but confirms that removing the IPI from the equation increases the
accuracy.

Changelog:
	V2:
	   - Changed the register_ops approach for the irq subsystem
           - Fixed Nicolas's comments

Daniel Lezcano (2):
  irq: Add a framework to measure interrupt timings
  sched: idle: IRQ based next prediction for idle period

 drivers/cpuidle/Kconfig    |   9 +
 include/linux/interrupt.h  |  26 +++
 include/linux/irqhandler.h |   1 +
 kernel/irq/Kconfig         |   4 +
 kernel/irq/handle.c        |   1 +
 kernel/irq/internals.h     |  43 ++++
 kernel/irq/irqdesc.c       |   6 +
 kernel/irq/manage.c        |  10 +-
 kernel/sched/Makefile      |   1 +
 kernel/sched/idle-sched.c  | 529 +++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 629 insertions(+), 1 deletion(-)
 create mode 100644 kernel/sched/idle-sched.c
diff mbox

Patch

diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 8c7930b..dd17215 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -25,6 +25,11 @@  config CPU_IDLE_GOV_MENU
 	bool "Menu governor (for tickless system)"
 	default y
 
+config CPU_IDLE_GOV_SCHED
+	bool "Sched idle governor"
+	select IRQ_TIMINGS
+	default y
+
 config DT_IDLE_STATES
 	bool
 
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1275fd1..81557ae 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -75,7 +75,6 @@  config HANDLE_DOMAIN_IRQ
 
 config IRQ_TIMINGS
         bool
-	default y
 
 config IRQ_DOMAIN_DEBUG
 	bool "Expose hardware/virtual IRQ mapping via debugfs"
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6768797..f7d5a35 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o
diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c
new file mode 100644
index 0000000..a8e7557
--- /dev/null
+++ b/kernel/sched/idle-sched.c
@@ -0,0 +1,549 @@ 
+/*
+ *  Copyright (C) 2016 Linaro Ltd, Daniel Lezcano <daniel.lezcano@linaro.org>
+ *                                 Nicolas Pitre <nicolas.pitre@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/irqdesc.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/tick.h>
+#include <linux/time64.h>
+
+/*
+ * Define the number of samples over which the average and variance
+ * are computed. A power of 2 is preferred so to let the compiler
+ * optimize divisions by that number with simple arithmetic shifts.
+ */
+#define STATS_NR_VALUES 4
+
+/**
+ * struct stats - internal structure to encapsulate stats information
+ *
+ * @sum: sum of the values currently held in @values
+ * @values: circular window of the last STATS_NR_VALUES samples
+ * @w_ptr: index of the next slot to be written (i.e. the oldest sample
+ *         once the window has wrapped)
+ */
+struct stats {
+	u64          sum;                     /* sum of values */
+	u32          values[STATS_NR_VALUES]; /* array of values */
+	unsigned int w_ptr;                   /* current window pointer */
+};
+
+/**
+ * struct wakeup - internal structure describing a source of wakeup
+ *
+ * @stats: the stats structure on the different event intervals
+ * @timestamp: latest update timestamp; a zero value means no event has
+ *             been recorded yet (the structure is kzalloc'ed and the
+ *             consumers test for ktime 0 before using the stats)
+ */
+struct wakeup {
+	struct stats stats;
+	ktime_t timestamp;
+};
+
+/*
+ * Per cpu and irq statistics. Each cpu receives interrupts and those
+ * ones can be distributed following an irq chip specific
+ * algorithm. Random irq distribution is the worst case to predict
+ * interruption behavior but usually that does not happen or could be
+ * fixed from userspace by setting the irq affinity.
+ */
+static DEFINE_PER_CPU(struct wakeup, *wakeups[NR_IRQS]);
+
+/**
+ * stats_add - add a new value in the statistic structure
+ *
+ * @irq: the irq number the value belongs to (currently unused here,
+ *       kept to match the caller's signature)
+ * @s: the statistic structure
+ * @value: the new value to be added
+ *
+ * Adds the value to the circular window; if the window is full, the
+ * oldest value is replaced. The running sum is kept up to date so the
+ * mean can be derived without walking the array.
+ */
+static void stats_add(int irq, struct stats *s, u32 value)
+{
+	/*
+	 * This is a circular buffer: w_ptr designates the slot to be
+	 * written, which holds the oldest value once the window has
+	 * wrapped. Grab it, overwrite the slot, then advance the
+	 * pointer for the next insertion.
+	 */
+	unsigned int w_ptr = s->w_ptr;
+
+	/*
+	 * Remove the oldest value from the summing. If this is the
+	 * first time we go through this array slot, the previous
+	 * value will be zero and we won't subtract anything from the
+	 * current sum.  Hence this code relies on a zero-ed stat
+	 * structure at init time via memset or kzalloc.
+	 */
+	s->sum -= s->values[w_ptr];
+	s->values[w_ptr] = value;
+	s->w_ptr = (w_ptr + 1) % STATS_NR_VALUES;
+
+	/*
+	 * In order to reduce the overhead and to prevent value
+	 * derivation due to the integer computation, we just sum the
+	 * value and do the division when the average and the variance
+	 * are requested.
+	 */
+	s->sum += value;
+}
+
+/**
+ * stats_reset - reset the stats
+ *
+ * @s: the statistic structure
+ *
+ * Zeroes the whole structure: the running sum, the sample window and
+ * the write pointer, restoring the "freshly allocated" state the rest
+ * of the code relies on.
+ */
+static inline void stats_reset(struct stats *s)
+{
+	memset(s, 0, sizeof(*s));
+}
+
+/**
+ * stats_mean - compute the average
+ *
+ * @s: the statistics structure
+ *
+ * Returns an u32 corresponding to the mean value, or zero if there is
+ * no data. The sum is always divided by the full window size, so the
+ * result is under-estimated until STATS_NR_VALUES samples have been
+ * accumulated.
+ */
+static inline u32 stats_mean(struct stats *s)
+{
+	/*
+	 * gcc is smart enough to convert to a bits shift when the
+	 * divisor is constant and multiple of 2^x.
+	 *
+	 * The number of values could have not reached STATS_NR_VALUES
+	 * yet, but we can consider it acceptable as the situation is
+	 * only at the very first beginning in the system life cycle.
+	 */
+	return s->sum / STATS_NR_VALUES;
+}
+
+/**
+ * stats_variance - compute the variance
+ *
+ * @s: the statistic structure
+ * @mean: the precomputed mean of the window (see stats_mean()), passed
+ *        in so it is not recomputed here
+ *
+ * Returns an u64 corresponding to the variance, or zero if there is
+ * no data
+ */
+static u64 stats_variance(struct stats *s, u32 mean)
+{
+	int i;
+	u64 variance = 0;
+
+	/*
+	 * The variance is the sum of the squared difference to the
+	 * average divided by the number of elements.
+	 */
+	for (i = 0; i < STATS_NR_VALUES; i++) {
+		s32 diff = s->values[i] - mean;
+		variance += (u64)diff * diff;
+	}
+
+	return variance / STATS_NR_VALUES;
+}
+
+/**
+ * sched_idle_irq - irq timestamp callback
+ *
+ * @irq: the irq number
+ * @timestamp: when the interrupt occurred
+ * @dev_id: device id for shared interrupt (not yet used)
+ * @data: private data given when registering this callback
+ *
+ * Interrupt callback called when an interrupt happens. This function
+ * is critical as it is called under an interrupt section: as few
+ * operations as possible are done here - the interval to the previous
+ * event is computed and folded into the per-cpu, per-irq stats.
+ */
+static void sched_idle_irq(unsigned int irq, ktime_t timestamp,
+			   void *dev_id, void *data)
+{
+	u32 diff;
+	unsigned int cpu = raw_smp_processor_id();
+	struct wakeup *w = per_cpu(wakeups[irq], cpu);
+
+	/* This irq is not tracked (timer irq, or allocation failed). */
+	if (!w)
+		return;
+
+	/*
+	 * It is the first time the interrupt occurs of the series, we
+	 * can't do any stats as we don't have an interval, just store
+	 * the timestamp and exit.
+	 */
+	if (ktime_equal(w->timestamp, ktime_set(0, 0))) {
+		w->timestamp = timestamp;
+		return;
+	}
+
+	/*
+	 * Microsec resolution is enough for our purpose.
+	 */
+	diff = ktime_us_delta(timestamp, w->timestamp);
+	w->timestamp = timestamp;
+	stats_add(irq, &w->stats, diff);
+}
+
+/*
+ * Callback to be called when an interrupt happens.
+ */
+static struct irqtimings irq_timings = {
+	.handler = sched_idle_irq,
+};
+
+/**
+ * next_irq_event - predict the earliest irq-driven wakeup for this cpu
+ *
+ * Walks the per-cpu wakeup table and, for each irq whose interval
+ * statistics are stable enough (latest interval within one standard
+ * deviation of the mean), computes the expected time of the next
+ * occurrence. The earliest of those is retained.
+ *
+ * Returns the expected remaining time before the next irq event, or
+ * KTIME_MAX when no reliable prediction could be made.
+ */
+static ktime_t next_irq_event(void)
+{
+	unsigned int irq, cpu = raw_smp_processor_id();
+	ktime_t diff, next, now, min = { .tv64 = KTIME_MAX };
+	struct wakeup *w;
+	u32 interval, mean;
+	u64 variance;
+	s64 deviation;
+
+	/*
+	 * Read the clock once instead of at each loop iteration: the
+	 * walk is short and a single reference time is also more
+	 * consistent for the comparisons below.
+	 */
+	now = ktime_get();
+
+	/*
+	 * Lookup the interrupt array for this cpu and search for the
+	 * earliest expected interruption.
+	 */
+	for (irq = 0; irq < NR_IRQS; irq++) {
+
+		w = per_cpu(wakeups[irq], cpu);
+
+		/*
+		 * The interrupt was not setup as a source of a wakeup
+		 * or the wakeup source is not considered at this
+		 * moment stable enough to do a prediction.
+		 */
+		if (!w)
+			continue;
+
+		/*
+		 * No statistics available yet.
+		 */
+		if (ktime_equal(w->timestamp, ktime_set(0, 0)))
+			continue;
+
+		diff = ktime_sub(now, w->timestamp);
+
+		/*
+		 * There is no point attempting predictions on interrupts more
+		 * than 1 second apart. This has no benefit for sleep state
+		 * selection and increases the risk of overflowing our variance
+		 * computation. Reset all stats in that case.
+		 */
+		if (unlikely(ktime_after(diff, ktime_set(1, 0)))) {
+			stats_reset(&w->stats);
+			continue;
+		}
+
+		/*
+		 * If the mean value is null, just ignore this wakeup
+		 * source.
+		 */
+		mean = stats_mean(&w->stats);
+		if (!mean)
+			continue;
+
+		variance = stats_variance(&w->stats, mean);
+		/*
+		 * We want to check the last interval is:
+		 *
+		 *  mean - stddev < interval < mean + stddev
+		 *
+		 * That simplifies to:
+		 *
+		 * -stddev < interval - mean < stddev
+		 *
+		 * abs(interval - mean) < stddev
+		 *
+		 * The standard deviation is the sqrt of the variance:
+		 *
+		 * abs(interval - mean) < sqrt(variance)
+		 *
+		 * and we want to prevent to do an sqrt, so we square
+		 * the equation:
+		 *
+		 * (interval - mean)^2 < variance
+		 *
+		 * So if the latest value of the stats complies with
+		 * this condition, then the wakeup source is
+		 * considered predictable and can be used to predict
+		 * the next event.
+		 *
+		 * stats_add() leaves w_ptr designating the *next* slot
+		 * to write, so the most recent sample lives one slot
+		 * behind it, not one slot ahead.
+		 */
+		interval = w->stats.values[(w->stats.w_ptr +
+					    STATS_NR_VALUES - 1) %
+					   STATS_NR_VALUES];
+
+		/*
+		 * Compute the squared deviation in 64 bits: squaring
+		 * the difference in 32-bit arithmetic could wrap for
+		 * large deviations before the comparison.
+		 */
+		deviation = (s64)interval - mean;
+		if ((u64)(deviation * deviation) > variance)
+			continue;
+
+		/*
+		 * Let's compute the next event: the wakeup source is
+		 * considered predictable, we add the average interval
+		 * time added to the latest interruption event time.
+		 */
+		next = ktime_add_us(w->timestamp, mean);
+
+		/*
+		 * If the interrupt is supposed to happen before the
+		 * minimum time, then it becomes the minimum.
+		 */
+		if (ktime_before(next, min))
+			min = next;
+	}
+
+	/*
+	 * At this point, we have our prediction but the caller is
+	 * expecting the remaining time before the next event, so
+	 * compute the expected sleep length.
+	 */
+	diff = ktime_sub(min, ktime_get());
+
+	/*
+	 * The result could be negative for different reasons:
+	 *  - the prediction is incorrect
+	 *  - the prediction was too near now and expired while we were
+	 *    in this function
+	 *
+	 * In both cases, we return KTIME_MAX as a failure to do a
+	 * prediction
+	 */
+	if (ktime_compare(diff, ktime_set(0, 0)) <= 0)
+		return (ktime_t){ .tv64 = KTIME_MAX };
+
+	return diff;
+}
+
+/**
+ * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
+ *
+ * The next event on the cpu is based on a statistical approach of the
+ * interrupt events and on the deterministic value of the next timer.
+ * From the timer or the irqs, we return the one expected to occur
+ * first.
+ *
+ * Returns the expected remaining idle time, in microseconds, before
+ * being woken up by an interruption.
+ */
+s64 sched_idle_next_wakeup(void)
+{
+	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
+	s64 next_irq = ktime_to_us(next_irq_event());
+
+	return min(next_irq, next_timer);
+}
+
+/**
+ * sched_idle - go to idle for a specified amount of time
+ *
+ * @duration: the idle duration time
+ * @latency: the latency constraint
+ *
+ * Selects the deepest cpuidle state whose target residency fits in
+ * @duration and whose exit latency fits in @latency, then enters it.
+ * Falls back to the architecture default idle when no cpuidle driver
+ * is available or no state satisfies the constraints.
+ *
+ * Returns 0 on success, < 0 otherwise.
+ */
+int sched_idle(s64 duration, unsigned int latency)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	struct cpuidle_state_usage *su;
+	struct cpuidle_state *s;
+	int i, ret = 0, index = -1;
+
+	rcu_idle_enter();
+
+	/*
+	 * No cpuidle driver is available, let's use the default arch
+	 * idle function.
+	 */
+	if (cpuidle_not_available(drv, dev))
+		goto default_idle;
+
+	/*
+	 * Find the idle state with the lowest power while satisfying
+	 * our constraints. We will save energy if the duration of the
+	 * idle time is bigger than the target residency which is the
+	 * break even point. The choice will be modulated by the
+	 * latency. States are ordered by depth, so the last one to
+	 * pass the checks is the deepest usable state.
+	 */
+	for (i = 0; i < drv->state_count; i++) {
+
+		s = &drv->states[i];
+
+		su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable)
+			continue;
+		if (s->target_residency > duration)
+			continue;
+		if (s->exit_latency > latency)
+			continue;
+
+		index = i;
+	}
+
+	/*
+	 * The idle task must be scheduled, it is pointless to go to
+	 * idle, just re-enable the interrupt and return.
+	 */
+	if (current_clr_polling_and_test()) {
+		local_irq_enable();
+		goto out;
+	}
+
+	/*
+	 * No idle state fulfilled the constraints, jump to the
+	 * default function like there wasn't any cpuidle driver.
+	 */
+	if (index < 0)
+		goto default_idle;
+
+	/*
+	 * Enter the idle state previously selected. This function
+	 * will block until an interrupt occurs and will take care of
+	 * re-enabling the local interrupts. Fall through to 'out' so
+	 * that rcu_idle_exit() balances the rcu_idle_enter() above -
+	 * returning directly would leave RCU in idle mode.
+	 */
+	ret = cpuidle_enter(drv, dev, index);
+	goto out;
+
+default_idle:
+	default_idle_call();
+out:
+	rcu_idle_exit();
+	return ret;
+}
+
+/**
+ * sched_irq_timing_free - free_irq callback
+ *
+ * @irq: the irq number to stop tracking
+ * @dev_id: not used at the moment
+ *
+ * Removes the irq from the wakeup source prediction table: detaches
+ * the timings callback from the irq descriptor, then releases the
+ * per-cpu wakeup structures.
+ *
+ * NOTE(review): clearing desc->timings under desc->lock does not
+ * synchronize with sched_idle_irq() possibly running on another cpu;
+ * confirm the irq core guarantees the handler is quiesced before the
+ * kfree() below.
+ */
+static void sched_irq_timing_free(unsigned int irq, void *dev_id)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct wakeup *w;
+	unsigned int cpu;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	desc->timings = NULL;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	for_each_possible_cpu(cpu) {
+		w = per_cpu(wakeups[irq], cpu);
+		if (!w)
+			continue;
+
+		per_cpu(wakeups[irq], cpu) = NULL;
+		kfree(w);
+	}
+}
+
+/**
+ * sched_irq_timing_setup - setup_irq callback
+ *
+ * @irq: the interrupt number to be tracked
+ * @act: the new irq action to be set to this interrupt
+ *
+ * Update the irq table to be tracked in order to predict the next event.
+ * Timer interrupts and empty actions are silently ignored.
+ *
+ * Returns zero on success. On error it returns -ENOMEM.
+ */
+static int sched_irq_timing_setup(unsigned int irq, struct irqaction *act)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct wakeup *w;
+	unsigned int cpu;
+	unsigned long flags;
+	int ret = -ENOMEM;
+
+	/*
+	 * No interrupt set for this descriptor or related to a timer.
+	 * Timers are deterministic, so no need to try to do any
+	 * prediction on them. No error for both cases, we are just not
+	 * interested.
+	 */
+	if (!act || (act->flags & __IRQF_TIMER))
+		return 0;
+
+	/*
+	 * Shared interrupts result in one setup call per action. If
+	 * this irq is already tracked, keep the existing per-cpu
+	 * buffers instead of overwriting - and thus leaking - them.
+	 */
+	if (desc->timings)
+		return 0;
+
+	/*
+	 * Allocates the wakeup structure and the stats structure. As
+	 * the interrupt can occur on any cpu, allocate the wakeup
+	 * structure per cpu basis.
+	 */
+	for_each_possible_cpu(cpu) {
+
+		w = kzalloc(sizeof(*w), GFP_KERNEL);
+		if (!w)
+			goto undo;
+
+		per_cpu(wakeups[irq], cpu) = w;
+	}
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	desc->timings = &irq_timings;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	ret = 0;
+undo:
+	/*
+	 * Rollback all allocations on failure
+	 */
+	if (ret)
+		sched_irq_timing_free(irq, act->dev_id);
+
+	return ret;
+}
+
+/*
+ * Setup/free irq callbacks
+ */
+static struct irqtimings_ops irqt_ops = {
+	.setup = sched_irq_timing_setup,
+	.free = sched_irq_timing_free,
+};
+
+/**
+ * sched_idle_init - setup the interrupt tracking table
+ *
+ * At init time, some interrupts could have already been setup, and
+ * during the system life time more devices will be. In order to track
+ * all interrupts we are interested in, we first register a couple of
+ * callbacks to keep the interrupt tracking table up-to-date, and then
+ * we populate the table with the interrupts which were already set up.
+ *
+ * NOTE(review): the for_each_irq_desc() walk below is not protected
+ * against concurrent irq setup/free; per the review discussion it
+ * should run under sparse_irq_lock - confirm before merging.
+ */
+int __init sched_idle_init(void)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	int ret;
+
+	/*
+	 * Register the setup/free irq callbacks, so new interrupt or
+	 * freed interrupt will update their tracking.
+	 */
+	ret = register_irq_timings(&irqt_ops);
+	if (ret) {
+		pr_err("Failed to register timings ops\n");
+		return ret;
+	}
+
+	/*
+	 * For all the irq already setup, assign the timing callback.
+	 * All interrupts with their desc NULL will be discarded.
+	 */
+	for_each_irq_desc(irq, desc)
+		sched_irq_timing_setup(irq, desc->action);
+
+	return 0;
+}
+late_initcall(sched_idle_init);