diff mbox

[RFC,1/4] arm64: kernel: implement DT based idle states infrastructure

Message ID 1395138028-19630-2-git-send-email-lorenzo.pieralisi@arm.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Lorenzo Pieralisi March 18, 2014, 10:20 a.m. UTC
On most common ARM systems, the low-power states a CPU can be put into are
not discoverable in HW and require device tree bindings to describe
the respective power domains, power down protocol and idle states parameters.

In order to enable DT based idle states and configure idle drivers, this
patch implements the bulk infrastructure required to parse the device tree
idle states bindings and functions to initizialize idle driver and protocol
back-ends.

Protocol back-ends (eg PSCI) must register a protocol initializer with
the idle state parser so that upon protocol detection, the parsing code
can call the back-end infrastructure to complete the idle driver
initialization.

Idle state index 0 is always initialized, ie always considered present
on all ARM platforms.

Code that initializes idle states checks the CPU idle driver cpumask so
that multiple CPU idle drivers can be initialized through it in the
kernel. The CPU idle driver cpumask defines which idle states should be
considered valid for the driver, ie idle states that are valid on a set
of cpus the idle driver manages.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
---
 arch/arm64/Kconfig                   |   4 +
 arch/arm64/include/asm/idle_states.h |  20 ++
 arch/arm64/kernel/Makefile           |   1 +
 arch/arm64/kernel/idle_states.c      | 397 +++++++++++++++++++++++++++++++++++
 4 files changed, 422 insertions(+)
 create mode 100644 arch/arm64/include/asm/idle_states.h
 create mode 100644 arch/arm64/kernel/idle_states.c

Comments

Rob Herring March 18, 2014, 1:27 p.m. UTC | #1
On Tue, Mar 18, 2014 at 5:20 AM, Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
> On most common ARM systems, the low-power states a CPU can be put into are
> not discoverable in HW and require device tree bindings to describe
> the respective power domains, power down protocol and idle states parameters.
>
> In order to enable DT based idle states and configure idle drivers, this
> patch implements the bulk infrastructure required to parse the device tree
> idle states bindings and functions to initizialize idle driver and protocol
> back-ends.
>
> Protocol back-ends (eg PSCI) must register a protocol initializer with
> the idle state parser so that upon protocol detection, the parsing code
> can call the back-end infrastructure to complete the idle driver
> initialization.
>
> Idle state index 0 is always initialized, ie always considered present
> on all ARM platforms.
>
> Code that initializes idle states checks the CPU idle driver cpumask so
> that multiple CPU idle drivers can be initialized through it in the
> kernel. The CPU idle driver cpumask defines which idle states should be
> considered valid for the driver, ie idle states that are valid on a set
> of cpus the idle driver manages.
>
> Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> ---
>  arch/arm64/Kconfig                   |   4 +
>  arch/arm64/include/asm/idle_states.h |  20 ++
>  arch/arm64/kernel/Makefile           |   1 +
>  arch/arm64/kernel/idle_states.c      | 397 +++++++++++++++++++++++++++++++++++

This all belongs in drivers/cpuidle either as part of your driver or
as library calls the driver can use as it is very obvious it is
dependent on cpuidle.


>  4 files changed, 422 insertions(+)
>  create mode 100644 arch/arm64/include/asm/idle_states.h
>  create mode 100644 arch/arm64/kernel/idle_states.c
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 27bbcfc..3132572 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -295,6 +295,10 @@ config ARCH_SUSPEND_POSSIBLE
>  config ARM64_CPU_SUSPEND
>         def_bool PM_SLEEP
>
> +config ARM64_IDLE_STATES

Idle states apply to ARM as well, right?

OF_IDLE_STATES?

> +       def_bool CPU_IDLE
> +       select ARM64_CPU_SUSPEND

This should probably get renamed to ARCH_CPU_SUSPEND. There's no need
for different names for arm and arm64.

> +
>  endmenu
>
>  menu "CPU Power Management"
> diff --git a/arch/arm64/include/asm/idle_states.h b/arch/arm64/include/asm/idle_states.h
> new file mode 100644
> index 0000000..0b9f9ba
> --- /dev/null
> +++ b/arch/arm64/include/asm/idle_states.h
> @@ -0,0 +1,20 @@
> +#ifndef __ARM64_IDLE_STATES
> +#define __ARM64_IDLE_STATES
> +
> +struct idle_state {
> +       u32     index;
> +       u32     entry_latency;
> +       u32     exit_latency;
> +       u32     min_residency;
> +       u32     param;
> +       struct device_node *node;
> +       struct idle_state *state;
> +       cpumask_t       cpus;
> +       bool    logic_state_retained;
> +       bool    cache_state_retained;
> +};
> +
> +struct cpuidle_driver;
> +
> +int __init arm_init_idle_driver(struct cpuidle_driver *drv);
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 2d4554b..2afc9a0 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -19,6 +19,7 @@ arm64-obj-$(CONFIG_HW_PERF_EVENTS)    += perf_event.o
>  arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o
>  arm64-obj-$(CONFIG_EARLY_PRINTK)       += early_printk.o
>  arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)  += sleep.o suspend.o
> +arm64-obj-$(CONFIG_ARM64_IDLE_STATES)  += idle_states.o
>  arm64-obj-$(CONFIG_JUMP_LABEL)         += jump_label.o
>
>  obj-y                                  += $(arm64-obj-y) vdso/
> diff --git a/arch/arm64/kernel/idle_states.c b/arch/arm64/kernel/idle_states.c
> new file mode 100644
> index 0000000..0386cff
> --- /dev/null
> +++ b/arch/arm64/kernel/idle_states.c
> @@ -0,0 +1,397 @@
> +/*
> + * ARM device tree idle states parsing code.
> + *
> + * Copyright (C) 2014 ARM Ltd.
> + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpuidle.h>
> +#include <linux/cpumask.h>
> +#include <linux/cpu_pm.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +
> +#include <asm/idle_states.h>
> +#include <asm/psci.h>
> +#include <asm/suspend.h>
> +
> +static struct idle_state idle_states[CPUIDLE_STATE_MAX] __initdata;
> +typedef int (*idle_states_initializer)(struct cpumask *, struct idle_state *,
> +                                      unsigned int);
> +
> +struct protocol_init {
> +       const char *prot;
> +       idle_states_initializer prot_init;
> +};
> +
> +static const struct protocol_init protocols[] __initconst = {
> +       {}
> +};
> +
> +static __init const struct protocol_init *get_protocol(const char *str)
> +{

This needs a better name or to be removed until you actually need it.
What protocols do you expect to have? This kind of looks like GUFI
trying to do DT vs. ACPI backends.

> +       int i;
> +
> +       if (!str)
> +               return NULL;
> +
> +       for (i = 0; protocols[i].prot; i++)
> +               if (!strcmp(protocols[i].prot, str))
> +                       return &protocols[i];
> +
> +       return NULL;
> +}
> +
> +static void __init idle_state_cpu_mask(int cpu, struct idle_state *idle_state,
> +                                      struct device_node *cn)
> +{
> +       int i = 0;
> +       struct device_node *cpu_state;
> +
> +       do {
> +               cpu_state = of_parse_phandle(cn, "cpu-idle-states", i++);
> +               if (cpu_state && idle_state->node == cpu_state)
> +                       cpumask_set_cpu(cpu, &idle_state->cpus);
> +               of_node_put(cpu_state);
> +       } while (cpu_state);
> +}
> +
> +static int __init parse_idle_states_node(struct device_node *parent, int cnt,
> +                                        const cpumask_t *cpus)
> +{
> +       struct device_node *state;

node or np or state_node so it is clear this is a device_node.

> +       struct idle_state *idle_state;
> +       int cpu;
> +
> +       for_each_child_of_node(parent, state) {
> +
> +               if (!of_device_is_compatible(state, "arm,idle-state")) {
> +                       pr_warn(" %s has child nodes that are not idle states\n",
> +                               parent->full_name);
> +                       continue;
> +               }
> +
> +               idle_state = &idle_states[cnt];
> +
> +               pr_debug(" * %s...\n", state->full_name);
> +
> +               idle_state->node = state;
> +               /*
> +                * Check cpus on which the idle state is valid
> +                */
> +               for_each_possible_cpu(cpu) {
> +                       struct device_node *cn;
> +
> +                       cn = of_get_cpu_node(cpu, NULL);
> +                       if (!cn) {
> +                               pr_err("missing device node for CPU %d\n", cpu);
> +                               continue;
> +                       }
> +                       idle_state_cpu_mask(cpu, idle_state, cn);
> +               }
> +
> +               /*
> +                * The driver cpumask is not a subset of cpus on which the
> +                * idle state is valid, hence the idle state is skipped for
> +                * this driver.
> +                */
> +               if (!cpumask_subset(cpus, &idle_state->cpus))
> +                       continue;
> +
> +               if (of_property_read_u32(state, "index", &idle_state->index)) {
> +                       pr_debug(" * %s missing index property\n",
> +                                    state->full_name);
> +                       continue;
> +               }
> +
> +               if (of_property_read_u32(state, "entry-latency-us",
> +                                        &idle_state->entry_latency)) {
> +                       pr_debug(" * %s missing entry latency property\n",
> +                                    state->full_name);
> +                       continue;
> +               }
> +
> +               if (of_property_read_u32(state, "exit-latency-us",
> +                                        &idle_state->exit_latency)) {
> +                       pr_debug(" * %s missing exit latency property\n",
> +                                    state->full_name);
> +                       continue;
> +               }
> +
> +               if (of_property_read_u32(state, "min-residency-us",
> +                                        &idle_state->min_residency)) {
> +                       pr_debug(" * %s missing min-residency property\n",
> +                                    state->full_name);
> +                       continue;
> +               }
> +
> +               if (of_property_read_u32(state, "entry-method-param",
> +                                        &idle_state->param)) {
> +                       pr_debug(" * %s missing entry-method-param property\n",
> +                                    state->full_name);
> +                       continue;
> +               }
> +
> +               if (++cnt == CPUIDLE_STATE_MAX) {
> +                       pr_warn("Number of parsed states equal static CPU idle state limit\n");
> +                       of_node_put(state);
> +                       break;
> +               }
> +       }
> +
> +       return cnt;
> +}
> +
> +static int __init parse_idle_states(struct device_node *root, int cnt,
> +                                   const cpumask_t *cpus)
> +{
> +       int head_idx, curr_idx;
> +       struct device_node *curr = root;
> +
> +       /*
> +        * Breadth-first DT idle states parsing
> +        *
> +        * Sweep idle states level in the device tree and use the
> +        * idle_states array to stash the visited nodes, as a queue.
> +        *
> +        * parse_idle_states_node() updates the idle_states array by
> +        * initializing entries, stashing the device tree node for the
> +        * corresponding state (struct idle_state.node) and incrementing
> +        * the idle states counter that is returned so that curr_idx is
> +        * kept up-to-date while descending into tree levels.
> +        *
> +        * Store the initial counter head_idx and curr_idx and use head_idx
> +        * as a queue of node indices to be visited.
> +        *
> +        * When we reach the max number of CPU idle states or
> +        * head_idx == curr_idx (empty nodes queue) we are done.
> +        */
> +       head_idx = curr_idx = cnt;
> +
> +       do {
> +               curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
> +               if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
> +                       break;
> +               /*
> +                * idle_states array is updated by parse_idle_states_node(),
> +                * we can use the initialized states as a queue of nodes
> +                * that need to be checked for their idle states siblings.
> +                * head_idx works as a pointer into the queue to get the
> +                * next node to be parsed.
> +                */
> +               curr = idle_states[head_idx++].node;
> +       } while (curr);

I still object to index property and this is why. You need to be able
to determine state order by actual h/w properties. That is what you
are doing in your head when you define the indexes.

You really want a linked list here that you can sort as you go and not
care what order you parse DT nodes. Not to mention you don't know how
many states you will have.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi March 18, 2014, 6:08 p.m. UTC | #2
Hi Rob,

thanks for reviewing.

On Tue, Mar 18, 2014 at 01:27:29PM +0000, Rob Herring wrote:
> On Tue, Mar 18, 2014 at 5:20 AM, Lorenzo Pieralisi
> <lorenzo.pieralisi@arm.com> wrote:
> > On most common ARM systems, the low-power states a CPU can be put into are
> > not discoverable in HW and require device tree bindings to describe
> > the respective power domains, power down protocol and idle states parameters.
> >
> > In order to enable DT based idle states and configure idle drivers, this
> > patch implements the bulk infrastructure required to parse the device tree
> > idle states bindings and functions to initizialize idle driver and protocol
> > back-ends.
> >
> > Protocol back-ends (eg PSCI) must register a protocol initializer with
> > the idle state parser so that upon protocol detection, the parsing code
> > can call the back-end infrastructure to complete the idle driver
> > initialization.
> >
> > Idle state index 0 is always initialized, ie always considered present
> > on all ARM platforms.
> >
> > Code that initializes idle states checks the CPU idle driver cpumask so
> > that multiple CPU idle drivers can be initialized through it in the
> > kernel. The CPU idle driver cpumask defines which idle states should be
> > considered valid for the driver, ie idle states that are valid on a set
> > of cpus the idle driver manages.
> >
> > Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> > ---
> >  arch/arm64/Kconfig                   |   4 +
> >  arch/arm64/include/asm/idle_states.h |  20 ++
> >  arch/arm64/kernel/Makefile           |   1 +
> >  arch/arm64/kernel/idle_states.c      | 397 +++++++++++++++++++++++++++++++++++
> 
> This all belongs in drivers/cpuidle either as part of your driver or
> as library calls the driver can use as it is very obvious it is
> dependent on cpuidle.

Yes, and it is something I thought about before posting. There are a
couple of dependencies to be managed though (PSCI and cpu_suspend API),
I can probably make the build dependent on ARM64 for the moment.

I will look into this.

> >  4 files changed, 422 insertions(+)
> >  create mode 100644 arch/arm64/include/asm/idle_states.h
> >  create mode 100644 arch/arm64/kernel/idle_states.c
> >
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 27bbcfc..3132572 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -295,6 +295,10 @@ config ARCH_SUSPEND_POSSIBLE
> >  config ARM64_CPU_SUSPEND
> >         def_bool PM_SLEEP
> >
> > +config ARM64_IDLE_STATES
> 
> Idle states apply to ARM as well, right?

Eh, yes with a couple of twists, see above.

> OF_IDLE_STATES?

Seems sensible.

> > +       def_bool CPU_IDLE
> > +       select ARM64_CPU_SUSPEND
> 
> This should probably get renamed to ARCH_CPU_SUSPEND. There's no need
> for different names for arm and arm64.

Yes I think that makes sense, I will move parsing code to drivers/cpuidle and
check if the CONFIG rename might cause issues.

[...]

> > +static struct idle_state idle_states[CPUIDLE_STATE_MAX] __initdata;
> > +typedef int (*idle_states_initializer)(struct cpumask *, struct idle_state *,
> > +                                      unsigned int);
> > +
> > +struct protocol_init {
> > +       const char *prot;
> > +       idle_states_initializer prot_init;
> > +};
> > +
> > +static const struct protocol_init protocols[] __initconst = {
> > +       {}
> > +};
> > +
> > +static __init const struct protocol_init *get_protocol(const char *str)
> > +{
> 
> This needs a better name or to be removed until you actually need it.
> What protocols do you expect to have? This kind of looks like GUFI
> trying to do DT vs. ACPI backends.

Better name probably, but I do now want to hardcode PSCI either (patch 2 adds
the PSCI backend initializer), I am not sure that would make people extremely
happy and this code can be reused by different back-end implementations with
nary a problem.

Code has to be generic enough to call initialization for different suspend
backends, I do not think I am asking too much here.

> > +       int i;
> > +
> > +       if (!str)
> > +               return NULL;
> > +
> > +       for (i = 0; protocols[i].prot; i++)
> > +               if (!strcmp(protocols[i].prot, str))
> > +                       return &protocols[i];
> > +
> > +       return NULL;
> > +}
> > +
> > +static void __init idle_state_cpu_mask(int cpu, struct idle_state *idle_state,
> > +                                      struct device_node *cn)
> > +{
> > +       int i = 0;
> > +       struct device_node *cpu_state;
> > +
> > +       do {
> > +               cpu_state = of_parse_phandle(cn, "cpu-idle-states", i++);
> > +               if (cpu_state && idle_state->node == cpu_state)
> > +                       cpumask_set_cpu(cpu, &idle_state->cpus);
> > +               of_node_put(cpu_state);
> > +       } while (cpu_state);
> > +}
> > +
> > +static int __init parse_idle_states_node(struct device_node *parent, int cnt,
> > +                                        const cpumask_t *cpus)
> > +{
> > +       struct device_node *state;
> 
> node or np or state_node so it is clear this is a device_node.

Ok.

> > +       struct idle_state *idle_state;
> > +       int cpu;
> > +
> > +       for_each_child_of_node(parent, state) {
> > +
> > +               if (!of_device_is_compatible(state, "arm,idle-state")) {
> > +                       pr_warn(" %s has child nodes that are not idle states\n",
> > +                               parent->full_name);
> > +                       continue;
> > +               }
> > +
> > +               idle_state = &idle_states[cnt];
> > +
> > +               pr_debug(" * %s...\n", state->full_name);
> > +
> > +               idle_state->node = state;
> > +               /*
> > +                * Check cpus on which the idle state is valid
> > +                */
> > +               for_each_possible_cpu(cpu) {
> > +                       struct device_node *cn;
> > +
> > +                       cn = of_get_cpu_node(cpu, NULL);
> > +                       if (!cn) {
> > +                               pr_err("missing device node for CPU %d\n", cpu);
> > +                               continue;
> > +                       }
> > +                       idle_state_cpu_mask(cpu, idle_state, cn);
> > +               }
> > +
> > +               /*
> > +                * The driver cpumask is not a subset of cpus on which the
> > +                * idle state is valid, hence the idle state is skipped for
> > +                * this driver.
> > +                */
> > +               if (!cpumask_subset(cpus, &idle_state->cpus))
> > +                       continue;
> > +
> > +               if (of_property_read_u32(state, "index", &idle_state->index)) {
> > +                       pr_debug(" * %s missing index property\n",
> > +                                    state->full_name);
> > +                       continue;
> > +               }
> > +
> > +               if (of_property_read_u32(state, "entry-latency-us",
> > +                                        &idle_state->entry_latency)) {
> > +                       pr_debug(" * %s missing entry latency property\n",
> > +                                    state->full_name);
> > +                       continue;
> > +               }
> > +
> > +               if (of_property_read_u32(state, "exit-latency-us",
> > +                                        &idle_state->exit_latency)) {
> > +                       pr_debug(" * %s missing exit latency property\n",
> > +                                    state->full_name);
> > +                       continue;
> > +               }
> > +
> > +               if (of_property_read_u32(state, "min-residency-us",
> > +                                        &idle_state->min_residency)) {
> > +                       pr_debug(" * %s missing min-residency property\n",
> > +                                    state->full_name);
> > +                       continue;
> > +               }
> > +
> > +               if (of_property_read_u32(state, "entry-method-param",
> > +                                        &idle_state->param)) {
> > +                       pr_debug(" * %s missing entry-method-param property\n",
> > +                                    state->full_name);
> > +                       continue;
> > +               }
> > +
> > +               if (++cnt == CPUIDLE_STATE_MAX) {
> > +                       pr_warn("Number of parsed states equal static CPU idle state limit\n");
> > +                       of_node_put(state);
> > +                       break;
> > +               }
> > +       }
> > +
> > +       return cnt;
> > +}
> > +
> > +static int __init parse_idle_states(struct device_node *root, int cnt,
> > +                                   const cpumask_t *cpus)
> > +{
> > +       int head_idx, curr_idx;
> > +       struct device_node *curr = root;
> > +
> > +       /*
> > +        * Breadth-first DT idle states parsing
> > +        *
> > +        * Sweep idle states level in the device tree and use the
> > +        * idle_states array to stash the visited nodes, as a queue.
> > +        *
> > +        * parse_idle_states_node() updates the idle_states array by
> > +        * initializing entries, stashing the device tree node for the
> > +        * corresponding state (struct idle_state.node) and incrementing
> > +        * the idle states counter that is returned so that curr_idx is
> > +        * kept up-to-date while descending into tree levels.
> > +        *
> > +        * Store the initial counter head_idx and curr_idx and use head_idx
> > +        * as a queue of node indices to be visited.
> > +        *
> > +        * When we reach the max number of CPU idle states or
> > +        * head_idx == curr_idx (empty nodes queue) we are done.
> > +        */
> > +       head_idx = curr_idx = cnt;
> > +
> > +       do {
> > +               curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
> > +               if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
> > +                       break;
> > +               /*
> > +                * idle_states array is updated by parse_idle_states_node(),
> > +                * we can use the initialized states as a queue of nodes
> > +                * that need to be checked for their idle states siblings.
> > +                * head_idx works as a pointer into the queue to get the
> > +                * next node to be parsed.
> > +                */
> > +               curr = idle_states[head_idx++].node;
> > +       } while (curr);
> 
> I still object to index property and this is why. You need to be able
> to determine state order by actual h/w properties. That is what you
> are doing in your head when you define the indexes.
> 
> You really want a linked list here that you can sort as you go and not
> care what order you parse DT nodes. Not to mention you don't know how
> many states you will have.

This code does not care about the order of nodes, the index is just there
to keep track of nodes that have still to be parsed. Sorting is done later,
using the index property (totally unrelated to the {head/curr}_idx) which I
understand is frowned upon in DT world (but in this case I think it could be
accepted, certainly it would make my life easier).

Having said that, I like the idea of implementing it with a linked list and
sorting states while parsing them. I will remove that index property and
replace it with an actual hw property: power_consumption ? Or should I just
use min_residency (the higher the required residency the deeper the idle
state) ? Defining the power consumption (or better savings) for a state is
an _outright_ can of worms, that's why using an index is easier.

Thoughts ?

Thanks,
Lorenzo

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sebastian Capella March 18, 2014, 9:49 p.m. UTC | #3
On 18 March 2014 11:08, Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> wrote:
>> > +        * When we reach the max number of CPU idle states or
>> > +        * head_idx == curr_idx (empty nodes queue) we are done.
>> > +        */
>> > +       head_idx = curr_idx = cnt;
>> > +
>> > +       do {
>> > +               curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
>> > +               if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
>> > +                       break;
>> > +               /*
>> > +                * idle_states array is updated by parse_idle_states_node(),
>> > +                * we can use the initialized states as a queue of nodes
>> > +                * that need to be checked for their idle states siblings.
>> > +                * head_idx works as a pointer into the queue to get the
>> > +                * next node to be parsed.
>> > +                */
>> > +               curr = idle_states[head_idx++].node;
>> > +       } while (curr);
>>
>> I still object to index property and this is why. You need to be able
>> to determine state order by actual h/w properties. That is what you
>> are doing in your head when you define the indexes.
>>
>> You really want a linked list here that you can sort as you go and not
>> care what order you parse DT nodes. Not to mention you don't know how
>> many states you will have.
>
> This code does not care about the order of nodes, the index is just there
> to keep track of nodes that have still to be parsed. Sorting is done later,
> using the index property (totally unrelated to the {head/curr}_idx) which I
> understand is frowned upon in DT world (but in this case I think it could be
> accepted, certainly it would make my life easier).
>
> Having said that, I like the idea of implementing it with a linked list and
> sorting states while parsing them. I will remove that index property and
> replace it with an actual hw property: power_consumption ? Or should I just
> use min_residency (the higher the required residency the deeper the idle
> state) ? Defining the power consumption (or better savings) for a state is
> an _outright_ can of worms, that's why using an index is easier.
>
> Thoughts ?

How about something like rank, power_rank or power_score since it's
neither an index nor a physical value but a value for sorting states
relative to each other?

Thanks,

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi March 19, 2014, 5:23 p.m. UTC | #4
On Tue, Mar 18, 2014 at 09:49:12PM +0000, Sebastian Capella wrote:
> On 18 March 2014 11:08, Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> wrote:
> >> > +        * When we reach the max number of CPU idle states or
> >> > +        * head_idx == curr_idx (empty nodes queue) we are done.
> >> > +        */
> >> > +       head_idx = curr_idx = cnt;
> >> > +
> >> > +       do {
> >> > +               curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
> >> > +               if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
> >> > +                       break;
> >> > +               /*
> >> > +                * idle_states array is updated by parse_idle_states_node(),
> >> > +                * we can use the initialized states as a queue of nodes
> >> > +                * that need to be checked for their idle states siblings.
> >> > +                * head_idx works as a pointer into the queue to get the
> >> > +                * next node to be parsed.
> >> > +                */
> >> > +               curr = idle_states[head_idx++].node;
> >> > +       } while (curr);
> >>
> >> I still object to index property and this is why. You need to be able
> >> to determine state order by actual h/w properties. That is what you
> >> are doing in your head when you define the indexes.
> >>
> >> You really want a linked list here that you can sort as you go and not
> >> care what order you parse DT nodes. Not to mention you don't know how
> >> many states you will have.
> >
> > This code does not care about the order of nodes, the index is just there
> > to keep track of nodes that have still to be parsed. Sorting is done later,
> > using the index property (totally unrelated to the {head/curr}_idx) which I
> > understand is frowned upon in DT world (but in this case I think it could be
> > accepted, certainly it would make my life easier).
> >
> > Having said that, I like the idea of implementing it with a linked list and
> > sorting states while parsing them. I will remove that index property and
> > replace it with an actual hw property: power_consumption ? Or should I just
> > use min_residency (the higher the required residency the deeper the idle
> > state) ? Defining the power consumption (or better savings) for a state is
> > an _outright_ can of worms, that's why using an index is easier.
> >
> > Thoughts ?
> 
> How about something like rank, power_rank or power_score since it's
> neither an index nor a physical value but a value for sorting states
> relative to each other?

Well, if it is just a renaming issue we can use index just as well.

I hope that by now it is understood that the index property is not
the state index, it is a value representing relative power consumption.

Rob is asking a property describing hw, and honestly unless we define
a power consumption property value (defining what it means will be rather
complicated though) I'd rather remove index altogether and use min_residency
as a comparison value instead.

Lorenzo

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sebastian Capella March 20, 2014, 6:19 p.m. UTC | #5
On 19 March 2014 10:23, Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> wrote:
> Rob is asking a property describing hw, and honestly unless we define
> a power consumption property value (defining what it means will be rather
> complicated though) I'd rather remove index altogether and use min_residency
> as a comparison value instead.

I like having a separate ranking for power to sort the states.  To me
it seems that min residency could be ambiguous for this.  We could
have states with inverted min_residencies: where a lower consumption
state has a lower target residency because the cost is recovered more
quickly.  Such a state might have higher latency for exit, so even
though the lower min_residency/lower power state is clearly
preferable, selection of the lower power state may be prevented by
kernel latency constraints.

Thanks,

Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano March 24, 2014, 3:31 p.m. UTC | #6
Hi Lorenzo,



On 03/18/2014 11:20 AM, Lorenzo Pieralisi wrote:
> On most common ARM systems, the low-power states a CPU can be put into are
> not discoverable in HW and require device tree bindings to describe
> the respective power domains, power down protocol and idle states parameters.
>
> In order to enable DT based idle states and configure idle drivers, this
> patch implements the bulk infrastructure required to parse the device tree
> idle states bindings and functions to initizialize idle driver and protocol
> back-ends.
>
> Protocol back-ends (eg PSCI) must register a protocol initializer with
> the idle state parser so that upon protocol detection, the parsing code
> can call the back-end infrastructure to complete the idle driver
> initialization.
>
> Idle state index 0 is always initialized, ie always considered present
> on all ARM platforms.
>
> Code that initializes idle states checks the CPU idle driver cpumask so
> that multiple CPU idle drivers can be initialized through it in the
> kernel. The CPU idle driver cpumask defines which idle states should be
> considered valid for the driver, ie idle states that are valid on a set
> of cpus the idle driver manages.
>
> Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> ---
>   arch/arm64/Kconfig                   |   4 +
>   arch/arm64/include/asm/idle_states.h |  20 ++
>   arch/arm64/kernel/Makefile           |   1 +
>   arch/arm64/kernel/idle_states.c      | 397 +++++++++++++++++++++++++++++++++++
>   4 files changed, 422 insertions(+)
>   create mode 100644 arch/arm64/include/asm/idle_states.h
>   create mode 100644 arch/arm64/kernel/idle_states.c
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 27bbcfc..3132572 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -295,6 +295,10 @@ config ARCH_SUSPEND_POSSIBLE
>   config ARM64_CPU_SUSPEND
>   	def_bool PM_SLEEP
>
> +config ARM64_IDLE_STATES
> +	def_bool CPU_IDLE
> +	select ARM64_CPU_SUSPEND
> +
>   endmenu
>
>   menu "CPU Power Management"
> diff --git a/arch/arm64/include/asm/idle_states.h b/arch/arm64/include/asm/idle_states.h
> new file mode 100644
> index 0000000..0b9f9ba
> --- /dev/null
> +++ b/arch/arm64/include/asm/idle_states.h
> @@ -0,0 +1,20 @@
> +#ifndef __ARM64_IDLE_STATES
> +#define __ARM64_IDLE_STATES
> +
> +struct idle_state {
> +	u32	index;
> +	u32	entry_latency;
> +	u32	exit_latency;
> +	u32	min_residency;
> +	u32	param;

Could you add a comment for this 'param' field or change the name to eg 
'psci_param' ?

> +	struct device_node *node;
> +	struct idle_state *state;

Why is it needed ?

> +	cpumask_t	cpus;

It sounds strange to have to declare this cpumask here.

> +	bool	logic_state_retained;
> +	bool	cache_state_retained;
> +};

IMHO, there is useless duplication of structure definition / declaration.

I suggest to stick as much as possible to the cpuidle structures, that 
is cpuidle_state and cpuidle_driver.

Instead of using the intermediate, idle_states[CPUIDLE_STATE_MAX], use 
the drv->states directly and drv->cpumask. That will prevent extra copy 
and definition/declaration for nothing. And fill the structure directly 
from parse_idle_states_node / parse_idle_states.

All this comes from the need of the state ordering, right ?

> +struct cpuidle_driver;
> +
> +int __init arm_init_idle_driver(struct cpuidle_driver *drv);
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 2d4554b..2afc9a0 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -19,6 +19,7 @@ arm64-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o
>   arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o
>   arm64-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
>   arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)	+= sleep.o suspend.o
> +arm64-obj-$(CONFIG_ARM64_IDLE_STATES)	+= idle_states.o
>   arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
>
>   obj-y					+= $(arm64-obj-y) vdso/
> diff --git a/arch/arm64/kernel/idle_states.c b/arch/arm64/kernel/idle_states.c
> new file mode 100644
> index 0000000..0386cff
> --- /dev/null
> +++ b/arch/arm64/kernel/idle_states.c
> @@ -0,0 +1,397 @@
> +/*
> + * ARM device tree idle states parsing code.
> + *
> + * Copyright (C) 2014 ARM Ltd.
> + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/cpuidle.h>
> +#include <linux/cpumask.h>
> +#include <linux/cpu_pm.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +
> +#include <asm/idle_states.h>
> +#include <asm/psci.h>
> +#include <asm/suspend.h>
> +
> +static struct idle_state idle_states[CPUIDLE_STATE_MAX] __initdata;
> +typedef int (*idle_states_initializer)(struct cpumask *, struct idle_state *,
> +				       unsigned int);
> +
> +struct protocol_init {
> +	const char *prot;
> +	idle_states_initializer prot_init;
> +};
> +
> +static const struct protocol_init protocols[] __initconst = {
> +	{}
> +};
> +
> +static __init const struct protocol_init *get_protocol(const char *str)
> +{
> +	int i;
> +
> +	if (!str)
> +		return NULL;
> +
> +	for (i = 0; protocols[i].prot; i++)
> +		if (!strcmp(protocols[i].prot, str))
> +			return &protocols[i];
> +
> +	return NULL;
> +}
> +
> +static void __init idle_state_cpu_mask(int cpu, struct idle_state *idle_state,
> +				       struct device_node *cn)
> +{
> +	int i = 0;
> +	struct device_node *cpu_state;
> +
> +	do {
> +		cpu_state = of_parse_phandle(cn, "cpu-idle-states", i++);
> +		if (cpu_state && idle_state->node == cpu_state)
> +			cpumask_set_cpu(cpu, &idle_state->cpus);
> +		of_node_put(cpu_state);
> +	} while (cpu_state);
> +}
> +
> +static int __init parse_idle_states_node(struct device_node *parent, int cnt,
> +					 const cpumask_t *cpus)
> +{
> +	struct device_node *state;
> +	struct idle_state *idle_state;
> +	int cpu;
> +
> +	for_each_child_of_node(parent, state) {
> +
> +		if (!of_device_is_compatible(state, "arm,idle-state")) {
> +			pr_warn(" %s has child nodes that are not idle states\n",
> +				parent->full_name);
> +			continue;
> +		}
> +
> +		idle_state = &idle_states[cnt];
> +
> +		pr_debug(" * %s...\n", state->full_name);
> +
> +		idle_state->node = state;

It is a bit confusing to use 'state' variable name here, is it possible 
to change to something like 'node' or whatever ?

> +		/*
> +		 * Check cpus on which the idle state is valid
> +		 */
> +		for_each_possible_cpu(cpu) {
> +			struct device_node *cn;
> +
> +			cn = of_get_cpu_node(cpu, NULL);
> +			if (!cn) {
> +				pr_err("missing device node for CPU %d\n", cpu);
> +				continue;
> +			}
> +			idle_state_cpu_mask(cpu, idle_state, cn);
> +		}
> +
> +		/*
> +		 * The driver cpumask is not a subset of cpus on which the
> +		 * idle state is valid, hence the idle state is skipped for
> +		 * this driver.
> +		 */
> +		if (!cpumask_subset(cpus, &idle_state->cpus))
> +			continue;
> +
> +		if (of_property_read_u32(state, "index", &idle_state->index)) {
> +			pr_debug(" * %s missing index property\n",
> +				     state->full_name);
> +			continue;
> +		}
> +
> +		if (of_property_read_u32(state, "entry-latency-us",
> +					 &idle_state->entry_latency)) {
> +			pr_debug(" * %s missing entry latency property\n",
> +				     state->full_name);
> +			continue;
> +		}
> +
> +		if (of_property_read_u32(state, "exit-latency-us",
> +					 &idle_state->exit_latency)) {
> +			pr_debug(" * %s missing exit latency property\n",
> +				     state->full_name);
> +			continue;
> +		}
> +
> +		if (of_property_read_u32(state, "min-residency-us",
> +					 &idle_state->min_residency)) {
> +			pr_debug(" * %s missing min-residency property\n",
> +				     state->full_name);
> +			continue;
> +		}
> +
> +		if (of_property_read_u32(state, "entry-method-param",
> +					 &idle_state->param)) {
> +			pr_debug(" * %s missing entry-method-param property\n",
> +				     state->full_name);
> +			continue;
> +		}
> +
> +		if (++cnt == CPUIDLE_STATE_MAX) {
> +			pr_warn("Number of parsed states equal static CPU idle state limit\n");
> +			of_node_put(state);
> +			break;
> +		}
> +	}
> +
> +	return cnt;
> +}
> +
> +static int __init parse_idle_states(struct device_node *root, int cnt,
> +				    const cpumask_t *cpus)
> +{
> +	int head_idx, curr_idx;
> +	struct device_node *curr = root;
> +
> +	/*
> +	 * Breadth-first DT idle states parsing
> +	 *
> +	 * Sweep idle states level in the device tree and use the
> +	 * idle_states array to stash the visited nodes, as a queue.
> +	 *
> +	 * parse_idle_states_node() updates the idle_states array by
> +	 * initializing entries, stashing the device tree node for the
> +	 * corresponding state (struct idle_state.node) and incrementing
> +	 * the idle states counter that is returned so that curr_idx is
> +	 * kept up-to-date while descending into tree levels.
> +	 *
> +	 * Store the initial counter head_idx and curr_idx and use head_idx
> +	 * as a queue of node indices to be visited.
> +	 *
> +	 * When we reach the max number of CPU idle states or
> +	 * head_idx == curr_idx (empty nodes queue) we are done.
> +	 */
> +	head_idx = curr_idx = cnt;
> +
> +	do {
> +		curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
> +		if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
> +			break;
> +		/*
> +		 * idle_states array is updated by parse_idle_states_node(),
> +		 * we can use the initialized states as a queue of nodes
> +		 * that need to be checked for their idle states siblings.
> +		 * head_idx works as a pointer into the queue to get the
> +		 * next node to be parsed.
> +		 */
> +		curr = idle_states[head_idx++].node;
> +	} while (curr);
> +
> +	return curr_idx;
> +}
> +
> +/*
> + * Sort states according to their index value, higher indexes
> + * imply higher power savings, as expected by the CPU idle subsystem.
> + */
> +static void __init sort_states(int count)
> +{
> +	int i, j;
> +	struct idle_state *idle_state;
> +
> +	/*
> +	 * move pointers instead of entries
> +	 */
> +	for (i = 0; i < count; i++)
> +		idle_states[i].state = &idle_states[i];
> +
> +	/*
> +	 * Selection sort is acceptable here, since the
> +	 * number of states is small (<=CPUIDLE_STATE_MAX)
> +	 *
> +	 * Index starts from 1, we know index 0 is standby wfi and
> +	 * it is always the idle state with higher power consumption
> +	 */
> +	for (i = 1; i < count - 1; i++) {
> +		u32 elem = i;
> +		for (j = i+1; j < count; j++) {
> +			if (idle_states[j].state->index <
> +					idle_states[elem].state->index)
> +				elem = j;
> +		}
> +		if (i != elem) {
> +			idle_state = idle_states[i].state;
> +			idle_states[i].state = idle_states[elem].state;
> +			idle_states[elem].state = idle_state;
> +		}
> +	}
> +}
> +
> +/*
> + * arm_dt_init_idle_states - Parse DT idle states and initialize the protocol
> + *			     back-end
> + *
> + * @prot: pointer to the protocol initializer. Initialized only if return code
> + *        is >0
> + * @cpus: CPU idle driver cpumask
> + *
> + * Returns:
> + *	Number of idle states detected upon success
> + *	<0 on failure
> + */
> +static int __init arm_dt_init_idle_states(const struct protocol_init **prot,
> +					  const cpumask_t *cpus)
> +{
> +	struct device_node *cpups;
> +	const char *entry_method;
> +	/* start from 1, stanbywfi is always there */
> +	int cnt = 1, ret = 0;
> +
> +	if (!prot)
> +		return -EINVAL;
> +
> +	cpups = of_find_node_by_path("/cpus/idle-states");
> +
> +	if (!cpups)
> +		return -ENODEV;
> +
> +	cnt = parse_idle_states(cpups, cnt, cpus);
> +
> +	if (cnt == 1) {
> +		ret = -ENODATA;
> +		goto put_node;
> +	}

If 'cnt' is always 1 when calling parse_idle_states, it would be more 
logical to do:

cnt = parse_idle_states(cpups, cpus);
if (cnt < 0) {
	ret = -ENODATA;
	goto put_node;
}

and change parse_idle_states consequently.

> +
> +	/*
> +	 * idle driver expects states to sorted in terms of power
> +	 * consumption
> +	 */
> +	sort_states(cnt);

Isn't possible to sort the states with the index when parsing the DT 
instead ?

> +	if (of_property_read_string(cpups, "entry-method", &entry_method)) {
> +		pr_warn(" * %s missing entry_method property\n",
> +			    cpups->full_name);
> +		ret = -EOPNOTSUPP;
> +		goto put_node;
> +	}
> +
> +	*prot = get_protocol(entry_method);
> +	if (!*prot) {
> +		pr_warn("Missing protocol initializer\n");
> +		ret = -EOPNOTSUPP;
> +		goto put_node;
> +	}
> +
> +	pr_debug("detected %u idle states\n", cnt);
> +
> +put_node:
> +	of_node_put(cpups);
> +	return ret ? : cnt;
> +}
> +
> +/*
> + * arm_enter_idle_state - Programs CPU to enter the specified state
> + *
> + * @dev: cpuidle device
> + * @drv: cpuidle driver
> + * @idx: state index
> + *
> + * Called from the CPUidle framework to program the device to the
> + * specified target state selected by the governor.
> + */
> +static int arm_enter_idle_state(struct cpuidle_device *dev,
> +				struct cpuidle_driver *drv, int idx)
> +{
> +	int ret;
> +
> +	if (!idx) {
> +		/*
> +		 * idle index 0 is just standby wfi, does not require CPU
> +		 * to be suspended
> +		 */
> +		cpu_do_idle();
> +		return idx;
> +	}
> +
> +	cpu_pm_enter();
> +	/*
> +	 * Pass idle state index to cpu_suspend which in turn will call
> +	 * the CPU ops suspend protocol with idle index as a parameter
> +	 */
> +	ret = cpu_suspend(idx);
> +
> +	cpu_pm_exit();
> +
> +	return ret ? : idx;

Are we sure the cpu_suspend will always return a negative value ? If the 
underlying returns 1, like the exynos4/5's cpuidle driver does, that 
will mess up the cpuidle governor with bad inputs.

> +}
> +
> +int __init arm_init_idle_driver(struct cpuidle_driver *drv)
> +{
> +	int i,  idle_states_nb;
> +	struct idle_state *idle_state;
> +	struct cpuidle_state *s;
> +	const struct protocol_init *prot;
> +
> +	drv->states[0].exit_latency = 1;
> +	drv->states[0].target_residency = 1;
> +	drv->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
> +	drv->states[0].enter = arm_enter_idle_state;
> +	strncpy(drv->states[0].name, "ARM WFI", CPUIDLE_NAME_LEN);
> +	strncpy(drv->states[0].desc, "ARM WFI", CPUIDLE_DESC_LEN);

The drv->states[0].name|desc is an array but I hope with some cleanups 
in the acpi cpuidle driver, we can change it to a const char *.

Is it possible to change it as:
drv->states[0].name = "ARM WFI";
drv->states[0].desc = "ARM WFI";

?

> +	idle_states_nb = arm_dt_init_idle_states(&prot, drv->cpumask);
> +
> +	if (idle_states_nb < 0) {
> +		/*
> +		 * No DT based idle states detected
> +		 * Initialize driver count and exit successfully.
> +		 */
> +		drv->state_count = 1;
> +		return 0;
> +	}

If we are not able to initialize the cpuidle driver, wouldn't make sense 
to let the default idle function instead of defining a single state 
driver ? If no cpuidle driver is registered, the idle mainloop will 
switch to the default idle function. That should save the system to 
enter the cpuidle framework and the governor with all its computations 
for nothing, no ?

> +	/*
> +	 * We finally have some idle states to initialize.
> +	 * Driver state 0 corresponds to WFI, start from index 1 and count up
> +	 * to idle_states_nb (parsed idle states + WFI).
> +	 * arm_dt_init_idle_states() ensures that CPUIDLE_STATE_MAX is not
> +	 * exceeded.
> +	 */

I don't know the arm64 architecture but on armv7 there are some 
implementations which do not support the WFI instruction alone (eg. 
omap4), if that could happen for arm64, the driver is assuming WFI is 
always supported.

> +	s = &drv->states[1];
> +	for (i = 1; i < idle_states_nb; i++, s++) {
> +		idle_state = idle_states[i].state;
> +		if (!idle_state)
> +			break;
> +
> +		strncpy(s->name, idle_state->node->name, CPUIDLE_NAME_LEN);
> +		strncpy(s->desc, idle_state->node->name, CPUIDLE_DESC_LEN);

Same comment than above.

> +
> +		s->exit_latency =
> +			idle_state->entry_latency + idle_state->exit_latency;
> +		s->target_residency = idle_state->min_residency;
> +		/*
> +		 * TBD: flag for timers is set implicitly for now but must be
> +		 * linked to power domains.
> +		 */
> +		if (!idle_state->logic_state_retained)
> +			s->flags |= CPUIDLE_FLAG_TIMER_STOP;

The exynos use per cpu timer which are not local, so they are not shut 
down when 'logic_state_retained' is reached (If I refer to the calxeda 
driver, it should be the same). Is the TBD comment describing such case ?

> +		s->flags |= CPUIDLE_FLAG_TIME_VALID;
> +		s->enter = arm_enter_idle_state;
> +	}
> +
> +	/*
> +	 * If we are here, we have a protocol back-end to initialize.
> +	 *
> +	 * If protocol initializer fails reset states count to 1 (wfi)
> +	 */
> +	if (prot->prot_init(drv->cpumask, idle_states, idle_states_nb))
> +		i = 1;

It should fail instead, as mentioned above with the single-state-driver 
comment.

> +
> +	pr_info("idle states initialized, prototocol: %s states count: %u",
> +		prot->prot, i);
> +
> +	drv->state_count = i;
> +	return 0;
> +}


   -- Daniel
Rob Herring March 24, 2014, 5:46 p.m. UTC | #7
On Thu, Mar 20, 2014 at 1:19 PM, Sebastian Capella
<sebastian.capella@linaro.org> wrote:
> On 19 March 2014 10:23, Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> wrote:
>> Rob is asking a property describing hw, and honestly unless we define
>> a power consumption property value (defining what it means will be rather
>> complicated though) I'd rather remove index altogether and use min_residency
>> as a comparison value instead.
>
> I like having a separate ranking for power to sort the states.  To me
> it seems that min residency could be ambiguous for this.  We could
> have states with inverted min_residencies: where a lower consumption
> state has a lower target residency because the cost is recovered more
> quickly.  Such a state might have higher latency for exit, so even
> though the lower min_residency/lower power state is clearly
> preferable, selection of the lower power state may be prevented by
> kernel latency constraints.

Perhaps you can add the min residency and exit latency times.

Another piece to consider is the affinity level of the state should
probably be the basis for 1st order sorting.

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi March 25, 2014, 11:51 a.m. UTC | #8
On Mon, Mar 24, 2014 at 05:46:47PM +0000, Rob Herring wrote:
> On Thu, Mar 20, 2014 at 1:19 PM, Sebastian Capella
> <sebastian.capella@linaro.org> wrote:
> > On 19 March 2014 10:23, Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> wrote:
> >> Rob is asking a property describing hw, and honestly unless we define
> >> a power consumption property value (defining what it means will be rather
> >> complicated though) I'd rather remove index altogether and use min_residency
> >> as a comparison value instead.
> >
> > I like having a separate ranking for power to sort the states.  To me
> > it seems that min residency could be ambiguous for this.  We could
> > have states with inverted min_residencies: where a lower consumption
> > state has a lower target residency because the cost is recovered more
> > quickly.  Such a state might have higher latency for exit, so even
> > though the lower min_residency/lower power state is clearly
> > preferable, selection of the lower power state may be prevented by
> > kernel latency constraints.
> 
> Perhaps you can add the min residency and exit latency times.

I do not think we should mix exit latency with power consumption
ranking, they are used and define different things.

The problem is that min_residency is extremely hard to define (see
discussion below).

http://www.spinics.net/lists/arm-kernel/msg316147.html

What should min_residency represent ? As Anti correctly pointed out, it should
be compared to a shallower idle state, but to define "shallower" we need
a ranking, do not we ?

If we say "min_residency is the time required for the processor to stay
in the idle state in order to maximize power savings" (and I tried to do that),
that's wrong by definition, because there might be a deeper idle state saving
more power when entered for a time within the next event.

state 1 - min residency 10 us
state 2 - min residency 20 us

If we sleep for 1 second, state 2 must be entered, not 1 and the problem
is always the same, state 1 10us compares to state 0 (break even time
against state 0).

So, we are back to square one. How should min_residency be defined ?
If it is the break even time against a shallower idle state, we have
to know what's the shallower idle state, so we need a power ranking for
idle states.

> Another piece to consider is the affinity level of the state should
> probably be the basis for 1st order sorting.

I would like to see the idle states as a flat list, when you say
affinity you mean a cpumask ? Affinity level in the PSCI power state
parameter ? In the MPIDR ?

Thanks,
Lorenzo

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi March 25, 2014, 12:42 p.m. UTC | #9
Hi Daniel,

On Mon, Mar 24, 2014 at 03:31:36PM +0000, Daniel Lezcano wrote:

[...]

> > diff --git a/arch/arm64/include/asm/idle_states.h b/arch/arm64/include/asm/idle_states.h
> > new file mode 100644
> > index 0000000..0b9f9ba
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/idle_states.h
> > @@ -0,0 +1,20 @@
> > +#ifndef __ARM64_IDLE_STATES
> > +#define __ARM64_IDLE_STATES
> > +
> > +struct idle_state {
> > +     u32     index;
> > +     u32     entry_latency;
> > +     u32     exit_latency;
> > +     u32     min_residency;
> > +     u32     param;
> 
> Could you add a comment for this 'param' field or change the name to eg
> 'psci_param' ?

I can, problem is that it is not necessarily tied to PSCI or put it
differently, it is an opaque parameter that is used by the suspend
backend to enter the idle state. For PSCI is the power_state parameter
in the PSCI suspend call, for other backends might mean something
different.

> 
> > +     struct device_node *node;
> > +     struct idle_state *state;
> 
> Why is it needed ?

To swap pointers while sorting the array, probably will disappear, see
below.

> > +     cpumask_t       cpus;
> 
> It sounds strange to have to declare this cpumask here.

I think I can do away with it, given your comments below.

> > +     bool    logic_state_retained;
> > +     bool    cache_state_retained;
> > +};
> 
> IMHO, there is useless duplication of structure definition / declaration.
> 
> I suggest to stick as much as possible to the cpuidle structures, that
> is cpuidle_state and cpuidle_driver.
> 
> Instead of using the intermediate, idle_states[CPUIDLE_STATE_MAX], use
> the drv->states directly and drv->cpumask. That will prevent extra copy
> and definition/declaration for nothing. And fill the structure directly
> from parse_idle_states_node / parse_idle_states.
> 
> All this comes from the need of the state ordering, right ?

Yes and the problem is that idle states in the cpuidle_driver are stored in
an array, filling it while parsing will probably require some copies (moves),
still doable, I will look into that.

> > +struct cpuidle_driver;
> > +
> > +int __init arm_init_idle_driver(struct cpuidle_driver *drv);
> > +#endif
> > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> > index 2d4554b..2afc9a0 100644
> > --- a/arch/arm64/kernel/Makefile
> > +++ b/arch/arm64/kernel/Makefile
> > @@ -19,6 +19,7 @@ arm64-obj-$(CONFIG_HW_PERF_EVENTS)  += perf_event.o
> >   arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o
> >   arm64-obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
> >   arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)       += sleep.o suspend.o
> > +arm64-obj-$(CONFIG_ARM64_IDLE_STATES)        += idle_states.o
> >   arm64-obj-$(CONFIG_JUMP_LABEL)              += jump_label.o
> >
> >   obj-y                                       += $(arm64-obj-y) vdso/
> > diff --git a/arch/arm64/kernel/idle_states.c b/arch/arm64/kernel/idle_states.c
> > new file mode 100644
> > index 0000000..0386cff
> > --- /dev/null
> > +++ b/arch/arm64/kernel/idle_states.c
> > @@ -0,0 +1,397 @@
> > +/*
> > + * ARM device tree idle states parsing code.
> > + *
> > + * Copyright (C) 2014 ARM Ltd.
> > + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include <linux/cpuidle.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/cpu_pm.h>
> > +#include <linux/errno.h>
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/of.h>
> > +
> > +#include <asm/idle_states.h>
> > +#include <asm/psci.h>
> > +#include <asm/suspend.h>
> > +
> > +static struct idle_state idle_states[CPUIDLE_STATE_MAX] __initdata;
> > +typedef int (*idle_states_initializer)(struct cpumask *, struct idle_state *,
> > +                                    unsigned int);
> > +
> > +struct protocol_init {
> > +     const char *prot;
> > +     idle_states_initializer prot_init;
> > +};
> > +
> > +static const struct protocol_init protocols[] __initconst = {
> > +     {}
> > +};
> > +
> > +static __init const struct protocol_init *get_protocol(const char *str)
> > +{
> > +     int i;
> > +
> > +     if (!str)
> > +             return NULL;
> > +
> > +     for (i = 0; protocols[i].prot; i++)
> > +             if (!strcmp(protocols[i].prot, str))
> > +                     return &protocols[i];
> > +
> > +     return NULL;
> > +}
> > +
> > +static void __init idle_state_cpu_mask(int cpu, struct idle_state *idle_state,
> > +                                    struct device_node *cn)
> > +{
> > +     int i = 0;
> > +     struct device_node *cpu_state;
> > +
> > +     do {
> > +             cpu_state = of_parse_phandle(cn, "cpu-idle-states", i++);
> > +             if (cpu_state && idle_state->node == cpu_state)
> > +                     cpumask_set_cpu(cpu, &idle_state->cpus);
> > +             of_node_put(cpu_state);
> > +     } while (cpu_state);
> > +}
> > +
> > +static int __init parse_idle_states_node(struct device_node *parent, int cnt,
> > +                                      const cpumask_t *cpus)
> > +{
> > +     struct device_node *state;
> > +     struct idle_state *idle_state;
> > +     int cpu;
> > +
> > +     for_each_child_of_node(parent, state) {
> > +
> > +             if (!of_device_is_compatible(state, "arm,idle-state")) {
> > +                     pr_warn(" %s has child nodes that are not idle states\n",
> > +                             parent->full_name);
> > +                     continue;
> > +             }
> > +
> > +             idle_state = &idle_states[cnt];
> > +
> > +             pr_debug(" * %s...\n", state->full_name);
> > +
> > +             idle_state->node = state;
> 
> It is a bit confusing to use 'state' variable name here, is it possible
> to change to something like 'node' or whatever ?

Yes, absolutely.

[...]

> > +/*
> > + * arm_dt_init_idle_states - Parse DT idle states and initialize the protocol
> > + *                        back-end
> > + *
> > + * @prot: pointer to the protocol initializer. Initialized only if return code
> > + *        is >0
> > + * @cpus: CPU idle driver cpumask
> > + *
> > + * Returns:
> > + *   Number of idle states detected upon success
> > + *   <0 on failure
> > + */
> > +static int __init arm_dt_init_idle_states(const struct protocol_init **prot,
> > +                                       const cpumask_t *cpus)
> > +{
> > +     struct device_node *cpups;
> > +     const char *entry_method;
> > +     /* start from 1, stanbywfi is always there */
> > +     int cnt = 1, ret = 0;
> > +
> > +     if (!prot)
> > +             return -EINVAL;
> > +
> > +     cpups = of_find_node_by_path("/cpus/idle-states");
> > +
> > +     if (!cpups)
> > +             return -ENODEV;
> > +
> > +     cnt = parse_idle_states(cpups, cnt, cpus);
> > +
> > +     if (cnt == 1) {
> > +             ret = -ENODATA;
> > +             goto put_node;
> > +     }
> 
> If 'cnt' is always 1 when calling parse_idle_states, it would be more
> logical to do:
> 
> cnt = parse_idle_states(cpups, cpus);
> if (cnt < 0) {
>         ret = -ENODATA;
>         goto put_node;
> }
> 
> and change parse_idle_states consequently.

Yes I think it can be reworked to be made clearer, see below for default
wfi state.

> > +
> > +     /*
> > +      * idle driver expects states to sorted in terms of power
> > +      * consumption
> > +      */
> > +     sort_states(cnt);
> 
> Isn't possible to sort the states with the index when parsing the DT
> instead ?

Yes, I will do that (what field to use for comparison is still up in the
air though :))

> > +     if (of_property_read_string(cpups, "entry-method", &entry_method)) {
> > +             pr_warn(" * %s missing entry_method property\n",
> > +                         cpups->full_name);
> > +             ret = -EOPNOTSUPP;
> > +             goto put_node;
> > +     }
> > +
> > +     *prot = get_protocol(entry_method);
> > +     if (!*prot) {
> > +             pr_warn("Missing protocol initializer\n");
> > +             ret = -EOPNOTSUPP;
> > +             goto put_node;
> > +     }
> > +
> > +     pr_debug("detected %u idle states\n", cnt);
> > +
> > +put_node:
> > +     of_node_put(cpups);
> > +     return ret ? : cnt;
> > +}
> > +
> > +/*
> > + * arm_enter_idle_state - Programs CPU to enter the specified state
> > + *
> > + * @dev: cpuidle device
> > + * @drv: cpuidle driver
> > + * @idx: state index
> > + *
> > + * Called from the CPUidle framework to program the device to the
> > + * specified target state selected by the governor.
> > + */
> > +static int arm_enter_idle_state(struct cpuidle_device *dev,
> > +                             struct cpuidle_driver *drv, int idx)
> > +{
> > +     int ret;
> > +
> > +     if (!idx) {
> > +             /*
> > +              * idle index 0 is just standby wfi, does not require CPU
> > +              * to be suspended
> > +              */
> > +             cpu_do_idle();
> > +             return idx;
> > +     }
> > +
> > +     cpu_pm_enter();
> > +     /*
> > +      * Pass idle state index to cpu_suspend which in turn will call
> > +      * the CPU ops suspend protocol with idle index as a parameter
> > +      */
> > +     ret = cpu_suspend(idx);
> > +
> > +     cpu_pm_exit();
> > +
> > +     return ret ? : idx;
> 
> Are we sure the cpu_suspend will always return a negative value ? If the
> underlying returns 1, like the exynos4/5's cpuidle driver does, that
> will mess up the cpuidle governor with bad inputs.

cpu_suspend works a bit differently on ARM64, the suspend finisher is
defined through cpu_ops and we are in control of what it returns (eg
PSCI back-end, patch 2 in this series).

Well-spotted anyway, I must force the return value to be < 0 (if suspend
fails) to avoid issues.

> > +}
> > +
> > +int __init arm_init_idle_driver(struct cpuidle_driver *drv)
> > +{
> > +     int i,  idle_states_nb;
> > +     struct idle_state *idle_state;
> > +     struct cpuidle_state *s;
> > +     const struct protocol_init *prot;
> > +
> > +     drv->states[0].exit_latency = 1;
> > +     drv->states[0].target_residency = 1;
> > +     drv->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
> > +     drv->states[0].enter = arm_enter_idle_state;
> > +     strncpy(drv->states[0].name, "ARM WFI", CPUIDLE_NAME_LEN);
> > +     strncpy(drv->states[0].desc, "ARM WFI", CPUIDLE_DESC_LEN);
> 
> The drv->states[0].name|desc is an array but I hope with some cleanups
> in the acpi cpuidle driver, we can change it to a const char *.
> 
> Is it possible to change it as:
> drv->states[0].name = "ARM WFI";
> drv->states[0].desc = "ARM WFI";
> 
> ?

Let me see first if I can address the questions below, related to WFI.

> > +     idle_states_nb = arm_dt_init_idle_states(&prot, drv->cpumask);
> > +
> > +     if (idle_states_nb < 0) {
> > +             /*
> > +              * No DT based idle states detected
> > +              * Initialize driver count and exit successfully.
> > +              */
> > +             drv->state_count = 1;
> > +             return 0;
> > +     }
> 
> If we are not able to initialize the cpuidle driver, wouldn't make sense
> to let the default idle function instead of defining a single state
> driver ? If no cpuidle driver is registered, the idle mainloop will
> switch to the default idle function. That should save the system to
> enter the cpuidle framework and the governor with all its computations
> for nothing, no ?

Yes, that makes sense.

> > +     /*
> > +      * We finally have some idle states to initialize.
> > +      * Driver state 0 corresponds to WFI, start from index 1 and count up
> > +      * to idle_states_nb (parsed idle states + WFI).
> > +      * arm_dt_init_idle_states() ensures that CPUIDLE_STATE_MAX is not
> > +      * exceeded.
> > +      */
> 
> I don't know the arm64 architecture but on armv7 there are some
> implementations which do not support the WFI instruction alone (eg.
> omap4), if that could happen for arm64, the driver is assuming WFI is
> always supported.

That's a sore point and I thought about that.

We must define what's "a standard wfi". I really hope we won't have
to override it as in arm32 for some platforms.

Having said that, I think it is fair to call cpu_do_idle for index 0, if
there are platform quirks they should be dealt with by overriding
cpu_do_idle.

Another solution would consist in always calling cpu_suspend() for all
states and then the suspend backend can put CPU in whatever version of wfi
the platform provide there.

I need to think more about this, I do not want to save the context for
nothing and at the same time have generic code.

> > +     s = &drv->states[1];
> > +     for (i = 1; i < idle_states_nb; i++, s++) {
> > +             idle_state = idle_states[i].state;
> > +             if (!idle_state)
> > +                     break;
> > +
> > +             strncpy(s->name, idle_state->node->name, CPUIDLE_NAME_LEN);
> > +             strncpy(s->desc, idle_state->node->name, CPUIDLE_DESC_LEN);
> 
> Same comment than above.

Ok.

> > +
> > +             s->exit_latency =
> > +                     idle_state->entry_latency + idle_state->exit_latency;
> > +             s->target_residency = idle_state->min_residency;
> > +             /*
> > +              * TBD: flag for timers is set implicitly for now but must be
> > +              * linked to power domains.
> > +              */
> > +             if (!idle_state->logic_state_retained)
> > +                     s->flags |= CPUIDLE_FLAG_TIMER_STOP;
> 
> The exynos use per cpu timer which are not local, so they are not shut
> down when 'logic_state_retained' is reached (If I refer to the calxeda
> driver, it should be the same). Is the TBD comment describing such case ?

Yes, the code above is temporary. The only way to fix it is to compare
the idle state power domain to the timer (tick_device) power domain and
check if that power domain goes down when the specific idle state is entered.

That's the reason why I went great lengths when defining the bindings
and that's good you flagged this up so that people are aware.

Let me know please if you want me to add that code (power domain parsing -
and additional bindings for timers and idle states) before considering merging
this code since I suspect changes are significant to handle this specific
niggle (an age-old one, it is really time we solved it).

> > +             s->flags |= CPUIDLE_FLAG_TIME_VALID;
> > +             s->enter = arm_enter_idle_state;
> > +     }
> > +
> > +     /*
> > +      * If we are here, we have a protocol back-end to initialize.
> > +      *
> > +      * If protocol initializer fails reset states count to 1 (wfi)
> > +      */
> > +     if (prot->prot_init(drv->cpumask, idle_states, idle_states_nb))
> > +             i = 1;
> 
> It should fail instead, as mentioned above with the single-state-driver
> comment.

Agreed.

Thank you very much for the review.
Lorenzo

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 27bbcfc..3132572 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -295,6 +295,10 @@  config ARCH_SUSPEND_POSSIBLE
 config ARM64_CPU_SUSPEND
 	def_bool PM_SLEEP
 
+config ARM64_IDLE_STATES
+	def_bool CPU_IDLE
+	select ARM64_CPU_SUSPEND
+
 endmenu
 
 menu "CPU Power Management"
diff --git a/arch/arm64/include/asm/idle_states.h b/arch/arm64/include/asm/idle_states.h
new file mode 100644
index 0000000..0b9f9ba
--- /dev/null
+++ b/arch/arm64/include/asm/idle_states.h
@@ -0,0 +1,20 @@ 
+#ifndef __ARM64_IDLE_STATES
+#define __ARM64_IDLE_STATES
+
+struct idle_state {
+	u32	index;
+	u32	entry_latency;
+	u32	exit_latency;
+	u32	min_residency;
+	u32	param;
+	struct device_node *node;
+	struct idle_state *state;
+	cpumask_t	cpus;
+	bool	logic_state_retained;
+	bool	cache_state_retained;
+};
+
+struct cpuidle_driver;
+
+int __init arm_init_idle_driver(struct cpuidle_driver *drv);
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2d4554b..2afc9a0 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,6 +19,7 @@  arm64-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o
 arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o
 arm64-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)	+= sleep.o suspend.o
+arm64-obj-$(CONFIG_ARM64_IDLE_STATES)	+= idle_states.o
 arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
 
 obj-y					+= $(arm64-obj-y) vdso/
diff --git a/arch/arm64/kernel/idle_states.c b/arch/arm64/kernel/idle_states.c
new file mode 100644
index 0000000..0386cff
--- /dev/null
+++ b/arch/arm64/kernel/idle_states.c
@@ -0,0 +1,397 @@ 
+/*
+ * ARM device tree idle states parsing code.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+
+#include <asm/idle_states.h>
+#include <asm/psci.h>
+#include <asm/suspend.h>
+
+static struct idle_state idle_states[CPUIDLE_STATE_MAX] __initdata;
+typedef int (*idle_states_initializer)(struct cpumask *, struct idle_state *,
+				       unsigned int);
+
+struct protocol_init {
+	const char *prot;
+	idle_states_initializer prot_init;
+};
+
+static const struct protocol_init protocols[] __initconst = {
+	{}
+};
+
+static __init const struct protocol_init *get_protocol(const char *str)
+{
+	int i;
+
+	if (!str)
+		return NULL;
+
+	for (i = 0; protocols[i].prot; i++)
+		if (!strcmp(protocols[i].prot, str))
+			return &protocols[i];
+
+	return NULL;
+}
+
+static void __init idle_state_cpu_mask(int cpu, struct idle_state *idle_state,
+				       struct device_node *cn)
+{
+	int i = 0;
+	struct device_node *cpu_state;
+
+	do {
+		cpu_state = of_parse_phandle(cn, "cpu-idle-states", i++);
+		if (cpu_state && idle_state->node == cpu_state)
+			cpumask_set_cpu(cpu, &idle_state->cpus);
+		of_node_put(cpu_state);
+	} while (cpu_state);
+}
+
+static int __init parse_idle_states_node(struct device_node *parent, int cnt,
+					 const cpumask_t *cpus)
+{
+	struct device_node *state;
+	struct idle_state *idle_state;
+	int cpu;
+
+	for_each_child_of_node(parent, state) {
+
+		if (!of_device_is_compatible(state, "arm,idle-state")) {
+			pr_warn(" %s has child nodes that are not idle states\n",
+				parent->full_name);
+			continue;
+		}
+
+		idle_state = &idle_states[cnt];
+
+		pr_debug(" * %s...\n", state->full_name);
+
+		idle_state->node = state;
+		/*
+		 * Check cpus on which the idle state is valid
+		 */
+		for_each_possible_cpu(cpu) {
+			struct device_node *cn;
+
+			cn = of_get_cpu_node(cpu, NULL);
+			if (!cn) {
+				pr_err("missing device node for CPU %d\n", cpu);
+				continue;
+			}
+			idle_state_cpu_mask(cpu, idle_state, cn);
+		}
+
+		/*
+		 * The driver cpumask is not a subset of cpus on which the
+		 * idle state is valid, hence the idle state is skipped for
+		 * this driver.
+		 */
+		if (!cpumask_subset(cpus, &idle_state->cpus))
+			continue;
+
+		if (of_property_read_u32(state, "index", &idle_state->index)) {
+			pr_debug(" * %s missing index property\n",
+				     state->full_name);
+			continue;
+		}
+
+		if (of_property_read_u32(state, "entry-latency-us",
+					 &idle_state->entry_latency)) {
+			pr_debug(" * %s missing entry latency property\n",
+				     state->full_name);
+			continue;
+		}
+
+		if (of_property_read_u32(state, "exit-latency-us",
+					 &idle_state->exit_latency)) {
+			pr_debug(" * %s missing exit latency property\n",
+				     state->full_name);
+			continue;
+		}
+
+		if (of_property_read_u32(state, "min-residency-us",
+					 &idle_state->min_residency)) {
+			pr_debug(" * %s missing min-residency property\n",
+				     state->full_name);
+			continue;
+		}
+
+		if (of_property_read_u32(state, "entry-method-param",
+					 &idle_state->param)) {
+			pr_debug(" * %s missing entry-method-param property\n",
+				     state->full_name);
+			continue;
+		}
+
+		if (++cnt == CPUIDLE_STATE_MAX) {
+			pr_warn("Number of parsed states equal static CPU idle state limit\n");
+			of_node_put(state);
+			break;
+		}
+	}
+
+	return cnt;
+}
+
+static int __init parse_idle_states(struct device_node *root, int cnt,
+				    const cpumask_t *cpus)
+{
+	int head_idx, curr_idx;
+	struct device_node *curr = root;
+
+	/*
+	 * Breadth-first DT idle states parsing
+	 *
+	 * Sweep idle states level in the device tree and use the
+	 * idle_states array to stash the visited nodes, as a queue.
+	 *
+	 * parse_idle_states_node() updates the idle_states array by
+	 * initializing entries, stashing the device tree node for the
+	 * corresponding state (struct idle_state.node) and incrementing
+	 * the idle states counter that is returned so that curr_idx is
+	 * kept up-to-date while descending into tree levels.
+	 *
+	 * Store the initial counter head_idx and curr_idx and use head_idx
+	 * as a queue of node indices to be visited.
+	 *
+	 * When we reach the max number of CPU idle states or
+	 * head_idx == curr_idx (empty nodes queue) we are done.
+	 */
+	head_idx = curr_idx = cnt;
+
+	do {
+		curr_idx = parse_idle_states_node(curr, curr_idx, cpus);
+		if (curr_idx == CPUIDLE_STATE_MAX || head_idx == curr_idx)
+			break;
+		/*
+		 * idle_states array is updated by parse_idle_states_node(),
+		 * we can use the initialized states as a queue of nodes
+		 * that need to be checked for their idle states siblings.
+		 * head_idx works as a pointer into the queue to get the
+		 * next node to be parsed.
+		 */
+		curr = idle_states[head_idx++].node;
+	} while (curr);
+
+	return curr_idx;
+}
+
+/*
+ * Sort states according to their index value, higher indexes
+ * imply higher power savings, as expected by the CPU idle subsystem.
+ */
+static void __init sort_states(int count)
+{
+	int i, j;
+	struct idle_state *idle_state;
+
+	/*
+	 * move pointers instead of entries
+	 */
+	for (i = 0; i < count; i++)
+		idle_states[i].state = &idle_states[i];
+
+	/*
+	 * Selection sort is acceptable here, since the
+	 * number of states is small (<=CPUIDLE_STATE_MAX)
+	 *
+	 * Index starts from 1, we know index 0 is standby wfi and
+	 * it is always the idle state with higher power consumption
+	 */
+	for (i = 1; i < count - 1; i++) {
+		u32 elem = i;
+		for (j = i+1; j < count; j++) {
+			if (idle_states[j].state->index <
+					idle_states[elem].state->index)
+				elem = j;
+		}
+		if (i != elem) {
+			idle_state = idle_states[i].state;
+			idle_states[i].state = idle_states[elem].state;
+			idle_states[elem].state = idle_state;
+		}
+	}
+}
+
+/*
+ * arm_dt_init_idle_states - Parse DT idle states and initialize the protocol
+ *			     back-end
+ *
+ * @prot: pointer to the protocol initializer. Initialized only if return code
+ *        is >0
+ * @cpus: CPU idle driver cpumask
+ *
+ * Returns:
+ *	Number of idle states detected upon success
+ *	<0 on failure
+ */
+static int __init arm_dt_init_idle_states(const struct protocol_init **prot,
+					  const cpumask_t *cpus)
+{
+	struct device_node *cpups;
+	const char *entry_method;
+	/* start from 1, stanbywfi is always there */
+	int cnt = 1, ret = 0;
+
+	if (!prot)
+		return -EINVAL;
+
+	cpups = of_find_node_by_path("/cpus/idle-states");
+
+	if (!cpups)
+		return -ENODEV;
+
+	cnt = parse_idle_states(cpups, cnt, cpus);
+
+	if (cnt == 1) {
+		ret = -ENODATA;
+		goto put_node;
+	}
+
+	/*
+	 * idle driver expects states to sorted in terms of power
+	 * consumption
+	 */
+	sort_states(cnt);
+
+	if (of_property_read_string(cpups, "entry-method", &entry_method)) {
+		pr_warn(" * %s missing entry_method property\n",
+			    cpups->full_name);
+		ret = -EOPNOTSUPP;
+		goto put_node;
+	}
+
+	*prot = get_protocol(entry_method);
+	if (!*prot) {
+		pr_warn("Missing protocol initializer\n");
+		ret = -EOPNOTSUPP;
+		goto put_node;
+	}
+
+	pr_debug("detected %u idle states\n", cnt);
+
+put_node:
+	of_node_put(cpups);
+	return ret ? : cnt;
+}
+
+/*
+ * arm_enter_idle_state - Programs CPU to enter the specified state
+ *
+ * @dev: cpuidle device
+ * @drv: cpuidle driver
+ * @idx: state index
+ *
+ * Called from the CPUidle framework to program the device to the
+ * specified target state selected by the governor.
+ */
+static int arm_enter_idle_state(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv, int idx)
+{
+	int ret;
+
+	if (!idx) {
+		/*
+		 * idle index 0 is just standby wfi, does not require CPU
+		 * to be suspended
+		 */
+		cpu_do_idle();
+		return idx;
+	}
+
+	cpu_pm_enter();
+	/*
+	 * Pass idle state index to cpu_suspend which in turn will call
+	 * the CPU ops suspend protocol with idle index as a parameter
+	 */
+	ret = cpu_suspend(idx);
+
+	cpu_pm_exit();
+
+	return ret ? : idx;
+}
+
+int __init arm_init_idle_driver(struct cpuidle_driver *drv)
+{
+	int i,  idle_states_nb;
+	struct idle_state *idle_state;
+	struct cpuidle_state *s;
+	const struct protocol_init *prot;
+
+	drv->states[0].exit_latency = 1;
+	drv->states[0].target_residency = 1;
+	drv->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
+	drv->states[0].enter = arm_enter_idle_state;
+	strncpy(drv->states[0].name, "ARM WFI", CPUIDLE_NAME_LEN);
+	strncpy(drv->states[0].desc, "ARM WFI", CPUIDLE_DESC_LEN);
+
+	idle_states_nb = arm_dt_init_idle_states(&prot, drv->cpumask);
+
+	if (idle_states_nb < 0) {
+		/*
+		 * No DT based idle states detected
+		 * Initialize driver count and exit successfully.
+		 */
+		drv->state_count = 1;
+		return 0;
+	}
+
+	/*
+	 * We finally have some idle states to initialize.
+	 * Driver state 0 corresponds to WFI, start from index 1 and count up
+	 * to idle_states_nb (parsed idle states + WFI).
+	 * arm_dt_init_idle_states() ensures that CPUIDLE_STATE_MAX is not
+	 * exceeded.
+	 */
+	s = &drv->states[1];
+	for (i = 1; i < idle_states_nb; i++, s++) {
+		idle_state = idle_states[i].state;
+		if (!idle_state)
+			break;
+
+		strncpy(s->name, idle_state->node->name, CPUIDLE_NAME_LEN);
+		strncpy(s->desc, idle_state->node->name, CPUIDLE_DESC_LEN);
+
+		s->exit_latency =
+			idle_state->entry_latency + idle_state->exit_latency;
+		s->target_residency = idle_state->min_residency;
+		/*
+		 * TBD: flag for timers is set implicitly for now but must be
+		 * linked to power domains.
+		 */
+		if (!idle_state->logic_state_retained)
+			s->flags |= CPUIDLE_FLAG_TIMER_STOP;
+		s->flags |= CPUIDLE_FLAG_TIME_VALID;
+		s->enter = arm_enter_idle_state;
+	}
+
+	/*
+	 * If we are here, we have a protocol back-end to initialize.
+	 *
+	 * If protocol initializer fails reset states count to 1 (wfi)
+	 */
+	if (prot->prot_init(drv->cpumask, idle_states, idle_states_nb))
+		i = 1;
+
+	pr_info("idle states initialized, prototocol: %s states count: %u",
+		prot->prot, i);
+
+	drv->state_count = i;
+	return 0;
+}
+