diff mbox

[1/2] ARM: vexpress/TC2: basic PM support

Message ID 1370587152-4630-2-git-send-email-nicolas.pitre@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Nicolas Pitre June 7, 2013, 6:39 a.m. UTC
This is the MCPM backend for the Virtual Express A15x2 A7x3 CoreTile
aka TC2.  This provides cluster management for SMP secondary boot and
CPU hotplug.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/mach-vexpress/Kconfig  |   9 ++
 arch/arm/mach-vexpress/Makefile |   1 +
 arch/arm/mach-vexpress/tc2_pm.c | 243 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 253 insertions(+)
 create mode 100644 arch/arm/mach-vexpress/tc2_pm.c

Comments

Lorenzo Pieralisi June 7, 2013, 2:26 p.m. UTC | #1
On Fri, Jun 07, 2013 at 07:39:11AM +0100, Nicolas Pitre wrote:
> This is the MCPM backend for the Virtual Express A15x2 A7x3 CoreTile
> aka TC2.  This provides cluster management for SMP secondary boot and
> CPU hotplug.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm/mach-vexpress/Kconfig  |   9 ++
>  arch/arm/mach-vexpress/Makefile |   1 +
>  arch/arm/mach-vexpress/tc2_pm.c | 243 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 253 insertions(+)
>  create mode 100644 arch/arm/mach-vexpress/tc2_pm.c
> 
> diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
> index b8bbabec63..e7a825d7df 100644
> --- a/arch/arm/mach-vexpress/Kconfig
> +++ b/arch/arm/mach-vexpress/Kconfig
> @@ -66,4 +66,13 @@ config ARCH_VEXPRESS_DCSCB
>  	  This is needed to provide CPU and cluster power management
>  	  on RTSM implementing big.LITTLE.
>  
> +config ARCH_VEXPRESS_TC2
> +	bool "Versatile Express TC2 power management"
> +	depends on MCPM
> +	select VEXPRESS_SPC
> +	select ARM_CCI
> +	help
> +	  Support for CPU and cluster power management on Versatile Express
> +	  with a TC2 (A15x2 A7x3) big.LITTLE core tile.
> +
>  endmenu
> diff --git a/arch/arm/mach-vexpress/Makefile b/arch/arm/mach-vexpress/Makefile
> index 48ba89a814..b1cf227fa5 100644
> --- a/arch/arm/mach-vexpress/Makefile
> +++ b/arch/arm/mach-vexpress/Makefile
> @@ -7,5 +7,6 @@ ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
>  obj-y					:= v2m.o
>  obj-$(CONFIG_ARCH_VEXPRESS_CA9X4)	+= ct-ca9x4.o
>  obj-$(CONFIG_ARCH_VEXPRESS_DCSCB)	+= dcscb.o	dcscb_setup.o
> +obj-$(CONFIG_ARCH_VEXPRESS_TC2)		+= tc2_pm.o
>  obj-$(CONFIG_SMP)			+= platsmp.o
>  obj-$(CONFIG_HOTPLUG_CPU)		+= hotplug.o
> diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c
> new file mode 100644
> index 0000000000..a3ea524372
> --- /dev/null
> +++ b/arch/arm/mach-vexpress/tc2_pm.c
> @@ -0,0 +1,243 @@
> +/*
> + * arch/arm/mach-vexpress/tc2_pm.c - TC2 power management support
> + *
> + * Created by:	Nicolas Pitre, October 2012
> + * Copyright:	(C) 2012-2013  Linaro Limited
> + *
> + * Some portions of this file were originally written by Achin Gupta
> + * Copyright:   (C) 2012  ARM Limited
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/errno.h>
> +
> +#include <asm/mcpm.h>
> +#include <asm/proc-fns.h>
> +#include <asm/cacheflush.h>
> +#include <asm/cputype.h>
> +#include <asm/cp15.h>
> +
> +#include <mach/motherboard.h>

Is the include above needed ?

> +#include <linux/vexpress.h>
> +#include <linux/arm-cci.h>
> +
> +/*
> + * We can't use regular spinlocks. In the switcher case, it is possible
> + * for an outbound CPU to call power_down() after its inbound counterpart
> + * is already live using the same logical CPU number which trips lockdep
> + * debugging.
> + */
> +static arch_spinlock_t tc2_pm_lock = __ARCH_SPIN_LOCK_UNLOCKED;
> +
> +static int tc2_pm_use_count[3][2];
> +
> +static int tc2_pm_power_up(unsigned int cpu, unsigned int cluster)
> +{
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	if (cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster))
> +		return -EINVAL;

We could stash (vexpress_spc_get_nb_cpus()), it never changes.

> +	/*
> +	 * Since this is called with IRQs enabled, and no arch_spin_lock_irq
> +	 * variant exists, we need to disable IRQs manually here.
> +	 */
> +	local_irq_disable();
> +	arch_spin_lock(&tc2_pm_lock);
> +
> +	if (!tc2_pm_use_count[0][cluster] &&
> +	    !tc2_pm_use_count[1][cluster] &&
> +	    !tc2_pm_use_count[2][cluster])
> +		vexpress_spc_powerdown_enable(cluster, 0);
> +
> +	tc2_pm_use_count[cpu][cluster]++;
> +	if (tc2_pm_use_count[cpu][cluster] == 1) {
> +		vexpress_spc_write_resume_reg(cluster, cpu,
> +					      virt_to_phys(mcpm_entry_point));
> +		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
> +	} else if (tc2_pm_use_count[cpu][cluster] != 2) {
> +		/*
> +		 * The only possible values are:
> +		 * 0 = CPU down
> +		 * 1 = CPU (still) up
> +		 * 2 = CPU requested to be up before it had a chance
> +		 *     to actually make itself down.
> +		 * Any other value is a bug.
> +		 */
> +		BUG();
> +	}
> +
> +	arch_spin_unlock(&tc2_pm_lock);
> +	local_irq_enable();
> +
> +	return 0;
> +}
> +
> +static void tc2_pm_power_down(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +	bool last_man = false, skip_wfi = false;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));

Ditto, see above.

> +	__mcpm_cpu_going_down(cpu, cluster);
> +
> +	arch_spin_lock(&tc2_pm_lock);
> +	BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
> +	tc2_pm_use_count[cpu][cluster]--;
> +	if (tc2_pm_use_count[cpu][cluster] == 0) {
> +		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
> +		if (!tc2_pm_use_count[0][cluster] &&
> +		    !tc2_pm_use_count[1][cluster] &&
> +		    !tc2_pm_use_count[2][cluster]) {
> +			vexpress_spc_powerdown_enable(cluster, 1);
> +			vexpress_spc_set_global_wakeup_intr(1);
> +			last_man = true;
> +		}
> +	} else if (tc2_pm_use_count[cpu][cluster] == 1) {
> +		/*
> +		 * A power_up request went ahead of us.
> +		 * Even if we do not want to shut this CPU down,
> +		 * the caller expects a certain state as if the WFI
> +		 * was aborted.  So let's continue with cache cleaning.
> +		 */
> +		skip_wfi = true;
> +	} else
> +		BUG();
> +
> +	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
> +		arch_spin_unlock(&tc2_pm_lock);
> +
> +		set_cr(get_cr() & ~CR_C);

We must disable L2 prefetching on A15 before cleaning L2.

> +		flush_cache_all();
> +		asm volatile ("clrex");
> +		set_auxcr(get_auxcr() & ~(1 << 6));

I think we should add comments here to avoid copy'n'paste mayhem. The
code above is safe on cpus like A15/A7 (I know this back-end can just
be run on those processors) that hit in the cache with C-bit in SCTLR
cleared, it would explode on processors (eg A9) that do not hit in the
cache with C-bit cleared. I am wondering if it is better to write inline
asm and jump to v7 cache functions that do not need stack push/pop
straight away.

> +		cci_disable_port_by_cpu(mpidr);
> +
> +		/*
> +		 * Ensure that both C & I bits are disabled in the SCTLR
> +		 * before disabling ACE snoops. This ensures that no
> +		 * coherency traffic will originate from this cpu after
> +		 * ACE snoops are turned off.
> +		 */
> +		cpu_proc_fin();

Mmm, C bit is already cleared, why clear the I bit (and the A bit) ?
I do not think cpu_proc_fin() is needed and I am really keen on getting
the power down procedure right to avoid copy'n'paste induced error from
the start.

> +
> +		__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
> +	} else {
> +		/*
> +		 * If last man then undo any setup done previously.
> +		 */
> +		if (last_man) {
> +			vexpress_spc_powerdown_enable(cluster, 0);
> +			vexpress_spc_set_global_wakeup_intr(0);
> +		}
> +
> +		arch_spin_unlock(&tc2_pm_lock);
> +
> +		set_cr(get_cr() & ~CR_C);
> +		flush_cache_louis();
> +		asm volatile ("clrex");
> +		set_auxcr(get_auxcr() & ~(1 << 6));
> +	}
> +
> +	__mcpm_cpu_down(cpu, cluster);
> +
> +	/* Now we are prepared for power-down, do it: */
> +	if (!skip_wfi)
> +		wfi();
> +
> +	/* Not dead at this point?  Let our caller cope. */

This function should disable the GIC CPU IF, but I guess you will add
the code when CPUidle is merged.

> +static void tc2_pm_powered_up(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +	unsigned long flags;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));
> +
> +	local_irq_save(flags);
> +	arch_spin_lock(&tc2_pm_lock);
> +
> +	if (!tc2_pm_use_count[0][cluster] &&
> +	    !tc2_pm_use_count[1][cluster] &&
> +	    !tc2_pm_use_count[2][cluster]) {
> +		vexpress_spc_powerdown_enable(cluster, 0);
> +		vexpress_spc_set_global_wakeup_intr(0);
> +	}
> +
> +	if (!tc2_pm_use_count[cpu][cluster])
> +		tc2_pm_use_count[cpu][cluster] = 1;
> +
> +	vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 0);
> +	vexpress_spc_write_resume_reg(cluster, cpu, 0);
> +
> +	arch_spin_unlock(&tc2_pm_lock);
> +	local_irq_restore(flags);
> +}
> +
> +static const struct mcpm_platform_ops tc2_pm_power_ops = {
> +	.power_up	= tc2_pm_power_up,
> +	.power_down	= tc2_pm_power_down,
> +	.powered_up	= tc2_pm_powered_up,
> +};
> +
> +static void __init tc2_pm_usage_count_init(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cpu >= 3 || cluster >= 2);
> +	tc2_pm_use_count[cpu][cluster] = 1;
> +}
> +
> +/*
> + * Enable cluster-level coherency, in preparation for turning on the MMU.
> + */
> +static void __naked tc2_pm_power_up_setup(unsigned int affinity_level)
> +{
> +	asm volatile (" \n"
> +"	cmp	r0, #1 \n"
> +"	beq	cci_enable_port_for_self \n"
> +"	bx	lr ");
> +}

We could write a function like the above (stackless and inline) for the
sequence:

1- clear C bit
2- flush cache all/louis
3- exit coherency

Again, the current implementation is right on TC2 but we should not let
people think that's correct for all v7 processors

Lorenzo
Sudeep KarkadaNagesha June 10, 2013, 5:01 p.m. UTC | #2
On 07/06/13 07:39, Nicolas Pitre wrote:
> This is the MCPM backend for the Virtual Express A15x2 A7x3 CoreTile
> aka TC2.  This provides cluster management for SMP secondary boot and
> CPU hotplug.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm/mach-vexpress/Kconfig  |   9 ++
>  arch/arm/mach-vexpress/Makefile |   1 +
>  arch/arm/mach-vexpress/tc2_pm.c | 243 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 253 insertions(+)
>  create mode 100644 arch/arm/mach-vexpress/tc2_pm.c
> 
[...]
> +/*
> + * Enable cluster-level coherency, in preparation for turning on the MMU.
> + */
> +static void __naked tc2_pm_power_up_setup(unsigned int affinity_level)
> +{
> +	asm volatile (" \n"
> +"	cmp	r0, #1 \n"
You may need Thumb2 if-then(IT) instruction to support longer branch
range here when compiled in THUMB2 mode.
"       it      eq \n"

> +"	beq	cci_enable_port_for_self \n"
> +"	bx	lr ");
> +}
> +

Regards,
Sudeep
Nicolas Pitre June 10, 2013, 8:21 p.m. UTC | #3
On Mon, 10 Jun 2013, Sudeep KarkadaNagesha wrote:

> On 07/06/13 07:39, Nicolas Pitre wrote:
> > This is the MCPM backend for the Virtual Express A15x2 A7x3 CoreTile
> > aka TC2.  This provides cluster management for SMP secondary boot and
> > CPU hotplug.
> > 
> > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > ---
> >  arch/arm/mach-vexpress/Kconfig  |   9 ++
> >  arch/arm/mach-vexpress/Makefile |   1 +
> >  arch/arm/mach-vexpress/tc2_pm.c | 243 ++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 253 insertions(+)
> >  create mode 100644 arch/arm/mach-vexpress/tc2_pm.c
> > 
> [...]
> > +/*
> > + * Enable cluster-level coherency, in preparation for turning on the MMU.
> > + */
> > +static void __naked tc2_pm_power_up_setup(unsigned int affinity_level)
> > +{
> > +	asm volatile (" \n"
> > +"	cmp	r0, #1 \n"
> > +"	beq	cci_enable_port_for_self \n"
> > +"	bx	lr ");
> You may need Thumb2 if-then(IT) instruction to support longer branch
> range here when compiled in THUMB2 mode.
> "       it      eq \n"

Yeah, I noticed, and therefore reworked it in my follow-up patch as 
follows:

	cmp	r0, #1
	bxne	lr
	b	cci_enable_port_for_self \n"


Nicolas
Olof Johansson June 13, 2013, 8:32 p.m. UTC | #4
Hi,


Some comments below.

On Fri, Jun 07, 2013 at 02:39:11AM -0400, Nicolas Pitre wrote:
> This is the MCPM backend for the Virtual Express A15x2 A7x3 CoreTile
> aka TC2.  This provides cluster management for SMP secondary boot and
> CPU hotplug.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm/mach-vexpress/Kconfig  |   9 ++
>  arch/arm/mach-vexpress/Makefile |   1 +
>  arch/arm/mach-vexpress/tc2_pm.c | 243 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 253 insertions(+)
>  create mode 100644 arch/arm/mach-vexpress/tc2_pm.c
> 
> diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
> index b8bbabec63..e7a825d7df 100644
> --- a/arch/arm/mach-vexpress/Kconfig
> +++ b/arch/arm/mach-vexpress/Kconfig
> @@ -66,4 +66,13 @@ config ARCH_VEXPRESS_DCSCB
>  	  This is needed to provide CPU and cluster power management
>  	  on RTSM implementing big.LITTLE.
>  
> +config ARCH_VEXPRESS_TC2
> +	bool "Versatile Express TC2 power management"

Should there be a _PM in the config option, perhaps? Or maybe take out the
power management one-line info if it's really just base support for the
platform.

> +	depends on MCPM
> +	select VEXPRESS_SPC
> +	select ARM_CCI
> +	help
> +	  Support for CPU and cluster power management on Versatile Express
> +	  with a TC2 (A15x2 A7x3) big.LITTLE core tile.
> +
>  endmenu
> diff --git a/arch/arm/mach-vexpress/Makefile b/arch/arm/mach-vexpress/Makefile
> index 48ba89a814..b1cf227fa5 100644
> --- a/arch/arm/mach-vexpress/Makefile
> +++ b/arch/arm/mach-vexpress/Makefile
> @@ -7,5 +7,6 @@ ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
>  obj-y					:= v2m.o
>  obj-$(CONFIG_ARCH_VEXPRESS_CA9X4)	+= ct-ca9x4.o
>  obj-$(CONFIG_ARCH_VEXPRESS_DCSCB)	+= dcscb.o	dcscb_setup.o
> +obj-$(CONFIG_ARCH_VEXPRESS_TC2)		+= tc2_pm.o
>  obj-$(CONFIG_SMP)			+= platsmp.o
>  obj-$(CONFIG_HOTPLUG_CPU)		+= hotplug.o
> diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c
> new file mode 100644
> index 0000000000..a3ea524372
> --- /dev/null
> +++ b/arch/arm/mach-vexpress/tc2_pm.c
> @@ -0,0 +1,243 @@
> +/*
> + * arch/arm/mach-vexpress/tc2_pm.c - TC2 power management support
> + *
> + * Created by:	Nicolas Pitre, October 2012
> + * Copyright:	(C) 2012-2013  Linaro Limited
> + *
> + * Some portions of this file were originally written by Achin Gupta
> + * Copyright:   (C) 2012  ARM Limited
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/errno.h>
> +
> +#include <asm/mcpm.h>
> +#include <asm/proc-fns.h>
> +#include <asm/cacheflush.h>
> +#include <asm/cputype.h>
> +#include <asm/cp15.h>
> +
> +#include <mach/motherboard.h>
> +
> +#include <linux/vexpress.h>
> +#include <linux/arm-cci.h>
> +
> +/*
> + * We can't use regular spinlocks. In the switcher case, it is possible
> + * for an outbound CPU to call power_down() after its inbound counterpart
> + * is already live using the same logical CPU number which trips lockdep
> + * debugging.
> + */
> +static arch_spinlock_t tc2_pm_lock = __ARCH_SPIN_LOCK_UNLOCKED;
> +
> +static int tc2_pm_use_count[3][2];

A comment describing the purpose of this array would be much beneficial for
someone reading the driver.

> +
> +static int tc2_pm_power_up(unsigned int cpu, unsigned int cluster)
> +{
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	if (cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster))
> +		return -EINVAL;
> +
> +	/*
> +	 * Since this is called with IRQs enabled, and no arch_spin_lock_irq
> +	 * variant exists, we need to disable IRQs manually here.
> +	 */
> +	local_irq_disable();
> +	arch_spin_lock(&tc2_pm_lock);
> +
> +	if (!tc2_pm_use_count[0][cluster] &&
> +	    !tc2_pm_use_count[1][cluster] &&
> +	    !tc2_pm_use_count[2][cluster])

I see this construct used three times open-coded like this in the file. Adding
a smaller helper with a descriptive name would be a good idea.

Same goes for other direct accesses to the array -- maybe a tiny helper to
set/get state? It might end up overabstracted that way though.

> +		vexpress_spc_powerdown_enable(cluster, 0);
> +
> +	tc2_pm_use_count[cpu][cluster]++;
> +	if (tc2_pm_use_count[cpu][cluster] == 1) {
> +		vexpress_spc_write_resume_reg(cluster, cpu,
> +					      virt_to_phys(mcpm_entry_point));
> +		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
> +	} else if (tc2_pm_use_count[cpu][cluster] != 2) {
> +		/*
> +		 * The only possible values are:
> +		 * 0 = CPU down
> +		 * 1 = CPU (still) up
> +		 * 2 = CPU requested to be up before it had a chance
> +		 *     to actually make itself down.
> +		 * Any other value is a bug.
> +		 */
> +		BUG();
> +	}
> +
> +	arch_spin_unlock(&tc2_pm_lock);
> +	local_irq_enable();
> +
> +	return 0;
> +}
> +
> +static void tc2_pm_power_down(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +	bool last_man = false, skip_wfi = false;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));
> +
> +	__mcpm_cpu_going_down(cpu, cluster);
> +
> +	arch_spin_lock(&tc2_pm_lock);
> +	BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
> +	tc2_pm_use_count[cpu][cluster]--;
> +	if (tc2_pm_use_count[cpu][cluster] == 0) {
> +		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
> +		if (!tc2_pm_use_count[0][cluster] &&
> +		    !tc2_pm_use_count[1][cluster] &&
> +		    !tc2_pm_use_count[2][cluster]) {
> +			vexpress_spc_powerdown_enable(cluster, 1);
> +			vexpress_spc_set_global_wakeup_intr(1);
> +			last_man = true;
> +		}
> +	} else if (tc2_pm_use_count[cpu][cluster] == 1) {
> +		/*
> +		 * A power_up request went ahead of us.
> +		 * Even if we do not want to shut this CPU down,
> +		 * the caller expects a certain state as if the WFI
> +		 * was aborted.  So let's continue with cache cleaning.
> +		 */
> +		skip_wfi = true;
> +	} else
> +		BUG();
> +
> +	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
> +		arch_spin_unlock(&tc2_pm_lock);
> +
> +		set_cr(get_cr() & ~CR_C);
> +		flush_cache_all();
> +		asm volatile ("clrex");
> +		set_auxcr(get_auxcr() & ~(1 << 6));
> +
> +		cci_disable_port_by_cpu(mpidr);
> +
> +		/*
> +		 * Ensure that both C & I bits are disabled in the SCTLR
> +		 * before disabling ACE snoops. This ensures that no
> +		 * coherency traffic will originate from this cpu after
> +		 * ACE snoops are turned off.
> +		 */
> +		cpu_proc_fin();
> +
> +		__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
> +	} else {
> +		/*
> +		 * If last man then undo any setup done previously.
> +		 */
> +		if (last_man) {
> +			vexpress_spc_powerdown_enable(cluster, 0);
> +			vexpress_spc_set_global_wakeup_intr(0);
> +		}
> +
> +		arch_spin_unlock(&tc2_pm_lock);
> +
> +		set_cr(get_cr() & ~CR_C);
> +		flush_cache_louis();
> +		asm volatile ("clrex");
> +		set_auxcr(get_auxcr() & ~(1 << 6));
> +	}
> +
> +	__mcpm_cpu_down(cpu, cluster);
> +
> +	/* Now we are prepared for power-down, do it: */
> +	if (!skip_wfi)
> +		wfi();
> +
> +	/* Not dead at this point?  Let our caller cope. */
> +}
> +
> +static void tc2_pm_powered_up(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +	unsigned long flags;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));
> +
> +	local_irq_save(flags);
> +	arch_spin_lock(&tc2_pm_lock);
> +
> +	if (!tc2_pm_use_count[0][cluster] &&
> +	    !tc2_pm_use_count[1][cluster] &&
> +	    !tc2_pm_use_count[2][cluster]) {
> +		vexpress_spc_powerdown_enable(cluster, 0);
> +		vexpress_spc_set_global_wakeup_intr(0);
> +	}
> +
> +	if (!tc2_pm_use_count[cpu][cluster])
> +		tc2_pm_use_count[cpu][cluster] = 1;
> +
> +	vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 0);
> +	vexpress_spc_write_resume_reg(cluster, cpu, 0);
> +
> +	arch_spin_unlock(&tc2_pm_lock);
> +	local_irq_restore(flags);
> +}
> +
> +static const struct mcpm_platform_ops tc2_pm_power_ops = {
> +	.power_up	= tc2_pm_power_up,
> +	.power_down	= tc2_pm_power_down,
> +	.powered_up	= tc2_pm_powered_up,
> +};
> +
> +static void __init tc2_pm_usage_count_init(void)
> +{
> +	unsigned int mpidr, cpu, cluster;
> +
> +	mpidr = read_cpuid_mpidr();
> +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> +
> +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> +	BUG_ON(cpu >= 3 || cluster >= 2);

A warning saying that this hardware configuration is not supported would maybe
be more informal for an user trying to boot some mythical future hardware with
a too-old kernel.

> +	tc2_pm_use_count[cpu][cluster] = 1;
> +}
> +
> +/*
> + * Enable cluster-level coherency, in preparation for turning on the MMU.
> + */
> +static void __naked tc2_pm_power_up_setup(unsigned int affinity_level)
> +{
> +	asm volatile (" \n"
> +"	cmp	r0, #1 \n"
> +"	beq	cci_enable_port_for_self \n"
> +"	bx	lr ");
> +}
> +
> +static int __init tc2_pm_init(void)
> +{
> +	int ret;
> +
> +	if (!vexpress_spc_check_loaded())
> +		return -ENODEV;
> +
> +	tc2_pm_usage_count_init();
> +
> +	ret = mcpm_platform_register(&tc2_pm_power_ops);
> +	if (!ret)
> +		ret = mcpm_sync_init(tc2_pm_power_up_setup);
> +	if (!ret)
> +		pr_info("TC2 power management initialized\n");
> +	return ret;
> +}
> +
> +early_initcall(tc2_pm_init);
> -- 
> 1.8.1.2
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Nicolas Pitre June 13, 2013, 10:31 p.m. UTC | #5
On Thu, 13 Jun 2013, Olof Johansson wrote:

> > +config ARCH_VEXPRESS_TC2
> > +	bool "Versatile Express TC2 power management"
> 
> Should there be a _PM in the config option, perhaps? Or maybe take out the
> power management one-line info if it's really just base support for the
> platform.

CONFIG_ARCH_VEXPRESS_TC2_PM it now is.

> > +static int tc2_pm_use_count[3][2];
> 
> A comment describing the purpose of this array would be much beneficial for
> someone reading the driver.

You are right.  It now looks like this:

#define TC2_CLUSTERS                    2
#define TC2_MAX_CPUS_PER_CLUSTER        3

/* Keep per-cpu usage count to cope with unordered up/down requests */
static int tc2_pm_use_count[TC2_MAX_CPUS_PER_CLUSTER][TC2_CLUSTERS];

#define tc2_cluster_unused(cluster) \
        (!tc2_pm_use_count[0][cluster] && \
         !tc2_pm_use_count[1][cluster] && \
         !tc2_pm_use_count[2][cluster])

> > +	if (!tc2_pm_use_count[0][cluster] &&
> > +	    !tc2_pm_use_count[1][cluster] &&
> > +	    !tc2_pm_use_count[2][cluster])
> 
> I see this construct used three times open-coded like this in the file. Adding
> a smaller helper with a descriptive name would be a good idea.

Good observation.  Being too close to this code makes those issues 
invisible.  Hence the tc2_cluster_unused() above.

> Same goes for other direct accesses to the array -- maybe a tiny helper to
> set/get state? It might end up overabstracted that way though.

I would think so.

> > +static void __init tc2_pm_usage_count_init(void)
> > +{
> > +	unsigned int mpidr, cpu, cluster;
> > +
> > +	mpidr = read_cpuid_mpidr();
> > +	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
> > +	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
> > +
> > +	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
> > +	BUG_ON(cpu >= 3 || cluster >= 2);
> 
> A warning saying that this hardware configuration is not supported would maybe
> be more informal for an user trying to boot some mythical future hardware with
> a too-old kernel.

Hmmm... OK.


Nicolas
Dave Martin June 18, 2013, 5:24 p.m. UTC | #6
On Thu, Jun 13, 2013 at 06:31:32PM -0400, Nicolas Pitre wrote:
> On Thu, 13 Jun 2013, Olof Johansson wrote:

[...]

> /* Keep per-cpu usage count to cope with unordered up/down requests */
> static int tc2_pm_use_count[TC2_MAX_CPUS_PER_CLUSTER][TC2_CLUSTERS];
> 
> #define tc2_cluster_unused(cluster) \
>         (!tc2_pm_use_count[0][cluster] && \
>          !tc2_pm_use_count[1][cluster] && \
>          !tc2_pm_use_count[2][cluster])
> 
> > > +	if (!tc2_pm_use_count[0][cluster] &&
> > > +	    !tc2_pm_use_count[1][cluster] &&
> > > +	    !tc2_pm_use_count[2][cluster])
> > 
> > I see this construct used three times open-coded like this in the file. Adding
> > a smaller helper with a descriptive name would be a good idea.
> 
> Good observation.  Being too close to this code makes those issues 
> invisible.  Hence the tc2_cluster_unused() above.
> 
> > Same goes for other direct accesses to the array -- maybe a tiny helper to
> > set/get state? It might end up overabstracted that way though.
> 
> I would think so.

True, but it's worth noting that most mcpm backends (certainly any
native backends) will end up duplicating this pattern.  We already
have it for the fast model and for TC2.


In this code, the use counts are doing basically the same thing as
the low-level mcpm coordination, but to coordinate things that can
happen while all affected CPUs' MMUs are on -- this we can use
proper spinlocks and be nice and efficient.  But there are still
parallels between the operations on the use counts array and the
low-level __mcpm_ state machine helpers.

There are some interesting wrinkles here, such as the fact that
tc2_pm_powered_up() chooses a first man to perform some post-MMU
actions using regular spinlocking and the use counts... but this
isn't necessarily the same CPU selected as first man by the
low-level pre-MMU synchronisation.  And that's not a bug, because
we want the winner at each stage to dash across the finish line:
waiting for the runners-up just incurs latency.


Much about this feels like it's solving a common but non-trivial
problem, so there could be an argument for having more framework, or
otherwise documenting it a bit more.

We seem to have the following basic operations:

	lock_for_power_down(cpu, cluster) ->

		cluster and CPU need to be powered on, lock is held for both;
		CPU needs to be powered on, lock is held for it; or
		CPU is already on, so do nothing, lock is not held.

	lock_for_power_up(cpu, cluster) ->

		CPU and cluster need to be powered down (last man), lock is held for both; or
		CPU needs to be powered down, lock is held for it; or
		Someone is already turning the CPU on, so do nothing, lock is not held.

	lock_for_setup(cpu, cluster) ->

		cluster needs to be put into the powered-on state (post-MMU first man), lock is held for it.
		cluster is already in the powrered-on state, so do nothing, lock is not held.

	unlock()

		guess


These may not be great names, but hopefully you get the idea.

They correspond to the operations done in .power_up(), .power_down()
and .powered_up() respectively.

Cheers
---Dave
Nicolas Pitre June 18, 2013, 7:50 p.m. UTC | #7
On Tue, 18 Jun 2013, Dave P Martin wrote:

> On Thu, Jun 13, 2013 at 06:31:32PM -0400, Nicolas Pitre wrote:
> > On Thu, 13 Jun 2013, Olof Johansson wrote:
> 
> [...]
> 
> > /* Keep per-cpu usage count to cope with unordered up/down requests */
> > static int tc2_pm_use_count[TC2_MAX_CPUS_PER_CLUSTER][TC2_CLUSTERS];
> > 
> > #define tc2_cluster_unused(cluster) \
> >         (!tc2_pm_use_count[0][cluster] && \
> >          !tc2_pm_use_count[1][cluster] && \
> >          !tc2_pm_use_count[2][cluster])
> > 
> > > > +	if (!tc2_pm_use_count[0][cluster] &&
> > > > +	    !tc2_pm_use_count[1][cluster] &&
> > > > +	    !tc2_pm_use_count[2][cluster])
> > > 
> > > I see this construct used three times open-coded like this in the file. Adding
> > > a smaller helper with a descriptive name would be a good idea.
> > 
> > Good observation.  Being too close to this code makes those issues 
> > invisible.  Hence the tc2_cluster_unused() above.
> > 
> > > Same goes for other direct accesses to the array -- maybe a tiny helper to
> > > set/get state? It might end up overabstracted that way though.
> > 
> > I would think so.
> 
> True, but it's worth noting that most mcpm backends (certainly any
> native backends) will end up duplicating this pattern.  We already
> have it for the fast model and for TC2.

We do not in the PSCI backend though.

> In this code, the use counts are doing basically the same thing as
> the low-level mcpm coordination, but to coordinate things that can
> happen while all affected CPUs' MMUs are on -- this we can use
> proper spinlocks and be nice and efficient.  But there are still
> parallels between the operations on the use counts array and the
> low-level __mcpm_ state machine helpers.

Sort of.  The use counts are needed to determine who is becoming the 
last man.  That's the sole purpose of it.  And then only the last man is 
allowed to invoke MCPM state machine helpers changing the cluster state.

> There are some interesting wrinkles here, such as the fact that
> tc2_pm_powered_up() chooses a first man to perform some post-MMU
> actions using regular spinlocking and the use counts... but this
> isn't necessarily the same CPU selected as first man by the
> low-level pre-MMU synchronisation.  And that's not a bug, because
> we want the winner at each stage to dash across the finish line:
> waiting for the runners-up just incurs latency.

Exact.

> Much about this feels like it's solving a common but non-trivial
> problem, so there could be an argument for having more framework, or
> otherwise documenting it a bit more.

Well... right now I think it is hard to abstract things properly as we 
have very few backends.  I would like to see more of them first.

For something that can be done next, it is probably about moving that 
use count handling up the stack and out of backends.  I don't think we 
should create more helpers for backends to use and play more ping pong 
between the core and backends. The core MCPM layer could well take care 
of that last man determination directly with no abstraction at all and 
simply pass a last_man flag to simpler backends.  Initially the last man 
was determined directly from some hardware state but the handling of 
unordered up/down requests needed more than just a binary state.


Nicolas
Olof Johansson June 18, 2013, 7:56 p.m. UTC | #8
I agree with most of the thread so far, but just wanted to highlight this:

On Tue, Jun 18, 2013 at 12:50 PM, Nicolas Pitre
<nicolas.pitre@linaro.org> wrote:

> Well... right now I think it is hard to abstract things properly as we
> have very few backends.  I would like to see more of them first.

Yes. This.

And this is why it's critical, absolutely critical, that the first one
or two implementations go in are as simple and clean as possible.

Otherwise seeing the commonalities and how to abstract them out will
be a complete headache, since only the person writing the driver will
be able to get a good grasp of it (alternatively, it will take a lot
of effort for someone to figure out, clean up, and see the bigger
picture).


-Olof
Nicolas Pitre June 18, 2013, 10:06 p.m. UTC | #9
On Tue, 18 Jun 2013, Olof Johansson wrote:

> I agree with most of the thread so far, but just wanted to highlight this:
> 
> On Tue, Jun 18, 2013 at 12:50 PM, Nicolas Pitre
> <nicolas.pitre@linaro.org> wrote:
> 
> > Well... right now I think it is hard to abstract things properly as we
> > have very few backends.  I would like to see more of them first.
> 
> Yes. This.
> 
> And this is why it's critical, absolutely critical, that the first one
> or two implementations go in are as simple and clean as possible.

I'll claim that this is rather utopic.  Simply because, once again, we 
don't know how to do so yet.  This is an iterative process and things 
will converge eventually.  But right now, even if the code is not 
necessarily as simple and clean as possible, it is certainly good enough 
for a start.

The perfect is the worst enemy of the good enough.

> Otherwise seeing the commonalities and how to abstract them out will
> be a complete headache, since only the person writing the driver will
> be able to get a good grasp of it (alternatively, it will take a lot
> of effort for someone to figure out, clean up, and see the bigger
> picture).

I don't see things as dramatically.  Probably because I'm one of the 
people who worked directly on this stuff.  And I don't intend to 
disappear after the code hits mainline either.  I therefore don't expect 
things to get out of control.  The job doesn't end there.

We do have ideas about improving things and the discussion is happening.  
But it would be wrong to wait until we can't think of any way to improve 
the code any further before merging it.  Otherwise even btrfs would 
still be out of tree.

The danger right now is that other people will reinvent their own layer 
because they have no model to follow at all.  That is even worse than 
having an imperfect model IMHO.

What we need is more exposure of this code.  It needs to be used and 
fiddled with by more people.  It has lived out of tree for a year 
already, and it would be about time its evolution happens in mainline 
from now on.

Let's also not forget that, right now, TC2 is the only fully functional 
big.LITTLE hardware implementation available to people.  Having it 
supported in mainline, even if not optimally at first, would greatly 
help other issues such as scheduler improvements for hybrid 
multiprocessing which are far more important than this code, as such 
platform could be distributed to scheduler people who might not always 
be bothered by low-level PM details and out-of-tree patches.

Please, understand that I fully appreciate the level of review and 
scrutiny that you provide and you do a very good job at it. However some 
accommodations for a compromise would be appreciated for the greater 
good here.


Nicolas
Dave Martin June 19, 2013, 10:08 a.m. UTC | #10
On Tue, Jun 18, 2013 at 03:50:33PM -0400, Nicolas Pitre wrote:
> On Tue, 18 Jun 2013, Dave P Martin wrote:
> 
> > On Thu, Jun 13, 2013 at 06:31:32PM -0400, Nicolas Pitre wrote:
> > > On Thu, 13 Jun 2013, Olof Johansson wrote:
> > 
> > [...]
> > 
> > > /* Keep per-cpu usage count to cope with unordered up/down requests */
> > > static int tc2_pm_use_count[TC2_MAX_CPUS_PER_CLUSTER][TC2_CLUSTERS];
> > > 
> > > #define tc2_cluster_unused(cluster) \
> > >         (!tc2_pm_use_count[0][cluster] && \
> > >          !tc2_pm_use_count[1][cluster] && \
> > >          !tc2_pm_use_count[2][cluster])
> > > 
> > > > > +	if (!tc2_pm_use_count[0][cluster] &&
> > > > > +	    !tc2_pm_use_count[1][cluster] &&
> > > > > +	    !tc2_pm_use_count[2][cluster])
> > > > 
> > > > I see this construct used three times open-coded like this in the file. Adding
> > > > a smaller helper with a descriptive name would be a good idea.
> > > 
> > > Good observation.  Being too close to this code makes those issues 
> > > invisible.  Hence the tc2_cluster_unused() above.
> > > 
> > > > Same goes for other direct accesses to the array -- maybe a tiny helper to
> > > > set/get state? It might end up overabstracted that way though.
> > > 
> > > I would think so.
> > 
> > True, but it's worth noting that most mcpm backends (certainly any
> > native backends) will end up duplicating this pattern.  We already
> > have it for the fast model and for TC2.
> 
> We do not in the PSCI backend though.
> 
> > In this code, the use counts are doing basically the same thing as
> > the low-level mcpm coordination, but to coordinate things that can
> > happen while all affected CPUs' MMUs are on -- this we can use
> > proper spinlocks and be nice and efficient.  But there are still
> > parallels between the operations on the use counts array and the
> > low-level __mcpm_ state machine helpers.
> 
> Sort of.  The use counts are needed to determine who is becoming the 
> last man.  That's the sole purpose of it.  And then only the last man is 
> allowed to invoke MCPM state machine helpers changing the cluster state.
> 
> > There are some interesting wrinkles here, such as the fact that
> > tc2_pm_powered_up() chooses a first man to perform some post-MMU
> > actions using regular spinlocking and the use counts... but this
> > isn't necessarily the same CPU selected as first man by the
> > low-level pre-MMU synchronisation.  And that's not a bug, because
> > we want the winner at each stage to dash across the finish line:
> > waiting for the runners-up just incurs latency.
> 
> Exact.

(So, when you say the counts are only used to choose the last man, I
don't think it's quite that simple:

in connection with the spinlock, they also do first-man/last-man
coordination, and powerdown preemption at the CPU and cluster levels...
so it really does shadow the low-level state machine.  But that's just
my interpretation, and whether it's a good interpretation will depend on
whether it fits future backends ... which we don't fully know yet.)

> > Much about this feels like it's solving a common but non-trivial
> > problem, so there could be an argument for having more framework, or
> > otherwise documenting it a bit more.
> 
> Well... right now I think it is hard to abstract things properly as we 
> have very few backends.  I would like to see more of them first.

Agreed: that seems a reasonable stance.

This won't turn into a free-for-all unless we allow it to.

> For something that can be done next, it is probably about moving that 
> use count handling up the stack and out of backends.  I don't think we 
> should create more helpers for backends to use and play more ping pong 
> between the core and backends. The core MCPM layer could well take care 
> of that last man determination directly with no abstraction at all and 
> simply pass a last_man flag to simpler backends.  Initially the last man 
> was determined directly from some hardware state but the handling of 
> unordered up/down requests needed more than just a binary state.

Agreed.  Since all the mcpm methods start with code like this to determine
what to do and obtain some sort of lock, there's no reason why that can't
disappear into the framework.

A helper might be needed to do the unlock for the powerdown path, though
this might be combined with the existing helper methods to some extent.


This needs a bit of careful thought though, and I think if we wait for
one or two more backends to emerge, then that thought would be better
informed.

Cheers
---Dave
diff mbox

Patch

diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
index b8bbabec63..e7a825d7df 100644
--- a/arch/arm/mach-vexpress/Kconfig
+++ b/arch/arm/mach-vexpress/Kconfig
@@ -66,4 +66,13 @@  config ARCH_VEXPRESS_DCSCB
 	  This is needed to provide CPU and cluster power management
 	  on RTSM implementing big.LITTLE.
 
+config ARCH_VEXPRESS_TC2
+	bool "Versatile Express TC2 power management"
+	depends on MCPM
+	select VEXPRESS_SPC
+	select ARM_CCI
+	help
+	  Support for CPU and cluster power management on Versatile Express
+	  with a TC2 (A15x2 A7x3) big.LITTLE core tile.
+
 endmenu
diff --git a/arch/arm/mach-vexpress/Makefile b/arch/arm/mach-vexpress/Makefile
index 48ba89a814..b1cf227fa5 100644
--- a/arch/arm/mach-vexpress/Makefile
+++ b/arch/arm/mach-vexpress/Makefile
@@ -7,5 +7,6 @@  ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
 obj-y					:= v2m.o
 obj-$(CONFIG_ARCH_VEXPRESS_CA9X4)	+= ct-ca9x4.o
 obj-$(CONFIG_ARCH_VEXPRESS_DCSCB)	+= dcscb.o	dcscb_setup.o
+obj-$(CONFIG_ARCH_VEXPRESS_TC2)		+= tc2_pm.o
 obj-$(CONFIG_SMP)			+= platsmp.o
 obj-$(CONFIG_HOTPLUG_CPU)		+= hotplug.o
diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c
new file mode 100644
index 0000000000..a3ea524372
--- /dev/null
+++ b/arch/arm/mach-vexpress/tc2_pm.c
@@ -0,0 +1,243 @@ 
+/*
+ * arch/arm/mach-vexpress/tc2_pm.c - TC2 power management support
+ *
+ * Created by:	Nicolas Pitre, October 2012
+ * Copyright:	(C) 2012-2013  Linaro Limited
+ *
+ * Some portions of this file were originally written by Achin Gupta
+ * Copyright:   (C) 2012  ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+
+#include <asm/mcpm.h>
+#include <asm/proc-fns.h>
+#include <asm/cacheflush.h>
+#include <asm/cputype.h>
+#include <asm/cp15.h>
+
+#include <mach/motherboard.h>
+
+#include <linux/vexpress.h>
+#include <linux/arm-cci.h>
+
+/*
+ * We can't use regular spinlocks. In the switcher case, it is possible
+ * for an outbound CPU to call power_down() after its inbound counterpart
+ * is already live using the same logical CPU number which trips lockdep
+ * debugging.
+ */
+static arch_spinlock_t tc2_pm_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+static int tc2_pm_use_count[3][2];
+
+static int tc2_pm_power_up(unsigned int cpu, unsigned int cluster)
+{
+	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
+	if (cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster))
+		return -EINVAL;
+
+	/*
+	 * Since this is called with IRQs enabled, and no arch_spin_lock_irq
+	 * variant exists, we need to disable IRQs manually here.
+	 */
+	local_irq_disable();
+	arch_spin_lock(&tc2_pm_lock);
+
+	if (!tc2_pm_use_count[0][cluster] &&
+	    !tc2_pm_use_count[1][cluster] &&
+	    !tc2_pm_use_count[2][cluster])
+		vexpress_spc_powerdown_enable(cluster, 0);
+
+	tc2_pm_use_count[cpu][cluster]++;
+	if (tc2_pm_use_count[cpu][cluster] == 1) {
+		vexpress_spc_write_resume_reg(cluster, cpu,
+					      virt_to_phys(mcpm_entry_point));
+		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
+	} else if (tc2_pm_use_count[cpu][cluster] != 2) {
+		/*
+		 * The only possible values are:
+		 * 0 = CPU down
+		 * 1 = CPU (still) up
+		 * 2 = CPU requested to be up before it had a chance
+		 *     to actually make itself down.
+		 * Any other value is a bug.
+		 */
+		BUG();
+	}
+
+	arch_spin_unlock(&tc2_pm_lock);
+	local_irq_enable();
+
+	return 0;
+}
+
+static void tc2_pm_power_down(void)
+{
+	unsigned int mpidr, cpu, cluster;
+	bool last_man = false, skip_wfi = false;
+
+	mpidr = read_cpuid_mpidr();
+	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+
+	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
+	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));
+
+	__mcpm_cpu_going_down(cpu, cluster);
+
+	arch_spin_lock(&tc2_pm_lock);
+	BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
+	tc2_pm_use_count[cpu][cluster]--;
+	if (tc2_pm_use_count[cpu][cluster] == 0) {
+		vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 1);
+		if (!tc2_pm_use_count[0][cluster] &&
+		    !tc2_pm_use_count[1][cluster] &&
+		    !tc2_pm_use_count[2][cluster]) {
+			vexpress_spc_powerdown_enable(cluster, 1);
+			vexpress_spc_set_global_wakeup_intr(1);
+			last_man = true;
+		}
+	} else if (tc2_pm_use_count[cpu][cluster] == 1) {
+		/*
+		 * A power_up request went ahead of us.
+		 * Even if we do not want to shut this CPU down,
+		 * the caller expects a certain state as if the WFI
+		 * was aborted.  So let's continue with cache cleaning.
+		 */
+		skip_wfi = true;
+	} else
+		BUG();
+
+	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
+		arch_spin_unlock(&tc2_pm_lock);
+
+		set_cr(get_cr() & ~CR_C);
+		flush_cache_all();
+		asm volatile ("clrex");
+		set_auxcr(get_auxcr() & ~(1 << 6));
+
+		cci_disable_port_by_cpu(mpidr);
+
+		/*
+		 * Ensure that both C & I bits are disabled in the SCTLR
+		 * before disabling ACE snoops. This ensures that no
+		 * coherency traffic will originate from this cpu after
+		 * ACE snoops are turned off.
+		 */
+		cpu_proc_fin();
+
+		__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
+	} else {
+		/*
+		 * If last man then undo any setup done previously.
+		 */
+		if (last_man) {
+			vexpress_spc_powerdown_enable(cluster, 0);
+			vexpress_spc_set_global_wakeup_intr(0);
+		}
+
+		arch_spin_unlock(&tc2_pm_lock);
+
+		set_cr(get_cr() & ~CR_C);
+		flush_cache_louis();
+		asm volatile ("clrex");
+		set_auxcr(get_auxcr() & ~(1 << 6));
+	}
+
+	__mcpm_cpu_down(cpu, cluster);
+
+	/* Now we are prepared for power-down, do it: */
+	if (!skip_wfi)
+		wfi();
+
+	/* Not dead at this point?  Let our caller cope. */
+}
+
+static void tc2_pm_powered_up(void)
+{
+	unsigned int mpidr, cpu, cluster;
+	unsigned long flags;
+
+	mpidr = read_cpuid_mpidr();
+	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+
+	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
+	BUG_ON(cluster >= 2 || cpu >= vexpress_spc_get_nb_cpus(cluster));
+
+	local_irq_save(flags);
+	arch_spin_lock(&tc2_pm_lock);
+
+	if (!tc2_pm_use_count[0][cluster] &&
+	    !tc2_pm_use_count[1][cluster] &&
+	    !tc2_pm_use_count[2][cluster]) {
+		vexpress_spc_powerdown_enable(cluster, 0);
+		vexpress_spc_set_global_wakeup_intr(0);
+	}
+
+	if (!tc2_pm_use_count[cpu][cluster])
+		tc2_pm_use_count[cpu][cluster] = 1;
+
+	vexpress_spc_set_cpu_wakeup_irq(cpu, cluster, 0);
+	vexpress_spc_write_resume_reg(cluster, cpu, 0);
+
+	arch_spin_unlock(&tc2_pm_lock);
+	local_irq_restore(flags);
+}
+
+static const struct mcpm_platform_ops tc2_pm_power_ops = {
+	.power_up	= tc2_pm_power_up,
+	.power_down	= tc2_pm_power_down,
+	.powered_up	= tc2_pm_powered_up,
+};
+
+static void __init tc2_pm_usage_count_init(void)
+{
+	unsigned int mpidr, cpu, cluster;
+
+	mpidr = read_cpuid_mpidr();
+	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+
+	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
+	BUG_ON(cpu >= 3 || cluster >= 2);
+	tc2_pm_use_count[cpu][cluster] = 1;
+}
+
+/*
+ * Enable cluster-level coherency, in preparation for turning on the MMU.
+ */
+static void __naked tc2_pm_power_up_setup(unsigned int affinity_level)
+{
+	asm volatile (" \n"
+"	cmp	r0, #1 \n"
+"	beq	cci_enable_port_for_self \n"
+"	bx	lr ");
+}
+
+static int __init tc2_pm_init(void)
+{
+	int ret;
+
+	if (!vexpress_spc_check_loaded())
+		return -ENODEV;
+
+	tc2_pm_usage_count_init();
+
+	ret = mcpm_platform_register(&tc2_pm_power_ops);
+	if (!ret)
+		ret = mcpm_sync_init(tc2_pm_power_up_setup);
+	if (!ret)
+		pr_info("TC2 power management initialized\n");
+	return ret;
+}
+
+early_initcall(tc2_pm_init);