diff mbox

[3/7] ARM: tegra30: cpuidle: add LP2 driver for secondary CPUs

Message ID 1349691981-31038-4-git-send-email-josephl@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Joseph Lo Oct. 8, 2012, 10:26 a.m. UTC
This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
The secondary CPUs can go into LP2 state independently. When CPU goes
into LP2 state, it saves its state and puts itself into a flow-controlled
WFI state. After that, it will be power-gated.

Based on the work by:
Scott Williams <scwilliams@nvidia.com>

Signed-off-by: Joseph Lo <josephl@nvidia.com>
---
 arch/arm/mach-tegra/Makefile          |    1 +
 arch/arm/mach-tegra/cpuidle-tegra30.c |   79 +++++++++++++++++++++++++++++-
 arch/arm/mach-tegra/pm.c              |   88 +++++++++++++++++++++++++++++++++
 arch/arm/mach-tegra/pm.h              |   30 +++++++++++
 arch/arm/mach-tegra/reset.h           |    9 +++
 arch/arm/mach-tegra/sleep-tegra30.S   |   26 ++++++++++
 arch/arm/mach-tegra/sleep.S           |   66 ++++++++++++++++++++++++
 arch/arm/mach-tegra/sleep.h           |    2 +
 8 files changed, 300 insertions(+), 1 deletions(-)
 create mode 100644 arch/arm/mach-tegra/pm.c
 create mode 100644 arch/arm/mach-tegra/pm.h

Comments

Lorenzo Pieralisi Oct. 8, 2012, 4:35 p.m. UTC | #1
On Mon, Oct 08, 2012 at 11:26:17AM +0100, Joseph Lo wrote:
> This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
> The secondary CPUs can go into LP2 state independently. When CPU goes
> into LP2 state, it saves it's state and puts itself to flow controlled
> WFI state. After that, it will been power gated.
> 
> Based on the work by:
> Scott Williams <scwilliams@nvidia.com>
> 
> Signed-off-by: Joseph Lo <josephl@nvidia.com>
> ---
>  arch/arm/mach-tegra/Makefile          |    1 +
>  arch/arm/mach-tegra/cpuidle-tegra30.c |   79 +++++++++++++++++++++++++++++-
>  arch/arm/mach-tegra/pm.c              |   88 +++++++++++++++++++++++++++++++++
>  arch/arm/mach-tegra/pm.h              |   30 +++++++++++
>  arch/arm/mach-tegra/reset.h           |    9 +++
>  arch/arm/mach-tegra/sleep-tegra30.S   |   26 ++++++++++
>  arch/arm/mach-tegra/sleep.S           |   66 ++++++++++++++++++++++++
>  arch/arm/mach-tegra/sleep.h           |    2 +
>  8 files changed, 300 insertions(+), 1 deletions(-)
>  create mode 100644 arch/arm/mach-tegra/pm.c
>  create mode 100644 arch/arm/mach-tegra/pm.h
> 
> diff --git a/arch/arm/mach-tegra/Makefile b/arch/arm/mach-tegra/Makefile
> index 9b80c1e..6f224f7 100644
> --- a/arch/arm/mach-tegra/Makefile
> +++ b/arch/arm/mach-tegra/Makefile
> @@ -8,6 +8,7 @@ obj-y                                   += pmc.o
>  obj-y                                  += flowctrl.o
>  obj-y                                  += powergate.o
>  obj-y                                  += apbio.o
> +obj-y                                  += pm.o
>  obj-$(CONFIG_CPU_IDLE)                 += cpuidle.o
>  obj-$(CONFIG_CPU_IDLE)                 += sleep.o
>  obj-$(CONFIG_ARCH_TEGRA_2x_SOC)         += tegra20_clocks.o
> diff --git a/arch/arm/mach-tegra/cpuidle-tegra30.c b/arch/arm/mach-tegra/cpuidle-tegra30.c
> index 0f85df8..842fef4 100644
> --- a/arch/arm/mach-tegra/cpuidle-tegra30.c
> +++ b/arch/arm/mach-tegra/cpuidle-tegra30.c
> @@ -19,21 +19,94 @@
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/cpuidle.h>
> +#include <linux/cpu_pm.h>
> +#include <linux/clockchips.h>
> 
>  #include <asm/cpuidle.h>
> +#include <asm/proc-fns.h>
> +#include <asm/suspend.h>
> +
> +#include "pm.h"
> +#include "sleep.h"
> +
> +#ifdef CONFIG_PM_SLEEP
> +static int tegra30_idle_lp2(struct cpuidle_device *dev,
> +                           struct cpuidle_driver *drv,
> +                           int index);
> +#endif
> 
>  static struct cpuidle_driver tegra_idle_driver = {
>         .name = "tegra_idle",
>         .owner = THIS_MODULE,
>         .en_core_tk_irqen = 1,
> -       .state_count = 1,
> +       .state_count = 2,
>         .states = {
>                 [0] = ARM_CPUIDLE_WFI_STATE_PWR(600),
> +#ifdef CONFIG_PM_SLEEP
> +               [1] = {
> +                       .enter                  = tegra30_idle_lp2,
> +                       .exit_latency           = 2000,
> +                       .target_residency       = 2200,
> +                       .power_usage            = 0,
> +                       .flags                  = CPUIDLE_FLAG_TIME_VALID,
> +                       .name                   = "LP2",
> +                       .desc                   = "CPU power-gate",
> +               },
> +#endif
>         },
>  };
> 
>  static DEFINE_PER_CPU(struct cpuidle_device, tegra_idle_device);
> 
> +#ifdef CONFIG_PM_SLEEP
> +static bool tegra30_idle_enter_lp2_cpu_n(struct cpuidle_device *dev,
> +                                        struct cpuidle_driver *drv,
> +                                        int index)
> +{
> +#ifdef CONFIG_SMP
> +       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
> +
> +       smp_wmb();
> +
> +       save_cpu_arch_register();
> +
> +       cpu_suspend(0, tegra30_sleep_cpu_secondary_finish);
> +
> +       restore_cpu_arch_register();
> +
> +       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
> +#endif

Can't you factor out this #ifdef out using an inline function ?

> +
> +       return true;
> +}
> +
> +static int __cpuinit tegra30_idle_lp2(struct cpuidle_device *dev,
> +                                     struct cpuidle_driver *drv,
> +                                     int index)
> +{
> +       bool entered_lp2 = false;
> +
> +       local_fiq_disable();
> +
> +       tegra_set_cpu_in_lp2(dev->cpu);
> +       cpu_pm_enter();
> +
> +       if (dev->cpu == 0)

Logical cpu 0 ? Or you need a HW cpu 0 check here ? If you boot on a CPU
that is different from HW CPU 0 (do not know if that's possible) you
might have a problem.

[...]

> +bool __cpuinit tegra_set_cpu_in_lp2(int cpu)
> +{
> +       bool last_cpu = false;
> +
> +       spin_lock(&tegra_lp2_lock);
> +       BUG_ON(cpumask_test_cpu(cpu, &tegra_in_lp2));
> +       cpumask_set_cpu(cpu, &tegra_in_lp2);
> +
> +       /*
> +        * Update the IRAM copy used by the reset handler. The IRAM copy
> +        * can't use used directly by cpumask_set_cpu() because it uses
> +        * LDREX/STREX which requires the addressed location to be inner
> +        * cacheable and sharable which IRAM isn't.
> +        */
> +       writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> +       dsb();
> +
> +       if ((cpu == 0) && cpumask_equal(&tegra_in_lp2, cpu_online_mask))
> +               last_cpu = true;

For cpu == 0, see above.

[...]

> +ENTRY(tegra_flush_l1_cache)
> +       stmfd   sp!, {r4-r5, r7, r9-r11, lr}
> +       dmb                                     @ ensure ordering
> +
> +       /* Disable the data cache */
> +       mrc     p15, 0, r2, c1, c0, 0
> +       bic     r2, r2, #CR_C
> +       dsb
> +       mcr     p15, 0, r2, c1, c0, 0
> +
> +       /* Flush data cache */
> +       mov     r10, #0
> +#ifdef CONFIG_PREEMPT
> +       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
> +#endif
> +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> +       isb                                     @ isb to sych the new cssr&csidr
> +       mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
> +#ifdef CONFIG_PREEMPT
> +       restore_irqs_notrace r9
> +#endif
> +       and     r2, r1, #7                      @ extract the length of the cache lines
> +       add     r2, r2, #4                      @ add 4 (line length offset)
> +       ldr     r4, =0x3ff
> +       ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
> +       clz     r5, r4                          @ find bit position of way size increment
> +       ldr     r7, =0x7fff
> +       ands    r7, r7, r1, lsr #13             @ extract max number of the index size
> +loop2:
> +       mov     r9, r4                          @ create working copy of max way size
> +loop3:
> +       orr     r11, r10, r9, lsl r5            @ factor way and cache number into r11
> +       orr     r11, r11, r7, lsl r2            @ factor index number into r11
> +       mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
> +       subs    r9, r9, #1                      @ decrement the way
> +       bge     loop3
> +       subs    r7, r7, #1                      @ decrement the index
> +       bge     loop2
> +finished:
> +       mov     r10, #0                         @ swith back to cache level 0
> +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> +       dsb
> +       isb

This code is already in the kernel in cache-v7.S, please use that.
We are just adding the new LoUIS API that probably does what you
want, even though for Tegra, that is an A9 based platform I fail to
understand why Level of Coherency differs from L1.

Can you explain to me please why Level of Coherency (LoC) is != from L1
on Tegra ?

Thanks,
Lorenzo
Joseph Lo Oct. 9, 2012, 4:13 a.m. UTC | #2
Hi Lorenzo,

Thanks for your review.

On Tue, 2012-10-09 at 00:35 +0800, Lorenzo Pieralisi wrote:
> On Mon, Oct 08, 2012 at 11:26:17AM +0100, Joseph Lo wrote:
> > This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
> > The secondary CPUs can go into LP2 state independently. When CPU goes
> > into LP2 state, it saves it's state and puts itself to flow controlled
> > WFI state. After that, it will been power gated.
> > 
> > Based on the work by:
> > Scott Williams <scwilliams@nvidia.com>
> > 
> > Signed-off-by: Joseph Lo <josephl@nvidia.com>
> > ---
...
> > +#ifdef CONFIG_PM_SLEEP
> > +static bool tegra30_idle_enter_lp2_cpu_n(struct cpuidle_device *dev,
> > +                                        struct cpuidle_driver *drv,
> > +                                        int index)
> > +{
> > +#ifdef CONFIG_SMP
> > +       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
> > +
> > +       smp_wmb();
> > +
> > +       save_cpu_arch_register();
> > +
> > +       cpu_suspend(0, tegra30_sleep_cpu_secondary_finish);
> > +
> > +       restore_cpu_arch_register();
> > +
> > +       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
> > +#endif
> 
> Can't you factor out this #ifdef out using an inline function ?
> 
OK. Will do.

> > +
> > +       return true;
> > +}
> > +
> > +static int __cpuinit tegra30_idle_lp2(struct cpuidle_device *dev,
> > +                                     struct cpuidle_driver *drv,
> > +                                     int index)
> > +{
> > +       bool entered_lp2 = false;
> > +
> > +       local_fiq_disable();
> > +
> > +       tegra_set_cpu_in_lp2(dev->cpu);
> > +       cpu_pm_enter();
> > +
> > +       if (dev->cpu == 0)
> 
> Logical cpu 0 ? Or you need a HW cpu 0 check here ? If you boot on a CPU
> that is different from HW CPU 0 (do not know if that's possible) you
> might have a problem.
> 
> [...]
> 
For Tegra20 & Tegra30, it's always physical CPU 0 here. And CPU0 is
always the first boot CPU. I will change it to

cpu = cpu_logical_map(dev->cpu);

Thanks for the reminder.

> > +bool __cpuinit tegra_set_cpu_in_lp2(int cpu)
> > +{
> > +       bool last_cpu = false;
> > +
> > +       spin_lock(&tegra_lp2_lock);
> > +       BUG_ON(cpumask_test_cpu(cpu, &tegra_in_lp2));
> > +       cpumask_set_cpu(cpu, &tegra_in_lp2);
> > +
> > +       /*
> > +        * Update the IRAM copy used by the reset handler. The IRAM copy
> > +        * can't use used directly by cpumask_set_cpu() because it uses
> > +        * LDREX/STREX which requires the addressed location to be inner
> > +        * cacheable and sharable which IRAM isn't.
> > +        */
> > +       writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> > +       dsb();
> > +
> > +       if ((cpu == 0) && cpumask_equal(&tegra_in_lp2, cpu_online_mask))
> > +               last_cpu = true;
> 
> For cpu == 0, see above.
> 
> [...]
> 
Will use cpu_logical_map to get the physical CPU first, thanks.

> > +ENTRY(tegra_flush_l1_cache)
> > +       stmfd   sp!, {r4-r5, r7, r9-r11, lr}
> > +       dmb                                     @ ensure ordering
> > +
> > +       /* Disable the data cache */
> > +       mrc     p15, 0, r2, c1, c0, 0
> > +       bic     r2, r2, #CR_C
> > +       dsb
> > +       mcr     p15, 0, r2, c1, c0, 0
> > +
> > +       /* Flush data cache */
> > +       mov     r10, #0
> > +#ifdef CONFIG_PREEMPT
> > +       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
> > +#endif
> > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> > +       isb                                     @ isb to sych the new cssr&csidr
> > +       mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
> > +#ifdef CONFIG_PREEMPT
> > +       restore_irqs_notrace r9
> > +#endif
> > +       and     r2, r1, #7                      @ extract the length of the cache lines
> > +       add     r2, r2, #4                      @ add 4 (line length offset)
> > +       ldr     r4, =0x3ff
> > +       ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
> > +       clz     r5, r4                          @ find bit position of way size increment
> > +       ldr     r7, =0x7fff
> > +       ands    r7, r7, r1, lsr #13             @ extract max number of the index size
> > +loop2:
> > +       mov     r9, r4                          @ create working copy of max way size
> > +loop3:
> > +       orr     r11, r10, r9, lsl r5            @ factor way and cache number into r11
> > +       orr     r11, r11, r7, lsl r2            @ factor index number into r11
> > +       mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
> > +       subs    r9, r9, #1                      @ decrement the way
> > +       bge     loop3
> > +       subs    r7, r7, #1                      @ decrement the index
> > +       bge     loop2
> > +finished:
> > +       mov     r10, #0                         @ swith back to cache level 0
> > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr 
> > +       dsb
> > +       isb
> 
> This code is already in the kernel in cache-v7.S, please use that.
> We are just adding the new LoUIS API that probably does what you
> want, even though for Tegra, that is an A9 based platform I fail to
> understand why Level of Coherency differs from L1.
> 
> Can you explain to me please why Level of Coherency (LoC) is != from L1
> on Tegra ?
> 

Thanks for introducing the new LoUIS cache API. Has LoUIS been changed
by other HW? I checked the new LoUIS API. If LoUIS == 0, it means inner
shareable, so it does nothing and just returns. But I need to flush the
L1 data cache here to sync coherency before the CPU is power-gated. And
disabling the data cache before the flush is needed.

Let me describe the sequence to explain why we only do an L1 data cache
flush here. Maybe I need to change the comment to "flush to point of
coherency", not "level of coherency".

For secondary CPUs:
* after cpu_suspend
* disable data cache and flush L1 data cache
* Turn off SMP coherency
* power gate CPU

For CPU0:
* outer_disable (flush and disable L2)
* cpu_suspend
* disable data cache and flush L1 data cache
* Turn off SMP coherency
* Turn off MMU
* shut off the CPU rail

So we only do flush to PoC.

And would changing the sequence for the secondary CPUs to the below be
more suitable?
* after cpu_suspend
* disable data cache and call to v7_flush_dcache_all
* Turn off SMP coherency
* power gate CPU

What do you think?

Thanks,
Joseph
Lorenzo Pieralisi Oct. 9, 2012, 8:38 a.m. UTC | #3
On Tue, Oct 09, 2012 at 05:13:15AM +0100, Joseph Lo wrote:

[...]

> > > +ENTRY(tegra_flush_l1_cache)
> > > +       stmfd   sp!, {r4-r5, r7, r9-r11, lr}
> > > +       dmb                                     @ ensure ordering
> > > +
> > > +       /* Disable the data cache */
> > > +       mrc     p15, 0, r2, c1, c0, 0
> > > +       bic     r2, r2, #CR_C
> > > +       dsb
> > > +       mcr     p15, 0, r2, c1, c0, 0
> > > +
> > > +       /* Flush data cache */
> > > +       mov     r10, #0
> > > +#ifdef CONFIG_PREEMPT
> > > +       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
> > > +#endif
> > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> > > +       isb                                     @ isb to sych the new cssr&csidr
> > > +       mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
> > > +#ifdef CONFIG_PREEMPT
> > > +       restore_irqs_notrace r9
> > > +#endif
> > > +       and     r2, r1, #7                      @ extract the length of the cache lines
> > > +       add     r2, r2, #4                      @ add 4 (line length offset)
> > > +       ldr     r4, =0x3ff
> > > +       ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
> > > +       clz     r5, r4                          @ find bit position of way size increment
> > > +       ldr     r7, =0x7fff
> > > +       ands    r7, r7, r1, lsr #13             @ extract max number of the index size
> > > +loop2:
> > > +       mov     r9, r4                          @ create working copy of max way size
> > > +loop3:
> > > +       orr     r11, r10, r9, lsl r5            @ factor way and cache number into r11
> > > +       orr     r11, r11, r7, lsl r2            @ factor index number into r11
> > > +       mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
> > > +       subs    r9, r9, #1                      @ decrement the way
> > > +       bge     loop3
> > > +       subs    r7, r7, #1                      @ decrement the index
> > > +       bge     loop2
> > > +finished:
> > > +       mov     r10, #0                         @ swith back to cache level 0
> > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr 
> > > +       dsb
> > > +       isb
> > 
> > This code is already in the kernel in cache-v7.S, please use that.
> > We are just adding the new LoUIS API that probably does what you
> > want, even though for Tegra, that is an A9 based platform I fail to
> > understand why Level of Coherency differs from L1.
> > 
> > Can you explain to me please why Level of Coherency (LoC) is != from L1
> > on Tegra ?
> > 
> 
> Thanks for introducing the new LoUIS cache API. Did LoUIS been changed
> by other HW? I checked the new LoUIS API. If LoUIS == 0, it means inner
> shareable then it do nothing just return. But I need to flush L1 data
> cache here to sync the coherency before CPU be power gated. And disable
> data cache before flush is needed.

I understand that, that's why I am asking. To me LoUIS and LoC should
both be the same for A9 based platforms and they should both represent
a cache level that *includes* L1.

Can you provide me with the CLIDR value for Tegra3 please ?

> 
> I can tell you the sequence that why we just do L1 data cache flush
> here. Maybe I need to change the comment to "flush to point of
> coherency" not "level of coherency".
> 
> For secondary CPUs:
> * after cpu_suspend
> * disable data cache and flush L1 data cache
    ^(1)
> * Turn off SMP coherency
    ^(2)

Two steps above, one assembly function, no access to any data whatsoever
please.

> * power gate CPU
> 
> For CPU0:
> * outer_disable (flush and disable L2)

I guess L2 cannot be retained on Tegra ?

> * cpu_suspend
> * disable data cache and flush L1 data cache
> * Turn off SMP coherency
> * Turn off MMU
> * shut off the CPU rail
> 
> So we only do flush to PoC.
> 
> And changing the sequence of secondary CPUs to belows maybe more
> suitable?

Yes, basically because the net effect should be identical.

Lorenzo

> * after cpu_suspend
> * disable data cache and call to v7_flush_dcache_all
> * Turn off SMP coherency
> * power gate CPU
>
Joseph Lo Oct. 9, 2012, 9:18 a.m. UTC | #4
On Tue, 2012-10-09 at 16:38 +0800, Lorenzo Pieralisi wrote:
> On Tue, Oct 09, 2012 at 05:13:15AM +0100, Joseph Lo wrote:
> 
> [...]
> 
> > > > +ENTRY(tegra_flush_l1_cache)
> > > > +       stmfd   sp!, {r4-r5, r7, r9-r11, lr}
> > > > +       dmb                                     @ ensure ordering
> > > > +
> > > > +       /* Disable the data cache */
> > > > +       mrc     p15, 0, r2, c1, c0, 0
> > > > +       bic     r2, r2, #CR_C
> > > > +       dsb
> > > > +       mcr     p15, 0, r2, c1, c0, 0
> > > > +
> > > > +       /* Flush data cache */
> > > > +       mov     r10, #0
> > > > +#ifdef CONFIG_PREEMPT
> > > > +       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
> > > > +#endif
> > > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> > > > +       isb                                     @ isb to sych the new cssr&csidr
> > > > +       mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
> > > > +#ifdef CONFIG_PREEMPT
> > > > +       restore_irqs_notrace r9
> > > > +#endif
> > > > +       and     r2, r1, #7                      @ extract the length of the cache lines
> > > > +       add     r2, r2, #4                      @ add 4 (line length offset)
> > > > +       ldr     r4, =0x3ff
> > > > +       ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
> > > > +       clz     r5, r4                          @ find bit position of way size increment
> > > > +       ldr     r7, =0x7fff
> > > > +       ands    r7, r7, r1, lsr #13             @ extract max number of the index size
> > > > +loop2:
> > > > +       mov     r9, r4                          @ create working copy of max way size
> > > > +loop3:
> > > > +       orr     r11, r10, r9, lsl r5            @ factor way and cache number into r11
> > > > +       orr     r11, r11, r7, lsl r2            @ factor index number into r11
> > > > +       mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
> > > > +       subs    r9, r9, #1                      @ decrement the way
> > > > +       bge     loop3
> > > > +       subs    r7, r7, #1                      @ decrement the index
> > > > +       bge     loop2
> > > > +finished:
> > > > +       mov     r10, #0                         @ swith back to cache level 0
> > > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr 
> > > > +       dsb
> > > > +       isb
> > > 
> > > This code is already in the kernel in cache-v7.S, please use that.
> > > We are just adding the new LoUIS API that probably does what you
> > > want, even though for Tegra, that is an A9 based platform I fail to
> > > understand why Level of Coherency differs from L1.
> > > 
> > > Can you explain to me please why Level of Coherency (LoC) is != from L1
> > > on Tegra ?
> > > 
> > 
> > Thanks for introducing the new LoUIS cache API. Did LoUIS been changed
> > by other HW? I checked the new LoUIS API. If LoUIS == 0, it means inner
> > shareable then it do nothing just return. But I need to flush L1 data
> > cache here to sync the coherency before CPU be power gated. And disable
> > data cache before flush is needed.
> 
> I understand that, that's why I am asking. To me LoUIS and LoC should
> both be the same for A9 based platforms and they should both represent
> a cache level that *includes* L1.
> 
> Can you provide me with the CLIDR value for Tegra3 please ?
> 

I just checked the CLIDR of both Tegra20 and Tegra30. It's 0x09200003.
It looks like I can use this new LoUIS API, right? :)

I will give it a try and update to next version. Thanks.

> > 
> > I can tell you the sequence that why we just do L1 data cache flush
> > here. Maybe I need to change the comment to "flush to point of
> > coherency" not "level of coherency".
> > 
> > For secondary CPUs:
> > * after cpu_suspend
> > * disable data cache and flush L1 data cache
>     ^(1)
> > * Turn off SMP coherency
>     ^(2)
> 
> Two steps above, one assembly function, no access to any data whatsoever
> please.
> 
OK. Will do this and update to next version.

> > * power gate CPU
> > 
> > For CPU0:
> > * outer_disable (flush and disable L2)
> 
> I guess L2 cannot be retained on Tegra ?
> 
Yes.

I will give it a test and update later.

Thanks,
Joseph
Lorenzo Pieralisi Oct. 9, 2012, 9:42 a.m. UTC | #5
On Tue, Oct 09, 2012 at 10:18:57AM +0100, Joseph Lo wrote:
> On Tue, 2012-10-09 at 16:38 +0800, Lorenzo Pieralisi wrote:
> > On Tue, Oct 09, 2012 at 05:13:15AM +0100, Joseph Lo wrote:
> > 
> > [...]
> > 
> > > > > +ENTRY(tegra_flush_l1_cache)
> > > > > +       stmfd   sp!, {r4-r5, r7, r9-r11, lr}
> > > > > +       dmb                                     @ ensure ordering
> > > > > +
> > > > > +       /* Disable the data cache */
> > > > > +       mrc     p15, 0, r2, c1, c0, 0
> > > > > +       bic     r2, r2, #CR_C
> > > > > +       dsb
> > > > > +       mcr     p15, 0, r2, c1, c0, 0
> > > > > +
> > > > > +       /* Flush data cache */
> > > > > +       mov     r10, #0
> > > > > +#ifdef CONFIG_PREEMPT
> > > > > +       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
> > > > > +#endif
> > > > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
> > > > > +       isb                                     @ isb to sych the new cssr&csidr
> > > > > +       mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
> > > > > +#ifdef CONFIG_PREEMPT
> > > > > +       restore_irqs_notrace r9
> > > > > +#endif
> > > > > +       and     r2, r1, #7                      @ extract the length of the cache lines
> > > > > +       add     r2, r2, #4                      @ add 4 (line length offset)
> > > > > +       ldr     r4, =0x3ff
> > > > > +       ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
> > > > > +       clz     r5, r4                          @ find bit position of way size increment
> > > > > +       ldr     r7, =0x7fff
> > > > > +       ands    r7, r7, r1, lsr #13             @ extract max number of the index size
> > > > > +loop2:
> > > > > +       mov     r9, r4                          @ create working copy of max way size
> > > > > +loop3:
> > > > > +       orr     r11, r10, r9, lsl r5            @ factor way and cache number into r11
> > > > > +       orr     r11, r11, r7, lsl r2            @ factor index number into r11
> > > > > +       mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
> > > > > +       subs    r9, r9, #1                      @ decrement the way
> > > > > +       bge     loop3
> > > > > +       subs    r7, r7, #1                      @ decrement the index
> > > > > +       bge     loop2
> > > > > +finished:
> > > > > +       mov     r10, #0                         @ swith back to cache level 0
> > > > > +       mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr 
> > > > > +       dsb
> > > > > +       isb
> > > > 
> > > > This code is already in the kernel in cache-v7.S, please use that.
> > > > We are just adding the new LoUIS API that probably does what you
> > > > want, even though for Tegra, that is an A9 based platform I fail to
> > > > understand why Level of Coherency differs from L1.
> > > > 
> > > > Can you explain to me please why Level of Coherency (LoC) is != from L1
> > > > on Tegra ?
> > > > 
> > > 
> > > Thanks for introducing the new LoUIS cache API. Did LoUIS been changed
> > > by other HW? I checked the new LoUIS API. If LoUIS == 0, it means inner
> > > shareable then it do nothing just return. But I need to flush L1 data
> > > cache here to sync the coherency before CPU be power gated. And disable
> > > data cache before flush is needed.
> > 
> > I understand that, that's why I am asking. To me LoUIS and LoC should
> > both be the same for A9 based platforms and they should both represent
> > a cache level that *includes* L1.
> > 
> > Can you provide me with the CLIDR value for Tegra3 please ?
> > 
> 
> I just checked the CLIDR of both Tegra20 and Tegra30. It's 0x09200003.
> It looks I can use this new LoUIS api, right? :)

You do not have to, since LoUIS == LoC, but that would be appreciated.

If you execute:

v7_flush_dcache_all

That would do the same thing on current Tegra(s).

If you want to reuse the same code for future A15/A7 processors then yes,

v7_flush_dcache_louis

is what you want to call, and that works for current Tegra(s) as well since,
as I mentioned LoUIS == LoC there.

Overall, using the new LoUIS API for single core shutdown is the best
option, since it should work for current and future cores.

Thanks,
Lorenzo
Stephen Warren Oct. 9, 2012, 10:38 p.m. UTC | #6
On 10/08/2012 04:26 AM, Joseph Lo wrote:
> This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
> The secondary CPUs can go into LP2 state independently. When CPU goes
> into LP2 state, it saves it's state and puts itself to flow controlled
> WFI state. After that, it will been power gated.

> diff --git a/arch/arm/mach-tegra/cpuidle-tegra30.c b/arch/arm/mach-tegra/cpuidle-tegra30.c

>  static struct cpuidle_driver tegra_idle_driver = {
>  	.name = "tegra_idle",
>  	.owner = THIS_MODULE,
>  	.en_core_tk_irqen = 1,
> -	.state_count = 1,
> +	.state_count = 2,

Doesn't that assignment need to be ifdef'd just like the array entry
setup below:

>  	.states = {
>  		[0] = ARM_CPUIDLE_WFI_STATE_PWR(600),
> +#ifdef CONFIG_PM_SLEEP
> +		[1] = {
> +			.enter			= tegra30_idle_lp2,
> +			.exit_latency		= 2000,
> +			.target_residency	= 2200,
> +			.power_usage		= 0,
> +			.flags			= CPUIDLE_FLAG_TIME_VALID,
> +			.name			= "LP2",
> +			.desc			= "CPU power-gate",
> +		},
> +#endif
>  	},
>  };

> @@ -41,6 +114,10 @@ int __init tegra30_cpuidle_init(void)
>  	struct cpuidle_device *dev;
>  	struct cpuidle_driver *drv = &tegra_idle_driver;
>  
> +#ifndef CONFIG_PM_SLEEP
> +	drv->state_count = 1;	/* support clockgating only */
> +#endif

Oh, I see it's done here. Just fixing the static initialization seems a
lot simpler?

> diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c

> +void __cpuinit tegra_clear_cpu_in_lp2(int cpu)
> +{
> +	spin_lock(&tegra_lp2_lock);
> +	BUG_ON(!cpumask_test_cpu(cpu, &tegra_in_lp2));
> +	cpumask_clear_cpu(cpu, &tegra_in_lp2);
> +
> +	/*
> +	 * Update the IRAM copy used by the reset handler. The IRAM copy
> +	 * can't use used directly by cpumask_clear_cpu() because it uses
> +	 * LDREX/STREX which requires the addressed location to be inner
> +	 * cacheable and sharable which IRAM isn't.
> +	 */
> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> +	dsb();

Why not /just/ store the data in IRAM, and read/write directly to it,
rather than maintaining an SDRAM-based copy of it?

Then, wouldn't the body of this function be simply:

spin_lock();
BUG_ON(!(tegra_cpu_lp2_mask & BIT(cpu)));
tegra_cpu_lp2_mask |= BIT(cpu);
spin_unlock();

> +bool __cpuinit tegra_set_cpu_in_lp2(int cpu)

Similar comment here.
Joseph Lo Oct. 11, 2012, 9:15 a.m. UTC | #7
On Wed, 2012-10-10 at 06:38 +0800, Stephen Warren wrote:
> On 10/08/2012 04:26 AM, Joseph Lo wrote:
> > This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
> > The secondary CPUs can go into LP2 state independently. When CPU goes
> > into LP2 state, it saves it's state and puts itself to flow controlled
> > WFI state. After that, it will been power gated.
> 
> > diff --git a/arch/arm/mach-tegra/cpuidle-tegra30.c b/arch/arm/mach-tegra/cpuidle-tegra30.c
> 
> >  static struct cpuidle_driver tegra_idle_driver = {
> >  	.name = "tegra_idle",
> >  	.owner = THIS_MODULE,
> >  	.en_core_tk_irqen = 1,
> > -	.state_count = 1,
> > +	.state_count = 2,
> 
> Doesn't that assignment need to be ifdef'd just like the array entry
> setup below:
> 
> >  	.states = {
> >  		[0] = ARM_CPUIDLE_WFI_STATE_PWR(600),
> > +#ifdef CONFIG_PM_SLEEP
> > +		[1] = {
> > +			.enter			= tegra30_idle_lp2,
> > +			.exit_latency		= 2000,
> > +			.target_residency	= 2200,
> > +			.power_usage		= 0,
> > +			.flags			= CPUIDLE_FLAG_TIME_VALID,
> > +			.name			= "LP2",
> > +			.desc			= "CPU power-gate",
> > +		},
> > +#endif
> >  	},
> >  };
> 
> > @@ -41,6 +114,10 @@ int __init tegra30_cpuidle_init(void)
> >  	struct cpuidle_device *dev;
> >  	struct cpuidle_driver *drv = &tegra_idle_driver;
> >  
> > +#ifndef CONFIG_PM_SLEEP
> > +	drv->state_count = 1;	/* support clockgating only */
> > +#endif
> 
> Oh, I see it's done here. Just fixing the static initialization seems a
> lot simpler?
> 
OK. Will do.

> > diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c
> 
> > +void __cpuinit tegra_clear_cpu_in_lp2(int cpu)
> > +{
> > +	spin_lock(&tegra_lp2_lock);
> > +	BUG_ON(!cpumask_test_cpu(cpu, &tegra_in_lp2));
> > +	cpumask_clear_cpu(cpu, &tegra_in_lp2);
> > +
> > +	/*
> > +	 * Update the IRAM copy used by the reset handler. The IRAM copy
> > +	 * can't use used directly by cpumask_clear_cpu() because it uses
> > +	 * LDREX/STREX which requires the addressed location to be inner
> > +	 * cacheable and sharable which IRAM isn't.
> > +	 */
> > +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> > +	dsb();
> 
> Why not /just/ store the data in IRAM, and read/write directly to it,
> rather than maintaining an SDRAM-based copy of it?
> 
> Then, wouldn't the body of this function be simply:
> 
> spin_lock();
> BUG_ON(!(tegra_cpu_lp2_mask & BIT(cpu)));
> tegra_cpu_lp2_mask |= BIT(cpu);
> spin_unlock();
> 

It may not be as simple as that. To keep it identical to a cpumask, it
may look like the code below, because I need to compare it with cpu_online_mask.

void __cpuinit tegra_clear_cpu_in_lp2(int phy_cpu_id)
{
	cpumask_t *iram_cpu_lp2_mask = tegra_cpu_lp2_mask;
	unsigned long *addr;

	spin_lock(&tegra_lp2_lock);

	addr = cpumask_bits(iram_cpu_lp2_mask);
	BUG_ON(!(1UL &
		(addr[BIT_WORD(phy_cpu_id)]
		 >> (phy_cpu_id & (BITS_PER_LONG-1)))));

	*(addr + BIT_WORD(phy_cpu_id)) &= ~BIT_MASK(phy_cpu_id);

	spin_unlock(&tegra_lp2_lock);
}

> > +bool __cpuinit tegra_set_cpu_in_lp2(int cpu)
> 
> Similar comment here.

bool __cpuinit tegra_set_cpu_in_lp2(int phy_cpu_id)
{ 
	bool last_cpu = false;
	cpumask_t *iram_cpu_lp2_mask = tegra_cpu_lp2_mask;
	unsigned long *addr;

	spin_lock(&tegra_lp2_lock);

	addr = cpumask_bits(iram_cpu_lp2_mask);
	BUG_ON((1UL &
		(addr[BIT_WORD(phy_cpu_id)]
		 >> (phy_cpu_id & (BITS_PER_LONG-1)))));

	*(addr + BIT_WORD(phy_cpu_id)) |= BIT_MASK(phy_cpu_id);

	if ((phy_cpu_id == 0) && 
	     cpumask_equal(iram_cpu_lp2_mask, cpu_online_mask))
		last_cpu = true;

	spin_unlock(&tegra_lp2_lock);
	return last_cpu;
}

So I think the original version should be easier to understand. What do
you think?

Thanks,
Joseph
Stephen Warren Oct. 11, 2012, 4:24 p.m. UTC | #8
On 10/11/2012 03:15 AM, Joseph Lo wrote:
> On Wed, 2012-10-10 at 06:38 +0800, Stephen Warren wrote:
>> On 10/08/2012 04:26 AM, Joseph Lo wrote:
>>> This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
>>> The secondary CPUs can go into LP2 state independently. When CPU goes
>>> into LP2 state, it saves it's state and puts itself to flow controlled
>>> WFI state. After that, it will been power gated.
>>
>>> diff --git a/arch/arm/mach-tegra/cpuidle-tegra30.c b/arch/arm/mach-tegra/cpuidle-tegra30.c
>>
>>>  static struct cpuidle_driver tegra_idle_driver = {
>>>  	.name = "tegra_idle",
>>>  	.owner = THIS_MODULE,
>>>  	.en_core_tk_irqen = 1,
>>> -	.state_count = 1,
>>> +	.state_count = 2,
>>
>> Doesn't that assignment need to be ifdef'd just like the array entry
>> setup below:
>>
>>>  	.states = {
>>>  		[0] = ARM_CPUIDLE_WFI_STATE_PWR(600),
>>> +#ifdef CONFIG_PM_SLEEP
>>> +		[1] = {
>>> +			.enter			= tegra30_idle_lp2,
>>> +			.exit_latency		= 2000,
>>> +			.target_residency	= 2200,
>>> +			.power_usage		= 0,
>>> +			.flags			= CPUIDLE_FLAG_TIME_VALID,
>>> +			.name			= "LP2",
>>> +			.desc			= "CPU power-gate",
>>> +		},
>>> +#endif
>>>  	},
>>>  };
>>
>>> @@ -41,6 +114,10 @@ int __init tegra30_cpuidle_init(void)
>>>  	struct cpuidle_device *dev;
>>>  	struct cpuidle_driver *drv = &tegra_idle_driver;
>>>  
>>> +#ifndef CONFIG_PM_SLEEP
>>> +	drv->state_count = 1;	/* support clockgating only */
>>> +#endif
>>
>> Oh, I see it's done here. Just fixing the static initialization seems a
>> lot simpler?
>>
> OK. Will do.
> 
>>> diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c
>>
>>> +void __cpuinit tegra_clear_cpu_in_lp2(int cpu)
>>> +{
>>> +	spin_lock(&tegra_lp2_lock);
>>> +	BUG_ON(!cpumask_test_cpu(cpu, &tegra_in_lp2));
>>> +	cpumask_clear_cpu(cpu, &tegra_in_lp2);
>>> +
>>> +	/*
>>> +	 * Update the IRAM copy used by the reset handler. The IRAM copy
>>> +	 * can't use used directly by cpumask_clear_cpu() because it uses
>>> +	 * LDREX/STREX which requires the addressed location to be inner
>>> +	 * cacheable and sharable which IRAM isn't.
>>> +	 */
>>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
>>> +	dsb();
>>
>> Why not /just/ store the data in IRAM, and read/write directly to it,
>> rather than maintaining an SDRAM-based copy of it?
>>
>> Then, wouldn't the body of this function be simply:
>>
>> spin_lock();
>> BUG_ON(!(tegra_cpu_lp2_mask & BIT(cpu)));
>> tegra_cpu_lp2_mask |= BIT(cpu);
>> spin_unlock();
>>
> 
> It may not simple like this. To maintain it identical to a cpumask. It
> may look likes below. Because I need to compare it with cpu_online_mask.

Oh, the comparison against cpu_online_mask() is what I was missing. I
guess that offline CPUs don't go into LP2, so you can't just check that
tegra_cpu_lp2_mask == (1 << num_cpus()) - 1.

One way to avoid that might be to maintain a cpu_in_lp2_count variable
alongside the mask, and simply compare that against num_online_cpus()
rather than comparing the two masks. At least that would avoid the
following line:

>>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);

... making use of knowledge of the internal structure of the struct
cpumask type.

However, given the comparison requirement, either way is probably fine.
Joseph Lo Oct. 12, 2012, 3:21 a.m. UTC | #9
On Fri, 2012-10-12 at 00:24 +0800, Stephen Warren wrote:
> On 10/11/2012 03:15 AM, Joseph Lo wrote:
> > On Wed, 2012-10-10 at 06:38 +0800, Stephen Warren wrote:
> >> On 10/08/2012 04:26 AM, Joseph Lo wrote:
> >>> This supports power-gated (LP2) idle on secondary CPUs for Tegra30.
> >>> The secondary CPUs can go into LP2 state independently. When CPU goes
> >>> into LP2 state, it saves it's state and puts itself to flow controlled
> >>> WFI state. After that, it will been power gated.
> >>
> >>> diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c
> >>
> >>> +void __cpuinit tegra_clear_cpu_in_lp2(int cpu)
> >>> +{
> >>> +	spin_lock(&tegra_lp2_lock);
> >>> +	BUG_ON(!cpumask_test_cpu(cpu, &tegra_in_lp2));
> >>> +	cpumask_clear_cpu(cpu, &tegra_in_lp2);
> >>> +
> >>> +	/*
> >>> +	 * Update the IRAM copy used by the reset handler. The IRAM copy
> >>> +	 * can't use used directly by cpumask_clear_cpu() because it uses
> >>> +	 * LDREX/STREX which requires the addressed location to be inner
> >>> +	 * cacheable and sharable which IRAM isn't.
> >>> +	 */
> >>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> >>> +	dsb();
> >>
> >> Why not /just/ store the data in IRAM, and read/write directly to it,
> >> rather than maintaining an SDRAM-based copy of it?
> >>
> >> Then, wouldn't the body of this function be simply:
> >>
> >> spin_lock();
> >> BUG_ON(!(tegra_cpu_lp2_mask & BIT(cpu)));
> >> tegra_cpu_lp2_mask |= BIT(cpu);
> >> spin_unlock();
> >>
> > 
> > It may not simple like this. To maintain it identical to a cpumask. It
> > may look likes below. Because I need to compare it with cpu_online_mask.
> 
> Oh, the comparison against cpu_online_mask() is what I was missing. I
> guess that offline CPUs don't go into LP2, so you can't just check that
> tegra_cpu_lp2_mask == (1 << num_cpus()) - 1.
> 
> One way to avoid that might be to maintain a cpu_in_lp2_count variable
> alongside the mask, and simply compare that against num_online_cpus()
> rather than comparing the two masks. At least that would avoid the
> following line:
> 
> >>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> 
> ... making use of knowledge of the internal structure of the struct
> cpumask type.
> 
> However, given the comparison requirement, either way is probably fine.

OK. I get your idea now, and I rechecked it today. The method that you
originally suggested can satisfy the usage here: it can track the CPUs
that go into LP2 as a cpumask. I just verified it. Will update in the
next version.

Thanks,
Joseph
Stephen Warren Oct. 30, 2012, 10:27 p.m. UTC | #10
On 10/30/2012 04:03 PM, Antti P Miettinen wrote:
> Joseph Lo <josephl@nvidia.com> writes:
>>>>>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> 
> BTW, writel_relaxed() would probably be more than enough? IRAM is mapped
> stronly ordered, isn't it? And there's an explicit dsb(). And the mask
> is observed and written only by CPUs. If there are coherence issues,
> they would be in the fabric? And then neither CPU barriers nor L2 sync
> would help, you'd need a readback, right?

I expect there are many places where we simply default to using
readl/writel (e.g. due to cut/paste, their prevalence, etc.) rather than
explicitly using the _relaxed variants if we can. Perhaps we should do a
pass through all the Tegra code and clean that up sometime.
Joseph Lo Oct. 31, 2012, 1:26 a.m. UTC | #11
On Wed, 2012-10-31 at 06:27 +0800, Stephen Warren wrote:
> On 10/30/2012 04:03 PM, Antti P Miettinen wrote:
> > Joseph Lo <josephl@nvidia.com> writes:
> >>>>>> +	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
> > 
> > BTW, writel_relaxed() would probably be more than enough? IRAM is mapped
> > stronly ordered, isn't it? And there's an explicit dsb(). And the mask
> > is observed and written only by CPUs. If there are coherence issues,
> > they would be in the fabric? And then neither CPU barriers nor L2 sync
> > would help, you'd need a readback, right?
> 
> I expect there are many places where we simply default to using
> readl/writel (e.g. due to cut/paste, their prevalence, etc.) rather than
> explicitly using the _relaxed variants if we can. Perhaps we should do a
> pass through all the Tegra code and clean that up sometime.

Hi Antti,

Thanks for review.
I have updated this code since V2. The code now looks like the snippet
below; it's similar to the "writel_relaxed" function. I have verified this
code in an SMP environment and it can sync the status of "cpu_in_lp2".
I don't see any coherency issue in the IRAM memory space right now. I know
some IO registers under the PPSB bus (peripheral bus) need a readback as a
barrier, because the PPSB queues write transactions.

I had verified this on Tegra20 & Tegra30. It's reliable.

*cpu_in_lp2 |= BIT(phy_cpu_id);
or
*cpu_in_lp2 &= ~BIT(phy_cpu_id);

Thanks,
Joseph
diff mbox

Patch

diff --git a/arch/arm/mach-tegra/Makefile b/arch/arm/mach-tegra/Makefile
index 9b80c1e..6f224f7 100644
--- a/arch/arm/mach-tegra/Makefile
+++ b/arch/arm/mach-tegra/Makefile
@@ -8,6 +8,7 @@  obj-y					+= pmc.o
 obj-y					+= flowctrl.o
 obj-y					+= powergate.o
 obj-y					+= apbio.o
+obj-y					+= pm.o
 obj-$(CONFIG_CPU_IDLE)			+= cpuidle.o
 obj-$(CONFIG_CPU_IDLE)			+= sleep.o
 obj-$(CONFIG_ARCH_TEGRA_2x_SOC)         += tegra20_clocks.o
diff --git a/arch/arm/mach-tegra/cpuidle-tegra30.c b/arch/arm/mach-tegra/cpuidle-tegra30.c
index 0f85df8..842fef4 100644
--- a/arch/arm/mach-tegra/cpuidle-tegra30.c
+++ b/arch/arm/mach-tegra/cpuidle-tegra30.c
@@ -19,21 +19,94 @@ 
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/clockchips.h>
 
 #include <asm/cpuidle.h>
+#include <asm/proc-fns.h>
+#include <asm/suspend.h>
+
+#include "pm.h"
+#include "sleep.h"
+
+#ifdef CONFIG_PM_SLEEP
+static int tegra30_idle_lp2(struct cpuidle_device *dev,
+			    struct cpuidle_driver *drv,
+			    int index);
+#endif
 
 static struct cpuidle_driver tegra_idle_driver = {
 	.name = "tegra_idle",
 	.owner = THIS_MODULE,
 	.en_core_tk_irqen = 1,
-	.state_count = 1,
+	.state_count = 2,
 	.states = {
 		[0] = ARM_CPUIDLE_WFI_STATE_PWR(600),
+#ifdef CONFIG_PM_SLEEP
+		[1] = {
+			.enter			= tegra30_idle_lp2,
+			.exit_latency		= 2000,
+			.target_residency	= 2200,
+			.power_usage		= 0,
+			.flags			= CPUIDLE_FLAG_TIME_VALID,
+			.name			= "LP2",
+			.desc			= "CPU power-gate",
+		},
+#endif
 	},
 };
 
 static DEFINE_PER_CPU(struct cpuidle_device, tegra_idle_device);
 
+#ifdef CONFIG_PM_SLEEP
+static bool tegra30_idle_enter_lp2_cpu_n(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
+{
+#ifdef CONFIG_SMP
+	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+
+	smp_wmb();
+
+	save_cpu_arch_register();
+
+	cpu_suspend(0, tegra30_sleep_cpu_secondary_finish);
+
+	restore_cpu_arch_register();
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+#endif
+
+	return true;
+}
+
+static int __cpuinit tegra30_idle_lp2(struct cpuidle_device *dev,
+				      struct cpuidle_driver *drv,
+				      int index)
+{
+	bool entered_lp2 = false;
+
+	local_fiq_disable();
+
+	tegra_set_cpu_in_lp2(dev->cpu);
+	cpu_pm_enter();
+
+	if (dev->cpu == 0)
+		cpu_do_idle();
+	else
+		entered_lp2 = tegra30_idle_enter_lp2_cpu_n(dev, drv, index);
+
+	cpu_pm_exit();
+	tegra_clear_cpu_in_lp2(dev->cpu);
+
+	local_fiq_enable();
+
+	smp_rmb();
+
+	return (entered_lp2) ? index : 0;
+}
+#endif
+
 int __init tegra30_cpuidle_init(void)
 {
 	int ret;
@@ -41,6 +114,10 @@  int __init tegra30_cpuidle_init(void)
 	struct cpuidle_device *dev;
 	struct cpuidle_driver *drv = &tegra_idle_driver;
 
+#ifndef CONFIG_PM_SLEEP
+	drv->state_count = 1;	/* support clockgating only */
+#endif
+
 	ret = cpuidle_register_driver(&tegra_idle_driver);
 	if (ret) {
 		pr_err("CPUidle driver registration failed\n");
diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c
new file mode 100644
index 0000000..4655383
--- /dev/null
+++ b/arch/arm/mach-tegra/pm.c
@@ -0,0 +1,88 @@ 
+/*
+ * CPU complex suspend & resume functions for Tegra SoCs
+ *
+ * Copyright (c) 2009-2012, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/cpumask.h>
+
+#include <mach/iomap.h>
+
+#include "reset.h"
+
+#ifdef CONFIG_PM_SLEEP
+static unsigned int g_diag_reg;
+static DEFINE_SPINLOCK(tegra_lp2_lock);
+static cpumask_t tegra_in_lp2;
+
+void save_cpu_arch_register(void)
+{
+	/*read diagnostic register*/
+	asm("mrc p15, 0, %0, c15, c0, 1" : "=r"(g_diag_reg) : : "cc");
+	return;
+}
+
+void restore_cpu_arch_register(void)
+{
+	/*write diagnostic register*/
+	asm("mcr p15, 0, %0, c15, c0, 1" : : "r"(g_diag_reg) : "cc");
+	return;
+}
+
+void __cpuinit tegra_clear_cpu_in_lp2(int cpu)
+{
+	spin_lock(&tegra_lp2_lock);
+	BUG_ON(!cpumask_test_cpu(cpu, &tegra_in_lp2));
+	cpumask_clear_cpu(cpu, &tegra_in_lp2);
+
+	/*
+	 * Update the IRAM copy used by the reset handler. The IRAM copy
+	 * can't be used directly by cpumask_clear_cpu() because it uses
+	 * LDREX/STREX which requires the addressed location to be inner
+	 * cacheable and sharable which IRAM isn't.
+	 */
+	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
+	dsb();
+
+	spin_unlock(&tegra_lp2_lock);
+}
+
+bool __cpuinit tegra_set_cpu_in_lp2(int cpu)
+{
+	bool last_cpu = false;
+
+	spin_lock(&tegra_lp2_lock);
+	BUG_ON(cpumask_test_cpu(cpu, &tegra_in_lp2));
+	cpumask_set_cpu(cpu, &tegra_in_lp2);
+
+	/*
+	 * Update the IRAM copy used by the reset handler. The IRAM copy
+	 * can't be used directly by cpumask_set_cpu() because it uses
+	 * LDREX/STREX which requires the addressed location to be inner
+	 * cacheable and sharable which IRAM isn't.
+	 */
+	writel(tegra_in_lp2.bits[0], tegra_cpu_lp2_mask);
+	dsb();
+
+	if ((cpu == 0) && cpumask_equal(&tegra_in_lp2, cpu_online_mask))
+		last_cpu = true;
+
+	spin_unlock(&tegra_lp2_lock);
+	return last_cpu;
+}
+#endif
diff --git a/arch/arm/mach-tegra/pm.h b/arch/arm/mach-tegra/pm.h
new file mode 100644
index 0000000..21858d6
--- /dev/null
+++ b/arch/arm/mach-tegra/pm.h
@@ -0,0 +1,30 @@ 
+/*
+ * Copyright (C) 2010 Google, Inc.
+ * Copyright (c) 2010-2012 NVIDIA Corporation. All rights reserved.
+ *
+ * Author:
+ *	Colin Cross <ccross@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _MACH_TEGRA_PM_H_
+#define _MACH_TEGRA_PM_H_
+
+void save_cpu_arch_register(void);
+void restore_cpu_arch_register(void);
+
+void tegra_clear_cpu_in_lp2(int cpu);
+bool tegra_set_cpu_in_lp2(int cpu);
+
+#endif /* _MACH_TEGRA_PM_H_ */
diff --git a/arch/arm/mach-tegra/reset.h b/arch/arm/mach-tegra/reset.h
index de88bf8..234cd6b 100644
--- a/arch/arm/mach-tegra/reset.h
+++ b/arch/arm/mach-tegra/reset.h
@@ -29,6 +29,8 @@ 
 
 #ifndef __ASSEMBLY__
 
+#include <mach/irammap.h>
+
 extern unsigned long __tegra_cpu_reset_handler_data[TEGRA_RESET_DATA_SIZE];
 
 void __tegra_cpu_reset_handler_start(void);
@@ -36,6 +38,13 @@  void __tegra_cpu_reset_handler(void);
 void __tegra_cpu_reset_handler_end(void);
 void tegra_secondary_startup(void);
 
+#ifdef CONFIG_PM_SLEEP
+#define tegra_cpu_lp2_mask \
+	(IO_ADDRESS(TEGRA_IRAM_BASE + TEGRA_IRAM_RESET_HANDLER_OFFSET + \
+	((u32)&__tegra_cpu_reset_handler_data[TEGRA_RESET_MASK_LP2] - \
+	 (u32)__tegra_cpu_reset_handler_start)))
+#endif
+
 #define tegra_cpu_reset_handler_offset \
 		((u32)__tegra_cpu_reset_handler - \
 		 (u32)__tegra_cpu_reset_handler_start)
diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S
index 777d9ce..67b1cba 100644
--- a/arch/arm/mach-tegra/sleep-tegra30.S
+++ b/arch/arm/mach-tegra/sleep-tegra30.S
@@ -17,6 +17,8 @@ 
 #include <linux/linkage.h>
 
 #include <asm/assembler.h>
+#include <asm/asm-offsets.h>
+#include <asm/glue-cache.h>
 
 #include <mach/iomap.h>
 
@@ -82,6 +84,7 @@  delay_1:
 	ldr	r3, [r1]			@ read CSR
 	str	r3, [r1]			@ clear CSR
 	tst	r0, #TEGRA30_POWER_HOTPLUG_SHUTDOWN
+	moveq   r3, #FLOW_CTRL_WAIT_FOR_INTERRUPT	@ For LP2
 	movne	r3, #FLOW_CTRL_WAITEVENT		@ For hotplug
 	str	r3, [r2]
 	ldr	r0, [r2]
@@ -105,3 +108,26 @@  wfe_war:
 
 ENDPROC(tegra30_cpu_shutdown)
 #endif
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * tegra30_sleep_cpu_secondary_finish(unsigned long v2p)
+ *
+ * Enters LP2 on secondary CPU by exiting coherency and powergating the CPU.
+ */
+ENTRY(tegra30_sleep_cpu_secondary_finish)
+	mov	r7, lr
+
+	/* Flush and disable the L1 data cache */
+	bl	tegra_flush_l1_cache
+
+	/* Turn off SMP coherency */
+	exit_smp r4, r5
+
+	/* Powergate this CPU. */
+	mov	r0, #0                          @ power mode flags (!hotplug)
+	bl	tegra30_cpu_shutdown
+	mov	r0, #1                          @ never return here
+	mov	pc, r7
+ENDPROC(tegra30_sleep_cpu_secondary_finish)
+#endif
diff --git a/arch/arm/mach-tegra/sleep.S b/arch/arm/mach-tegra/sleep.S
index ea81554..7748fdb 100644
--- a/arch/arm/mach-tegra/sleep.S
+++ b/arch/arm/mach-tegra/sleep.S
@@ -25,9 +25,75 @@ 
 #include <linux/linkage.h>
 
 #include <asm/assembler.h>
+#include <asm/cp15.h>
 
 #include <mach/iomap.h>
 
 #include "flowctrl.h"
 #include "sleep.h"
 
+#ifdef CONFIG_PM_SLEEP
+/*
+ * tegra_flush_l1_cache
+ *
+ * clean & invalidate the L1 cache
+ *
+ * The flush_cache_all flushes all caches within level of coherence, this
+ * may not be desired if all we need is to flush L1 only. Therefore this
+ * function is implemented to flush the L1 cache only.
+ *
+ * Disable is needed before flush to prevent allocation during flush.
+ * When cache is disabled, we cannot push to stack.
+ *
+ * Corrupted registers: r0-r7, r9-r11
+ */
+
+ENTRY(tegra_flush_l1_cache)
+	stmfd	sp!, {r4-r5, r7, r9-r11, lr}
+	dmb					@ ensure ordering
+
+	/* Disable the data cache */
+	mrc	p15, 0, r2, c1, c0, 0
+	bic	r2, r2, #CR_C
+	dsb
+	mcr	p15, 0, r2, c1, c0, 0
+
+	/* Flush data cache */
+	mov	r10, #0
+#ifdef CONFIG_PREEMPT
+	save_and_disable_irqs_notrace r9	@ make cssr&csidr read atomic
+#endif
+	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
+	isb					@ isb to sych the new cssr&csidr
+	mrc	p15, 1, r1, c0, c0, 0		@ read the new csidr
+#ifdef CONFIG_PREEMPT
+	restore_irqs_notrace r9
+#endif
+	and	r2, r1, #7			@ extract the length of the cache lines
+	add	r2, r2, #4			@ add 4 (line length offset)
+	ldr	r4, =0x3ff
+	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
+	clz	r5, r4				@ find bit position of way size increment
+	ldr	r7, =0x7fff
+	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
+loop2:
+	mov	r9, r4				@ create working copy of max way size
+loop3:
+	orr	r11, r10, r9, lsl r5		@ factor way and cache number into r11
+	orr	r11, r11, r7, lsl r2		@ factor index number into r11
+	mcr	p15, 0, r11, c7, c14, 2		@ clean & invalidate by set/way
+	subs	r9, r9, #1			@ decrement the way
+	bge	loop3
+	subs	r7, r7, #1			@ decrement the index
+	bge	loop2
+finished:
+	mov	r10, #0				@ switch back to cache level 0
+	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
+	dsb
+	isb
+
+	ldmfd	sp!, {r4-r5, r7, r9-r11, lr}
+	mov	pc, lr
+ENDPROC(tegra_flush_l1_cache)
+
+#endif
diff --git a/arch/arm/mach-tegra/sleep.h b/arch/arm/mach-tegra/sleep.h
index c9dec37..220fbd1 100644
--- a/arch/arm/mach-tegra/sleep.h
+++ b/arch/arm/mach-tegra/sleep.h
@@ -82,5 +82,7 @@  static inline void tegra20_hotplug_init(void) {}
 static inline void tegra30_hotplug_init(void) {}
 #endif
 
+int tegra30_sleep_cpu_secondary_finish(unsigned long);
+
 #endif
 #endif