
[02/16] ARM: b.L: introduce the CPU/cluster power API

Message ID 1357777251-13541-3-git-send-email-nicolas.pitre@linaro.org (mailing list archive)
State New, archived

Commit Message

Nicolas Pitre Jan. 10, 2013, 12:20 a.m. UTC
This is the basic API used to handle the powering up/down of individual
CPUs in a big.LITTLE system.  The platform specific backend implementation
has the responsibility to also handle the cluster level power as well when
the first/last CPU in a cluster is brought up/down.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm/common/bL_entry.c      | 88 +++++++++++++++++++++++++++++++++++++++
 arch/arm/include/asm/bL_entry.h | 92 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

Comments

Will Deacon Jan. 10, 2013, 11:08 p.m. UTC | #1
On Thu, Jan 10, 2013 at 12:20:37AM +0000, Nicolas Pitre wrote:
> This is the basic API used to handle the powering up/down of individual
> CPUs in a big.LITTLE system.  The platform specific backend implementation
> has the responsibility to also handle the cluster level power as well when
> the first/last CPU in a cluster is brought up/down.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm/common/bL_entry.c      | 88 +++++++++++++++++++++++++++++++++++++++
>  arch/arm/include/asm/bL_entry.h | 92 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 180 insertions(+)
> 
> diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> index 80fff49417..41de0622de 100644
> --- a/arch/arm/common/bL_entry.c
> +++ b/arch/arm/common/bL_entry.c
> @@ -11,11 +11,13 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/init.h>
> +#include <linux/irqflags.h>
>  
>  #include <asm/bL_entry.h>
>  #include <asm/barrier.h>
>  #include <asm/proc-fns.h>
>  #include <asm/cacheflush.h>
> +#include <asm/idmap.h>
>  
>  extern volatile unsigned long bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
>  
> @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
>  	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
>  			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
>  }
> +
> +static const struct bL_platform_power_ops *platform_ops;
> +
> +int __init bL_platform_power_register(const struct bL_platform_power_ops *ops)
> +{
> +	if (platform_ops)
> +		return -EBUSY;
> +	platform_ops = ops;
> +	return 0;
> +}
> +
> +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> +{
> +	if (!platform_ops)
> +		return -EUNATCH;

Is this the right error code?

> +	might_sleep();
> +	return platform_ops->power_up(cpu, cluster);
> +}
> +
> +typedef void (*phys_reset_t)(unsigned long);

Maybe it's worth putting this typedef in a header file somewhere. It's
also used by the soft reboot code.

> +
> +void bL_cpu_power_down(void)
> +{
> +	phys_reset_t phys_reset;
> +
> +	BUG_ON(!platform_ops);

Seems a bit overkill, or are we unrecoverable by this point?

> +	BUG_ON(!irqs_disabled());
> +
> +	/*
> +	 * Do this before calling into the power_down method,
> +	 * as it might not always be safe to do afterwards.
> +	 */
> +	setup_mm_for_reboot();
> +
> +	platform_ops->power_down();
> +
> +	/*
> +	 * It is possible for a power_up request to happen concurrently
> +	 * with a power_down request for the same CPU. In this case the
> +	 * power_down method might not be able to actually enter a
> +	 * powered down state with the WFI instruction if the power_up
> +	 * method has removed the required reset condition.  The
> +	 * power_down method is then allowed to return. We must perform
> +	 * a re-entry in the kernel as if the power_up method just had
> +	 * deasserted reset on the CPU.
> +	 *
> +	 * To simplify race issues, the platform specific implementation
> +	 * must accommodate for the possibility of unordered calls to
> +	 * power_down and power_up with a usage count. Therefore, if a
> +	 * call to power_up is issued for a CPU that is not down, then
> +	 * the next call to power_down must not attempt a full shutdown
> +	 * but only do the minimum (normally disabling L1 cache and CPU
> +	 * coherency) and return just as if a concurrent power_up request
> +	 * had happened as described above.
> +	 */
> +
> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> +	phys_reset(virt_to_phys(bL_entry_point));
> +
> +	/* should never get here */
> +	BUG();
> +}
> +
> +void bL_cpu_suspend(u64 expected_residency)
> +{
> +	phys_reset_t phys_reset;
> +
> +	BUG_ON(!platform_ops);
> +	BUG_ON(!irqs_disabled());
> +
> +	/* Very similar to bL_cpu_power_down() */
> +	setup_mm_for_reboot();
> +	platform_ops->suspend(expected_residency);
> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> +	phys_reset(virt_to_phys(bL_entry_point));
> +	BUG();
> +}
> +
> +int bL_cpu_powered_up(void)
> +{
> +	if (!platform_ops)
> +		return -EUNATCH;
> +	if (platform_ops->powered_up)
> +		platform_ops->powered_up();
> +	return 0;
> +}
> diff --git a/arch/arm/include/asm/bL_entry.h b/arch/arm/include/asm/bL_entry.h
> index ff623333a1..942d7f9f19 100644
> --- a/arch/arm/include/asm/bL_entry.h
> +++ b/arch/arm/include/asm/bL_entry.h
> @@ -31,5 +31,97 @@ extern void bL_entry_point(void);
>   */
>  void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
>  
> +/*
> + * CPU/cluster power operations API for higher subsystems to use.
> + */
> +
> +/**
> + * bL_cpu_power_up - make given CPU in given cluster runnable
> + *
> + * @cpu: CPU number within given cluster
> + * @cluster: cluster number for the CPU
> + *
> + * The identified CPU is brought out of reset.  If the cluster was powered
> + * down then it is brought up as well, taking care not to let the other CPUs
> + * in the cluster run, and ensuring appropriate cluster setup.
> + *
> + * Caller must ensure the appropriate entry vector is initialized with
> + * bL_set_entry_vector() prior to calling this.
> + *
> + * This must be called in a sleepable context.  However, the implementation
> + * is strongly encouraged to return early and let the operation happen
> + * asynchronously, especially when significant delays are expected.
> + *
> + * If the operation cannot be performed then an error code is returned.
> + */
> +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster);
> +
> +/**
> + * bL_cpu_power_down - power the calling CPU down
> + *
> + * The calling CPU is powered down.
> + *
> + * If this CPU is found to be the "last man standing" in the cluster
> + * then the cluster is prepared for power-down too.
> + *
> + * This must be called with interrupts disabled.
> + *
> + * This does not return.  Re-entry in the kernel is expected via
> + * bL_entry_point.
> + */
> +void bL_cpu_power_down(void);
> +
> +/**
> + * bL_cpu_suspend - bring the calling CPU in a suspended state
> + *
> + * @expected_residency: duration in microseconds the CPU is expected
> + *			to remain suspended, or 0 if unknown/infinity.
> + *
> + * The calling CPU is suspended.  The expected residency argument is used
> + * as a hint by the platform specific backend to implement the appropriate
> + * sleep state level according to the knowledge it has on wake-up latency
> + * for the given hardware.
> + *
> + * If this CPU is found to be the "last man standing" in the cluster
> + * then the cluster may be prepared for power-down too, if the expected
> + * residency makes it worthwhile.
> + *
> + * This must be called with interrupts disabled.
> + *
> + * This does not return.  Re-entry in the kernel is expected via
> + * bL_entry_point.
> + */
> +void bL_cpu_suspend(u64 expected_residency);
> +
> +/**
> + * bL_cpu_powered_up - housekeeping work after a CPU has been powered up
> + *
> + * This lets the platform specific backend code perform needed housekeeping
> + * work.  This must be called by the newly activated CPU as soon as it is
> + * fully operational in kernel space, before it enables interrupts.
> + *
> + * If the operation cannot be performed then an error code is returned.
> + */
> +int bL_cpu_powered_up(void);
> +
> +/*
> + * Platform specific methods used in the implementation of the above API.
> + */
> +struct bL_platform_power_ops {
> +	int (*power_up)(unsigned int cpu, unsigned int cluster);
> +	void (*power_down)(void);
> +	void (*suspend)(u64);
> +	void (*powered_up)(void);
> +};

It would be good if these prototypes matched the PSCI code, then platforms
could just glue them together directly.

Will
Nicolas Pitre Jan. 11, 2013, 2:30 a.m. UTC | #2
On Thu, 10 Jan 2013, Will Deacon wrote:

> On Thu, Jan 10, 2013 at 12:20:37AM +0000, Nicolas Pitre wrote:
> > This is the basic API used to handle the powering up/down of individual
> > CPUs in a big.LITTLE system.  The platform specific backend implementation
> > has the responsibility to also handle the cluster level power as well when
> > the first/last CPU in a cluster is brought up/down.
> > 
> > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > ---
> >  arch/arm/common/bL_entry.c      | 88 +++++++++++++++++++++++++++++++++++++++
> >  arch/arm/include/asm/bL_entry.h | 92 +++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 180 insertions(+)
> > 
> > diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> > index 80fff49417..41de0622de 100644
> > --- a/arch/arm/common/bL_entry.c
> > +++ b/arch/arm/common/bL_entry.c
> > @@ -11,11 +11,13 @@
> >  
> >  #include <linux/kernel.h>
> >  #include <linux/init.h>
> > +#include <linux/irqflags.h>
> >  
> >  #include <asm/bL_entry.h>
> >  #include <asm/barrier.h>
> >  #include <asm/proc-fns.h>
> >  #include <asm/cacheflush.h>
> > +#include <asm/idmap.h>
> >  
> >  extern volatile unsigned long bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
> >  
> > @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
> >  	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
> >  			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
> >  }
> > +
> > +static const struct bL_platform_power_ops *platform_ops;
> > +
> > +int __init bL_platform_power_register(const struct bL_platform_power_ops *ops)
> > +{
> > +	if (platform_ops)
> > +		return -EBUSY;
> > +	platform_ops = ops;
> > +	return 0;
> > +}
> > +
> > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> > +{
> > +	if (!platform_ops)
> > +		return -EUNATCH;
> 
> Is this the right error code?

It is as good as any other, with some meaning to be distinguished from 
the traditional ones like -ENOMEM or -EINVAL that the platform backends 
could return.

Would you prefer another one?

> > +	might_sleep();
> > +	return platform_ops->power_up(cpu, cluster);
> > +}
> > +
> > +typedef void (*phys_reset_t)(unsigned long);
> 
> Maybe it's worth putting this typedef in a header file somewhere. It's
> also used by the soft reboot code.

Agreed.  Maybe separately from this series though.

> > +
> > +void bL_cpu_power_down(void)
> > +{
> > +	phys_reset_t phys_reset;
> > +
> > +	BUG_ON(!platform_ops);
> 
> Seems a bit overkill, or are we unrecoverable by this point?

We are.  The upper layer expects this CPU to be dead and there is no 
easy recovery possible.  This is a "should never happen" condition, and 
the kernel is badly configured otherwise.

> 
> > +	BUG_ON(!irqs_disabled());
> > +
> > +	/*
> > +	 * Do this before calling into the power_down method,
> > +	 * as it might not always be safe to do afterwards.
> > +	 */
> > +	setup_mm_for_reboot();
> > +
> > +	platform_ops->power_down();
> > +
> > +	/*
> > +	 * It is possible for a power_up request to happen concurrently
> > +	 * with a power_down request for the same CPU. In this case the
> > +	 * power_down method might not be able to actually enter a
> > +	 * powered down state with the WFI instruction if the power_up
> > +	 * method has removed the required reset condition.  The
> > +	 * power_down method is then allowed to return. We must perform
> > +	 * a re-entry in the kernel as if the power_up method just had
> > +	 * deasserted reset on the CPU.
> > +	 *
> > +	 * To simplify race issues, the platform specific implementation
> > +	 * must accommodate for the possibility of unordered calls to
> > +	 * power_down and power_up with a usage count. Therefore, if a
> > +	 * call to power_up is issued for a CPU that is not down, then
> > +	 * the next call to power_down must not attempt a full shutdown
> > +	 * but only do the minimum (normally disabling L1 cache and CPU
> > +	 * coherency) and return just as if a concurrent power_up request
> > +	 * had happened as described above.
> > +	 */
> > +
> > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > +	phys_reset(virt_to_phys(bL_entry_point));
> > +
> > +	/* should never get here */
> > +	BUG();
> > +}
> > +
> > +void bL_cpu_suspend(u64 expected_residency)
> > +{
> > +	phys_reset_t phys_reset;
> > +
> > +	BUG_ON(!platform_ops);
> > +	BUG_ON(!irqs_disabled());
> > +
> > +	/* Very similar to bL_cpu_power_down() */
> > +	setup_mm_for_reboot();
> > +	platform_ops->suspend(expected_residency);
> > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > +	phys_reset(virt_to_phys(bL_entry_point));
> > +	BUG();
> > +}
> > +
> > +int bL_cpu_powered_up(void)
> > +{
> > +	if (!platform_ops)
> > +		return -EUNATCH;
> > +	if (platform_ops->powered_up)
> > +		platform_ops->powered_up();
> > +	return 0;
> > +}
> > diff --git a/arch/arm/include/asm/bL_entry.h b/arch/arm/include/asm/bL_entry.h
> > index ff623333a1..942d7f9f19 100644
> > --- a/arch/arm/include/asm/bL_entry.h
> > +++ b/arch/arm/include/asm/bL_entry.h
> > @@ -31,5 +31,97 @@ extern void bL_entry_point(void);
> >   */
> >  void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
> >  
> > +/*
> > + * CPU/cluster power operations API for higher subsystems to use.
> > + */
> > +
> > +/**
> > + * bL_cpu_power_up - make given CPU in given cluster runable
> > + *
> > + * @cpu: CPU number within given cluster
> > + * @cluster: cluster number for the CPU
> > + *
> > + * The identified CPU is brought out of reset.  If the cluster was powered
> > + * down then it is brought up as well, taking care not to let the other CPUs
> > + * in the cluster run, and ensuring appropriate cluster setup.
> > + *
> > + * Caller must ensure the appropriate entry vector is initialized with
> > + * bL_set_entry_vector() prior to calling this.
> > + *
> > + * This must be called in a sleepable context.  However, the implementation
> > + * is strongly encouraged to return early and let the operation happen
> > + * asynchronously, especially when significant delays are expected.
> > + *
> > + * If the operation cannot be performed then an error code is returned.
> > + */
> > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster);
> > +
> > +/**
> > + * bL_cpu_power_down - power the calling CPU down
> > + *
> > + * The calling CPU is powered down.
> > + *
> > + * If this CPU is found to be the "last man standing" in the cluster
> > + * then the cluster is prepared for power-down too.
> > + *
> > + * This must be called with interrupts disabled.
> > + *
> > + * This does not return.  Re-entry in the kernel is expected via
> > + * bL_entry_point.
> > + */
> > +void bL_cpu_power_down(void);
> > +
> > +/**
> > + * bL_cpu_suspend - bring the calling CPU in a suspended state
> > + *
> > + * @expected_residency: duration in microseconds the CPU is expected
> > + *			to remain suspended, or 0 if unknown/infinity.
> > + *
> > + * The calling CPU is suspended.  The expected residency argument is used
> > + * as a hint by the platform specific backend to implement the appropriate
> > + * sleep state level according to the knowledge it has on wake-up latency
> > + * for the given hardware.
> > + *
> > + * If this CPU is found to be the "last man standing" in the cluster
> > + * then the cluster may be prepared for power-down too, if the expected
> > + * residency makes it worthwhile.
> > + *
> > + * This must be called with interrupts disabled.
> > + *
> > + * This does not return.  Re-entry in the kernel is expected via
> > + * bL_entry_point.
> > + */
> > +void bL_cpu_suspend(u64 expected_residency);
> > +
> > +/**
> > + * bL_cpu_powered_up - housekeeping workafter a CPU has been powered up
> > + *
> > + * This lets the platform specific backend code perform needed housekeeping
> > + * work.  This must be called by the newly activated CPU as soon as it is
> > + * fully operational in kernel space, before it enables interrupts.
> > + *
> > + * If the operation cannot be performed then an error code is returned.
> > + */
> > +int bL_cpu_powered_up(void);
> > +
> > +/*
> > + * Platform specific methods used in the implementation of the above API.
> > + */
> > +struct bL_platform_power_ops {
> > +	int (*power_up)(unsigned int cpu, unsigned int cluster);
> > +	void (*power_down)(void);
> > +	void (*suspend)(u64);
> > +	void (*powered_up)(void);
> > +};
> 
> It would be good if these prototypes matched the PSCI code, then platforms
> could just glue them together directly.

No.

I discussed this at length with Charles (the PSCI spec author) already. 
Even in the PSCI case, a minimum PSCI backend is necessary to do some 
impedance matching between what the PSCI calls expect as arguments and 
what this kernel specific API needs to express.  For example, the UP 
method needs to always be provided with the address for bL_entry, 
irrespective of where the user of this kernel API wants execution to be 
resumed.  There might be some cases where the backend might decide to 
override the desired power saving state because of other kernel induced 
constraints (ongoing DMA operation for example) that PSCI doesn't (and 
should not) know about.  And the best place to arbitrate between those 
platform specific constraints is in this platform specific shim or 
backend.

Because of that, and because one feature of Linux is to not have stable 
APIs in the kernel so to be free to adapt them to future needs, I think 
it is best not to even try matching the PSCI interface here.
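
To illustrate the kind of impedance matching I mean, here is a purely
hypothetical sketch of a PSCI based power_up backend.  The psci_cpu_on()
and plat_cpu_to_psci_id() helpers are assumptions made up for the sake of
the example, not code from this series:

#include <asm/bL_entry.h>
#include <asm/memory.h>

/* Assumed helpers, not part of this series: */
extern unsigned long plat_cpu_to_psci_id(unsigned int cpu, unsigned int cluster);
extern int psci_cpu_on(unsigned long cpuid, unsigned long entry_point);

static int example_psci_power_up(unsigned int cpu, unsigned int cluster)
{
	/*
	 * Whatever ID scheme the firmware uses, the entry point passed
	 * down is always bL_entry_point: the vector installed with
	 * bL_set_entry_vector() decides where execution actually resumes
	 * once the CPU re-enters the kernel.
	 */
	return psci_cpu_on(plat_cpu_to_psci_id(cpu, cluster),
			   virt_to_phys(bL_entry_point));
}

static const struct bL_platform_power_ops example_psci_power_ops = {
	.power_up	= example_psci_power_up,
	/* .power_down and .suspend would similarly wrap CPU_OFF/CPU_SUSPEND */
};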


Nicolas
Will Deacon Jan. 11, 2013, 10:58 a.m. UTC | #3
On Fri, Jan 11, 2013 at 02:30:06AM +0000, Nicolas Pitre wrote:
> On Thu, 10 Jan 2013, Will Deacon wrote:
> > On Thu, Jan 10, 2013 at 12:20:37AM +0000, Nicolas Pitre wrote:
> > > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> > > +{
> > > +	if (!platform_ops)
> > > +		return -EUNATCH;
> > 
> > Is this the right error code?
> 
> It is as good as any other, with some meaning to be distinguished from 
> the traditional ones like -ENOMEM or -EINVAL that the platform backends 
> could return.
> 
> Would you prefer another one?

-ENODEV? Nothing to lose sleep over though.

> > > +	might_sleep();
> > > +	return platform_ops->power_up(cpu, cluster);
> > > +}
> > > +
> > > +typedef void (*phys_reset_t)(unsigned long);
> > 
> > Maybe it's worth putting this typedef in a header file somewhere. It's
> > also used by the soft reboot code.
> 
> Agreed.  Maybe separately from this series though.
> 
> > > +
> > > +void bL_cpu_power_down(void)
> > > +{
> > > +	phys_reset_t phys_reset;
> > > +
> > > +	BUG_ON(!platform_ops);
> > 
> > Seems a bit overkill, or are we unrecoverable by this point?
> 
> We are.  The upper layer expects this CPU to be dead and there is no 
> easy recovery possible.  This is a "should never happen" condition, and 
> the kernel is badly configured otherwise.

Okey doke, that's what I feared. The BUG_ON makes sense then.

> > > +/*
> > > + * Platform specific methods used in the implementation of the above API.
> > > + */
> > > +struct bL_platform_power_ops {
> > > +	int (*power_up)(unsigned int cpu, unsigned int cluster);
> > > +	void (*power_down)(void);
> > > +	void (*suspend)(u64);
> > > +	void (*powered_up)(void);
> > > +};
> > 
> > It would be good if these prototypes matched the PSCI code, then platforms
> > could just glue them together directly.
> 
> No.
> 
> I discussed this at length with Charles (the PSCI spec author) already. 
> Even in the PSCI case, a minimum PSCI backend is necessary to do some 
> impedance matching between what the PSCI calls expect as arguments and 
> what this kernel specific API needs to express.  For example, the UP 
> method needs to always be provided with the address for bL_entry, 
> irrespective of where the user of this kernel API wants execution to be 
> resumed.  There might be some cases where the backend might decide to 
> override the desired power saving state because of other kernel induced 
> constraints (ongoing DMA operation for example) that PSCI doesn't (and 
> should not) know about.  And the best place to arbitrate between those 
> platform specific constraints is in this platform specific shim or 
> backend.

Yes, you're right. I was thinking we could convert cpu/cluster into cpuid
automatically, but actually it's not guaranteed that the PSCI firmware will
follow the MPIDR format so we even need platform-specific marshalling for
that.

Thanks for the reply,

Will
Dave Martin Jan. 11, 2013, 11:29 a.m. UTC | #4
On Thu, Jan 10, 2013 at 09:30:06PM -0500, Nicolas Pitre wrote:
> On Thu, 10 Jan 2013, Will Deacon wrote:
> 
> > On Thu, Jan 10, 2013 at 12:20:37AM +0000, Nicolas Pitre wrote:
> > > This is the basic API used to handle the powering up/down of individual
> > > CPUs in a big.LITTLE system.  The platform specific backend implementation
> > > has the responsibility to also handle the cluster level power as well when
> > > the first/last CPU in a cluster is brought up/down.
> > > 
> > > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > > ---
> > >  arch/arm/common/bL_entry.c      | 88 +++++++++++++++++++++++++++++++++++++++
> > >  arch/arm/include/asm/bL_entry.h | 92 +++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 180 insertions(+)
> > > 
> > > diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> > > index 80fff49417..41de0622de 100644
> > > --- a/arch/arm/common/bL_entry.c
> > > +++ b/arch/arm/common/bL_entry.c
> > > @@ -11,11 +11,13 @@
> > >  
> > >  #include <linux/kernel.h>
> > >  #include <linux/init.h>
> > > +#include <linux/irqflags.h>
> > >  
> > >  #include <asm/bL_entry.h>
> > >  #include <asm/barrier.h>
> > >  #include <asm/proc-fns.h>
> > >  #include <asm/cacheflush.h>
> > > +#include <asm/idmap.h>
> > >  
> > >  extern volatile unsigned long bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
> > >  
> > > @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
> > >  	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
> > >  			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
> > >  }
> > > +
> > > +static const struct bL_platform_power_ops *platform_ops;
> > > +
> > > +int __init bL_platform_power_register(const struct bL_platform_power_ops *ops)
> > > +{
> > > +	if (platform_ops)
> > > +		return -EBUSY;
> > > +	platform_ops = ops;
> > > +	return 0;
> > > +}
> > > +
> > > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> > > +{
> > > +	if (!platform_ops)
> > > +		return -EUNATCH;
> > 
> > Is this the right error code?
> 
> It is as good as any other, with some meaning to be distinguished from 
> the traditional ones like -ENOMEM or -EINVAL that the platform backends 
> could return.
> 
> Would you prefer another one?
> 
> > > +	might_sleep();
> > > +	return platform_ops->power_up(cpu, cluster);
> > > +}
> > > +
> > > +typedef void (*phys_reset_t)(unsigned long);
> > 
> > Maybe it's worth putting this typedef in a header file somewhere. It's
> > also used by the soft reboot code.
> 
> Agreed.  Maybe separately from this series though.
> 
> > > +
> > > +void bL_cpu_power_down(void)
> > > +{
> > > +	phys_reset_t phys_reset;
> > > +
> > > +	BUG_ON(!platform_ops);
> > 
> > Seems a bit overkill, or are we unrecoverable by this point?
> 
> We are.  The upper layer expects this CPU to be dead and there is no 
> easy recovery possible.  This is a "should never happen" condition, and 
> the kernel is badly configured otherwise.

bL_cpu_power_down() is unconditional and does not fail.  Calling this
function means that:

 a) a subsequent call to bL_cpu_power_up() on this CPU will cause it
    to jump to bL_entry_point, in something resembling reset state;

 b) for all, part (or, rarely, none) of the intervening period, the
    CPU may really be turned off.

Without this BUG_ON() we would need to implement a dummy mechanism
to send the CPU to bL_entry_point at the right time.  If this happens
instantaneously (without waiting for bL_cpu_power_up()), then this
will likely lead to a spin of some sort unless it only happens
occasionally.  Also, the whole purpose of this function is to power off
the CPU, permitting power savings, so if no means has been registered to
do that, a call to bL_cpu_power_down() is certainly buggy misuse.
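
To make the expected pairing concrete, a hotplug-style user of the API
might look roughly like the sketch below.  example_secondary_entry is an
assumed kernel entry point for the woken CPU, not something defined by
this series:

#include <linux/irqflags.h>
#include <asm/bL_entry.h>

/* Assumed entry point for the woken CPU: */
extern void example_secondary_entry(void);

static int example_cpu_up(unsigned int cpu, unsigned int cluster)
{
	/* (a): the woken CPU re-enters via bL_entry_point, then this vector */
	bL_set_entry_vector(cpu, cluster, example_secondary_entry);
	return bL_cpu_power_up(cpu, cluster);
}

static void example_cpu_die(void)
{
	/* (b): from here on the calling CPU may really be turned off */
	local_irq_disable();
	bL_cpu_power_down();	/* does not return */
}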

> 
> > 
> > > +	BUG_ON(!irqs_disabled());
> > > +
> > > +	/*
> > > +	 * Do this before calling into the power_down method,
> > > +	 * as it might not always be safe to do afterwards.
> > > +	 */
> > > +	setup_mm_for_reboot();
> > > +
> > > +	platform_ops->power_down();
> > > +
> > > +	/*
> > > +	 * It is possible for a power_up request to happen concurrently
> > > +	 * with a power_down request for the same CPU. In this case the
> > > +	 * power_down method might not be able to actually enter a
> > > +	 * powered down state with the WFI instruction if the power_up
> > > +	 * method has removed the required reset condition.  The
> > > +	 * power_down method is then allowed to return. We must perform
> > > +	 * a re-entry in the kernel as if the power_up method just had
> > > +	 * deasserted reset on the CPU.
> > > +	 *
> > > +	 * To simplify race issues, the platform specific implementation
> > > +	 * must accommodate for the possibility of unordered calls to
> > > +	 * power_down and power_up with a usage count. Therefore, if a
> > > +	 * call to power_up is issued for a CPU that is not down, then
> > > +	 * the next call to power_down must not attempt a full shutdown
> > > +	 * but only do the minimum (normally disabling L1 cache and CPU
> > > +	 * coherency) and return just as if a concurrent power_up request
> > > +	 * had happened as described above.
> > > +	 */
> > > +
> > > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > > +	phys_reset(virt_to_phys(bL_entry_point));
> > > +
> > > +	/* should never get here */
> > > +	BUG();
> > > +}
> > > +
> > > +void bL_cpu_suspend(u64 expected_residency)
> > > +{
> > > +	phys_reset_t phys_reset;
> > > +
> > > +	BUG_ON(!platform_ops);
> > > +	BUG_ON(!irqs_disabled());
> > > +
> > > +	/* Very similar to bL_cpu_power_down() */
> > > +	setup_mm_for_reboot();
> > > +	platform_ops->suspend(expected_residency);
> > > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > > +	phys_reset(virt_to_phys(bL_entry_point));
> > > +	BUG();
> > > +}
> > > +
> > > +int bL_cpu_powered_up(void)
> > > +{
> > > +	if (!platform_ops)
> > > +		return -EUNATCH;
> > > +	if (platform_ops->powered_up)
> > > +		platform_ops->powered_up();
> > > +	return 0;
> > > +}
> > > diff --git a/arch/arm/include/asm/bL_entry.h b/arch/arm/include/asm/bL_entry.h
> > > index ff623333a1..942d7f9f19 100644
> > > --- a/arch/arm/include/asm/bL_entry.h
> > > +++ b/arch/arm/include/asm/bL_entry.h
> > > @@ -31,5 +31,97 @@ extern void bL_entry_point(void);
> > >   */
> > >  void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
> > >  
> > > +/*
> > > + * CPU/cluster power operations API for higher subsystems to use.
> > > + */
> > > +
> > > +/**
> > > + * bL_cpu_power_up - make given CPU in given cluster runable
> > > + *
> > > + * @cpu: CPU number within given cluster
> > > + * @cluster: cluster number for the CPU
> > > + *
> > > + * The identified CPU is brought out of reset.  If the cluster was powered
> > > + * down then it is brought up as well, taking care not to let the other CPUs
> > > + * in the cluster run, and ensuring appropriate cluster setup.
> > > + *
> > > + * Caller must ensure the appropriate entry vector is initialized with
> > > + * bL_set_entry_vector() prior to calling this.
> > > + *
> > > + * This must be called in a sleepable context.  However, the implementation
> > > + * is strongly encouraged to return early and let the operation happen
> > > + * asynchronously, especially when significant delays are expected.
> > > + *
> > > + * If the operation cannot be performed then an error code is returned.
> > > + */
> > > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster);
> > > +
> > > +/**
> > > + * bL_cpu_power_down - power the calling CPU down
> > > + *
> > > + * The calling CPU is powered down.
> > > + *
> > > + * If this CPU is found to be the "last man standing" in the cluster
> > > + * then the cluster is prepared for power-down too.
> > > + *
> > > + * This must be called with interrupts disabled.
> > > + *
> > > + * This does not return.  Re-entry in the kernel is expected via
> > > + * bL_entry_point.
> > > + */
> > > +void bL_cpu_power_down(void);
> > > +
> > > +/**
> > > + * bL_cpu_suspend - bring the calling CPU in a suspended state
> > > + *
> > > + * @expected_residency: duration in microseconds the CPU is expected
> > > + *			to remain suspended, or 0 if unknown/infinity.
> > > + *
> > > + * The calling CPU is suspended.  The expected residency argument is used
> > > + * as a hint by the platform specific backend to implement the appropriate
> > > + * sleep state level according to the knowledge it has on wake-up latency
> > > + * for the given hardware.
> > > + *
> > > + * If this CPU is found to be the "last man standing" in the cluster
> > > + * then the cluster may be prepared for power-down too, if the expected
> > > + * residency makes it worthwhile.
> > > + *
> > > + * This must be called with interrupts disabled.
> > > + *
> > > + * This does not return.  Re-entry in the kernel is expected via
> > > + * bL_entry_point.
> > > + */
> > > +void bL_cpu_suspend(u64 expected_residency);
> > > +
> > > +/**
> > > + * bL_cpu_powered_up - housekeeping workafter a CPU has been powered up
> > > + *
> > > + * This lets the platform specific backend code perform needed housekeeping
> > > + * work.  This must be called by the newly activated CPU as soon as it is
> > > + * fully operational in kernel space, before it enables interrupts.
> > > + *
> > > + * If the operation cannot be performed then an error code is returned.
> > > + */
> > > +int bL_cpu_powered_up(void);
> > > +
> > > +/*
> > > + * Platform specific methods used in the implementation of the above API.
> > > + */
> > > +struct bL_platform_power_ops {
> > > +	int (*power_up)(unsigned int cpu, unsigned int cluster);
> > > +	void (*power_down)(void);
> > > +	void (*suspend)(u64);
> > > +	void (*powered_up)(void);
> > > +};
> > 
> > It would be good if these prototypes matched the PSCI code, then platforms
> > could just glue them together directly.
> 
> No.
> 
> I discussed this at length with Charles (the PSCI spec author) already. 
> Even in the PSCI case, a minimum PSCI backend is necessary to do some 
> impedance matching between what the PSCI calls expect as arguments and 
> what this kernel specific API needs to express.  For example, the UP 
> method needs to always be provided with the address for bL_entry, 
> irrespective of where the user of this kernel API wants execution to be 
> resumed.  There might be some cases where the backend might decide to 
> override the desired power saving state because of other kernel induced 
> constraints (ongoing DMA operation for example) that PSCI doesn't (and 
> should not) know about.  And the best place to arbitrate between those 
> platform specific constraints is in this platform specific shim or 
> backend.
> 
> Because of that, and because one feature of Linux is to not have stable 
> APIs in the kernel so to be free to adapt them to future needs, I think 
> it is best not to even try matching the PSCI interface here.

The kernel may need to do stuff in these functions, even if the
underlying backend is PSCI, so they wouldn't just be a pass-through ...
or does the mach-virt experience convince us that there would be nothing
to do?    I feel unsure about that, but I've not looked at the mach-virt
code yet.

mach-virt lacks most of the hardware nasties which we can't ignore at
the host kernel / firmware interface.

Cheers
---Dave
Santosh Shilimkar Jan. 11, 2013, 5:26 p.m. UTC | #5
On Thursday 10 January 2013 05:50 AM, Nicolas Pitre wrote:
> This is the basic API used to handle the powering up/down of individual
> CPUs in a big.LITTLE system.  The platform specific backend implementation
> has the responsibility to also handle the cluster level power as well when
> the first/last CPU in a cluster is brought up/down.
>
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>   arch/arm/common/bL_entry.c      | 88 +++++++++++++++++++++++++++++++++++++++
>   arch/arm/include/asm/bL_entry.h | 92 +++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 180 insertions(+)
>
> diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> index 80fff49417..41de0622de 100644
> --- a/arch/arm/common/bL_entry.c
> +++ b/arch/arm/common/bL_entry.c
> @@ -11,11 +11,13 @@
>
>   #include <linux/kernel.h>
>   #include <linux/init.h>
> +#include <linux/irqflags.h>
>
>   #include <asm/bL_entry.h>
>   #include <asm/barrier.h>
>   #include <asm/proc-fns.h>
>   #include <asm/cacheflush.h>
> +#include <asm/idmap.h>
>
>   extern volatile unsigned long bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
>
> @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
>   	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
>   			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
>   }
> +
> +static const struct bL_platform_power_ops *platform_ops;
> +
> +int __init bL_platform_power_register(const struct bL_platform_power_ops *ops)
> +{
> +	if (platform_ops)
> +		return -EBUSY;
> +	platform_ops = ops;
> +	return 0;
> +}
> +
> +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> +{
> +	if (!platform_ops)
> +		return -EUNATCH;
> +	might_sleep();
> +	return platform_ops->power_up(cpu, cluster);
> +}
> +
> +typedef void (*phys_reset_t)(unsigned long);
> +
> +void bL_cpu_power_down(void)
> +{
> +	phys_reset_t phys_reset;
> +
> +	BUG_ON(!platform_ops);
> +	BUG_ON(!irqs_disabled());
> +
> +	/*
> +	 * Do this before calling into the power_down method,
> +	 * as it might not always be safe to do afterwards.
> +	 */
> +	setup_mm_for_reboot();
> +
> +	platform_ops->power_down();
> +
> +	/*
> +	 * It is possible for a power_up request to happen concurrently
> +	 * with a power_down request for the same CPU. In this case the
> +	 * power_down method might not be able to actually enter a
> +	 * powered down state with the WFI instruction if the power_up
> +	 * method has removed the required reset condition.  The
> +	 * power_down method is then allowed to return. We must perform
> +	 * a re-entry in the kernel as if the power_up method just had
> +	 * deasserted reset on the CPU.
> +	 *
> +	 * To simplify race issues, the platform specific implementation
> +	 * must accommodate for the possibility of unordered calls to
> +	 * power_down and power_up with a usage count. Therefore, if a
> +	 * call to power_up is issued for a CPU that is not down, then
> +	 * the next call to power_down must not attempt a full shutdown
> +	 * but only do the minimum (normally disabling L1 cache and CPU
> +	 * coherency) and return just as if a concurrent power_up request
> +	 * had happened as described above.
> +	 */
> +
> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> +	phys_reset(virt_to_phys(bL_entry_point));
> +
> +	/* should never get here */
> +	BUG();
> +}
> +
> +void bL_cpu_suspend(u64 expected_residency)
> +{
> +	phys_reset_t phys_reset;
> +
> +	BUG_ON(!platform_ops);
> +	BUG_ON(!irqs_disabled());
> +
> +	/* Very similar to bL_cpu_power_down() */
> +	setup_mm_for_reboot();
> +	platform_ops->suspend(expected_residency);
> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> +	phys_reset(virt_to_phys(bL_entry_point));
> +	BUG();
>
I might be missing the rationale behind not having a recovery path for
CPUs entering suspend if they actually come here because of some event.
This is quite possible in many scenarios, and hence letting the CPU
come out of suspend should be possible. Maybe the switcher code doesn't
have such a requirement, but it seemed a bit off to me.

Regards
Santosh
Nicolas Pitre Jan. 11, 2013, 6:33 p.m. UTC | #6
On Fri, 11 Jan 2013, Santosh Shilimkar wrote:

> On Thursday 10 January 2013 05:50 AM, Nicolas Pitre wrote:
> > This is the basic API used to handle the powering up/down of individual
> > CPUs in a big.LITTLE system.  The platform specific backend implementation
> > has the responsibility to also handle the cluster level power as well when
> > the first/last CPU in a cluster is brought up/down.
> > 
> > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > ---
> >   arch/arm/common/bL_entry.c      | 88
> > +++++++++++++++++++++++++++++++++++++++
> >   arch/arm/include/asm/bL_entry.h | 92
> > +++++++++++++++++++++++++++++++++++++++++
> >   2 files changed, 180 insertions(+)
> > 
> > diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> > index 80fff49417..41de0622de 100644
> > --- a/arch/arm/common/bL_entry.c
> > +++ b/arch/arm/common/bL_entry.c
> > @@ -11,11 +11,13 @@
> > 
> >   #include <linux/kernel.h>
> >   #include <linux/init.h>
> > +#include <linux/irqflags.h>
> > 
> >   #include <asm/bL_entry.h>
> >   #include <asm/barrier.h>
> >   #include <asm/proc-fns.h>
> >   #include <asm/cacheflush.h>
> > +#include <asm/idmap.h>
> > 
> >   extern volatile unsigned long
> > bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
> > 
> > @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster,
> > void *ptr)
> >   	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
> >   			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
> >   }
> > +
> > +static const struct bL_platform_power_ops *platform_ops;
> > +
> > +int __init bL_platform_power_register(const struct bL_platform_power_ops
> > *ops)
> > +{
> > +	if (platform_ops)
> > +		return -EBUSY;
> > +	platform_ops = ops;
> > +	return 0;
> > +}
> > +
> > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> > +{
> > +	if (!platform_ops)
> > +		return -EUNATCH;
> > +	might_sleep();
> > +	return platform_ops->power_up(cpu, cluster);
> > +}
> > +
> > +typedef void (*phys_reset_t)(unsigned long);
> > +
> > +void bL_cpu_power_down(void)
> > +{
> > +	phys_reset_t phys_reset;
> > +
> > +	BUG_ON(!platform_ops);
> > +	BUG_ON(!irqs_disabled());
> > +
> > +	/*
> > +	 * Do this before calling into the power_down method,
> > +	 * as it might not always be safe to do afterwards.
> > +	 */
> > +	setup_mm_for_reboot();
> > +
> > +	platform_ops->power_down();
> > +
> > +	/*
> > +	 * It is possible for a power_up request to happen concurrently
> > +	 * with a power_down request for the same CPU. In this case the
> > +	 * power_down method might not be able to actually enter a
> > +	 * powered down state with the WFI instruction if the power_up
> > +	 * method has removed the required reset condition.  The
> > +	 * power_down method is then allowed to return. We must perform
> > +	 * a re-entry in the kernel as if the power_up method just had
> > +	 * deasserted reset on the CPU.
> > +	 *
> > +	 * To simplify race issues, the platform specific implementation
> > +	 * must accommodate for the possibility of unordered calls to
> > +	 * power_down and power_up with a usage count. Therefore, if a
> > +	 * call to power_up is issued for a CPU that is not down, then
> > +	 * the next call to power_down must not attempt a full shutdown
> > +	 * but only do the minimum (normally disabling L1 cache and CPU
> > +	 * coherency) and return just as if a concurrent power_up request
> > +	 * had happened as described above.
> > +	 */
> > +
> > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > +	phys_reset(virt_to_phys(bL_entry_point));
> > +
> > +	/* should never get here */
> > +	BUG();
> > +}
> > +
> > +void bL_cpu_suspend(u64 expected_residency)
> > +{
> > +	phys_reset_t phys_reset;
> > +
> > +	BUG_ON(!platform_ops);
> > +	BUG_ON(!irqs_disabled());
> > +
> > +	/* Very similar to bL_cpu_power_down() */
> > +	setup_mm_for_reboot();
> > +	platform_ops->suspend(expected_residency);
> > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > +	phys_reset(virt_to_phys(bL_entry_point));
> > +	BUG();
> > 
> I might be missing all the rationales behind not having a recovery for
> CPUs entering suspend if they actualy come here because of some events.
> This is pretty much possible in many scenario's and hence letting CPU
> cpu come out of suspend should be possible. May be switcher code don't
> have such requirement but it appeared bit off to me.

There are two things to consider here:

1) The CPU is suspended.  CPU state is lost. Next interrupt to wake up
   the CPU will make it restart from the reset vector and re-entry in 
   the kernel will happen via bL_entry_point to deal with the various 
   cluster issues, to eventually resume kernel code via cpu_resume.  
   Obviously, the machine specific backend code would have set the
   bL_entry_point address in its machine specific reset vector in
   advance.

2) An interrupt comes along before the CPU is effectively suspended, say 
   right before the backend code executes a WFI to shut the CPU down.  
   The CPU and possibly cluster state was already set for being powered 
   off.  We cannot simply return at this point as caches are off, the 
   CPU is not coherent with the rest of the system anymore, etc.  So if 
   the platform specific backend ever returns, say because the final WFI 
   exited, then we have to go through the same arbitration process to 
   restore the CPU and cluster state as if that was a hard reset.  Hence 
   the cpu_reset call to loop back into bL_entry_point.

In either case, we simply cannot ever return from bL_cpu_suspend() 
directly.  Of course, the caller is expected to have used 
bL_set_entry_vector() beforehand, most probably with cpu_resume as 
argument.
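
For illustration only (this is not code from the series), a typical idle
path could combine the standard cpu_suspend()/cpu_resume machinery with
this API along the following lines; the MPIDR decoding is an assumption
about the platform:

#include <asm/bL_entry.h>
#include <asm/cputype.h>
#include <asm/suspend.h>

static int example_powerdown_finisher(unsigned long arg)
{
	unsigned int mpidr = read_cpuid_mpidr();
	unsigned int cpu = mpidr & 0xff;		/* assumes CPU id in Aff0 */
	unsigned int cluster = (mpidr >> 8) & 0xff;	/* assumes cluster in Aff1 */

	/* wake-up re-enters the kernel via bL_entry_point, then cpu_resume */
	bL_set_entry_vector(cpu, cluster, cpu_resume);
	bL_cpu_suspend(0);		/* residency unknown; does not return */
	return 1;			/* not reached */
}

/* Called with interrupts already disabled. */
static void example_enter_idle(void)
{
	/* context saved here is restored via cpu_resume after wake-up */
	cpu_suspend(0, example_powerdown_finisher);
	bL_cpu_powered_up();
}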


Nicolas
Santosh Shilimkar Jan. 11, 2013, 6:41 p.m. UTC | #7
On Saturday 12 January 2013 12:03 AM, Nicolas Pitre wrote:
> On Fri, 11 Jan 2013, Santosh Shilimkar wrote:
>
>> On Thursday 10 January 2013 05:50 AM, Nicolas Pitre wrote:
>>> This is the basic API used to handle the powering up/down of individual
>>> CPUs in a big.LITTLE system.  The platform specific backend implementation
>>> has the responsibility to also handle the cluster level power as well when
>>> the first/last CPU in a cluster is brought up/down.
>>>
>>> Signed-off-by: Nicolas Pitre <nico@linaro.org>
>>> ---
>>>    arch/arm/common/bL_entry.c      | 88
>>> +++++++++++++++++++++++++++++++++++++++
>>>    arch/arm/include/asm/bL_entry.h | 92
>>> +++++++++++++++++++++++++++++++++++++++++
>>>    2 files changed, 180 insertions(+)
>>>
>>> diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
>>> index 80fff49417..41de0622de 100644
>>> --- a/arch/arm/common/bL_entry.c
>>> +++ b/arch/arm/common/bL_entry.c
>>> @@ -11,11 +11,13 @@
>>>
>>>    #include <linux/kernel.h>
>>>    #include <linux/init.h>
>>> +#include <linux/irqflags.h>
>>>
>>>    #include <asm/bL_entry.h>
>>>    #include <asm/barrier.h>
>>>    #include <asm/proc-fns.h>
>>>    #include <asm/cacheflush.h>
>>> +#include <asm/idmap.h>
>>>
>>>    extern volatile unsigned long
>>> bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
>>>
>>> @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned cluster,
>>> void *ptr)
>>>    	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
>>>    			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
>>>    }
>>> +
>>> +static const struct bL_platform_power_ops *platform_ops;
>>> +
>>> +int __init bL_platform_power_register(const struct bL_platform_power_ops
>>> *ops)
>>> +{
>>> +	if (platform_ops)
>>> +		return -EBUSY;
>>> +	platform_ops = ops;
>>> +	return 0;
>>> +}
>>> +
>>> +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
>>> +{
>>> +	if (!platform_ops)
>>> +		return -EUNATCH;
>>> +	might_sleep();
>>> +	return platform_ops->power_up(cpu, cluster);
>>> +}
>>> +
>>> +typedef void (*phys_reset_t)(unsigned long);
>>> +
>>> +void bL_cpu_power_down(void)
>>> +{
>>> +	phys_reset_t phys_reset;
>>> +
>>> +	BUG_ON(!platform_ops);
>>> +	BUG_ON(!irqs_disabled());
>>> +
>>> +	/*
>>> +	 * Do this before calling into the power_down method,
>>> +	 * as it might not always be safe to do afterwards.
>>> +	 */
>>> +	setup_mm_for_reboot();
>>> +
>>> +	platform_ops->power_down();
>>> +
>>> +	/*
>>> +	 * It is possible for a power_up request to happen concurrently
>>> +	 * with a power_down request for the same CPU. In this case the
>>> +	 * power_down method might not be able to actually enter a
>>> +	 * powered down state with the WFI instruction if the power_up
>>> +	 * method has removed the required reset condition.  The
>>> +	 * power_down method is then allowed to return. We must perform
>>> +	 * a re-entry in the kernel as if the power_up method just had
>>> +	 * deasserted reset on the CPU.
>>> +	 *
>>> +	 * To simplify race issues, the platform specific implementation
>>> +	 * must accommodate for the possibility of unordered calls to
>>> +	 * power_down and power_up with a usage count. Therefore, if a
>>> +	 * call to power_up is issued for a CPU that is not down, then
>>> +	 * the next call to power_down must not attempt a full shutdown
>>> +	 * but only do the minimum (normally disabling L1 cache and CPU
>>> +	 * coherency) and return just as if a concurrent power_up request
>>> +	 * had happened as described above.
>>> +	 */
>>> +
>>> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
>>> +	phys_reset(virt_to_phys(bL_entry_point));
>>> +
>>> +	/* should never get here */
>>> +	BUG();
>>> +}
>>> +
>>> +void bL_cpu_suspend(u64 expected_residency)
>>> +{
>>> +	phys_reset_t phys_reset;
>>> +
>>> +	BUG_ON(!platform_ops);
>>> +	BUG_ON(!irqs_disabled());
>>> +
>>> +	/* Very similar to bL_cpu_power_down() */
>>> +	setup_mm_for_reboot();
>>> +	platform_ops->suspend(expected_residency);
>>> +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
>>> +	phys_reset(virt_to_phys(bL_entry_point));
>>> +	BUG();
>>>
>> I might be missing all the rationales behind not having a recovery for
>> CPUs entering suspend if they actualy come here because of some events.
>> This is pretty much possible in many scenario's and hence letting CPU
>> cpu come out of suspend should be possible. May be switcher code don't
>> have such requirement but it appeared bit off to me.
>
> There are two things to consider here:
>
> 1) The CPU is suspended.  CPU state is lost. Next interrupt to wake up
>     the CPU will make it restart from the reset vector and re-entry in
>     the kernel will happen via bL_entry_point to deal with the various
>     cluster issues, to eventually resume kernel code via cpu_resume.
>     Obviously, the machine specific backend code would have set the
>     bL_entry_point address in its machine specific reset vector in
>     advance.
This is the successful case, and in that case you will not hit the
BUG anyway.
>
> 2) An interrupt comes along before the CPU is effectively suspended, say
>     right before the backend code executes a WFI to shut the CPU down.
>     The CPU and possibly cluster state was already set for being powered
>     off.  We cannot simply return at this point as caches are off, the
>     CPU is not coherent with the rest of the system anymore, etc.  So if
>     the platform specific backend ever returns, say because the final WFI
>     exited, then we have to go through the same arbitration process to
>     restore the CPU and cluster state as if that was a hard reset.  Hence
>     the cpu_reset call to loop back into bL_entry_point.
>
This is the one I was thinking of. Enabling the C bit and SMP bit should
be enough for the CPU to get back to the right state since the CPU has
not lost its context: all the registers including SP are intact and the
CPU should be able to resume.

> In either cases, we simply cannot ever return from bL_cpu_suspend()
> directly.  Of course, the caller is expected to have used
> bL_set_entry_vector() beforehand, most probably with cpu_resume as
> argument.
>
The above might get complicated when the situation happens on the
last CPU, where even the CCI gets disabled, and then adding the revert
code for all of that may not be worth it. Your approach is much safer.
Thanks for explaining it further.

Regards,
Santosh
Nicolas Pitre Jan. 11, 2013, 7:54 p.m. UTC | #8
On Sat, 12 Jan 2013, Santosh Shilimkar wrote:

> On Saturday 12 January 2013 12:03 AM, Nicolas Pitre wrote:
> > On Fri, 11 Jan 2013, Santosh Shilimkar wrote:
> > 
> > > On Thursday 10 January 2013 05:50 AM, Nicolas Pitre wrote:
> > > > This is the basic API used to handle the powering up/down of individual
> > > > CPUs in a big.LITTLE system.  The platform specific backend
> > > > implementation
> > > > has the responsibility to also handle the cluster level power as well
> > > > when
> > > > the first/last CPU in a cluster is brought up/down.
> > > > 
> > > > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > > > ---
> > > >    arch/arm/common/bL_entry.c      | 88
> > > > +++++++++++++++++++++++++++++++++++++++
> > > >    arch/arm/include/asm/bL_entry.h | 92
> > > > +++++++++++++++++++++++++++++++++++++++++
> > > >    2 files changed, 180 insertions(+)
> > > > 
> > > > diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
> > > > index 80fff49417..41de0622de 100644
> > > > --- a/arch/arm/common/bL_entry.c
> > > > +++ b/arch/arm/common/bL_entry.c
> > > > @@ -11,11 +11,13 @@
> > > > 
> > > >    #include <linux/kernel.h>
> > > >    #include <linux/init.h>
> > > > +#include <linux/irqflags.h>
> > > > 
> > > >    #include <asm/bL_entry.h>
> > > >    #include <asm/barrier.h>
> > > >    #include <asm/proc-fns.h>
> > > >    #include <asm/cacheflush.h>
> > > > +#include <asm/idmap.h>
> > > > 
> > > >    extern volatile unsigned long
> > > > bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
> > > > 
> > > > @@ -28,3 +30,89 @@ void bL_set_entry_vector(unsigned cpu, unsigned
> > > > cluster,
> > > > void *ptr)
> > > >    	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
> > > >    			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
> > > >    }
> > > > +
> > > > +static const struct bL_platform_power_ops *platform_ops;
> > > > +
> > > > +int __init bL_platform_power_register(const struct
> > > > bL_platform_power_ops
> > > > *ops)
> > > > +{
> > > > +	if (platform_ops)
> > > > +		return -EBUSY;
> > > > +	platform_ops = ops;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
> > > > +{
> > > > +	if (!platform_ops)
> > > > +		return -EUNATCH;
> > > > +	might_sleep();
> > > > +	return platform_ops->power_up(cpu, cluster);
> > > > +}
> > > > +
> > > > +typedef void (*phys_reset_t)(unsigned long);
> > > > +
> > > > +void bL_cpu_power_down(void)
> > > > +{
> > > > +	phys_reset_t phys_reset;
> > > > +
> > > > +	BUG_ON(!platform_ops);
> > > > +	BUG_ON(!irqs_disabled());
> > > > +
> > > > +	/*
> > > > +	 * Do this before calling into the power_down method,
> > > > +	 * as it might not always be safe to do afterwards.
> > > > +	 */
> > > > +	setup_mm_for_reboot();
> > > > +
> > > > +	platform_ops->power_down();
> > > > +
> > > > +	/*
> > > > +	 * It is possible for a power_up request to happen concurrently
> > > > +	 * with a power_down request for the same CPU. In this case the
> > > > +	 * power_down method might not be able to actually enter a
> > > > +	 * powered down state with the WFI instruction if the power_up
> > > > +	 * method has removed the required reset condition.  The
> > > > +	 * power_down method is then allowed to return. We must perform
> > > > +	 * a re-entry in the kernel as if the power_up method just had
> > > > +	 * deasserted reset on the CPU.
> > > > +	 *
> > > > +	 * To simplify race issues, the platform specific implementation
> > > > +	 * must accommodate for the possibility of unordered calls to
> > > > +	 * power_down and power_up with a usage count. Therefore, if a
> > > > +	 * call to power_up is issued for a CPU that is not down, then
> > > > +	 * the next call to power_down must not attempt a full shutdown
> > > > +	 * but only do the minimum (normally disabling L1 cache and CPU
> > > > +	 * coherency) and return just as if a concurrent power_up request
> > > > +	 * had happened as described above.
> > > > +	 */
> > > > +
> > > > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > > > +	phys_reset(virt_to_phys(bL_entry_point));
> > > > +
> > > > +	/* should never get here */
> > > > +	BUG();
> > > > +}
> > > > +
> > > > +void bL_cpu_suspend(u64 expected_residency)
> > > > +{
> > > > +	phys_reset_t phys_reset;
> > > > +
> > > > +	BUG_ON(!platform_ops);
> > > > +	BUG_ON(!irqs_disabled());
> > > > +
> > > > +	/* Very similar to bL_cpu_power_down() */
> > > > +	setup_mm_for_reboot();
> > > > +	platform_ops->suspend(expected_residency);
> > > > +	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
> > > > +	phys_reset(virt_to_phys(bL_entry_point));
> > > > +	BUG();
> > > > 
> > > I might be missing all the rationales behind not having a recovery for
> > > CPUs entering suspend if they actualy come here because of some events.
> > > This is pretty much possible in many scenario's and hence letting CPU
> > > cpu come out of suspend should be possible. May be switcher code don't
> > > have such requirement but it appeared bit off to me.
> > 
> > There are two things to consider here:
> > 
> > 1) The CPU is suspended.  CPU state is lost. Next interrupt to wake up
> >     the CPU will make it restart from the reset vector and re-entry in
> >     the kernel will happen via bL_entry_point to deal with the various
> >     cluster issues, to eventually resume kernel code via cpu_resume.
> >     Obviously, the machine specific backend code would have set the
> >     bL_entry_point address in its machine specific reset vector in
> >     advance.
> This is the successful case and in that case you will anyway not hit the
> BUG.
> > 
> > 2) An interrupt comes along before the CPU is effectively suspended, say
> >     right before the backend code executes a WFI to shut the CPU down.
> >     The CPU and possibly cluster state was already set for being powered
> >     off.  We cannot simply return at this point as caches are off, the
> >     CPU is not coherent with the rest of the system anymore, etc.  So if
> >     the platform specific backend ever returns, say because the final WFI
> >     exited, then we have to go through the same arbitration process to
> >     restore the CPU and cluster state as if that was a hard reset.  Hence
> >     the cpu_reset call to loop back into bL_entry_point.
> > 
> This is the one I was thinking. Enabling C bit and SMP bit should be
> enough for CPU to get back to right state since the CPU has not lost
> the context all the registers including SP is intact and CPU should
> be able to resume.

No.  If we are the last man then the CCI is disabled and we cannot just 
enable the C and SMP bits anymore without turning the CCI back on.  And 
even if we are not the last man, maybe another CPU is concurrently going 
through the same code and that one _is_ the last man, in which case it 
will have waited until we are done flushing our cache to turn off the 
CCI but we don't know about that.  And yet another CPU might be coming 
up just at the same moment but this one will want to turn on the CCI and 
that has to be done in a controlled way, and that control is performed 
in the code from bL_entry_point.  So, in short, we cannot just return 
from here.

> > In either cases, we simply cannot ever return from bL_cpu_suspend()
> > directly.  Of course, the caller is expected to have used
> > bL_set_entry_vector() beforehand, most probably with cpu_resume as
> > argument.
> > 
> The above might get complicated if when above situation happens on
> last CPU where even CCI gets disabled and then adding the rever code
> for all of that may not be worth. You approach is much safer.

Indeed.  Furthermore, that revert code does exist already: it is all in 
bL_head.S.  Hence the cpu_reset(bL_entry_point) call.

> Thanks for explaining it further.

No problem.  It literally took us months to get this code right,
so it might not be fully obvious to others at first glance.


Nicolas

Patch

diff --git a/arch/arm/common/bL_entry.c b/arch/arm/common/bL_entry.c
index 80fff49417..41de0622de 100644
--- a/arch/arm/common/bL_entry.c
+++ b/arch/arm/common/bL_entry.c
@@ -11,11 +11,13 @@ 
 
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/irqflags.h>
 
 #include <asm/bL_entry.h>
 #include <asm/barrier.h>
 #include <asm/proc-fns.h>
 #include <asm/cacheflush.h>
+#include <asm/idmap.h>
 
 extern volatile unsigned long bL_entry_vectors[BL_NR_CLUSTERS][BL_CPUS_PER_CLUSTER];
 
@@ -28,3 +30,89 @@  void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 	outer_clean_range(__pa(&bL_entry_vectors[cluster][cpu]),
 			  __pa(&bL_entry_vectors[cluster][cpu + 1]));
 }
+
+static const struct bL_platform_power_ops *platform_ops;
+
+int __init bL_platform_power_register(const struct bL_platform_power_ops *ops)
+{
+	if (platform_ops)
+		return -EBUSY;
+	platform_ops = ops;
+	return 0;
+}
+
+int bL_cpu_power_up(unsigned int cpu, unsigned int cluster)
+{
+	if (!platform_ops)
+		return -EUNATCH;
+	might_sleep();
+	return platform_ops->power_up(cpu, cluster);
+}
+
+typedef void (*phys_reset_t)(unsigned long);
+
+void bL_cpu_power_down(void)
+{
+	phys_reset_t phys_reset;
+
+	BUG_ON(!platform_ops);
+	BUG_ON(!irqs_disabled());
+
+	/*
+	 * Do this before calling into the power_down method,
+	 * as it might not always be safe to do afterwards.
+	 */
+	setup_mm_for_reboot();
+
+	platform_ops->power_down();
+
+	/*
+	 * It is possible for a power_up request to happen concurrently
+	 * with a power_down request for the same CPU. In this case the
+	 * power_down method might not be able to actually enter a
+	 * powered down state with the WFI instruction if the power_up
+	 * method has removed the required reset condition.  The
+	 * power_down method is then allowed to return. We must perform
+	 * a re-entry in the kernel as if the power_up method just had
+	 * deasserted reset on the CPU.
+	 *
+	 * To simplify race issues, the platform specific implementation
+	 * must accommodate for the possibility of unordered calls to
+	 * power_down and power_up with a usage count. Therefore, if a
+	 * call to power_up is issued for a CPU that is not down, then
+	 * the next call to power_down must not attempt a full shutdown
+	 * but only do the minimum (normally disabling L1 cache and CPU
+	 * coherency) and return just as if a concurrent power_up request
+	 * had happened as described above.
+	 */
+
+	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
+	phys_reset(virt_to_phys(bL_entry_point));
+
+	/* should never get here */
+	BUG();
+}
+
+void bL_cpu_suspend(u64 expected_residency)
+{
+	phys_reset_t phys_reset;
+
+	BUG_ON(!platform_ops);
+	BUG_ON(!irqs_disabled());
+
+	/* Very similar to bL_cpu_power_down() */
+	setup_mm_for_reboot();
+	platform_ops->suspend(expected_residency);
+	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
+	phys_reset(virt_to_phys(bL_entry_point));
+	BUG();
+}
+
+int bL_cpu_powered_up(void)
+{
+	if (!platform_ops)
+		return -EUNATCH;
+	if (platform_ops->powered_up)
+		platform_ops->powered_up();
+	return 0;
+}
diff --git a/arch/arm/include/asm/bL_entry.h b/arch/arm/include/asm/bL_entry.h
index ff623333a1..942d7f9f19 100644
--- a/arch/arm/include/asm/bL_entry.h
+++ b/arch/arm/include/asm/bL_entry.h
@@ -31,5 +31,97 @@  extern void bL_entry_point(void);
  */
 void bL_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
 
+/*
+ * CPU/cluster power operations API for higher subsystems to use.
+ */
+
+/**
+ * bL_cpu_power_up - make given CPU in given cluster runnable
+ *
+ * @cpu: CPU number within given cluster
+ * @cluster: cluster number for the CPU
+ *
+ * The identified CPU is brought out of reset.  If the cluster was powered
+ * down then it is brought up as well, taking care not to let the other CPUs
+ * in the cluster run, and ensuring appropriate cluster setup.
+ *
+ * Caller must ensure the appropriate entry vector is initialized with
+ * bL_set_entry_vector() prior to calling this.
+ *
+ * This must be called in a sleepable context.  However, the implementation
+ * is strongly encouraged to return early and let the operation happen
+ * asynchronously, especially when significant delays are expected.
+ *
+ * If the operation cannot be performed then an error code is returned.
+ */
+int bL_cpu_power_up(unsigned int cpu, unsigned int cluster);
+
+/**
+ * bL_cpu_power_down - power the calling CPU down
+ *
+ * The calling CPU is powered down.
+ *
+ * If this CPU is found to be the "last man standing" in the cluster
+ * then the cluster is prepared for power-down too.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * This does not return.  Re-entry in the kernel is expected via
+ * bL_entry_point.
+ */
+void bL_cpu_power_down(void);
+
+/**
+ * bL_cpu_suspend - bring the calling CPU in a suspended state
+ *
+ * @expected_residency: duration in microseconds the CPU is expected
+ *			to remain suspended, or 0 if unknown/infinity.
+ *
+ * The calling CPU is suspended.  The expected residency argument is used
+ * as a hint by the platform specific backend to implement the appropriate
+ * sleep state level according to the knowledge it has on wake-up latency
+ * for the given hardware.
+ *
+ * If this CPU is found to be the "last man standing" in the cluster
+ * then the cluster may be prepared for power-down too, if the expected
+ * residency makes it worthwhile.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * This does not return.  Re-entry in the kernel is expected via
+ * bL_entry_point.
+ */
+void bL_cpu_suspend(u64 expected_residency);
+
+/**
+ * bL_cpu_powered_up - housekeeping work after a CPU has been powered up
+ *
+ * This lets the platform specific backend code perform needed housekeeping
+ * work.  This must be called by the newly activated CPU as soon as it is
+ * fully operational in kernel space, before it enables interrupts.
+ *
+ * If the operation cannot be performed then an error code is returned.
+ */
+int bL_cpu_powered_up(void);
+
+/*
+ * Platform specific methods used in the implementation of the above API.
+ */
+struct bL_platform_power_ops {
+	int (*power_up)(unsigned int cpu, unsigned int cluster);
+	void (*power_down)(void);
+	void (*suspend)(u64);
+	void (*powered_up)(void);
+};
+
+/**
+ * bL_platform_power_register - register platform specific power methods
+ *
+ * @ops: bL_platform_power_ops structure to register
+ *
+ * An error is returned if the registration has been done previously.
+ */
+int __init bL_platform_power_register(const struct bL_platform_power_ops *ops);
+
 #endif /* ! __ASSEMBLY__ */
 #endif