diff mbox series

[v9,09/17] arm: tegra20: cpuidle: Handle case where secondary CPU hangs on entering LP2

Message ID 20200212235134.12638-10-digetx@gmail.com (mailing list archive)
State Not Applicable, archived
Headers show
Series Consolidate and improve NVIDIA Tegra CPUIDLE driver(s) | expand

Commit Message

Dmitry Osipenko Feb. 12, 2020, 11:51 p.m. UTC
It is possible that something may go wrong with the secondary CPU, in that
case it is much nicer to get a dump of the flow-controller state before
hanging machine.

Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Jasper Korten <jja2000@gmail.com>
Tested-by: David Heidelberg <david@ixit.cz>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 arch/arm/mach-tegra/cpuidle-tegra20.c | 47 +++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

Comments

Daniel Lezcano Feb. 21, 2020, 2:55 p.m. UTC | #1
On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
> It is possible that something may go wrong with the secondary CPU, in that
> case it is much nicer to get a dump of the flow-controller state before
> hanging machine.
> 
> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
> Tested-by: Peter Geis <pgwipeout@gmail.com>
> Tested-by: Jasper Korten <jja2000@gmail.com>
> Tested-by: David Heidelberg <david@ixit.cz>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>

> ---
>  arch/arm/mach-tegra/cpuidle-tegra20.c | 47 +++++++++++++++++++++++++--
>  1 file changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c
> index 9672c619f4bc..bcc158b72e67 100644
> --- a/arch/arm/mach-tegra/cpuidle-tegra20.c
> +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c
> @@ -83,14 +83,57 @@ static inline void tegra20_wake_cpu1_from_reset(void)
>  }
>  #endif
>  
> +static void tegra20_report_cpus_state(void)
> +{
> +	unsigned long cpu, lcpu, csr;
> +
> +	for_each_cpu(lcpu, cpu_possible_mask) {
> +		cpu = cpu_logical_map(lcpu);
> +		csr = flowctrl_read_cpu_csr(cpu);
> +
> +		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
> +		       cpu, cpu_online(lcpu), csr);
> +	}
> +}
> +
> +static int tegra20_wait_for_secondary_cpu_parking(void)
> +{
> +	unsigned int retries = 3;
> +
> +	while (retries--) {
> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
> +
> +		/*
> +		 * The primary CPU0 core shall wait for the secondaries
> +		 * shutdown in order to power-off CPU's cluster safely.
> +		 * The timeout value depends on the current CPU frequency,
> +		 * it takes about 40-150us  in average and over 1000us in
> +		 * a worst case scenario.
> +		 */
> +		do {
> +			if (tegra_cpu_rail_off_ready())
> +				return 0;
> +
> +		} while (ktime_before(ktime_get(), timeout));
> +
> +		pr_err("secondary CPU taking too long to park\n");
> +
> +		tegra20_report_cpus_state();
> +	}
> +
> +	pr_err("timed out waiting secondaries to park\n");
> +
> +	return -ETIMEDOUT;
> +}
> +
>  static bool tegra20_cpu_cluster_power_down(struct cpuidle_device *dev,
>  					   struct cpuidle_driver *drv,
>  					   int index)
>  {
>  	bool ret;
>  
> -	while (!tegra_cpu_rail_off_ready())
> -		cpu_relax();
> +	if (tegra20_wait_for_secondary_cpu_parking())
> +		return false;
>  
>  	ret = !tegra_pm_enter_lp2();
>  
> -- 
> 2.24.0
>
Daniel Lezcano Feb. 21, 2020, 3:43 p.m. UTC | #2
On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
> It is possible that something may go wrong with the secondary CPU, in that
> case it is much nicer to get a dump of the flow-controller state before
> hanging machine.
> 
> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
> Tested-by: Peter Geis <pgwipeout@gmail.com>
> Tested-by: Jasper Korten <jja2000@gmail.com>
> Tested-by: David Heidelberg <david@ixit.cz>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>  arch/arm/mach-tegra/cpuidle-tegra20.c | 47 +++++++++++++++++++++++++--
>  1 file changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c
> index 9672c619f4bc..bcc158b72e67 100644
> --- a/arch/arm/mach-tegra/cpuidle-tegra20.c
> +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c
> @@ -83,14 +83,57 @@ static inline void tegra20_wake_cpu1_from_reset(void)
>  }
>  #endif
>  
> +static void tegra20_report_cpus_state(void)
> +{
> +	unsigned long cpu, lcpu, csr;
> +
> +	for_each_cpu(lcpu, cpu_possible_mask) {
> +		cpu = cpu_logical_map(lcpu);
> +		csr = flowctrl_read_cpu_csr(cpu);
> +
> +		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
> +		       cpu, cpu_online(lcpu), csr);
> +	}
> +}
> +
> +static int tegra20_wait_for_secondary_cpu_parking(void)
> +{
> +	unsigned int retries = 3;
> +
> +	while (retries--) {
> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);

Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.

> +
> +		/*
> +		 * The primary CPU0 core shall wait for the secondaries
> +		 * shutdown in order to power-off CPU's cluster safely.
> +		 * The timeout value depends on the current CPU frequency,
> +		 * it takes about 40-150us  in average and over 1000us in
> +		 * a worst case scenario.
> +		 */
> +		do {
> +			if (tegra_cpu_rail_off_ready())
> +				return 0;
> +
> +		} while (ktime_before(ktime_get(), timeout));

So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
here but the function will hang 1.5s :/

I suggest something like:

	while (retries--i && !tegra_cpu_rail_off_ready()) 
		udelay(100);

So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
impact.

> +		pr_err("secondary CPU taking too long to park\n");
> +
> +		tegra20_report_cpus_state();
> +	}
> +
> +	pr_err("timed out waiting secondaries to park\n");
> +
> +	return -ETIMEDOUT;
> +}
> +
>  static bool tegra20_cpu_cluster_power_down(struct cpuidle_device *dev,
>  					   struct cpuidle_driver *drv,
>  					   int index)
>  {
>  	bool ret;
>  
> -	while (!tegra_cpu_rail_off_ready())
> -		cpu_relax();
> +	if (tegra20_wait_for_secondary_cpu_parking())
> +		return false;
>  
>  	ret = !tegra_pm_enter_lp2();
>  
> -- 
> 2.24.0
>
Dmitry Osipenko Feb. 21, 2020, 4:56 p.m. UTC | #3
Hello Daniel,

21.02.2020 18:43, Daniel Lezcano пишет:
> On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
>> It is possible that something may go wrong with the secondary CPU, in that
>> case it is much nicer to get a dump of the flow-controller state before
>> hanging machine.
>>
>> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
>> Tested-by: Peter Geis <pgwipeout@gmail.com>
>> Tested-by: Jasper Korten <jja2000@gmail.com>
>> Tested-by: David Heidelberg <david@ixit.cz>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>  arch/arm/mach-tegra/cpuidle-tegra20.c | 47 +++++++++++++++++++++++++--
>>  1 file changed, 45 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c
>> index 9672c619f4bc..bcc158b72e67 100644
>> --- a/arch/arm/mach-tegra/cpuidle-tegra20.c
>> +++ b/arch/arm/mach-tegra/cpuidle-tegra20.c
>> @@ -83,14 +83,57 @@ static inline void tegra20_wake_cpu1_from_reset(void)
>>  }
>>  #endif
>>  
>> +static void tegra20_report_cpus_state(void)
>> +{
>> +	unsigned long cpu, lcpu, csr;
>> +
>> +	for_each_cpu(lcpu, cpu_possible_mask) {
>> +		cpu = cpu_logical_map(lcpu);
>> +		csr = flowctrl_read_cpu_csr(cpu);
>> +
>> +		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
>> +		       cpu, cpu_online(lcpu), csr);
>> +	}
>> +}
>> +
>> +static int tegra20_wait_for_secondary_cpu_parking(void)
>> +{
>> +	unsigned int retries = 3;
>> +
>> +	while (retries--) {
>> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
> 
> Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.

Could you please explain what benefits jiffies have over the ktime_get()?

>> +
>> +		/*
>> +		 * The primary CPU0 core shall wait for the secondaries
>> +		 * shutdown in order to power-off CPU's cluster safely.
>> +		 * The timeout value depends on the current CPU frequency,
>> +		 * it takes about 40-150us  in average and over 1000us in
>> +		 * a worst case scenario.
>> +		 */
>> +		do {
>> +			if (tegra_cpu_rail_off_ready())
>> +				return 0;
>> +
>> +		} while (ktime_before(ktime_get(), timeout));
> 
> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
> here but the function will hang 1.5s :/
> 
> I suggest something like:
> 
> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
> 		udelay(100);
> 
> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
> impact.
But udelay() also results into CPU spinning in a busy-loop, and thus,
what's the difference?
Daniel Lezcano Feb. 21, 2020, 5:36 p.m. UTC | #4
On Fri, Feb 21, 2020 at 07:56:51PM +0300, Dmitry Osipenko wrote:
> Hello Daniel,
> 
> 21.02.2020 18:43, Daniel Lezcano пишет:
> > On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
> >> It is possible that something may go wrong with the secondary CPU, in that
> >> case it is much nicer to get a dump of the flow-controller state before
> >> hanging machine.
> >>
> >> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
> >> Tested-by: Peter Geis <pgwipeout@gmail.com>
> >> Tested-by: Jasper Korten <jja2000@gmail.com>
> >> Tested-by: David Heidelberg <david@ixit.cz>
> >> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> >> ---

[ ... ]

> >> +static int tegra20_wait_for_secondary_cpu_parking(void)
> >> +{
> >> +	unsigned int retries = 3;
> >> +
> >> +	while (retries--) {
> >> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
> > 
> > Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.
> 
> Could you please explain what benefits jiffies have over the ktime_get()?

ktime_get() is very slow, jiffies is updated every tick.

> >> +
> >> +		/*
> >> +		 * The primary CPU0 core shall wait for the secondaries
> >> +		 * shutdown in order to power-off CPU's cluster safely.
> >> +		 * The timeout value depends on the current CPU frequency,
> >> +		 * it takes about 40-150us  in average and over 1000us in
> >> +		 * a worst case scenario.
> >> +		 */
> >> +		do {
> >> +			if (tegra_cpu_rail_off_ready())
> >> +				return 0;
> >> +
> >> +		} while (ktime_before(ktime_get(), timeout));
> > 
> > So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
> > times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
> > here but the function will hang 1.5s :/
> > 
> > I suggest something like:
> > 
> > 	while (retries--i && !tegra_cpu_rail_off_ready()) 
> > 		udelay(100);
> > 
> > So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
> > impact.
> But udelay() also results into CPU spinning in a busy-loop, and thus,
> what's the difference?

busy looping instead of register reads with all the hardware things involved behind.
Dmitry Osipenko Feb. 21, 2020, 6:19 p.m. UTC | #5
21.02.2020 20:36, Daniel Lezcano пишет:
> On Fri, Feb 21, 2020 at 07:56:51PM +0300, Dmitry Osipenko wrote:
>> Hello Daniel,
>>
>> 21.02.2020 18:43, Daniel Lezcano пишет:
>>> On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
>>>> It is possible that something may go wrong with the secondary CPU, in that
>>>> case it is much nicer to get a dump of the flow-controller state before
>>>> hanging machine.
>>>>
>>>> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
>>>> Tested-by: Peter Geis <pgwipeout@gmail.com>
>>>> Tested-by: Jasper Korten <jja2000@gmail.com>
>>>> Tested-by: David Heidelberg <david@ixit.cz>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
> 
> [ ... ]
> 
>>>> +static int tegra20_wait_for_secondary_cpu_parking(void)
>>>> +{
>>>> +	unsigned int retries = 3;
>>>> +
>>>> +	while (retries--) {
>>>> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
>>>
>>> Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.
>>
>> Could you please explain what benefits jiffies have over the ktime_get()?
> 
> ktime_get() is very slow, jiffies is updated every tick.

But how jiffies are supposed to be updated if interrupts are disabled?

Aren't jiffies actually slower than ktime_get() because jiffies are
updating every 10/1ms (depending on CONFIG_HZ)?

We're kinda interesting here in getting into deep-idling state as quick
as possible. I was checking how much time takes the busy-loop below and
it takes ~40-150us in average, which is good enough.

>>>> +
>>>> +		/*
>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>> +		 * The timeout value depends on the current CPU frequency,
>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>> +		 * a worst case scenario.
>>>> +		 */
>>>> +		do {
>>>> +			if (tegra_cpu_rail_off_ready())
>>>> +				return 0;
>>>> +
>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>
>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>> here but the function will hang 1.5s :/
>>>
>>> I suggest something like:
>>>
>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>> 		udelay(100);
>>>
>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>> impact.
>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>> what's the difference?
> 
> busy looping instead of register reads with all the hardware things involved behind.

Please notice that this code runs only on an older Cortex-A9/A15, which
doesn't support WFE for the delaying, and thus, CPU always busy-loops
inside udelay().

What about if I'll add cpu_relax() to the loop? Do you think it it could
have any positive effect?
Daniel Lezcano Feb. 21, 2020, 8:02 p.m. UTC | #6
On 21/02/2020 19:19, Dmitry Osipenko wrote:
> 21.02.2020 20:36, Daniel Lezcano пишет:
>> On Fri, Feb 21, 2020 at 07:56:51PM +0300, Dmitry Osipenko wrote:
>>> Hello Daniel,
>>>
>>> 21.02.2020 18:43, Daniel Lezcano пишет:
>>>> On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
>>>>> It is possible that something may go wrong with the secondary CPU, in that
>>>>> case it is much nicer to get a dump of the flow-controller state before
>>>>> hanging machine.
>>>>>
>>>>> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
>>>>> Tested-by: Peter Geis <pgwipeout@gmail.com>
>>>>> Tested-by: Jasper Korten <jja2000@gmail.com>
>>>>> Tested-by: David Heidelberg <david@ixit.cz>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>
>> [ ... ]
>>
>>>>> +static int tegra20_wait_for_secondary_cpu_parking(void)
>>>>> +{
>>>>> +	unsigned int retries = 3;
>>>>> +
>>>>> +	while (retries--) {
>>>>> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
>>>>
>>>> Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.
>>>
>>> Could you please explain what benefits jiffies have over the ktime_get()?
>>
>> ktime_get() is very slow, jiffies is updated every tick.
> 
> But how jiffies are supposed to be updated if interrupts are disabled?

Yeah, other cpus must not be idle in this.

> Aren't jiffies actually slower than ktime_get() because jiffies are
> updating every 10/1ms (depending on CONFIG_HZ)?

They are no slower, they have a lower resolution which is 10ms or 4ms.

Given the 500ms timeout, it is fine.

> We're kinda interesting here in getting into deep-idling state as quick
> as possible. I was checking how much time takes the busy-loop below and
> it takes ~40-150us in average, which is good enough.

ktime_get() gets a seq lock and it is very slow.

>>>>> +
>>>>> +		/*
>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>> +		 * a worst case scenario.
>>>>> +		 */
>>>>> +		do {
>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>> +				return 0;
>>>>> +
>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>
>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>> here but the function will hang 1.5s :/
>>>>
>>>> I suggest something like:
>>>>
>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>> 		udelay(100);
>>>>
>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>> impact.
>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>> what's the difference?
>>
>> busy looping instead of register reads with all the hardware things involved behind.
> 
> Please notice that this code runs only on an older Cortex-A9/A15, which
> doesn't support WFE for the delaying, and thus, CPU always busy-loops
> inside udelay().
> 
> What about if I'll add cpu_relax() to the loop? Do you think it it could
> have any positive effect?

I think udelay() has a call to cpu_relax().
Dmitry Osipenko Feb. 21, 2020, 8:21 p.m. UTC | #7
21.02.2020 23:02, Daniel Lezcano пишет:
> On 21/02/2020 19:19, Dmitry Osipenko wrote:
>> 21.02.2020 20:36, Daniel Lezcano пишет:
>>> On Fri, Feb 21, 2020 at 07:56:51PM +0300, Dmitry Osipenko wrote:
>>>> Hello Daniel,
>>>>
>>>> 21.02.2020 18:43, Daniel Lezcano пишет:
>>>>> On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
>>>>>> It is possible that something may go wrong with the secondary CPU, in that
>>>>>> case it is much nicer to get a dump of the flow-controller state before
>>>>>> hanging machine.
>>>>>>
>>>>>> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
>>>>>> Tested-by: Peter Geis <pgwipeout@gmail.com>
>>>>>> Tested-by: Jasper Korten <jja2000@gmail.com>
>>>>>> Tested-by: David Heidelberg <david@ixit.cz>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>
>>> [ ... ]
>>>
>>>>>> +static int tegra20_wait_for_secondary_cpu_parking(void)
>>>>>> +{
>>>>>> +	unsigned int retries = 3;
>>>>>> +
>>>>>> +	while (retries--) {
>>>>>> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
>>>>>
>>>>> Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.
>>>>
>>>> Could you please explain what benefits jiffies have over the ktime_get()?
>>>
>>> ktime_get() is very slow, jiffies is updated every tick.
>>
>> But how jiffies are supposed to be updated if interrupts are disabled?
> 
> Yeah, other cpus must not be idle in this.

Okay, then jiffies can't be used here because this function is used for
the coupled / power-gated state only. All CPUs are idling in this state.

>> Aren't jiffies actually slower than ktime_get() because jiffies are
>> updating every 10/1ms (depending on CONFIG_HZ)?
> 
> They are no slower, they have a lower resolution which is 10ms or 4ms.
> 
> Given the 500ms timeout, it is fine.
> 
>> We're kinda interesting here in getting into deep-idling state as quick
>> as possible. I was checking how much time takes the busy-loop below and
>> it takes ~40-150us in average, which is good enough.
> 
> ktime_get() gets a seq lock and it is very slow.

Since all CPUs are idling here, the locking isn't a problem.

The wait_for_secondary_cpu_parking() function is called on CPU0, it
waits for the secondary CPUs to enter into safe-state before CPU0 could
power-gate the whole CPU cluster.

>>>>>> +
>>>>>> +		/*
>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>> +		 * a worst case scenario.
>>>>>> +		 */
>>>>>> +		do {
>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>> +				return 0;
>>>>>> +
>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>
>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>> here but the function will hang 1.5s :/
>>>>>
>>>>> I suggest something like:
>>>>>
>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>> 		udelay(100);
>>>>>
>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>> impact.
>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>> what's the difference?
>>>
>>> busy looping instead of register reads with all the hardware things involved behind.
>>
>> Please notice that this code runs only on an older Cortex-A9/A15, which
>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>> inside udelay().
>>
>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>> have any positive effect?
> 
> I think udelay() has a call to cpu_relax().

Yes, my point is that udelay() doesn't bring much benefit for us here
because:

1. we want to enter into power-gated state as quick as possible and
udelay() just adds an unnecessary delay

2. udelay() spins in a busy-loop until delay is expired, just like we're
doing it in this function already
Dmitry Osipenko Feb. 21, 2020, 8:42 p.m. UTC | #8
21.02.2020 23:21, Dmitry Osipenko пишет:
> 21.02.2020 23:02, Daniel Lezcano пишет:
>> On 21/02/2020 19:19, Dmitry Osipenko wrote:
>>> 21.02.2020 20:36, Daniel Lezcano пишет:
>>>> On Fri, Feb 21, 2020 at 07:56:51PM +0300, Dmitry Osipenko wrote:
>>>>> Hello Daniel,
>>>>>
>>>>> 21.02.2020 18:43, Daniel Lezcano пишет:
>>>>>> On Thu, Feb 13, 2020 at 02:51:26AM +0300, Dmitry Osipenko wrote:
>>>>>>> It is possible that something may go wrong with the secondary CPU, in that
>>>>>>> case it is much nicer to get a dump of the flow-controller state before
>>>>>>> hanging machine.
>>>>>>>
>>>>>>> Acked-by: Peter De Schrijver <pdeschrijver@nvidia.com>
>>>>>>> Tested-by: Peter Geis <pgwipeout@gmail.com>
>>>>>>> Tested-by: Jasper Korten <jja2000@gmail.com>
>>>>>>> Tested-by: David Heidelberg <david@ixit.cz>
>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>> ---
>>>>
>>>> [ ... ]
>>>>
>>>>>>> +static int tegra20_wait_for_secondary_cpu_parking(void)
>>>>>>> +{
>>>>>>> +	unsigned int retries = 3;
>>>>>>> +
>>>>>>> +	while (retries--) {
>>>>>>> +		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
>>>>>>
>>>>>> Oops I missed this one. Do not use ktime_get() in this code path, use jiffies.
>>>>>
>>>>> Could you please explain what benefits jiffies have over the ktime_get()?
>>>>
>>>> ktime_get() is very slow, jiffies is updated every tick.
>>>
>>> But how jiffies are supposed to be updated if interrupts are disabled?
>>
>> Yeah, other cpus must not be idle in this.
> 
> Okay, then jiffies can't be used here because this function is used for
> the coupled / power-gated state only. All CPUs are idling in this state.
> 
>>> Aren't jiffies actually slower than ktime_get() because jiffies are
>>> updating every 10/1ms (depending on CONFIG_HZ)?
>>
>> They are no slower, they have a lower resolution which is 10ms or 4ms.
>>
>> Given the 500ms timeout, it is fine.
>>
>>> We're kinda interesting here in getting into deep-idling state as quick
>>> as possible. I was checking how much time takes the busy-loop below and
>>> it takes ~40-150us in average, which is good enough.
>>
>> ktime_get() gets a seq lock and it is very slow.
> 
> Since all CPUs are idling here, the locking isn't a problem.
> 
> The wait_for_secondary_cpu_parking() function is called on CPU0, it
> waits for the secondary CPUs to enter into safe-state before CPU0 could
> power-gate the whole CPU cluster.
> 
>>>>>>> +
>>>>>>> +		/*
>>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>>> +		 * a worst case scenario.
>>>>>>> +		 */
>>>>>>> +		do {
>>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>>> +				return 0;
>>>>>>> +
>>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>>
>>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>>> here but the function will hang 1.5s :/
>>>>>>
>>>>>> I suggest something like:
>>>>>>
>>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>>> 		udelay(100);
>>>>>>
>>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>>> impact.
>>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>>> what's the difference?
>>>>
>>>> busy looping instead of register reads with all the hardware things involved behind.
>>>
>>> Please notice that this code runs only on an older Cortex-A9/A15, which
>>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>>> inside udelay().
>>>
>>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>>> have any positive effect?
>>
>> I think udelay() has a call to cpu_relax().
> 
> Yes, my point is that udelay() doesn't bring much benefit for us here
> because:
> 
> 1. we want to enter into power-gated state as quick as possible and
> udelay() just adds an unnecessary delay
> 
> 2. udelay() spins in a busy-loop until delay is expired, just like we're
> doing it in this function already

I'll try the udelay()-loop over the weekend and will see if it makes any
real difference, maybe I'm missing something.

If it doesn't make any difference, I'll leave this patch as-is, okay?
Daniel Lezcano Feb. 21, 2020, 8:48 p.m. UTC | #9
On 21/02/2020 21:21, Dmitry Osipenko wrote:
> 21.02.2020 23:02, Daniel Lezcano пишет:

[ ... ]

>>>>>>> +
>>>>>>> +		/*
>>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>>> +		 * a worst case scenario.
>>>>>>> +		 */
>>>>>>> +		do {
>>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>>> +				return 0;
>>>>>>> +
>>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>>
>>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>>> here but the function will hang 1.5s :/
>>>>>>
>>>>>> I suggest something like:
>>>>>>
>>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>>> 		udelay(100);
>>>>>>
>>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>>> impact.
>>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>>> what's the difference?
>>>>
>>>> busy looping instead of register reads with all the hardware things involved behind.
>>>
>>> Please notice that this code runs only on an older Cortex-A9/A15, which
>>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>>> inside udelay().
>>>
>>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>>> have any positive effect?
>>
>> I think udelay() has a call to cpu_relax().
> 
> Yes, my point is that udelay() doesn't bring much benefit for us here
> because:
> 
> 1. we want to enter into power-gated state as quick as possible and
> udelay() just adds an unnecessary delay
> 
> 2. udelay() spins in a busy-loop until delay is expired, just like we're
> doing it in this function already

In this case why not remove ktime_get() and increase the number of retries?
Dmitry Osipenko Feb. 21, 2020, 8:54 p.m. UTC | #10
21.02.2020 23:48, Daniel Lezcano пишет:
> On 21/02/2020 21:21, Dmitry Osipenko wrote:
>> 21.02.2020 23:02, Daniel Lezcano пишет:
> 
> [ ... ]
> 
>>>>>>>> +
>>>>>>>> +		/*
>>>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>>>> +		 * a worst case scenario.
>>>>>>>> +		 */
>>>>>>>> +		do {
>>>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>>>> +				return 0;
>>>>>>>> +
>>>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>>>
>>>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>>>> here but the function will hang 1.5s :/
>>>>>>>
>>>>>>> I suggest something like:
>>>>>>>
>>>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>>>> 		udelay(100);
>>>>>>>
>>>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>>>> impact.
>>>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>>>> what's the difference?
>>>>>
>>>>> busy looping instead of register reads with all the hardware things involved behind.
>>>>
>>>> Please notice that this code runs only on an older Cortex-A9/A15, which
>>>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>>>> inside udelay().
>>>>
>>>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>>>> have any positive effect?
>>>
>>> I think udelay() has a call to cpu_relax().
>>
>> Yes, my point is that udelay() doesn't bring much benefit for us here
>> because:
>>
>> 1. we want to enter into power-gated state as quick as possible and
>> udelay() just adds an unnecessary delay
>>
>> 2. udelay() spins in a busy-loop until delay is expired, just like we're
>> doing it in this function already
> 
> In this case why not remove ktime_get() and increase the number of retries?

Because the busy-loop performance depends on CPU's frequency, so we
can't rely on a bare number of the retries.
Daniel Lezcano Feb. 21, 2020, 9:11 p.m. UTC | #11
On 21/02/2020 21:54, Dmitry Osipenko wrote:
> 21.02.2020 23:48, Daniel Lezcano пишет:
>> On 21/02/2020 21:21, Dmitry Osipenko wrote:
>>> 21.02.2020 23:02, Daniel Lezcano пишет:
>>
>> [ ... ]
>>
>>>>>>>>> +
>>>>>>>>> +		/*
>>>>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>>>>> +		 * a worst case scenario.
>>>>>>>>> +		 */
>>>>>>>>> +		do {
>>>>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>>>>> +				return 0;
>>>>>>>>> +
>>>>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>>>>
>>>>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>>>>> here but the function will hang 1.5s :/
>>>>>>>>
>>>>>>>> I suggest something like:
>>>>>>>>
>>>>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>>>>> 		udelay(100);
>>>>>>>>
>>>>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>>>>> impact.
>>>>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>>>>> what's the difference?
>>>>>>
>>>>>> busy looping instead of register reads with all the hardware things involved behind.
>>>>>
>>>>> Please notice that this code runs only on an older Cortex-A9/A15, which
>>>>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>>>>> inside udelay().
>>>>>
>>>>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>>>>> have any positive effect?
>>>>
>>>> I think udelay() has a call to cpu_relax().
>>>
>>> Yes, my point is that udelay() doesn't bring much benefit for us here
>>> because:
>>>
>>> 1. we want to enter into power-gated state as quick as possible and
>>> udelay() just adds an unnecessary delay
>>>
>>> 2. udelay() spins in a busy-loop until delay is expired, just like we're
>>> doing it in this function already
>>
>> In this case why not remove ktime_get() and increase the number of retries?
> 
> Because the busy-loop performance depends on CPU's frequency, so we
> can't rely on a bare number of the retries.

Why not if computed in the worst case scenario?

Anyway, I'll let you give a try.
Dmitry Osipenko Feb. 24, 2020, 3:12 p.m. UTC | #12
22.02.2020 00:11, Daniel Lezcano пишет:
> On 21/02/2020 21:54, Dmitry Osipenko wrote:
>> 21.02.2020 23:48, Daniel Lezcano пишет:
>>> On 21/02/2020 21:21, Dmitry Osipenko wrote:
>>>> 21.02.2020 23:02, Daniel Lezcano пишет:
>>>
>>> [ ... ]
>>>
>>>>>>>>>> +
>>>>>>>>>> +		/*
>>>>>>>>>> +		 * The primary CPU0 core shall wait for the secondaries
>>>>>>>>>> +		 * shutdown in order to power-off CPU's cluster safely.
>>>>>>>>>> +		 * The timeout value depends on the current CPU frequency,
>>>>>>>>>> +		 * it takes about 40-150us  in average and over 1000us in
>>>>>>>>>> +		 * a worst case scenario.
>>>>>>>>>> +		 */
>>>>>>>>>> +		do {
>>>>>>>>>> +			if (tegra_cpu_rail_off_ready())
>>>>>>>>>> +				return 0;
>>>>>>>>>> +
>>>>>>>>>> +		} while (ktime_before(ktime_get(), timeout));
>>>>>>>>>
>>>>>>>>> So this loop will aggresively call tegra_cpu_rail_off_ready() and retry 3
>>>>>>>>> times. The tegra_cpu_rail_off_ready() function can be called thoushand of times
>>>>>>>>> here but the function will hang 1.5s :/
>>>>>>>>>
>>>>>>>>> I suggest something like:
>>>>>>>>>
>>>>>>>>> 	while (retries--i && !tegra_cpu_rail_off_ready()) 
>>>>>>>>> 		udelay(100);
>>>>>>>>>
>>>>>>>>> So <retries> calls to tegra_cpu_rail_off_ready() and 100us x <retries> maximum
>>>>>>>>> impact.
>>>>>>>> But udelay() also results into CPU spinning in a busy-loop, and thus,
>>>>>>>> what's the difference?
>>>>>>>
>>>>>>> busy looping instead of register reads with all the hardware things involved behind.
>>>>>>
>>>>>> Please notice that this code runs only on an older Cortex-A9/A15, which
>>>>>> doesn't support WFE for the delaying, and thus, CPU always busy-loops
>>>>>> inside udelay().
>>>>>>
>>>>>> What about if I'll add cpu_relax() to the loop? Do you think it it could
>>>>>> have any positive effect?
>>>>>
>>>>> I think udelay() has a call to cpu_relax().
>>>>
>>>> Yes, my point is that udelay() doesn't bring much benefit for us here
>>>> because:
>>>>
>>>> 1. we want to enter into power-gated state as quick as possible and
>>>> udelay() just adds an unnecessary delay
>>>>
>>>> 2. udelay() spins in a busy-loop until delay is expired, just like we're
>>>> doing it in this function already
>>>
>>> In this case why not remove ktime_get() and increase the number of retries?
>>
>> Because the busy-loop performance depends on CPU's frequency, so we
>> can't rely on a bare number of the retries.
> 
> Why not if computed in the worst case scenario?

There are always at least a few dozens of microseconds to wait, so
something like udelay(10) should be a bit better variant anyways.

> Anyway, I'll let you give a try.
Turned out that udelay(10) is noticeably better when system is running
on a lower freqs in comparison to ktime_get(). I'll switch to udelay in
v10, thank you very much for the suggestion!
diff mbox series

Patch

diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c
index 9672c619f4bc..bcc158b72e67 100644
--- a/arch/arm/mach-tegra/cpuidle-tegra20.c
+++ b/arch/arm/mach-tegra/cpuidle-tegra20.c
@@ -83,14 +83,57 @@  static inline void tegra20_wake_cpu1_from_reset(void)
 }
 #endif
 
+static void tegra20_report_cpus_state(void)
+{
+	unsigned long cpu, lcpu, csr;
+
+	for_each_cpu(lcpu, cpu_possible_mask) {
+		cpu = cpu_logical_map(lcpu);
+		csr = flowctrl_read_cpu_csr(cpu);
+
+		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
+		       cpu, cpu_online(lcpu), csr);
+	}
+}
+
+static int tegra20_wait_for_secondary_cpu_parking(void)
+{
+	unsigned int retries = 3;
+
+	while (retries--) {
+		ktime_t timeout = ktime_add_ms(ktime_get(), 500);
+
+		/*
+		 * The primary CPU0 core shall wait for the secondaries
+		 * shutdown in order to power-off CPU's cluster safely.
+		 * The timeout value depends on the current CPU frequency,
+		 * it takes about 40-150us  in average and over 1000us in
+		 * a worst case scenario.
+		 */
+		do {
+			if (tegra_cpu_rail_off_ready())
+				return 0;
+
+		} while (ktime_before(ktime_get(), timeout));
+
+		pr_err("secondary CPU taking too long to park\n");
+
+		tegra20_report_cpus_state();
+	}
+
+	pr_err("timed out waiting secondaries to park\n");
+
+	return -ETIMEDOUT;
+}
+
 static bool tegra20_cpu_cluster_power_down(struct cpuidle_device *dev,
 					   struct cpuidle_driver *drv,
 					   int index)
 {
 	bool ret;
 
-	while (!tegra_cpu_rail_off_ready())
-		cpu_relax();
+	if (tegra20_wait_for_secondary_cpu_parking())
+		return false;
 
 	ret = !tegra_pm_enter_lp2();