diff mbox

cpuidle: coupled: fix dead loop corner case

Message ID 1376975864-31487-1-git-send-email-zhangwm@marvell.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Neil Zhang Aug. 20, 2013, 5:17 a.m. UTC
There is a corner case when no peripheral irqs are routed to secondary
cores.
Let's take a dual core system for example; the sequence is as follows:

		Core 0			        Core1
1.			           set waiting bit and enter waiting loop
2. set waiting bit and poke core1
3. 				   clear poke in irq and enter safe state
4. set ready bit and enter ready loop

Since no peripheral irq is routed to core 1, it will stay in
safe state forever, and core 0 will dead loop in the following code.
	while (!cpuidle_coupled_cpus_ready(coupled)) {
		/* Check if any other cpus bailed out of idle. */
		if (!cpuidle_coupled_cpus_waiting(coupled))
	}

The solution is to not let the secondary core enter safe state when it has
already handled the poke interrupt.

Signed-off-by: Neil Zhang <zhangwm@marvell.com>
Reviewed-by: Fangsuo Wu <fswu@marvell.com>
---
 drivers/cpuidle/coupled.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

Comments

Rafael Wysocki Aug. 20, 2013, 12:36 p.m. UTC | #1
On Tuesday, August 20, 2013 01:17:44 PM Neil Zhang wrote:
> There is a corener case when no peripheral irqs route to secondary
> cores.
> Let's take dual core system for example, the sequence is as following:
> 
> 		Core 0			        Core1
> 1.			           set waiting bit and enter waiting loop
> 2. set waiting bit and poke core1
> 3. 				   clear poke in irq and enter safe state
> 4. set ready bit and enter ready loop
> 
> Since there is no peripheral irq route to core 1, so it will stay in
> safe state forever, and core 0 will dead loop in the following code.
> 	while (!cpuidle_coupled_cpus_ready(coupled)) {
> 		/* Check if any other cpus bailed out of idle. */
> 		if (!cpuidle_coupled_cpus_waiting(coupled))
> 	}
> 
> The solution is don't let secondary core enter safe state when it has
> already handled the poke interrupt.
> 
> Signed-off-by: Neil Zhang <zhangwm@marvell.com>
> Reviewed-by: Fangsuo Wu <fswu@marvell.com>

Daniel, can you please have a look at this?

Rafael


> ---
>  drivers/cpuidle/coupled.c |    7 +++++++
>  1 files changed, 7 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
> index 2a297f8..a37c718 100644
> --- a/drivers/cpuidle/coupled.c
> +++ b/drivers/cpuidle/coupled.c
> @@ -119,6 +119,7 @@ struct cpuidle_coupled {
>  #define CPUIDLE_COUPLED_NOT_IDLE	(-1)
>  
>  static DEFINE_MUTEX(cpuidle_coupled_lock);
> +static DEFINE_PER_CPU(bool, poke_sync);
>  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
>  
>  /*
> @@ -295,6 +296,7 @@ static void cpuidle_coupled_poked(void *info)
>  {
>  	int cpu = (unsigned long)info;
>  	cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);
> +	__this_cpu_write(poke_sync, true);
>  }
>  
>  /**
> @@ -473,6 +475,7 @@ retry:
>  	 * allowed for a single cpu.
>  	 */
>  	while (!cpuidle_coupled_cpus_waiting(coupled)) {
> +		__this_cpu_write(poke_sync, false);
>  		if (cpuidle_coupled_clear_pokes(dev->cpu)) {
>  			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
>  			goto out;
> @@ -483,6 +486,10 @@ retry:
>  			goto out;
>  		}
>  
> +		if (cpuidle_coupled_cpus_waiting(coupled)
> +			&& __this_cpu_read(poke_sync))
> +			break;
> +
>  		entered_state = cpuidle_enter_state(dev, drv,
>  			dev->safe_state_index);
>  	}
>
Neil Zhang Aug. 22, 2013, 10:11 a.m. UTC | #2
Daniel & Colin,

> -----Original Message-----

> From: Rafael J. Wysocki [mailto:rjw@sisk.pl]

> Sent: 2013年8月20日 20:37

> To: Neil Zhang; Daniel Lezcano

> Cc: linux-pm@vger.kernel.org; linux-kernel@vger.kernel.org

> Subject: Re: [PATCH] cpuidle: coupled: fix dead loop corner case

> 

> On Tuesday, August 20, 2013 01:17:44 PM Neil Zhang wrote:

> > There is a corener case when no peripheral irqs route to secondary

> > cores.

> > Let's take dual core system for example, the sequence is as following:

> >

> > 		Core 0			        Core1

> > 1.			           set waiting bit and enter waiting loop

> > 2. set waiting bit and poke core1

> > 3. 				   clear poke in irq and enter safe state

> > 4. set ready bit and enter ready loop

> >

> > Since there is no peripheral irq route to core 1, so it will stay in

> > safe state forever, and core 0 will dead loop in the following code.

> > 	while (!cpuidle_coupled_cpus_ready(coupled)) {

> > 		/* Check if any other cpus bailed out of idle. */

> > 		if (!cpuidle_coupled_cpus_waiting(coupled))

> > 	}

> >

> > The solution is don't let secondary core enter safe state when it has

> > already handled the poke interrupt.

> >

> > Signed-off-by: Neil Zhang <zhangwm@marvell.com>

> > Reviewed-by: Fangsuo Wu <fswu@marvell.com>

> 

> Daniel, can you please have a look at this?

> 

> Rafael

> 


What's your opinion?
Thanks.

> 

> > ---

> >  drivers/cpuidle/coupled.c |    7 +++++++

> >  1 files changed, 7 insertions(+), 0 deletions(-)

> >

> > diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c

> > index 2a297f8..a37c718 100644

> > --- a/drivers/cpuidle/coupled.c

> > +++ b/drivers/cpuidle/coupled.c

> > @@ -119,6 +119,7 @@ struct cpuidle_coupled {

> >  #define CPUIDLE_COUPLED_NOT_IDLE	(-1)

> >

> >  static DEFINE_MUTEX(cpuidle_coupled_lock);

> > +static DEFINE_PER_CPU(bool, poke_sync);

> >  static DEFINE_PER_CPU(struct call_single_data,

> > cpuidle_coupled_poke_cb);

> >

> >  /*

> > @@ -295,6 +296,7 @@ static void cpuidle_coupled_poked(void *info)  {

> >  	int cpu = (unsigned long)info;

> >  	cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);

> > +	__this_cpu_write(poke_sync, true);

> >  }

> >

> >  /**

> > @@ -473,6 +475,7 @@ retry:

> >  	 * allowed for a single cpu.

> >  	 */

> >  	while (!cpuidle_coupled_cpus_waiting(coupled)) {

> > +		__this_cpu_write(poke_sync, false);

> >  		if (cpuidle_coupled_clear_pokes(dev->cpu)) {

> >  			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);

> >  			goto out;

> > @@ -483,6 +486,10 @@ retry:

> >  			goto out;

> >  		}

> >

> > +		if (cpuidle_coupled_cpus_waiting(coupled)

> > +			&& __this_cpu_read(poke_sync))

> > +			break;

> > +

> >  		entered_state = cpuidle_enter_state(dev, drv,

> >  			dev->safe_state_index);

> >  	}

> >

> --

> I speak only for myself.

> Rafael J. Wysocki, Intel Open Source Technology Center.


Best Regards,
Neil Zhang
Colin Cross Aug. 22, 2013, 9:08 p.m. UTC | #3
On Mon, Aug 19, 2013 at 10:17 PM, Neil Zhang <zhangwm@marvell.com> wrote:
> There is a corener case when no peripheral irqs route to secondary
> cores.
> Let's take dual core system for example, the sequence is as following:
>
>                 Core 0                          Core1
> 1.                                 set waiting bit and enter waiting loop
> 2. set waiting bit and poke core1
> 3.                                 clear poke in irq and enter safe state
> 4. set ready bit and enter ready loop
>
> Since there is no peripheral irq route to core 1, so it will stay in
> safe state forever, and core 0 will dead loop in the following code.
>         while (!cpuidle_coupled_cpus_ready(coupled)) {
>                 /* Check if any other cpus bailed out of idle. */
>                 if (!cpuidle_coupled_cpus_waiting(coupled))
>         }
>
> The solution is don't let secondary core enter safe state when it has
> already handled the poke interrupt.
>
> Signed-off-by: Neil Zhang <zhangwm@marvell.com>
> Reviewed-by: Fangsuo Wu <fswu@marvell.com>
> ---
>  drivers/cpuidle/coupled.c |    7 +++++++
>  1 files changed, 7 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
> index 2a297f8..a37c718 100644
> --- a/drivers/cpuidle/coupled.c
> +++ b/drivers/cpuidle/coupled.c
> @@ -119,6 +119,7 @@ struct cpuidle_coupled {
>  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
>
>  static DEFINE_MUTEX(cpuidle_coupled_lock);
> +static DEFINE_PER_CPU(bool, poke_sync);
>  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
>
>  /*
> @@ -295,6 +296,7 @@ static void cpuidle_coupled_poked(void *info)
>  {
>         int cpu = (unsigned long)info;
>         cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);
> +       __this_cpu_write(poke_sync, true);
>  }
>
>  /**
> @@ -473,6 +475,7 @@ retry:
>          * allowed for a single cpu.
>          */
>         while (!cpuidle_coupled_cpus_waiting(coupled)) {
> +               __this_cpu_write(poke_sync, false);
>                 if (cpuidle_coupled_clear_pokes(dev->cpu)) {
>                         cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
>                         goto out;
> @@ -483,6 +486,10 @@ retry:
>                         goto out;
>                 }
>
> +               if (cpuidle_coupled_cpus_waiting(coupled)
> +                       && __this_cpu_read(poke_sync))
> +                       break;
> +
>                 entered_state = cpuidle_enter_state(dev, drv,
>                         dev->safe_state_index);
>         }
> --
> 1.7.4.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

I have a similar patch that avoids adding another check for
cpuidle_coupled_cpus_waiting, and uses the return value from
cpuidle_coupled_clear_pokes instead of adding a percpu bool.  I will
post it shortly.

Do you have a test case that can reproduce this easily?
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Zhang Aug. 23, 2013, 3:17 a.m. UTC | #4
> -----Original Message-----

> From: Colin Cross [mailto:ccross@google.com]

> Sent: 2013年8月23日 5:08

> To: Neil Zhang

> Cc: Rafael J. Wysocki; Daniel Lezcano; Linux PM list; lkml

> Subject: Re: [PATCH] cpuidle: coupled: fix dead loop corner case

> 

> On Mon, Aug 19, 2013 at 10:17 PM, Neil Zhang <zhangwm@marvell.com>

> wrote:

> > There is a corener case when no peripheral irqs route to secondary

> > cores.

> > Let's take dual core system for example, the sequence is as following:

> >

> >                 Core 0                          Core1

> > 1.                                 set waiting bit and enter waiting

> loop

> > 2. set waiting bit and poke core1

> > 3.                                 clear poke in irq and enter safe

> state

> > 4. set ready bit and enter ready loop

> >

> > Since there is no peripheral irq route to core 1, so it will stay in

> > safe state forever, and core 0 will dead loop in the following code.

> >         while (!cpuidle_coupled_cpus_ready(coupled)) {

> >                 /* Check if any other cpus bailed out of idle. */

> >                 if (!cpuidle_coupled_cpus_waiting(coupled))

> >         }

> >

> > The solution is don't let secondary core enter safe state when it has

> > already handled the poke interrupt.

> >

> > Signed-off-by: Neil Zhang <zhangwm@marvell.com>

> > Reviewed-by: Fangsuo Wu <fswu@marvell.com>

> > ---

> >  drivers/cpuidle/coupled.c |    7 +++++++

> >  1 files changed, 7 insertions(+), 0 deletions(-)

> >

> > diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c

> > index 2a297f8..a37c718 100644

> > --- a/drivers/cpuidle/coupled.c

> > +++ b/drivers/cpuidle/coupled.c

> > @@ -119,6 +119,7 @@ struct cpuidle_coupled {

> >  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)

> >

> >  static DEFINE_MUTEX(cpuidle_coupled_lock);

> > +static DEFINE_PER_CPU(bool, poke_sync);

> >  static DEFINE_PER_CPU(struct call_single_data,

> > cpuidle_coupled_poke_cb);

> >

> >  /*

> > @@ -295,6 +296,7 @@ static void cpuidle_coupled_poked(void *info)  {

> >         int cpu = (unsigned long)info;

> >         cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);

> > +       __this_cpu_write(poke_sync, true);

> >  }

> >

> >  /**

> > @@ -473,6 +475,7 @@ retry:

> >          * allowed for a single cpu.

> >          */

> >         while (!cpuidle_coupled_cpus_waiting(coupled)) {

> > +               __this_cpu_write(poke_sync, false);

> >                 if (cpuidle_coupled_clear_pokes(dev->cpu)) {

> >                         cpuidle_coupled_set_not_waiting(dev->cpu,

> coupled);

> >                         goto out;

> > @@ -483,6 +486,10 @@ retry:

> >                         goto out;

> >                 }

> >

> > +               if (cpuidle_coupled_cpus_waiting(coupled)

> > +                       && __this_cpu_read(poke_sync))

> > +                       break;

> > +

> >                 entered_state = cpuidle_enter_state(dev, drv,

> >                         dev->safe_state_index);

> >         }

> > --

> > 1.7.4.1

> >

> > --

> > To unsubscribe from this list: send the line "unsubscribe

> > linux-kernel" in the body of a message to majordomo@vger.kernel.org

> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

> > Please read the FAQ at  http://www.tux.org/lkml/

> 

> I have a similar patch that avoids adding another check for

> cpuidle_coupled_cpus_waiting, and uses the return value from

> cpuidle_coupled_clear_pokes instead of adding a percpu bool.  I will post it

> shortly.

> 

> Do you have a test case that can reproduce this easily?


It's not easy to reproduce.
We have only caught it once so far.

Best Regards,
Neil Zhang
diff mbox

Patch

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 2a297f8..a37c718 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,6 +119,7 @@  struct cpuidle_coupled {
 #define CPUIDLE_COUPLED_NOT_IDLE	(-1)
 
 static DEFINE_MUTEX(cpuidle_coupled_lock);
+static DEFINE_PER_CPU(bool, poke_sync);
 static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
 
 /*
@@ -295,6 +296,7 @@  static void cpuidle_coupled_poked(void *info)
 {
 	int cpu = (unsigned long)info;
 	cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);
+	__this_cpu_write(poke_sync, true);
 }
 
 /**
@@ -473,6 +475,7 @@  retry:
 	 * allowed for a single cpu.
 	 */
 	while (!cpuidle_coupled_cpus_waiting(coupled)) {
+		__this_cpu_write(poke_sync, false);
 		if (cpuidle_coupled_clear_pokes(dev->cpu)) {
 			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
 			goto out;
@@ -483,6 +486,10 @@  retry:
 			goto out;
 		}
 
+		if (cpuidle_coupled_cpus_waiting(coupled)
+			&& __this_cpu_read(poke_sync))
+			break;
+
 		entered_state = cpuidle_enter_state(dev, drv,
 			dev->safe_state_index);
 	}