
workqueue: Fix race in schedule and flush work

Message ID 20220210184319.25009-1-treasure4paddy@gmail.com (mailing list archive)
State New, archived
Series workqueue: Fix race in schedule and flush work

Commit Message

Padmanabha Srinivasaiah Feb. 10, 2022, 6:43 p.m. UTC
While secondary CPUs are booting, the online CPU mask is transient, which
can result in the call trace below. Using the same cpumask for both
scheduling and flushing the work resolves the issue.

[    0.377823] CPU3: Booted secondary processor 0x0000000003 [0x410fd083]
[    0.379040] ------------[ cut here ]------------
[    0.383662] WARNING: CPU: 0 PID: 10 at kernel/workqueue.c:3084 __flush_work+0x12c/0x138
[    0.384850] Modules linked in:
[    0.385403] CPU: 0 PID: 10 Comm: rcu_tasks_rude_ Not tainted 5.17.0-rc3-v8+ #13
[    0.386473] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT)
[    0.387289] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    0.388308] pc : __flush_work+0x12c/0x138
[    0.388970] lr : __flush_work+0x80/0x138
[    0.389620] sp : ffffffc00aaf3c60
[    0.390139] x29: ffffffc00aaf3d20 x28: ffffffc009c16af0 x27: ffffff80f761df48
[    0.391316] x26: 0000000000000004 x25: 0000000000000003 x24: 0000000000000100
[    0.392493] x23: ffffffffffffffff x22: ffffffc009c16b10 x21: ffffffc009c16b28
[    0.393668] x20: ffffffc009e53861 x19: ffffff80f77fbf40 x18: 00000000d744fcc9
[    0.394842] x17: 000000000000000b x16: 00000000000001c2 x15: ffffffc009e57550
[    0.396016] x14: 0000000000000000 x13: ffffffffffffffff x12: 0000000100000000
[    0.397190] x11: 0000000000000462 x10: ffffff8040258008 x9 : 0000000100000000
[    0.398364] x8 : 0000000000000000 x7 : ffffffc0093c8bf4 x6 : 0000000000000000
[    0.399538] x5 : 0000000000000000 x4 : ffffffc00a976e40 x3 : ffffffc00810444c
[    0.400711] x2 : 0000000000000004 x1 : 0000000000000000 x0 : 0000000000000000
[    0.401886] Call trace:
[    0.402309]  __flush_work+0x12c/0x138
[    0.402941]  schedule_on_each_cpu+0x228/0x278
[    0.403693]  rcu_tasks_rude_wait_gp+0x130/0x144
[    0.404502]  rcu_tasks_kthread+0x220/0x254
[    0.405264]  kthread+0x174/0x1ac
[    0.405837]  ret_from_fork+0x10/0x20
[    0.406456] irq event stamp: 102
[    0.406966] hardirqs last  enabled at (101): [<ffffffc0093c8468>] _raw_spin_unlock_irq+0x78/0xb4
[    0.408304] hardirqs last disabled at (102): [<ffffffc0093b8270>] el1_dbg+0x24/0x5c
[    0.409410] softirqs last  enabled at (54): [<ffffffc0081b80c8>] local_bh_enable+0xc/0x2c
[    0.410645] softirqs last disabled at (50): [<ffffffc0081b809c>] local_bh_disable+0xc/0x2c
[    0.411890] ---[ end trace 0000000000000000 ]---
[    0.413000] smp: Brought up 1 node, 4 CPUs
[    0.413762] SMP: Total of 4 processors activated.

Signed-off-by: Padmanabha Srinivasaiah <treasure4paddy@gmail.com>
---
 kernel/workqueue.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)
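
In outline, the pre-patch code path in schedule_on_each_cpu() looks like the
sketch below; the reading of the race is inferred from the trace above and
the discussion that follows, so treat it as a plausible explanation rather
than a statement from the patch itself:

	/* Schedule the work on every CPU that is online right now ... */
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
		schedule_work_on(cpu, work);
	}

	/*
	 * ... then flush on every CPU that is online at this later point.
	 * If a secondary CPU came online in between, this flushes a per-CPU
	 * work that was never initialized or queued, which is a plausible
	 * trigger for the WARN in __flush_work() seen in the trace above.
	 */
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));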

Comments

Tejun Heo Feb. 14, 2022, 7:43 p.m. UTC | #1
Hello,

> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 33f1106b4f99..a3f53f859e9d 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -3326,28 +3326,38 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
>   */
>  int schedule_on_each_cpu(work_func_t func)
>  {
> -	int cpu;
>  	struct work_struct __percpu *works;
> +	cpumask_var_t sched_cpumask;
> +	int cpu, ret = 0;
>  
> -	works = alloc_percpu(struct work_struct);
> -	if (!works)
> +	if (!alloc_cpumask_var(&sched_cpumask, GFP_KERNEL))
>  		return -ENOMEM;
>  
> +	works = alloc_percpu(struct work_struct);
> +	if (!works) {
> +		ret = -ENOMEM;
> +		goto free_cpumask;
> +	}
> +
>  	cpus_read_lock();
>  
> -	for_each_online_cpu(cpu) {
> +	cpumask_copy(sched_cpumask, cpu_online_mask);
> +	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask) {

This definitely would need a comment explaining what's going on cuz it looks
weird to be copying the cpumask which is supposed to stay stable due to the
cpus_read_lock(). Given that it can only happen during early boot and the
online cpus can only be expanding, maybe just add sth like:

        if (early_during_boot) {
                for_each_possible_cpu(cpu)
                        INIT_WORK(per_cpu_ptr(works, cpu), func);
        }

BTW, who's calling schedule_on_each_cpu() that early during boot. It makes
no sense to do this while the cpumasks can't be stabilized.

Thanks.
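
A minimal sketch of how that suggestion could slot into schedule_on_each_cpu(),
assuming system_state < SYSTEM_RUNNING as a stand-in for the early_during_boot
pseudocode above (the exact guard condition is an assumption, not something
taken from this thread):

int schedule_on_each_cpu(work_func_t func)
{
	int cpu;
	struct work_struct __percpu *works;

	works = alloc_percpu(struct work_struct);
	if (!works)
		return -ENOMEM;

	cpus_read_lock();

	/*
	 * During early boot the online mask can still grow while secondary
	 * CPUs are brought up, so a CPU may appear between the scheduling
	 * loop and the flush loop.  Initialize every possible CPU's work so
	 * that flushing a work that was never queued is a harmless no-op.
	 * (system_state < SYSTEM_RUNNING is an assumed stand-in for the
	 * "early_during_boot" pseudocode above.)
	 */
	if (system_state < SYSTEM_RUNNING)
		for_each_possible_cpu(cpu)
			INIT_WORK(per_cpu_ptr(works, cpu), func);

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
		schedule_work_on(cpu, work);
	}

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

	cpus_read_unlock();
	free_percpu(works);
	return 0;
}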
Padmanabha Srinivasaiah Feb. 16, 2022, 6:49 p.m. UTC | #2
On Mon, Feb 14, 2022 at 09:43:52AM -1000, Tejun Heo wrote:
> Hello,
> 
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index 33f1106b4f99..a3f53f859e9d 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -3326,28 +3326,38 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
> >   */
> >  int schedule_on_each_cpu(work_func_t func)
> >  {
> > -	int cpu;
> >  	struct work_struct __percpu *works;
> > +	cpumask_var_t sched_cpumask;
> > +	int cpu, ret = 0;
> >  
> > -	works = alloc_percpu(struct work_struct);
> > -	if (!works)
> > +	if (!alloc_cpumask_var(&sched_cpumask, GFP_KERNEL))
> >  		return -ENOMEM;
> >  
> > +	works = alloc_percpu(struct work_struct);
> > +	if (!works) {
> > +		ret = -ENOMEM;
> > +		goto free_cpumask;
> > +	}
> > +
> >  	cpus_read_lock();
> >  
> > -	for_each_online_cpu(cpu) {
> > +	cpumask_copy(sched_cpumask, cpu_online_mask);
> > +	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask) {
> 
> This definitely would need a comment explaining what's going on cuz it looks
> weird to be copying the cpumask which is supposed to stay stable due to the
> cpus_read_lock(). Given that it can only happen during early boot and the
> online cpus can only be expanding, maybe just add sth like:
> 
>         if (early_during_boot) {
>                 for_each_possible_cpu(cpu)
>                         INIT_WORK(per_cpu_ptr(works, cpu), func);
>         }
> 

Thanks, Tejun, for the reply and the suggestions.

Yes, unfortunately cpus_read_lock() is not keeping the cpumask stable during
secondary boot. Not sure, maybe it only guarantees that a 'cpu' doesn't go
down under cpus_read_[lock/unlock].

As suggested, I will try out something like:
	if (system_state != SYSTEM_RUNNING) {
		:
	}
> BTW, who's calling schedule_on_each_cpu() that early during boot. It makes
> no sense to do this while the cpumasks can't be stabilized.
>
It is the implementation of CONFIG_TASKS_RUDE_RCU.

> Thanks.
> 
> -- 
> tejun
Paul E. McKenney Feb. 16, 2022, 7:07 p.m. UTC | #3
On Wed, Feb 16, 2022 at 07:49:39PM +0100, Padmanabha Srinivasaiah wrote:
> On Mon, Feb 14, 2022 at 09:43:52AM -1000, Tejun Heo wrote:
> > Hello,
> > 
> > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > > index 33f1106b4f99..a3f53f859e9d 100644
> > > --- a/kernel/workqueue.c
> > > +++ b/kernel/workqueue.c
> > > @@ -3326,28 +3326,38 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
> > >   */
> > >  int schedule_on_each_cpu(work_func_t func)
> > >  {
> > > -	int cpu;
> > >  	struct work_struct __percpu *works;
> > > +	cpumask_var_t sched_cpumask;
> > > +	int cpu, ret = 0;
> > >  
> > > -	works = alloc_percpu(struct work_struct);
> > > -	if (!works)
> > > +	if (!alloc_cpumask_var(&sched_cpumask, GFP_KERNEL))
> > >  		return -ENOMEM;
> > >  
> > > +	works = alloc_percpu(struct work_struct);
> > > +	if (!works) {
> > > +		ret = -ENOMEM;
> > > +		goto free_cpumask;
> > > +	}
> > > +
> > >  	cpus_read_lock();
> > >  
> > > -	for_each_online_cpu(cpu) {
> > > +	cpumask_copy(sched_cpumask, cpu_online_mask);
> > > +	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask) {
> > 
> > This definitely would need a comment explaining what's going on cuz it looks
> > weird to be copying the cpumask which is supposed to stay stable due to the
> > cpus_read_lock(). Given that it can only happen during early boot and the
> > online cpus can only be expanding, maybe just add sth like:
> > 
> >         if (early_during_boot) {
> >                 for_each_possible_cpu(cpu)
> >                         INIT_WORK(per_cpu_ptr(works, cpu), func);
> >         }
> > 
> 
> Thanks, Tejun, for the reply and the suggestions.
> 
> Yes, unfortunately cpus_read_lock() is not keeping the cpumask stable during
> secondary boot. Not sure, maybe it only guarantees that a 'cpu' doesn't go
> down under cpus_read_[lock/unlock].
> 
> As suggested, I will try out something like:
> 	if (system_state != SYSTEM_RUNNING) {
> 		:
> 	}
> > BTW, who's calling schedule_on_each_cpu() that early during boot. It makes
> > no sense to do this while the cpumasks can't be stabilized.
> >
> It is the implementation of CONFIG_TASKS_RUDE_RCU.

Another option would be to adjust CONFIG_TASKS_RUDE_RCU based on where
things are in the boot process.  For example:

	// Wait for one rude RCU-tasks grace period.
	static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
	{
		if (num_online_cpus() <= 1)
			return;  // Fastpath for only one CPU.
		rtp->n_ipis += cpumask_weight(cpu_online_mask);
		schedule_on_each_cpu(rcu_tasks_be_rude);
	}

Easy enough either way!

							Thanx, Paul
Padmanabha Srinivasaiah Feb. 17, 2022, 3:39 p.m. UTC | #4
On Wed, Feb 16, 2022 at 11:07:00AM -0800, Paul E. McKenney wrote:
> On Wed, Feb 16, 2022 at 07:49:39PM +0100, Padmanabha Srinivasaiah wrote:
> > On Mon, Feb 14, 2022 at 09:43:52AM -1000, Tejun Heo wrote:
> > > Hello,
> > > 
> > > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > > > index 33f1106b4f99..a3f53f859e9d 100644
> > > > --- a/kernel/workqueue.c
> > > > +++ b/kernel/workqueue.c
> > > > @@ -3326,28 +3326,38 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
> > > >   */
> > > >  int schedule_on_each_cpu(work_func_t func)
> > > >  {
> > > > -	int cpu;
> > > >  	struct work_struct __percpu *works;
> > > > +	cpumask_var_t sched_cpumask;
> > > > +	int cpu, ret = 0;
> > > >  
> > > > -	works = alloc_percpu(struct work_struct);
> > > > -	if (!works)
> > > > +	if (!alloc_cpumask_var(&sched_cpumask, GFP_KERNEL))
> > > >  		return -ENOMEM;
> > > >  
> > > > +	works = alloc_percpu(struct work_struct);
> > > > +	if (!works) {
> > > > +		ret = -ENOMEM;
> > > > +		goto free_cpumask;
> > > > +	}
> > > > +
> > > >  	cpus_read_lock();
> > > >  
> > > > -	for_each_online_cpu(cpu) {
> > > > +	cpumask_copy(sched_cpumask, cpu_online_mask);
> > > > +	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask) {
> > > 
> > > This definitely would need a comment explaining what's going on cuz it looks
> > > weird to be copying the cpumask which is supposed to stay stable due to the
> > > cpus_read_lock(). Given that it can only happen during early boot and the
> > > online cpus can only be expanding, maybe just add sth like:
> > > 
> > >         if (early_during_boot) {
> > >                 for_each_possible_cpu(cpu)
> > >                         INIT_WORK(per_cpu_ptr(works, cpu), func);
> > >         }
> > > 
> > 
> > Thanks, Tejun, for the reply and the suggestions.
> > 
> > Yes, unfortunately cpus_read_lock() is not keeping the cpumask stable during
> > secondary boot. Not sure, maybe it only guarantees that a 'cpu' doesn't go
> > down under cpus_read_[lock/unlock].
> > 
> > As suggested, I will try out something like:
> > 	if (system_state != SYSTEM_RUNNING) {
> > 		:
> > 	}
> > > BTW, who's calling schedule_on_each_cpu() that early during boot. It makes
> > > no sense to do this while the cpumasks can't be stabilized.
> > >
> > It is the implementation of CONFIG_TASKS_RUDE_RCU.
> 
> Another option would be to adjust CONFIG_TASKS_RUDE_RCU based on where
> things are in the boot process.  For example:
> 
> 	// Wait for one rude RCU-tasks grace period.
> 	static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
> 	{
> 		if (num_online_cpus() <= 1)
> 			return;  // Fastpath for only one CPU.
> 		rtp->n_ipis += cpumask_weight(cpu_online_mask);
> 		schedule_on_each_cpu(rcu_tasks_be_rude);
> 	}
> 
> Easy enough either way!
> 
> 							Thanx, Paul

Thank you, Paul, for the suggestion; I tried the same and it fixes the issue.
I have submitted the same fix with your Suggested-by tag.

Link:
https://lore.kernel.org/all/20220217152520.18972-1-treasure4paddy@gmail.com/T/#t
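
For readers who do not follow the link: the v2 applies Paul's suggestion above,
adding a single-CPU fast path to rcu_tasks_rude_wait_gp() so that
schedule_on_each_cpu() is only reached once more than one CPU is online. In
outline (the exact patch at the link may differ from this sketch):

	// Wait for one rude RCU-tasks grace period.
	static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
	{
		if (num_online_cpus() <= 1)
			return;  // Fastpath for only one CPU.

		rtp->n_ipis += cpumask_weight(cpu_online_mask);
		schedule_on_each_cpu(rcu_tasks_be_rude);
	}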

Patch

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33f1106b4f99..a3f53f859e9d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3326,28 +3326,38 @@  EXPORT_SYMBOL(cancel_delayed_work_sync);
  */
 int schedule_on_each_cpu(work_func_t func)
 {
-	int cpu;
 	struct work_struct __percpu *works;
+	cpumask_var_t sched_cpumask;
+	int cpu, ret = 0;
 
-	works = alloc_percpu(struct work_struct);
-	if (!works)
+	if (!alloc_cpumask_var(&sched_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
+	works = alloc_percpu(struct work_struct);
+	if (!works) {
+		ret = -ENOMEM;
+		goto free_cpumask;
+	}
+
 	cpus_read_lock();
 
-	for_each_online_cpu(cpu) {
+	cpumask_copy(sched_cpumask, cpu_online_mask);
+	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu_and(cpu, sched_cpumask, cpu_online_mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
 	cpus_read_unlock();
 	free_percpu(works);
-	return 0;
+
+free_cpumask:
+	free_cpumask_var(sched_cpumask);
+	return ret;
 }
 
 /**