diff mbox series

tracing/osnoise: Fix possible recursive locking for cpus_read_lock()

Message ID 20250225123132.2583820-1-ranxiaokai627@163.com (mailing list archive)
State Changes Requested
Headers show
Series tracing/osnoise: Fix possible recursive locking for cpus_read_lock() | expand

Commit Message

Ran Xiaokai Feb. 25, 2025, 12:31 p.m. UTC
From: Ran Xiaokai <ran.xiaokai@zte.com.cn>

Lockdep reports this deadlock log:
============================================
WARNING: possible recursive locking detected
--------------------------------------------
sh/31444 is trying to acquire lock:
ffffffff82c51af0 (cpu_hotplug_lock){++++}-{0:0}, at:
stop_per_cpu_kthreads+0x7/0x60

but task is already holding lock:
ffffffff82c51af0 (cpu_hotplug_lock){++++}-{0:0}, at:
start_per_cpu_kthreads+0x28/0x140

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(cpu_hotplug_lock);
  lock(cpu_hotplug_lock);

Call Trace:
 <TASK>
 __lock_acquire+0x1612/0x29b0
 lock_acquire+0xd0/0x2e0
 cpus_read_lock+0x49/0x120
 stop_per_cpu_kthreads+0x7/0x60
 start_kthread+0x105/0x120
 start_per_cpu_kthreads+0xdd/0x140
 osnoise_workload_start+0x261/0x2f0
 osnoise_tracer_start+0x18/0x4

In start_kthread(), when kthread_run_on_cpu() fails,
cpus_read_unlock() should be called before stop_per_cpu_kthreads(),
but both start_per_cpu_kthreads() and start_kthread() call the error
handling routine stop_per_cpu_kthreads(),
which is redundant. Only one call is necessary.
To fix this, move the stop_per_cpu_kthreads() call out of
start_kthread() and use the return value of start_kthread() to
detect kthread creation failures.
The same issue exists in osnoise_hotplug_workfn() too.

Reviewed-by: Yang Guang <yang.guang5@zte.com.cn>
Reviewed-by: Wang Yong <wang.yong12@zte.com.cn>
Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
---
 kernel/trace/trace_osnoise.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

 		return;
@@ -2105,7 +2104,12 @@ static void osnoise_hotplug_workfn(struct
work_struct *dummy)
 	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
 		return;
 
-	start_kthread(cpu);
+	if (start_kthread(cpu)) {
+		cpus_read_unlock();
+		stop_per_cpu_kthreads();
+		return;
+	}
+	cpus_read_unlock();
 }
 
 static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);

Comments

Steven Rostedt Feb. 25, 2025, 4:30 p.m. UTC | #1
On Tue, 25 Feb 2025 12:31:32 +0000
Ran Xiaokai <ranxiaokai627@163.com> wrote:

> @@ -2097,7 +2096,7 @@ static void osnoise_hotplug_workfn(struct
> work_struct *dummy)
>  		return;
>  
>  	guard(mutex)(&interface_lock);
> -	guard(cpus_read_lock)();
> +	cpus_read_lock();
>  
>  	if (!cpu_online(cpu))
>  		return;

This is buggy. You removed the guard, and right below we have an error exit
that will leave this function without unlocking the cpus_read_lock().

> @@ -2105,7 +2104,12 @@ static void osnoise_hotplug_workfn(struct
> work_struct *dummy)
>  	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
>  		return;
>  
> -	start_kthread(cpu);
> +	if (start_kthread(cpu)) {
> +		cpus_read_unlock();
> +		stop_per_cpu_kthreads();
> +		return;

If all you want to do is to unlock before calling stop_per_cpu_kthreads(),
then this should simply be:

	if (start_kthread(cpu)) {
		cpus_read_unlock();
		stop_per_cpu_kthreads();
		cpus_read_lock(); // The guard() above will unlock this
		return;
	}


But I still have to verify that this is indeed the issue here.

-- Steve


> +	}
> +	cpus_read_unlock();
>  }
>  
>  static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
Ran Xiaokai Feb. 26, 2025, 3:42 a.m. UTC | #2
>On Tue, 25 Feb 2025 12:31:32 +0000
>Ran Xiaokai <ranxiaokai627@163.com> wrote:
>
>> @@ -2097,7 +2096,7 @@ static void osnoise_hotplug_workfn(struct
>> work_struct *dummy)
>>          return;
>>  
>>      guard(mutex)(&interface_lock);
>> -    guard(cpus_read_lock)();
>> +    cpus_read_lock();
>>  
>>      if (!cpu_online(cpu))
>>          return;
>
>This is buggy. You removed the guard, and right below we have an error exit
>that will leave this function without unlocking the cpus_read_lock().

Indeed.
I will run the LTP cpu-hotplug testcases before the next version.

>> @@ -2105,7 +2104,12 @@ static void osnoise_hotplug_workfn(struct
>> work_struct *dummy)
>>      if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
>>          return;
>>  
>> -    start_kthread(cpu);
>> +    if (start_kthread(cpu)) {
>> +        cpus_read_unlock();
>> +        stop_per_cpu_kthreads();
>> +        return;
>
>If all you want to do is to unlock before calling stop_per_cpu_kthreads(),
>then this should simply be:
>
>   if (start_kthread(cpu)) {
>       cpus_read_unlock();
>       stop_per_cpu_kthreads();
>       cpus_read_lock(); // The guard() above will unlock this
>       return;
>   }

This is the deadlock scenario:
start_per_cpu_kthreads()
  cpus_read_lock();                  // first lock call
  start_kthread(cpu)
    ... kthread_run_on_cpu() fails:
    if (IS_ERR(kthread)) {
      stop_per_cpu_kthreads(); {
        cpus_read_lock();      // second lock call. Causes the AA deadlock scenario
      }
    }
  stop_per_cpu_kthreads();

Besides, stop_per_cpu_kthreads() is called both in start_per_cpu_kthreads() and
start_kthread() which is unnecessary.

So the fix should be inside start_kthread()?
How about this ?

--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2029,7 +2029,9 @@ static int start_kthread(unsigned int cpu)

        if (IS_ERR(kthread)) {
                pr_err(BANNER "could not start sampling thread\n");
+               cpus_read_unlock();
                stop_per_cpu_kthreads();
+               cpus_read_lock();
                return -ENOMEM;
        }

@@ -2076,7 +2078,6 @@ static int start_per_cpu_kthreads(void)
                retval = start_kthread(cpu);
                if (retval) {
                        cpus_read_unlock();
-                       stop_per_cpu_kthreads();
                        return retval;
                }
        }

>
>But I still have to verify that this is indeed the issue here.
>
>-- Steve
>
>
>> +    }
>> +    cpus_read_unlock();
>>  }
>>  
>>  static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
Steven Rostedt Feb. 26, 2025, 3:05 p.m. UTC | #3
On Wed, 26 Feb 2025 03:42:53 +0000
Ran Xiaokai <ranxiaokai627@163.com> wrote:

> >> @@ -2105,7 +2104,12 @@ static void osnoise_hotplug_workfn(struct
> >> work_struct *dummy)
> >>      if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
> >>          return;
> >>  
> >> -    start_kthread(cpu);
> >> +    if (start_kthread(cpu)) {
> >> +        cpus_read_unlock();
> >> +        stop_per_cpu_kthreads();
> >> +        return;  
> >
> >If all you want to do is to unlock before calling stop_per_cpu_kthreads(),
> >then this should simply be:
> >
> >   if (start_kthread(cpu)) {
> >       cpus_read_unlock();
> >       stop_per_cpu_kthreads();
> >       cpus_read_lock(); // The guard() above will unlock this
> >       return;
> >   }  
> 
> This is the deadlock scenario:
> start_per_cpu_kthreads()
>   cpus_read_lock();                  // first lock call
>   start_kthread(cpu)
>     ... kthread_run_on_cpu() fails:
>     if (IS_ERR(kthread)) {
>       stop_per_cpu_kthreads(); {
>       cpus_read_lock();      // second lock call. Causes the AA deadlock scenario
>       }
>     }
>   stop_per_cpu_kthreads();
> 
> Besides, stop_per_cpu_kthreads() is called both in start_per_cpu_kthreads() and
> start_kthread() which is unnecessary.
> 
> So the fix should be inside start_kthread()?
> How about this ?

No! You misunderstood what I wrote above.

Instead of removing the guard, keep it!

Do everything the same, but instead of having the error path of:

[..]
    if (start_kthread(cpu)) {
        cpus_read_unlock();
        stop_per_cpu_kthreads();
        return;
    }
    cpus_read_unlock();
 }

Which requires removing the guard. Just do:

    if (start_kthread(cpu)) {
        cpus_read_unlock();
        stop_per_cpu_kthreads();
        cpus_read_lock(); // The guard() will unlock this
    }
 }

I'm just saying to not replace the guard with open coded locking of
cpus_read_lock().

-- Steve
Vishal Chourasia Feb. 27, 2025, 7:40 a.m. UTC | #4
Hi,

On Tue, Feb 25, 2025 at 12:31:32PM +0000, Ran Xiaokai wrote:
> From: Ran Xiaokai <ran.xiaokai@zte.com.cn>
> 
> Lockdep reports this deadlock log:
> ============================================
> WARNING: possible recursive locking detected
> --------------------------------------------
> sh/31444 is trying to acquire lock:
> ffffffff82c51af0 (cpu_hotplug_lock){++++}-{0:0}, at:
> stop_per_cpu_kthreads+0x7/0x60
> 
> but task is already holding lock:
> ffffffff82c51af0 (cpu_hotplug_lock){++++}-{0:0}, at:
> start_per_cpu_kthreads+0x28/0x140
> 
> other info that might help us debug this:
>  Possible unsafe locking scenario:
> 
>        CPU0
>        ----
>   lock(cpu_hotplug_lock);
>   lock(cpu_hotplug_lock);
> 
> Call Trace:
>  <TASK>
>  __lock_acquire+0x1612/0x29b0
>  lock_acquire+0xd0/0x2e0
>  cpus_read_lock+0x49/0x120
>  stop_per_cpu_kthreads+0x7/0x60
>  start_kthread+0x105/0x120
>  start_per_cpu_kthreads+0xdd/0x140
>  osnoise_workload_start+0x261/0x2f0
>  osnoise_tracer_start+0x18/0x4
> 
> In start_kthread(), when kthread_run_on_cpu() fails,
> cpus_read_unlock() should be called before stop_per_cpu_kthreads(),
> but both start_per_cpu_kthreads() and start_kthread() call the error
> handling routine stop_per_cpu_kthreads(),
> which is redundant. Only one call is necessary.
> To fix this, move stop_per_cpu_kthreads() outside of start_kthread(),
> use the return value of start_kthread() to determine kthread creation
> error.
> The same issue exists in osnoise_hotplug_workfn() too.
> 
> Reviewed-by: Yang Guang <yang.guang5@zte.com.cn>
> Reviewed-by: Wang Yong <wang.yong12@zte.com.cn>
> Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
> ---
>  kernel/trace/trace_osnoise.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
> index 92e16f03fa4e..38fb0c655f5b 100644
> --- a/kernel/trace/trace_osnoise.c
> +++ b/kernel/trace/trace_osnoise.c
> @@ -2029,7 +2029,6 @@ static int start_kthread(unsigned int cpu)
>  
>  	if (IS_ERR(kthread)) {
>  		pr_err(BANNER "could not start sampling thread\n");
> -		stop_per_cpu_kthreads();
>  		return -ENOMEM;
>  	}
>  
> @@ -2097,7 +2096,7 @@ static void osnoise_hotplug_workfn(struct
> work_struct *dummy)
>  		return;
>  
>  	guard(mutex)(&interface_lock);
> -	guard(cpus_read_lock)();
> +	cpus_read_lock();
>  
>  	if (!cpu_online(cpu))
>  		return;
> @@ -2105,7 +2104,12 @@ static void osnoise_hotplug_workfn(struct
> work_struct *dummy)
>  	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
>  		return;
>  
> -	start_kthread(cpu);
> +	if (start_kthread(cpu)) {
> +		cpus_read_unlock();
> +		stop_per_cpu_kthreads();

Is it right to call stop_per_cpu_kthreads() which stops osnoise kthread
for every other CPUs in the system if a failure occurs during hotplug of a
CPU?

On another note, since stop_per_cpu_kthreads() invokes stop_kthread()
for every online CPU. It's better to remove stop_per_cpu_kthreads() from
start_kthread() and handle the error in `osnoise_hotplug_workfn` 

	Vishal
> +		return;
> +	}
> +	cpus_read_unlock();
>  }
>  
>  static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
> -- 
> 2.15.2
>
diff mbox series

Patch

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 92e16f03fa4e..38fb0c655f5b 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2029,7 +2029,6 @@  static int start_kthread(unsigned int cpu)
 
 	if (IS_ERR(kthread)) {
 		pr_err(BANNER "could not start sampling thread\n");
-		stop_per_cpu_kthreads();
 		return -ENOMEM;
 	}
 
@@ -2097,7 +2096,7 @@  static void osnoise_hotplug_workfn(struct
work_struct *dummy)
 		return;
 
 	guard(mutex)(&interface_lock);
-	guard(cpus_read_lock)();
+	cpus_read_lock();
 
 	if (!cpu_online(cpu))