
sched/schedutil: Fix deadlock between cpuset and cpu hotplug when using schedutil

Message ID 20220705123705.764-1-xuewen.yan@unisoc.com (mailing list archive)
State Handled Elsewhere, archived
Series sched/schedutil: Fix deadlock between cpuset and cpu hotplug when using schedutil

Commit Message

Xuewen Yan July 5, 2022, 12:37 p.m. UTC
There is a deadlock between the cpu_hotplug_lock and the cgroup_threadgroup_rwsem.
Both the cpu online and the cpu offline paths can trigger it:

[1] cpu online:

1. First, thread-A acquires the cgroup_threadgroup_rwsem and waits for the cpu_hotplug_lock:

  [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
  [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
  [ffffffc00f9c3910] schedule at ffffffc009c82b50
  [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
  [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
  [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
  [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
  [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
  [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
  [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
  [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
  [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
  [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
  [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
  [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
  [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
  [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
  [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
  [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
  [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
  [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
  [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44

2. Thread-B acquires the cpu_hotplug_lock and waits for cpuhp/x:

  [ffffffc01245b740] __switch_to at ffffffc0081229d4
  [ffffffc01245b7a0] __schedule at ffffffc009c824f8
  [ffffffc01245b800] schedule at ffffffc009c82b50
  [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
  [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
  [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
  [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
  [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
  [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
  [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
  [ffffffc01245bb40] cpu_up at ffffffc0081dd100
  [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
  [ffffffc01245bb90] device_online at ffffffc008d77dec
  [ffffffc01245bbd0] online_store at ffffffc008d77bb8
  [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
  [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
  [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
  [ffffffc01245bd50] vfs_write at ffffffc008607a8c
  [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
  [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
  [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
  [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
  [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
  [ffffffc01245be80] el0_svc at ffffffc0098836c8
  [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
  [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44

3. cpuhp/x calls cpufreq_online(), and schedutil needs to create its kthread, so it has to wait for kthreadd:

  [ffffffc00b653860] __switch_to at ffffffc0081229d4
  [ffffffc00b6538c0] __schedule at ffffffc009c824f8
  [ffffffc00b653920] schedule at ffffffc009c82b50
  [ffffffc00b6539a0] schedule_timeout at ffffffc009c8a144
  [ffffffc00b653a00] wait_for_common at ffffffc009c83dac     <<< wait for kthreadd
  [ffffffc00b653ad0] __kthread_create_on_node at ffffffc0082296b8
  [ffffffc00b653b90] kthread_create_on_node at ffffffc008229a58
  [ffffffc00b653bb0] sugov_init at ffffffc001d01414
  [ffffffc00b653c00] cpufreq_init_governor at ffffffc009179a48
  [ffffffc00b653c50] cpufreq_set_policy at ffffffc009179360
  [ffffffc00b653cb0] cpufreq_online at ffffffc00917c3e4
  [ffffffc00b653d10] cpuhp_cpufreq_online at ffffffc00917ee3c
  [ffffffc00b653d70] cpuhp_invoke_callback at ffffffc0081dc000
  [ffffffc00b653dd0] cpuhp_thread_fun at ffffffc0081df47c
  [ffffffc00b653e10] smpboot_thread_fn at ffffffc008234718
  [ffffffc00b653e70] kthread at ffffffc00822ac84

4. kthreadd is trying to acquire the cgroup_threadgroup_rwsem:

  [ffffffc008073a70] __switch_to at ffffffc0081229d4
  [ffffffc008073ad0] __schedule at ffffffc009c824f8
  [ffffffc008073b30] schedule at ffffffc009c82b50
  [ffffffc008073b90] percpu_rwsem_wait at ffffffc00828fbcc  <<< cgroup_threadgroup_rwsem
  [ffffffc008073bd0] __percpu_down_read at ffffffc008290180
  [ffffffc008073c00] cgroup_css_set_fork at ffffffc008357ecc
  [ffffffc008073c60] cgroup_can_fork at ffffffc008357bb4
  [ffffffc008073d00] copy_process at ffffffc0081d18c4
  [ffffffc008073d90] kernel_clone at ffffffc0081d088c
  [ffffffc008073e50] kthreadd at ffffffc00822a8e4

Finally, Thread-A and Thread-B are blocked forever.

[2] cpu offline:

1. First, thread-A acquires the cgroup_threadgroup_rwsem and waits for the cpu_hotplug_lock:

  [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
  [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
  [ffffffc00f9c3910] schedule at ffffffc009c82b50
  [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
  [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
  [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
  [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
  [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
  [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
  [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
  [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
  [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
  [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
  [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
  [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
  [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
  [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
  [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
  [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
  [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
  [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
  [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44

2. Thread-B acquires the cpu_hotplug_lock and waits for cpuhp/x:

  [ffffffc01245b740] __switch_to at ffffffc0081229d4
  [ffffffc01245b7a0] __schedule at ffffffc009c824f8
  [ffffffc01245b800] schedule at ffffffc009c82b50
  [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
  [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
  [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
  [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
  [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
  [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
  [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
  [ffffffc01245bb40] cpu_up at ffffffc0081dd100
  [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
  [ffffffc01245bb90] device_online at ffffffc008d77dec
  [ffffffc01245bbd0] online_store at ffffffc008d77bb8
  [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
  [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
  [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
  [ffffffc01245bd50] vfs_write at ffffffc008607a8c
  [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
  [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
  [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
  [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
  [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
  [ffffffc01245be80] el0_svc at ffffffc0098836c8
  [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
  [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44

3. cpuhp/x calls cpufreq_offline(), and schedutil needs to stop its kthread:

  [ffffffc00b683a60] __switch_to at ffffffc0081229d4
  [ffffffc00b683ac0] __schedule at ffffffc009c824f8
  [ffffffc00b683b20] schedule at ffffffc009c82b50
  [ffffffc00b683ba0] schedule_timeout at ffffffc009c8a144
  [ffffffc00b683c00] wait_for_common at ffffffc009c83dac <<< waiting for sugov complete
  [ffffffc00b683c60] kthread_stop at ffffffc008228128   <<< wakeup sugov
  [ffffffc00b683c90] sugov_exit at ffffffc001d016a8
  [ffffffc00b683cc0] cpufreq_offline at ffffffc00917b4b8
  [ffffffc00b683d10] cpuhp_cpufreq_offline at ffffffc00917ee70
  [ffffffc00b683d70] cpuhp_invoke_callback at ffffffc0081dc000
  [ffffffc00b683dd0] cpuhp_thread_fun at ffffffc0081df47c
  [ffffffc00b683e10] smpboot_thread_fn at ffffffc008234718
  [ffffffc00b683e70] kthread at ffffffc00822ac84

4. The sugov thread is waiting for the cgroup_threadgroup_rwsem:

  [ffffffc01258bc10] __switch_to at ffffffc0081229d4
  [ffffffc01258bc70] __schedule at ffffffc009c824f8
  [ffffffc01258bcd0] schedule at ffffffc009c82b50
  [ffffffc01258bd30] percpu_rwsem_wait at ffffffc00828fbcc  <<< wait for cgroup_threadgroup_rwsem
  [ffffffc01258bd70] __percpu_down_read at ffffffc008290180
  [ffffffc01258bdb0] exit_signals at ffffffc0082074c8
  [ffffffc01258be10] do_exit at ffffffc0081e5130
  [ffffffc01258be70] kthread at ffffffc00822ac8c

Finally, Thread-A and Thread-B are blocked forever.

In both scenarios above, Thread-A and Thread-B are blocked because cpuhp/x is blocked.
If cpuhp/x never blocks, Thread-A and Thread-B can make progress again.
So let schedutil avoid creating or destroying its kthread in the hotplug path, so that cpuhp/x is never blocked.

Initialize the sugov kthread only once to prevent the above deadlock.

Signed-off-by: Ke Wang <ke.wang@unisoc.com>
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
---
 kernel/sched/cpufreq_schedutil.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

Comments

Xuewen Yan July 11, 2022, 7:21 a.m. UTC | #1
Hi all

This deadlock is inevitable, any comment?

On Tue, Jul 5, 2022 at 11:03 PM Xuewen Yan <xuewen.yan@unisoc.com> wrote:
>
> There is a deadlock between cpu_hotplug_lock sem and cgroup_threadgroup_rwsem.
> And both cpu online and cpu offline scenarios will cause deadlock:
>
> [1] cpu online:
>
> 1. first, thread-A get the cgroup_threadgroup_rwsem and waiting foe the cpu_hotplug_lock:
>
>   [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
>   [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
>   [ffffffc00f9c3910] schedule at ffffffc009c82b50
>   [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
>   [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
>   [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
>   [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
>   [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
>   [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
>   [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
>   [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
>   [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
>   [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
>   [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
>   [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
>   [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
>   [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
>   [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
>   [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
>   [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44
>
> 2. Thread-B get the cpu_hotplug_lock and waiting for cpuhp/x:
>
>   [ffffffc01245b740] __switch_to at ffffffc0081229d4
>   [ffffffc01245b7a0] __schedule at ffffffc009c824f8
>   [ffffffc01245b800] schedule at ffffffc009c82b50
>   [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
>   [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
>   [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
>   [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
>   [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
>   [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
>   [ffffffc01245bb40] cpu_up at ffffffc0081dd100
>   [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
>   [ffffffc01245bb90] device_online at ffffffc008d77dec
>   [ffffffc01245bbd0] online_store at ffffffc008d77bb8
>   [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
>   [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
>   [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc01245bd50] vfs_write at ffffffc008607a8c
>   [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
>   [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
>   [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
>   [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
>   [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
>   [ffffffc01245be80] el0_svc at ffffffc0098836c8
>   [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44
>
> 3. cpuhp/x would call cpufreq_online and the schedutil need create kthread, as a result it need wait for kthreadd:
>
>   [ffffffc00b653860] __switch_to at ffffffc0081229d4
>   [ffffffc00b6538c0] __schedule at ffffffc009c824f8
>   [ffffffc00b653920] schedule at ffffffc009c82b50
>   [ffffffc00b6539a0] schedule_timeout at ffffffc009c8a144
>   [ffffffc00b653a00] wait_for_common at ffffffc009c83dac     <<< wait for kthreadd
>   [ffffffc00b653ad0] __kthread_create_on_node at ffffffc0082296b8
>   [ffffffc00b653b90] kthread_create_on_node at ffffffc008229a58
>   [ffffffc00b653bb0] sugov_init at ffffffc001d01414
>   [ffffffc00b653c00] cpufreq_init_governor at ffffffc009179a48
>   [ffffffc00b653c50] cpufreq_set_policy at ffffffc009179360
>   [ffffffc00b653cb0] cpufreq_online at ffffffc00917c3e4
>   [ffffffc00b653d10] cpuhp_cpufreq_online at ffffffc00917ee3c
>   [ffffffc00b653d70] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc00b653dd0] cpuhp_thread_fun at ffffffc0081df47c
>   [ffffffc00b653e10] smpboot_thread_fn at ffffffc008234718
>   [ffffffc00b653e70] kthread at ffffffc00822ac84
>
> 4. the kthreadd is trying to get cgroup_threadgroup_rwsem...
>
>   [ffffffc008073a70] __switch_to at ffffffc0081229d4
>   [ffffffc008073ad0] __schedule at ffffffc009c824f8
>   [ffffffc008073b30] schedule at ffffffc009c82b50
>   [ffffffc008073b90] percpu_rwsem_wait at ffffffc00828fbcc  <<< cgroup_threadgroup_rwsem
>   [ffffffc008073bd0] __percpu_down_read at ffffffc008290180
>   [ffffffc008073c00] cgroup_css_set_fork at ffffffc008357ecc
>   [ffffffc008073c60] cgroup_can_fork at ffffffc008357bb4
>   [ffffffc008073d00] copy_process at ffffffc0081d18c4
>   [ffffffc008073d90] kernel_clone at ffffffc0081d088c
>   [ffffffc008073e50] kthreadd at ffffffc00822a8e4
>
> Finally, the Thread-A and Thread-B would be blockd forever.
>
> [2]cpu_offline:
>
> 1. first, thread-A get the cgroup_threadgroup_rwsem and waiting foe the cpu_hotplug_lock:
>
>   [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
>   [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
>   [ffffffc00f9c3910] schedule at ffffffc009c82b50
>   [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
>   [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
>   [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
>   [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
>   [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
>   [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
>   [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
>   [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
>   [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
>   [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
>   [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
>   [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
>   [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
>   [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
>   [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
>   [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
>   [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44
>
> 2. Thread-B get the cpu_hotplug_lock and waiting for cpuhp/x:
>
>   [ffffffc01245b740] __switch_to at ffffffc0081229d4
>   [ffffffc01245b7a0] __schedule at ffffffc009c824f8
>   [ffffffc01245b800] schedule at ffffffc009c82b50
>   [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
>   [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
>   [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
>   [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
>   [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
>   [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
>   [ffffffc01245bb40] cpu_up at ffffffc0081dd100
>   [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
>   [ffffffc01245bb90] device_online at ffffffc008d77dec
>   [ffffffc01245bbd0] online_store at ffffffc008d77bb8
>   [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
>   [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
>   [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc01245bd50] vfs_write at ffffffc008607a8c
>   [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
>   [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
>   [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
>   [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
>   [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
>   [ffffffc01245be80] el0_svc at ffffffc0098836c8
>   [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44
>
> 3.cpuhp/x would call cpufreq_offline and schedutil need stop_kthread:
>
>   [ffffffc00b683a60] __switch_to at ffffffc0081229d4
>   [ffffffc00b683ac0] __schedule at ffffffc009c824f8
>   [ffffffc00b683b20] schedule at ffffffc009c82b50
>   [ffffffc00b683ba0] schedule_timeout at ffffffc009c8a144
>   [ffffffc00b683c00] wait_for_common at ffffffc009c83dac <<< waiting for sugov complete
>   [ffffffc00b683c60] kthread_stop at ffffffc008228128   <<< wakeup sugov
>   [ffffffc00b683c90] sugov_exit at ffffffc001d016a8
>   [ffffffc00b683cc0] cpufreq_offline at ffffffc00917b4b8
>   [ffffffc00b683d10] cpuhp_cpufreq_offline at ffffffc00917ee70
>   [ffffffc00b683d70] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc00b683dd0] cpuhp_thread_fun at ffffffc0081df47c
>   [ffffffc00b683e10] smpboot_thread_fn at ffffffc008234718
>   [ffffffc00b683e70] kthread at ffffffc00822ac84
>
> 4. the sugov thread is waiting cgroup_threadgroup_rwsem:
>
>   [ffffffc01258bc10] __switch_to at ffffffc0081229d4
>   [ffffffc01258bc70] __schedule at ffffffc009c824f8
>   [ffffffc01258bcd0] schedule at ffffffc009c82b50
>   [ffffffc01258bd30] percpu_rwsem_wait at ffffffc00828fbcc  <<< wait for cgroup_threadgroup_rwsem
>   [ffffffc01258bd70] __percpu_down_read at ffffffc008290180
>   [ffffffc01258bdb0] exit_signals at ffffffc0082074c8
>   [ffffffc01258be10] do_exit at ffffffc0081e5130
>   [ffffffc01258be70] kthread at ffffffc00822ac8c
>
> Finally, the Thread-A and Thread-B would be blockd forever.
>
> Combining the above two situations, the reason why Thread-A and Thread-B are blocked is because cpuhp/x is blocked.
> If we can solve the problem that cpuhp is not blocked, Thread-A and Thread-B can run normally.
> So we can let schedutil do not create or destory thread, so that the cpuhp/x would not be blocked.
>
> So init the sugov thread just once to prevent the above deadlock.
>
> Signed-off-by: Ke Wang <ke.wang@unisoc.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---
>  kernel/sched/cpufreq_schedutil.c | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 285ad51caf0f..42d04a1eac20 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -8,6 +8,8 @@
>
>  #define IOWAIT_BOOST_MIN       (SCHED_CAPACITY_SCALE / 8)
>
> +static LIST_HEAD(sugov_list);
> +
>  struct sugov_tunables {
>         struct gov_attr_set     attr_set;
>         unsigned int            rate_limit_us;
> @@ -15,6 +17,7 @@ struct sugov_tunables {
>
>  struct sugov_policy {
>         struct cpufreq_policy   *policy;
> +       struct list_head        sg_policy_list;
>
>         struct sugov_tunables   *tunables;
>         struct list_head        tunables_hook;
> @@ -659,6 +662,7 @@ static int sugov_init(struct cpufreq_policy *policy)
>         struct sugov_policy *sg_policy;
>         struct sugov_tunables *tunables;
>         int ret = 0;
> +       int sugov_init = 0;
>
>         /* State should be equivalent to EXIT */
>         if (policy->governor_data)
> @@ -666,6 +670,13 @@ static int sugov_init(struct cpufreq_policy *policy)
>
>         cpufreq_enable_fast_switch(policy);
>
> +       list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
> +               if (sg_policy->policy == policy) {
> +                       sugov_init = 1;
> +                       goto tunables_lock;
> +               }
> +       }
> +
>         sg_policy = sugov_policy_alloc(policy);
>         if (!sg_policy) {
>                 ret = -ENOMEM;
> @@ -676,6 +687,10 @@ static int sugov_init(struct cpufreq_policy *policy)
>         if (ret)
>                 goto free_sg_policy;
>
> +       list_add(&sg_policy->sg_policy_list, &sugov_list);
> +
> +tunables_lock:
> +
>         mutex_lock(&global_tunables_lock);
>
>         if (global_tunables) {
> @@ -717,7 +732,8 @@ static int sugov_init(struct cpufreq_policy *policy)
>         sugov_clear_global_tunables();
>
>  stop_kthread:
> -       sugov_kthread_stop(sg_policy);
> +       if (!sugov_init)
> +               sugov_kthread_stop(sg_policy);
>         mutex_unlock(&global_tunables_lock);
>
>  free_sg_policy:
> @@ -745,8 +761,14 @@ static void sugov_exit(struct cpufreq_policy *policy)
>
>         mutex_unlock(&global_tunables_lock);
>
> +       list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
> +               if (sg_policy->policy == policy) {
> +                       goto out;
> +               }
> +       }
>         sugov_kthread_stop(sg_policy);
>         sugov_policy_free(sg_policy);
> +out:
>         cpufreq_disable_fast_switch(policy);
>  }
>
> --
> 2.25.1
>

Thanks!
BR
Lukasz Luba July 11, 2022, 7:32 a.m. UTC | #2
Hi Xuewen,

On 7/11/22 08:21, Xuewen Yan wrote:
> Hi all
> 
> This deadlock is inevitable, any comment?

Could you tell me how to reproduce this?
Is there a need of special cgroup setup?

Regards,
Lukasz
Xuewen Yan July 11, 2022, 7:43 a.m. UTC | #3
On Mon, Jul 11, 2022 at 3:32 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>
> Hi Xuewen,
>
> On 7/11/22 08:21, Xuewen Yan wrote:
> > Hi all
> >
> > This deadlock is inevitable, any comment?
>
> Could you tell me how to reproduce this?
> Is there a need of special cgroup setup?

This deadlock occurs when we run the monkey test on an Android phone while
randomly taking a cpu core online or offline at the same time.
The thread-A that takes the cgroup_threadgroup_rwsem and then waits for the
cpu_hotplug_lock is the Android thread named "OomAdjuster".

And I see the cpu_hotplug_lock is added by the patch:
https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/
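
For reference, a minimal user-space reproducer along these lines (this is
not the original test script; the cgroup paths, group names and the CPU
number are assumptions, it needs root, and the two cpuset groups must
already exist):

/*
 * Hypothetical reproducer sketch: one loop bounces a task between two
 * cpuset cgroups (cgroup_threadgroup_rwsem, then cpus_read_lock() via
 * cpuset_attach()), while another loop toggles a CPU online/offline
 * (cpu_hotplug_lock, then waiting for cpuhp/x).
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return;
        if (write(fd, val, strlen(val)) < 0)
                perror(path);
        close(fd);
}

int main(void)
{
        char pid[16];

        snprintf(pid, sizeof(pid), "%d", getpid());

        if (fork() == 0) {
                /* hotplug churn: repeatedly offline/online cpu7 */
                for (;;) {
                        write_str("/sys/devices/system/cpu/cpu7/online", "0");
                        write_str("/sys/devices/system/cpu/cpu7/online", "1");
                }
        }

        /* attach churn: bounce this task between two cpuset cgroups */
        for (;;) {
                write_str("/sys/fs/cgroup/cpuset/grp_a/tasks", pid);
                write_str("/sys/fs/cgroup/cpuset/grp_b/tasks", pid);
        }
        return 0;
}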

Thanks!

>
> Regards,
> Lukasz
Viresh Kumar July 11, 2022, 7:47 a.m. UTC | #4
On 11-07-22, 15:43, Xuewen Yan wrote:
> This deadlock occurs when we run the monkey test on an Android phone,
> at the same time, randomly put online or offline a cpu core.
> Indeed the thread-A which get the cgroup_threadgroup_rwsem and waiting
> for the cpu_hotplug_lock is the thread whose name is "OomAdjuster" in
> android.

It would be better to reproduce this on a mainline kernel, without Android,
and tell us the steps. We don't want to chase an Android-only issue,
just in case.

Thanks.
Lukasz Luba July 11, 2022, 7:50 a.m. UTC | #5
On 7/11/22 08:43, Xuewen Yan wrote:
> On Mon, Jul 11, 2022 at 3:32 PM Lukasz Luba <lukasz.luba@arm.com> wrote:
>>
>> Hi Xuewen,
>>
>> On 7/11/22 08:21, Xuewen Yan wrote:
>>> Hi all
>>>
>>> This deadlock is inevitable, any comment?
>>
>> Could you tell me how to reproduce this?
>> Is there a need of special cgroup setup?
> 
> This deadlock occurs when we run the monkey test on an Android phone,
> at the same time, randomly put online or offline a cpu core.
> Indeed the thread-A which get the cgroup_threadgroup_rwsem and waiting
> for the cpu_hotplug_lock is the thread whose name is "OomAdjuster" in
> android.

Thanks, let me have a look. We have some hotplug stress tests in our
EAS mainline integration tests, which haven't triggered this issue.
I'll try to trigger this on mainline kernel.

> 
> And I see the cpu_hotplug_lock is added by the patch:
> https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/

Thanks for the pointer.
Qais Yousef July 11, 2022, 5:46 p.m. UTC | #6
Hi Xuewen

+CC Tejun who might be interested.

On 07/05/22 20:37, Xuewen Yan wrote:
> There is a deadlock between cpu_hotplug_lock sem and cgroup_threadgroup_rwsem.
> And both cpu online and cpu offline scenarios will cause deadlock:
> 
> [1] cpu online:
> 
> 1. first, thread-A get the cgroup_threadgroup_rwsem and waiting foe the cpu_hotplug_lock:
> 
>   [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
>   [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
>   [ffffffc00f9c3910] schedule at ffffffc009c82b50
>   [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
>   [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
>   [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
>   [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
>   [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
>   [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
>   [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
>   [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
>   [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
>   [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
>   [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
>   [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
>   [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
>   [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
>   [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
>   [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
>   [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44
> 
> 2. Thread-B get the cpu_hotplug_lock and waiting for cpuhp/x:
> 
>   [ffffffc01245b740] __switch_to at ffffffc0081229d4
>   [ffffffc01245b7a0] __schedule at ffffffc009c824f8
>   [ffffffc01245b800] schedule at ffffffc009c82b50
>   [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
>   [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
>   [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
>   [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
>   [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
>   [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
>   [ffffffc01245bb40] cpu_up at ffffffc0081dd100
>   [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
>   [ffffffc01245bb90] device_online at ffffffc008d77dec
>   [ffffffc01245bbd0] online_store at ffffffc008d77bb8
>   [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
>   [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
>   [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc01245bd50] vfs_write at ffffffc008607a8c
>   [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
>   [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
>   [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
>   [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
>   [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
>   [ffffffc01245be80] el0_svc at ffffffc0098836c8
>   [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44
> 
> 3. cpuhp/x would call cpufreq_online and the schedutil need create kthread, as a result it need wait for kthreadd:
> 
>   [ffffffc00b653860] __switch_to at ffffffc0081229d4
>   [ffffffc00b6538c0] __schedule at ffffffc009c824f8
>   [ffffffc00b653920] schedule at ffffffc009c82b50
>   [ffffffc00b6539a0] schedule_timeout at ffffffc009c8a144
>   [ffffffc00b653a00] wait_for_common at ffffffc009c83dac     <<< wait for kthreadd
>   [ffffffc00b653ad0] __kthread_create_on_node at ffffffc0082296b8
>   [ffffffc00b653b90] kthread_create_on_node at ffffffc008229a58
>   [ffffffc00b653bb0] sugov_init at ffffffc001d01414
>   [ffffffc00b653c00] cpufreq_init_governor at ffffffc009179a48
>   [ffffffc00b653c50] cpufreq_set_policy at ffffffc009179360
>   [ffffffc00b653cb0] cpufreq_online at ffffffc00917c3e4
>   [ffffffc00b653d10] cpuhp_cpufreq_online at ffffffc00917ee3c
>   [ffffffc00b653d70] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc00b653dd0] cpuhp_thread_fun at ffffffc0081df47c
>   [ffffffc00b653e10] smpboot_thread_fn at ffffffc008234718
>   [ffffffc00b653e70] kthread at ffffffc00822ac84
> 
> 4. the kthreadd is trying to get cgroup_threadgroup_rwsem...
> 
>   [ffffffc008073a70] __switch_to at ffffffc0081229d4
>   [ffffffc008073ad0] __schedule at ffffffc009c824f8
>   [ffffffc008073b30] schedule at ffffffc009c82b50
>   [ffffffc008073b90] percpu_rwsem_wait at ffffffc00828fbcc  <<< cgroup_threadgroup_rwsem
>   [ffffffc008073bd0] __percpu_down_read at ffffffc008290180
>   [ffffffc008073c00] cgroup_css_set_fork at ffffffc008357ecc
>   [ffffffc008073c60] cgroup_can_fork at ffffffc008357bb4
>   [ffffffc008073d00] copy_process at ffffffc0081d18c4
>   [ffffffc008073d90] kernel_clone at ffffffc0081d088c
>   [ffffffc008073e50] kthreadd at ffffffc00822a8e4
> 
> Finally, the Thread-A and Thread-B would be blockd forever.
> 
> [2]cpu_offline:
> 
> 1. first, thread-A get the cgroup_threadgroup_rwsem and waiting foe the cpu_hotplug_lock:
> 
>   [ffffffc00f9c3850] __switch_to at ffffffc0081229d4
>   [ffffffc00f9c38b0] __schedule at ffffffc009c824f8
>   [ffffffc00f9c3910] schedule at ffffffc009c82b50
>   [ffffffc00f9c3970] percpu_rwsem_wait at ffffffc00828fbcc
>   [ffffffc00f9c39b0] __percpu_down_read at ffffffc008290180     <<< try to get cpu_hotplug_lock
>   [ffffffc00f9c39e0] cpus_read_lock at ffffffc0081dade8
>   [ffffffc00f9c3a20] cpuset_attach at ffffffc008366ed4
>   [ffffffc00f9c3a90] cgroup_migrate_execute at ffffffc00834f7d8
>   [ffffffc00f9c3b60] cgroup_attach_task at ffffffc0083539b8
>   [ffffffc00f9c3bd0] __cgroup1_procs_write at ffffffc00835f6ac   <<< get the cgroup_threadgroup_rwsem
>   [ffffffc00f9c3c30] cgroup1_tasks_write at ffffffc00835f018
>   [ffffffc00f9c3c60] cgroup_file_write at ffffffc008348b04
>   [ffffffc00f9c3c90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc00f9c3d50] vfs_write at ffffffc008607a8c
>   [ffffffc00f9c3db0] ksys_write at ffffffc0086076d8
>   [ffffffc00f9c3df0] __arm64_sys_write at ffffffc008607640
>   [ffffffc00f9c3e10] invoke_syscall at ffffffc00813dccc
>   [ffffffc00f9c3e30] el0_svc_common at ffffffc00813dbe4
>   [ffffffc00f9c3e70] do_el0_svc at ffffffc00813dac8
>   [ffffffc00f9c3e80] el0_svc at ffffffc0098836c8
>   [ffffffc00f9c3ea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc00f9c3fe0] el0t_64_sync at ffffffc008091e44
> 
> 2. Thread-B get the cpu_hotplug_lock and waiting for cpuhp/x:
> 
>   [ffffffc01245b740] __switch_to at ffffffc0081229d4
>   [ffffffc01245b7a0] __schedule at ffffffc009c824f8
>   [ffffffc01245b800] schedule at ffffffc009c82b50
>   [ffffffc01245b880] schedule_timeout at ffffffc009c8a144
>   [ffffffc01245b8e0] wait_for_common at ffffffc009c83dac     <<< waiting for cpuhp/x complete
>   [ffffffc01245b940] cpuhp_kick_ap at ffffffc0081dc8ac       <<< wake up cpuhp/x
>   [ffffffc01245b980] bringup_cpu at ffffffc0081db7f0
>   [ffffffc01245ba00] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc01245ba60] cpuhp_up_callbacks at ffffffc0081dd9b0
>   [ffffffc01245bac0] _cpu_up at ffffffc0081dd844            <<< percpu_down_write(&cpu_hotplug_lock)
>   [ffffffc01245bb40] cpu_up at ffffffc0081dd100
>   [ffffffc01245bb80] cpu_subsys_online at ffffffc008d8ebe4
>   [ffffffc01245bb90] device_online at ffffffc008d77dec
>   [ffffffc01245bbd0] online_store at ffffffc008d77bb8
>   [ffffffc01245bc30] dev_attr_store at ffffffc008d7c8b8
>   [ffffffc01245bc60] sysfs_kf_write at ffffffc008757894
>   [ffffffc01245bc90] kernfs_fop_write_iter at ffffffc0087544d8
>   [ffffffc01245bd50] vfs_write at ffffffc008607a8c
>   [ffffffc01245bdb0] ksys_write at ffffffc0086076d8
>   [ffffffc01245bdf0] __arm64_sys_write at ffffffc008607640
>   [ffffffc01245be10] invoke_syscall at ffffffc00813dccc
>   [ffffffc01245be30] el0_svc_common at ffffffc00813dc14
>   [ffffffc01245be70] do_el0_svc at ffffffc00813dac8
>   [ffffffc01245be80] el0_svc at ffffffc0098836c8
>   [ffffffc01245bea0] el0t_64_sync_handler at ffffffc00988363c
>   [ffffffc01245bfe0] el0t_64_sync at ffffffc008091e44
> 
> 3.cpuhp/x would call cpufreq_offline and schedutil need stop_kthread:
> 
>   [ffffffc00b683a60] __switch_to at ffffffc0081229d4
>   [ffffffc00b683ac0] __schedule at ffffffc009c824f8
>   [ffffffc00b683b20] schedule at ffffffc009c82b50
>   [ffffffc00b683ba0] schedule_timeout at ffffffc009c8a144
>   [ffffffc00b683c00] wait_for_common at ffffffc009c83dac <<< waiting for sugov complete
>   [ffffffc00b683c60] kthread_stop at ffffffc008228128   <<< wakeup sugov
>   [ffffffc00b683c90] sugov_exit at ffffffc001d016a8
>   [ffffffc00b683cc0] cpufreq_offline at ffffffc00917b4b8
>   [ffffffc00b683d10] cpuhp_cpufreq_offline at ffffffc00917ee70
>   [ffffffc00b683d70] cpuhp_invoke_callback at ffffffc0081dc000
>   [ffffffc00b683dd0] cpuhp_thread_fun at ffffffc0081df47c
>   [ffffffc00b683e10] smpboot_thread_fn at ffffffc008234718
>   [ffffffc00b683e70] kthread at ffffffc00822ac84
> 
> 4. the sugov thread is waiting cgroup_threadgroup_rwsem:
> 
>   [ffffffc01258bc10] __switch_to at ffffffc0081229d4
>   [ffffffc01258bc70] __schedule at ffffffc009c824f8
>   [ffffffc01258bcd0] schedule at ffffffc009c82b50
>   [ffffffc01258bd30] percpu_rwsem_wait at ffffffc00828fbcc  <<< wait for cgroup_threadgroup_rwsem
>   [ffffffc01258bd70] __percpu_down_read at ffffffc008290180
>   [ffffffc01258bdb0] exit_signals at ffffffc0082074c8
>   [ffffffc01258be10] do_exit at ffffffc0081e5130
>   [ffffffc01258be70] kthread at ffffffc00822ac8c
> 
> Finally, the Thread-A and Thread-B would be blockd forever.
> 
> Combining the above two situations, the reason why Thread-A and Thread-B are blocked is because cpuhp/x is blocked.
> If we can solve the problem that cpuhp is not blocked, Thread-A and Thread-B can run normally.
> So we can let schedutil do not create or destory thread, so that the cpuhp/x would not be blocked.
> 
> So init the sugov thread just once to prevent the above deadlock.
> 
> Signed-off-by: Ke Wang <ke.wang@unisoc.com>
> Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
> ---

Have you tried running with PROVE_LOCKDEP enabled? It'll help print a useful
output about the DEADLOCK. But your explanation was good and clear to me.

AFAIU:


CPU0                                     CPU1                                   CPU2

// attach task to a different
// cpuset cgroup via sysfs
__acquire(cgroup_threadgroup_rwsem)

                                         // pring up CPU2 online
                                         __acquire(cpu_hotplug_lock)
                                         // wait for CPU2 to come online
                                                                                // bringup cpu online
                                                                                // call cpufreq_online() which tries to create sugov kthread
__acquire(cpu_hotplug_lock)                                                     copy_process()
                                                                                   cgroup_can_fork()
                                                                                      cgroup_css_set_fork()
                                                                                      __acquire(cgroup_threadgroup_rwsem)
// blocks forever                        // blocks forever                            // blocks forever


Is this a correct summary of the problem?

The locks are held in reverse order and we end up with a DEADLOCK.
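
For illustration only, the inversion can be reduced to two plain user-space
locks; this is just the generic A-B / B-A pattern, not the kernel code, and
it hangs by design when run:

/* Generic lock-order inversion sketch; user space, not the kernel code. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* stand-ins for cgroup_threadgroup_rwsem and cpu_hotplug_lock */
static pthread_mutex_t threadgroup_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void *attach_path(void *arg)     /* CPU0: cgroup attach */
{
        (void)arg;
        pthread_mutex_lock(&threadgroup_lock);
        sleep(1);                               /* let the other side run */
        pthread_mutex_lock(&hotplug_lock);      /* blocks forever */
        pthread_mutex_unlock(&hotplug_lock);
        pthread_mutex_unlock(&threadgroup_lock);
        return NULL;
}

static void *hotplug_path(void *arg)    /* CPU1/CPU2: online -> fork sugov kthread */
{
        (void)arg;
        pthread_mutex_lock(&hotplug_lock);
        sleep(1);
        pthread_mutex_lock(&threadgroup_lock);  /* blocks forever */
        pthread_mutex_unlock(&threadgroup_lock);
        pthread_mutex_unlock(&hotplug_lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, attach_path, NULL);
        pthread_create(&b, NULL, hotplug_path, NULL);
        pthread_join(a, NULL);                  /* never returns: AB-BA deadlock */
        pthread_join(b, NULL);
        puts("unreachable");
        return 0;
}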

I believe the same happens on offline; it's just that the path that takes the
cgroup_threadgroup_rwsem on CPU2 is different.

This will be a tricky one. Your proposed patch might fix it for this case, but
if there's anything else that creates a kthread when a cpu goes online/offline
then we'll hit the same problem again.

I haven't reviewed your patch to be honest, but I think it's worth seeing first
if there's something that can be done at the 'right level'.

Needs head scratching from my side at least. This is not the first locking
issue between hotplug and cpuset :-/


Thanks

--
Qais Yousef
Tejun Heo July 11, 2022, 8:58 p.m. UTC | #7
(cc'ing Waiman)

On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
> Have you tried running with PROVE_LOCKDEP enabled? It'll help print a useful
> output about the DEADLOCK. But your explanation was good and clear to me.

I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
unfortunately.

> AFAIU:
> 
> 
> CPU0                                     CPU1                                   CPU2
> 
> // attach task to a different
> // cpuset cgroup via sysfs
> __acquire(cgroup_threadgroup_rwsem)
> 
>                                          // pring up CPU2 online
>                                          __acquire(cpu_hotplug_lock)
>                                          // wait for CPU2 to come online
>                                                                                 // bringup cpu online
>                                                                                 // call cpufreq_online() which tries to create sugov kthread
> __acquire(cpu_hotplug_lock)                                                     copy_process()
>                                                                                    cgroup_can_fork()
>                                                                                       cgroup_css_set_fork()
>                                                                                       __acquire(cgroup_threadgroup_rwsem)
> // blocks forever                        // blocks forever                            // blocks forever
> 
> 
> Is this a correct summary of the problem?
> 
> The locks are held in reverse order and we end up with a DEADLOCK.
> 
> I believe the same happens on offline it's just the path to hold the
> cgroup_threadgroup_rwsem on CPU2 is different.
> 
> This will be a tricky one. Your proposed patch might fix it for this case, but
> if there's anything else that creates a kthread when a cpu goes online/offline
> then we'll hit the same problem again.
> 
> I haven't reviewed your patch to be honest, but I think worth seeing first if
> there's something that can be done at the 'right level' first.
> 
> Needs head scratching from my side at least. This is the not the first type of
> locking issue between hotplug and cpuset :-/

Well, the only thing I can think of is always grabbing cpus_read_lock()
before grabbing threadgroup_rwsem. Waiman, what do you think?
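
A rough sketch of what that nesting could look like on the cgroup side, just
to show the intended order (the helper names are hypothetical, not an
existing API):

/*
 * Sketch only: take cpu_hotplug_lock (read) before cgroup_threadgroup_rwsem
 * in the attach/migration path, so the order matches the hotplug -> fork
 * path. Helper names are hypothetical.
 */
#include <linux/cpu.h>
#include <linux/percpu-rwsem.h>

static void cgroup_attach_lock(void)
{
        cpus_read_lock();
        percpu_down_write(&cgroup_threadgroup_rwsem);
}

static void cgroup_attach_unlock(void)
{
        percpu_up_write(&cgroup_threadgroup_rwsem);
        cpus_read_unlock();
}

cpuset_attach() could then rely on the hotplug lock already being held across
the whole attach operation rather than taking it itself.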

Thanks.
Steven Rostedt July 11, 2022, 9:11 p.m. UTC | #8
On Mon, 11 Jul 2022 10:58:28 -1000
Tejun Heo <tj@kernel.org> wrote:

> I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
> unfortunately.
> 
> > AFAIU:
> > 
> > 
> > CPU0                                     CPU1                                   CPU2
> > 
> > // attach task to a different
> > // cpuset cgroup via sysfs
> > __acquire(cgroup_threadgroup_rwsem)
> > 
> >                                          // pring up CPU2 online
> >                                          __acquire(cpu_hotplug_lock)
> >                                          // wait for CPU2 to come online

Should there be some annotation here that tells lockdep that CPU1 is now
blocked on CPU2?

Then this case would be caught by lockdep.
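
One existing pattern for that kind of manual annotation is how the workqueue
code teaches lockdep about flush_work() dependencies: a lockdep_map that is
"acquired" both by the waiter and by the thread doing the work. A sketch of
what that could look like for the cpuhp handoff (the map name and hook points
are hypothetical, and it would not by itself cover the later wait on
kthreadd):

/*
 * Sketch only, modelled on the workqueue flush annotation. A fake lockdep
 * map records "caller waits for cpuhp/x" on one side and "cpuhp/x runs the
 * hotplug callbacks" on the other, so lockdep can chain the dependencies.
 */
#include <linux/completion.h>
#include <linux/lockdep.h>

static struct lock_class_key cpuhp_ap_dep_key;
static struct lockdep_map cpuhp_ap_dep_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_ap_work", &cpuhp_ap_dep_key);

/* on the side that kicks and then waits for cpuhp/x */
static void cpuhp_wait_for_ap(struct completion *done)
{
        lock_map_acquire(&cpuhp_ap_dep_map);
        lock_map_release(&cpuhp_ap_dep_map);
        wait_for_completion(done);
}

/* in cpuhp_thread_fun(), wrapped around the callback invocation */
static void cpuhp_invoke_callbacks_annotated(void)
{
        lock_map_acquire(&cpuhp_ap_dep_map);
        /* ... cpuhp_invoke_callback() ... */
        lock_map_release(&cpuhp_ap_dep_map);
}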

-- Steve


> >                                                                                 // bringup cpu online
> >                                                                                 // call cpufreq_online() which tries to create sugov kthread
> > __acquire(cpu_hotplug_lock)                                                     copy_process()
> >                                                                                    cgroup_can_fork()
> >                                                                                       cgroup_css_set_fork()
> >                                                                                       __acquire(cgroup_threadgroup_rwsem)
> > // blocks forever                        // blocks forever                            // blocks forever
> >
Xuewen Yan July 12, 2022, 5:42 a.m. UTC | #9
On Tue, Jul 12, 2022 at 5:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 11 Jul 2022 10:58:28 -1000
> Tejun Heo <tj@kernel.org> wrote:
>
> > I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
> > unfortunately.
> >
> > > AFAIU:
> > >
> > >
> > > CPU0                                     CPU1                                   CPU2
> > >
> > > // attach task to a different
> > > // cpuset cgroup via sysfs
> > > __acquire(cgroup_threadgroup_rwsem)
> > >
> > >                                          // pring up CPU2 online
> > >                                          __acquire(cpu_hotplug_lock)
> > >                                          // wait for CPU2 to come online
>
> Should there be some annotation here that tells lockdep that CPU1 is now
> blocked on CPU2?
>
> Then this case would be caught by lockdep.
>
> -- Steve
>
>
> > >                                                                                 // bringup cpu online
> > >                                                                                 // call cpufreq_online() which tries to create sugov kthread
> > > __acquire(cpu_hotplug_lock)                                                     copy_process()
> > >                                                                                    cgroup_can_fork()
> > >                                                                                       cgroup_css_set_fork()
> > >                                                                                       __acquire(cgroup_threadgroup_rwsem)
> > > // blocks forever                        // blocks forever                            // blocks forever
> > >

Indeed, it's caused by threads instead of CPUs.

Our SoC contains two cpufreq policies: CPUs 0-5 belong to policy0 and CPUs
6-7 belong to policy1.

When cpu6/7 go online:

Thread-A                                     Thread-B

cgroup_file_write                            device_online
cgroup1_tasks_write                          ...
__cgroup1_procs_write                        _cpu_up
write(&cgroup_threadgroup_rwsem); <<         cpus_write_lock(); <<
cgroup_attach_task                           ...
cgroup_migrate_execute                       cpuhp_kick_ap
cpuset_attach                                // wake up cpuhp
cpus_read_lock()                             // waiting for cpuhp
// blocked

cpuhp/6                                      kthreadd

cpuhp_thread_fun
cpuhp_invoke_callback
cpuhp_cpufreq_online
cpufreq_online
sugov_init
__kthread_create_on_node                     copy_process
// blocked, waiting for kthreadd             cgroup_can_fork
                                             cgroup_css_set_fork
                                             __acquires(&cgroup_threadgroup_rwsem)
                                             // blocked

So the dependency chain is:
Thread-A ---> Thread-B ---> cpuhp ---> kthreadd ---> Thread-A

When a cpu goes offline, the sugov thread is stopped via kthread_stop(), and
while exiting it waits for the cgroup_threadgroup_rwsem; the chain is:
Thread-A ---> Thread-B ---> cpuhp ---> sugov ---> Thread-A


As Qais said:
> if there's anything else that creates a kthread when a cpu goes online/offline
> then we'll hit the same problem again.

Indeed, only the case where the cpuhp thread creates or destroys a kthread can trigger this.


I have put the test script in the mail, and I have tested it without the
monkey test; the deadlock still occurs.


Thanks!
xuewen.yan
Xuewen Yan July 12, 2022, 5:57 a.m. UTC | #10
[RESEND]

On Tue, Jul 12, 2022 at 5:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 11 Jul 2022 10:58:28 -1000
> Tejun Heo <tj@kernel.org> wrote:
>
> > I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
> > unfortunately.
> >
> > > AFAIU:
> > >
> > >
> > > CPU0                                     CPU1                                   CPU2
> > >
> > > // attach task to a different
> > > // cpuset cgroup via sysfs
> > > __acquire(cgroup_threadgroup_rwsem)
> > >
> > >                                          // pring up CPU2 online
> > >                                          __acquire(cpu_hotplug_lock)
> > >                                          // wait for CPU2 to come online
>
> Should there be some annotation here that tells lockdep that CPU1 is now
> blocked on CPU2?
>
> Then this case would be caught by lockdep.
>
> -- Steve
>
>
> > >                                                                                 // bringup cpu online
> > >                                                                                 // call cpufreq_online() which tries to create sugov kthread
> > > __acquire(cpu_hotplug_lock)                                                     copy_process()
> > >                                                                                    cgroup_can_fork()
> > >                                                                                       cgroup_css_set_fork()
> > >                                                                                       __acquire(cgroup_threadgroup_rwsem)
> > > // blocks forever                        // blocks forever                            // blocks forever
> > >

Indeed, it's caused by threads instead of CPUs.

Our SoC contains two cpufreq policies: CPUs 0-5 belong to policy0 and CPUs
6-7 belong to policy1.

When cpu6/7 go online:

Thread-A                                                      Thread-B

cgroup_file_write                                          device_online
cgroup1_tasks_write                                              ...
__cgroup1_procs_write                                      _cpu_up
write(&cgroup_threadgroup_rwsem); <<     cpus_write_lock();<<
cgroup_attach_task                                               ......
cgroup_migrate_execute                             cpuhp_kick_ap
cpuset_attach                                           // wake up cpuhp
cpus_read_lock()                                        // waiting for cpuhp


cpuhp/6                                                        kthreadd

cpuhp_thread_fun
cpuhp_invoke_callback
cpuhp_cpufreq_online
cpufreq_online
 sugov_init
 __kthread_create_on_node                 copy_process
 //blocked, waiting for kthreadd             cgroup_can_fork
                                                        cgroup_css_set_fork
                           __acquires(&cgroup_threadgroup_rwsem)
                                                        //blocked

So the dependency chain is:
Thread-A ---> Thread-B ---> cpuhp ---> kthreadd ---> Thread-A

When a cpu goes offline, the sugov thread is stopped via kthread_stop(), and
while exiting it waits for the cgroup_threadgroup_rwsem; the chain is:
Thread-A ---> Thread-B ---> cpuhp ---> sugov ---> Thread-A

As Qais said:
> if there's anything else that creates a kthread when a cpu goes online/offline
> then we'll hit the same problem again.

Indeed, only the case where the cpuhp thread creates or destroys a kthread can trigger this.

I have put the test script in the mail, and I have tested it without the
monkey test; the deadlock still occurs.


Thanks!
xuewen.yan
Qais Yousef July 12, 2022, 12:57 p.m. UTC | #11
On 07/11/22 10:58, Tejun Heo wrote:
> (cc'ing Waiman)
> 
> On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
> > Have you tried running with PROVE_LOCKDEP enabled? It'll help print a useful
> > output about the DEADLOCK. But your explanation was good and clear to me.
> 
> I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
> unfortunately.

I see. It seems I got it wrong and CPU1 and CPU2 below run from the same
context, if I understand Xuewen's response correctly this time.

> 
> > AFAIU:
> > 
> > 
> > CPU0                                     CPU1                                   CPU2
> > 
> > // attach task to a different
> > // cpuset cgroup via sysfs
> > __acquire(cgroup_threadgroup_rwsem)
> > 
> >                                          // pring up CPU2 online
> >                                          __acquire(cpu_hotplug_lock)
> >                                          // wait for CPU2 to come online
> >                                                                                 // bringup cpu online
> >                                                                                 // call cpufreq_online() which tries to create sugov kthread
> > __acquire(cpu_hotplug_lock)                                                     copy_process()
> >                                                                                    cgroup_can_fork()
> >                                                                                       cgroup_css_set_fork()
> >                                                                                       __acquire(cgroup_threadgroup_rwsem)
> > // blocks forever                        // blocks forever                            // blocks forever
> > 
> > 
> > Is this a correct summary of the problem?
> > 
> > The locks are held in reverse order and we end up with a DEADLOCK.
> > 
> > I believe the same happens on offline it's just the path to hold the
> > cgroup_threadgroup_rwsem on CPU2 is different.
> > 
> > This will be a tricky one. Your proposed patch might fix it for this case, but
> > if there's anything else that creates a kthread when a cpu goes online/offline
> > then we'll hit the same problem again.
> > 
> > I haven't reviewed your patch to be honest, but I think worth seeing first if
> > there's something that can be done at the 'right level' first.
> > 
> > Needs head scratching from my side at least. This is the not the first type of
> > locking issue between hotplug and cpuset :-/
> 
> Well, the only thing I can think of is always grabbing cpus_read_lock()
> before grabbing threadgroup_rwsem. Waiman, what do you think?

Are there a lot of subsystems besides cpuset that need the cpus_read_lock()?
A quick grep tells me it's the only one.

Can't we instead use cpus_read_trylock() in cpuset_can_attach(), so that we
either take the lock successfully before we go ahead and call
cpuset_attach(), or bail out and cancel the whole attach operation, which
should release the threadgroup_rwsem?

Sounds simple in theory, but as always the devil is in the details, which
I don't have all in my head. AFAICS cgroup_migrate_execute() will call
ss->can_attach() for each subsystem, and it's the only caller of can_attach().

Not sure if it'd be safe to do cpus_read_unlock() at the end of cpuset_attach(),
or whether we need to introduce an additional callback like 'cpuset_attach_finish()'
where we'd release the lock.

ie something like the below untested-and-for-demo-purposes-only patch:


diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..86c7c6aa0f12 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2226,6 +2226,12 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
        struct task_struct *task;
        int ret;

+       ret = cpus_read_trylock();
+       if (!ret) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
        /* used later by cpuset_attach() */
        cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
        cs = css_cs(css);
@@ -2289,7 +2295,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

-       cpus_read_lock();
+       // Requires cpus_read_lock() to be held already
+       // releases cpus_read_lock() when done
        percpu_down_write(&cpuset_rwsem);

        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);


Cheers

--
Qais Yousef
Tejun Heo July 12, 2022, 4:13 p.m. UTC | #12
On Tue, Jul 12, 2022 at 01:57:02PM +0100, Qais Yousef wrote:
> Is there a lot of subsystems beside cpuset that needs the cpus_read_lock()?
> A quick grep tells me it's the only one.
> 
> Can't we instead use cpus_read_trylock() in cpuset_can_attach() so that we
> either hold the lock successfully then before we go ahead and call
> cpuset_attach(), or bail out and cancel the whole attach operation which should
> unlock the threadgroup_rwsem() lock?

But now we're failing user-initiated operations randomly. I have a hard time
seeing that as an acceptable solution. The only thing we can do, I think, is
establishing a locking order between the two locks by either nesting
threadgroup_rwsem under cpus_read_lock or disallowing thread creation during
hotplug operations.

Thanks.
Qais Yousef July 12, 2022, 5:14 p.m. UTC | #13
On 07/12/22 06:13, Tejun Heo wrote:
> On Tue, Jul 12, 2022 at 01:57:02PM +0100, Qais Yousef wrote:
> > Is there a lot of subsystems beside cpuset that needs the cpus_read_lock()?
> > A quick grep tells me it's the only one.
> > 
> > Can't we instead use cpus_read_trylock() in cpuset_can_attach() so that we
> > either hold the lock successfully then before we go ahead and call
> > cpuset_attach(), or bail out and cancel the whole attach operation which should
> > unlock the threadgroup_rwsem() lock?
> 
> But now we're failing user-initiated operations randomly. I have a hard time

True. That might appear more random than necessary. It looked neat and
I thought since hotplug operations aren't that common and users must be
prepared for failures for other reasons, it might be okay.

> seeing that as an acceptable solution. The only thing we can do, I think, is
> establishing a locking order between the two locks by either nesting

That might be enough if no other paths can exist which would hold them in
reverse order again. It would be more robust to either hold them both or wait
until we can. Then potential ordering problems can't happen again, because of
this path at least.

> threadgroup_rwsem under cpus_read_lock or disallowing thread creation during
> hotplug operations.

I think that's what Xuewen tried to do in the proposed patch, but it fixes it
for one specific user. If we go with that, we'll need some guard rails (e.g.
the assertion sketched below) to help warn when other users do the same.
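
For example, if the rule becomes "cpu_hotplug_lock is always held across
->attach()", a one-line assertion would catch future offenders (sketch only):

	static void cpuset_attach(struct cgroup_taskset *tset)
	{
		/* warn via lockdep if we get here without the hotplug lock */
		lockdep_assert_cpus_held();

		percpu_down_write(&cpuset_rwsem);
		/* ... */
	}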


Thanks

--
Qais Yousef
Waiman Long July 13, 2022, 2:49 a.m. UTC | #14
On 7/11/22 16:58, Tejun Heo wrote:
> (cc'ing Waiman)
>
> On Mon, Jul 11, 2022 at 06:46:29PM +0100, Qais Yousef wrote:
>> Have you tried running with PROVE_LOCKDEP enabled? It'll help print a useful
>> output about the DEADLOCK. But your explanation was good and clear to me.
> I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
> unfortunately.

That is the case AFAIK. Lockdep only tracks the locks taken individually by
each task.


>> AFAIU:
>>
>>
>> CPU0                                     CPU1                                   CPU2
>>
>> // attach task to a different
>> // cpuset cgroup via sysfs
>> __acquire(cgroup_threadgroup_rwsem)
>>
                                          // bring up CPU2 online
>>                                           __acquire(cpu_hotplug_lock)
>>                                           // wait for CPU2 to come online
>>                                                                                  // bringup cpu online
>>                                                                                  // call cpufreq_online() which tries to create sugov kthread
>> __acquire(cpu_hotplug_lock)                                                     copy_process()
>>                                                                                     cgroup_can_fork()
>>                                                                                        cgroup_css_set_fork()
>>                                                                                        __acquire(cgroup_threadgroup_rwsem)
>> // blocks forever                        // blocks forever                            // blocks forever
>>
>>
>> Is this a correct summary of the problem?
>>
>> The locks are held in reverse order and we end up with a DEADLOCK.
>>
>> I believe the same happens on offline it's just the path to hold the
>> cgroup_threadgroup_rwsem on CPU2 is different.
>>
>> This will be a tricky one. Your proposed patch might fix it for this case, but
>> if there's anything else that creates a kthread when a cpu goes online/offline
>> then we'll hit the same problem again.
>>
>> I haven't reviewed your patch to be honest, but I think worth seeing first if
>> there's something that can be done at the 'right level' first.
>>
>> Needs head scratching from my side at least. This is not the first type of
>> locking issue between hotplug and cpuset :-/
> Well, the only thing I can think of is always grabbing cpus_read_lock()
> before grabbing threadgroup_rwsem. Waiman, what do you think?

That is a possible solution as cpus_read_lock() is rather lightweight. 
It is a good practice to acquire it first.

Cheers,
Longman

>
> Thanks.
>
Tejun Heo July 13, 2022, 3 a.m. UTC | #15
On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > Well, the only thing I can think of is always grabbing cpus_read_lock()
> > before grabbing threadgroup_rwsem. Waiman, what do you think?
> 
> That is a possible solution as cpus_read_lock() is rather lightweight. It is
> a good practice to acquire it first.

On a similar note, I think we probably should re-enable percpu operations on
threadgroup_rwsem too by default and allow users who are affected by slower
write path operations to opt-in. After the new CLONE_INTO_CGROUP which
doesn't need the rwsem, we have far fewer write lockers after all.
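
For reference, CLONE_INTO_CGROUP places the child directly into a target
cgroup at clone time, so no later cgroup.procs write (and hence no write-side
use of the rwsem) is needed for it. A minimal userspace sketch, error handling
omitted and the cgroup path made up:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* example destination cgroup; any cgroup2 directory fd works */
	int cgfd = open("/sys/fs/cgroup/demo", O_RDONLY | O_DIRECTORY);

	struct clone_args args = {
		.flags       = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup      = (__u64)cgfd,
	};

	/* child starts life in the target cgroup; no migration afterwards */
	long pid = syscall(__NR_clone3, &args, sizeof(args));

	if (pid == 0)
		_exit(0);
	return pid > 0 ? 0 : 1;
}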

Thanks.
Waiman Long July 13, 2022, 3:02 a.m. UTC | #16
On 7/11/22 17:11, Steven Rostedt wrote:
> On Mon, 11 Jul 2022 10:58:28 -1000
> Tejun Heo <tj@kernel.org> wrote:
>
>> I don't think lockdep would be able to track CPU1 -> CPU2 dependency here
>> unfortunately.
>>
>>> AFAIU:
>>>
>>>
>>> CPU0                                     CPU1                                   CPU2
>>>
>>> // attach task to a different
>>> // cpuset cgroup via sysfs
>>> __acquire(cgroup_threadgroup_rwsem)
>>>
>>>                                           // bring up CPU2 online
>>>                                           __acquire(cpu_hotplug_lock)
>>>                                           // wait for CPU2 to come online
> Should there be some annotation here that tells lockdep that CPU1 is now
> blocked on CPU2?
>
> Then this case would be caught by lockdep.
>
> -- Steve
>
>
>>>                                                                                  // bringup cpu online
>>>                                                                                  // call cpufreq_online() which tries to create sugov kthread
>>> __acquire(cpu_hotplug_lock)                                                     copy_process()
>>>                                                                                     cgroup_can_fork()
>>>                                                                                        cgroup_css_set_fork()
>>>                                                                                        __acquire(cgroup_threadgroup_rwsem)
>>> // blocks forever                        // blocks forever                            // blocks forever

Actually, the dependency can probably be encoded by calling
lockdep_acquire_cpus_lock() in cpu2 (task 2) before acquiring
cgroup_threadgroup_rwsem, to indicate the dependency between CPU1 and
CPU2 (task 1 and task 2). lockdep_release_cpus_lock() can then be called
after releasing the rwsem.
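
Very roughly, and only to show where the annotations would sit (how a
hotplug-initiated fork is recognized is hand-waved with a made-up condition):

	/*
	 * Fork path on CPU2 (task 2): tell lockdep this fork happens on
	 * behalf of a hotplug operation that already holds cpu_hotplug_lock.
	 */
	if (fork_is_for_cpu_bringup)		/* hypothetical condition */
		lockdep_acquire_cpus_lock();

	percpu_down_read(&cgroup_threadgroup_rwsem);	/* cgroup_css_set_fork() */
	/* ... */
	percpu_up_read(&cgroup_threadgroup_rwsem);

	if (fork_is_for_cpu_bringup)
		lockdep_release_cpus_lock();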

Cheers,
Longman
Xuewen Yan July 20, 2022, 7:45 a.m. UTC | #17
Dear all

On Wed, Jul 13, 2022 at 11:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > > Well, the only thing I can think of is always grabbing cpus_read_lock()
> > > before grabbing threadgroup_rwsem. Waiman, what do you think?
> >
> > That is a possible solution as cpus_read_lock() is rather lightweight. It is
> > a good practice to acquire it first.
>
> On a similar note, I think we probably should re-enable percpu operations on
> threadgroup_rwsem too by default and allow users who are affected by slower
> write path operations to opt-in. After the new CLONE_INTO_CGROUP which
> doesn't need the rwsem, we have far fewer write lockers after all.
>

If there's any patch for me to try, I would be very grateful.

Thanks!

---
xw.yan
Tejun Heo July 27, 2022, 8:09 p.m. UTC | #18
Hello,

On Wed, Jul 20, 2022 at 03:45:27PM +0800, Xuewen Yan wrote:
> Dear all
> 
> On Wed, Jul 13, 2022 at 11:20 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > > > Well, the only thing I can think of is always grabbing cpus_read_lock()
> > > > before grabbing threadgroup_rwsem. Waiman, what do you think?
> > >
> > > That is a possible solution as cpus_read_lock() is rather lightweight. It is
> > > a good practice to acquire it first.
> >
> > On a similar note, I think we probably should re-enable percpu operations on
> > threadgroup_rwsem too by default and allow users who are affected by slower
> > write path operations to opt-in. After the new CLONE_INTO_CGROUP which
> > doesn't need the rwsem, we have far fewer write lockers after all.
> >
> 
>  If there's any patch for me to try, I would be very grateful.

Can you please see whether the following patch fixes the problem?

Thanks.

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 13c8e91d78620..7caefc8af9127 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2345,6 +2345,38 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * lock_threadgroup - Stabilize threadgroups
+ *
+ * cgroup migration operations need the threadgroups stabilized against forks
+ * and exits, which can be achieved by write-locking cgroup_threadgroup_rwsem.
+ *
+ * Note that write-locking threadgroup_rwsem is nested inside CPU hotplug
+ * disable. This is because cpuset needs CPU hotplug disabled during ->attach()
+ * and bringing up a CPU may involve creating new tasks which can require
+ * read-locking threadgroup_rwsem. If we call cpuset's ->attach() with
+ * threadgroup_rwsem write-locked with hotplug enabled, we can deadlock by
+ * cpuset waiting for an on-going CPU hotplug operation which in turn is waiting
+ * for the threadgroup_rwsem to be released to create new tasks. See the
+ * following for more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ */
+static void lock_threadgroup(void)
+{
+	cpus_read_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * unlock_threadgroup - Undo lock_threadgroup
+ */
+static void unlock_threadgroup(void)
+{
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cpus_read_unlock();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2822,7 +2854,6 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 					     bool *locked)
-	__acquires(&cgroup_threadgroup_rwsem)
 {
 	struct task_struct *tsk;
 	pid_t pid;
@@ -2840,7 +2871,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 	 */
 	lockdep_assert_held(&cgroup_mutex);
 	if (pid || threadgroup) {
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+		lock_threadgroup();
 		*locked = true;
 	} else {
 		*locked = false;
@@ -2876,7 +2907,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 
 out_unlock_threadgroup:
 	if (*locked) {
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
 		*locked = false;
 	}
 out_unlock_rcu:
@@ -2885,7 +2916,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
 }
 
 void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-	__releases(&cgroup_threadgroup_rwsem)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -2894,7 +2924,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
 	put_task_struct(task);
 
 	if (locked)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
+
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -2953,7 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	lock_threadgroup();
 
 	/* look up all csses currently attached to @cgrp's subtree */
 	spin_lock_irq(&css_set_lock);
@@ -2984,7 +3015,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	unlock_threadgroup();
 	return ret;
 }
Peter Zijlstra July 27, 2022, 10:01 p.m. UTC | #19
On Tue, Jul 12, 2022 at 06:13:46AM -1000, Tejun Heo wrote:

> But now we're failing user-initiated operations randomly. I have a hard time
> seeing that as an acceptable solution. 

There (sadly) is precedent for that; grep for "cpu_hotplug_disable".
Tejun Heo July 27, 2022, 10:05 p.m. UTC | #20
Hello,

On Thu, Jul 28, 2022 at 12:01:35AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 12, 2022 at 06:13:46AM -1000, Tejun Heo wrote:
> 
> > But now we're failing user-initiated operations randomly. I have a hard time
> > seeing that as an acceptable solution. 
> 
> There (sadly) is precedent for that; grep for "cpu_hotplug_disable".

Yeah, there's another one in cpu cgroup controller attach path. We should
get better at not doing these. Low frequency arbitrary failures are really
difficult to handle reliably downstream.

Thanks.
Xuewen Yan July 29, 2022, 7:33 a.m. UTC | #21
Hi Tejun

On Thu, Jul 28, 2022 at 4:09 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
>
> Can you please see whether the following patch fixes the problem?
>

I have tested the patch. Sadly, although the original deadlock call
stack is gone, there seems to be a new problem.
I did a preliminary analysis, but not an in-depth one yet. I
haven't found the root cause, but I think I should let you know
the test results first.
Once I find the root cause, I'll report back immediately.
Thread-A is waiting for the cpu_hotplug_lock writer, but the
writer is waiting for the readers:

PID: 1106   TASK: ffffff8097e90000  CPU: 7   COMMAND: "OomAdjuster"
 #0 [ffffffc011a23850] __switch_to at ffffffc0081229b4
 #1 [ffffffc011a238b0] __schedule at ffffffc009c824f8
 #2 [ffffffc011a23910] schedule at ffffffc009c82b50
 #3 [ffffffc011a23970] percpu_rwsem_wait at ffffffc00828fbf4
 #4 [ffffffc011a239b0] __percpu_down_read at ffffffc0082901a8
 #5 [ffffffc011a239e0] cpus_read_lock at ffffffc0081dadc8
 #6 [ffffffc011a23a20] cpuset_attach at ffffffc008366f10
 #7 [ffffffc011a23a90] cgroup_migrate_execute at ffffffc00834f808
 #8 [ffffffc011a23b60] cgroup_attach_task at ffffffc0083539f0
 #9 [ffffffc011a23bd0] __cgroup1_procs_write at ffffffc00835f6e8
#10 [ffffffc011a23c30] cgroup1_tasks_write at ffffffc00835f054
#11 [ffffffc011a23c60] cgroup_file_write at ffffffc008348b2c
#12 [ffffffc011a23c90] kernfs_fop_write_iter at ffffffc008754514
#13 [ffffffc011a23d50] vfs_write at ffffffc008607ac8
#14 [ffffffc011a23db0] ksys_write at ffffffc008607714
#15 [ffffffc011a23df0] __arm64_sys_write at ffffffc00860767c
#16 [ffffffc011a23e10] invoke_syscall at ffffffc00813dcac
#17 [ffffffc011a23e30] el0_svc_common at ffffffc00813dbc4
#18 [ffffffc011a23e70] do_el0_svc at ffffffc00813daa8
#19 [ffffffc011a23e80] el0_svc at ffffffc0098828d8
#20 [ffffffc011a23ea0] el0t_64_sync_handler at ffffffc00988284c
#21 [ffffffc011a23fe0] el0t_64_sync at ffffffc008091e44

cpu_hotplug_lock = $3 = {
  rss = {
    gp_state = 2,
    gp_count = 1,
    gp_wait = {
      lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 0
                },
                {
                  locked = 0 '\000',
                  pending = 0 '\000'
                },
                {
                  locked_pending = 0,
                  tail = 0
                }
              }
            }
          }
        }
      },
      head = {
        next = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>,
        prev = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>
      }
    },
    cb_head = {
      next = 0x0,
      func = 0x0
    }
  },
  read_count = 0xffffffc00b0a0808 <__percpu_rwsem_rc_cpu_hotplug_lock>,
  writer = {
    task = 0xffffff80f303ca00
  },
  {
    waiters = {
      lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 0
                },
                {
                  locked = 0 '\000',
                  pending = 0 '\000'
                },
                {
                  locked_pending = 0,
                  tail = 0
                }
              }
            }
          }
        }
      },
      head = {
        next = 0xffffffc011a23958,
        prev = 0xffffffc01430bcb8
      }
    },
    destroy_list_entry = {
      next = 0x0,
      prev = 0xffffffc011a23958
    }
  },
  block = {
    counter = 1
  }
}

PID: 15760  TASK: ffffff80f303ca00  CPU: 5   COMMAND: "test.sh"
 #0 [ffffffc0129639a0] __switch_to at ffffffc0081229b4
 #1 [ffffffc012963a00] __schedule at ffffffc009c824f8
 #2 [ffffffc012963a60] schedule at ffffffc009c82b50
 #3 [ffffffc012963a90] percpu_down_write at ffffffc00828f9d8
 #4 [ffffffc012963ae0] _cpu_down at ffffffc009884550
 #5 [ffffffc012963b40] cpu_device_down at ffffffc0081df814
 #6 [ffffffc012963b60] cpu_subsys_offline at ffffffc008d8dd8c
 #7 [ffffffc012963b90] device_offline at ffffffc008d77124
 #8 [ffffffc012963bd0] online_store at ffffffc008d76d44
 #9 [ffffffc012963c30] dev_attr_store at ffffffc008d7ba30
#10 [ffffffc012963c60] sysfs_kf_write at ffffffc0087578d0
#11 [ffffffc012963c90] kernfs_fop_write_iter at ffffffc008754514
#12 [ffffffc012963d50] vfs_write at ffffffc008607ac8
#13 [ffffffc012963db0] ksys_write at ffffffc008607714
#14 [ffffffc012963df0] __arm64_sys_write at ffffffc00860767c
#15 [ffffffc012963e10] invoke_syscall at ffffffc00813dcac
#16 [ffffffc012963e30] el0_svc_common at ffffffc00813dbf4
#17 [ffffffc012963e70] do_el0_svc at ffffffc00813daa8
#18 [ffffffc012963e80] el0_svc at ffffffc0098828d8
#19 [ffffffc012963ea0] el0t_64_sync_handler at ffffffc00988284c
#20 [ffffffc012963fe0] el0t_64_sync at ffffffc008091e44

Thanks!
BR
Qais Yousef July 29, 2022, 8:39 a.m. UTC | #22
On 07/29/22 15:33, Xuewen Yan wrote:
> Hi Tejun
> 
> On Thu, Jul 28, 2022 at 4:09 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello,
> >
> >
> > Can you please see whether the following patch fixes the problem?
> >
> 
> I have tested the patch. Sadly, although the original deadlock call
> stack is gone, there seems to be a new problem.
> I did a preliminary analysis, but not an in-depth one yet. I
> haven't found the root cause, but I think I should let you know
> the test results first.
> Once I find the root cause, I'll report back immediately.
> Thread-A is waiting for the cpu_hotplug_lock writer, but the
> writer is waiting for the readers:

I *think* it's because we haven't removed cpus_read_lock() from
cpuset_attach(). So we end up holding the lock twice in the same path. Since we
hold it unconditionally now, we should remove cpuset dependency on
cpus_read_lock() I believe.


Cheers

--
Qais Yousef

> [...]
Tejun Heo July 29, 2022, 7:59 p.m. UTC | #23
On Fri, Jul 29, 2022 at 09:39:49AM +0100, Qais Yousef wrote:
> I *think* it's because we haven't removed cpus_read_lock() from
> cpuset_attach(). So we end up holding the lock twice in the same path. Since we
> hold it unconditionally now, we should remove cpuset dependency on
> cpus_read_lock() I believe.

Ah, yeah, that's because a pending write locker makes future read lockers
wait, so even if we're already holding the read lock, trying to read-lock
again ends up waiting. I'll make the cpus_read_lock() unconditional in cgroup
core and drop it from cpuset's attach operation.
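
(For the semantics in isolation, here is a userspace analogue, not kernel
code: with glibc's writer-preferring non-recursive rwlock, a queued writer
blocks a nested read lock taken by a thread that already holds one, which is
exactly the shape above.)

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t lock;

static void *writer(void *arg)
{
	(void)arg;
	sleep(1);			/* let main() take its first read lock */
	pthread_rwlock_wrlock(&lock);	/* queued writer, like percpu_down_write() */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_rwlockattr_t attr;
	pthread_t t;

	pthread_rwlockattr_init(&attr);
	pthread_rwlockattr_setkind_np(&attr,
			PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
	pthread_rwlock_init(&lock, &attr);

	pthread_rwlock_rdlock(&lock);		/* outer read lock */
	pthread_create(&t, NULL, writer, NULL);
	sleep(2);				/* writer is now queued */
	puts("taking nested read lock...");
	pthread_rwlock_rdlock(&lock);		/* blocks forever */
	puts("never reached");
	return 0;
}

Build with "gcc -pthread demo.c"; the second rdlock never returns.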

Thanks.
Xuewen Yan Aug. 9, 2022, 2:02 a.m. UTC | #24
Hi Tejun

On Sat, Jul 30, 2022 at 3:59 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Fri, Jul 29, 2022 at 09:39:49AM +0100, Qais Yousef wrote:
> > I *think* it's because we haven't removed cpus_read_lock() from
> > cpuset_attach(). So we end up holding the lock twice in the same path. Since we
> > hold it unconditionally now, we should remove cpuset dependency on
> > cpus_read_lock() I believe.
>
> Ah, yeah, that's because a pending write locker makes future read lockers
> wait, so even if we're already holding the read lock, trying to read-lock
> again ends up waiting. I'll make the cpus_read_lock() unconditional in cgroup
> core and drop it from cpuset's attach operation.

I reverted the following patch, which added the cpus_read_lock() in
cpuset's attach, and have been testing for a while. The deadlock has not
reproduced.

https://lore.kernel.org/all/20220121101210.84926-1-zhangqiao22@huawei.com/

But I do not know the risk of reverting the patch.

Thanks!
BR

Patch

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 285ad51caf0f..42d04a1eac20 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -8,6 +8,8 @@ 
 
 #define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)
 
+static LIST_HEAD(sugov_list);
+
 struct sugov_tunables {
 	struct gov_attr_set	attr_set;
 	unsigned int		rate_limit_us;
@@ -15,6 +17,7 @@  struct sugov_tunables {
 
 struct sugov_policy {
 	struct cpufreq_policy	*policy;
+	struct list_head	sg_policy_list;
 
 	struct sugov_tunables	*tunables;
 	struct list_head	tunables_hook;
@@ -659,6 +662,7 @@  static int sugov_init(struct cpufreq_policy *policy)
 	struct sugov_policy *sg_policy;
 	struct sugov_tunables *tunables;
 	int ret = 0;
+	int sugov_init = 0;
 
 	/* State should be equivalent to EXIT */
 	if (policy->governor_data)
@@ -666,6 +670,13 @@  static int sugov_init(struct cpufreq_policy *policy)
 
 	cpufreq_enable_fast_switch(policy);
 
+	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
+		if (sg_policy->policy == policy) {
+			sugov_init = 1;
+			goto tunables_lock;
+		}
+	}
+
 	sg_policy = sugov_policy_alloc(policy);
 	if (!sg_policy) {
 		ret = -ENOMEM;
@@ -676,6 +687,10 @@  static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto free_sg_policy;
 
+	list_add(&sg_policy->sg_policy_list, &sugov_list);
+
+tunables_lock:
+
 	mutex_lock(&global_tunables_lock);
 
 	if (global_tunables) {
@@ -717,7 +732,8 @@  static int sugov_init(struct cpufreq_policy *policy)
 	sugov_clear_global_tunables();
 
 stop_kthread:
-	sugov_kthread_stop(sg_policy);
+	if (!sugov_init)
+		sugov_kthread_stop(sg_policy);
 	mutex_unlock(&global_tunables_lock);
 
 free_sg_policy:
@@ -745,8 +761,14 @@  static void sugov_exit(struct cpufreq_policy *policy)
 
 	mutex_unlock(&global_tunables_lock);
 
+	list_for_each_entry(sg_policy, &sugov_list, sg_policy_list) {
+		if (sg_policy->policy == policy) {
+			goto out;
+		}
+	}
 	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
+out:
 	cpufreq_disable_fast_switch(policy);
 }