[v2,1/2] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug"

Message ID 20250107222236.2715883-1-yosryahmed@google.com (mailing list archive)
State New
Series [v2,1/2] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug"

Commit Message

Yosry Ahmed Jan. 7, 2025, 10:22 p.m. UTC
This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef.

Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and
CPU hotunplug") used the CPU hotplug lock in zswap compress/decompress
operations to protect against a race with CPU hotunplug making some
per-CPU resources go away.

However, zswap compress/decompress can be reached through reclaim while
the lock is held, resulting in a potential deadlock as reported by
syzbot:
======================================================
WARNING: possible circular locking dependency detected
6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Not tainted
------------------------------------------------------
kswapd0/89 is trying to acquire lock:
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: acomp_ctx_get_cpu mm/zswap.c:886 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_compress mm/zswap.c:908 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store_page mm/zswap.c:1439 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store+0xa74/0x1ba0 mm/zswap.c:1546

but task is already holding lock:
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}-{0:0}:
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        __fs_reclaim_acquire mm/page_alloc.c:3853 [inline]
        fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3867
        might_alloc include/linux/sched/mm.h:318 [inline]
        slab_pre_alloc_hook mm/slub.c:4070 [inline]
        slab_alloc_node mm/slub.c:4148 [inline]
        __kmalloc_cache_node_noprof+0x40/0x3a0 mm/slub.c:4337
        kmalloc_node_noprof include/linux/slab.h:924 [inline]
        alloc_worker kernel/workqueue.c:2638 [inline]
        create_worker+0x11b/0x720 kernel/workqueue.c:2781
        workqueue_prepare_cpu+0xe3/0x170 kernel/workqueue.c:6628
        cpuhp_invoke_callback+0x48d/0x830 kernel/cpu.c:194
        __cpuhp_invoke_callback_range kernel/cpu.c:965 [inline]
        cpuhp_invoke_callback_range kernel/cpu.c:989 [inline]
        cpuhp_up_callbacks kernel/cpu.c:1020 [inline]
        _cpu_up+0x2b3/0x580 kernel/cpu.c:1690
        cpu_up+0x184/0x230 kernel/cpu.c:1722
        cpuhp_bringup_mask+0xdf/0x260 kernel/cpu.c:1788
        cpuhp_bringup_cpus_parallel+0xf9/0x160 kernel/cpu.c:1878
        bringup_nonboot_cpus+0x2b/0x50 kernel/cpu.c:1892
        smp_init+0x34/0x150 kernel/smp.c:1009
        kernel_init_freeable+0x417/0x5d0 init/main.c:1569
        kernel_init+0x1d/0x2b0 init/main.c:1466
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

-> #0 (cpu_hotplug_lock){++++}-{0:0}:
        check_prev_add kernel/locking/lockdep.c:3161 [inline]
        check_prevs_add kernel/locking/lockdep.c:3280 [inline]
        validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
        __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
        cpus_read_lock+0x42/0x150 kernel/cpu.c:490
        acomp_ctx_get_cpu mm/zswap.c:886 [inline]
        zswap_compress mm/zswap.c:908 [inline]
        zswap_store_page mm/zswap.c:1439 [inline]
        zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
        swap_writepage+0x647/0xce0 mm/page_io.c:279
        shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
        pageout mm/vmscan.c:696 [inline]
        shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
        shrink_inactive_list mm/vmscan.c:1967 [inline]
        shrink_list mm/vmscan.c:2205 [inline]
        shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
        mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
        mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
        memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
        balance_pgdat mm/vmscan.c:6975 [inline]
        kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
        kthread+0x2f0/0x390 kernel/kthread.c:389
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(cpu_hotplug_lock);
                               lock(fs_reclaim);
  rlock(cpu_hotplug_lock);

 *** DEADLOCK ***

1 lock held by kswapd0/89:
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

stack backtrace:
CPU: 0 UID: 0 PID: 89 Comm: kswapd0 Not tainted 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
Call Trace:
 <TASK>
  __dump_stack lib/dump_stack.c:94 [inline]
  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
  print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074
  check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206
  check_prev_add kernel/locking/lockdep.c:3161 [inline]
  check_prevs_add kernel/locking/lockdep.c:3280 [inline]
  validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
  __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
  percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
  cpus_read_lock+0x42/0x150 kernel/cpu.c:490
  acomp_ctx_get_cpu mm/zswap.c:886 [inline]
  zswap_compress mm/zswap.c:908 [inline]
  zswap_store_page mm/zswap.c:1439 [inline]
  zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
  swap_writepage+0x647/0xce0 mm/page_io.c:279
  shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
  pageout mm/vmscan.c:696 [inline]
  shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
  shrink_inactive_list mm/vmscan.c:1967 [inline]
  shrink_list mm/vmscan.c:2205 [inline]
  shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
  mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
  mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
  memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
  balance_pgdat mm/vmscan.c:6975 [inline]
  kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
  kthread+0x2f0/0x390 kernel/kthread.c:389
  ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
 </TASK>

Revert the change. A different fix for the race with CPU hotunplug will
follow.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---

The patches apply on top of mm-hotfixes-unstable and are meant for
v6.13.

Andrew, I am not sure what the best way to handle this is. The original
fix is already merged into Linus's tree and was tagged CC:stable, so I
thought it best to revert it and replace it with a separate fix that
would be easy to backport, rather than backporting the revert,
especially since the new fix is functionally different anyway.

v1 -> v2:
- Disable migration as an alternative fix instead of SRCU, and explain
  why SRCU and cpus_read_lock() cannot be used in the commit log of
  patch 2 (a rough sketch of this approach is included below).
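
To illustrate the direction, a rough sketch of the migration-disabling
approach is below (helper names and details are illustrative; patch 2
itself is the authoritative fix):

    /*
     * Rough sketch only: migrate_disable() pins the task to the
     * current CPU, so the raw_cpu_ptr() result stays stable without
     * taking cpu_hotplug_lock in the reclaim path.  The hotunplug
     * teardown still has to synchronize against in-flight users
     * (e.g. via acomp_ctx->mutex); see patch 2 for the details.
     */
    static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
    {
            migrate_disable();
            return raw_cpu_ptr(acomp_ctx);
    }

    static void acomp_ctx_put_cpu(void)
    {
            migrate_enable();
    }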

---
 mm/zswap.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

Comments

Barry Song Jan. 7, 2025, 11:01 p.m. UTC | #1
On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef.
>
> Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and
> CPU hotunplug") used the CPU hotplug lock in zswap compress/decompress
> operations to protect against a race with CPU hotunplug making some
> per-CPU resources go away.
>
> However, zswap compress/decompress can be reached through reclaim while
> the lock is held, resulting in a potential deadlock as reported by
> syzbot:
> [... lockdep report trimmed; see the full splat in the patch above ...]
>
> which lock already depends on the new lock.

We have functions like percpu_is_write_locked(),
percpu_is_read_locked(), and cpus_read_trylock().
Could they help prevent circular locking dependencies if we perform a
check before acquiring the lock?

> [... remainder of the quoted patch trimmed ...]

Thanks
barry
Yosry Ahmed Jan. 7, 2025, 11:39 p.m. UTC | #2
On Tue, Jan 7, 2025 at 3:01 PM Barry Song <baohua@kernel.org> wrote:
>
> On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > [... quoted patch and lockdep report trimmed ...]
> >
> > which lock already depends on the new lock.
>
> We have functions like percpu_is_write_locked(),
> percpu_is_read_locked(), and cpus_read_trylock().
> Could they help prevent circular locking dependencies if we perform a
> check before acquiring the lock?

Yeah, we can do that, but it feels a bit hacky; we may have to
unnecessarily fail the operation in some cases, right? Not sure tbh.
Barry Song Jan. 8, 2025, 12:34 a.m. UTC | #3
On Wed, Jan 8, 2025 at 12:39 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jan 7, 2025 at 3:01 PM Barry Song <baohua@kernel.org> wrote:
> >
> > On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > [... quoted patch and lockdep report trimmed ...]
> > >
> > > which lock already depends on the new lock.
> >
> > We have functions like percpu_is_write_locked(),
> > percpu_is_read_locked(), and cpus_read_trylock().
> > Could they help prevent circular locking dependencies if we perform a
> > check before acquiring the lock?
>
> Yeah we can do that but it feels a bit hacky, we may have to
> unnecessarily fail the operation in some cases, right? Not sure tbh.

Not sure if it can be as simple as the following:

    locked = cpus_read_trylock();
    ....
    if (locked)
        cpus_read_unlock();

If this works, it seems better than migrate_disable(), which could affect
the scheduler's select_rq, especially given that swap is a hot path :-)
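
For concreteness, a fleshed-out version of that sketch (illustrative
only, not code from this thread) could look like:

    /*
     * Illustrative expansion of the sketch above: the compress path
     * proceeds either way, and the hotplug lock is dropped only if it
     * was actually taken.  The hole is that when the trylock fails
     * because a hotplug writer is active, the per-CPU ctx could still
     * be freed mid-use.
     */
    bool locked = cpus_read_trylock();

    acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
    mutex_lock(&acomp_ctx->mutex);
    /* ... compression work as before ... */
    mutex_unlock(&acomp_ctx->mutex);

    if (locked)
            cpus_read_unlock();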

Thanks
Barry
Yosry Ahmed Jan. 8, 2025, 12:54 a.m. UTC | #4
On Tue, Jan 7, 2025 at 4:34 PM Barry Song <baohua@kernel.org> wrote:
>
> On Wed, Jan 8, 2025 at 12:39 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Jan 7, 2025 at 3:01 PM Barry Song <baohua@kernel.org> wrote:
> > >
> > > On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > [... quoted patch and lockdep report trimmed ...]
> > > >
> > > > which lock already depends on the new lock.
> > >
> > > We have functions like percpu_is_write_locked(),
> > > percpu_is_read_locked(), and cpus_read_trylock().
> > > Could they help prevent circular locking dependencies if we perform a
> > > check before acquiring the lock?
> >
> > Yeah we can do that but it feels a bit hacky, we may have to
> > unnecessarily fail the operation in some cases, right? Not sure tbh.
>
> Not sure if it can be as simple as the following:
>
>     locked = cpus_read_trylock();
>     ....
>     if (locked)
>         cpus_read_unlock();
>
> If this works, it seems better than migrate_disable(), which could affect
> the scheduler's select_rq especially given that swap is a hot path :-)

I didn't look too closely into this, but I'd prefer the simpler fix
unless it causes any noticeable regressions. Unless others are also
concerned about disabling migration...

Barry Song Jan. 8, 2025, 1:11 a.m. UTC | #5
On Wed, Jan 8, 2025 at 1:54 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jan 7, 2025 at 4:34 PM Barry Song <baohua@kernel.org> wrote:
> >
> > On Wed, Jan 8, 2025 at 12:39 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Tue, Jan 7, 2025 at 3:01 PM Barry Song <baohua@kernel.org> wrote:
> > > >
> > > > On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > > >
> > > > > [... quoted patch and lockdep report trimmed ...]
> > > > >
> > > > > which lock already depends on the new lock.
> > > >
> > > > We have functions like percpu_is_write_locked(),
> > > > percpu_is_read_locked(), and cpus_read_trylock().
> > > > Could they help prevent circular locking dependencies if we perform a
> > > > check before acquiring the lock?
> > >
> > > Yeah we can do that but it feels a bit hacky, we may have to
> > > unnecessarily fail the operation in some cases, right? Not sure tbh.
> >
> > Not sure if it can be as simple as the following:
> >
> >     locked = cpus_read_trylock();
> >     ....
> >     if (locked)
> >         cpus_read_unlock();
> >
> > If this works, it seems better than migrate_disable(), which could affect
> > the scheduler's select_rq especially given that swap is a hot path :-)
>
> I didn't look too closely into this, but I'd prefer the simpler fix
> unless it causes any noticeable regressions. Unless others are also
> concerned about disabling migration..

Okay, fair enough. It could be hacky, as there's a chance that a write
lock could be acquired by someone else. Waiman's initial patchset to fix
the same issue had an ugly sleep/retry mechanism:

https://lore.kernel.org/all/1532368179-15263-1-git-send-email-longman@redhat.com/
https://lore.kernel.org/all/1532368179-15263-3-git-send-email-longman@redhat.com/
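
For reference, the general shape of such a sleep/retry workaround is
something like the following (a generic illustration, not Waiman's
actual code):

    /* Keep retrying until the hotplug read lock can be taken. */
    while (!cpus_read_trylock())
            msleep(10);
    /* ... work that needs hotplug protection ... */
    cpus_read_unlock();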

Thanks
Barry

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 5a27af8d86ea9..f6316b66fb236 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -880,18 +880,6 @@  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
-/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
-{
-	cpus_read_lock();
-	return raw_cpu_ptr(acomp_ctx);
-}
-
-static void acomp_ctx_put_cpu(void)
-{
-	cpus_read_unlock();
-}
-
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -905,7 +893,8 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+
 	mutex_lock(&acomp_ctx->mutex);
 
 	dst = acomp_ctx->buffer;
@@ -961,7 +950,6 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		zswap_reject_alloc_fail++;
 
 	mutex_unlock(&acomp_ctx->mutex);
-	acomp_ctx_put_cpu();
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -972,7 +960,7 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src;
 
-	acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
@@ -1002,7 +990,6 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
-	acomp_ctx_put_cpu();
 }
 
 /*********************************