diff mbox series

[RESEND,1/2] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug"

Message ID 20250107074724.1756696-1-yosryahmed@google.com (mailing list archive)
State New
Headers show
Series [RESEND,1/2] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug" | expand

Commit Message

Yosry Ahmed Jan. 7, 2025, 7:47 a.m. UTC
This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef.

Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and
CPU hotunplug") used the CPU hotplug lock in zswap compress/decompress
operations to protect against a race with CPU hotunplug making some
per-CPU resources go away.

However, zswap compress/decompress can be reached through reclaim while
the lock is held, resulting in a potential deadlock as reported by
syzbot:
======================================================
WARNING: possible circular locking dependency detected
6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Not tainted
------------------------------------------------------
kswapd0/89 is trying to acquire lock:
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: acomp_ctx_get_cpu mm/zswap.c:886 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_compress mm/zswap.c:908 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store_page mm/zswap.c:1439 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store+0xa74/0x1ba0 mm/zswap.c:1546

but task is already holding lock:
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}-{0:0}:
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        __fs_reclaim_acquire mm/page_alloc.c:3853 [inline]
        fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3867
        might_alloc include/linux/sched/mm.h:318 [inline]
        slab_pre_alloc_hook mm/slub.c:4070 [inline]
        slab_alloc_node mm/slub.c:4148 [inline]
        __kmalloc_cache_node_noprof+0x40/0x3a0 mm/slub.c:4337
        kmalloc_node_noprof include/linux/slab.h:924 [inline]
        alloc_worker kernel/workqueue.c:2638 [inline]
        create_worker+0x11b/0x720 kernel/workqueue.c:2781
        workqueue_prepare_cpu+0xe3/0x170 kernel/workqueue.c:6628
        cpuhp_invoke_callback+0x48d/0x830 kernel/cpu.c:194
        __cpuhp_invoke_callback_range kernel/cpu.c:965 [inline]
        cpuhp_invoke_callback_range kernel/cpu.c:989 [inline]
        cpuhp_up_callbacks kernel/cpu.c:1020 [inline]
        _cpu_up+0x2b3/0x580 kernel/cpu.c:1690
        cpu_up+0x184/0x230 kernel/cpu.c:1722
        cpuhp_bringup_mask+0xdf/0x260 kernel/cpu.c:1788
        cpuhp_bringup_cpus_parallel+0xf9/0x160 kernel/cpu.c:1878
        bringup_nonboot_cpus+0x2b/0x50 kernel/cpu.c:1892
        smp_init+0x34/0x150 kernel/smp.c:1009
        kernel_init_freeable+0x417/0x5d0 init/main.c:1569
        kernel_init+0x1d/0x2b0 init/main.c:1466
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

-> #0 (cpu_hotplug_lock){++++}-{0:0}:
        check_prev_add kernel/locking/lockdep.c:3161 [inline]
        check_prevs_add kernel/locking/lockdep.c:3280 [inline]
        validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
        __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
        cpus_read_lock+0x42/0x150 kernel/cpu.c:490
        acomp_ctx_get_cpu mm/zswap.c:886 [inline]
        zswap_compress mm/zswap.c:908 [inline]
        zswap_store_page mm/zswap.c:1439 [inline]
        zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
        swap_writepage+0x647/0xce0 mm/page_io.c:279
        shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
        pageout mm/vmscan.c:696 [inline]
        shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
        shrink_inactive_list mm/vmscan.c:1967 [inline]
        shrink_list mm/vmscan.c:2205 [inline]
        shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
        mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
        mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
        memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
        balance_pgdat mm/vmscan.c:6975 [inline]
        kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
        kthread+0x2f0/0x390 kernel/kthread.c:389
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(cpu_hotplug_lock);
                               lock(fs_reclaim);
  rlock(cpu_hotplug_lock);

 *** DEADLOCK ***

1 lock held by kswapd0/89:
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

stack backtrace:
CPU: 0 UID: 0 PID: 89 Comm: kswapd0 Not tainted 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
Call Trace:
 <TASK>
  __dump_stack lib/dump_stack.c:94 [inline]
  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
  print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074
  check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206
  check_prev_add kernel/locking/lockdep.c:3161 [inline]
  check_prevs_add kernel/locking/lockdep.c:3280 [inline]
  validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
  __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
  percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
  cpus_read_lock+0x42/0x150 kernel/cpu.c:490
  acomp_ctx_get_cpu mm/zswap.c:886 [inline]
  zswap_compress mm/zswap.c:908 [inline]
  zswap_store_page mm/zswap.c:1439 [inline]
  zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
  swap_writepage+0x647/0xce0 mm/page_io.c:279
  shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
  pageout mm/vmscan.c:696 [inline]
  shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
  shrink_inactive_list mm/vmscan.c:1967 [inline]
  shrink_list mm/vmscan.c:2205 [inline]
  shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
  mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
  mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
  memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
  balance_pgdat mm/vmscan.c:6975 [inline]
  kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
  kthread+0x2f0/0x390 kernel/kthread.c:389
  ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
 </TASK>

Revert the change. A different fix for the race with CPU hotunplug will
follow.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---

This is a resend because I screwed up sending these two patches the
first time and included the patch getting reverted as well. The patches
apply on top of mm-hotfixes-unstable and are meant for v6.13.

Andrew, I am not sure what's the best way to handle this. This fix is
already merged into Linus's tree and had CC:stable, so I thought it's
best to revert it and replace it with a separate fix that would be easy
to backport instead of the revert patch, especially that functionally
the new fix is completely different anyway.

Does the revert automatically signal the stable maintainers to drop it?
Do we need to add CC:stable to this revert as well?

---
 mm/zswap.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 5a27af8d86ea9..f6316b66fb236 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -880,18 +880,6 @@  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
-/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
-{
-	cpus_read_lock();
-	return raw_cpu_ptr(acomp_ctx);
-}
-
-static void acomp_ctx_put_cpu(void)
-{
-	cpus_read_unlock();
-}
-
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -905,7 +893,8 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+
 	mutex_lock(&acomp_ctx->mutex);
 
 	dst = acomp_ctx->buffer;
@@ -961,7 +950,6 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		zswap_reject_alloc_fail++;
 
 	mutex_unlock(&acomp_ctx->mutex);
-	acomp_ctx_put_cpu();
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -972,7 +960,7 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src;
 
-	acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
@@ -1002,7 +990,6 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
-	acomp_ctx_put_cpu();
 }
 
 /*********************************