
[v4,29/35] mm: slub: Move flush_cpu_slab() invocations and __free_slab() invocations out of IRQ context

Message ID 20210805152000.12817-30-vbabka@suse.cz (mailing list archive)
State New
Series SLUB: reduce irq disabled scope and make it RT compatible

Commit Message

Vlastimil Babka Aug. 5, 2021, 3:19 p.m. UTC
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

flush_all() flushes a specific SLAB cache on each CPU (where the cache
is present). The deactivate_slab()/__free_slab() invocation happens
within an IPI handler and is problematic for PREEMPT_RT.

The flush operation is not a frequent operation or a hot path. The
per-CPU flush operation can therefore be moved into a workqueue.

[vbabka@suse.cz: adapt to new SLUB changes]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 8 deletions(-)
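
For context, the replacement mechanism is the standard per-CPU workqueue
pattern: queue a work item on every CPU, then wait for each one to finish,
instead of running the flush inside an on_each_cpu_cond() IPI. A minimal
sketch of that pattern (illustrative only: the my_* names and the
do_flush() helper are hypothetical, and the real patch additionally skips
CPUs that have nothing to flush):

#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_flush_work {
	struct work_struct work;
	struct kmem_cache *s;
};
static DEFINE_PER_CPU(struct my_flush_work, my_flush);

static void my_flush_fn(struct work_struct *w)
{
	struct my_flush_work *fw = container_of(w, struct my_flush_work, work);

	/*
	 * Runs in process context on the target CPU, so it may sleep;
	 * that is what makes it PREEMPT_RT friendly, unlike an IPI.
	 */
	do_flush(fw->s);	/* hypothetical stand-in for the real flush */
}

static void my_flush_all(struct kmem_cache *s)
{
	unsigned int cpu;

	cpus_read_lock();	/* keep the set of online CPUs stable */
	for_each_online_cpu(cpu) {
		struct my_flush_work *fw = &per_cpu(my_flush, cpu);

		INIT_WORK(&fw->work, my_flush_fn);
		fw->s = s;
		schedule_work_on(cpu, &fw->work);	/* queue, don't IPI */
	}
	for_each_online_cpu(cpu)
		flush_work(&per_cpu(my_flush, cpu).work);	/* wait for all */
	cpus_read_unlock();
}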

Comments

Qian Cai Aug. 9, 2021, 1:41 p.m. UTC | #1
On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
> From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> 
> flush_all() flushes a specific SLAB cache on each CPU (where the cache
> is present). The deactivate_slab()/__free_slab() invocation happens
> within an IPI handler and is problematic for PREEMPT_RT.
> 
> The flush operation is not a frequent operation or a hot path. The
> per-CPU flush operation can therefore be moved into a workqueue.
> 
> [vbabka@suse.cz: adapt to new SLUB changes]
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 48 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index dceb289cb052..da48ada3d17f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2513,33 +2513,73 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
>  	unfreeze_partials_cpu(s, c);
>  }
>  
> +struct slub_flush_work {
> +	struct work_struct work;
> +	struct kmem_cache *s;
> +	bool skip;
> +};
> +
>  /*
>   * Flush cpu slab.
>   *
> - * Called from IPI handler with interrupts disabled.
> + * Called from CPU work handler with migration disabled.
>   */
> -static void flush_cpu_slab(void *d)
> +static void flush_cpu_slab(struct work_struct *w)
>  {
> -	struct kmem_cache *s = d;
> -	struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
> +	struct kmem_cache *s;
> +	struct kmem_cache_cpu *c;
> +	struct slub_flush_work *sfw;
> +
> +	sfw = container_of(w, struct slub_flush_work, work);
> +
> +	s = sfw->s;
> +	c = this_cpu_ptr(s->cpu_slab);
>  
>  	if (c->page)
> -		flush_slab(s, c, false);
> +		flush_slab(s, c, true);
>  
>  	unfreeze_partials(s);
>  }
>  
> -static bool has_cpu_slab(int cpu, void *info)
> +static bool has_cpu_slab(int cpu, struct kmem_cache *s)
>  {
> -	struct kmem_cache *s = info;
>  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
>  
>  	return c->page || slub_percpu_partial(c);
>  }
>  
> +static DEFINE_MUTEX(flush_lock);
> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> +
>  static void flush_all(struct kmem_cache *s)
>  {
> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> +	struct slub_flush_work *sfw;
> +	unsigned int cpu;
> +
> +	mutex_lock(&flush_lock);

Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:

slab_mutex -> flush_lock

[   91.374541] WARNING: possible circular locking dependency detected
[   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
[   91.387149] ------------------------------------------------------
[   91.394016] lsbug/1523 is trying to acquire lock:
[   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
[   91.407425] 
               but task is already holding lock:
[   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
[   91.423603] 
               which lock already depends on the new lock.

[   91.433854] 
               the existing dependency chain (in reverse order) is:
[   91.442715] 
               -> #4 (slab_mutex){+.+.}-{3:3}:
[   91.449766]        __lock_acquire+0xb0c/0x1aa8
[   91.454901]        lock_acquire+0x34c/0xb20
[   91.459773]        __mutex_lock+0x194/0x1470
[   91.464732]        mutex_lock_nested+0x6c/0xc0
[   91.469864]        slab_memory_callback+0x44/0x280
[   91.475344]        blocking_notifier_call_chain+0xd0/0x138
[   91.481519]        memory_notify+0x28/0x38
[   91.486304]        offline_pages+0x2cc/0xce4
[   91.491262]        memory_subsys_offline+0xd8/0x280
[   91.496827]        device_offline+0x154/0x1e0
[   91.501872]        online_store+0xa4/0x118
[   91.506656]        dev_attr_store+0x44/0x78
[   91.511527]        sysfs_kf_write+0xe8/0x138
[   91.516485]        kernfs_fop_write_iter+0x26c/0x3d0
[   91.522138]        new_sync_write+0x2bc/0x4f8
[   91.527185]        vfs_write+0x718/0xc88
[   91.531795]        ksys_write+0xf8/0x1e0
[   91.536404]        __arm64_sys_write+0x74/0xa8
[   91.541535]        invoke_syscall.constprop.0+0xdc/0x1d8
[   91.547536]        do_el0_svc+0xe4/0x2a8
[   91.552146]        el0_svc+0x64/0x130
[   91.556498]        el0t_64_sync_handler+0xb0/0xb8
[   91.561889]        el0t_64_sync+0x180/0x184
[   91.566760] 
               -> #3 ((memory_chain).rwsem){++++}-{3:3}:
[   91.574680]        __lock_acquire+0xb0c/0x1aa8
[   91.579814]        lock_acquire+0x34c/0xb20
[   91.584685]        down_read+0xf0/0x488
[   91.589210]        blocking_notifier_call_chain+0x58/0x138
[   91.595383]        memory_notify+0x28/0x38
[   91.600167]        offline_pages+0x2cc/0xce4
[   91.605124]        memory_subsys_offline+0xd8/0x280
[   91.610689]        device_offline+0x154/0x1e0
[   91.615734]        online_store+0xa4/0x118
[   91.620518]        dev_attr_store+0x44/0x78
[   91.625388]        sysfs_kf_write+0xe8/0x138
[   91.630346]        kernfs_fop_write_iter+0x26c/0x3d0
[   91.635997]        new_sync_write+0x2bc/0x4f8
[   91.641043]        vfs_write+0x718/0xc88
[   91.645652]        ksys_write+0xf8/0x1e0
[   91.650262]        __arm64_sys_write+0x74/0xa8
[   91.655393]        invoke_syscall.constprop.0+0xdc/0x1d8
[   91.661394]        do_el0_svc+0xe4/0x2a8
[   91.666004]        el0_svc+0x64/0x130
[   91.670355]        el0t_64_sync_handler+0xb0/0xb8
[   91.675747]        el0t_64_sync+0x180/0x184
[   91.680617] 
               -> #2 (pcp_batch_high_lock){+.+.}-{3:3}:
[   91.688449]        __lock_acquire+0xb0c/0x1aa8
[   91.693582]        lock_acquire+0x34c/0xb20
[   91.698452]        __mutex_lock+0x194/0x1470
[   91.703410]        mutex_lock_nested+0x6c/0xc0
[   91.708541]        zone_pcp_update+0x3c/0x68
[   91.713500]        page_alloc_cpu_online+0x64/0x90
[   91.718978]        cpuhp_invoke_callback+0x588/0x2ba8
[   91.724718]        cpuhp_invoke_callback_range+0xa4/0x108
[   91.730804]        cpu_up+0x598/0xb78
[   91.735154]        bringup_nonboot_cpus+0x110/0x168
[   91.740719]        smp_init+0x4c/0xe0
[   91.745070]        kernel_init_freeable+0x554/0x7c8
[   91.750637]        kernel_init+0x2c/0x140
[   91.755334]        ret_from_fork+0x10/0x20
[   91.760118] 
               -> #1 (cpu_hotplug_lock){++++}-{0:0}:
[   91.767688]        __lock_acquire+0xb0c/0x1aa8
[   91.772820]        lock_acquire+0x34c/0xb20
[   91.777691]        cpus_read_lock+0x98/0x308
[   91.782649]        flush_all+0x54/0x1c8
[   91.787173]        __kmem_cache_shrink+0x38/0x2f0
[   91.792566]        kmem_cache_shrink+0x28/0x38
[   91.797699]        acpi_os_purge_cache+0x18/0x28
[   91.803006]        acpi_purge_cached_objects+0x44/0xdc
[   91.808832]        acpi_initialize_objects+0x24/0x88
[   91.814487]        acpi_bus_init+0xe0/0x47c
[   91.819357]        acpi_init+0x130/0x27c
[   91.823967]        do_one_initcall+0x180/0xbe8
[   91.829098]        kernel_init_freeable+0x710/0x7c8
[   91.834663]        kernel_init+0x2c/0x140
[   91.839360]        ret_from_fork+0x10/0x20
[   91.844143] 
               -> #0 (flush_lock){+.+.}-{3:3}:
[   91.851193]        check_prev_add+0x194/0x1170
[   91.856326]        validate_chain+0xfe8/0x1c20
[   91.861458]        __lock_acquire+0xb0c/0x1aa8
[   91.866589]        lock_acquire+0x34c/0xb20
[   91.871460]        __mutex_lock+0x194/0x1470
[   91.876418]        mutex_lock_nested+0x6c/0xc0
[   91.881549]        flush_all+0x50/0x1c8
[   91.886072]        __kmem_cache_shrink+0x38/0x2f0
[   91.891465]        slab_memory_callback+0x68/0x280
[   91.896943]        blocking_notifier_call_chain+0xd0/0x138
[   91.903117]        memory_notify+0x28/0x38
[   91.907901]        offline_pages+0x2cc/0xce4
[   91.912859]        memory_subsys_offline+0xd8/0x280
[   91.918424]        device_offline+0x154/0x1e0
[   91.923470]        online_store+0xa4/0x118
[   91.928254]        dev_attr_store+0x44/0x78
[   91.933125]        sysfs_kf_write+0xe8/0x138
[   91.938083]        kernfs_fop_write_iter+0x26c/0x3d0
[   91.943735]        new_sync_write+0x2bc/0x4f8
[   91.948781]        vfs_write+0x718/0xc88
[   91.953391]        ksys_write+0xf8/0x1e0
[   91.958000]        __arm64_sys_write+0x74/0xa8
[   91.963130]        invoke_syscall.constprop.0+0xdc/0x1d8
[   91.969131]        do_el0_svc+0xe4/0x2a8
[   91.973741]        el0_svc+0x64/0x130
[   91.978093]        el0t_64_sync_handler+0xb0/0xb8
[   91.983484]        el0t_64_sync+0x180/0x184
[   91.988354] 
               other info that might help us debug this:

[   91.998431] Chain exists of:
                 flush_lock --> (memory_chain).rwsem --> slab_mutex

[   92.010867]  Possible unsafe locking scenario:

[   92.018166]        CPU0                    CPU1
[   92.023380]        ----                    ----
[   92.028595]   lock(slab_mutex);
[   92.032425]                                lock((memory_chain).rwsem);
[   92.039641]                                lock(slab_mutex);
[   92.045989]   lock(flush_lock);
[   92.049819] 
                *** DEADLOCK ***

[   92.057811] 10 locks held by lsbug/1523:
[   92.062420]  #0: ffff0000505a8430 (sb_writers#6){.+.+}-{0:0}, at: ksys_write+0xf8/0x1e0
[   92.071128]  #1: ffff000870f99e88 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x1dc/0x3d0
[   92.080701]  #2: ffff0000145b2ab8 (kn->active#175){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x1f8/0x3d0
[   92.090623]  #3: ffff800018f84f08 (device_hotplug_lock){+.+.}-{3:3}, at: lock_device_hotplug_sysfs+0x24/0x88
[   92.101151]  #4: ffff0000145e9190 (&dev->mutex){....}-{3:3}, at: device_offline+0xa0/0x1e0
[   92.110115]  #5: ffff800011d26450 (cpu_hotplug_lock){++++}-{0:0}, at: offline_pages+0x10c/0xce4
[   92.119514]  #6: ffff800018e60570 (mem_hotplug_lock){++++}-{0:0}, at: offline_pages+0x11c/0xce4
[   92.128919]  #7: ffff800018e5bb68 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x30/0x60
[   92.138668]  #8: ffff800018fa0610 ((memory_chain).rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x58/0x138
[   92.149633]  #9: ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
[   92.159033] 
               stack backtrace:
[   92.164772] CPU: 29 PID: 1523 Comm: lsbug Not tainted 5.14.0-rc5-next-20210809+ #84
[   92.173116] Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 1.6 06/28/2020
[   92.181631] Call trace:
[   92.184763]  dump_backtrace+0x0/0x3b8
[   92.189115]  show_stack+0x20/0x30
[   92.193118]  dump_stack_lvl+0x8c/0xb8
[   92.197469]  dump_stack+0x1c/0x38
[   92.201472]  print_circular_bug.isra.0+0x530/0x540
[   92.206953]  check_noncircular+0x27c/0x2f0
[   92.211738]  check_prev_add+0x194/0x1170
[   92.216349]  validate_chain+0xfe8/0x1c20
[   92.220961]  __lock_acquire+0xb0c/0x1aa8
[   92.225571]  lock_acquire+0x34c/0xb20
[   92.229921]  __mutex_lock+0x194/0x1470
[   92.234358]  mutex_lock_nested+0x6c/0xc0
[   92.238968]  flush_all+0x50/0x1c8
flush_all at /usr/src/linux-next/mm/slub.c:2649
[   92.242971]  __kmem_cache_shrink+0x38/0x2f0
[   92.247842]  slab_memory_callback+0x68/0x280
slab_mem_going_offline_callback at /usr/src/linux-next/mm/slub.c:4586
(inlined by) slab_memory_callback at /usr/src/linux-next/mm/slub.c:4678
[   92.252800]  blocking_notifier_call_chain+0xd0/0x138
notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:83
(inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:337
(inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:325
[   92.258453]  memory_notify+0x28/0x38
[   92.262717]  offline_pages+0x2cc/0xce4
[   92.267153]  memory_subsys_offline+0xd8/0x280
[   92.272198]  device_offline+0x154/0x1e0
[   92.276723]  online_store+0xa4/0x118
[   92.280986]  dev_attr_store+0x44/0x78
[   92.285336]  sysfs_kf_write+0xe8/0x138
[   92.289774]  kernfs_fop_write_iter+0x26c/0x3d0
[   92.294906]  new_sync_write+0x2bc/0x4f8
[   92.299431]  vfs_write+0x718/0xc88
[   92.303520]  ksys_write+0xf8/0x1e0
[   92.307608]  __arm64_sys_write+0x74/0xa8
[   92.312219]  invoke_syscall.constprop.0+0xdc/0x1d8
[   92.317698]  do_el0_svc+0xe4/0x2a8
[   92.321789]  el0_svc+0x64/0x130
[   92.325619]  el0t_64_sync_handler+0xb0/0xb8
[   92.330489]  el0t_64_sync+0x180/0x184

> +	cpus_read_lock();
> +
> +	for_each_online_cpu(cpu) {
> +		sfw = &per_cpu(slub_flush, cpu);
> +		if (!has_cpu_slab(cpu, s)) {
> +			sfw->skip = true;
> +			continue;
> +		}
> +		INIT_WORK(&sfw->work, flush_cpu_slab);
> +		sfw->skip = false;
> +		sfw->s = s;
> +		schedule_work_on(cpu, &sfw->work);
> +	}
> +
> +	for_each_online_cpu(cpu) {
> +		sfw = &per_cpu(slub_flush, cpu);
> +		if (sfw->skip)
> +			continue;
> +		flush_work(&sfw->work);
> +	}
> +
> +	cpus_read_unlock();
> +	mutex_unlock(&flush_lock);
>  }
>  
>  /*
>
Mike Galbraith Aug. 9, 2021, 6:44 p.m. UTC | #2
On Mon, 2021-08-09 at 09:41 -0400, Qian Cai wrote:
> 
> 
> On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
> > 
> >  
> > +static DEFINE_MUTEX(flush_lock);
> > +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> > +
> >  static void flush_all(struct kmem_cache *s)
> >  {
> > -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> > +       struct slub_flush_work *sfw;
> > +       unsigned int cpu;
> > +
> > +       mutex_lock(&flush_lock);
> 
> Vlastimil, taking the lock here could trigger a warning during memory
> offline/online due to the locking order:
> 
> slab_mutex -> flush_lock

Bugger.  That chain ending with cpu_hotplug_lock makes slub_cpu_dead()
taking slab_mutex a non-starter for cpu hotplug as well.  It's
established early by kernel_init_freeable()..kmem_cache_destroy() as
well as by slab_mem_going_offline_callback().
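
(For illustration, the ordering that avoids this inversion takes the
cpu_hotplug_lock read side before slab_mutex, matching the
cpu hotplug -> slub_cpu_dead() order; the fix adopted later in the thread
moves cpus_read_lock() outside slab_mutex in kmem_cache_destroy() in
exactly this way. A minimal sketch, not actual kernel code:)

	cpus_read_lock();		/* cpu_hotplug_lock read side first... */
	mutex_lock(&slab_mutex);	/* ...then slab_mutex, same order as
					 * cpu hotplug -> slub_cpu_dead() */

	/* ... shutdown / free work ... */

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();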

> [ lockdep splat, identical to the one quoted above, and remainder of
> the quoted patch snipped ]
Vlastimil Babka Aug. 9, 2021, 8:08 p.m. UTC | #3
On 8/9/2021 8:44 PM, Mike Galbraith wrote:
> On Mon, 2021-08-09 at 09:41 -0400, Qian Cai wrote:
>>
>>
>> On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
>>>
>>>  
>>> +static DEFINE_MUTEX(flush_lock);
>>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
>>> +
>>>  static void flush_all(struct kmem_cache *s)
>>>  {
>>> -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>>> +       struct slub_flush_work *sfw;
>>> +       unsigned int cpu;
>>> +
>>> +       mutex_lock(&flush_lock);
>>
>> Vlastimil, taking the lock here could trigger a warning during memory
>> offline/online due to the locking order:
>>
>> slab_mutex -> flush_lock
> 
> Bugger.  That chain ending with cpu_hotplug_lock makes slub_cpu_dead()
> taking slab_mutex a non-starter for cpu hotplug as well.  It's
> established early by kernel_init_freeable()..kmem_cache_destroy() as
> well as by slab_mem_going_offline_callback().

I suck at reading the lockdep splats, so I don't yet see how the "existing
reverse order" occurs - I do understand the order in the "lsbug" case.
What I also wonder is why this didn't occur in the older RT trees with this
patch. I did change the order of locks in flush_all() to take flush_lock first
and cpus_read_lock() second, as Cyrill Gorcunov suggested. Would the original
order prevent this? Or would we fail anyway because we already took
cpus_read_lock() in offline_pages() and are now taking it again - do these nest
or not?
Qian Cai Aug. 9, 2021, 10:13 p.m. UTC | #4
On 8/9/2021 4:08 PM, Vlastimil Babka wrote:
> On 8/9/2021 8:44 PM, Mike Galbraith wrote:
>> On Mon, 2021-08-09 at 09:41 -0400, Qian Cai wrote:
>>>
>>>
>>> On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
>>>>
>>>>  
>>>> +static DEFINE_MUTEX(flush_lock);
>>>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
>>>> +
>>>>  static void flush_all(struct kmem_cache *s)
>>>>  {
>>>> -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>>>> +       struct slub_flush_work *sfw;
>>>> +       unsigned int cpu;
>>>> +
>>>> +       mutex_lock(&flush_lock);
>>>
>>> Vlastimil, taking the lock here could trigger a warning during memory
>>> offline/online due to the locking order:
>>>
>>> slab_mutex -> flush_lock
>>
>> Bugger.  That chain ending with cpu_hotplug_lock makes slub_cpu_dead()
>> taking slab_mutex a non-starter for cpu hotplug as well.  It's
>> established early by kernel_init_freeable()..kmem_cache_destroy() as
>> well as by slab_mem_going_offline_callback().
> 
> I suck at reading the lockdep splats, so I don't yet see how the "existing
> reverse order" occurs - I do understand the order in the "lsbug" case.
> What I also wonder is why this didn't occur in the older RT trees with this
> patch. I did change the order of locks in flush_all() to take flush_lock first
> and cpus_read_lock() second, as Cyrill Gorcunov suggested. Would the original
> order prevent this? Or would we fail anyway because we already took
> cpus_read_lock() in offline_pages() and are now taking it again - do these nest
> or not?

"lsbug" is just an user-space tool running workloads like memory offline/online
via sysfs. The splat indicated that the existing locking orders on the running
system saw so far are:

flush_lock -> cpu_hotplug_lock (in #1)
  cpu_hotplug_lock -> pck_batch_high_lock (in #2)
    pcp_batch_high_lock -> (memory_chain).rwsem (in #3)
      (memory_chain).rwsem -> slab_mutex (in #4)

Thus, lockdep inferences that taking flush_lock first could later reaching
slab_mutex. Then, in the commit, memory offline (in #0) started to take the locking
order slab_mutex -> flush_lock. Thus, the potential deadlock warning.
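
For readers less fluent in lockdep output, the chain above reduces to a
classic AB-BA inversion. A minimal self-contained illustration (plain
pthreads; the mutex names merely mirror the kernel locks, and the
intermediate locks #1-#3 are collapsed away):

#include <pthread.h>

static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;

/* Mirrors #0: memory offline takes slab_mutex, then flush_lock. */
static void *offline_path(void *arg)
{
	pthread_mutex_lock(&slab_mutex);	/* slab_memory_callback() */
	pthread_mutex_lock(&flush_lock);	/* flush_all() */
	pthread_mutex_unlock(&flush_lock);
	pthread_mutex_unlock(&slab_mutex);
	return NULL;
}

/* Mirrors #1..#4 collapsed: from flush_lock the chain reaches slab_mutex. */
static void *shrink_path(void *arg)
{
	pthread_mutex_lock(&flush_lock);
	pthread_mutex_lock(&slab_mutex);
	pthread_mutex_unlock(&slab_mutex);
	pthread_mutex_unlock(&flush_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, offline_path, NULL);
	pthread_create(&b, NULL, shrink_path, NULL);
	pthread_join(a, NULL);	/* with unlucky timing never returns: AB-BA deadlock */
	pthread_join(b, NULL);
	return 0;
}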
Mike Galbraith Aug. 10, 2021, 1:07 a.m. UTC | #5
On Mon, 2021-08-09 at 22:08 +0200, Vlastimil Babka wrote:
> On 8/9/2021 8:44 PM, Mike Galbraith wrote:
> > >
> > > slab_mutex -> flush_lock
> >
> > Bugger.  That chain ending with cpu_hotplug_lock makes slub_cpu_dead()
> > taking slab_mutex a non-starter for cpu hotplug as well.  It's
> > established early by kernel_init_freeable()..kmem_cache_destroy() as
> > well as by slab_mem_going_offline_callback().
>
> I suck at reading the lockdep splats, so I don't yet see how the "existing
> reverse order" occurs - I do understand the order in the "lsbug" case.
> What I also wonder is why this didn't occur in the older RT trees with this
> patch.

Apparently (oops) nobody got around to hotplug+lockdep testing, RT or
otherwise.  I know I didn't, my goldfish-like attention span having been
used up by explosion testing ;-)

	-Mike
Vlastimil Babka Aug. 10, 2021, 9:03 a.m. UTC | #6
On 8/9/21 3:41 PM, Qian Cai wrote:
>>  
>> +static DEFINE_MUTEX(flush_lock);
>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
>> +
>>  static void flush_all(struct kmem_cache *s)
>>  {
>> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>> +	struct slub_flush_work *sfw;
>> +	unsigned int cpu;
>> +
>> +	mutex_lock(&flush_lock);
> 
> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> 
> slab_mutex -> flush_lock
> 
> [   91.374541] WARNING: possible circular locking dependency detected
> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> [   91.387149] ------------------------------------------------------
> [   91.394016] lsbug/1523 is trying to acquire lock:
> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> [   91.407425] 
>                but task is already holding lock:
> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> [   91.423603] 
>                which lock already depends on the new lock.
> 

OK, managed to reproduce in qemu and this fixes it for me on top of
next-20210809. Could you test as well, as your testing might be more
comprehensive? I will format it as a fixup for the proper patch in the series then.

----8<----
From 7ce71c7f9455e8b96dc1b728ea566b6ef5e424e4 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 10 Aug 2021 10:58:07 +0200
Subject: [PATCH] mm, slub: fix memory offline lockdep splat

Reverse the order of flush_lock and cpus_read_lock() to prevent the lockdep
splat. In slab_mem_going_offline_callback() we already have cpus_read_lock()
held, so make sure it is not taken again.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 88a6c3ed2751..073cdd4b020f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2640,13 +2640,13 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 static DEFINE_MUTEX(flush_lock);
 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
 
-static void flush_all(struct kmem_cache *s)
+static void flush_all_cpus_locked(struct kmem_cache *s)
 {
 	struct slub_flush_work *sfw;
 	unsigned int cpu;
 
+	lockdep_assert_cpus_held();
 	mutex_lock(&flush_lock);
-	cpus_read_lock();
 
 	for_each_online_cpu(cpu) {
 		sfw = &per_cpu(slub_flush, cpu);
@@ -2667,10 +2667,16 @@ static void flush_all(struct kmem_cache *s)
 		flush_work(&sfw->work);
 	}
 
-	cpus_read_unlock();
 	mutex_unlock(&flush_lock);
 }
 
+static void flush_all(struct kmem_cache *s)
+{
+	cpus_read_lock();
+	flush_all_cpus_locked(s);
+	cpus_read_unlock();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -4516,7 +4522,7 @@ EXPORT_SYMBOL(kfree);
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_do_shrink(struct kmem_cache *s)
 {
 	int node;
 	int i;
@@ -4528,7 +4534,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	unsigned long flags;
 	int ret = 0;
 
-	flush_all(s);
 	for_each_kmem_cache_node(s, node, n) {
 		INIT_LIST_HEAD(&discard);
 		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4578,13 +4583,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	return ret;
 }
 
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+	flush_all(s);
+	return __kmem_cache_do_shrink(s);
+}
+
 static int slab_mem_going_offline_callback(void *arg)
 {
 	struct kmem_cache *s;
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list)
-		__kmem_cache_shrink(s);
+	list_for_each_entry(s, &slab_caches, list) {
+		flush_all_cpus_locked(s);
+		__kmem_cache_do_shrink(s);
+	}
 	mutex_unlock(&slab_mutex);
 
 	return 0;
Mike Galbraith Aug. 10, 2021, 11:47 a.m. UTC | #7
On Tue, 2021-08-10 at 11:03 +0200, Vlastimil Babka wrote:
> On 8/9/21 3:41 PM, Qian Cai wrote:
> > >  
> > > +static DEFINE_MUTEX(flush_lock);
> > > +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> > > +
> > >  static void flush_all(struct kmem_cache *s)
> > >  {
> > > -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> > > +       struct slub_flush_work *sfw;
> > > +       unsigned int cpu;
> > > +
> > > +       mutex_lock(&flush_lock);
> >
> > Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> >
> > slab_mutex -> flush_lock
> >
> > [   91.374541] WARNING: possible circular locking dependency detected
> > [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> > [   91.387149] ------------------------------------------------------
> > [   91.394016] lsbug/1523 is trying to acquire lock:
> > [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> > [   91.407425]
> >                but task is already holding lock:
> > [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> > [   91.423603]
> >                which lock already depends on the new lock.
> >
>
> OK, managed to reproduce in qemu and this fixes it for me on top of
> next-20210809. Could you test as well, as your testing might be more
> comprehensive? I will format it as a fixup for the proper patch in the series then.

As it appeared it should, moving cpu_hotplug_lock outside slab_mutex in
kmem_cache_destroy() on top of that silenced the cpu offline gripe.

---
 mm/slab_common.c |    2 ++
 mm/slub.c        |    2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach
 	if (unlikely(!s))
 		return;

+	cpus_read_lock();
 	mutex_lock(&slab_mutex);

 	s->refcount--;
@@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach
 	}
 out_unlock:
 	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
 }
 EXPORT_SYMBOL(kmem_cache_destroy);

--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4234,7 +4234,7 @@ int __kmem_cache_shutdown(struct kmem_ca
 	int node;
 	struct kmem_cache_node *n;

-	flush_all(s);
+	flush_all_cpus_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		free_partial(s, n);
Vlastimil Babka Aug. 10, 2021, 2:33 p.m. UTC | #8
On 8/9/21 3:41 PM, Qian Cai wrote:

>>  static void flush_all(struct kmem_cache *s)
>>  {
>> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>> +	struct slub_flush_work *sfw;
>> +	unsigned int cpu;
>> +
>> +	mutex_lock(&flush_lock);
> 
> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> 
> slab_mutex -> flush_lock

Here's the full fixup, also incorporating Mike's fix. Thanks.

----8<----
From c2df67d5116d4615c322e262556e34117e268104 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 10 Aug 2021 10:58:07 +0200
Subject: [PATCH] mm, slub: fix memory and cpu hotplug related lock ordering
 issues

Qian Cai reported [1] a lockdep splat on memory offline.

[   91.374541] WARNING: possible circular locking dependency detected
[   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
[   91.387149] ------------------------------------------------------
[   91.394016] lsbug/1523 is trying to acquire lock:
[   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
[   91.407425] but task is already holding lock:
[   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
[   91.423603] which lock already depends on the new lock.

To fix it, we need to change the order in flush_all() so that cpus_read_lock()
is first and mutex_lock(&flush_lock) second.

Also, when called from slab_mem_going_offline_callback() we are already under
cpus_read_lock() and cannot take it again, so create a flush_all_cpus_locked()
variant and decouple flushing from the actual shrinking for this call path.

Additionally, Mike Galbraith reported [2] a wrong order of cpus_read_lock() and
slab_mutex in the kmem_cache_destroy() path and proposed a fix to reverse it.

This patch is a fixup for the mmotm patch
mm-slub-move-flush_cpu_slab-invocations-__free_slab-invocations-out-of-irq-context.patch

[1] https://lore.kernel.org/lkml/0b36128c-3e12-77df-85fe-a153a714569b@quicinc.com/
[2] https://lore.kernel.org/lkml/2eb3cf340716c40f03a0a342ab40219b3d1de195.camel@gmx.de/

Reported-by: Qian Cai <quic_qiancai@quicinc.com>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab_common.c |  2 ++
 mm/slub.c        | 29 +++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1c673c323baf..ec2bb0beed75 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (unlikely(!s))
 		return;
 
+	cpus_read_lock();
 	mutex_lock(&slab_mutex);
 
 	s->refcount--;
@@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	}
 out_unlock:
 	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
diff --git a/mm/slub.c b/mm/slub.c
index da48ada3d17f..152487f84025 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2551,13 +2551,13 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 static DEFINE_MUTEX(flush_lock);
 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
 
-static void flush_all(struct kmem_cache *s)
+static void flush_all_cpus_locked(struct kmem_cache *s)
 {
 	struct slub_flush_work *sfw;
 	unsigned int cpu;
 
+	lockdep_assert_cpus_held();
 	mutex_lock(&flush_lock);
-	cpus_read_lock();
 
 	for_each_online_cpu(cpu) {
 		sfw = &per_cpu(slub_flush, cpu);
@@ -2578,10 +2578,16 @@ static void flush_all(struct kmem_cache *s)
 		flush_work(&sfw->work);
 	}
 
-	cpus_read_unlock();
 	mutex_unlock(&flush_lock);
 }
 
+static void flush_all(struct kmem_cache *s)
+{
+	cpus_read_lock();
+	flush_all_cpus_locked(s);
+	cpus_read_unlock();
+}
+
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
@@ -4111,7 +4117,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	int node;
 	struct kmem_cache_node *n;
 
-	flush_all(s);
+	flush_all_cpus_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		free_partial(s, n);
@@ -4387,7 +4393,7 @@ EXPORT_SYMBOL(kfree);
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_do_shrink(struct kmem_cache *s)
 {
 	int node;
 	int i;
@@ -4399,7 +4405,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	unsigned long flags;
 	int ret = 0;
 
-	flush_all(s);
 	for_each_kmem_cache_node(s, node, n) {
 		INIT_LIST_HEAD(&discard);
 		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4449,13 +4454,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	return ret;
 }
 
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+	flush_all(s);
+	return __kmem_cache_do_shrink(s);
+}
+
 static int slab_mem_going_offline_callback(void *arg)
 {
 	struct kmem_cache *s;
 
 	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list)
-		__kmem_cache_shrink(s);
+	list_for_each_entry(s, &slab_caches, list) {
+		flush_all_cpus_locked(s);
+		__kmem_cache_do_shrink(s);
+	}
 	mutex_unlock(&slab_mutex);
 
 	return 0;
Paul E. McKenney Aug. 10, 2021, 8:25 p.m. UTC | #9
On Tue, Aug 10, 2021 at 11:03:02AM +0200, Vlastimil Babka wrote:
> On 8/9/21 3:41 PM, Qian Cai wrote:
> >>  
> >> +static DEFINE_MUTEX(flush_lock);
> >> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> >> +
> >>  static void flush_all(struct kmem_cache *s)
> >>  {
> >> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> >> +	struct slub_flush_work *sfw;
> >> +	unsigned int cpu;
> >> +
> >> +	mutex_lock(&flush_lock);
> > 
> > Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> > 
> > slab_mutex -> flush_lock
> > 
> > [   91.374541] WARNING: possible circular locking dependency detected
> > [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> > [   91.387149] ------------------------------------------------------
> > [   91.394016] lsbug/1523 is trying to acquire lock:
> > [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> > [   91.407425] 
> >                but task is already holding lock:
> > [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> > [   91.423603] 
> >                which lock already depends on the new lock.

From the series in -next, I got a three-way deadlock similar to what
Qian Cai got.

> OK, managed to reproduce in qemu and this fixes it for me on top of
> next-20210809. Could you test as well, as your testing might be more
> comprehensive? I will format it as a fixup for the proper patch in the series then.
> 
> ----8<----
> From 7ce71c7f9455e8b96dc1b728ea566b6ef5e424e4 Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Tue, 10 Aug 2021 10:58:07 +0200
> Subject: [PATCH] mm, slub: fix memory offline lockdep splat
> 
> Reverse the order of flush_lock and cpus_read_lock() to prevent the lockdep
> splat. In slab_mem_going_offline_callback() we already have cpus_read_lock()
> held, so make sure it is not taken again.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

With this patch, it reduces to a two-way deadlock as shown at the end
of this message.

My reproducer is the following on a two-socket system:

tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 10m --configs RUDE01 --trust-make

This likely needs the RCU commits in -next to reproduce quickly, though
you never know.

							Thanx, Paul

> [ patch body snipped; identical to the fixup posted in #6 above ]

[  602.668050] ========================================================
[  602.668924] WARNING: possible circular locking dependency detected
[  602.669796] 5.14.0-rc5-next-20210809+ #3298 Not tainted
[  602.670537] ------------------------------------------------------
[  602.671408] torture_shutdow/88 is trying to acquire lock:
[  602.672169] ffffffffb00686b0 (cpu_hotplug_lock){++++}-{0:0}, at: __kmem_cache_shutdown+0x26/0x210
[  602.673416]
[  602.673416] but task is already holding lock:
[  602.674240] ffffffffb0178368 (slab_mutex){+.+.}-{3:3}, at: kmem_cache_destroy+0x1c/0x110
[  602.675379]
[  602.675379] which lock already depends on the new lock.
[  602.675379]
[  602.676525]
[  602.676525] the existing dependency chain (in reverse order) is:
[  602.677576]
[  602.677576] -> #1 (slab_mutex){+.+.}-{3:3}:
[  602.678377]        __mutex_lock+0x81/0x9a0
[  602.678964]        slub_cpu_dead+0x17/0xb0
[  602.679547]        cpuhp_invoke_callback+0x180/0x890
[  602.680255]        cpuhp_invoke_callback_range+0x3b/0x80
[  602.681009]        _cpu_down+0xe4/0x2b0
[  602.681556]        cpu_down+0x29/0x50
[  602.682082]        device_offline+0x7e/0xb0
[  602.682677]        remove_cpu+0x17/0x30
[  602.683225]        torture_offline+0x7d/0x140
[  602.683844]        torture_onoff+0x14f/0x260
[  602.684455]        kthread+0x132/0x160
[  602.684994]        ret_from_fork+0x22/0x30
[  602.685574]
[  602.685574] -> #0 (cpu_hotplug_lock){++++}-{0:0}:
[  602.686460]        __lock_acquire+0x13d2/0x2470
[  602.687107]        lock_acquire+0xc9/0x2e0
[  602.687686]        cpus_read_lock+0x26/0xb0
[  602.688284]        __kmem_cache_shutdown+0x26/0x210
[  602.688973]        kmem_cache_destroy+0x38/0x110
[  602.689625]        rcu_torture_cleanup.cold.36+0x192/0x421
[  602.690399]        torture_shutdown+0xdd/0x1c0
[  602.691032]        kthread+0x132/0x160
[  602.691563]        ret_from_fork+0x22/0x30
[  602.692147]
[  602.692147] other info that might help us debug this:
[  602.692147]
[  602.693268]  Possible unsafe locking scenario:
[  602.693268]
[  602.694128]        CPU0                    CPU1
[  602.694766]        ----                    ----
[  602.695409]   lock(slab_mutex);
[  602.695858]                                lock(cpu_hotplug_lock);
[  602.696731]                                lock(slab_mutex);
[  602.697531]   lock(cpu_hotplug_lock);
[  602.698057]
[  602.698057]  *** DEADLOCK ***
[  602.698057]
[  602.698884] 1 lock held by torture_shutdow/88:
[  602.699517]  #0: ffffffffb0178368 (slab_mutex){+.+.}-{3:3}, at: kmem_cache_destroy+0x1c/0x110
[  602.700716]
[  602.700716] stack backtrace:
[  602.701334] CPU: 3 PID: 88 Comm: torture_shutdow Not tainted 5.14.0-rc5-next-20210809+ #3298
[  602.702518] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
[  602.703799] Call Trace:
[  602.704160]  dump_stack_lvl+0x44/0x57
[  602.704686]  check_noncircular+0xfe/0x110
[  602.705264]  __lock_acquire+0x13d2/0x2470
[  602.705836]  lock_acquire+0xc9/0x2e0
[  602.706389]  ? __kmem_cache_shutdown+0x26/0x210
[  602.707059]  cpus_read_lock+0x26/0xb0
[  602.707582]  ? __kmem_cache_shutdown+0x26/0x210
[  602.708226]  __kmem_cache_shutdown+0x26/0x210
[  602.708843]  ? lock_is_held_type+0xd6/0x130
[  602.709442]  ? torture_onoff+0x260/0x260
[  602.710007]  kmem_cache_destroy+0x38/0x110
[  602.710590]  rcu_torture_cleanup.cold.36+0x192/0x421
[  602.711298]  ? wait_woken+0x60/0x60
[  602.711796]  ? torture_onoff+0x260/0x260
[  602.712359]  torture_shutdown+0xdd/0x1c0
[  602.712918]  kthread+0x132/0x160
[  602.713386]  ? set_kthread_struct+0x40/0x40
[  602.713985]  ret_from_fork+0x22/0x30
Paul E. McKenney Aug. 10, 2021, 8:31 p.m. UTC | #10
On Tue, Aug 10, 2021 at 01:47:42PM +0200, Mike Galbraith wrote:
> On Tue, 2021-08-10 at 11:03 +0200, Vlastimil Babka wrote:
> > On 8/9/21 3:41 PM, Qian Cai wrote:
> > > >  
> > > > +static DEFINE_MUTEX(flush_lock);
> > > > +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> > > > +
> > > >  static void flush_all(struct kmem_cache *s)
> > > >  {
> > > > -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> > > > +       struct slub_flush_work *sfw;
> > > > +       unsigned int cpu;
> > > > +
> > > > +       mutex_lock(&flush_lock);
> > >
> > > Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> > >
> > > slab_mutex -> flush_lock
> > >
> > > [   91.374541] WARNING: possible circular locking dependency detected
> > > [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> > > [   91.387149] ------------------------------------------------------
> > > [   91.394016] lsbug/1523 is trying to acquire lock:
> > > [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> > > [   91.407425]
> > >                but task is already holding lock:
> > > [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> > > [   91.423603]
> > >                which lock already depends on the new lock.
> > >
> >
> > OK, managed to reproduce in qemu and this fixes it for me on top of
> > next-20210809. Could you test as well, as your testing might be more
> > comprehensive? I will format it as a fixup for the proper patch in the series then.
> 
> As it appeared it should, moving cpu_hotplug_lock outside slab_mutex in
> kmem_cache_destroy() on top of that silenced the cpu offline gripe.

And this one got rid of the remainder of the deadlock, but gets me the
splat shown at the end of this message.  So some sort of middle ground
may be needed.

(Same reproducer as in my previous reply to Vlastimil.)

							Thanx, Paul

> ---
>  mm/slab_common.c |    2 ++
>  mm/slub.c        |    2 +-
>  2 files changed, 3 insertions(+), 1 deletion(-)
> 
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach
>  	if (unlikely(!s))
>  		return;
> 
> +	cpus_read_lock();
>  	mutex_lock(&slab_mutex);
> 
>  	s->refcount--;
> @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach
>  	}
>  out_unlock:
>  	mutex_unlock(&slab_mutex);
> +	cpus_read_unlock();
>  }
>  EXPORT_SYMBOL(kmem_cache_destroy);
> 
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4234,7 +4234,7 @@ int __kmem_cache_shutdown(struct kmem_ca
>  	int node;
>  	struct kmem_cache_node *n;
> 
> -	flush_all(s);
> +	flush_all_cpus_locked(s);
>  	/* Attempt to free all objects */
>  	for_each_kmem_cache_node(s, node, n) {
>  		free_partial(s, n);

[  602.539109] ------------[ cut here ]------------
[  602.539804] WARNING: CPU: 3 PID: 88 at kernel/cpu.c:335 lockdep_assert_cpus_held+0x29/0x30
[  602.540940] Modules linked in:
[  602.541377] CPU: 3 PID: 88 Comm: torture_shutdow Not tainted 5.14.0-rc5-next-20210809+ #3299
[  602.542536] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
[  602.543786] RIP: 0010:lockdep_assert_cpus_held+0x29/0x30
[  602.544524] Code: 00 83 3d 4d f1 a4 01 01 76 0a 8b 05 4d 23 a5 01 85 c0 75 01 c3 be ff ff ff ff 48 c7 c7 b0 86 66 a3 e8 9b 05 c9 00 85 c0 75 ea <0f> 0b c3 0f 1f 40 00 41 57 41 89 ff 41 56 4d 89 c6 41 55 49 89 cd
[  602.547051] RSP: 0000:ffffb382802efdb8 EFLAGS: 00010246
[  602.547783] RAX: 0000000000000000 RBX: ffffa23301a44000 RCX: 0000000000000001
[  602.548764] RDX: 0000000000000001 RSI: ffffffffa335f5c0 RDI: ffffffffa33adbbf
[  602.549747] RBP: ffffa23301a44000 R08: ffffa23302810000 R09: 974cf0ba5c48ad3c
[  602.550727] R10: ffffb382802efe78 R11: 0000000000000001 R12: ffffa23301a44000
[  602.551709] R13: 00000000000249c0 R14: 00000000ffffffff R15: 0000000fffffffe0
[  602.552694] FS:  0000000000000000(0000) GS:ffffa2331f580000(0000) knlGS:0000000000000000
[  602.553805] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  602.554606] CR2: 0000000000000000 CR3: 0000000017222000 CR4: 00000000000006e0
[  602.555601] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  602.556590] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  602.557585] Call Trace:
[  602.557927]  flush_all_cpus_locked+0x29/0x140
[  602.558535]  __kmem_cache_shutdown+0x26/0x200
[  602.559145]  ? lock_is_held_type+0xd6/0x130
[  602.559739]  ? torture_onoff+0x260/0x260
[  602.560284]  kmem_cache_destroy+0x38/0x110
[  602.560859]  rcu_torture_cleanup.cold.36+0x192/0x421
[  602.561539]  ? wait_woken+0x60/0x60
[  602.562035]  ? torture_onoff+0x260/0x260
[  602.562591]  torture_shutdown+0xdd/0x1c0
[  602.563131]  kthread+0x132/0x160
[  602.563592]  ? set_kthread_struct+0x40/0x40
[  602.564172]  ret_from_fork+0x22/0x30
[  602.564696] irq event stamp: 1307
[  602.565161] hardirqs last  enabled at (1315): [<ffffffffa1eddced>] __up_console_sem+0x4d/0x50
[  602.566321] hardirqs last disabled at (1324): [<ffffffffa1eddcd2>] __up_console_sem+0x32/0x50
[  602.567479] softirqs last  enabled at (1304): [<ffffffffa2e00311>] __do_softirq+0x311/0x473
[  602.568616] softirqs last disabled at (1299): [<ffffffffa1e72eb8>] irq_exit_rcu+0xe8/0xf0
[  602.569735] ---[ end trace 26fd643e1df331c9 ]---
Vlastimil Babka Aug. 10, 2021, 10:36 p.m. UTC | #11
On 8/10/2021 10:31 PM, Paul E. McKenney wrote:
> On Tue, Aug 10, 2021 at 01:47:42PM +0200, Mike Galbraith wrote:
>> On Tue, 2021-08-10 at 11:03 +0200, Vlastimil Babka wrote:
>>> On 8/9/21 3:41 PM, Qian Cai wrote:
>>>>>  
>>>>> +static DEFINE_MUTEX(flush_lock);
>>>>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
>>>>> +
>>>>>  static void flush_all(struct kmem_cache *s)
>>>>>  {
>>>>> -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>>>>> +       struct slub_flush_work *sfw;
>>>>> +       unsigned int cpu;
>>>>> +
>>>>> +       mutex_lock(&flush_lock);
>>>>
>>>> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
>>>>
>>>> slab_mutex -> flush_lock
>>>>
>>>> [   91.374541] WARNING: possible circular locking dependency detected
>>>> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
>>>> [   91.387149] ------------------------------------------------------
>>>> [   91.394016] lsbug/1523 is trying to acquire lock:
>>>> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
>>>> [   91.407425]
>>>>                but task is already holding lock:
>>>> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
>>>> [   91.423603]
>>>>                which lock already depends on the new lock.
>>>>
>>>
>>> OK, managed to reproduce in qemu and this fixes it for me on top of
>>> next-20210809. Could you test as well, as your testing might be more
>>> comprehensive? I will format it as a fixup for the proper patch in the series then.
>>
>> As it appeared it should, moving cpu_hotplug_lock outside slab_mutex in
>> kmem_cache_destroy() on top of that silenced the cpu offline gripe.
> 
> And this one got rid of the remainder of the deadlock, but gets me the
> splat shown at the end of this message.  So some sort of middle ground
> may be needed.
> 
> (Same reproducer as in my previous reply to Vlastimil.)
> 
> 							Thanx, Paul
> 
>> ---
>>  mm/slab_common.c |    2 ++
>>  mm/slub.c        |    2 +-
>>  2 files changed, 3 insertions(+), 1 deletion(-)
>>
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach
>>  	if (unlikely(!s))
>>  		return;
>>
>> +	cpus_read_lock();
>>  	mutex_lock(&slab_mutex);
>>
>>  	s->refcount--;
>> @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach
>>  	}
>>  out_unlock:
>>  	mutex_unlock(&slab_mutex);
>> +	cpus_read_unlock();
>>  }
>>  EXPORT_SYMBOL(kmem_cache_destroy);
>>
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -4234,7 +4234,7 @@ int __kmem_cache_shutdown(struct kmem_ca
>>  	int node;
>>  	struct kmem_cache_node *n;
>>
>> -	flush_all(s);
>> +	flush_all_cpus_locked(s);
>>  	/* Attempt to free all objects */
>>  	for_each_kmem_cache_node(s, node, n) {
>>  		free_partial(s, n);
> 
> [  602.539109] ------------[ cut here ]------------
> [  602.539804] WARNING: CPU: 3 PID: 88 at kernel/cpu.c:335 lockdep_assert_cpus_held+0x29/0x30

So this says the assert failed and we don't have the cpus_read_lock(), right, but...

> [  602.540940] Modules linked in:
> [  602.541377] CPU: 3 PID: 88 Comm: torture_shutdow Not tainted 5.14.0-rc5-next-20210809+ #3299
> [  602.542536] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
> [  602.543786] RIP: 0010:lockdep_assert_cpus_held+0x29/0x30
> [  602.544524] Code: 00 83 3d 4d f1 a4 01 01 76 0a 8b 05 4d 23 a5 01 85 c0 75 01 c3 be ff ff ff ff 48 c7 c7 b0 86 66 a3 e8 9b 05 c9 00 85 c0 75 ea <0f> 0b c3 0f 1f 40 00 41 57 41 89 ff 41 56 4d 89 c6 41 55 49 89 cd
> [  602.547051] RSP: 0000:ffffb382802efdb8 EFLAGS: 00010246
> [  602.547783] RAX: 0000000000000000 RBX: ffffa23301a44000 RCX: 0000000000000001
> [  602.548764] RDX: 0000000000000001 RSI: ffffffffa335f5c0 RDI: ffffffffa33adbbf
> [  602.549747] RBP: ffffa23301a44000 R08: ffffa23302810000 R09: 974cf0ba5c48ad3c
> [  602.550727] R10: ffffb382802efe78 R11: 0000000000000001 R12: ffffa23301a44000
> [  602.551709] R13: 00000000000249c0 R14: 00000000ffffffff R15: 0000000fffffffe0
> [  602.552694] FS:  0000000000000000(0000) GS:ffffa2331f580000(0000) knlGS:0000000000000000
> [  602.553805] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  602.554606] CR2: 0000000000000000 CR3: 0000000017222000 CR4: 00000000000006e0
> [  602.555601] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  602.556590] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  602.557585] Call Trace:
> [  602.557927]  flush_all_cpus_locked+0x29/0x140
> [  602.558535]  __kmem_cache_shutdown+0x26/0x200
> [  602.559145]  ? lock_is_held_type+0xd6/0x130
> [  602.559739]  ? torture_onoff+0x260/0x260
> [  602.560284]  kmem_cache_destroy+0x38/0x110

It should have been taken here. I don't understand. It's as if only
mm/slub.c was patched by Mike's patch, but not mm/slab_common.c?

> [  602.560859]  rcu_torture_cleanup.cold.36+0x192/0x421
> [  602.561539]  ? wait_woken+0x60/0x60
> [  602.562035]  ? torture_onoff+0x260/0x260
> [  602.562591]  torture_shutdown+0xdd/0x1c0
> [  602.563131]  kthread+0x132/0x160
> [  602.563592]  ? set_kthread_struct+0x40/0x40
> [  602.564172]  ret_from_fork+0x22/0x30
> [  602.564696] irq event stamp: 1307
> [  602.565161] hardirqs last  enabled at (1315): [<ffffffffa1eddced>] __up_console_sem+0x4d/0x50
> [  602.566321] hardirqs last disabled at (1324): [<ffffffffa1eddcd2>] __up_console_sem+0x32/0x50
> [  602.567479] softirqs last  enabled at (1304): [<ffffffffa2e00311>] __do_softirq+0x311/0x473
> [  602.568616] softirqs last disabled at (1299): [<ffffffffa1e72eb8>] irq_exit_rcu+0xe8/0xf0
> [  602.569735] ---[ end trace 26fd643e1df331c9 ]---
>
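
Restated outside the diff context, the ordering Mike's mm/slab_common.c hunk
quoted above is meant to establish — a sketch of the intent with the body
elided, not the final committed code:

	void kmem_cache_destroy(struct kmem_cache *s)
	{
		if (unlikely(!s))
			return;

		cpus_read_lock();		/* cpu_hotplug_lock first ... */
		mutex_lock(&slab_mutex);	/* ... then slab_mutex, matching
						 * the hotplug callback order */
		/* ... refcount drop and __kmem_cache_shutdown() elided ... */
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
	}

With this ordering in place, __kmem_cache_shutdown() can call
flush_all_cpus_locked() directly, because the hotplug lock is already held on
entry.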
Paul E. McKenney Aug. 10, 2021, 11:53 p.m. UTC | #12
On Wed, Aug 11, 2021 at 12:36:00AM +0200, Vlastimil Babka wrote:
> On 8/10/2021 10:31 PM, Paul E. McKenney wrote:
> > On Tue, Aug 10, 2021 at 01:47:42PM +0200, Mike Galbraith wrote:
> >> On Tue, 2021-08-10 at 11:03 +0200, Vlastimil Babka wrote:
> >>> On 8/9/21 3:41 PM, Qian Cai wrote:
> >>>>>  
> >>>>> +static DEFINE_MUTEX(flush_lock);
> >>>>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> >>>>> +
> >>>>>  static void flush_all(struct kmem_cache *s)
> >>>>>  {
> >>>>> -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> >>>>> +       struct slub_flush_work *sfw;
> >>>>> +       unsigned int cpu;
> >>>>> +
> >>>>> +       mutex_lock(&flush_lock);
> >>>>
> >>>> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> >>>>
> >>>> slab_mutex -> flush_lock
> >>>>
> >>>> [   91.374541] WARNING: possible circular locking dependency detected
> >>>> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> >>>> [   91.387149] ------------------------------------------------------
> >>>> [   91.394016] lsbug/1523 is trying to acquire lock:
> >>>> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> >>>> [   91.407425]
> >>>>                but task is already holding lock:
> >>>> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> >>>> [   91.423603]
> >>>>                which lock already depends on the new lock.
> >>>>
> >>>
> >>> OK, managed to reproduce in qemu and this fixes it for me on top of
> >>> next-20210809. Could you test as well, as your testing might be more
> >>> comprehensive? I will format it as a fixup for the proper patch in the series then.
> >>
> >> As it appeared it should, moving cpu_hotplug_lock outside slab_mutex in
> >> kmem_cache_destroy() on top of that silenced the cpu offline gripe.
> > 
> > And this one got rid of the remainder of the deadlock, but gets me the
> > splat shown at the end of this message.  So some sort of middle ground
> > may be needed.
> > 
> > (Same reproducer as in my previous reply to Vlastimil.)
> > 
> > 							Thanx, Paul
> > 
> >> ---
> >>  mm/slab_common.c |    2 ++
> >>  mm/slub.c        |    2 +-
> >>  2 files changed, 3 insertions(+), 1 deletion(-)
> >>
> >> --- a/mm/slab_common.c
> >> +++ b/mm/slab_common.c
> >> @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach
> >>  	if (unlikely(!s))
> >>  		return;
> >>
> >> +	cpus_read_lock();
> >>  	mutex_lock(&slab_mutex);
> >>
> >>  	s->refcount--;
> >> @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach
> >>  	}
> >>  out_unlock:
> >>  	mutex_unlock(&slab_mutex);
> >> +	cpus_read_unlock();
> >>  }
> >>  EXPORT_SYMBOL(kmem_cache_destroy);
> >>
> >> --- a/mm/slub.c
> >> +++ b/mm/slub.c
> >> @@ -4234,7 +4234,7 @@ int __kmem_cache_shutdown(struct kmem_ca
> >>  	int node;
> >>  	struct kmem_cache_node *n;
> >>
> >> -	flush_all(s);
> >> +	flush_all_cpus_locked(s);
> >>  	/* Attempt to free all objects */
> >>  	for_each_kmem_cache_node(s, node, n) {
> >>  		free_partial(s, n);
> > 
> > [  602.539109] ------------[ cut here ]------------
> > [  602.539804] WARNING: CPU: 3 PID: 88 at kernel/cpu.c:335 lockdep_assert_cpus_held+0x29/0x30
> 
> So this says the assert failed and we don't have the cpus_read_lock(), right, but...
> 
> > [  602.540940] Modules linked in:
> > [  602.541377] CPU: 3 PID: 88 Comm: torture_shutdow Not tainted 5.14.0-rc5-next-20210809+ #3299
> > [  602.542536] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
> > [  602.543786] RIP: 0010:lockdep_assert_cpus_held+0x29/0x30
> > [  602.544524] Code: 00 83 3d 4d f1 a4 01 01 76 0a 8b 05 4d 23 a5 01 85 c0 75 01 c3 be ff ff ff ff 48 c7 c7 b0 86 66 a3 e8 9b 05 c9 00 85 c0 75 ea <0f> 0b c3 0f 1f 40 00 41 57 41 89 ff 41 56 4d 89 c6 41 55 49 89 cd
> > [  602.547051] RSP: 0000:ffffb382802efdb8 EFLAGS: 00010246
> > [  602.547783] RAX: 0000000000000000 RBX: ffffa23301a44000 RCX: 0000000000000001
> > [  602.548764] RDX: 0000000000000001 RSI: ffffffffa335f5c0 RDI: ffffffffa33adbbf[  602.549747] RBP: ffffa23301a44000 R08: ffffa23302810000 R09: 974cf0ba5c48ad3c
> > [  602.550727] R10: ffffb382802efe78 R11: 0000000000000001 R12: ffffa23301a44000[  602.551709] R13: 00000000000249c0 R14: 00000000ffffffff R15: 0000000fffffffe0
> > [  602.552694] FS:  0000000000000000(0000) GS:ffffa2331f580000(0000) knlGS:0000000000000000
> > [  602.553805] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [  602.554606] CR2: 0000000000000000 CR3: 0000000017222000 CR4: 00000000000006e0
> > [  602.555601] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [  602.556590] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [  602.557585] Call Trace:
> > [  602.557927]  flush_all_cpus_locked+0x29/0x140
> > [  602.558535]  __kmem_cache_shutdown+0x26/0x200
> > [  602.559145]  ? lock_is_held_type+0xd6/0x130
> > [  602.559739]  ? torture_onoff+0x260/0x260
> > [  602.560284]  kmem_cache_destroy+0x38/0x110
> 
> It should have been taken here. I don't understand. It's as if only
> mm/slub.c was patched by Mike's patch, but not mm/slab_common.c?

You know, you would think that I would have learned how to reliably
apply a patch by now.  Apparently not this morning.

Anyway, right in one!  I will try again with the full patch later.

							Thanx, Paul

> > [  602.560859]  rcu_torture_cleanup.cold.36+0x192/0x421
> > [  602.561539]  ? wait_woken+0x60/0x60
> > [  602.562035]  ? torture_onoff+0x260/0x260
> > [  602.562591]  torture_shutdown+0xdd/0x1c0
> > [  602.563131]  kthread+0x132/0x160
> > [  602.563592]  ? set_kthread_struct+0x40/0x40
> > [  602.564172]  ret_from_fork+0x22/0x30
> > [  602.564696] irq event stamp: 1307
> > [  602.565161] hardirqs last  enabled at (1315): [<ffffffffa1eddced>] __up_console_sem+0x4d/0x50
> > [  602.566321] hardirqs last disabled at (1324): [<ffffffffa1eddcd2>] __up_console_sem+0x32/0x50
> > [  602.567479] softirqs last  enabled at (1304): [<ffffffffa2e00311>] __do_softirq+0x311/0x473
> > [  602.568616] softirqs last disabled at (1299): [<ffffffffa1e72eb8>] irq_exit_rcu+0xe8/0xf0
> > [  602.569735] ---[ end trace 26fd643e1df331c9 ]---
> > 
>
Qian Cai Aug. 11, 2021, 1:42 a.m. UTC | #13
On 8/10/2021 10:33 AM, Vlastimil Babka wrote:
> On 8/9/21 3:41 PM, Qian Cai wrote:
> 
>>>  static void flush_all(struct kmem_cache *s)
>>>  {
>>> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>>> +	struct slub_flush_work *sfw;
>>> +	unsigned int cpu;
>>> +
>>> +	mutex_lock(&flush_lock);
>>
>> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
>>
>> slab_mutex -> flush_lock
> 
> Here's the full fixup, also incorporating Mike's fix. Thanks.
> 
> ----8<----
> From c2df67d5116d4615c322e262556e34117e268104 Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Tue, 10 Aug 2021 10:58:07 +0200
> Subject: [PATCH] mm, slub: fix memory and cpu hotplug related lock ordering
>  issues
> 
> Qian Cai reported [1] a lockdep splat on memory offline.
> 
> [   91.374541] WARNING: possible circular locking dependency detected
> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> [   91.387149] ------------------------------------------------------
> [   91.394016] lsbug/1523 is trying to acquire lock:
> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> [   91.407425] but task is already holding lock:
> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> [   91.423603] which lock already depends on the new lock.
> 
> To fix it, we need to change the order in flush_all() so that cpus_read_lock()
> is first and mutex_lock(&flush_lock) second.
> 
> Also when called from slab_mem_going_offline_callback() we are already under
> cpus_read_lock() and cannot take it again, so create a flush_all_cpus_locked()
> variant and decouple flushing from actual shrinking for this call path.
> 
> Additionally, Mike Galbraith reported [2] wrong order of cpus_read_lock() and
> slab_mutex in kmem_cache_destroy() path and proposed a fix to reverse it.
> 
> This patch is a fixup for the mmotm patch
> mm-slub-move-flush_cpu_slab-invocations-__free_slab-invocations-out-of-irq-context.patch
> 
> [1] https://lore.kernel.org/lkml/0b36128c-3e12-77df-85fe-a153a714569b@quicinc.com/
> [2] https://lore.kernel.org/lkml/2eb3cf340716c40f03a0a342ab40219b3d1de195.camel@gmx.de/
> 
> Reported-by: Qian Cai <quic_qiancai@quicinc.com>
> Reported-by: Mike Galbraith <efault@gmx.de>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

This is running fine for me. There is a separate hugetlb crash while fuzzing; I will
report it where it belongs.
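
Putting the pieces of the fixup together, the intended lock ordering and the
split between flush_all() and flush_all_cpus_locked() come out roughly as
below — a sketch assembled from the commit message above, assuming the final
code keeps these names:

	/*
	 * Ordering: cpu_hotplug_lock (read) -> slab_mutex -> flush_lock.
	 *
	 * slab_mem_going_offline_callback() already runs under
	 * cpus_read_lock(), so it must use the _cpus_locked variant.
	 */
	static void flush_all_cpus_locked(struct kmem_cache *s)
	{
		lockdep_assert_cpus_held();

		mutex_lock(&flush_lock);
		/* ... schedule and wait for the per-CPU flush work ... */
		mutex_unlock(&flush_lock);
	}

	static void flush_all(struct kmem_cache *s)
	{
		cpus_read_lock();	/* now taken before flush_lock */
		flush_all_cpus_locked(s);
		cpus_read_unlock();
	}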
Vlastimil Babka Aug. 11, 2021, 8:55 a.m. UTC | #14
On 8/10/21 4:33 PM, Vlastimil Babka wrote:
> On 8/9/21 3:41 PM, Qian Cai wrote:
> 
>>>  static void flush_all(struct kmem_cache *s)
>>>  {
>>> -	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
>>> +	struct slub_flush_work *sfw;
>>> +	unsigned int cpu;
>>> +
>>> +	mutex_lock(&flush_lock);
>> 
>> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
>> 
>> slab_mutex -> flush_lock
> 
> Here's the full fixup, also incorporating Mike's fix. Thanks.
> 

One more fixup, sorry for the churn.
----8<----
From 7cfe3fb1bcd6e589199b10bef480ed097ba9de14 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 11 Aug 2021 10:51:14 +0200
Subject: [PATCH] mm, slub: fix memory and cpu hotplug related lock ordering
 issues - fix

Make __kmem_cache_do_shrink static to silence "no previous prototype" warning.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 152487f84025..c9531e03addd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4393,7 +4393,7 @@ EXPORT_SYMBOL(kfree);
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_do_shrink(struct kmem_cache *s)
+static int __kmem_cache_do_shrink(struct kmem_cache *s)
 {
 	int node;
 	int i;
Paul E. McKenney Aug. 11, 2021, 2:17 p.m. UTC | #15
On Tue, Aug 10, 2021 at 04:53:36PM -0700, Paul E. McKenney wrote:
> On Wed, Aug 11, 2021 at 12:36:00AM +0200, Vlastimil Babka wrote:
> > On 8/10/2021 10:31 PM, Paul E. McKenney wrote:
> > > On Tue, Aug 10, 2021 at 01:47:42PM +0200, Mike Galbraith wrote:
> > >> On Tue, 2021-08-10 at 11:03 +0200, Vlastimil Babka wrote:
> > >>> On 8/9/21 3:41 PM, Qian Cai wrote:
> > >>>>>  
> > >>>>> +static DEFINE_MUTEX(flush_lock);
> > >>>>> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> > >>>>> +
> > >>>>>  static void flush_all(struct kmem_cache *s)
> > >>>>>  {
> > >>>>> -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> > >>>>> +       struct slub_flush_work *sfw;
> > >>>>> +       unsigned int cpu;
> > >>>>> +
> > >>>>> +       mutex_lock(&flush_lock);
> > >>>>
> > >>>> Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:
> > >>>>
> > >>>> slab_mutex -> flush_lock
> > >>>>
> > >>>> [   91.374541] WARNING: possible circular locking dependency detected
> > >>>> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> > >>>> [   91.387149] ------------------------------------------------------
> > >>>> [   91.394016] lsbug/1523 is trying to acquire lock:
> > >>>> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> > >>>> [   91.407425]
> > >>>>                but task is already holding lock:
> > >>>> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> > >>>> [   91.423603]
> > >>>>                which lock already depends on the new lock.
> > >>>>
> > >>>
> > >>> OK, managed to reproduce in qemu and this fixes it for me on top of
> > >>> next-20210809. Could you test as well, as your testing might be more
> > >>> comprehensive? I will format it as a fixup for the proper patch in the series then.
> > >>
> > >> As it appeared it should, moving cpu_hotplug_lock outside slab_mutex in
> > >> kmem_cache_destroy() on top of that silenced the cpu offline gripe.
> > > 
> > > And this one got rid of the remainder of the deadlock, but gets me the
> > > splat shown at the end of this message.  So some sort of middle ground
> > > may be needed.
> > > 
> > > (Same reproducer as in my previous reply to Vlastimil.)
> > > 
> > > 							Thanx, Paul
> > > 
> > >> ---
> > >>  mm/slab_common.c |    2 ++
> > >>  mm/slub.c        |    2 +-
> > >>  2 files changed, 3 insertions(+), 1 deletion(-)
> > >>
> > >> --- a/mm/slab_common.c
> > >> +++ b/mm/slab_common.c
> > >> @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cach
> > >>  	if (unlikely(!s))
> > >>  		return;
> > >>
> > >> +	cpus_read_lock();
> > >>  	mutex_lock(&slab_mutex);
> > >>
> > >>  	s->refcount--;
> > >> @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cach
> > >>  	}
> > >>  out_unlock:
> > >>  	mutex_unlock(&slab_mutex);
> > >> +	cpus_read_unlock();
> > >>  }
> > >>  EXPORT_SYMBOL(kmem_cache_destroy);
> > >>
> > >> --- a/mm/slub.c
> > >> +++ b/mm/slub.c
> > >> @@ -4234,7 +4234,7 @@ int __kmem_cache_shutdown(struct kmem_ca
> > >>  	int node;
> > >>  	struct kmem_cache_node *n;
> > >>
> > >> -	flush_all(s);
> > >> +	flush_all_cpus_locked(s);
> > >>  	/* Attempt to free all objects */
> > >>  	for_each_kmem_cache_node(s, node, n) {
> > >>  		free_partial(s, n);
> > > 
> > > [  602.539109] ------------[ cut here ]------------
> > > [  602.539804] WARNING: CPU: 3 PID: 88 at kernel/cpu.c:335 lockdep_assert_cpus_held+0x29/0x30
> > 
> > So this says the assert failed and we don't have the cpus_read_lock(), right, but...
> > 
> > > [  602.540940] Modules linked in:
> > > [  602.541377] CPU: 3 PID: 88 Comm: torture_shutdow Not tainted 5.14.0-rc5-next-20210809+ #3299
> > > [  602.542536] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
> > > [  602.543786] RIP: 0010:lockdep_assert_cpus_held+0x29/0x30
> > > [  602.544524] Code: 00 83 3d 4d f1 a4 01 01 76 0a 8b 05 4d 23 a5 01 85 c0 75 01 c3 be ff ff ff ff 48 c7 c7 b0 86 66 a3 e8 9b 05 c9 00 85 c0 75 ea <0f> 0b c3 0f 1f 40 00 41 57 41 89 ff 41 56 4d 89 c6 41 55 49 89 cd
> > > [  602.547051] RSP: 0000:ffffb382802efdb8 EFLAGS: 00010246
> > > [  602.547783] RAX: 0000000000000000 RBX: ffffa23301a44000 RCX: 0000000000000001
> > > [  602.548764] RDX: 0000000000000001 RSI: ffffffffa335f5c0 RDI: ffffffffa33adbbf
> > > [  602.549747] RBP: ffffa23301a44000 R08: ffffa23302810000 R09: 974cf0ba5c48ad3c
> > > [  602.550727] R10: ffffb382802efe78 R11: 0000000000000001 R12: ffffa23301a44000
> > > [  602.551709] R13: 00000000000249c0 R14: 00000000ffffffff R15: 0000000fffffffe0
> > > [  602.552694] FS:  0000000000000000(0000) GS:ffffa2331f580000(0000) knlGS:0000000000000000
> > > [  602.553805] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [  602.554606] CR2: 0000000000000000 CR3: 0000000017222000 CR4: 00000000000006e0
> > > [  602.555601] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > [  602.556590] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > [  602.557585] Call Trace:
> > > [  602.557927]  flush_all_cpus_locked+0x29/0x140
> > > [  602.558535]  __kmem_cache_shutdown+0x26/0x200
> > > [  602.559145]  ? lock_is_held_type+0xd6/0x130
> > > [  602.559739]  ? torture_onoff+0x260/0x260
> > > [  602.560284]  kmem_cache_destroy+0x38/0x110
> > 
> > It should have been taken here. I don't understand. It's as if only
> > mm/slub.c was patched by Mike's patch, but not mm/slab_common.c?
> 
> You know, you would think that I would have learned how to reliably
> apply a patch by now.  Apparently not this morning.
> 
> Anyway, right in one!  I will try again with the full patch later.

And with both patches:

Tested-by: Paul E. McKenney <paulmck@kernel.org>

							Thanx, Paul

> > > [  602.560859]  rcu_torture_cleanup.cold.36+0x192/0x421
> > > [  602.561539]  ? wait_woken+0x60/0x60
> > > [  602.562035]  ? torture_onoff+0x260/0x260
> > > [  602.562591]  torture_shutdown+0xdd/0x1c0
> > > [  602.563131]  kthread+0x132/0x160
> > > [  602.563592]  ? set_kthread_struct+0x40/0x40
> > > [  602.564172]  ret_from_fork+0x22/0x30
> > > [  602.564696] irq event stamp: 1307
> > > [  602.565161] hardirqs last  enabled at (1315): [<ffffffffa1eddced>] __up_console_sem+0x4d/0x50
> > > [  602.566321] hardirqs last disabled at (1324): [<ffffffffa1eddcd2>] __up_console_sem+0x32/0x50
> > > [  602.567479] softirqs last  enabled at (1304): [<ffffffffa2e00311>] __do_softirq+0x311/0x473
> > > [  602.568616] softirqs last disabled at (1299): [<ffffffffa1e72eb8>] irq_exit_rcu+0xe8/0xf0
> > > [  602.569735] ---[ end trace 26fd643e1df331c9 ]---
> > > 
> >
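
The full patch follows. Its flush_all() replaces the on_each_cpu_cond() IPI
with a two-pass workqueue scheme: pass one queues flush work on every CPU that
has a cpu slab or percpu partials, pass two waits for each queued item. The
same pattern in isolation — a generic sketch, not the SLUB code; the skip
bookkeeping and the kmem_cache pointer are dropped:

	#include <linux/cpu.h>
	#include <linux/percpu.h>
	#include <linux/workqueue.h>

	static DEFINE_PER_CPU(struct work_struct, percpu_work);

	/* Caller must hold cpus_read_lock() so the online mask is stable. */
	static void run_on_each_cpu(work_func_t fn)
	{
		unsigned int cpu;

		lockdep_assert_cpus_held();

		/* Pass 1: queue the work on every online CPU. */
		for_each_online_cpu(cpu) {
			struct work_struct *w = &per_cpu(percpu_work, cpu);

			INIT_WORK(w, fn);
			schedule_work_on(cpu, w);
		}

		/* Pass 2: wait until each queued item has finished. */
		for_each_online_cpu(cpu)
			flush_work(&per_cpu(percpu_work, cpu));
	}

Queuing everything first lets the per-CPU flushes run concurrently; waiting in
a second loop keeps the call synchronous, like the IPI broadcast it replaces.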

Patch

diff --git a/mm/slub.c b/mm/slub.c
index dceb289cb052..da48ada3d17f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2513,33 +2513,73 @@  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 	unfreeze_partials_cpu(s, c);
 }
 
+struct slub_flush_work {
+	struct work_struct work;
+	struct kmem_cache *s;
+	bool skip;
+};
+
 /*
  * Flush cpu slab.
  *
- * Called from IPI handler with interrupts disabled.
+ * Called from CPU work handler with migration disabled.
  */
-static void flush_cpu_slab(void *d)
+static void flush_cpu_slab(struct work_struct *w)
 {
-	struct kmem_cache *s = d;
-	struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+	struct kmem_cache *s;
+	struct kmem_cache_cpu *c;
+	struct slub_flush_work *sfw;
+
+	sfw = container_of(w, struct slub_flush_work, work);
+
+	s = sfw->s;
+	c = this_cpu_ptr(s->cpu_slab);
 
 	if (c->page)
-		flush_slab(s, c, false);
+		flush_slab(s, c, true);
 
 	unfreeze_partials(s);
 }
 
-static bool has_cpu_slab(int cpu, void *info)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 {
-	struct kmem_cache *s = info;
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
 	return c->page || slub_percpu_partial(c);
 }
 
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
 static void flush_all(struct kmem_cache *s)
 {
-	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+	struct slub_flush_work *sfw;
+	unsigned int cpu;
+
+	mutex_lock(&flush_lock);
+	cpus_read_lock();
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		if (!has_cpu_slab(cpu, s)) {
+			sfw->skip = true;
+			continue;
+		}
+		INIT_WORK(&sfw->work, flush_cpu_slab);
+		sfw->skip = false;
+		sfw->s = s;
+		schedule_work_on(cpu, &sfw->work);
+	}
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		if (sfw->skip)
+			continue;
+		flush_work(&sfw->work);
+	}
+
+	cpus_read_unlock();
+	mutex_unlock(&flush_lock);
 }
 
 /*