
[v2] kasan: Fix sleeping function called from invalid context on RT kernel

Message ID 20220401134649.2222485-1-qiang1.zhang@intel.com (mailing list archive)
State New
Series [v2] kasan: Fix sleeping function called from invalid context on RT kernel

Commit Message

Zqiang April 1, 2022, 1:46 p.m. UTC
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
preempt_count: 1, expected: 0
...........
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x60/0x8c
dump_stack+0x10/0x12
 __might_resched.cold+0x13b/0x173
rt_spin_lock+0x5b/0xf0
 ___cache_free+0xa5/0x180
qlist_free_all+0x7a/0x160
per_cpu_remove_cache+0x5f/0x70
smp_call_function_many_cond+0x4c4/0x4f0
on_each_cpu_cond_mask+0x49/0xc0
kasan_quarantine_remove_cache+0x54/0xf0
kasan_cache_shrink+0x9/0x10
kmem_cache_shrink+0x13/0x20
acpi_os_purge_cache+0xe/0x20
acpi_purge_cached_objects+0x21/0x6d
acpi_initialize_objects+0x15/0x3b
acpi_init+0x130/0x5ba
do_one_initcall+0xe5/0x5b0
kernel_init_freeable+0x34f/0x3ad
kernel_init+0x1e/0x140
ret_from_fork+0x22/0x30

When kmem_cache_shrink() is called, an IPI is triggered and
___cache_free() runs in IPI (hard interrupt) context, where a local_lock
or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
replaced with sleepable rt-spinlocks, so the above splat is triggered.
Fix it by moving qlist_free_all() from the IPI interrupt context to
task context when PREEMPT_RT is enabled.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
---
 v1->v2:
 Add a raw_spinlock to protect the per-cpu shrink qlist.

 mm/kasan/quarantine.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)
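
A minimal sketch of the failing path described above (editorial aside; the
lock name below is hypothetical and stands in for SLAB's internal locking,
which differs in detail):

	/* per_cpu_remove_cache() runs as an IPI handler: hardirq context */
	static void per_cpu_remove_cache(void *arg)
	{
		struct kmem_cache *cache = arg;
		struct qlist_head to_free = QLIST_INIT;

		/* move this CPU's quarantined objects for 'cache'... */
		qlist_free_all(&to_free, cache);	/* -> ___cache_free() */
	}

	/* sketch of the slab free path reached from qlist_free_all() */
	static void cache_free_sketch(struct kmem_cache *cache, void *obj)
	{
		spin_lock(&slab_lock);	/* on PREEMPT_RT: rtmutex-based, may sleep */
		/* ... return obj to its slab ... */
		spin_unlock(&slab_lock);
	}

Taking a sleeping lock in hardirq context is what fires the
__might_resched() splat quoted above.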

Comments

Dmitry Vyukov April 1, 2022, 4:06 p.m. UTC | #1
On Fri, 1 Apr 2022 at 15:46, Zqiang <qiang1.zhang@intel.com> wrote:
>
> BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
> preempt_count: 1, expected: 0
> ...........
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x60/0x8c
> dump_stack+0x10/0x12
>  __might_resched.cold+0x13b/0x173
> rt_spin_lock+0x5b/0xf0
>  ___cache_free+0xa5/0x180
> qlist_free_all+0x7a/0x160
> per_cpu_remove_cache+0x5f/0x70
> smp_call_function_many_cond+0x4c4/0x4f0
> on_each_cpu_cond_mask+0x49/0xc0
> kasan_quarantine_remove_cache+0x54/0xf0
> kasan_cache_shrink+0x9/0x10
> kmem_cache_shrink+0x13/0x20
> acpi_os_purge_cache+0xe/0x20
> acpi_purge_cached_objects+0x21/0x6d
> acpi_initialize_objects+0x15/0x3b
> acpi_init+0x130/0x5ba
> do_one_initcall+0xe5/0x5b0
> kernel_init_freeable+0x34f/0x3ad
> kernel_init+0x1e/0x140
> ret_from_fork+0x22/0x30
>
> When kmem_cache_shrink() is called, an IPI is triggered and
> ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> replaced with sleepable rt-spinlocks, so the above splat is triggered.
> Fix it by moving qlist_free_all() from the IPI interrupt context to
> task context when PREEMPT_RT is enabled.
>
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> ---
>  v1->v2:
>  Add a raw_spinlock to protect the per-cpu shrink qlist.
>
>  mm/kasan/quarantine.c | 40 ++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
> index 08291ed33e93..0e33d30abb8d 100644
> --- a/mm/kasan/quarantine.c
> +++ b/mm/kasan/quarantine.c
> @@ -99,6 +99,17 @@ static unsigned long quarantine_size;
>  static DEFINE_RAW_SPINLOCK(quarantine_lock);
>  DEFINE_STATIC_SRCU(remove_cache_srcu);
>
> +#ifdef CONFIG_PREEMPT_RT
> +struct cpu_shrink_qlist {
> +       raw_spinlock_t lock;
> +       struct qlist_head qlist;
> +};
> +
> +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
> +       .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
> +};
> +#endif
> +
>  /* Maximum size of the global queue. */
>  static unsigned long quarantine_max_size;
>
> @@ -311,12 +322,23 @@ static void qlist_move_cache(struct qlist_head *from,
>  static void per_cpu_remove_cache(void *arg)
>  {
>         struct kmem_cache *cache = arg;
> -       struct qlist_head to_free = QLIST_INIT;
>         struct qlist_head *q;
> -
> +#ifndef CONFIG_PREEMPT_RT
> +       struct qlist_head to_free = QLIST_INIT;
> +#else
> +       unsigned long flags;
> +       struct cpu_shrink_qlist *sq;
> +#endif
>         q = this_cpu_ptr(&cpu_quarantine);
> +#ifndef CONFIG_PREEMPT_RT
>         qlist_move_cache(q, &to_free, cache);
>         qlist_free_all(&to_free, cache);
> +#else
> +       sq = this_cpu_ptr(&shrink_qlist);
> +       raw_spin_lock_irqsave(&sq->lock, flags);
> +       qlist_move_cache(q, &sq->qlist, cache);
> +       raw_spin_unlock_irqrestore(&sq->lock, flags);
> +#endif
>  }
>
>  /* Free all quarantined objects belonging to cache. */
> @@ -324,6 +346,10 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
>  {
>         unsigned long flags, i;
>         struct qlist_head to_free = QLIST_INIT;
> +#ifdef CONFIG_PREEMPT_RT
> +       int cpu;
> +       struct cpu_shrink_qlist *sq;
> +#endif
>
>         /*
>          * Must be careful to not miss any objects that are being moved from
> @@ -334,6 +360,16 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
>          */
>         on_each_cpu(per_cpu_remove_cache, cache, 1);
>
> +#ifdef CONFIG_PREEMPT_RT
> +       for_each_online_cpu(cpu) {
> +               sq = per_cpu_ptr(&shrink_qlist, cpu);
> +               raw_spin_lock_irqsave(&sq->lock, flags);
> +               qlist_move_cache(&sq->qlist, &to_free, cache);
> +               raw_spin_unlock_irqrestore(&sq->lock, flags);
> +       }
> +       qlist_free_all(&to_free, cache);

I think now there is another subtle bug.
I assume that by the time kasan_quarantine_remove_cache(cache) returns,
all objects belonging to the cache must be freed. I think there are
scenarios where it's not the case.
Consider there is thread 1 that calls kasan_quarantine_remove_cache(A)
and thread 2 that calls kasan_quarantine_remove_cache(B).
Consider that the kasan_quarantine_remove_cache callbacks for both A and B
have finished and shrink_qlist contains all objects that belong to
caches A and B.
Now thread 1 executes the for_each_online_cpu part and collects all
objects into the local to_free list.
Now thread 2 executes the for_each_online_cpu, calls qlist_free_all
(on an empty list) and returns from kasan_quarantine_remove_cache.
Then cache B is completely destroyed and freed.
Now thread 1 resumes and calls qlist_free_all for objects from cache B.
Bang!
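
A compressed timeline of this hypothesis (sketch only; the follow-ups
below establish that step 3 cannot actually pick up B's objects, because
the drain filters by cache):

	/*
	 * T1: remove_cache(A)               T2: remove_cache(B)
	 * ---------------------------       ---------------------------
	 * 1. on_each_cpu() moves A's
	 *    objects into shrink_qlist
	 *                                    2. on_each_cpu() moves B's
	 *                                       objects into shrink_qlist
	 * 3. for_each_online_cpu() drains
	 *    shrink_qlist into to_free
	 *    (hypothetically A's and B's)
	 *                                    4. drain finds the lists empty,
	 *                                       returns; cache B is destroyed
	 * 5. qlist_free_all(&to_free, A)
	 *    would free B's objects into
	 *    the now-destroyed cache
	 */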




> +#endif
> +
>         raw_spin_lock_irqsave(&quarantine_lock, flags);
>         for (i = 0; i < QUARANTINE_BATCHES; i++) {
>                 if (qlist_empty(&global_quarantine[i]))
> --
> 2.25.1
>
Zqiang April 1, 2022, 11:14 p.m. UTC | #2
On Fri, 1 Apr 2022 at 15:46, Zqiang <qiang1.zhang@intel.com> wrote:
>
> BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
> preempt_count: 1, expected: 0
> ...........
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x60/0x8c
> dump_stack+0x10/0x12
>  __might_resched.cold+0x13b/0x173
> rt_spin_lock+0x5b/0xf0
>  ___cache_free+0xa5/0x180
> qlist_free_all+0x7a/0x160
> per_cpu_remove_cache+0x5f/0x70
> smp_call_function_many_cond+0x4c4/0x4f0
> on_each_cpu_cond_mask+0x49/0xc0
> kasan_quarantine_remove_cache+0x54/0xf0
> kasan_cache_shrink+0x9/0x10
> kmem_cache_shrink+0x13/0x20
> acpi_os_purge_cache+0xe/0x20
> acpi_purge_cached_objects+0x21/0x6d
> acpi_initialize_objects+0x15/0x3b
> acpi_init+0x130/0x5ba
> do_one_initcall+0xe5/0x5b0
> kernel_init_freeable+0x34f/0x3ad
> kernel_init+0x1e/0x140
> ret_from_fork+0x22/0x30
>
> When kmem_cache_shrink() is called, an IPI is triggered and
> ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> replaced with sleepable rt-spinlocks, so the above splat is triggered.
> Fix it by moving qlist_free_all() from the IPI interrupt context to
> task context when PREEMPT_RT is enabled.
>
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> ---
>  v1->v2:
>  Add a raw_spinlock to protect the per-cpu shrink qlist.
>
>  mm/kasan/quarantine.c | 40 ++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
> index 08291ed33e93..0e33d30abb8d 100644
> --- a/mm/kasan/quarantine.c
> +++ b/mm/kasan/quarantine.c
> @@ -99,6 +99,17 @@ static unsigned long quarantine_size;
>  static DEFINE_RAW_SPINLOCK(quarantine_lock);
>  DEFINE_STATIC_SRCU(remove_cache_srcu);
>
> +#ifdef CONFIG_PREEMPT_RT
> +struct cpu_shrink_qlist {
> +       raw_spinlock_t lock;
> +       struct qlist_head qlist;
> +};
> +
> +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
> +       .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
> +};
> +#endif
> +
>  /* Maximum size of the global queue. */
>  static unsigned long quarantine_max_size;
>
> @@ -311,12 +322,23 @@ static void qlist_move_cache(struct qlist_head *from,
>  static void per_cpu_remove_cache(void *arg)
>  {
>         struct kmem_cache *cache = arg;
> -       struct qlist_head to_free = QLIST_INIT;
>         struct qlist_head *q;
> -
> +#ifndef CONFIG_PREEMPT_RT
> +       struct qlist_head to_free = QLIST_INIT;
> +#else
> +       unsigned long flags;
> +       struct cpu_shrink_qlist *sq;
> +#endif
>         q = this_cpu_ptr(&cpu_quarantine);
> +#ifndef CONFIG_PREEMPT_RT
>         qlist_move_cache(q, &to_free, cache);
>         qlist_free_all(&to_free, cache);
> +#else
> +       sq = this_cpu_ptr(&shrink_qlist);
> +       raw_spin_lock_irqsave(&sq->lock, flags);
> +       qlist_move_cache(q, &sq->qlist, cache);
> +       raw_spin_unlock_irqrestore(&sq->lock, flags);
> +#endif
>  }
>
>  /* Free all quarantined objects belonging to cache. */
> @@ -324,6 +346,10 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
>  {
>         unsigned long flags, i;
>         struct qlist_head to_free = QLIST_INIT;
> +#ifdef CONFIG_PREEMPT_RT
> +       int cpu;
> +       struct cpu_shrink_qlist *sq;
> +#endif
>
>         /*
>          * Must be careful to not miss any objects that are being moved from
> @@ -334,6 +360,16 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
>          */
>         on_each_cpu(per_cpu_remove_cache, cache, 1);
>
> +#ifdef CONFIG_PREEMPT_RT
> +       for_each_online_cpu(cpu) {
> +               sq = per_cpu_ptr(&shrink_qlist, cpu);
> +               raw_spin_lock_irqsave(&sq->lock, flags);
> +               qlist_move_cache(&sq->qlist, &to_free, cache);
> +               raw_spin_unlock_irqrestore(&sq->lock, flags);
> +       }
> +       qlist_free_all(&to_free, cache);

>
>I think now there is another subtle bug.
>I assume that by the time kasan_quarantine_remove_cache(cache) returns, all objects belonging to the cache must be freed. I think there are scenarios where it's not the case.
>Consider there is thread 1 that calls kasan_quarantine_remove_cache(A) and thread 2 that calls kasan_quarantine_remove_cache(B).
>Consider that the kasan_quarantine_remove_cache callbacks for both A and B have finished and shrink_qlist contains all objects that belong to caches A and B.
>Now thread 1 executes the for_each_online_cpu part and collects all objects into the local to_free list.

According to my understanding, thread 1 only collects objects that belong
to cache A, because qlist_move_cache(&sq->qlist, &to_free, cache) filters
by cache again. Or did I miss something?

Thanks
Zqiang

>Now thread 2 executes the for_each_online_cpu, calls qlist_free_all (on an empty list) and returns from kasan_quarantine_remove_cache.
>Then cache B is completely destroyed and freed.
>Now thread 1 resumes and calls qlist_free_all for objects from cache B.
>Bang!
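
For reference on the filtering point above: qlist_move_cache() walks the
source list and moves only objects whose owning cache matches, putting
everything else back. A simplified sketch, modeled on mm/kasan/quarantine.c
(helper details may vary across kernel versions):

	static void qlist_move_cache(struct qlist_head *from,
				     struct qlist_head *to,
				     struct kmem_cache *cache)
	{
		struct qlist_node *curr;

		if (unlikely(qlist_empty(from)))
			return;

		curr = from->head;
		qlist_init(from);	/* rebuild 'from' in place */
		while (curr) {
			struct qlist_node *next = curr->next;
			struct kmem_cache *obj_cache = qlink_to_cache(curr);

			if (obj_cache == cache)	/* ours: move to 'to' */
				qlist_put(to, curr, obj_cache->size);
			else			/* another cache's: keep */
				qlist_put(from, curr, obj_cache->size);

			curr = next;
		}
	}

So a drain with cache == A leaves B's objects on shrink_qlist for
thread 2's own drain to free.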




> +#endif
> +
>         raw_spin_lock_irqsave(&quarantine_lock, flags);
>         for (i = 0; i < QUARANTINE_BATCHES; i++) {
>                 if (qlist_empty(&global_quarantine[i]))
> --
> 2.25.1
>
Dmitry Vyukov April 2, 2022, 7:03 a.m. UTC | #3
On Sat, 2 Apr 2022 at 01:15, Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
>
>
>  On Fri, 1 Apr 2022 at 15:46, Zqiang <qiang1.zhang@intel.com> wrote:
> >
> > BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
> > in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
> > preempt_count: 1, expected: 0
> > ...........
> > CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> > BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
> > Call Trace:
> > <TASK>
> > dump_stack_lvl+0x60/0x8c
> > dump_stack+0x10/0x12
> >  __might_resched.cold+0x13b/0x173
> > rt_spin_lock+0x5b/0xf0
> >  ___cache_free+0xa5/0x180
> > qlist_free_all+0x7a/0x160
> > per_cpu_remove_cache+0x5f/0x70
> > smp_call_function_many_cond+0x4c4/0x4f0
> > on_each_cpu_cond_mask+0x49/0xc0
> > kasan_quarantine_remove_cache+0x54/0xf0
> > kasan_cache_shrink+0x9/0x10
> > kmem_cache_shrink+0x13/0x20
> > acpi_os_purge_cache+0xe/0x20
> > acpi_purge_cached_objects+0x21/0x6d
> > acpi_initialize_objects+0x15/0x3b
> > acpi_init+0x130/0x5ba
> > do_one_initcall+0xe5/0x5b0
> > kernel_init_freeable+0x34f/0x3ad
> > kernel_init+0x1e/0x140
> > ret_from_fork+0x22/0x30
> >
> > When kmem_cache_shrink() is called, an IPI is triggered and
> > ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> > or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> > replaced with sleepable rt-spinlocks, so the above splat is triggered.
> > Fix it by moving qlist_free_all() from the IPI interrupt context to
> > task context when PREEMPT_RT is enabled.
> >
> > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> > ---
> >  v1->v2:
> >  Add a raw_spinlock to protect the per-cpu shrink qlist.
> >
> >  mm/kasan/quarantine.c | 40 ++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 38 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
> > index 08291ed33e93..0e33d30abb8d 100644
> > --- a/mm/kasan/quarantine.c
> > +++ b/mm/kasan/quarantine.c
> > @@ -99,6 +99,17 @@ static unsigned long quarantine_size;
> >  static DEFINE_RAW_SPINLOCK(quarantine_lock);
> >  DEFINE_STATIC_SRCU(remove_cache_srcu);
> >
> > +#ifdef CONFIG_PREEMPT_RT
> > +struct cpu_shrink_qlist {
> > +       raw_spinlock_t lock;
> > +       struct qlist_head qlist;
> > +};
> > +
> > +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
> > +       .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
> > +};
> > +#endif
> > +
> >  /* Maximum size of the global queue. */
> >  static unsigned long quarantine_max_size;
> >
> > @@ -311,12 +322,23 @@ static void qlist_move_cache(struct qlist_head *from,
> >  static void per_cpu_remove_cache(void *arg)
> >  {
> >         struct kmem_cache *cache = arg;
> > -       struct qlist_head to_free = QLIST_INIT;
> >         struct qlist_head *q;
> > -
> > +#ifndef CONFIG_PREEMPT_RT
> > +       struct qlist_head to_free = QLIST_INIT;
> > +#else
> > +       unsigned long flags;
> > +       struct cpu_shrink_qlist *sq;
> > +#endif
> >         q = this_cpu_ptr(&cpu_quarantine);
> > +#ifndef CONFIG_PREEMPT_RT
> >         qlist_move_cache(q, &to_free, cache);
> >         qlist_free_all(&to_free, cache);
> > +#else
> > +       sq = this_cpu_ptr(&shrink_qlist);
> > +       raw_spin_lock_irqsave(&sq->lock, flags);
> > +       qlist_move_cache(q, &sq->qlist, cache);
> > +       raw_spin_unlock_irqrestore(&sq->lock, flags);
> > +#endif
> >  }
> >
> >  /* Free all quarantined objects belonging to cache. */
> > @@ -324,6 +346,10 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
> >  {
> >         unsigned long flags, i;
> >         struct qlist_head to_free = QLIST_INIT;
> > +#ifdef CONFIG_PREEMPT_RT
> > +       int cpu;
> > +       struct cpu_shrink_qlist *sq;
> > +#endif
> >
> >         /*
> >          * Must be careful to not miss any objects that are being moved from
> > @@ -334,6 +360,16 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
> >          */
> >         on_each_cpu(per_cpu_remove_cache, cache, 1);
> >
> > +#ifdef CONFIG_PREEMPT_RT
> > +       for_each_online_cpu(cpu) {
> > +               sq = per_cpu_ptr(&shrink_qlist, cpu);
> > +               raw_spin_lock_irqsave(&sq->lock, flags);
> > +               qlist_move_cache(&sq->qlist, &to_free, cache);
> > +               raw_spin_unlock_irqrestore(&sq->lock, flags);
> > +       }
> > +       qlist_free_all(&to_free, cache);
>
> >
> >I think now there is another subtle bug.
> >I assume that by the time kasan_quarantine_remove_cache(cache) returns, all objects belonging to the cache must be freed. I think there are scenarios where it's not the case.
> >Consider there is thread 1 that calls kasan_quarantine_remove_cache(A) and thread 2 that calls kasan_quarantine_remove_cache(B).
> >Consider that the kasan_quarantine_remove_cache callbacks for both A and B have finished and shrink_qlist contains all objects that belong to caches A and B.
> >Now thread 1 executes the for_each_online_cpu part and collects all objects into the local to_free list.
>
> According to my understanding, thread 1 only collects objects that belong to cache A, because qlist_move_cache(&sq->qlist, &to_free, cache) filters by cache again. Or did I miss something?

You are right. I missed that kasan_quarantine_remove_cache also
filters based on cache.

Acked-by: Dmitry Vyukov <dvyukov@google.com>

> >Now thread 2 executes the for_each_online_cpu, calls qlist_free_all (on an empty list) and returns from kasan_quarantine_remove_cache.
> >Then cache B is completely destroyed and freed.
> >Now thread 1 resumes and calls qlist_free_all for objects from cache B.
> >Bang!
>
>
>
>
> > +#endif
> > +
> >         raw_spin_lock_irqsave(&quarantine_lock, flags);
> >         for (i = 0; i < QUARANTINE_BATCHES; i++) {
> >                 if (qlist_empty(&global_quarantine[i]))
> > --
> > 2.25.1
> >
Zqiang April 6, 2022, 4:39 a.m. UTC | #4
On Sat, 2 Apr 2022 at 01:15, Zhang, Qiang1 <qiang1.zhang@intel.com> wrote:
>
>
>  On Fri, 1 Apr 2022 at 15:46, Zqiang <qiang1.zhang@intel.com> wrote:
> >
> > BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
> > in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0
> > preempt_count: 1, expected: 0
> > ...........
> > CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt16-yocto-preempt-rt #22
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> > BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
> > Call Trace:
> > <TASK>
> > dump_stack_lvl+0x60/0x8c
> > dump_stack+0x10/0x12
> >  __might_resched.cold+0x13b/0x173
> > rt_spin_lock+0x5b/0xf0
> >  ___cache_free+0xa5/0x180
> > qlist_free_all+0x7a/0x160
> > per_cpu_remove_cache+0x5f/0x70
> > smp_call_function_many_cond+0x4c4/0x4f0
> > on_each_cpu_cond_mask+0x49/0xc0
> > kasan_quarantine_remove_cache+0x54/0xf0
> > kasan_cache_shrink+0x9/0x10
> > kmem_cache_shrink+0x13/0x20
> > acpi_os_purge_cache+0xe/0x20
> > acpi_purge_cached_objects+0x21/0x6d
> > acpi_initialize_objects+0x15/0x3b
> > acpi_init+0x130/0x5ba
> > do_one_initcall+0xe5/0x5b0
> > kernel_init_freeable+0x34f/0x3ad
> > kernel_init+0x1e/0x140
> > ret_from_fork+0x22/0x30
> >
> > When kmem_cache_shrink() is called, an IPI is triggered and
> > ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> > or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> > replaced with sleepable rt-spinlocks, so the above splat is triggered.
> > Fix it by moving qlist_free_all() from the IPI interrupt context to
> > task context when PREEMPT_RT is enabled.
> >
> > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> > ---
> >  v1->v2:
> >  Add a raw_spinlock to protect the per-cpu shrink qlist.
> >
> >  mm/kasan/quarantine.c | 40 ++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 38 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
> > index 08291ed33e93..0e33d30abb8d 100644
> > --- a/mm/kasan/quarantine.c
> > +++ b/mm/kasan/quarantine.c
> > @@ -99,6 +99,17 @@ static unsigned long quarantine_size;
> >  static DEFINE_RAW_SPINLOCK(quarantine_lock);
> >  DEFINE_STATIC_SRCU(remove_cache_srcu);
> >
> > +#ifdef CONFIG_PREEMPT_RT
> > +struct cpu_shrink_qlist {
> > +       raw_spinlock_t lock;
> > +       struct qlist_head qlist;
> > +};
> > +
> > +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
> > +       .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
> > +};
> > +#endif
> > +
> >  /* Maximum size of the global queue. */
> >  static unsigned long quarantine_max_size;
> >
> > @@ -311,12 +322,23 @@ static void qlist_move_cache(struct qlist_head *from,
> >  static void per_cpu_remove_cache(void *arg)
> >  {
> >         struct kmem_cache *cache = arg;
> > -       struct qlist_head to_free = QLIST_INIT;
> >         struct qlist_head *q;
> > -
> > +#ifndef CONFIG_PREEMPT_RT
> > +       struct qlist_head to_free = QLIST_INIT;
> > +#else
> > +       unsigned long flags;
> > +       struct cpu_shrink_qlist *sq;
> > +#endif
> >         q = this_cpu_ptr(&cpu_quarantine);
> > +#ifndef CONFIG_PREEMPT_RT
> >         qlist_move_cache(q, &to_free, cache);
> >         qlist_free_all(&to_free, cache);
> > +#else
> > +       sq = this_cpu_ptr(&shrink_qlist);
> > +       raw_spin_lock_irqsave(&sq->lock, flags);
> > +       qlist_move_cache(q, &sq->qlist, cache);
> > +       raw_spin_unlock_irqrestore(&sq->lock, flags);
> > +#endif
> >  }
> >
> >  /* Free all quarantined objects belonging to cache. */
> > @@ -324,6 +346,10 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
> >  {
> >         unsigned long flags, i;
> >         struct qlist_head to_free = QLIST_INIT;
> > +#ifdef CONFIG_PREEMPT_RT
> > +       int cpu;
> > +       struct cpu_shrink_qlist *sq;
> > +#endif
> >
> >         /*
> >          * Must be careful to not miss any objects that are being moved from
> > @@ -334,6 +360,16 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
> >          */
> >         on_each_cpu(per_cpu_remove_cache, cache, 1);
> >
> > +#ifdef CONFIG_PREEMPT_RT
> > +       for_each_online_cpu(cpu) {
> > +               sq = per_cpu_ptr(&shrink_qlist, cpu);
> > +               raw_spin_lock_irqsave(&sq->lock, flags);
> > +               qlist_move_cache(&sq->qlist, &to_free, cache);
> > +               raw_spin_unlock_irqrestore(&sq->lock, flags);
> > +       }
> > +       qlist_free_all(&to_free, cache);
>
> >
> >I think now there is another subtle bug.
> >I assume that by the time kasan_quarantine_remove_cache(cache) returns, all objects belonging to the cache must be freed. I think there are scenarios where it's not the case.
> >Consider there is thread 1 that calls kasan_quarantine_remove_cache(A) and thread 2 that calls kasan_quarantine_remove_cache(B).
> >Consider that the kasan_quarantine_remove_cache callbacks for both A and B have finished and shrink_qlist contains all objects that belong to caches A and B.
> >Now thread 1 executes the for_each_online_cpu part and collects all objects into the local to_free list.
>
> According to my understanding, thread 1 only collects objects that belong to cache A, because qlist_move_cache(&sq->qlist, &to_free, cache) filters by cache again. Or did I miss something?

>You are right. I missed that kasan_quarantine_remove_cache also filters based on cache.
>
>Acked-by: Dmitry Vyukov <dvyukov@google.com>

Cc: Andrew Morton

> >Now thread 2 executes the for_each_online_cpu, calls qlist_free_all (on an empty list) and returns from kasan_quarantine_remove_cache.
> >Then cache B is completely destroyed and freed.
> >Now thread 1 resumes and calls qlist_free_all for objects from cache B.
> >Bang!
>
>
>
>
> > +#endif
> > +
> >         raw_spin_lock_irqsave(&quarantine_lock, flags);
> >         for (i = 0; i < QUARANTINE_BATCHES; i++) {
> >                 if (qlist_empty(&global_quarantine[i]))
> > --
> > 2.25.1
> >
Andrew Morton April 15, 2022, 6:34 p.m. UTC | #5
On Fri,  1 Apr 2022 21:46:49 +0800 Zqiang <qiang1.zhang@intel.com> wrote:

> When kmem_cache_shrink() is called, an IPI is triggered and
> ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> replaced with sleepable rt-spinlocks, so the above splat is triggered.
> Fix it by moving qlist_free_all() from the IPI interrupt context to
> task context when PREEMPT_RT is enabled.

This patch is rather ifdeffy so I propose the below cleanup.  Please
review and test?

Note that it incorporates the changes from your
https://lkml.kernel.org/r/20220414025925.2423818-1-qiang1.zhang@intel.com

btw, how are we supposed to test PREEMPT_RT builds?  I had to patch
arch/Kconfig.
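
(For context: mainline gates PREEMPT_RT behind ARCH_SUPPORTS_RT, which no
in-tree architecture selects, so the arch/Kconfig change alluded to is
presumably a one-liner along these lines, an assumption rather than the
actual edit:

 config ARCH_SUPPORTS_RT
-	bool
+	def_bool y

With that and CONFIG_EXPERT=y, CONFIG_PREEMPT_RT becomes selectable via
kernel/Kconfig.preempt.)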

--- a/mm/kasan/quarantine.c~kasan-fix-sleeping-function-called-from-invalid-context-on-rt-kernel-fix
+++ a/mm/kasan/quarantine.c
@@ -319,28 +319,37 @@ static void qlist_move_cache(struct qlis
 	}
 }
 
-static void per_cpu_remove_cache(void *arg)
+#ifndef CONFIG_PREEMPT_RT
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
 {
 	struct kmem_cache *cache = arg;
-	struct qlist_head *q;
-#ifndef CONFIG_PREEMPT_RT
 	struct qlist_head to_free = QLIST_INIT;
-#else
-	unsigned long flags;
-	struct cpu_shrink_qlist *sq;
-#endif
-	q = this_cpu_ptr(&cpu_quarantine);
-#ifndef CONFIG_PREEMPT_RT
-	if (READ_ONCE(q->offline))
-		return;
+
 	qlist_move_cache(q, &to_free, cache);
 	qlist_free_all(&to_free, cache);
+}
 #else
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
+{
+	struct kmem_cache *cache = arg;
+	unsigned long flags;
+	struct cpu_shrink_qlist *sq;
+
 	sq = this_cpu_ptr(&shrink_qlist);
 	raw_spin_lock_irqsave(&sq->lock, flags);
 	qlist_move_cache(q, &sq->qlist, cache);
 	raw_spin_unlock_irqrestore(&sq->lock, flags);
+}
 #endif
+
+static void per_cpu_remove_cache(void *arg)
+{
+	struct qlist_head *q;
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	if (READ_ONCE(q->offline))
+		return;
+	__per_cpu_remove_cache(q, arg);
 }
 
 /* Free all quarantined objects belonging to cache. */
@@ -348,10 +357,6 @@ void kasan_quarantine_remove_cache(struc
 {
 	unsigned long flags, i;
 	struct qlist_head to_free = QLIST_INIT;
-#ifdef CONFIG_PREEMPT_RT
-	int cpu;
-	struct cpu_shrink_qlist *sq;
-#endif
 
 	/*
 	 * Must be careful to not miss any objects that are being moved from
@@ -363,13 +368,18 @@ void kasan_quarantine_remove_cache(struc
 	on_each_cpu(per_cpu_remove_cache, cache, 1);
 
 #ifdef CONFIG_PREEMPT_RT
-	for_each_online_cpu(cpu) {
-		sq = per_cpu_ptr(&shrink_qlist, cpu);
-		raw_spin_lock_irqsave(&sq->lock, flags);
-		qlist_move_cache(&sq->qlist, &to_free, cache);
-		raw_spin_unlock_irqrestore(&sq->lock, flags);
+	{
+		int cpu;
+		struct cpu_shrink_qlist *sq;
+
+		for_each_online_cpu(cpu) {
+			sq = per_cpu_ptr(&shrink_qlist, cpu);
+			raw_spin_lock_irqsave(&sq->lock, flags);
+			qlist_move_cache(&sq->qlist, &to_free, cache);
+			raw_spin_unlock_irqrestore(&sq->lock, flags);
+		}
+		qlist_free_all(&to_free, cache);
 	}
-	qlist_free_all(&to_free, cache);
 #endif
 
 	raw_spin_lock_irqsave(&quarantine_lock, flags);
Zqiang April 16, 2022, 2:47 a.m. UTC | #6
On Fri,  1 Apr 2022 21:46:49 +0800 Zqiang <qiang1.zhang@intel.com> wrote:

> When kmem_cache_shrink() is called, an IPI is triggered and
> ___cache_free() runs in IPI (hard interrupt) context, where a local_lock
> or spinlock is acquired. On a PREEMPT_RT kernel, these locks are
> replaced with sleepable rt-spinlocks, so the above splat is triggered.
> Fix it by moving qlist_free_all() from the IPI interrupt context to
> task context when PREEMPT_RT is enabled.
>
>This patch is rather ifdeffy so I propose the below cleanup.  Please review and test?

Thanks, it looks clearer.

>Note that it incorporates the changes from your https://lkml.kernel.org/r/20220414025925.2423818-1-qiang1.zhang@intel.com
>
>btw, how are we supposed to test PREEMPT_RT builds?  I had to patch arch/Kconfig.

The attachment is a config file, and I tested with the linux-5.17.y-rt branch (https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git).

Thanks
Zqiang
	
>
--- a/mm/kasan/quarantine.c~kasan-fix-sleeping-function-called-from-invalid-context-on-rt-kernel-fix
+++ a/mm/kasan/quarantine.c
@@ -319,28 +319,37 @@ static void qlist_move_cache(struct qlis
 	}
 }
 
-static void per_cpu_remove_cache(void *arg)
+#ifndef CONFIG_PREEMPT_RT
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
 {
 	struct kmem_cache *cache = arg;
-	struct qlist_head *q;
-#ifndef CONFIG_PREEMPT_RT
 	struct qlist_head to_free = QLIST_INIT;
-#else
-	unsigned long flags;
-	struct cpu_shrink_qlist *sq;
-#endif
-	q = this_cpu_ptr(&cpu_quarantine);
-#ifndef CONFIG_PREEMPT_RT
-	if (READ_ONCE(q->offline))
-		return;
+
 	qlist_move_cache(q, &to_free, cache);
 	qlist_free_all(&to_free, cache);
+}
 #else
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
+{
+	struct kmem_cache *cache = arg;
+	unsigned long flags;
+	struct cpu_shrink_qlist *sq;
+
 	sq = this_cpu_ptr(&shrink_qlist);
 	raw_spin_lock_irqsave(&sq->lock, flags);
 	qlist_move_cache(q, &sq->qlist, cache);
 	raw_spin_unlock_irqrestore(&sq->lock, flags);
+}
 #endif
+
+static void per_cpu_remove_cache(void *arg)
+{
+	struct qlist_head *q;
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	if (READ_ONCE(q->offline))
+		return;
+	__per_cpu_remove_cache(q, arg);
 }
 
 /* Free all quarantined objects belonging to cache. */
@@ -348,10 +357,6 @@ void kasan_quarantine_remove_cache(struc
 {
 	unsigned long flags, i;
 	struct qlist_head to_free = QLIST_INIT;
-#ifdef CONFIG_PREEMPT_RT
-	int cpu;
-	struct cpu_shrink_qlist *sq;
-#endif
 
 	/*
 	 * Must be careful to not miss any objects that are being moved from
@@ -363,13 +368,18 @@ void kasan_quarantine_remove_cache(struc
 	on_each_cpu(per_cpu_remove_cache, cache, 1);
 
 #ifdef CONFIG_PREEMPT_RT
-	for_each_online_cpu(cpu) {
-		sq = per_cpu_ptr(&shrink_qlist, cpu);
-		raw_spin_lock_irqsave(&sq->lock, flags);
-		qlist_move_cache(&sq->qlist, &to_free, cache);
-		raw_spin_unlock_irqrestore(&sq->lock, flags);
+	{
+		int cpu;
+		struct cpu_shrink_qlist *sq;
+
+		for_each_online_cpu(cpu) {
+			sq = per_cpu_ptr(&shrink_qlist, cpu);
+			raw_spin_lock_irqsave(&sq->lock, flags);
+			qlist_move_cache(&sq->qlist, &to_free, cache);
+			raw_spin_unlock_irqrestore(&sq->lock, flags);
+		}
+		qlist_free_all(&to_free, cache);
 	}
-	qlist_free_all(&to_free, cache);
 #endif
 
 	raw_spin_lock_irqsave(&quarantine_lock, flags);
_
Andrew Morton April 19, 2022, 4:20 a.m. UTC | #7
On Sat, 16 Apr 2022 02:47:43 +0000 "Zhang, Qiang1" <qiang1.zhang@intel.com> wrote:

> >Note that it incorporates the changes from your https://lkml.kernel.org/r/20220414025925.2423818-1-qiang1.zhang@intel.com
> >
> >btw, how are we supposed to test PREEMPT_RT builds?  I had to patch arch/Kconfig.
> 
> The attachment is a config file, and I tested with the linux-5.17.y-rt branch (https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git).

OK, but it's a problem that we're adding CONFIG_PREEMPT_RT code to the
mainline kernel without providing for it to be at least compile-tested.

Sebastian, is there something we can do here to increase the coverage
testing?

Patch

diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 08291ed33e93..0e33d30abb8d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -99,6 +99,17 @@  static unsigned long quarantine_size;
 static DEFINE_RAW_SPINLOCK(quarantine_lock);
 DEFINE_STATIC_SRCU(remove_cache_srcu);
 
+#ifdef CONFIG_PREEMPT_RT
+struct cpu_shrink_qlist {
+	raw_spinlock_t lock;
+	struct qlist_head qlist;
+};
+
+static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
+};
+#endif
+
 /* Maximum size of the global queue. */
 static unsigned long quarantine_max_size;
 
@@ -311,12 +322,23 @@  static void qlist_move_cache(struct qlist_head *from,
 static void per_cpu_remove_cache(void *arg)
 {
 	struct kmem_cache *cache = arg;
-	struct qlist_head to_free = QLIST_INIT;
 	struct qlist_head *q;
-
+#ifndef CONFIG_PREEMPT_RT
+	struct qlist_head to_free = QLIST_INIT;
+#else
+	unsigned long flags;
+	struct cpu_shrink_qlist *sq;
+#endif
 	q = this_cpu_ptr(&cpu_quarantine);
+#ifndef CONFIG_PREEMPT_RT
 	qlist_move_cache(q, &to_free, cache);
 	qlist_free_all(&to_free, cache);
+#else
+	sq = this_cpu_ptr(&shrink_qlist);
+	raw_spin_lock_irqsave(&sq->lock, flags);
+	qlist_move_cache(q, &sq->qlist, cache);
+	raw_spin_unlock_irqrestore(&sq->lock, flags);
+#endif
 }
 
 /* Free all quarantined objects belonging to cache. */
@@ -324,6 +346,10 @@  void kasan_quarantine_remove_cache(struct kmem_cache *cache)
 {
 	unsigned long flags, i;
 	struct qlist_head to_free = QLIST_INIT;
+#ifdef CONFIG_PREEMPT_RT
+	int cpu;
+	struct cpu_shrink_qlist *sq;
+#endif
 
 	/*
 	 * Must be careful to not miss any objects that are being moved from
@@ -334,6 +360,16 @@  void kasan_quarantine_remove_cache(struct kmem_cache *cache)
 	 */
 	on_each_cpu(per_cpu_remove_cache, cache, 1);
 
+#ifdef CONFIG_PREEMPT_RT
+	for_each_online_cpu(cpu) {
+		sq = per_cpu_ptr(&shrink_qlist, cpu);
+		raw_spin_lock_irqsave(&sq->lock, flags);
+		qlist_move_cache(&sq->qlist, &to_free, cache);
+		raw_spin_unlock_irqrestore(&sq->lock, flags);
+	}
+	qlist_free_all(&to_free, cache);
+#endif
+
 	raw_spin_lock_irqsave(&quarantine_lock, flags);
 	for (i = 0; i < QUARANTINE_BATCHES; i++) {
 		if (qlist_empty(&global_quarantine[i]))