
[2/5] mm/slub: use stackdepot to save stack trace in objects

Message ID 20220225180318.20594-3-vbabka@suse.cz (mailing list archive)
State New
Series: SLUB debugfs improvements based on stackdepot

Commit Message

Vlastimil Babka Feb. 25, 2022, 6:03 p.m. UTC
From: Oliver Glitta <glittao@gmail.com>

Many stack traces are similar, so many of the per-object addrs arrays
are identical. Stackdepot saves each unique stack only once.

Replace the addrs field in struct track with a depot_stack_handle_t
handle, and use stackdepot to save the stack trace.

The benefits are smaller memory overhead and the possibility to
aggregate per-cache statistics in the following patch using the
stackdepot handle instead of matching stacks manually.

[ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]

This was initially merged as commit 788691464c29 and reverted by commit
ae14c63a9f20 due to several issues that should now be fixed.
The problem of unconditional memory overhead from stackdepot has been
addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
and stack_table allocation by kvmalloc()"), so the dependency on
stackdepot results in extra memory usage only when slab cache tracking
is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
The build failures on some architectures were also addressed, and the
reported issue with the xfs/433 test did not reproduce on 5.17-rc1 with
this patch.

Signed-off-by: Oliver Glitta <glittao@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 init/Kconfig |  1 +
 mm/slub.c    | 88 +++++++++++++++++++++++++++++-----------------------
 2 files changed, 50 insertions(+), 39 deletions(-)
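
For context, the stackdepot round-trip this patch builds on boils down
to the following minimal sketch (a standalone illustration assuming the
5.17-era <linux/stackdepot.h> and <linux/stacktrace.h> interfaces; the
helper names record_stack() and print_recorded_stack() are made up, the
stackdepot/stacktrace calls are real, and the actual call sites are
set_track() and kmem_obj_info() in the diff below):

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>
	#include <linux/stackdepot.h>

	static depot_stack_handle_t record_stack(gfp_t flags)
	{
		unsigned long entries[16];
		unsigned int nr_entries;

		/* Capture the current call chain, skipping inner frames. */
		nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);

		/*
		 * Identical traces hash to the same handle, so each unique
		 * stack is stored once no matter how many objects share it.
		 * Returns 0 if the trace could not be saved.
		 */
		return stack_depot_save(entries, nr_entries, flags);
	}

	static void print_recorded_stack(depot_stack_handle_t handle)
	{
		unsigned long *entries;
		unsigned int nr_entries;

		if (!handle)
			return;
		/* Fetch returns a pointer into the depot; nothing is copied. */
		nr_entries = stack_depot_fetch(handle, &entries);
		stack_trace_print(entries, nr_entries, 0);
	}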

Comments

Hyeonggon Yoo Feb. 26, 2022, 10:24 a.m. UTC | #1
On Fri, Feb 25, 2022 at 07:03:15PM +0100, Vlastimil Babka wrote:
> From: Oliver Glitta <glittao@gmail.com>
> 
> Many stack traces are similar so there are many similar arrays.
> Stackdepot saves each unique stack only once.
> 
> Replace field addrs in struct track with depot_stack_handle_t handle.  Use
> stackdepot to save stack trace.
> 
> The benefits are smaller memory overhead and possibility to aggregate
> per-cache statistics in the following patch using the stackdepot handle
> instead of matching stacks manually.
> 
> [ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]
> 
> This was initially merged as commit 788691464c29 and reverted by commit
> ae14c63a9f20 due to several issues, that should now be fixed.
> The problem of unconditional memory overhead by stackdepot has been
> addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
> and stack_table allocation by kvmalloc()"), so the dependency on
> stackdepot will result in extra memory usage only when a slab cache
> tracking is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
> The build failures on some architectures were also addressed, and the
> reported issue with xfs/433 test did not reproduce on 5.17-rc1 with this
> patch.
> 
> Signed-off-by: Oliver Glitta <glittao@gmail.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> ---
>  init/Kconfig |  1 +
>  mm/slub.c    | 88 +++++++++++++++++++++++++++++-----------------------
>  2 files changed, 50 insertions(+), 39 deletions(-)
> 
> diff --git a/init/Kconfig b/init/Kconfig
> index e9119bf54b1f..b21dd3a4a106 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1871,6 +1871,7 @@ config SLUB_DEBUG
>  	default y
>  	bool "Enable SLUB debugging support" if EXPERT
>  	depends on SLUB && SYSFS
> +	select STACKDEPOT if STACKTRACE_SUPPORT
>  	help
>  	  SLUB has extensive debug support features. Disabling these can
>  	  result in significant savings in code size. This also disables
> diff --git a/mm/slub.c b/mm/slub.c
> index 1fc451f4fe62..3140f763e819 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -26,6 +26,7 @@
>  #include <linux/cpuset.h>
>  #include <linux/mempolicy.h>
>  #include <linux/ctype.h>
> +#include <linux/stackdepot.h>
>  #include <linux/debugobjects.h>
>  #include <linux/kallsyms.h>
>  #include <linux/kfence.h>
> @@ -264,8 +265,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
>  #define TRACK_ADDRS_COUNT 16
>  struct track {
>  	unsigned long addr;	/* Called from address */
> -#ifdef CONFIG_STACKTRACE
> -	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
> +#ifdef CONFIG_STACKDEPOT
> +	depot_stack_handle_t handle;
>  #endif
>  	int cpu;		/* Was running on cpu */
>  	int pid;		/* Pid context */
> @@ -724,22 +725,20 @@ static struct track *get_track(struct kmem_cache *s, void *object,
>  	return kasan_reset_tag(p + alloc);
>  }
>  
> -static void set_track(struct kmem_cache *s, void *object,
> -			enum track_item alloc, unsigned long addr)
> +static noinline void
> +set_track(struct kmem_cache *s, void *object, enum track_item alloc,
> +	  unsigned long addr, gfp_t flags)
>  {
>  	struct track *p = get_track(s, object, alloc);
>  
> -#ifdef CONFIG_STACKTRACE
> +#ifdef CONFIG_STACKDEPOT
> +	unsigned long entries[TRACK_ADDRS_COUNT];
>  	unsigned int nr_entries;
>  
> -	metadata_access_enable();
> -	nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
> -				      TRACK_ADDRS_COUNT, 3);
> -	metadata_access_disable();
> -
> -	if (nr_entries < TRACK_ADDRS_COUNT)
> -		p->addrs[nr_entries] = 0;
> +	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
> +	p->handle = stack_depot_save(entries, nr_entries, flags);
>  #endif
> +
>  	p->addr = addr;
>  	p->cpu = smp_processor_id();
>  	p->pid = current->pid;
> @@ -759,20 +758,19 @@ static void init_tracking(struct kmem_cache *s, void *object)
>  
>  static void print_track(const char *s, struct track *t, unsigned long pr_time)
>  {
> +	depot_stack_handle_t handle __maybe_unused;
> +
>  	if (!t->addr)
>  		return;
>  
>  	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
>  	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
> -#ifdef CONFIG_STACKTRACE
> -	{
> -		int i;
> -		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
> -			if (t->addrs[i])
> -				pr_err("\t%pS\n", (void *)t->addrs[i]);
> -			else
> -				break;
> -	}
> +#ifdef CONFIG_STACKDEPOT
> +	handle = READ_ONCE(t->handle);
> +	if (handle)
> +		stack_depot_print(handle);
> +	else
> +		pr_err("object allocation/free stack trace missing\n");
>  #endif
>  }
>  
> @@ -1304,9 +1302,9 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
>  	return 1;
>  }
>  
> -static noinline int alloc_debug_processing(struct kmem_cache *s,
> -					struct slab *slab,
> -					void *object, unsigned long addr)
> +static noinline int
> +alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object,
> +		       unsigned long addr, gfp_t flags)
>  {
>  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>  		if (!alloc_consistency_checks(s, slab, object))
> @@ -1315,7 +1313,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
>  
>  	/* Success perform special debug activities for allocs */
>  	if (s->flags & SLAB_STORE_USER)
> -		set_track(s, object, TRACK_ALLOC, addr);
> +		set_track(s, object, TRACK_ALLOC, addr, flags);

I see a warning because of this.
We should not reuse flags here, because alloc_debug_processing() can be
called with preemption disabled even when the caller specified GFP_KERNEL.

[    2.015902] BUG: sleeping function called from invalid context at mm/page_alloc.c:5164
[    2.022052] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
[    2.028357] preempt_count: 1, expected: 0
[    2.031508] RCU nest depth: 0, expected: 0
[    2.034722] 1 lock held by swapper/0/1:
[    2.037905]  #0: ffff00000488f4d0 (&sb->s_type->i_mutex_key#5){+.+.}-{4:4}, at: start_creating+0x58/0x130
[    2.045393] Preemption disabled at:
[    2.045400] [<ffff8000083bd008>] __slab_alloc.constprop.0+0x38/0xc0
[    2.053039] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W         5.17.0-rc5+ #105
[    2.059365] Hardware name: linux,dummy-virt (DT)
[    2.063160] Call trace:
[    2.065217]  dump_backtrace+0xf8/0x130
[    2.068350]  show_stack+0x24/0x80
[    2.071104]  dump_stack_lvl+0x9c/0xd8
[    2.074140]  dump_stack+0x18/0x34
[    2.076894]  __might_resched+0x1a0/0x280
[    2.080146]  __might_sleep+0x58/0x90
[    2.083108]  prepare_alloc_pages.constprop.0+0x1b4/0x1f0
[    2.087468]  __alloc_pages+0x88/0x1e0
[    2.090502]  alloc_page_interleave+0x24/0xb4
[    2.094021]  alloc_pages+0x10c/0x170
[    2.096984]  __stack_depot_save+0x3e0/0x4e0
[    2.100446]  stack_depot_save+0x14/0x20
[    2.103617]  set_track.isra.0+0x64/0xa4
[    2.106787]  alloc_debug_processing+0x11c/0x1e0
[    2.110532]  ___slab_alloc+0x3e8/0x750
[    2.113643]  __slab_alloc.constprop.0+0x64/0xc0
[    2.117391]  kmem_cache_alloc+0x304/0x350
[    2.120702]  security_inode_alloc+0x38/0xa4
[    2.124169]  inode_init_always+0xd0/0x264
[    2.127501]  alloc_inode+0x44/0xec
[    2.130325]  new_inode+0x28/0xc0
[    2.133011]  tracefs_create_file+0x74/0x1e0
[    2.136459]  init_tracer_tracefs+0x248/0x644
[    2.140030]  tracer_init_tracefs+0x9c/0x34c
[    2.143483]  do_one_initcall+0x44/0x170
[    2.146654]  do_initcalls+0x104/0x144
[    2.149704]  kernel_init_freeable+0x130/0x178

[...]

>  	trace(s, slab, object, 1);
>  	init_object(s, object, SLUB_RED_ACTIVE);
>  	return 1;
> @@ -1395,7 +1393,7 @@ static noinline int free_debug_processing(
>  	}
>  
>  	if (s->flags & SLAB_STORE_USER)
> -		set_track(s, object, TRACK_FREE, addr);
> +		set_track(s, object, TRACK_FREE, addr, GFP_NOWAIT);
>  	trace(s, slab, object, 0);
>  	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
>  	init_object(s, object, SLUB_RED_INACTIVE);
> @@ -1632,7 +1630,8 @@ static inline
>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>  
>  static inline int alloc_debug_processing(struct kmem_cache *s,
> -	struct slab *slab, void *object, unsigned long addr) { return 0; }
> +	struct slab *slab, void *object, unsigned long addr,
> +	gfp_t flags) { return 0; }
>  
>  static inline int free_debug_processing(
>  	struct kmem_cache *s, struct slab *slab,
> @@ -3033,7 +3032,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  check_new_slab:
>  
>  	if (kmem_cache_debug(s)) {
> -		if (!alloc_debug_processing(s, slab, freelist, addr)) {
> +		if (!alloc_debug_processing(s, slab, freelist, addr, gfpflags)) {
>  			/* Slab failed checks. Next slab needed */
>  			goto new_slab;
>  		} else {
> @@ -4221,6 +4220,9 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>  	s->remote_node_defrag_ratio = 1000;
>  #endif
>  
> +	if (s->flags & SLAB_STORE_USER && IS_ENABLED(CONFIG_STACKDEPOT))
> +		stack_depot_init();
> +

As mentioned in my report, it can crash the system when creating boot
caches with debugging enabled.

The rest looks fine!

>  	/* Initialize the pre-computed randomized freelist if slab is up */
>  	if (slab_state >= UP) {
>  		if (init_cache_random_seq(s))
> @@ -4352,18 +4354,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
>  	objp = fixup_red_left(s, objp);
>  	trackp = get_track(s, objp, TRACK_ALLOC);
>  	kpp->kp_ret = (void *)trackp->addr;
> -#ifdef CONFIG_STACKTRACE
> -	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
> -		kpp->kp_stack[i] = (void *)trackp->addrs[i];
> -		if (!kpp->kp_stack[i])
> -			break;
> -	}
> +#ifdef CONFIG_STACKDEPOT
> +	{
> +		depot_stack_handle_t handle;
> +		unsigned long *entries;
> +		unsigned int nr_entries;
> +
> +		handle = READ_ONCE(trackp->handle);
> +		if (handle) {
> +			nr_entries = stack_depot_fetch(handle, &entries);
> +			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
> +				kpp->kp_stack[i] = (void *)entries[i];
> +		}
>  
> -	trackp = get_track(s, objp, TRACK_FREE);
> -	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
> -		kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
> -		if (!kpp->kp_free_stack[i])
> -			break;
> +		trackp = get_track(s, objp, TRACK_FREE);
> +		handle = READ_ONCE(trackp->handle);
> +		if (handle) {
> +			nr_entries = stack_depot_fetch(handle, &entries);
> +			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
> +				kpp->kp_free_stack[i] = (void *)entries[i];
> +		}
>  	}
>  #endif
>  #endif
> -- 
> 2.35.1
> 
>
Hyeonggon Yoo Feb. 27, 2022, 9:44 a.m. UTC | #2
On Fri, Feb 25, 2022 at 07:03:15PM +0100, Vlastimil Babka wrote:
> From: Oliver Glitta <glittao@gmail.com>
> 
> Many stack traces are similar so there are many similar arrays.
> Stackdepot saves each unique stack only once.
>
> Replace field addrs in struct track with depot_stack_handle_t handle.  Use
> stackdepot to save stack trace.
>

I think it's not a replacement?

> The benefits are smaller memory overhead and possibility to aggregate
> per-cache statistics in the following patch using the stackdepot handle
> instead of matching stacks manually.
> 
> [ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]
> 
> This was initially merged as commit 788691464c29 and reverted by commit
> ae14c63a9f20 due to several issues, that should now be fixed.
> The problem of unconditional memory overhead by stackdepot has been
> addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
> and stack_table allocation by kvmalloc()"), so the dependency on
> stackdepot will result in extra memory usage only when a slab cache
> tracking is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
> The build failures on some architectures were also addressed, and the
> reported issue with xfs/433 test did not reproduce on 5.17-rc1 with this
> patch.

This is just an idea, and it goes beyond this patch.

After this patch, we now have external storage that records stack traces.

It's possible that some rare stack traces are in the stack depot but no
longer reachable, because the track that referenced them has been
overwritten.

I think it's worth implementing a way to iterate through the stacks in
the stack depot?
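
Purely to illustrate that idea (no such interface exists in stackdepot
today; the names and signature below are made up):

	/* Hypothetical: walk every trace currently stored in the depot. */
	typedef void (*depot_walk_fn)(depot_stack_handle_t handle,
				      unsigned long *entries,
				      unsigned int nr_entries,
				      void *private);
	void stack_depot_for_each(depot_walk_fn fn, void *private);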

> 
> Signed-off-by: Oliver Glitta <glittao@gmail.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Vlastimil Babka Feb. 28, 2022, 6:44 p.m. UTC | #3
On 2/26/22 11:24, Hyeonggon Yoo wrote:
> On Fri, Feb 25, 2022 at 07:03:15PM +0100, Vlastimil Babka wrote:
>> From: Oliver Glitta <glittao@gmail.com>
>> 
>> Many stack traces are similar so there are many similar arrays.
>> Stackdepot saves each unique stack only once.
>> 
>> Replace field addrs in struct track with depot_stack_handle_t handle.  Use
>> stackdepot to save stack trace.
>> 
>> The benefits are smaller memory overhead and possibility to aggregate
>> per-cache statistics in the following patch using the stackdepot handle
>> instead of matching stacks manually.
>> 
>> [ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]
>> 
>> This was initially merged as commit 788691464c29 and reverted by commit
>> ae14c63a9f20 due to several issues, that should now be fixed.
>> The problem of unconditional memory overhead by stackdepot has been
>> addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
>> and stack_table allocation by kvmalloc()"), so the dependency on
>> stackdepot will result in extra memory usage only when a slab cache
>> tracking is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
>> The build failures on some architectures were also addressed, and the
>> reported issue with xfs/433 test did not reproduce on 5.17-rc1 with this
>> patch.
>> 
>> Signed-off-by: Oliver Glitta <glittao@gmail.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: David Rientjes <rientjes@google.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> ---
>>  init/Kconfig |  1 +
>>  mm/slub.c    | 88 +++++++++++++++++++++++++++++-----------------------
>>  2 files changed, 50 insertions(+), 39 deletions(-)
>> 
>> diff --git a/init/Kconfig b/init/Kconfig
>> index e9119bf54b1f..b21dd3a4a106 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1871,6 +1871,7 @@ config SLUB_DEBUG
>>  	default y
>>  	bool "Enable SLUB debugging support" if EXPERT
>>  	depends on SLUB && SYSFS
>> +	select STACKDEPOT if STACKTRACE_SUPPORT
>>  	help
>>  	  SLUB has extensive debug support features. Disabling these can
>>  	  result in significant savings in code size. This also disables
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 1fc451f4fe62..3140f763e819 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -26,6 +26,7 @@
>>  #include <linux/cpuset.h>
>>  #include <linux/mempolicy.h>
>>  #include <linux/ctype.h>
>> +#include <linux/stackdepot.h>
>>  #include <linux/debugobjects.h>
>>  #include <linux/kallsyms.h>
>>  #include <linux/kfence.h>
>> @@ -264,8 +265,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
>>  #define TRACK_ADDRS_COUNT 16
>>  struct track {
>>  	unsigned long addr;	/* Called from address */
>> -#ifdef CONFIG_STACKTRACE
>> -	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
>> +#ifdef CONFIG_STACKDEPOT
>> +	depot_stack_handle_t handle;
>>  #endif
>>  	int cpu;		/* Was running on cpu */
>>  	int pid;		/* Pid context */
>> @@ -724,22 +725,20 @@ static struct track *get_track(struct kmem_cache *s, void *object,
>>  	return kasan_reset_tag(p + alloc);
>>  }
>>  
>> -static void set_track(struct kmem_cache *s, void *object,
>> -			enum track_item alloc, unsigned long addr)
>> +static noinline void
>> +set_track(struct kmem_cache *s, void *object, enum track_item alloc,
>> +	  unsigned long addr, gfp_t flags)
>>  {
>>  	struct track *p = get_track(s, object, alloc);
>>  
>> -#ifdef CONFIG_STACKTRACE
>> +#ifdef CONFIG_STACKDEPOT
>> +	unsigned long entries[TRACK_ADDRS_COUNT];
>>  	unsigned int nr_entries;
>>  
>> -	metadata_access_enable();
>> -	nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
>> -				      TRACK_ADDRS_COUNT, 3);
>> -	metadata_access_disable();
>> -
>> -	if (nr_entries < TRACK_ADDRS_COUNT)
>> -		p->addrs[nr_entries] = 0;
>> +	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
>> +	p->handle = stack_depot_save(entries, nr_entries, flags);
>>  #endif
>> +
>>  	p->addr = addr;
>>  	p->cpu = smp_processor_id();
>>  	p->pid = current->pid;
>> @@ -759,20 +758,19 @@ static void init_tracking(struct kmem_cache *s, void *object)
>>  
>>  static void print_track(const char *s, struct track *t, unsigned long pr_time)
>>  {
>> +	depot_stack_handle_t handle __maybe_unused;
>> +
>>  	if (!t->addr)
>>  		return;
>>  
>>  	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
>>  	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
>> -#ifdef CONFIG_STACKTRACE
>> -	{
>> -		int i;
>> -		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
>> -			if (t->addrs[i])
>> -				pr_err("\t%pS\n", (void *)t->addrs[i]);
>> -			else
>> -				break;
>> -	}
>> +#ifdef CONFIG_STACKDEPOT
>> +	handle = READ_ONCE(t->handle);
>> +	if (handle)
>> +		stack_depot_print(handle);
>> +	else
>> +		pr_err("object allocation/free stack trace missing\n");
>>  #endif
>>  }
>>  
>> @@ -1304,9 +1302,9 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
>>  	return 1;
>>  }
>>  
>> -static noinline int alloc_debug_processing(struct kmem_cache *s,
>> -					struct slab *slab,
>> -					void *object, unsigned long addr)
>> +static noinline int
>> +alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object,
>> +		       unsigned long addr, gfp_t flags)
>>  {
>>  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>>  		if (!alloc_consistency_checks(s, slab, object))
>> @@ -1315,7 +1313,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
>>  
>>  	/* Success perform special debug activities for allocs */
>>  	if (s->flags & SLAB_STORE_USER)
>> -		set_track(s, object, TRACK_ALLOC, addr);
>> +		set_track(s, object, TRACK_ALLOC, addr, flags);
> 
> I see warning because of this.
> We should not reuse flags here because alloc_debug_processing() can be
> called with preemption disabled, and caller specified GFP_KERNEL.

Ugh, thanks for catching this, looks like I forgot to test with the
necessary config options. Indeed, the previous version of this patch,
commit 788691464c29, used GFP_NOWAIT. I took the idea of passing the
allocation gfpflags from Imran's version (another Cc I forgot, sorry):
https://lore.kernel.org/all/20210831062539.898293-3-imran.f.khan@oracle.com/
...

> [    2.015902] BUG: sleeping function called from invalid context at mm/page_alloc.c:5164
> [    2.022052] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
> [    2.028357] preempt_count: 1, expected: 0
> [    2.031508] RCU nest depth: 0, expected: 0
> [    2.034722] 1 lock held by swapper/0/1:
> [    2.037905]  #0: ffff00000488f4d0 (&sb->s_type->i_mutex_key#5){+.+.}-{4:4}, at: start_creating+0x58/0x130
> [    2.045393] Preemption disabled at:
> [    2.045400] [<ffff8000083bd008>] __slab_alloc.constprop.0+0x38/0xc0

... but indeed __slab_alloc() disables preemption so that won't work, and we
can only safely use GFP_NOWAIT. Will fix in v2, thanks.
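
A minimal sketch of what that fix amounts to (an assumption about v2
based on this discussion, not the committed code):

	/* In set_track(), under CONFIG_STACKDEPOT: */
	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
	/*
	 * Callers such as ___slab_alloc() may run with preemption
	 * disabled, so a sleepable gfp mask must never reach the page
	 * allocator from here; GFP_NOWAIT keeps stack_depot_save()
	 * safe in atomic context.
	 */
	p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);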

> [    2.053039] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W         5.17.0-rc5+ #105
> [    2.059365] Hardware name: linux,dummy-virt (DT)
> [    2.063160] Call trace:
> [    2.065217]  dump_backtrace+0xf8/0x130
> [    2.068350]  show_stack+0x24/0x80
> [    2.071104]  dump_stack_lvl+0x9c/0xd8
> [    2.074140]  dump_stack+0x18/0x34
> [    2.076894]  __might_resched+0x1a0/0x280
> [    2.080146]  __might_sleep+0x58/0x90
> [    2.083108]  prepare_alloc_pages.constprop.0+0x1b4/0x1f0
> [    2.087468]  __alloc_pages+0x88/0x1e0
> [    2.090502]  alloc_page_interleave+0x24/0xb4
> [    2.094021]  alloc_pages+0x10c/0x170
> [    2.096984]  __stack_depot_save+0x3e0/0x4e0
> [    2.100446]  stack_depot_save+0x14/0x20
> [    2.103617]  set_track.isra.0+0x64/0xa4
> [    2.106787]  alloc_debug_processing+0x11c/0x1e0
> [    2.110532]  ___slab_alloc+0x3e8/0x750
> [    2.113643]  __slab_alloc.constprop.0+0x64/0xc0
> [    2.117391]  kmem_cache_alloc+0x304/0x350
> [    2.120702]  security_inode_alloc+0x38/0xa4
> [    2.124169]  inode_init_always+0xd0/0x264
> [    2.127501]  alloc_inode+0x44/0xec
> [    2.130325]  new_inode+0x28/0xc0
> [    2.133011]  tracefs_create_file+0x74/0x1e0
> [    2.136459]  init_tracer_tracefs+0x248/0x644
> [    2.140030]  tracer_init_tracefs+0x9c/0x34c
> [    2.143483]  do_one_initcall+0x44/0x170
> [    2.146654]  do_initcalls+0x104/0x144
> [    2.149704]  kernel_init_freeable+0x130/0x178
> 
> [...]
> 
>>  	trace(s, slab, object, 1);
>>  	init_object(s, object, SLUB_RED_ACTIVE);
>>  	return 1;
>> @@ -1395,7 +1393,7 @@ static noinline int free_debug_processing(
>>  	}
>>  
>>  	if (s->flags & SLAB_STORE_USER)
>> -		set_track(s, object, TRACK_FREE, addr);
>> +		set_track(s, object, TRACK_FREE, addr, GFP_NOWAIT);
>>  	trace(s, slab, object, 0);
>>  	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
>>  	init_object(s, object, SLUB_RED_INACTIVE);
>> @@ -1632,7 +1630,8 @@ static inline
>>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>>  
>>  static inline int alloc_debug_processing(struct kmem_cache *s,
>> -	struct slab *slab, void *object, unsigned long addr) { return 0; }
>> +	struct slab *slab, void *object, unsigned long addr,
>> +	gfp_t flags) { return 0; }
>>  
>>  static inline int free_debug_processing(
>>  	struct kmem_cache *s, struct slab *slab,
>> @@ -3033,7 +3032,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>  check_new_slab:
>>  
>>  	if (kmem_cache_debug(s)) {
>> -		if (!alloc_debug_processing(s, slab, freelist, addr)) {
>> +		if (!alloc_debug_processing(s, slab, freelist, addr, gfpflags)) {
>>  			/* Slab failed checks. Next slab needed */
>>  			goto new_slab;
>>  		} else {
>> @@ -4221,6 +4220,9 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>>  	s->remote_node_defrag_ratio = 1000;
>>  #endif
>>  
>> +	if (s->flags & SLAB_STORE_USER && IS_ENABLED(CONFIG_STACKDEPOT))
>> +		stack_depot_init();
>> +
> 
> As mentioned in my report, it can crash system when creating boot caches
> with debugging enabled.
> 
> The rest looks fine!
> 
>>  	/* Initialize the pre-computed randomized freelist if slab is up */
>>  	if (slab_state >= UP) {
>>  		if (init_cache_random_seq(s))
>> @@ -4352,18 +4354,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
>>  	objp = fixup_red_left(s, objp);
>>  	trackp = get_track(s, objp, TRACK_ALLOC);
>>  	kpp->kp_ret = (void *)trackp->addr;
>> -#ifdef CONFIG_STACKTRACE
>> -	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
>> -		kpp->kp_stack[i] = (void *)trackp->addrs[i];
>> -		if (!kpp->kp_stack[i])
>> -			break;
>> -	}
>> +#ifdef CONFIG_STACKDEPOT
>> +	{
>> +		depot_stack_handle_t handle;
>> +		unsigned long *entries;
>> +		unsigned int nr_entries;
>> +
>> +		handle = READ_ONCE(trackp->handle);
>> +		if (handle) {
>> +			nr_entries = stack_depot_fetch(handle, &entries);
>> +			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
>> +				kpp->kp_stack[i] = (void *)entries[i];
>> +		}
>>  
>> -	trackp = get_track(s, objp, TRACK_FREE);
>> -	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
>> -		kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
>> -		if (!kpp->kp_free_stack[i])
>> -			break;
>> +		trackp = get_track(s, objp, TRACK_FREE);
>> +		handle = READ_ONCE(trackp->handle);
>> +		if (handle) {
>> +			nr_entries = stack_depot_fetch(handle, &entries);
>> +			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
>> +				kpp->kp_free_stack[i] = (void *)entries[i];
>> +		}
>>  	}
>>  #endif
>>  #endif
>> -- 
>> 2.35.1
>> 
>> 
>
Vlastimil Babka March 2, 2022, 4:51 p.m. UTC | #4
On 2/27/22 10:44, Hyeonggon Yoo wrote:
> On Fri, Feb 25, 2022 at 07:03:15PM +0100, Vlastimil Babka wrote:
>> From: Oliver Glitta <glittao@gmail.com>
>> 
>> Many stack traces are similar so there are many similar arrays.
>> Stackdepot saves each unique stack only once.
>>
>> Replace field addrs in struct track with depot_stack_handle_t handle.  Use
>> stackdepot to save stack trace.
>>
> 
> I think it's not a replacement?

It is, for the array 'addrs':

-#ifdef CONFIG_STACKTRACE
-	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
+#ifdef CONFIG_STACKDEPOT
+	depot_stack_handle_t handle;

Not to be confused with 'addr', which is the immediate caller and indeed
stays, for redundancy and for kernels without stack tracing enabled.
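
For reference, the resulting layout (as in the hunk quoted above) keeps
both fields:

	struct track {
		unsigned long addr;	/* Called from address */
	#ifdef CONFIG_STACKDEPOT
		depot_stack_handle_t handle; /* full trace, deduplicated */
	#endif
		int cpu;		/* Was running on cpu */
		int pid;		/* Pid context */
		unsigned long when;	/* When did the operation occur */
	};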

>> The benefits are smaller memory overhead and possibility to aggregate
>> per-cache statistics in the following patch using the stackdepot handle
>> instead of matching stacks manually.
>> 
>> [ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]
>> 
>> This was initially merged as commit 788691464c29 and reverted by commit
>> ae14c63a9f20 due to several issues, that should now be fixed.
>> The problem of unconditional memory overhead by stackdepot has been
>> addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
>> and stack_table allocation by kvmalloc()"), so the dependency on
>> stackdepot will result in extra memory usage only when a slab cache
>> tracking is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
>> The build failures on some architectures were also addressed, and the
>> reported issue with xfs/433 test did not reproduce on 5.17-rc1 with this
>> patch.
> 
> This is just an idea and beyond this patch.
> 
> After this patch, now we have external storage that records stack traces.

Well, we had it before this patch too.

> It's possible that some rare stack traces are in stack depot, but
> not reachable because track is overwritten.

Yes.

> I think it's worth implementing a way to iterate through stacks in stack depot?

The question is: for what use case? We might not even know who stored
them; they could have been stored by page_owner or by other stack depot
users. But the point is usually not to learn about all existing traces,
but to determine which ones cause an object lifetime bug or a memory
leak.
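
As a hypothetical illustration of the aggregation the commit message
alludes to (not code from this series): once traces are deduplicated,
matching two tracks is a single handle compare instead of comparing
TRACK_ADDRS_COUNT-sized arrays.

	/* Hypothetical per-cache aggregation keyed by depot handle. */
	struct loc_count {
		depot_stack_handle_t handle;
		unsigned long count;
	};

	static bool track_matches(struct track *t, struct loc_count *lc)
	{
		/* One u32 compare replaces a stack-array compare. */
		return READ_ONCE(t->handle) == lc->handle;
	}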

>> 
>> Signed-off-by: Oliver Glitta <glittao@gmail.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> Cc: David Rientjes <rientjes@google.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>
Hyeonggon Yoo March 2, 2022, 5:22 p.m. UTC | #5
On Wed, Mar 02, 2022 at 05:51:32PM +0100, Vlastimil Babka wrote:
> On 2/27/22 10:44, Hyeonggon Yoo wrote:
> > On Fri, Feb 25, 2022 at 07:03:15PM +0100, Vlastimil Babka wrote:
> >> From: Oliver Glitta <glittao@gmail.com>
> >> 
> >> Many stack traces are similar so there are many similar arrays.
> >> Stackdepot saves each unique stack only once.
> >>
> >> Replace field addrs in struct track with depot_stack_handle_t handle.  Use
> >> stackdepot to save stack trace.
> >>
> > 
> > I think it's not a replacement?
> 
> It is, for the array 'addrs':
> 
> -#ifdef CONFIG_STACKTRACE
> -	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
> +#ifdef CONFIG_STACKDEPOT
> +	depot_stack_handle_t handle;
> 
> Not confuse with 'addr' which is the immediate caller and indeed stays
> for redundancy/kernels without stack trace enabled.
>

Oh, my fault. Right. I was confused.
I should read it again.

> >> The benefits are smaller memory overhead and possibility to aggregate
> >> per-cache statistics in the following patch using the stackdepot handle
> >> instead of matching stacks manually.
> >> 
> >> [ vbabka@suse.cz: rebase to 5.17-rc1 and adjust accordingly ]
> >> 
> >> This was initially merged as commit 788691464c29 and reverted by commit
> >> ae14c63a9f20 due to several issues, that should now be fixed.
> >> The problem of unconditional memory overhead by stackdepot has been
> >> addressed by commit 2dba5eb1c73b ("lib/stackdepot: allow optional init
> >> and stack_table allocation by kvmalloc()"), so the dependency on
> >> stackdepot will result in extra memory usage only when a slab cache
> >> tracking is actually enabled, and not for all CONFIG_SLUB_DEBUG builds.
> >> The build failures on some architectures were also addressed, and the
> >> reported issue with xfs/433 test did not reproduce on 5.17-rc1 with this
> >> patch.
> > 
> > This is just an idea and beyond this patch.
> > 
> > After this patch, now we have external storage that records stack traces.
> 
> Well, we had it before this patch too.
>
> > It's possible that some rare stack traces are in stack depot, but
> > not reachable because track is overwritten.
> 
> Yes.
> 
> > I think it's worth implementing a way to iterate through stacks in stack depot?
> 
> The question is for what use case? We might even not know who stored
> them - could have been page_owner, or other stack depot users.

> But the point is usually not to learn about all existing traces, but to
> determine which ones cause an object lifetime bug, or memory leak.

Yeah, this is exactly what I misunderstood.
I thought the purpose of free_traces was to show all existing traces.
But I realized today that a free trace without an alloc trace is not
useful.

I'll review v2 with these points in mind.
Thank you.

> >> 
> >> Signed-off-by: Oliver Glitta <glittao@gmail.com>
> >> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >> Cc: David Rientjes <rientjes@google.com>
> >> Cc: Christoph Lameter <cl@linux.com>
> >> Cc: Pekka Enberg <penberg@kernel.org>
> >> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > 
>

Patch

diff --git a/init/Kconfig b/init/Kconfig
index e9119bf54b1f..b21dd3a4a106 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1871,6 +1871,7 @@  config SLUB_DEBUG
 	default y
 	bool "Enable SLUB debugging support" if EXPERT
 	depends on SLUB && SYSFS
+	select STACKDEPOT if STACKTRACE_SUPPORT
 	help
 	  SLUB has extensive debug support features. Disabling these can
 	  result in significant savings in code size. This also disables
diff --git a/mm/slub.c b/mm/slub.c
index 1fc451f4fe62..3140f763e819 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -26,6 +26,7 @@ 
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
+#include <linux/stackdepot.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/kfence.h>
@@ -264,8 +265,8 @@  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 #define TRACK_ADDRS_COUNT 16
 struct track {
 	unsigned long addr;	/* Called from address */
-#ifdef CONFIG_STACKTRACE
-	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
+#ifdef CONFIG_STACKDEPOT
+	depot_stack_handle_t handle;
 #endif
 	int cpu;		/* Was running on cpu */
 	int pid;		/* Pid context */
@@ -724,22 +725,20 @@  static struct track *get_track(struct kmem_cache *s, void *object,
 	return kasan_reset_tag(p + alloc);
 }
 
-static void set_track(struct kmem_cache *s, void *object,
-			enum track_item alloc, unsigned long addr)
+static noinline void
+set_track(struct kmem_cache *s, void *object, enum track_item alloc,
+	  unsigned long addr, gfp_t flags)
 {
 	struct track *p = get_track(s, object, alloc);
 
-#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_STACKDEPOT
+	unsigned long entries[TRACK_ADDRS_COUNT];
 	unsigned int nr_entries;
 
-	metadata_access_enable();
-	nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
-				      TRACK_ADDRS_COUNT, 3);
-	metadata_access_disable();
-
-	if (nr_entries < TRACK_ADDRS_COUNT)
-		p->addrs[nr_entries] = 0;
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+	p->handle = stack_depot_save(entries, nr_entries, flags);
 #endif
+
 	p->addr = addr;
 	p->cpu = smp_processor_id();
 	p->pid = current->pid;
@@ -759,20 +758,19 @@  static void init_tracking(struct kmem_cache *s, void *object)
 
 static void print_track(const char *s, struct track *t, unsigned long pr_time)
 {
+	depot_stack_handle_t handle __maybe_unused;
+
 	if (!t->addr)
 		return;
 
 	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
 	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
-#ifdef CONFIG_STACKTRACE
-	{
-		int i;
-		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
-			if (t->addrs[i])
-				pr_err("\t%pS\n", (void *)t->addrs[i]);
-			else
-				break;
-	}
+#ifdef CONFIG_STACKDEPOT
+	handle = READ_ONCE(t->handle);
+	if (handle)
+		stack_depot_print(handle);
+	else
+		pr_err("object allocation/free stack trace missing\n");
 #endif
 }
 
@@ -1304,9 +1302,9 @@  static inline int alloc_consistency_checks(struct kmem_cache *s,
 	return 1;
 }
 
-static noinline int alloc_debug_processing(struct kmem_cache *s,
-					struct slab *slab,
-					void *object, unsigned long addr)
+static noinline int
+alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object,
+		       unsigned long addr, gfp_t flags)
 {
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 		if (!alloc_consistency_checks(s, slab, object))
@@ -1315,7 +1313,7 @@  static noinline int alloc_debug_processing(struct kmem_cache *s,
 
 	/* Success perform special debug activities for allocs */
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, object, TRACK_ALLOC, addr);
+		set_track(s, object, TRACK_ALLOC, addr, flags);
 	trace(s, slab, object, 1);
 	init_object(s, object, SLUB_RED_ACTIVE);
 	return 1;
@@ -1395,7 +1393,7 @@  static noinline int free_debug_processing(
 	}
 
 	if (s->flags & SLAB_STORE_USER)
-		set_track(s, object, TRACK_FREE, addr);
+		set_track(s, object, TRACK_FREE, addr, GFP_NOWAIT);
 	trace(s, slab, object, 0);
 	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
 	init_object(s, object, SLUB_RED_INACTIVE);
@@ -1632,7 +1630,8 @@  static inline
 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
-	struct slab *slab, void *object, unsigned long addr) { return 0; }
+	struct slab *slab, void *object, unsigned long addr,
+	gfp_t flags) { return 0; }
 
 static inline int free_debug_processing(
 	struct kmem_cache *s, struct slab *slab,
@@ -3033,7 +3032,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 check_new_slab:
 
 	if (kmem_cache_debug(s)) {
-		if (!alloc_debug_processing(s, slab, freelist, addr)) {
+		if (!alloc_debug_processing(s, slab, freelist, addr, gfpflags)) {
 			/* Slab failed checks. Next slab needed */
 			goto new_slab;
 		} else {
@@ -4221,6 +4220,9 @@  static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
 	s->remote_node_defrag_ratio = 1000;
 #endif
 
+	if (s->flags & SLAB_STORE_USER && IS_ENABLED(CONFIG_STACKDEPOT))
+		stack_depot_init();
+
 	/* Initialize the pre-computed randomized freelist if slab is up */
 	if (slab_state >= UP) {
 		if (init_cache_random_seq(s))
@@ -4352,18 +4354,26 @@  void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 	objp = fixup_red_left(s, objp);
 	trackp = get_track(s, objp, TRACK_ALLOC);
 	kpp->kp_ret = (void *)trackp->addr;
-#ifdef CONFIG_STACKTRACE
-	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
-		kpp->kp_stack[i] = (void *)trackp->addrs[i];
-		if (!kpp->kp_stack[i])
-			break;
-	}
+#ifdef CONFIG_STACKDEPOT
+	{
+		depot_stack_handle_t handle;
+		unsigned long *entries;
+		unsigned int nr_entries;
+
+		handle = READ_ONCE(trackp->handle);
+		if (handle) {
+			nr_entries = stack_depot_fetch(handle, &entries);
+			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+				kpp->kp_stack[i] = (void *)entries[i];
+		}
 
-	trackp = get_track(s, objp, TRACK_FREE);
-	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
-		kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
-		if (!kpp->kp_free_stack[i])
-			break;
+		trackp = get_track(s, objp, TRACK_FREE);
+		handle = READ_ONCE(trackp->handle);
+		if (handle) {
+			nr_entries = stack_depot_fetch(handle, &entries);
+			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+				kpp->kp_free_stack[i] = (void *)entries[i];
+		}
 	}
 #endif
 #endif