[RFC] mm/slub: remove left-over debugging code

Message ID 1568650294-8579-1-git-send-email-cai@lca.pw (mailing list archive)
State New, archived
Series [RFC] mm/slub: remove left-over debugging code

Commit Message

Qian Cai Sept. 16, 2019, 4:11 p.m. UTC
SLUB_RESILIENCY_TEST and SLUB_DEBUG_CMPXCHG look like left-over
debugging code from internal development that probably nobody uses
anymore. Remove them to make the world greener.
---
 mm/slub.c | 110 --------------------------------------------------------------
 1 file changed, 110 deletions(-)

Comments

David Rientjes Sept. 16, 2019, 6:32 p.m. UTC | #1
On Mon, 16 Sep 2019, Qian Cai wrote:

> SLUB_RESILIENCY_TEST and SLUB_DEBUG_CMPXCHG look like left-over
> debugging code from internal development that probably nobody uses
> anymore. Remove them to make the world greener.

Adding Pengfei Li, who has been working on a patchset that modifies the
handling of kmalloc cache initialization and touches the resiliency test.

I still find the resiliency test to be helpful/instructional for handling 
unexpected conditions in these caches, so I'd suggest against removing it: 
the only downside is that it's additional source code.  But it's helpful 
source code for reference.

The cmpxchg failures could likely be more generalized beyond SLUB since 
there will be other dependencies in the kernel than just this allocator.
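
(Roughly what I have in mind -- purely a sketch, nothing like this exists
today:

static inline void note_cmpxchg_double_failure(const char *caller,
					       const char *name)
{
#ifdef CONFIG_DEBUG_CMPXCHG_DOUBLE	/* hypothetical Kconfig symbol */
	/* mirrors the "cmpxchg double redo" message SLUB prints today */
	pr_info("%s %s: cmpxchg double redo\n", caller, name);
#endif
}

so other double-cmpxchg users could report retries the same way SLUB does.)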

(I assume you didn't send a Signed-off-by line because this is an RFC.)

Pengfei Li Sept. 17, 2019, 1:24 p.m. UTC | #2
On Tue, Sep 17, 2019 at 2:32 AM David Rientjes <rientjes@google.com> wrote:
>
> On Mon, 16 Sep 2019, Qian Cai wrote:
>
> > SLUB_RESILIENCY_TEST and SLUB_DEBUG_CMPXCHG look like left-over
> > debugging code from internal development that probably nobody uses
> > anymore. Remove them to make the world greener.
>
> Adding Pengfei Li, who has been working on a patchset that modifies the
> handling of kmalloc cache initialization and touches the resiliency test.
>

Thanks for looping me in.

My opinion is the same as David Rientjes's: the resiliency test should
not be removed, but improved.

> I still find the resiliency test to be helpful/instructional for handling
> unexpected conditions in these caches, so I'd suggest against removing it:
> the only downside is that it's additional source code.  But it's helpful
> source code for reference.
>
> The cmpxchg failures could likely be more generalized beyond SLUB since
> there will be other dependencies in the kernel than just this allocator.
>
> (I assume you didn't send a Signed-off-by line because this is an RFC.)
>
Qian Cai Sept. 17, 2019, 1:40 p.m. UTC | #3
On Mon, 2019-09-16 at 11:32 -0700, David Rientjes wrote:
> On Mon, 16 Sep 2019, Qian Cai wrote:
> 
> > SLUB_RESILIENCY_TEST and SLUB_DEBUG_CMPXCHG look like left-over
> > debugging code from internal development that probably nobody uses
> > anymore. Remove them to make the world greener.
> 
> Adding Pengfei Li, who has been working on a patchset that modifies the
> handling of kmalloc cache initialization and touches the resiliency test.
> 
> I still find the resiliency test to be helpful/instructional for handling 
> unexpected conditions in these caches, so I'd suggest against removing it: 
> the only downside is that it's additional source code.  But it's helpful 
> source code for reference.
> 
> The cmpxchg failures could likely be more generalized beyond SLUB since 
> there will be other dependencies in the kernel than just this allocator.

OK, SLUB_RESILIENCY_TEST is fine to keep around, and it could maybe be turned
into a Kconfig option to make it more visible.
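
Something along these lines, say (just a sketch -- the option name and
help text here are made up):

config SLUB_RESILIENCY_TEST
	bool "Corrupt kmalloc caches at boot to test SLUB recovery"
	depends on SLUB && SLUB_DEBUG && SYSFS
	help
	  Deliberately clobber a few kmalloc objects during boot and run
	  validate_slab_cache() on the affected caches, so the redzone and
	  free-pointer checks can be exercised without having to edit
	  mm/slub.c by hand. Only useful for debugging SLUB itself.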

Is it fine to remove SLUB_DEBUG_CMPXCHG? If somebody later wants to generalize
it beyond SLUB, they can always find the old code somewhere anyway.

David Rientjes Sept. 17, 2019, 8:08 p.m. UTC | #4
On Tue, 17 Sep 2019, Qian Cai wrote:

> > The cmpxchg failures could likely be more generalized beyond SLUB since 
> > there will be other dependencies in the kernel than just this allocator.
> 
> OK, SLUB_RESILIENCY_TEST is fine to keep around, and it could maybe be turned
> into a Kconfig option to make it more visible.
> 
> Is it fine to remove SLUB_DEBUG_CMPXCHG? If somebody later wants to generalize
> it beyond SLUB, they can always find the old code somewhere anyway.
> 

Beyond the fact that your patch doesn't compile, slub is the most notable 
(only?) user of double cmpxchg in the kernel so generalizing it would only 
serve to add more indirection at the moment.  If/when it becomes more 
widely used, we can have a discussion about generalizing it so that we can 
detect failures even when SLUB is not used.

Note that the primary purpose of the option is to diagnose issues when the
CMPXCHG_DOUBLE_FAIL stat is observed.  If we encounter that, we won't have
any diagnostic tools to look deeper without adding this code back.  So I
don't think anything around cmpxchg failure notifications needs to be
changed right now.
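
(For completeness: the way the option gets enabled today is by hand-editing
the #undef at the top of mm/slub.c into a define, i.e. something like

	/* Enable to log cmpxchg failures */
	#define SLUB_DEBUG_CMPXCHG

and rebuilding -- which is admittedly easy to overlook, but it is there.)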

Patch

diff --git a/mm/slub.c b/mm/slub.c
index 8834563cdb4b..f97155ba097d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -150,12 +150,6 @@  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
  * - Variable sizing of the per node arrays
  */
 
-/* Enable to test recovery from slab corruption on boot */
-#undef SLUB_RESILIENCY_TEST
-
-/* Enable to log cmpxchg failures */
-#undef SLUB_DEBUG_CMPXCHG
-
 /*
  * Mininum number of partial slabs. These will be left on the partial
  * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -392,10 +386,6 @@  static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
 
-#ifdef SLUB_DEBUG_CMPXCHG
-	pr_info("%s %s: cmpxchg double redo ", n, s->name);
-#endif
-
 	return false;
 }
 
@@ -433,10 +423,6 @@  static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
 
-#ifdef SLUB_DEBUG_CMPXCHG
-	pr_info("%s %s: cmpxchg double redo ", n, s->name);
-#endif
-
 	return false;
 }
 
@@ -2004,45 +1990,11 @@  static inline unsigned long next_tid(unsigned long tid)
 	return tid + TID_STEP;
 }
 
-static inline unsigned int tid_to_cpu(unsigned long tid)
-{
-	return tid % TID_STEP;
-}
-
-static inline unsigned long tid_to_event(unsigned long tid)
-{
-	return tid / TID_STEP;
-}
-
 static inline unsigned int init_tid(int cpu)
 {
 	return cpu;
 }
 
-static inline void note_cmpxchg_failure(const char *n,
-		const struct kmem_cache *s, unsigned long tid)
-{
-#ifdef SLUB_DEBUG_CMPXCHG
-	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
-
-	pr_info("%s %s: cmpxchg redo ", n, s->name);
-
-#ifdef CONFIG_PREEMPT
-	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
-		pr_warn("due to cpu change %d -> %d\n",
-			tid_to_cpu(tid), tid_to_cpu(actual_tid));
-	else
-#endif
-	if (tid_to_event(tid) != tid_to_event(actual_tid))
-		pr_warn("due to cpu running other code. Event %ld->%ld\n",
-			tid_to_event(tid), tid_to_event(actual_tid));
-	else
-		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
-			actual_tid, tid, next_tid(tid));
-#endif
-	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
-}
-
 static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
 	int cpu;
@@ -2751,7 +2703,6 @@  static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 				object, tid,
 				next_object, next_tid(tid)))) {
 
-			note_cmpxchg_failure("slab_alloc", s, tid);
 			goto redo;
 		}
 		prefetch_freepointer(s, next_object);
@@ -4694,66 +4645,6 @@  static int list_locations(struct kmem_cache *s, char *buf,
 }
 #endif	/* CONFIG_SLUB_DEBUG */
 
-#ifdef SLUB_RESILIENCY_TEST
-static void __init resiliency_test(void)
-{
-	u8 *p;
-	int type = KMALLOC_NORMAL;
-
-	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
-
-	pr_err("SLUB resiliency testing\n");
-	pr_err("-----------------------\n");
-	pr_err("A. Corruption after allocation\n");
-
-	p = kzalloc(16, GFP_KERNEL);
-	p[16] = 0x12;
-	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
-	       p + 16);
-
-	validate_slab_cache(kmalloc_caches[type][4]);
-
-	/* Hmmm... The next two are dangerous */
-	p = kzalloc(32, GFP_KERNEL);
-	p[32 + sizeof(void *)] = 0x34;
-	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
-	       p);
-	pr_err("If allocated object is overwritten then not detectable\n\n");
-
-	validate_slab_cache(kmalloc_caches[type][5]);
-	p = kzalloc(64, GFP_KERNEL);
-	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
-	*p = 0x56;
-	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
-	       p);
-	pr_err("If allocated object is overwritten then not detectable\n\n");
-	validate_slab_cache(kmalloc_caches[type][6]);
-
-	pr_err("\nB. Corruption after free\n");
-	p = kzalloc(128, GFP_KERNEL);
-	kfree(p);
-	*p = 0x78;
-	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
-	validate_slab_cache(kmalloc_caches[type][7]);
-
-	p = kzalloc(256, GFP_KERNEL);
-	kfree(p);
-	p[50] = 0x9a;
-	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
-	validate_slab_cache(kmalloc_caches[type][8]);
-
-	p = kzalloc(512, GFP_KERNEL);
-	kfree(p);
-	p[512] = 0xab;
-	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
-	validate_slab_cache(kmalloc_caches[type][9]);
-}
-#else
-#ifdef CONFIG_SYSFS
-static void resiliency_test(void) {};
-#endif
-#endif	/* SLUB_RESILIENCY_TEST */
-
 #ifdef CONFIG_SYSFS
 enum slab_stat_type {
 	SL_ALL,			/* All slabs */
@@ -5875,7 +5766,6 @@  static int __init slab_sysfs_init(void)
 	}
 
 	mutex_unlock(&slab_mutex);
-	resiliency_test();
 	return 0;
 }