[v3,2/2] mm: slub: call WARN() when the slab detects an error

Message ID 20250220033953.1606820-3-hyesoo.yu@samsung.com (mailing list archive)
State New
Series mm: slub: Enhanced debugging in slub error

Commit Message

Hyesoo Yu Feb. 20, 2025, 3:39 a.m. UTC
If a slab object is corrupted or its internal state is inconsistent,
continuing after restoration may cause other side effects. By the time
such a problem is noticed, it happened in the past, which makes it
difficult to debug. It is useful to call WARN() at the point the error
is detected, because WARN() can trigger a panic for system debugging
when panic_on_warn is enabled. Add WARN() where errors are detected,
in slab_err() and object_err().

There are cases where slab_err() is called before meaningful logs are
printed. If the WARN() in slab_err() caused a panic, those logs would
never be printed, so WARN() must only be called after they have been
emitted. Thus slab_err() is split: a new __slab_err() calls WARN() and
is invoked after the logs are printed.
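
For context: when panic_on_warn is set, the panic fires inside the
WARN() slow path itself, before control returns to the caller, so
anything the caller would have printed after the WARN() is lost. A
simplified sketch (the real code lives in kernel/panic.c; details vary
by kernel version):

	/*
	 * Simplified from kernel/panic.c: __warn() calls this helper,
	 * so with panic_on_warn=1 the system goes down before the
	 * WARN() caller runs another instruction. All diagnostics
	 * therefore have to be printed before WARN() is reached,
	 * which is what the __slab_err() split below arranges.
	 */
	void check_panic_on_warn(const char *origin)
	{
		if (panic_on_warn)
			panic("%s: panic_on_warn set ...\n", origin);
	}

A concrete case is slab_pad_check(): the "Padding" hexdump has to reach
the console before the WARN(), otherwise a panic_on_warn panic would
swallow it.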

Changes in v3:
- Move the WARN() from slab_fix() to slab_err() and object_err(), so
that WARN() is used on all error reporting paths.

Changes in v2:
- Replace the direct BUG_ON() call in slab_fix() with WARN().

Signed-off-by: Hyesoo Yu <hyesoo.yu@samsung.com>
Change-Id: I90b2ea9ffc58e3826f7ae9f1a774bb48c2d43bf4
---
 mm/slub.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

Comments

Harry Yoo Feb. 21, 2025, 8:30 a.m. UTC | #1
On Thu, Feb 20, 2025 at 12:39:44PM +0900, Hyesoo Yu wrote:
> [... commit message and diffstat trimmed ...]
>
> diff --git a/mm/slub.c b/mm/slub.c
> index de62fed12236..7f0583a71cda 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -5473,8 +5481,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  			remove_partial(n, slab);
>  			list_add(&slab->slab_list, &discard);
>  		} else {
> -			list_slab_objects(s, slab,
> -			  "Objects remaining in %s on __kmem_cache_shutdown()");
> +			list_slab_objects(s, slab);

Could you remove WARN() in kmem_cache_destroy()?

When a cache is destroyed with remaining objects, two WARNINGs are
printed: one from list_slab_objects() and another from
kmem_cache_destroy(). The latter becomes redundant with this patch.

The WARN() was added there because it is good to catch such an error,
and at that time slab_err() and object_err() did not call WARN().
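
Concretely, something like this on top of the series (untested sketch;
the mm/slab_common.c context lines are quoted from memory, so they may
differ). Keeping a pr_err() preserves the "called from %pS" caller
information while dropping the now-duplicate backtrace:

--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ void kmem_cache_destroy(struct kmem_cache *s)
 	err = shutdown_cache(s);
-	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
-	     __func__, s->name, (void *)_RET_IP_);
+	/* list_slab_objects() already WARNs with the full object dump */
+	if (err)
+		pr_err("%s %s: Slab cache still has objects when called from %pS\n",
+		       __func__, s->name, (void *)_RET_IP_);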

>  		}
>  	}
>  	spin_unlock_irq(&n->list_lock);
> -- 
> 2.28.0
>

Patch

diff --git a/mm/slub.c b/mm/slub.c
index de62fed12236..7f0583a71cda 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1096,8 +1096,6 @@  static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 		/* Beginning of the filler is the free pointer */
 		print_section(KERN_ERR, "Padding  ", p + off,
 			      size_from_object(s) - off);
-
-	dump_stack();
 }
 
 static void object_err(struct kmem_cache *s, struct slab *slab,
@@ -1109,6 +1107,8 @@  static void object_err(struct kmem_cache *s, struct slab *slab,
 	slab_bug(s, "%s", reason);
 	print_trailer(s, slab, object);
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+	WARN_ON(1);
 }
 
 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1125,6 +1125,14 @@  static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 	return false;
 }
 
+static void __slab_err(struct slab *slab)
+{
+	print_slab_info(slab);
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+	WARN_ON(1);
+}
+
 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
 			const char *fmt, ...)
 {
@@ -1138,9 +1146,7 @@  static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	slab_bug(s, "%s", buf);
-	print_slab_info(slab);
-	dump_stack();
-	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+	__slab_err(slab);
 }
 
 static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1316,9 +1322,10 @@  slab_pad_check(struct kmem_cache *s, struct slab *slab)
 	while (end > fault && end[-1] == POISON_INUSE)
 		end--;
 
-	slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
-			fault, end - 1, fault - start);
+	slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+		 fault, end - 1, fault - start);
 	print_section(KERN_ERR, "Padding ", pad, remainder);
+	__slab_err(slab);
 
 	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
 }
@@ -5431,14 +5438,13 @@  static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 	return !!oo_objects(s->oo);
 }
 
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
-			      const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
 {
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = slab_address(slab);
 	void *p;
 
-	slab_err(s, slab, text, s->name);
+	slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
 
 	spin_lock(&object_map_lock);
 	__fill_map(object_map, s, slab);
@@ -5453,6 +5459,8 @@  static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
 		}
 	}
 	spin_unlock(&object_map_lock);
+
+	__slab_err(slab);
 #endif
 }
 
@@ -5473,8 +5481,7 @@  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 			remove_partial(n, slab);
 			list_add(&slab->slab_list, &discard);
 		} else {
-			list_slab_objects(s, slab,
-			  "Objects remaining in %s on __kmem_cache_shutdown()");
+			list_slab_objects(s, slab);
 		}
 	}
 	spin_unlock_irq(&n->list_lock);
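
For reference, the easiest way to exercise these reporting paths is to
boot with SLUB consistency checks enabled and panic_on_warn set, e.g.
(standard kernel parameters, not specific to this series):

	slub_debug=FZPU panic_on_warn=1

or, to limit the overhead, enable the checks for a single cache with
slub_debug=FZPU,<cache-name>. Any detected redzone or poison corruption
then reaches object_err()/__slab_err() and, with panic_on_warn, stops
the system at the point of detection.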