
[v3,1/3] mm/slub: enable debugging memory wasting of kmalloc

Message ID 20220727071042.8796-2-feng.tang@intel.com (mailing list archive)
State New
Series mm/slub: some debug enhancements

Commit Message

Feng Tang July 27, 2022, 7:10 a.m. UTC
The kmalloc API family is critical for mm, and one of its properties is
that it rounds the requested size up to a fixed one (mostly a power of
2). So when a user requests '2^n + 1' bytes, 2^(n+1) bytes may actually
be allocated, which in the worst case means around 50% of the memory is
wasted.
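
As a rough illustration of that rounding (a userspace sketch assuming
power-of-2 buckets similar to the kmalloc-* caches, not the kernel's
actual size-class setup):

/*
 * Userspace sketch only: assumes power-of-2 buckets similar to the
 * kmalloc-* caches, not the kernel's actual implementation.
 */
#include <stdio.h>

static size_t bucket_size(size_t req)
{
	size_t b = 8;			/* smallest illustrative bucket */

	while (b < req)
		b <<= 1;		/* round up to the next power of 2 */
	return b;
}

int main(void)
{
	size_t req = 1025;		/* 2^10 + 1 bytes */
	size_t alloc = bucket_size(req);

	printf("requested %zu, allocated %zu, wasted %zu bytes\n",
	       req, alloc, alloc - req);	/* 1025 -> 2048, 1023 wasted */
	return 0;
}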

The wastage is not a big issue for requests that are allocated and
freed quickly, but it can cause problems for objects with a longer
lifetime.

We hit a kernel boot OOM panic (v5.10), and the dumped slab info
showed:

    [   26.062145] kmalloc-2k            814056KB     814056KB

Debugging showed a huge number of 'struct iova_magazine' objects, whose
size is 1032 bytes (1024 + 8), so each allocation wastes 1016 bytes.
Though the issue was solved by giving the machine the right (bigger)
amount of RAM, it would still be nice to optimize the size (either use
a kmalloc-friendly size or create a dedicated slab cache for it).
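
For reference, the 'dedicated slab' option could look roughly like the
sketch below (hypothetical: the cache name, init function and alloc
wrapper are illustrative and assume the 'struct iova_magazine'
definition from the iova code; this is not the actual driver change):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

/* Hypothetical dedicated cache, sized exactly to the 1032-byte object */
static struct kmem_cache *iova_magazine_cache;

static int __init iova_magazine_cache_init(void)
{
	iova_magazine_cache = kmem_cache_create("iova_magazine",
						sizeof(struct iova_magazine),
						0, 0, NULL);
	return iova_magazine_cache ? 0 : -ENOMEM;
}

static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
{
	/* no power-of-2 rounding, so no 1016-byte per-object waste */
	return kmem_cache_alloc(iova_magazine_cache, flags);
}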

And from the lkml archive, there was another crash-kernel OOM case [1]
back in 2019, which seems to be related to a similar slab waste
situation, as the log is similar:

    [    4.332648] iommu: Adding device 0000:20:02.0 to group 16
    [    4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0
    ...
    [    4.857565] kmalloc-2048           59164KB      59164KB

The crash kernel only had 256MB of memory, and 59MB is pretty big
there. (Note: the related code has been changed and optimized in recent
kernels [2]; these logs are shown just to demonstrate the problem.)

So add a way to track each kmalloc's memory waste info, and leverage
the existing SLUB debug framework to show the call stack of the
original allocation, so that users can evaluate the waste situation,
identify hot spots and optimize accordingly, for better memory
utilization.

The waste info is integrated into the existing interface
/sys/kernel/debug/slab/kmalloc-xx/alloc_traces; one example from
'kmalloc-4k' after boot is:

126 ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe] waste=233856/1856 age=1493302/1493830/1494358 pid=1284 cpus=32 nodes=1
        __slab_alloc.isra.86+0x52/0x80
        __kmalloc_node+0x143/0x350
        ixgbe_alloc_q_vector+0xa5/0x4a0 [ixgbe]
        ixgbe_init_interrupt_scheme+0x1a6/0x730 [ixgbe]
        ixgbe_probe+0xc8e/0x10d0 [ixgbe]
        local_pci_probe+0x42/0x80
        work_for_cpu_fn+0x13/0x20
        process_one_work+0x1c5/0x390

which means that in the 'kmalloc-4k' slab there are 126 requests of
2240 bytes, each of which got a 4KB slot (wasting 1856 bytes per object
and 233856 bytes in total). And once the system starts a real workload
such as multiple docker instances, the waste becomes more severe.
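
To connect the numbers in that line (based on the patch below, where
the per-object waste is s->object_size - orig_size and the report
prints 'waste=<total>/<per-object>'):

    per-object waste = object_size - orig_size  = 4096 - 2240 = 1856 bytes
    total waste      = count * per-object waste = 126 * 1856  = 233856 bytes

Note that alloc_traces is only populated when allocation tracking
(SLAB_STORE_USER) is enabled, typically via the slub_debug boot
parameter, e.g. 'slub_debug=U,kmalloc-*'.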

[1]. https://lkml.org/lkml/2019/8/12/266
[2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/

[Thanks Hyeonggon for pointing out several bugs about sorting/format]
[Thanks Vlastimil for suggesting a way to reduce the memory usage of
 orig_size and keep it only for kmalloc objects]

Signed-off-by: Feng Tang <feng.tang@intel.com>
---
 include/linux/slab.h |  2 +
 mm/slub.c            | 99 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 83 insertions(+), 18 deletions(-)

Comments

Christoph Lameter July 27, 2022, 10:20 a.m. UTC | #1
On Wed, 27 Jul 2022, Feng Tang wrote:

> @@ -2905,7 +2950,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
>   * already disabled (which is the case for bulk allocation).
>   */
>  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> -			  unsigned long addr, struct kmem_cache_cpu *c)
> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>  {
>  	void *freelist;
>  	struct slab *slab;
> @@ -3102,7 +3147,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>   * pointer.
>   */
>  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> -			  unsigned long addr, struct kmem_cache_cpu *c)
> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>  {
>  	void *p;
>
> @@ -3115,7 +3160,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	c = slub_get_cpu_ptr(s->cpu_slab);
>  #endif
>
> -	p = ___slab_alloc(s, gfpflags, node, addr, c);
> +	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
>  #ifdef CONFIG_PREEMPT_COUNT
>  	slub_put_cpu_ptr(s->cpu_slab);

This is modifying and making execution of standard slab functions more
expensive. Could you restrict modifications to the kmalloc subsystem?

kmem_cache_alloc() and friends are not doing any rounding up to power of
two  sizes.

What is happening here is that you pass kmalloc object size info through
the kmem_cache_alloc functions so that the regular allocation functions
debug functionality can then save the kmalloc specific object request
size. This is active even when no debugging options are enabled.

Can you avoid that? Have kmalloc do the object allocation without passing
through the kmalloc request size and then add the original size info
to the debug field later after execution continues in the kmalloc functions?
Feng Tang July 27, 2022, 12:59 p.m. UTC | #2
On 2022/7/27 18:20, Christoph Lameter wrote:
> On Wed, 27 Jul 2022, Feng Tang wrote:
> 
>> @@ -2905,7 +2950,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
>>    * already disabled (which is the case for bulk allocation).
>>    */
>>   static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>> -			  unsigned long addr, struct kmem_cache_cpu *c)
>> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>>   {
>>   	void *freelist;
>>   	struct slab *slab;
>> @@ -3102,7 +3147,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>    * pointer.
>>    */
>>   static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>> -			  unsigned long addr, struct kmem_cache_cpu *c)
>> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>>   {
>>   	void *p;
>>
>> @@ -3115,7 +3160,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>   	c = slub_get_cpu_ptr(s->cpu_slab);
>>   #endif
>>
>> -	p = ___slab_alloc(s, gfpflags, node, addr, c);
>> +	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
>>   #ifdef CONFIG_PREEMPT_COUNT
>>   	slub_put_cpu_ptr(s->cpu_slab);
> 
> This is modifying and making execution of standard slab functions more
> expensive. Could you restrict modifications to the kmalloc subsystem?
> 
> kmem_cache_alloc() and friends are not doing any rounding up to power of
> two  sizes.
> 
> What is happening here is that you pass kmalloc object size info through
> the kmem_cache_alloc functions so that the regular allocation functions
> debug functionality can then save the kmalloc specific object request
> size. This is active even when no debugging options are enabled.

Yes, it is indeed some extra cost, which I don't like either.

> 
> Can you avoid that? Have kmalloc do the object allocation without passing
> through the kmalloc request size and then add the original size info
> to the debug field later after execution continues in the kmalloc functions?


How about the following patch, which adds no new 'orig_size' parameter
to the core functions? (The following 2nd and 3rd redzone debug patches
may also need some changes.)

(Our email server has just been changed and my mutt can't work
correctly, so the formatting may be broken; I have attached the new
patch as well. Sorry for the inconvenience!)

Thanks,
Feng

---
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0fefdf528e0d..a713b0e5bbcd 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -29,6 +29,8 @@
 #define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
 /* DEBUG: Poison objects */
 #define SLAB_POISON		((slab_flags_t __force)0x00000800U)
+/* Indicate a kmalloc slab */
+#define SLAB_KMALLOC		((slab_flags_t __force)0x00001000U)
 /* Align objs on cache lines */
 #define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..97c21a37a6a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -191,6 +191,12 @@ static inline bool kmem_cache_debug(struct kmem_cache *s)
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 }
 
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+			(s->flags & SLAB_KMALLOC));
+}
+
 void *fixup_red_left(struct kmem_cache *s, void *p)
 {
 	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -816,6 +822,39 @@ static void print_slab_info(const struct slab *slab)
 	       folio_flags(folio, 0));
 }
 
+static inline unsigned int *get_orig_size_pointer(struct kmem_cache *s,
+						void *object)
+{
+	void *p = kasan_reset_tag(object);
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+	return (unsigned int *)p;
+}
+
+static void set_orig_size(struct kmem_cache *s,
+					void *object, unsigned int orig_size)
+{
+	unsigned int *p;
+
+	if (!slub_debug_orig_size(s))
+		return;
+
+	p = get_orig_size_pointer(s, object);
+	*p = orig_size;
+}
+
+static unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+	unsigned int *p;
+
+	if (!slub_debug_orig_size(s))
+		return s->object_size;
+
+	p = get_orig_size_pointer(s, object);
+	return *p;
+}
+
 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 {
 	struct va_format vaf;
@@ -875,6 +914,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 	if (s->flags & SLAB_STORE_USER)
 		off += 2 * sizeof(struct track);
 
+	if (slub_debug_orig_size(s))
+		off += sizeof(unsigned int);
+
 	off += kasan_metadata_size(s);
 
 	if (off != size_from_object(s))
@@ -1026,10 +1068,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 {
 	unsigned long off = get_info_end(s);	/* The end of info */
 
-	if (s->flags & SLAB_STORE_USER)
+	if (s->flags & SLAB_STORE_USER) {
 		/* We also have user information there */
 		off += 2 * sizeof(struct track);
 
+		if (s->flags & SLAB_KMALLOC)
+			off += sizeof(unsigned int);
+	}
+
 	off += kasan_metadata_size(s);
 
 	if (size_from_object(s) == off)
@@ -1335,6 +1381,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
 	/* Success perform special debug activities for allocs */
 	if (s->flags & SLAB_STORE_USER)
 		set_track(s, object, TRACK_ALLOC, addr);
+
 	trace(s, slab, object, 1);
 	init_object(s, object, SLUB_RED_ACTIVE);
 	return 1;
@@ -3240,6 +3287,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l
 	init = slab_want_init_on_alloc(gfpflags, s);
 
 out:
+#ifdef CONFIG_SLUB_DEBUG
+	set_orig_size(s, object, orig_size);
+#endif
 	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
 
 	return object;
@@ -4112,12 +4162,17 @@ static int calculate_sizes(struct kmem_cache *s)
 	}
 
 #ifdef CONFIG_SLUB_DEBUG
-	if (flags & SLAB_STORE_USER)
+	if (flags & SLAB_STORE_USER) {
 		/*
 		 * Need to store information about allocs and frees after
 		 * the object.
		 */
 		size += 2 * sizeof(struct track);
+
+		/* Save the original kmalloc request size */
+		if (flags & SLAB_KMALLOC)
+			size += sizeof(unsigned int);
+	}
 #endif
 
 	kasan_cache_create(s, &size, &s->flags);
@@ -4842,7 +4897,7 @@ void __init kmem_cache_init(void)
 
 	/* Now we can use the kmem_cache to allocate kmalloc slabs */
 	setup_kmalloc_cache_index_table();
-	create_kmalloc_caches(0);
+	create_kmalloc_caches(SLAB_KMALLOC);
 
 	/* Setup random freelists for each cache */
 	init_freelist_randomization();
@@ -5068,6 +5123,7 @@ struct location {
 	depot_stack_handle_t handle;
 	unsigned long count;
 	unsigned long addr;
+	unsigned long waste;
 	long long sum_time;
 	long min_time;
 	long max_time;
@@ -5114,13 +5170,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 }
 
 static int add_location(struct loc_track *t, struct kmem_cache *s,
-				const struct track *track)
+				const struct track *track,
+				unsigned int orig_size)
 {
 	long start, end, pos;
 	struct location *l;
-	unsigned long caddr, chandle;
+	unsigned long caddr, chandle, cwaste;
 	unsigned long age = jiffies - track->when;
 	depot_stack_handle_t handle = 0;
+	unsigned int waste = s->object_size - orig_size;
 
 #ifdef CONFIG_STACKDEPOT
 	handle = READ_ONCE(track->handle);
@@ -5138,11 +5196,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 		if (pos == end)
 			break;
 
-		caddr = t->loc[pos].addr;
-		chandle = t->loc[pos].handle;
-		if ((track->addr == caddr) && (handle == chandle)) {
+		l = &t->loc[pos];
+		caddr = l->addr;
+		chandle = l->handle;
+		cwaste = l->waste;
+		if ((track->addr == caddr) && (handle == chandle) &&
+			(waste == cwaste)) {
 
-			l = &t->loc[pos];
 			l->count++;
 			if (track->when) {
 				l->sum_time += age;
@@ -5167,6 +5227,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 			end = pos;
 		else if (track->addr == caddr && handle < chandle)
 			end = pos;
+		else if (track->addr == caddr && handle == chandle &&
+				waste < cwaste)
+			end = pos;
 		else
 			start = pos;
 	}
@@ -5190,6 +5253,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 	l->min_pid = track->pid;
 	l->max_pid = track->pid;
 	l->handle = handle;
+	l->waste = waste;
 	cpumask_clear(to_cpumask(l->cpus));
 	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
 	nodes_clear(l->nodes);
@@ -5208,7 +5272,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
 
 	for_each_object(p, s, addr, slab->objects)
 		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
-			add_location(t, s, get_track(s, p, alloc));
+			add_location(t, s, get_track(s, p, alloc), get_orig_size(s, p));
 }
 #endif  /* CONFIG_DEBUG_FS   */
 #endif	/* CONFIG_SLUB_DEBUG */
@@ -6078,6 +6142,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
 		else
 			seq_puts(seq, "<not-available>");
 
+		if (l->waste)
+			seq_printf(seq, " waste=%lu/%lu",
+				l->count * l->waste, l->waste);
+
 		if (l->sum_time != l->min_time) {
 			seq_printf(seq, " age=%ld/%llu/%ld",
 				l->min_time, div_u64(l->sum_time, l->count),
Vlastimil Babka July 27, 2022, 2:12 p.m. UTC | #3
On 7/27/22 12:20, Christoph Lameter wrote:
> On Wed, 27 Jul 2022, Feng Tang wrote:
> 
>> @@ -2905,7 +2950,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
>>   * already disabled (which is the case for bulk allocation).
>>   */
>>  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>> -			  unsigned long addr, struct kmem_cache_cpu *c)
>> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>>  {
>>  	void *freelist;
>>  	struct slab *slab;
>> @@ -3102,7 +3147,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>   * pointer.
>>   */
>>  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>> -			  unsigned long addr, struct kmem_cache_cpu *c)
>> +			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
>>  {
>>  	void *p;
>>
>> @@ -3115,7 +3160,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>  	c = slub_get_cpu_ptr(s->cpu_slab);
>>  #endif
>>
>> -	p = ___slab_alloc(s, gfpflags, node, addr, c);
>> +	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
>>  #ifdef CONFIG_PREEMPT_COUNT
>>  	slub_put_cpu_ptr(s->cpu_slab);
> 
> This is modifying and making execution of standard slab functions more
> expensive. Could you restrict modifications to the kmalloc subsystem?
> 
> kmem_cache_alloc() and friends are not doing any rounding up to power of
> two  sizes.
> 
> What is happening here is that you pass kmalloc object size info through
> the kmem_cache_alloc functions so that the regular allocation functions
> debug functionality can then save the kmalloc specific object request
> size. This is active even when no debugging options are enabled.

I don't think the extra orig_size parameter (unused for non-debug caches)
adds any noticeable overhead. In slab_alloc_node() we already have the
orig_size parameter (for both kmalloc and non-kmalloc caches) before this
patch, and it remains unused in the cmpxchg based fast path. The patch adds
it to __slab_alloc() which is not the fast path, and it's still unused for
non-debug caches there. So the overhead is basically one less register
available (because of the extra param) in a slow path and that should be
immeasurable.

> Can you avoid that? Have kmalloc do the object allocation without passing
> through the kmalloc request size and then add the original size info
> to the debug field later after execution continues in the kmalloc functions?

That approach is problematic wrt patches 2+3 if we want to use orig_size to
affect the boundaries of zero-init and redzoning.
Also it goes against the attempt to fix races wrt validation; see [1], where
the idea is to have alloc_debug_processing(), including redzoning, done under
n->list_lock, and for that orig_size should be passed there as well.

[1] https://lore.kernel.org/all/69462916-2d1c-dd50-2e64-b31c2b61690e@suse.cz/

Patch

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0fefdf528e0d..a713b0e5bbcd 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -29,6 +29,8 @@ 
 #define SLAB_RED_ZONE		((slab_flags_t __force)0x00000400U)
 /* DEBUG: Poison objects */
 #define SLAB_POISON		((slab_flags_t __force)0x00000800U)
+/* Indicate a kmalloc slab */
+#define SLAB_KMALLOC		((slab_flags_t __force)0x00001000U)
 /* Align objs on cache lines */
 #define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..2e046cc10b84 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -191,6 +191,12 @@  static inline bool kmem_cache_debug(struct kmem_cache *s)
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 }
 
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+			(s->flags & SLAB_KMALLOC));
+}
+
 void *fixup_red_left(struct kmem_cache *s, void *p)
 {
 	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -816,6 +822,33 @@  static void print_slab_info(const struct slab *slab)
 	       folio_flags(folio, 0));
 }
 
+static inline void set_orig_size(struct kmem_cache *s,
+					void *object, unsigned int orig_size)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return;
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	*(unsigned int *)p = orig_size;
+}
+
+static unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return s->object_size;
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	return *(unsigned int *)p;
+}
+
 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 {
 	struct va_format vaf;
@@ -875,6 +908,9 @@  static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 	if (s->flags & SLAB_STORE_USER)
 		off += 2 * sizeof(struct track);
 
+	if (slub_debug_orig_size(s))
+		off += sizeof(unsigned int);
+
 	off += kasan_metadata_size(s);
 
 	if (off != size_from_object(s))
@@ -1026,10 +1062,14 @@  static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 {
 	unsigned long off = get_info_end(s);	/* The end of info */
 
-	if (s->flags & SLAB_STORE_USER)
+	if (s->flags & SLAB_STORE_USER) {
 		/* We also have user information there */
 		off += 2 * sizeof(struct track);
 
+		if (s->flags & SLAB_KMALLOC)
+			off += sizeof(unsigned int);
+	}
+
 	off += kasan_metadata_size(s);
 
 	if (size_from_object(s) == off)
@@ -1325,7 +1365,8 @@  static inline int alloc_consistency_checks(struct kmem_cache *s,
 
 static noinline int alloc_debug_processing(struct kmem_cache *s,
 					struct slab *slab,
-					void *object, unsigned long addr)
+					void *object, unsigned long addr,
+					unsigned int orig_size)
 {
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 		if (!alloc_consistency_checks(s, slab, object))
@@ -1335,6 +1376,9 @@  static noinline int alloc_debug_processing(struct kmem_cache *s,
 	/* Success perform special debug activities for allocs */
 	if (s->flags & SLAB_STORE_USER)
 		set_track(s, object, TRACK_ALLOC, addr);
+
+	set_orig_size(s, object, orig_size);
+
 	trace(s, slab, object, 1);
 	init_object(s, object, SLUB_RED_ACTIVE);
 	return 1;
@@ -1661,7 +1705,8 @@  static inline
 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
-	struct slab *slab, void *object, unsigned long addr) { return 0; }
+	struct slab *slab, void *object, unsigned long addr,
+	unsigned int orig_size) { return 0; }
 
 static inline int free_debug_processing(
 	struct kmem_cache *s, struct slab *slab,
@@ -2905,7 +2950,7 @@  static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
  * already disabled (which is the case for bulk allocation).
  */
 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 {
 	void *freelist;
 	struct slab *slab;
@@ -3048,7 +3093,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 check_new_slab:
 
 	if (kmem_cache_debug(s)) {
-		if (!alloc_debug_processing(s, slab, freelist, addr)) {
+		if (!alloc_debug_processing(s, slab, freelist, addr, orig_size)) {
 			/* Slab failed checks. Next slab needed */
 			goto new_slab;
 		} else {
@@ -3102,7 +3147,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  * pointer.
  */
 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 {
 	void *p;
 
@@ -3115,7 +3160,7 @@  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	c = slub_get_cpu_ptr(s->cpu_slab);
 #endif
 
-	p = ___slab_alloc(s, gfpflags, node, addr, c);
+	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
 #ifdef CONFIG_PREEMPT_COUNT
 	slub_put_cpu_ptr(s->cpu_slab);
 #endif
@@ -3206,7 +3251,7 @@  static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l
 	 */
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
 	    unlikely(!object || !slab || !node_match(slab, node))) {
-		object = __slab_alloc(s, gfpflags, node, addr, c);
+		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
 	} else {
 		void *next_object = get_freepointer_safe(s, object);
 
@@ -3709,7 +3754,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 * of re-populating per CPU c->freelist
 			 */
 			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
-					    _RET_IP_, c);
+					    _RET_IP_, c, s->object_size);
 			if (unlikely(!p[i]))
 				goto error;
 
@@ -4112,12 +4157,17 @@  static int calculate_sizes(struct kmem_cache *s)
 	}
 
 #ifdef CONFIG_SLUB_DEBUG
-	if (flags & SLAB_STORE_USER)
+	if (flags & SLAB_STORE_USER) {
 		/*
 		 * Need to store information about allocs and frees after
 		 * the object.
 		 */
 		size += 2 * sizeof(struct track);
+
+		/* Save the original kmalloc request size */
+		if (flags & SLAB_KMALLOC)
+			size += sizeof(unsigned int);
+	}
 #endif
 
 	kasan_cache_create(s, &size, &s->flags);
@@ -4842,7 +4892,7 @@  void __init kmem_cache_init(void)
 
 	/* Now we can use the kmem_cache to allocate kmalloc slabs */
 	setup_kmalloc_cache_index_table();
-	create_kmalloc_caches(0);
+	create_kmalloc_caches(SLAB_KMALLOC);
 
 	/* Setup random freelists for each cache */
 	init_freelist_randomization();
@@ -5068,6 +5118,7 @@  struct location {
 	depot_stack_handle_t handle;
 	unsigned long count;
 	unsigned long addr;
+	unsigned long waste;
 	long long sum_time;
 	long min_time;
 	long max_time;
@@ -5114,13 +5165,15 @@  static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 }
 
 static int add_location(struct loc_track *t, struct kmem_cache *s,
-				const struct track *track)
+				const struct track *track,
+				unsigned int orig_size)
 {
 	long start, end, pos;
 	struct location *l;
-	unsigned long caddr, chandle;
+	unsigned long caddr, chandle, cwaste;
 	unsigned long age = jiffies - track->when;
 	depot_stack_handle_t handle = 0;
+	unsigned int waste = s->object_size - orig_size;
 
 #ifdef CONFIG_STACKDEPOT
 	handle = READ_ONCE(track->handle);
@@ -5138,11 +5191,13 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 		if (pos == end)
 			break;
 
-		caddr = t->loc[pos].addr;
-		chandle = t->loc[pos].handle;
-		if ((track->addr == caddr) && (handle == chandle)) {
+		l = &t->loc[pos];
+		caddr = l->addr;
+		chandle = l->handle;
+		cwaste = l->waste;
+		if ((track->addr == caddr) && (handle == chandle) &&
+			(waste == cwaste)) {
 
-			l = &t->loc[pos];
 			l->count++;
 			if (track->when) {
 				l->sum_time += age;
@@ -5167,6 +5222,9 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 			end = pos;
 		else if (track->addr == caddr && handle < chandle)
 			end = pos;
+		else if (track->addr == caddr && handle == chandle &&
+				waste < cwaste)
+			end = pos;
 		else
 			start = pos;
 	}
@@ -5190,6 +5248,7 @@  static int add_location(struct loc_track *t, struct kmem_cache *s,
 	l->min_pid = track->pid;
 	l->max_pid = track->pid;
 	l->handle = handle;
+	l->waste = waste;
 	cpumask_clear(to_cpumask(l->cpus));
 	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
 	nodes_clear(l->nodes);
@@ -5208,7 +5267,7 @@  static void process_slab(struct loc_track *t, struct kmem_cache *s,
 
 	for_each_object(p, s, addr, slab->objects)
 		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
-			add_location(t, s, get_track(s, p, alloc));
+			add_location(t, s, get_track(s, p, alloc), get_orig_size(s, p));
 }
 #endif  /* CONFIG_DEBUG_FS   */
 #endif	/* CONFIG_SLUB_DEBUG */
@@ -6078,6 +6137,10 @@  static int slab_debugfs_show(struct seq_file *seq, void *v)
 		else
 			seq_puts(seq, "<not-available>");
 
+		if (l->waste)
+			seq_printf(seq, " waste=%lu/%lu",
+				l->count * l->waste, l->waste);
+
 		if (l->sum_time != l->min_time) {
 			seq_printf(seq, " age=%ld/%llu/%ld",
 				l->min_time, div_u64(l->sum_time, l->count),
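
For readers tracing the metadata layout: with SLAB_STORE_USER set on a
kmalloc cache, calculate_sizes() reserves an extra 'unsigned int' after
the two track records, and set_orig_size()/get_orig_size() access it at
that offset. Roughly (a sketch derived from the hunks above; redzone
and poison details omitted):

/*
 * Per-object metadata layout sketch (kmalloc cache, SLAB_STORE_USER set):
 *
 *   object data ...              up to get_info_end(s)
 *   struct track                 TRACK_ALLOC
 *   struct track                 TRACK_FREE
 *   unsigned int orig_size       added by this patch (SLAB_KMALLOC caches only)
 *   kasan metadata               if any
 */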