
[RFC,v2,2/7] mm, slub: add opt-in slub_percpu_array

Message ID 20230810163627.6206-11-vbabka@suse.cz (mailing list archive)
State New
Series SLUB percpu array caches and maple tree nodes

Commit Message

Vlastimil Babka Aug. 10, 2023, 4:36 p.m. UTC
kmem_cache_setup_percpu_array() will allocate a per-cpu array for
caching alloc/free objects of the given size for the cache. The cache
has to be created with the SLAB_NO_MERGE flag.
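
For illustration, a minimal usage sketch follows; the "foo" cache and the
array size of 32 are made up for this example and are not part of the series:

#include <linux/slab.h>

struct foo {
        unsigned long data[8];
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
        /* The cache must be unmergeable to get a percpu array. */
        foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
                                      SLAB_NO_MERGE, NULL);
        if (!foo_cache)
                return -ENOMEM;

        /* Attach a 32-entry percpu array; returns 0 or -errno. */
        return kmem_cache_setup_percpu_array(foo_cache, 32);
}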

The array is filled only by freeing. When the array is empty on alloc
or full on free, the operation simply bypasses it; there is currently
no batched allocation to refill it or batched freeing to flush it.

The locking is copied from the page allocator's pcplists and is based
on embedded spin locks. Interrupts are not disabled, only preemption
(cpu migration on RT). A trylock is attempted to avoid deadlock due to
an interrupt; a trylock failure means the array is bypassed.

Sysfs stat counters alloc_cpu_cache and free_cpu_cache count operations
that used the percpu array.

Bulk allocation bypasses the array; bulk freeing does not.

kmem_cache_prefill_percpu_array() can be called to fill the array on
the current cpu to at least the given number of objects. However, this
is only opportunistic, as there is no cpu pinning and the trylocks may
always fail. Therefore allocations cannot rely on the array for success
even after the prefill. But misses should be rare enough that e.g.
GFP_ATOMIC allocations are acceptable after the prefill.
The operation is currently not optimized.
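
For illustration, the intended pattern is roughly the following sketch,
continuing the hypothetical foo_cache above; the spinlock stands in for
whatever restricted context is entered after the prefill:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_lock);

static struct foo *foo_alloc_restricted(void)
{
        struct foo *obj;
        int ret;

        /* Opportunistic; may fill fewer objects than requested. */
        ret = kmem_cache_prefill_percpu_array(foo_cache, 8, GFP_KERNEL);
        if (ret < 0)
                return NULL;

        spin_lock(&foo_lock);
        /*
         * Usually served from the percpu array filled above; if the task
         * migrated or the array was depleted meanwhile, this falls back
         * to the regular paths and the GFP_ATOMIC reserves.
         */
        obj = kmem_cache_alloc(foo_cache, GFP_ATOMIC);
        spin_unlock(&foo_lock);

        return obj;
}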

Mark SLAB_DEPRECATED as BROKEN so the new APIs don't need to be
reimplemented there and the bots don't complain. SLAB has percpu arrays
by design but their sizes are determined internally.

More TODO/FIXMEs:

- NUMA awareness - preferred node currently ignored, __GFP_THISNODE not
  honored
- slub_debug - will not work for allocations from the array. Normally
  in the SLUB implementation slub_debug kills all fast paths, but doing
  that here could lead to depleting the reserves if we ignore the
  prefill and use GFP_ATOMIC. Needs more thought.
---
 include/linux/slab.h     |   4 +
 include/linux/slub_def.h |  10 ++
 mm/Kconfig               |   1 +
 mm/slub.c                | 210 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 224 insertions(+), 1 deletion(-)

Comments

Hyeonggon Yoo Aug. 21, 2023, 2:57 p.m. UTC | #1
Hi,

On Fri, Aug 11, 2023 at 1:36 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> kmem_cache_setup_percpu_array() will allocate a per-cpu array for
> caching alloc/free objects of given size for the cache. The cache
> has to be created with SLAB_NO_MERGE flag.
>
> The array is filled by freeing. When empty for alloc or full for
> freeing, it's simply bypassed by the operation, there's currently no
> batch freeing/allocations.
>
> The locking is copied from the page allocator's pcplists, based on
> embedded spin locks. Interrupts are not disabled, only preemption (cpu
> migration on RT). Trylock is attempted to avoid deadlock due to
> an intnerrupt, trylock failure means the array is bypassed.

nit: s/intnerrupt/interrupt/

>  /*
>   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
>   * have the fastpath folded into their functions. So no function call
> @@ -3465,7 +3564,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>         if (unlikely(object))
>                 goto out;
>
> -       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> +       if (s->cpu_array)
> +               object = alloc_from_pca(s);
> +
> +       if (!object)
> +               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
>
>         maybe_wipe_obj_freeptr(s, object);
>         init = slab_want_init_on_alloc(gfpflags, s);
> @@ -3715,6 +3818,34 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>         discard_slab(s, slab);
>  }

>  #ifndef CONFIG_SLUB_TINY
>  /*
>   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> @@ -3740,6 +3871,11 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>         unsigned long tid;
>         void **freelist;
>
> +       if (s->cpu_array && cnt == 1) {
> +               if (free_to_pca(s, head))
> +                       return;
> +       }
> +
>  redo:
>         /*
>          * Determine the currently cpus per cpu slab.
> @@ -3793,6 +3929,11 @@ static void do_slab_free(struct kmem_cache *s,
>  {
>         void *tail_obj = tail ? : head;
>
> +       if (s->cpu_array && cnt == 1) {
> +               if (free_to_pca(s, head))
> +                       return;
> +       }
> +
>         __slab_free(s, slab, head, tail_obj, cnt, addr);
>  }
>  #endif /* CONFIG_SLUB_TINY */

Is this functionality needed for SLUB_TINY?

> @@ -4060,6 +4201,45 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>  }
>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>
> +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
> +               gfp_t gfp)
> +{
> +       struct slub_percpu_array *pca;
> +       void *objects[32];
> +       unsigned int used;
> +       unsigned int allocated;
> +
> +       if (!s->cpu_array)
> +               return -EINVAL;
> +
> +       /* racy but we don't care */
> +       pca = raw_cpu_ptr(s->cpu_array);
> +
> +       used = READ_ONCE(pca->used);

Hmm for the prefill to be meaningful,
remote allocation should be possible, right?

Otherwise it only prefills for the CPU that requested it.

> +       if (used >= count)
> +               return 0;
> +
> +       if (pca->count < count)
> +               return -EINVAL;
> +
> +       count -= used;
> +
> +       /* TODO fix later */
> +       if (count > 32)
> +               count = 32;
> +
> +       for (int i = 0; i < count; i++)
> +               objects[i] = NULL;
> +       allocated = kmem_cache_alloc_bulk(s, gfp, count, &objects[0]);
> +
> +       for (int i = 0; i < count; i++) {
> +               if (objects[i]) {
> +                       kmem_cache_free(s, objects[i]);
> +               }
> +       }

nit: why not

for (int i = 0; i < allocated; i++) {
    kmem_cache_free(s, objects[i]);
}

and skip objects[i] = NULL

> +       return allocated;
> +}

And a question:
Does SLUB still need to maintain per-cpu partial slab lists even when
an opt-in percpu array is used?

Vlastimil Babka Nov. 28, 2023, 5:37 p.m. UTC | #2
On 8/21/23 16:57, Hyeonggon Yoo wrote:
> Hi,
> 
> On Fri, Aug 11, 2023 at 1:36 AM Vlastimil Babka <vbabka@suse.cz> wrote:

Oops, looks like I forgot to reply, sorry (preparing v3 now).

>>
>> kmem_cache_setup_percpu_array() will allocate a per-cpu array for
>> caching alloc/free objects of given size for the cache. The cache
>> has to be created with SLAB_NO_MERGE flag.
>>
>> The array is filled by freeing. When empty for alloc or full for
>> freeing, it's simply bypassed by the operation, there's currently no
>> batch freeing/allocations.
>>
>> The locking is copied from the page allocator's pcplists, based on
>> embedded spin locks. Interrupts are not disabled, only preemption (cpu
>> migration on RT). Trylock is attempted to avoid deadlock due to
>> an intnerrupt, trylock failure means the array is bypassed.
> 
> nit: s/intnerrupt/interrupt/

Thanks.

> 
>>  /*
>>   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
>>   * have the fastpath folded into their functions. So no function call
>> @@ -3465,7 +3564,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>>         if (unlikely(object))
>>                 goto out;
>>
>> -       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
>> +       if (s->cpu_array)
>> +               object = alloc_from_pca(s);
>> +
>> +       if (!object)
>> +               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
>>
>>         maybe_wipe_obj_freeptr(s, object);
>>         init = slab_want_init_on_alloc(gfpflags, s);
>> @@ -3715,6 +3818,34 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>         discard_slab(s, slab);
>>  }
> 
>>  #ifndef CONFIG_SLUB_TINY
>>  /*
>>   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
>> @@ -3740,6 +3871,11 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>>         unsigned long tid;
>>         void **freelist;
>>
>> +       if (s->cpu_array && cnt == 1) {
>> +               if (free_to_pca(s, head))
>> +                       return;
>> +       }
>> +
>>  redo:
>>         /*
>>          * Determine the currently cpus per cpu slab.
>> @@ -3793,6 +3929,11 @@ static void do_slab_free(struct kmem_cache *s,
>>  {
>>         void *tail_obj = tail ? : head;
>>
>> +       if (s->cpu_array && cnt == 1) {
>> +               if (free_to_pca(s, head))
>> +                       return;
>> +       }
>> +
>>         __slab_free(s, slab, head, tail_obj, cnt, addr);
>>  }
>>  #endif /* CONFIG_SLUB_TINY */
> 
> Is this functionality needed for SLUB_TINY?

Due to the prefill semantics, I think it has to be there even in TINY, or we
risk running out of memory reserves. Also later I want to investigate
extending this approach for supporting allocations in very constrained
contexts (NMI) so e.g. bpf doesn't have to reimplement the slab allocator,
and that would also not be good to limit to !SLUB_TINY.

>> @@ -4060,6 +4201,45 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>>  }
>>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>>
>> +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
>> +               gfp_t gfp)
>> +{
>> +       struct slub_percpu_array *pca;
>> +       void *objects[32];
>> +       unsigned int used;
>> +       unsigned int allocated;
>> +
>> +       if (!s->cpu_array)
>> +               return -EINVAL;
>> +
>> +       /* racy but we don't care */
>> +       pca = raw_cpu_ptr(s->cpu_array);
>> +
>> +       used = READ_ONCE(pca->used);
> 
> Hmm for the prefill to be meaningful,
> remote allocation should be possible, right?

Remote in what sense?

> Otherwise it only prefills for the CPU that requested it.

If there's a cpu migration between the prefill and the usage, it might run
out of the cached array, but the assumption is that this is rare enough not
to become an issue.

>> +       if (used >= count)
>> +               return 0;
>> +
>> +       if (pca->count < count)
>> +               return -EINVAL;
>> +
>> +       count -= used;
>> +
>> +       /* TODO fix later */
>> +       if (count > 32)
>> +               count = 32;
>> +
>> +       for (int i = 0; i < count; i++)
>> +               objects[i] = NULL;
>> +       allocated = kmem_cache_alloc_bulk(s, gfp, count, &objects[0]);
>> +
>> +       for (int i = 0; i < count; i++) {
>> +               if (objects[i]) {
>> +                       kmem_cache_free(s, objects[i]);
>> +               }
>> +       }
> 
> nit: why not
> 
> for (int i = 0; i < allocated; i++) {
>     kmem_cache_free(s, objects[i]);
> }
> 
> and skip objects[i] = NULL
> 

This is rewritten significantly in v3 so I think it doesn't apply anymore.

>> +       return allocated;
>> +}
> 
> And a question:
> Does SLUB still need to maintain per-cpu partial slab lists even when
> an opt-in percpu array is used?

Good question :) didn't investigate it yet. We can, once this settles.

Thanks.

Hyeonggon Yoo Nov. 29, 2023, 12:46 a.m. UTC | #3
On Wed, Nov 29, 2023 at 2:37 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 8/21/23 16:57, Hyeonggon Yoo wrote:
> > Hi,
> >
> > On Fri, Aug 11, 2023 at 1:36 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> Oops, looks like I forgot to reply, sorry (preparing v3 now).

It's fine, you were busy removing SLAB :)
thanks for replying.

> >
> >>  /*
> >>   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
> >>   * have the fastpath folded into their functions. So no function call
> >> @@ -3465,7 +3564,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
> >>         if (unlikely(object))
> >>                 goto out;
> >>
> >> -       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> >> +       if (s->cpu_array)
> >> +               object = alloc_from_pca(s);
> >> +
> >> +       if (!object)
> >> +               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> >>
> >>         maybe_wipe_obj_freeptr(s, object);
> >>         init = slab_want_init_on_alloc(gfpflags, s);
> >> @@ -3715,6 +3818,34 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >>         discard_slab(s, slab);
> >>  }
> >
> >>  #ifndef CONFIG_SLUB_TINY
> >>  /*
> >>   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> >> @@ -3740,6 +3871,11 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
> >>         unsigned long tid;
> >>         void **freelist;
> >>
> >> +       if (s->cpu_array && cnt == 1) {
> >> +               if (free_to_pca(s, head))
> >> +                       return;
> >> +       }
> >> +
> >>  redo:
> >>         /*
> >>          * Determine the currently cpus per cpu slab.
> >> @@ -3793,6 +3929,11 @@ static void do_slab_free(struct kmem_cache *s,
> >>  {
> >>         void *tail_obj = tail ? : head;
> >>
> >> +       if (s->cpu_array && cnt == 1) {
> >> +               if (free_to_pca(s, head))
> >> +                       return;
> >> +       }
> >> +
> >>         __slab_free(s, slab, head, tail_obj, cnt, addr);
> >>  }
> >>  #endif /* CONFIG_SLUB_TINY */
> >
> > Is this functionality needed for SLUB_TINY?
>
> Due to the prefill semantics, I think it has to be there even in TINY, or we
> risk running out of memory reserves. Also later I want to investigate
> extending this approach for supporting allocations in very constrained
> contexts (NMI) so e.g. bpf doesn't have to reimplement the slab allocator,
> and that would also not be good to limit to !SLUB_TINY.

I've got the point, thanks for the explanation!

> >> @@ -4060,6 +4201,45 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> >>  }
> >>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
> >>
> >> +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
> >> +               gfp_t gfp)
> >> +{
> >> +       struct slub_percpu_array *pca;
> >> +       void *objects[32];
> >> +       unsigned int used;
> >> +       unsigned int allocated;
> >> +
> >> +       if (!s->cpu_array)
> >> +               return -EINVAL;
> >> +
> >> +       /* racy but we don't care */
> >> +       pca = raw_cpu_ptr(s->cpu_array);
> >> +
> >> +       used = READ_ONCE(pca->used);
> >
> > Hmm for the prefill to be meaningful,
> > remote allocation should be possible, right?
>
> Remote in what sense?

TL;DR: what I wanted to ask was:
"How does pre-filling a number of objects work when the pre-filled objects
are not shared between CPUs?"

IIUC the prefill opportunistically fills the array, so (hopefully) one can
expect there are some objects in it.

Let's say CPU X calls kmem_cache_prefill_percpu_array(32) and all 32
objects are filled into CPU X's array.
But if CPU Y can't allocate from CPU X's array (which I referred to as
"remote allocation"), the semantics differ from the maple tree's
perspective, because preallocated objects were shared between CPUs
before, but now they're not?

Thanks!

--
Hyeonggon

Vlastimil Babka Nov. 29, 2023, 1:25 p.m. UTC | #4
On 11/29/23 01:46, Hyeonggon Yoo wrote:
> On Wed, Nov 29, 2023 at 2:37 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> 
>> >> @@ -4060,6 +4201,45 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>> >>  }
>> >>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>> >>
>> >> +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
>> >> +               gfp_t gfp)
>> >> +{
>> >> +       struct slub_percpu_array *pca;
>> >> +       void *objects[32];
>> >> +       unsigned int used;
>> >> +       unsigned int allocated;
>> >> +
>> >> +       if (!s->cpu_array)
>> >> +               return -EINVAL;
>> >> +
>> >> +       /* racy but we don't care */
>> >> +       pca = raw_cpu_ptr(s->cpu_array);
>> >> +
>> >> +       used = READ_ONCE(pca->used);
>> >
>> > Hmm for the prefill to be meaningful,
>> > remote allocation should be possible, right?
>>
>> Remote in what sense?
> 
> TL;DR: what I wanted to ask was:
> "How does pre-filling a number of objects work when the pre-filled objects
> are not shared between CPUs?"
> 
> IIUC the prefill opportunistically fills the array, so (hopefully) one can
> expect there are some objects in it.

Yes.

> Let's say CPU X calls kmem_cache_prefill_percpu_array(32) and all 32
> objects are filled into CPU X's array.
> But if CPU Y can't allocate from CPU X's array (which I referred to as
> "remote allocation"), the semantics differ from the maple tree's
> perspective, because preallocated objects were shared between CPUs
> before, but now they're not?

The assumption is that the operation will prefill on CPU X and then consume
it also on X, because shortly after prefill it will enter some restricted
context (i.e. spin_lock_irqsave or whatnot) that prevents it from migrating.
That's not guaranteed of course, but migration in a bad moment and
subsequent depleted array should be rare enough that we'll just handle it in
the slow paths, and if it results in dipping into reserves, it won't be too
disruptive.

> Thanks!
> 
> --
> Hyeonggon

Patch

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 848c7c82ad5a..f6c91cbc1544 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -196,6 +196,8 @@  struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 void kmem_cache_destroy(struct kmem_cache *s);
 int kmem_cache_shrink(struct kmem_cache *s);
 
+int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count);
+
 /*
  * Please use this macro to create slab caches. Simply specify the
  * name of the structure and maybe some flags that are listed above.
@@ -494,6 +496,8 @@  void kmem_cache_free(struct kmem_cache *s, void *objp);
 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
 
+int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, gfp_t gfp);
+
 static __always_inline void kfree_bulk(size_t size, void **p)
 {
 	kmem_cache_free_bulk(NULL, size, p);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index deb90cf4bffb..c85434668419 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -13,8 +13,10 @@ 
 #include <linux/local_lock.h>
 
 enum stat_item {
+	ALLOC_PERCPU_CACHE,	/* Allocation from percpu array cache */
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
+	FREE_PERCPU_CACHE,	/* Free to percpu array cache */
 	FREE_FASTPATH,		/* Free to cpu slab */
 	FREE_SLOWPATH,		/* Freeing not to cpu slab */
 	FREE_FROZEN,		/* Freeing to frozen slab */
@@ -66,6 +68,13 @@  struct kmem_cache_cpu {
 };
 #endif /* CONFIG_SLUB_TINY */
 
+struct slub_percpu_array {
+	spinlock_t lock;
+	unsigned int count;
+	unsigned int used;
+	void * objects[];
+};
+
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 #define slub_percpu_partial(c)		((c)->partial)
 
@@ -99,6 +108,7 @@  struct kmem_cache {
 #ifndef CONFIG_SLUB_TINY
 	struct kmem_cache_cpu __percpu *cpu_slab;
 #endif
+	struct slub_percpu_array __percpu *cpu_array;
 	/* Used for retrieving partial slabs, etc. */
 	slab_flags_t flags;
 	unsigned long min_partial;
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..84f4dff70d39 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -237,6 +237,7 @@  choice
 config SLAB_DEPRECATED
 	bool "SLAB (DEPRECATED)"
 	depends on !PREEMPT_RT
+	depends on BROKEN
 	help
 	  Deprecated and scheduled for removal in a few cycles. Replaced by
 	  SLUB.
diff --git a/mm/slub.c b/mm/slub.c
index a9437d48840c..f41c69bac07d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -188,6 +188,79 @@  do {					\
 #define USE_LOCKLESS_FAST_PATH()	(false)
 #endif
 
+/* copy/pasted  from mm/page_alloc.c */
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()		preempt_disable()
+#define pcpu_task_unpin()	preempt_enable()
+#else
+#define pcpu_task_pin()		migrate_disable()
+#define pcpu_task_unpin()	migrate_enable()
+#endif
+
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock(&_ret->member);					\
+	_ret;								\
+})
+
+#define pcpu_spin_trylock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	if (!spin_trylock(&_ret->member)) {				\
+		pcpu_task_unpin();					\
+		_ret = NULL;						\
+	}								\
+	_ret;								\
+})
+
+#define pcpu_spin_unlock(member, ptr)					\
+({									\
+	spin_unlock(&ptr->member);					\
+	pcpu_task_unpin();						\
+})
+
+/* struct slub_percpu_array specific helpers. */
+#define pca_spin_lock(ptr)						\
+	pcpu_spin_lock(struct slub_percpu_array, lock, ptr)
+
+#define pca_spin_trylock(ptr)						\
+	pcpu_spin_trylock(struct slub_percpu_array, lock, ptr)
+
+#define pca_spin_unlock(ptr)						\
+	pcpu_spin_unlock(lock, ptr)
+
 #ifndef CONFIG_SLUB_TINY
 #define __fastpath_inline __always_inline
 #else
@@ -3440,6 +3513,32 @@  static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
 			0, sizeof(void *));
 }
 
+static inline void *alloc_from_pca(struct kmem_cache *s)
+{
+	unsigned long __maybe_unused UP_flags;
+	struct slub_percpu_array *pca;
+	void *object = NULL;
+
+	pcp_trylock_prepare(UP_flags);
+	pca = pca_spin_trylock(s->cpu_array);
+
+	if (unlikely(!pca))
+		goto failed;
+
+	if (likely(pca->used > 0)) {
+		object = pca->objects[--pca->used];
+		pca_spin_unlock(pca);
+		pcp_trylock_finish(UP_flags);
+		stat(s, ALLOC_PERCPU_CACHE);
+		return object;
+	}
+	pca_spin_unlock(pca);
+
+failed:
+	pcp_trylock_finish(UP_flags);
+	return NULL;
+}
+
 /*
  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  * have the fastpath folded into their functions. So no function call
@@ -3465,7 +3564,11 @@  static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
 	if (unlikely(object))
 		goto out;
 
-	object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+	if (s->cpu_array)
+		object = alloc_from_pca(s);
+
+	if (!object)
+		object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
 
 	maybe_wipe_obj_freeptr(s, object);
 	init = slab_want_init_on_alloc(gfpflags, s);
@@ -3715,6 +3818,34 @@  static void __slab_free(struct kmem_cache *s, struct slab *slab,
 	discard_slab(s, slab);
 }
 
+static inline bool free_to_pca(struct kmem_cache *s, void *object)
+{
+	unsigned long __maybe_unused UP_flags;
+	struct slub_percpu_array *pca;
+	bool ret = false;
+
+	pcp_trylock_prepare(UP_flags);
+	pca = pca_spin_trylock(s->cpu_array);
+
+	if (!pca) {
+		pcp_trylock_finish(UP_flags);
+		return false;
+	}
+
+	if (pca->used < pca->count) {
+		pca->objects[pca->used++] = object;
+		ret = true;
+	}
+
+	pca_spin_unlock(pca);
+	pcp_trylock_finish(UP_flags);
+
+	if (ret)
+		stat(s, FREE_PERCPU_CACHE);
+
+	return ret;
+}
+
 #ifndef CONFIG_SLUB_TINY
 /*
  * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
@@ -3740,6 +3871,11 @@  static __always_inline void do_slab_free(struct kmem_cache *s,
 	unsigned long tid;
 	void **freelist;
 
+	if (s->cpu_array && cnt == 1) {
+		if (free_to_pca(s, head))
+			return;
+	}
+
 redo:
 	/*
 	 * Determine the currently cpus per cpu slab.
@@ -3793,6 +3929,11 @@  static void do_slab_free(struct kmem_cache *s,
 {
 	void *tail_obj = tail ? : head;
 
+	if (s->cpu_array && cnt == 1) {
+		if (free_to_pca(s, head))
+			return;
+	}
+
 	__slab_free(s, slab, head, tail_obj, cnt, addr);
 }
 #endif /* CONFIG_SLUB_TINY */
@@ -4060,6 +4201,45 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 }
 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
 
+int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
+		gfp_t gfp)
+{
+	struct slub_percpu_array *pca;
+	void *objects[32];
+	unsigned int used;
+	unsigned int allocated;
+
+	if (!s->cpu_array)
+		return -EINVAL;
+
+	/* racy but we don't care */
+	pca = raw_cpu_ptr(s->cpu_array);
+
+	used = READ_ONCE(pca->used);
+
+	if (used >= count)
+		return 0;
+
+	if (pca->count < count)
+		return -EINVAL;
+
+	count -= used;
+
+	/* TODO fix later */
+	if (count > 32)
+		count = 32;
+
+	for (int i = 0; i < count; i++)
+		objects[i] = NULL;
+	allocated = kmem_cache_alloc_bulk(s, gfp, count, &objects[0]);
+
+	for (int i = 0; i < count; i++) {
+		if (objects[i]) {
+			kmem_cache_free(s, objects[i]);
+		}
+	}
+	return allocated;
+}
 
 /*
  * Object placement in a slab is made very easy because we always start at
@@ -5131,6 +5311,30 @@  int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
 	return 0;
 }
 
+int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count)
+{
+	int cpu;
+
+	if (WARN_ON_ONCE(!(s->flags & SLAB_NO_MERGE)))
+		return -EINVAL;
+
+	s->cpu_array = __alloc_percpu(struct_size(s->cpu_array, objects, count),
+					sizeof(void *));
+
+	if (!s->cpu_array)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct slub_percpu_array *pca = per_cpu_ptr(s->cpu_array, cpu);
+
+		spin_lock_init(&pca->lock);
+		pca->count = count;
+		pca->used = 0;
+	}
+
+	return 0;
+}
+
 #ifdef SLAB_SUPPORTS_SYSFS
 static int count_inuse(struct slab *slab)
 {
@@ -5908,8 +6112,10 @@  static ssize_t text##_store(struct kmem_cache *s,		\
 }								\
 SLAB_ATTR(text);						\
 
+STAT_ATTR(ALLOC_PERCPU_CACHE, alloc_cpu_cache);
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_PERCPU_CACHE, free_cpu_cache);
 STAT_ATTR(FREE_FASTPATH, free_fastpath);
 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
 STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -5995,8 +6201,10 @@  static struct attribute *slab_attrs[] = {
 	&remote_node_defrag_ratio_attr.attr,
 #endif
 #ifdef CONFIG_SLUB_STATS
+	&alloc_cpu_cache_attr.attr,
 	&alloc_fastpath_attr.attr,
 	&alloc_slowpath_attr.attr,
+	&free_cpu_cache_attr.attr,
 	&free_fastpath_attr.attr,
 	&free_slowpath_attr.attr,
 	&free_frozen_attr.attr,