@@ -10,6 +10,7 @@
#include <linux/kfence.h>
#include <linux/kobject.h>
#include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
enum stat_item {
ALLOC_FASTPATH, /* Allocation from cpu slab */
@@ -40,6 +41,10 @@ enum stat_item {
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
NR_SLUB_STAT_ITEMS };
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
struct kmem_cache_cpu {
void **freelist; /* Pointer to next available object */
unsigned long tid; /* Globally unique transaction id */
@@ -47,6 +52,7 @@ struct kmem_cache_cpu {
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct page *partial; /* Partially allocated frozen slabs */
#endif
+ local_lock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
@@ -46,13 +46,21 @@
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
- * 2. node->list_lock
- * 3. slab_lock(page) (Only on some arches and for debugging)
+ * 2. node->list_lock (Spinlock)
+ * 3. kmem_cache->cpu_slab->lock (Local lock)
+ * 4. slab_lock(page) (Only on some arches or for debugging)
+ * 5. object_map_lock (Only for debugging)
*
* slab_mutex
*
* The role of the slab_mutex is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
+ * Also synchronizes memory hotplug callbacks.
+ *
+ * slab_lock
+ *
+ * The slab_lock is a wrapper around the page lock, thus it is a bit
+ * spinlock.
*
* The slab_lock is only used for debugging and on arches that do not
* have the ability to do a cmpxchg_double. It only protects:
@@ -61,6 +69,8 @@
* C. page->objects -> Number of objects in page
* D. page->frozen -> frozen state
*
+ * Frozen slabs
+ *
* If a slab is frozen then it is exempt from list management. It is not
* on any list except per cpu partial list. The processor that froze the
* slab is the one who can perform list operations on the page. Other
@@ -68,6 +78,8 @@
* froze the slab is the only one that can retrieve the objects from the
* page's freelist.
*
+ * list_lock
+ *
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
* removed from the lists nor make the number of partial slabs be modified.
@@ -79,10 +91,36 @@
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * cpu_slab->lock local lock
+ *
+ * This locks protect slowpath manipulation of all kmem_cache_cpu fields
+ * except the stat counters. This is a percpu structure manipulated only by
+ * the local cpu, so the lock protects against being preempted or interrupted
+ * by an irq. Fast path operations rely on lockless operations instead.
+ * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ * prevent the lockless operations), so fastpath operations also need to take
+ * the lock and are no longer lockless.
+ *
+ * lockless fastpaths
+ *
+ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ * are fully lockless when satisfied from the percpu slab (and when
+ * cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ * They also don't disable preemption or migration or irqs. They rely on
+ * the transaction id (tid) field to detect being preempted or moved to
+ * another cpu.
+ *
+ * irq, preemption, migration considerations
+ *
+ * Interrupts are disabled as part of list_lock or local_lock operations, or
+ * around the slab_lock operation, in order to make the slab allocator safe
+ * to use in the context of an irq.
+ *
+ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ * doesn't have to be revalidated in each section protected by the local lock.
*
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure(
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
+ struct kmem_cache_cpu *c;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(s->cpu_slab, cpu);
+ local_lock_init(&c->lock);
+ c->tid = init_tid(cpu);
+ }
}
/*
@@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kme
struct page *partial_page;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
partial_page = this_cpu_read(s->cpu_slab->partial);
this_cpu_write(s->cpu_slab->partial, NULL);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
if (partial_page)
__unfreeze_partials(s, partial_page);
@@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_
int pages = 0;
int pobjects = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
oldpage = this_cpu_read(s->cpu_slab->partial);
@@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_
this_cpu_write(s->cpu_slab->partial, page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
if (page_to_unfreeze) {
__unfreeze_partials(s, page_to_unfreeze);
@@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kme
struct page *page;
void *freelist;
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
page = c->page;
freelist = c->freelist;
@@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kme
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
if (page) {
deactivate_slab(s, page, freelist);
@@ -2780,8 +2822,6 @@ static inline bool pfmemalloc_match_unsa
* The page is still frozen if the return value is not NULL.
*
* If this function returns NULL then the page has been unfrozen.
- *
- * This function must be called with interrupt disabled.
*/
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
@@ -2789,6 +2829,8 @@ static inline void *get_freelist(struct
unsigned long counters;
void *freelist;
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
do {
freelist = page->freelist;
counters = page->counters;
@@ -2873,9 +2915,9 @@ redo:
goto deactivate_slab;
/* must check again c->page in case we got preempted and it changed */
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(page != c->page)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_page;
}
freelist = c->freelist;
@@ -2886,7 +2928,7 @@ redo:
if (!freelist) {
c->page = NULL;
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2895,7 +2937,7 @@ redo:
load_freelist:
- lockdep_assert_irqs_disabled();
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
/*
* freelist is pointing to the list of objects to be used.
@@ -2905,39 +2947,39 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;
deactivate_slab:
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
if (page != c->page) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_page;
}
freelist = c->freelist;
c->page = NULL;
c->freelist = NULL;
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
deactivate_slab(s, page, freelist);
new_slab:
if (slub_percpu_partial(c)) {
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->page)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_page;
}
if (unlikely(!slub_percpu_partial(c))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
/* we were preempted and partial list got empty */
goto new_objects;
}
page = c->page = slub_percpu_partial(c);
slub_set_percpu_partial(c, page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}
@@ -2990,7 +3032,7 @@ check_new_page:
retry_load_page:
- local_irq_save(flags);
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->page)) {
void *flush_freelist = c->freelist;
struct page *flush_page = c->page;
@@ -2999,7 +3041,7 @@ retry_load_page:
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
deactivate_slab(s, flush_page, flush_freelist);
@@ -3118,7 +3160,15 @@ redo:
object = c->freelist;
page = c->page;
- if (unlikely(!object || !page || !node_match(page, node))) {
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if a
+ * slowpath has taken the local_lock_irqsave(), it is not protected
+ * against a fast path operation in an irq handler. So we need to take
+ * the slow path which uses local_lock. It is still relatively fast if
+ * there is a suitable cpu freelist.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+ unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3378,6 +3428,7 @@ redo:
barrier();
if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
void **freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3390,6 +3441,31 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
+#else /* CONFIG_PREEMPT_RT */
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if
+ * a slowpath has taken the local_lock_irqsave(), it is not
+ * protected against a fast path operation in an irq handler. So
+ * we need to take the local_lock. We shouldn't simply defer to
+ * __slab_free() as that wouldn't use the cpu freelist at all.
+ */
+ void **freelist;
+
+ local_lock(&s->cpu_slab->lock);
+ c = this_cpu_ptr(s->cpu_slab);
+ if (unlikely(page != c->page)) {
+ local_unlock(&s->cpu_slab->lock);
+ goto redo;
+ }
+ tid = c->tid;
+ freelist = c->freelist;
+
+ set_freepointer(s, tail_obj, freelist);
+ c->freelist = head;
+ c->tid = next_tid(tid);
+
+ local_unlock(&s->cpu_slab->lock);
+#endif
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
* handlers invoking normal fastpath.
*/
c = slub_get_cpu_ptr(s->cpu_slab);
- local_irq_disable();
+ local_lock_irq(&s->cpu_slab->lock);
for (i = 0; i < size; i++) {
void *object = kfence_alloc(s, s->object_size, flags);
@@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
*/
c->tid = next_tid(c->tid);
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
/*
* Invoking slow path likely have side-effect
@@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
- local_irq_disable();
+ local_lock_irq(&s->cpu_slab->lock);
continue; /* goto for-loop */
}
@@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
slub_put_cpu_ptr(s->cpu_slab);
/*