diff mbox series

[RFC,v2,33/34] mm, slub: use migrate_disable() on PREEMPT_RT

Message ID 20210609113903.1421-34-vbabka@suse.cz (mailing list archive)
State New
Headers show
Series SLUB: reduce irq disabled scope and make it RT compatible | expand

Commit Message

Vlastimil Babka June 9, 2021, 11:39 a.m. UTC
We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize
the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with
the list_lock spinlock. We can use migrate_disable() instead, but that
increases overhead on !PREEMPT_RT as it's an unconditional function call even
though it's ultimately a migrate_disable() there.

In order to get the best available mechanism on both PREEMPT_RT and
!PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr()
wrappers and use them.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

Comments

Vlastimil Babka June 14, 2021, 11:07 a.m. UTC | #1
On 6/9/21 1:39 PM, Vlastimil Babka wrote:
> We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize
> the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with
> the list_lock spinlock. We can use migrate_disable() instead, but that
> increases overhead on !PREEMPT_RT as it's an unconditional function call even
> though it's ultimately a migrate_disable() there.
> 
> In order to get the best available mechanism on both PREEMPT_RT and
> !PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr()
> wrappers and use them.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 12e966f07f19..caa206213e72 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -115,6 +115,26 @@
>   * 			the fast path and disables lockless freelists.
>   */
>  
> +/*
> + * We could simply use migrate_disable()/enable() but as long as it's a
> + * function call even on !PREEMPT_RT, use inline preempt_disable() there.
> + */
> +#ifdef CONFIG_PREEMPT_RT
> +#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
> +#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)

After Mel's report and bisect pointing to this patch, I realized I got the
#ifdef wrong and it should be #ifnded

Fixed patch follows:

----8<----
From e79e166414b8c22e3bcf73a9383bafe802fa64d2 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 21 May 2021 14:03:23 +0200
Subject: [PATCH] mm, slub: use migrate_disable() on PREEMPT_RT

We currently use preempt_disable() (directly or via get_cpu_ptr()) to stabilize
the pointer to kmem_cache_cpu. On PREEMPT_RT this would be incompatible with
the list_lock spinlock. We can use migrate_disable() instead, but that
increases overhead on !PREEMPT_RT as it's an unconditional function call even
though it's ultimately a migrate_disable() there.

In order to get the best available mechanism on both PREEMPT_RT and
!PREEMPT_RT, introduce private slub_get_cpu_ptr() and slub_put_cpu_ptr()
wrappers and use them.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 12e966f07f19..f0359b0c8154 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -115,6 +115,26 @@
  * 			the fast path and disables lockless freelists.
  */
 
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
+#else
+#define slub_get_cpu_ptr(var)		\
+({					\
+	migrate_disable();		\
+	this_cpu_ptr(var);		\
+})
+#define slub_put_cpu_ptr(var)		\
+do {					\
+	(void)(var);			\
+	migrate_enable();		\
+} while (0)
+#endif
+
 #ifdef CONFIG_SLUB_DEBUG
 #ifdef CONFIG_SLUB_DEBUG_ON
 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
@@ -2419,7 +2439,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages;
 	int pobjects;
 
-	preempt_disable();
+	slub_get_cpu_ptr(s->cpu_slab);
 	do {
 		pages = 0;
 		pobjects = 0;
@@ -2450,7 +2470,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
 	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
 								!= oldpage);
-	preempt_enable();
+	slub_put_cpu_ptr(s->cpu_slab);
 #endif	/* CONFIG_SLUB_CPU_PARTIAL */
 }
 
@@ -2759,7 +2779,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	if (unlikely(!pfmemalloc_match(page, gfpflags)))
 		goto deactivate_slab;
 
-	/* must check again c->page in case IRQ handler changed it */
+	/* must check again c->page in case we got preempted and it changed */
 	local_irq_save(flags);
 	if (unlikely(page != c->page)) {
 		local_irq_restore(flags);
@@ -2818,7 +2838,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		}
 		if (unlikely(!slub_percpu_partial(c))) {
 			local_irq_restore(flags);
-			goto new_objects; /* stolen by an IRQ handler */
+			/* we were preempted and partial list got empty */
+			goto new_objects;
 		}
 
 		page = c->page = slub_percpu_partial(c);
@@ -2834,9 +2855,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	if (freelist)
 		goto check_new_page;
 
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 	page = new_slab(s, gfpflags, node);
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 
 	if (unlikely(!page)) {
 		slab_out_of_memory(s, gfpflags, node);
@@ -2919,12 +2940,12 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	 * cpu before disabling preemption. Need to reload cpu area
 	 * pointer.
 	 */
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 #endif
 
 	p = ___slab_alloc(s, gfpflags, node, addr, c);
 #ifdef CONFIG_PREEMPTION
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 #endif
 	return p;
 }
@@ -3445,7 +3466,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	 * IRQs, which protects against PREEMPT and interrupts
 	 * handlers invoking normal fastpath.
 	 */
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 	local_irq_disable();
 
 	for (i = 0; i < size; i++) {
@@ -3491,7 +3512,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	}
 	c->tid = next_tid(c->tid);
 	local_irq_enable();
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 
 	/*
 	 * memcg and kmem_cache debug support and memory initialization.
Sebastian Andrzej Siewior June 14, 2021, 11:16 a.m. UTC | #2
On 2021-06-14 13:07:14 [+0200], Vlastimil Babka wrote:
> > +#ifdef CONFIG_PREEMPT_RT
> > +#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
> > +#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
> 
> After Mel's report and bisect pointing to this patch, I realized I got the
> #ifdef wrong and it should be #ifnded

So if you got the ifdef wrong (and kept everything as-is) then you
tested the RT version on !RT. migrate_disable() behaves on !RT as on RT.
As per changelog you don't use migrate_disable() unconditionally because
it increases the overhead on !RT.
I haven't looked at the series and I have just this tiny question: why
did migrate_disable() crash for Mel on !RT and why do you expect that it
does not happen on PREEMPT_RT?

Sebastian
Vlastimil Babka June 14, 2021, 11:33 a.m. UTC | #3
On 6/14/21 1:16 PM, Sebastian Andrzej Siewior wrote:
> On 2021-06-14 13:07:14 [+0200], Vlastimil Babka wrote:
>> > +#ifdef CONFIG_PREEMPT_RT
>> > +#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
>> > +#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
>> 
>> After Mel's report and bisect pointing to this patch, I realized I got the
>> #ifdef wrong and it should be #ifnded
> 
> So if you got the ifdef wrong (and kept everything as-is) then you
> tested the RT version on !RT. migrate_disable() behaves on !RT as on RT.
> As per changelog you don't use migrate_disable() unconditionally because
> it increases the overhead on !RT.

Correct.

> I haven't looked at the series and I have just this tiny question: why
> did migrate_disable() crash for Mel on !RT and why do you expect that it
> does not happen on PREEMPT_RT?

Right, so it's because __slab_alloc() has this optimization to avoid re-reading
'c' in case there is no preemption enabled at all (or it's just voluntary).

#ifdef CONFIG_PREEMPTION
        /*
         * We may have been preempted and rescheduled on a different
         * cpu before disabling preemption. Need to reload cpu area
         * pointer.
         */
        c = slub_get_cpu_ptr(s->cpu_slab);
#endif

Mel's config has CONFIG_PREEMPT_VOLUNTARY, which means CONFIG_PREEMPTION is not
enabled.

But then later in ___slab_alloc() we have

        slub_put_cpu_ptr(s->cpu_slab);
        page = new_slab(s, gfpflags, node);
        c = slub_get_cpu_ptr(s->cpu_slab);

And this is not hidden under CONFIG_PREEMPTION, so with the #ifdef bug the
slub_put_cpu_ptr did a migrate_enable() with Mel's config, without prior
migrate_disable().

If there wasn't the #ifdef PREEMPT_RT bug:
- this slub_put_cpu_ptr() would translate to put_cpu_ptr() thus
preempt_enable(), which on this config is just a barrier(), so it doesn't matter
that there was no matching preempt_disable() before.
- with PREEMPT_RT the CONFIG_PREEMPTION would be enabled, so the
slub_get_cpu_ptr() would do a migrate_disable() and there's no imbalance.

But now that I dig into this in detail, I can see there might be another
instance of this imbalance bug, if CONFIG_PREEMPTION is disabled, but
CONFIG_PREEMPT_COUNT is enabled, which seems to be possible in some debug
scenarios. Because then preempt_disable()/preempt_enable() still manipulate the
preempt counter and compiling them out in __slab_alloc() will cause imbalance.

So I think the guards in __slab_alloc() should be using CONFIG_PREEMPT_COUNT
instead of CONFIG_PREEMPT to be correct on all configs. I dare not remove them
completely :)
Vlastimil Babka June 14, 2021, 12:54 p.m. UTC | #4
On 6/14/21 1:33 PM, Vlastimil Babka wrote:
> On 6/14/21 1:16 PM, Sebastian Andrzej Siewior wrote:
> But now that I dig into this in detail, I can see there might be another
> instance of this imbalance bug, if CONFIG_PREEMPTION is disabled, but
> CONFIG_PREEMPT_COUNT is enabled, which seems to be possible in some debug
> scenarios. Because then preempt_disable()/preempt_enable() still manipulate the
> preempt counter and compiling them out in __slab_alloc() will cause imbalance.
> 
> So I think the guards in __slab_alloc() should be using CONFIG_PREEMPT_COUNT
> instead of CONFIG_PREEMPT to be correct on all configs. I dare not remove them
> completely :)

Yep, it's possible to get such scenario with PREEMPT_VOLUNTARY plus
PROVE_LOCKING - CONFIG_PREEMPTION is disabled but CONFIG_PREEMPT_COUNT is
enabled, and RCU then complains in the page allocator due to the unpaired
preempt_disable() before entering it.

I've pushed a new branch revision with this fixed:

https://git.kernel.org/pub/scm/linux/kernel/git/vbabka/linux.git/log/?h=slub-local-lock-v2r3
Sebastian Andrzej Siewior June 14, 2021, 2:01 p.m. UTC | #5
On 2021-06-14 13:33:43 [+0200], Vlastimil Babka wrote:
> On 6/14/21 1:16 PM, Sebastian Andrzej Siewior wrote:
> > I haven't looked at the series and I have just this tiny question: why
> > did migrate_disable() crash for Mel on !RT and why do you expect that it
> > does not happen on PREEMPT_RT?
> 
> Right, so it's because __slab_alloc() has this optimization to avoid re-reading
> 'c' in case there is no preemption enabled at all (or it's just voluntary).
> 
> #ifdef CONFIG_PREEMPTION
>         /*
>          * We may have been preempted and rescheduled on a different
>          * cpu before disabling preemption. Need to reload cpu area
>          * pointer.
>          */
>         c = slub_get_cpu_ptr(s->cpu_slab);
> #endif
> 
> Mel's config has CONFIG_PREEMPT_VOLUNTARY, which means CONFIG_PREEMPTION is not
> enabled.
> 
> But then later in ___slab_alloc() we have
> 
>         slub_put_cpu_ptr(s->cpu_slab);
>         page = new_slab(s, gfpflags, node);
>         c = slub_get_cpu_ptr(s->cpu_slab);
> 
> And this is not hidden under CONFIG_PREEMPTION, so with the #ifdef bug the
> slub_put_cpu_ptr did a migrate_enable() with Mel's config, without prior
> migrate_disable().

Ach, right. The update to this field is done with cmpxchg-double (if I
remember correctly) but I don't remember if this is also re-entry safe. 

> If there wasn't the #ifdef PREEMPT_RT bug:
> - this slub_put_cpu_ptr() would translate to put_cpu_ptr() thus
> preempt_enable(), which on this config is just a barrier(), so it doesn't matter
> that there was no matching preempt_disable() before.
> - with PREEMPT_RT the CONFIG_PREEMPTION would be enabled, so the
> slub_get_cpu_ptr() would do a migrate_disable() and there's no imbalance.
> 
> But now that I dig into this in detail, I can see there might be another
> instance of this imbalance bug, if CONFIG_PREEMPTION is disabled, but
> CONFIG_PREEMPT_COUNT is enabled, which seems to be possible in some debug
> scenarios. Because then preempt_disable()/preempt_enable() still manipulate the
> preempt counter and compiling them out in __slab_alloc() will cause imbalance.
> 
> So I think the guards in __slab_alloc() should be using CONFIG_PREEMPT_COUNT
> instead of CONFIG_PREEMPT to be correct on all configs. I dare not remove them
> completely :)

:)

Sebastian
diff mbox series

Patch

diff --git a/mm/slub.c b/mm/slub.c
index 12e966f07f19..caa206213e72 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -115,6 +115,26 @@ 
  * 			the fast path and disables lockless freelists.
  */
 
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ */
+#ifdef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
+#else
+#define slub_get_cpu_ptr(var)		\
+({					\
+	migrate_disable();		\
+	this_cpu_ptr(var);		\
+})
+#define slub_put_cpu_ptr(var)		\
+do {					\
+	(void)(var);			\
+	migrate_enable();		\
+} while (0)
+#endif
+
 #ifdef CONFIG_SLUB_DEBUG
 #ifdef CONFIG_SLUB_DEBUG_ON
 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
@@ -2419,7 +2439,7 @@  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages;
 	int pobjects;
 
-	preempt_disable();
+	slub_get_cpu_ptr(s->cpu_slab);
 	do {
 		pages = 0;
 		pobjects = 0;
@@ -2450,7 +2470,7 @@  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
 	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
 								!= oldpage);
-	preempt_enable();
+	slub_put_cpu_ptr(s->cpu_slab);
 #endif	/* CONFIG_SLUB_CPU_PARTIAL */
 }
 
@@ -2759,7 +2779,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	if (unlikely(!pfmemalloc_match(page, gfpflags)))
 		goto deactivate_slab;
 
-	/* must check again c->page in case IRQ handler changed it */
+	/* must check again c->page in case we got preempted and it changed */
 	local_irq_save(flags);
 	if (unlikely(page != c->page)) {
 		local_irq_restore(flags);
@@ -2818,7 +2838,8 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		}
 		if (unlikely(!slub_percpu_partial(c))) {
 			local_irq_restore(flags);
-			goto new_objects; /* stolen by an IRQ handler */
+			/* we were preempted and partial list got empty */
+			goto new_objects;
 		}
 
 		page = c->page = slub_percpu_partial(c);
@@ -2834,9 +2855,9 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	if (freelist)
 		goto check_new_page;
 
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 	page = new_slab(s, gfpflags, node);
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 
 	if (unlikely(!page)) {
 		slab_out_of_memory(s, gfpflags, node);
@@ -2919,12 +2940,12 @@  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	 * cpu before disabling preemption. Need to reload cpu area
 	 * pointer.
 	 */
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 #endif
 
 	p = ___slab_alloc(s, gfpflags, node, addr, c);
 #ifdef CONFIG_PREEMPTION
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 #endif
 	return p;
 }
@@ -3445,7 +3466,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	 * IRQs, which protects against PREEMPT and interrupts
 	 * handlers invoking normal fastpath.
 	 */
-	c = get_cpu_ptr(s->cpu_slab);
+	c = slub_get_cpu_ptr(s->cpu_slab);
 	local_irq_disable();
 
 	for (i = 0; i < size; i++) {
@@ -3491,7 +3512,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	}
 	c->tid = next_tid(c->tid);
 	local_irq_enable();
-	put_cpu_ptr(s->cpu_slab);
+	slub_put_cpu_ptr(s->cpu_slab);
 
 	/*
 	 * memcg and kmem_cache debug support and memory initialization.