diff mbox series

[v5,02/18] mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head

Message ID 20220901221720.1105021-3-joel@joelfernandes.org (mailing list archive)
State New, archived
Headers show
Series Implement call_rcu_lazy() and miscellaneous fixes | expand

Commit Message

Joel Fernandes Sept. 1, 2022, 10:17 p.m. UTC
From: Vlastimil Babka <vbabka@suse.cz>

Joel reports [1] that increasing the rcu_head size for debugging
purposes used to work before struct slab was split from struct page, but
now runs into the various SLAB_MATCH() sanity checks of the layout.

This is because the rcu_head in struct page is in union with large
sub-structures and has space to grow without exceeding their size, while
in struct slab (for SLAB and SLUB) it's in union only with a list_head.

On closer inspection (and after the previous patch) we can put all
fields except slab_cache to a union with rcu_head, as slab_cache is
sufficient for the rcu freeing callbacks to work and the rest can be
overwritten by rcu_head without causing issues.

This is only somewhat complicated by the need to keep SLUB's
freelist+counters aligned for cmpxchg_double. As a result the fields
need to be reordered so that slab_cache is first (after page flags) and
the union with rcu_head follows. For consistency, do that for SLAB as
well, although not necessary there.

As a result, the rcu_head field in struct page and struct slab is no
longer at the same offset, but that doesn't matter as there is no
casting that would rely on that in the slab freeing callbacks, so we can
just drop the respective SLAB_MATCH() check.

Also we need to update the SLAB_MATCH() for compound_head to reflect the
new ordering.

While at it, also add a static_assert to check the alignment needed for
cmpxchg_double so mistakes are found sooner than a runtime GPF.

[1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/

Reported-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab.h | 54 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

Comments

Vlastimil Babka Sept. 2, 2022, 9:26 a.m. UTC | #1
On 9/2/22 00:17, Joel Fernandes (Google) wrote:
> From: Vlastimil Babka <vbabka@suse.cz>
> 
> Joel reports [1] that increasing the rcu_head size for debugging
> purposes used to work before struct slab was split from struct page, but
> now runs into the various SLAB_MATCH() sanity checks of the layout.
> 
> This is because the rcu_head in struct page is in union with large
> sub-structures and has space to grow without exceeding their size, while
> in struct slab (for SLAB and SLUB) it's in union only with a list_head.
> 
> On closer inspection (and after the previous patch) we can put all
> fields except slab_cache to a union with rcu_head, as slab_cache is
> sufficient for the rcu freeing callbacks to work and the rest can be
> overwritten by rcu_head without causing issues.
> 
> This is only somewhat complicated by the need to keep SLUB's
> freelist+counters aligned for cmpxchg_double. As a result the fields
> need to be reordered so that slab_cache is first (after page flags) and
> the union with rcu_head follows. For consistency, do that for SLAB as
> well, although not necessary there.
> 
> As a result, the rcu_head field in struct page and struct slab is no
> longer at the same offset, but that doesn't matter as there is no
> casting that would rely on that in the slab freeing callbacks, so we can
> just drop the respective SLAB_MATCH() check.
> 
> Also we need to update the SLAB_MATCH() for compound_head to reflect the
> new ordering.
> 
> While at it, also add a static_assert to check the alignment needed for
> cmpxchg_double so mistakes are found sooner than a runtime GPF.
> 
> [1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/
> 
> Reported-by: Joel Fernandes <joel@joelfernandes.org>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

I've added patches 01 and 02 to slab tree for -next exposure before Joel's
full series posting, but it should be also ok if rcu tree carries them with
the whole patchset. I can then drop them from slab tree (there are no
dependencies with other stuff there) so we don't introduce duplicite commits
needlessly, just give me a heads up.

> ---
>  mm/slab.h | 54 ++++++++++++++++++++++++++++++++----------------------
>  1 file changed, 32 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 4ec82bec15ec..2c248864ea91 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -11,37 +11,43 @@ struct slab {
>  
>  #if defined(CONFIG_SLAB)
>  
> +	struct kmem_cache *slab_cache;
>  	union {
> -		struct list_head slab_list;
> +		struct {
> +			struct list_head slab_list;
> +			void *freelist;	/* array of free object indexes */
> +			void *s_mem;	/* first object */
> +		};
>  		struct rcu_head rcu_head;
>  	};
> -	struct kmem_cache *slab_cache;
> -	void *freelist;	/* array of free object indexes */
> -	void *s_mem;	/* first object */
>  	unsigned int active;
>  
>  #elif defined(CONFIG_SLUB)
>  
> -	union {
> -		struct list_head slab_list;
> -		struct rcu_head rcu_head;
> -#ifdef CONFIG_SLUB_CPU_PARTIAL
> -		struct {
> -			struct slab *next;
> -			int slabs;	/* Nr of slabs left */
> -		};
> -#endif
> -	};
>  	struct kmem_cache *slab_cache;
> -	/* Double-word boundary */
> -	void *freelist;		/* first free object */
>  	union {
> -		unsigned long counters;
>  		struct {
> -			unsigned inuse:16;
> -			unsigned objects:15;
> -			unsigned frozen:1;
> +			union {
> +				struct list_head slab_list;
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> +				struct {
> +					struct slab *next;
> +					int slabs;	/* Nr of slabs left */
> +				};
> +#endif
> +			};
> +			/* Double-word boundary */
> +			void *freelist;		/* first free object */
> +			union {
> +				unsigned long counters;
> +				struct {
> +					unsigned inuse:16;
> +					unsigned objects:15;
> +					unsigned frozen:1;
> +				};
> +			};
>  		};
> +		struct rcu_head rcu_head;
>  	};
>  	unsigned int __unused;
>  
> @@ -66,9 +72,10 @@ struct slab {
>  #define SLAB_MATCH(pg, sl)						\
>  	static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
>  SLAB_MATCH(flags, __page_flags);
> -SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
>  #ifndef CONFIG_SLOB
> -SLAB_MATCH(rcu_head, rcu_head);
> +SLAB_MATCH(compound_head, slab_cache);	/* Ensure bit 0 is clear */
> +#else
> +SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
>  #endif
>  SLAB_MATCH(_refcount, __page_refcount);
>  #ifdef CONFIG_MEMCG
> @@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data);
>  #endif
>  #undef SLAB_MATCH
>  static_assert(sizeof(struct slab) <= sizeof(struct page));
> +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
> +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 16));
> +#endif
>  
>  /**
>   * folio_slab - Converts from folio to slab.
Vlastimil Babka Sept. 2, 2022, 9:30 a.m. UTC | #2
On 9/2/22 11:26, Vlastimil Babka wrote:
> On 9/2/22 00:17, Joel Fernandes (Google) wrote:
>> From: Vlastimil Babka <vbabka@suse.cz>
>> 
>> Joel reports [1] that increasing the rcu_head size for debugging
>> purposes used to work before struct slab was split from struct page, but
>> now runs into the various SLAB_MATCH() sanity checks of the layout.
>> 
>> This is because the rcu_head in struct page is in union with large
>> sub-structures and has space to grow without exceeding their size, while
>> in struct slab (for SLAB and SLUB) it's in union only with a list_head.
>> 
>> On closer inspection (and after the previous patch) we can put all
>> fields except slab_cache to a union with rcu_head, as slab_cache is
>> sufficient for the rcu freeing callbacks to work and the rest can be
>> overwritten by rcu_head without causing issues.
>> 
>> This is only somewhat complicated by the need to keep SLUB's
>> freelist+counters aligned for cmpxchg_double. As a result the fields
>> need to be reordered so that slab_cache is first (after page flags) and
>> the union with rcu_head follows. For consistency, do that for SLAB as
>> well, although not necessary there.
>> 
>> As a result, the rcu_head field in struct page and struct slab is no
>> longer at the same offset, but that doesn't matter as there is no
>> casting that would rely on that in the slab freeing callbacks, so we can
>> just drop the respective SLAB_MATCH() check.
>> 
>> Also we need to update the SLAB_MATCH() for compound_head to reflect the
>> new ordering.
>> 
>> While at it, also add a static_assert to check the alignment needed for
>> cmpxchg_double so mistakes are found sooner than a runtime GPF.
>> 
>> [1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/
>> 
>> Reported-by: Joel Fernandes <joel@joelfernandes.org>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> I've added patches 01 and 02 to slab tree for -next exposure before Joel's
> full series posting, but it should be also ok if rcu tree carries them with
> the whole patchset. I can then drop them from slab tree (there are no
> dependencies with other stuff there) so we don't introduce duplicite commits
> needlessly, just give me a heads up.

Ah but in that case please apply the reviews from my posting [1]

patch 1:
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>

patch 2
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>

[1] https://lore.kernel.org/all/20220826090912.11292-1-vbabka@suse.cz/
Joel Fernandes Sept. 2, 2022, 3:09 p.m. UTC | #3
On 9/2/2022 5:30 AM, Vlastimil Babka wrote:
> On 9/2/22 11:26, Vlastimil Babka wrote:
>> On 9/2/22 00:17, Joel Fernandes (Google) wrote:
>>> From: Vlastimil Babka <vbabka@suse.cz>
>>>
>>> Joel reports [1] that increasing the rcu_head size for debugging
>>> purposes used to work before struct slab was split from struct page, but
>>> now runs into the various SLAB_MATCH() sanity checks of the layout.
>>>
>>> This is because the rcu_head in struct page is in union with large
>>> sub-structures and has space to grow without exceeding their size, while
>>> in struct slab (for SLAB and SLUB) it's in union only with a list_head.
>>>
>>> On closer inspection (and after the previous patch) we can put all
>>> fields except slab_cache to a union with rcu_head, as slab_cache is
>>> sufficient for the rcu freeing callbacks to work and the rest can be
>>> overwritten by rcu_head without causing issues.
>>>
>>> This is only somewhat complicated by the need to keep SLUB's
>>> freelist+counters aligned for cmpxchg_double. As a result the fields
>>> need to be reordered so that slab_cache is first (after page flags) and
>>> the union with rcu_head follows. For consistency, do that for SLAB as
>>> well, although not necessary there.
>>>
>>> As a result, the rcu_head field in struct page and struct slab is no
>>> longer at the same offset, but that doesn't matter as there is no
>>> casting that would rely on that in the slab freeing callbacks, so we can
>>> just drop the respective SLAB_MATCH() check.
>>>
>>> Also we need to update the SLAB_MATCH() for compound_head to reflect the
>>> new ordering.
>>>
>>> While at it, also add a static_assert to check the alignment needed for
>>> cmpxchg_double so mistakes are found sooner than a runtime GPF.
>>>
>>> [1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/
>>>
>>> Reported-by: Joel Fernandes <joel@joelfernandes.org>
>>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>>
>> I've added patches 01 and 02 to slab tree for -next exposure before Joel's
>> full series posting, but it should be also ok if rcu tree carries them with
>> the whole patchset. I can then drop them from slab tree (there are no
>> dependencies with other stuff there) so we don't introduce duplicite commits
>> needlessly, just give me a heads up.
> 
> Ah but in that case please apply the reviews from my posting [1]
> 
> patch 1:
> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> 
> patch 2
> Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> 
> [1] https://lore.kernel.org/all/20220826090912.11292-1-vbabka@suse.cz/


Sorry for injecting confusion - my main intent with including the mm patches in
this series is to make it easier for other reviewers/testers to backport the
series to their kernels in one shot. Some reviewers expressed interested in
trying out the series.

I think it is best to let the -mm patches in the series go through the slab
tree, as you also have the Acks/Reviews there and will take sure those
dependencies are out of the way.

My lesson here is to be more clear, I could have added some notes for context
below the "---" of those mm patches.

Thanks again for your help,

- Joel
Paul E. McKenney Sept. 3, 2022, 1:53 p.m. UTC | #4
On Fri, Sep 02, 2022 at 11:09:08AM -0400, Joel Fernandes wrote:
> 
> 
> On 9/2/2022 5:30 AM, Vlastimil Babka wrote:
> > On 9/2/22 11:26, Vlastimil Babka wrote:
> >> On 9/2/22 00:17, Joel Fernandes (Google) wrote:
> >>> From: Vlastimil Babka <vbabka@suse.cz>
> >>>
> >>> Joel reports [1] that increasing the rcu_head size for debugging
> >>> purposes used to work before struct slab was split from struct page, but
> >>> now runs into the various SLAB_MATCH() sanity checks of the layout.
> >>>
> >>> This is because the rcu_head in struct page is in union with large
> >>> sub-structures and has space to grow without exceeding their size, while
> >>> in struct slab (for SLAB and SLUB) it's in union only with a list_head.
> >>>
> >>> On closer inspection (and after the previous patch) we can put all
> >>> fields except slab_cache to a union with rcu_head, as slab_cache is
> >>> sufficient for the rcu freeing callbacks to work and the rest can be
> >>> overwritten by rcu_head without causing issues.
> >>>
> >>> This is only somewhat complicated by the need to keep SLUB's
> >>> freelist+counters aligned for cmpxchg_double. As a result the fields
> >>> need to be reordered so that slab_cache is first (after page flags) and
> >>> the union with rcu_head follows. For consistency, do that for SLAB as
> >>> well, although not necessary there.
> >>>
> >>> As a result, the rcu_head field in struct page and struct slab is no
> >>> longer at the same offset, but that doesn't matter as there is no
> >>> casting that would rely on that in the slab freeing callbacks, so we can
> >>> just drop the respective SLAB_MATCH() check.
> >>>
> >>> Also we need to update the SLAB_MATCH() for compound_head to reflect the
> >>> new ordering.
> >>>
> >>> While at it, also add a static_assert to check the alignment needed for
> >>> cmpxchg_double so mistakes are found sooner than a runtime GPF.
> >>>
> >>> [1] https://lore.kernel.org/all/85afd876-d8bb-0804-b2c5-48ed3055e702@joelfernandes.org/
> >>>
> >>> Reported-by: Joel Fernandes <joel@joelfernandes.org>
> >>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> >>
> >> I've added patches 01 and 02 to slab tree for -next exposure before Joel's
> >> full series posting, but it should be also ok if rcu tree carries them with
> >> the whole patchset. I can then drop them from slab tree (there are no
> >> dependencies with other stuff there) so we don't introduce duplicite commits
> >> needlessly, just give me a heads up.
> > 
> > Ah but in that case please apply the reviews from my posting [1]
> > 
> > patch 1:
> > Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> > 
> > patch 2
> > Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> > 
> > [1] https://lore.kernel.org/all/20220826090912.11292-1-vbabka@suse.cz/
> 
> 
> Sorry for injecting confusion - my main intent with including the mm patches in
> this series is to make it easier for other reviewers/testers to backport the
> series to their kernels in one shot. Some reviewers expressed interested in
> trying out the series.
> 
> I think it is best to let the -mm patches in the series go through the slab
> tree, as you also have the Acks/Reviews there and will take sure those
> dependencies are out of the way.
> 
> My lesson here is to be more clear, I could have added some notes for context
> below the "---" of those mm patches.
> 
> Thanks again for your help,

Hello, Vlastimil, and thank you for putting these together!

I believe that your two patches should go in via the slab tree.
I am queueing them in -rcu only temporarily and just for convenience
in testing.  I expect that I will rebase them so that I can let your
versions cover things in -next.

							Thanx, Paul
diff mbox series

Patch

diff --git a/mm/slab.h b/mm/slab.h
index 4ec82bec15ec..2c248864ea91 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -11,37 +11,43 @@  struct slab {
 
 #if defined(CONFIG_SLAB)
 
+	struct kmem_cache *slab_cache;
 	union {
-		struct list_head slab_list;
+		struct {
+			struct list_head slab_list;
+			void *freelist;	/* array of free object indexes */
+			void *s_mem;	/* first object */
+		};
 		struct rcu_head rcu_head;
 	};
-	struct kmem_cache *slab_cache;
-	void *freelist;	/* array of free object indexes */
-	void *s_mem;	/* first object */
 	unsigned int active;
 
 #elif defined(CONFIG_SLUB)
 
-	union {
-		struct list_head slab_list;
-		struct rcu_head rcu_head;
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-		struct {
-			struct slab *next;
-			int slabs;	/* Nr of slabs left */
-		};
-#endif
-	};
 	struct kmem_cache *slab_cache;
-	/* Double-word boundary */
-	void *freelist;		/* first free object */
 	union {
-		unsigned long counters;
 		struct {
-			unsigned inuse:16;
-			unsigned objects:15;
-			unsigned frozen:1;
+			union {
+				struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+				struct {
+					struct slab *next;
+					int slabs;	/* Nr of slabs left */
+				};
+#endif
+			};
+			/* Double-word boundary */
+			void *freelist;		/* first free object */
+			union {
+				unsigned long counters;
+				struct {
+					unsigned inuse:16;
+					unsigned objects:15;
+					unsigned frozen:1;
+				};
+			};
 		};
+		struct rcu_head rcu_head;
 	};
 	unsigned int __unused;
 
@@ -66,9 +72,10 @@  struct slab {
 #define SLAB_MATCH(pg, sl)						\
 	static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
 SLAB_MATCH(flags, __page_flags);
-SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
 #ifndef CONFIG_SLOB
-SLAB_MATCH(rcu_head, rcu_head);
+SLAB_MATCH(compound_head, slab_cache);	/* Ensure bit 0 is clear */
+#else
+SLAB_MATCH(compound_head, slab_list);	/* Ensure bit 0 is clear */
 #endif
 SLAB_MATCH(_refcount, __page_refcount);
 #ifdef CONFIG_MEMCG
@@ -76,6 +83,9 @@  SLAB_MATCH(memcg_data, memcg_data);
 #endif
 #undef SLAB_MATCH
 static_assert(sizeof(struct slab) <= sizeof(struct page));
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 16));
+#endif
 
 /**
  * folio_slab - Converts from folio to slab.