diff mbox series

[v2,20/21] mm/slub: optimize alloc fastpath code layout

Message ID 20231120-slab-remove-slab-v2-20-9c9c70177183@suse.cz (mailing list archive)
State Mainlined
Commit 3450a0e5a6fc4cdbd70853f12c0c332dd24c1349
Headers show
Series remove the SLAB allocator | expand

Commit Message

Vlastimil Babka Nov. 20, 2023, 6:34 p.m. UTC
With allocation fastpaths no longer divided between two .c files, we
have better inlining. However, checking the disassembly of
kmem_cache_alloc() reveals we can do better to make the fastpaths
smaller and move the less common situations out of line or to separate
functions, to reduce instruction cache pressure.

- split memcg pre/post alloc hooks to inlined checks that use likely()
  to assume there will be no objcg handling necessary, and non-inline
  functions doing the actual handling

- add some more likely/unlikely() to pre/post alloc hooks to indicate
  which scenarios should be out of line

- change gfp_allowed_mask handling in slab_post_alloc_hook() so the
  code can be optimized away when kasan/kmsan/kmemleak is configured out

bloat-o-meter shows:
add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403)
Function                                     old     new   delta
__memcg_slab_post_alloc_hook                   -     461    +461
kmem_cache_alloc_bulk                        775     791     +16
__pfx_should_failslab.constprop                -      16     +16
__pfx___memcg_slab_post_alloc_hook             -      16     +16
should_failslab.constprop                      -      12     +12
__pfx_memcg_slab_post_alloc_hook              16       -     -16
kmem_cache_alloc_lru                        1295    1023    -272
kmem_cache_alloc_node                       1118     817    -301
kmem_cache_alloc                            1076     772    -304
kmalloc_node_trace                          1149     838    -311
kmalloc_trace                               1102     789    -313
__kmalloc_node_track_caller                 1393    1080    -313
__kmalloc_node                              1397    1082    -315
__kmalloc                                   1374    1059    -315
memcg_slab_post_alloc_hook                   464       -    -464

Note that gcc still decided to inline __memcg_slab_pre_alloc_hook(), but the
code is out of line. Forcing noinline did not improve the results. As a
result the fastpaths are shorter and overall code size is reduced.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 89 ++++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 35 deletions(-)

Comments

Hyeonggon Yoo Dec. 7, 2023, 2:32 a.m. UTC | #1
On Mon, Nov 20, 2023 at 07:34:31PM +0100, Vlastimil Babka wrote:
> With allocation fastpaths no longer divided between two .c files, we
> have better inlining, however checking the disassembly of
> kmem_cache_alloc() reveals we can do better to make the fastpaths
> smaller and move the less common situations out of line or to separate
> functions, to reduce instruction cache pressure.
> 
> - split memcg pre/post alloc hooks to inlined checks that use likely()
>   to assume there will be no objcg handling necessary, and non-inline
>   functions doing the actual handling
> 
> - add some more likely/unlikely() to pre/post alloc hooks to indicate
>   which scenarios should be out of line
> 
> - change gfp_allowed_mask handling in slab_post_alloc_hook() so the
>   code can be optimized away when kasan/kmsan/kmemleak is configured out
> 
> bloat-o-meter shows:
> add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403)
> Function                                     old     new   delta
> __memcg_slab_post_alloc_hook                   -     461    +461
> kmem_cache_alloc_bulk                        775     791     +16
> __pfx_should_failslab.constprop                -      16     +16
> __pfx___memcg_slab_post_alloc_hook             -      16     +16
> should_failslab.constprop                      -      12     +12
> __pfx_memcg_slab_post_alloc_hook              16       -     -16
> kmem_cache_alloc_lru                        1295    1023    -272
> kmem_cache_alloc_node                       1118     817    -301
> kmem_cache_alloc                            1076     772    -304
> kmalloc_node_trace                          1149     838    -311
> kmalloc_trace                               1102     789    -313
> __kmalloc_node_track_caller                 1393    1080    -313
> __kmalloc_node                              1397    1082    -315
> __kmalloc                                   1374    1059    -315
> memcg_slab_post_alloc_hook                   464       -    -464
> 
> Note that gcc still decided to inline __memcg_slab_pre_alloc_hook(), but the
> code is out of line. Forcing noinline did not improve the results. As a
> result the fastpaths are shorter and overall code size is reduced.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 89 ++++++++++++++++++++++++++++++++++++++-------------------------
>  1 file changed, 54 insertions(+), 35 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 5683f1d02e4f..77d259f3d592 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1866,25 +1866,17 @@ static inline size_t obj_full_size(struct kmem_cache *s)
>  /*
>   * Returns false if the allocation should fail.
>   */
> -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> -					     struct list_lru *lru,
> -					     struct obj_cgroup **objcgp,
> -					     size_t objects, gfp_t flags)
> +static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> +					struct list_lru *lru,
> +					struct obj_cgroup **objcgp,
> +					size_t objects, gfp_t flags)
>  {
> -	struct obj_cgroup *objcg;
> -
> -	if (!memcg_kmem_online())
> -		return true;
> -
> -	if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
> -		return true;
> -
>  	/*
>  	 * The obtained objcg pointer is safe to use within the current scope,
>  	 * defined by current task or set_active_memcg() pair.
>  	 * obj_cgroup_get() is used to get a permanent reference.
>  	 */
> -	objcg = current_obj_cgroup();
> +	struct obj_cgroup *objcg = current_obj_cgroup();
>  	if (!objcg)
>  		return true;
>  
> @@ -1907,17 +1899,34 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
>  	return true;
>  }
>  
> -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> -					      struct obj_cgroup *objcg,
> -					      gfp_t flags, size_t size,
> -					      void **p)
> +/*
> + * Returns false if the allocation should fail.
> + */
> +static __fastpath_inline
> +bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
> +			       struct obj_cgroup **objcgp, size_t objects,
> +			       gfp_t flags)
> +{
> +	if (!memcg_kmem_online())
> +		return true;
> +
> +	if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
> +		return true;
> +
> +	return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects,
> +						  flags));
> +}
> +
> +static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
> +					 struct obj_cgroup *objcg,
> +					 gfp_t flags, size_t size,
> +					 void **p)
>  {
>  	struct slab *slab;
>  	unsigned long off;
>  	size_t i;
>  
> -	if (!memcg_kmem_online() || !objcg)
> -		return;
> +	flags &= gfp_allowed_mask;
>  
>  	for (i = 0; i < size; i++) {
>  		if (likely(p[i])) {
> @@ -1940,6 +1949,16 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
>  	}
>  }
>  
> +static __fastpath_inline
> +void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
> +				gfp_t flags, size_t size, void **p)
> +{
> +	if (likely(!memcg_kmem_online() || !objcg))
> +		return;
> +
> +	return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
> +}
> +
>  static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
>  					void **p, int objects)
>  {
> @@ -3709,34 +3728,34 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
>  }
>  ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
>  
> -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
> -						     struct list_lru *lru,
> -						     struct obj_cgroup **objcgp,
> -						     size_t size, gfp_t flags)
> +static __fastpath_inline
> +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
> +				       struct list_lru *lru,
> +				       struct obj_cgroup **objcgp,
> +				       size_t size, gfp_t flags)
>  {
>  	flags &= gfp_allowed_mask;
>  
>  	might_alloc(flags);
>  
> -	if (should_failslab(s, flags))
> +	if (unlikely(should_failslab(s, flags)))
>  		return NULL;
>  
> -	if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
> +	if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
>  		return NULL;
>  
>  	return s;
>  }
>  
> -static inline void slab_post_alloc_hook(struct kmem_cache *s,
> -					struct obj_cgroup *objcg, gfp_t flags,
> -					size_t size, void **p, bool init,
> -					unsigned int orig_size)
> +static __fastpath_inline
> +void slab_post_alloc_hook(struct kmem_cache *s,	struct obj_cgroup *objcg,
> +			  gfp_t flags, size_t size, void **p, bool init,
> +			  unsigned int orig_size)
>  {
>  	unsigned int zero_size = s->object_size;
>  	bool kasan_init = init;
>  	size_t i;
> -
> -	flags &= gfp_allowed_mask;
> +	gfp_t init_flags = flags & gfp_allowed_mask;
>  
>  	/*
>  	 * For kmalloc object, the allocated memory size(object_size) is likely
> @@ -3769,13 +3788,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
>  	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
>  	 */
>  	for (i = 0; i < size; i++) {
> -		p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init);
> +		p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
>  		if (p[i] && init && (!kasan_init ||
>  				     !kasan_has_integrated_init()))
>  			memset(p[i], 0, zero_size);
>  		kmemleak_alloc_recursive(p[i], s->object_size, 1,
> -					 s->flags, flags);
> -		kmsan_slab_alloc(s, p[i], flags);
> +					 s->flags, init_flags);
> +		kmsan_slab_alloc(s, p[i], init_flags);
>  	}
>  
>  	memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
> @@ -3799,7 +3818,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>  	bool init = false;
>  
>  	s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
> -	if (!s)
> +	if (unlikely(!s))
>  		return NULL;
>  
>  	object = kfence_alloc(s, orig_size, gfpflags);
> 
> -- 

Looks good to me,
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>

> 2.42.1
> 
>
diff mbox series

Patch

diff --git a/mm/slub.c b/mm/slub.c
index 5683f1d02e4f..77d259f3d592 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1866,25 +1866,17 @@  static inline size_t obj_full_size(struct kmem_cache *s)
 /*
  * Returns false if the allocation should fail.
  */
-static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
-					     struct list_lru *lru,
-					     struct obj_cgroup **objcgp,
-					     size_t objects, gfp_t flags)
+static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+					struct list_lru *lru,
+					struct obj_cgroup **objcgp,
+					size_t objects, gfp_t flags)
 {
-	struct obj_cgroup *objcg;
-
-	if (!memcg_kmem_online())
-		return true;
-
-	if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
-		return true;
-
 	/*
 	 * The obtained objcg pointer is safe to use within the current scope,
 	 * defined by current task or set_active_memcg() pair.
 	 * obj_cgroup_get() is used to get a permanent reference.
 	 */
-	objcg = current_obj_cgroup();
+	struct obj_cgroup *objcg = current_obj_cgroup();
 	if (!objcg)
 		return true;
 
@@ -1907,17 +1899,34 @@  static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 	return true;
 }
 
-static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
-					      struct obj_cgroup *objcg,
-					      gfp_t flags, size_t size,
-					      void **p)
+/*
+ * Returns false if the allocation should fail.
+ */
+static __fastpath_inline
+bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
+			       struct obj_cgroup **objcgp, size_t objects,
+			       gfp_t flags)
+{
+	if (!memcg_kmem_online())
+		return true;
+
+	if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
+		return true;
+
+	return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects,
+						  flags));
+}
+
+static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
+					 struct obj_cgroup *objcg,
+					 gfp_t flags, size_t size,
+					 void **p)
 {
 	struct slab *slab;
 	unsigned long off;
 	size_t i;
 
-	if (!memcg_kmem_online() || !objcg)
-		return;
+	flags &= gfp_allowed_mask;
 
 	for (i = 0; i < size; i++) {
 		if (likely(p[i])) {
@@ -1940,6 +1949,16 @@  static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 	}
 }
 
+static __fastpath_inline
+void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+				gfp_t flags, size_t size, void **p)
+{
+	if (likely(!memcg_kmem_online() || !objcg))
+		return;
+
+	return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+}
+
 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 					void **p, int objects)
 {
@@ -3709,34 +3728,34 @@  noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
 }
 ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
 
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
-						     struct list_lru *lru,
-						     struct obj_cgroup **objcgp,
-						     size_t size, gfp_t flags)
+static __fastpath_inline
+struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+				       struct list_lru *lru,
+				       struct obj_cgroup **objcgp,
+				       size_t size, gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
 
 	might_alloc(flags);
 
-	if (should_failslab(s, flags))
+	if (unlikely(should_failslab(s, flags)))
 		return NULL;
 
-	if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
+	if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
 		return NULL;
 
 	return s;
 }
 
-static inline void slab_post_alloc_hook(struct kmem_cache *s,
-					struct obj_cgroup *objcg, gfp_t flags,
-					size_t size, void **p, bool init,
-					unsigned int orig_size)
+static __fastpath_inline
+void slab_post_alloc_hook(struct kmem_cache *s,	struct obj_cgroup *objcg,
+			  gfp_t flags, size_t size, void **p, bool init,
+			  unsigned int orig_size)
 {
 	unsigned int zero_size = s->object_size;
 	bool kasan_init = init;
 	size_t i;
-
-	flags &= gfp_allowed_mask;
+	gfp_t init_flags = flags & gfp_allowed_mask;
 
 	/*
 	 * For kmalloc object, the allocated memory size(object_size) is likely
@@ -3769,13 +3788,13 @@  static inline void slab_post_alloc_hook(struct kmem_cache *s,
 	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
 	 */
 	for (i = 0; i < size; i++) {
-		p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init);
+		p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
 		if (p[i] && init && (!kasan_init ||
 				     !kasan_has_integrated_init()))
 			memset(p[i], 0, zero_size);
 		kmemleak_alloc_recursive(p[i], s->object_size, 1,
-					 s->flags, flags);
-		kmsan_slab_alloc(s, p[i], flags);
+					 s->flags, init_flags);
+		kmsan_slab_alloc(s, p[i], init_flags);
 	}
 
 	memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
@@ -3799,7 +3818,7 @@  static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
 	bool init = false;
 
 	s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
-	if (!s)
+	if (unlikely(!s))
 		return NULL;
 
 	object = kfence_alloc(s, orig_size, gfpflags);