Message ID | 20230202185801.4179599-5-edumazet@google.com (mailing list archive) |
---|---
State | Superseded |
Delegated to: | Netdev Maintainers |
Series | net: core: use a dedicated kmem_cache for skb head allocs
On Thu, Feb 2, 2023 at 1:58 PM Eric Dumazet <edumazet@google.com> wrote:
>
> Recent removal of ksize() in alloc_skb() increased
> performance because we no longer read
> the associated struct page.
>
> We have an equivalent cost at kfree_skb() time.
>
> kfree(skb->head) has to access a struct page,
> often cold in cpu caches, to get the owning
> struct kmem_cache.
>
> Considering that many allocations are small,
> we can have our own kmem_cache to avoid the cache line miss.
>
> This also saves memory because these small heads
> are no longer padded to 1024 bytes.
>
> CONFIG_SLUB=y
> $ grep skbuff_small_head /proc/slabinfo
> skbuff_small_head 2907 2907 640 51 8 : tunables 0 0 0 : slabdata 57 57 0
>
> CONFIG_SLAB=y
> $ grep skbuff_small_head /proc/slabinfo
> skbuff_small_head 607 624 640 6 1 : tunables 54 27 8 : slabdata 104 104 5
>
> Note: after Kees Cook patches and this one, we might
> be able to revert commit
> dbae2b062824 ("net: skb: introduce and use a single page frag cache")
> because GRO_MAX_HEAD is also small.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Paolo Abeni <pabeni@redhat.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

Very nice!

On Thu, 2 Feb 2023 18:58:01 +0000 Eric Dumazet wrote:
> +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. */
Why is that? Is it to prevent potential mixing up of objects from
the cache with objects from general slabs (since we only do an
end_offset == SKB_SMALL_HEAD_HEADROOM check)?
On Fri, Feb 3, 2023 at 6:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 2 Feb 2023 18:58:01 +0000 Eric Dumazet wrote:
> > +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. */
>
> Why is that? Is it to prevent potential mixing up of objects from
> the cache with objects from general slabs (since we only do an
> end_offset == SKB_SMALL_HEAD_HEADROOM check)?

Good question.

Some alloc_skb() callers use GFP_DMA (or __GFP_ACCOUNT); we cannot use
the dedicated kmem_cache for them. They could get an object of size
512 or 1024 from the regular kmalloc caches.

Since I chose not to add yet another
skb->head_has_been_allocated_from_small_head_cache bit, we want to
make sure we will not have issues in the future if
SKB_HEAD_ALIGN(MAX_TCP_HEADER) becomes a power of two (for example
after some of us increase MAX_SKB_FRAGS).

An alternative would be to add a check at boot time, making sure no
standard kmalloc cache has the same object size.

This might have an issue with CONFIG_SLOB=y; I wish this was gone
already...
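A minimal untested sketch of that boot-time check (the function name,
its placement and the BUG_ON policy are all hypothetical here;
kmalloc_size_roundup() is the helper the patch already uses in
kmalloc_reserve()):

static void __init skb_small_head_sanity_check(void)
{
	/* A kmalloc() fallback head can share its usable size with
	 * skb_small_head_cache objects only if some kmalloc bucket is
	 * exactly SKB_SMALL_HEAD_CACHE_SIZE bytes, i.e. only if the
	 * roundup below is the identity.
	 */
	BUG_ON(kmalloc_size_roundup(SKB_SMALL_HEAD_CACHE_SIZE) ==
	       SKB_SMALL_HEAD_CACHE_SIZE);
}
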
On Thu, 2023-02-02 at 18:58 +0000, Eric Dumazet wrote:
> Note: after Kees Cook patches and this one, we might
> be able to revert commit
> dbae2b062824 ("net: skb: introduce and use a single page frag cache")
> because GRO_MAX_HEAD is also small.

I guess I'll need some time to do the relevant benchmarks, but I'm not
able to schedule them very soon.

> @@ -486,6 +499,21 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
>  	void *obj;
>
>  	obj_size = SKB_HEAD_ALIGN(*size);
> +	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
> +	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
> +
> +		/* skb_small_head_cache has non power of two size,
> +		 * likely forcing SLUB to use order-3 pages.
> +		 * We deliberately attempt a NOMEMALLOC allocation only.
> +		 */
> +		obj = kmem_cache_alloc_node(skb_small_head_cache,
> +					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
> +					    node);
> +		if (obj) {
> +			*size = SKB_SMALL_HEAD_CACHE_SIZE;
> +			goto out;
> +		}

In case of kmem allocation failure, should we try to skip the 2nd
__GFP_NOMEMALLOC attempt below?

I *think* the non power of two size is also required to avoid an issue
with plain (no GFP_DMA nor __GFP_ACCOUNT) allocations in case of
fallback to kmalloc(), to prevent skb_kfree_head() from mis-interpreting
skb->head as kmem_cache allocated.

Thanks!

Paolo
On Fri, Feb 3, 2023 at 8:59 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2023-02-02 at 18:58 +0000, Eric Dumazet wrote:
> > Note: after Kees Cook patches and this one, we might
> > be able to revert commit
> > dbae2b062824 ("net: skb: introduce and use a single page frag cache")
> > because GRO_MAX_HEAD is also small.
>
> I guess I'll need some time to do the relevant benchmarks, but I'm not
> able to schedule them very soon.

No worries, this can be done later. Note the results might depend on
the SLUB/SLAB choice.

> In case of kmem allocation failure, should we try to skip the 2nd
> __GFP_NOMEMALLOC attempt below?

We could, but my reasoning was that we might find an object in the
other kmem_cache freelist. kmalloc-1k objects tend to use smaller page
orders, so they are more likely to succeed if there is no order-3 page
available in the buddy allocator. (I mentioned this in the comment.)

> I *think* the non power of two size is also required to avoid an issue
> with plain (no GFP_DMA nor __GFP_ACCOUNT) allocations in case of
> fallback to kmalloc(), to prevent skb_kfree_head() from mis-interpreting
> skb->head as kmem_cache allocated.

Indeed, there are multiple cases explaining why
SKB_SMALL_HEAD_CACHE_SIZE needs to be a non power of two.
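For reference, a toy userspace calculation of that packing argument
(4 KB pages assumed; illustrative arithmetic only, not kernel code),
which reproduces the "640 51 8" columns of the CONFIG_SLUB slabinfo
line quoted earlier in the thread:

#include <stdio.h>

int main(void)
{
	unsigned int obj = 640, page = 4096;
	unsigned int order;

	/* Objects per slab and leftover bytes for each slab order;
	 * order-3 wastes the least per object, which is why SLUB
	 * prefers it for this size.
	 */
	for (order = 0; order <= 3; order++) {
		unsigned int slab = page << order;

		printf("order-%u slab (%5u B): %2u objects, %3u B wasted\n",
		       order, slab, slab / obj, slab % obj);
	}
	return 0;
}
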
Hi Eric,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:            https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/net-add-SKB_HEAD_ALIGN-helper/20230203-031126
patch link:     https://lore.kernel.org/r/20230202185801.4179599-5-edumazet%40google.com
patch subject:  [PATCH net-next 4/4] net: add dedicated kmem_cache for typical/small skb->head
config:         arc-randconfig-r014-20230204 (https://download.01.org/0day-ci/archive/20230204/202302040329.E10xZHbY-lkp@intel.com/config)
compiler:       arceb-elf-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/002bb9238e98f3fbeb0402c97f711420c3321b93
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Eric-Dumazet/net-add-SKB_HEAD_ALIGN-helper/20230203-031126
        git checkout 002bb9238e98f3fbeb0402c97f711420c3321b93
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash net/core/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/core/skbuff.c: In function 'kmalloc_reserve':
>> net/core/skbuff.c:503:23: error: 'KMALLOC_NOT_NORMAL_BITS' undeclared (first use in this function)
     503 |           !(flags & KMALLOC_NOT_NORMAL_BITS)) {
         |                     ^~~~~~~~~~~~~~~~~~~~~~~
   net/core/skbuff.c:503:23: note: each undeclared identifier is reported only once for each function it appears in


vim +/KMALLOC_NOT_NORMAL_BITS +503 net/core/skbuff.c

   486	
   487	/*
   488	 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
   489	 * the caller if emergency pfmemalloc reserves are being used. If it is and
   490	 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
   491	 * may be used. Otherwise, the packet data may be discarded until enough
   492	 * memory is free
   493	 */
   494	static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
   495				     bool *pfmemalloc)
   496	{
   497		bool ret_pfmemalloc = false;
   498		unsigned int obj_size;
   499		void *obj;
   500	
   501		obj_size = SKB_HEAD_ALIGN(*size);
   502		if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
 > 503		    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
   504	
   505			/* skb_small_head_cache has non power of two size,
   506			 * likely forcing SLUB to use order-3 pages.
   507			 * We deliberately attempt a NOMEMALLOC allocation only.
   508			 */
   509			obj = kmem_cache_alloc_node(skb_small_head_cache,
   510						    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
   511						    node);
   512			if (obj) {
   513				*size = SKB_SMALL_HEAD_CACHE_SIZE;
   514				goto out;
   515			}
   516		}
   517		*size = obj_size = kmalloc_size_roundup(obj_size);
   518		/*
   519		 * Try a regular allocation, when that fails and we're not entitled
   520		 * to the reserves, fail.
   521		 */
   522		obj = kmalloc_node_track_caller(obj_size,
   523						flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
   524						node);
   525		if (obj || !(gfp_pfmemalloc_allowed(flags)))
   526			goto out;
   527	
   528		/* Try again but now we are using pfmemalloc reserves */
   529		ret_pfmemalloc = true;
   530		obj = kmalloc_node_track_caller(obj_size, flags, node);
   531	
   532	out:
   533		if (pfmemalloc)
   534			*pfmemalloc = ret_pfmemalloc;
   535	
   536		return obj;
   537	}
   538	
On Sat, 4 Feb 2023 03:37:10 +0800 kernel test robot wrote:
> All errors (new ones prefixed by >>):
>
>    net/core/skbuff.c: In function 'kmalloc_reserve':
> >> net/core/skbuff.c:503:23: error: 'KMALLOC_NOT_NORMAL_BITS' undeclared (first use in this function)
>      503 |           !(flags & KMALLOC_NOT_NORMAL_BITS)) {
>          |                     ^~~~~~~~~~~~~~~~~~~~~~~
>    net/core/skbuff.c:503:23: note: each undeclared identifier is reported only once for each function it appears in

diff --git a/net/Kconfig b/net/Kconfig
index 27bc49c7ad73..41e27a4805a8 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -8,6 +8,7 @@ menuconfig NET
 	select NLATTR
 	select GENERIC_NET_UTILS
 	select BPF
+	depends on !SLOB
 	help
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even

? :)
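Alternatively, a local fallback for CONFIG_SLOB builds could look like
the untested sketch below. It assumes the SLAB/SLUB definition of
KMALLOC_NOT_NORMAL_BITS in <linux/slab.h> is the right mask to mirror;
nothing like this was proposed in the thread.

/* Hypothetical CONFIG_SLOB fallback: <linux/slab.h> only defines
 * KMALLOC_NOT_NORMAL_BITS for SLAB/SLUB, so mirror the mask locally.
 */
#ifndef KMALLOC_NOT_NORMAL_BITS
#define KMALLOC_NOT_NORMAL_BITS					\
	(__GFP_RECLAIMABLE |					\
	 (IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
	 (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
#endif
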
On Fri, 3 Feb 2023 09:17:55 +0100 Eric Dumazet wrote:
> > I *think* the non power of two size is also required to avoid an issue
> > with plain (no GFP_DMA nor __GFP_ACCOUNT) allocations in case of
> > fallback to kmalloc(), to prevent skb_kfree_head() from mis-interpreting
> > skb->head as kmem_cache allocated.
>
> Indeed, there are multiple cases explaining why
> SKB_SMALL_HEAD_CACHE_SIZE needs to be a non power of two.

Since you may need to respin, would you mind spelling this out a bit
more in the comment? Maybe:

-/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. */
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
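FWIW, a toy userspace model of why a power-of-two size would be
ambiguous (the bucket model below ignores the 96- and 192-byte kmalloc
caches, and the sizes are assumptions, not real MAX_TCP_HEADER values):

#include <stdio.h>

/* Simplified model of the power-of-two kmalloc buckets. */
static unsigned int bucket_roundup(unsigned int size)
{
	unsigned int b = 8;

	while (b < size)
		b <<= 1;
	return b;
}

static void check(unsigned int cache_size)
{
	/* A GFP_DMA or __GFP_ACCOUNT head falls back to kmalloc(); it
	 * would be freed to the wrong place iff its usable size can
	 * equal cache_size, i.e. iff cache_size is itself a bucket.
	 */
	printf("cache size %3u -> kmalloc fallback gets %4u usable bytes: %s\n",
	       cache_size, bucket_roundup(cache_size),
	       bucket_roundup(cache_size) == cache_size ?
	       "collision, free path ambiguous" : "unique, skb_end_offset() is safe");
}

int main(void)
{
	check(512);	/* hypothetical power-of-two SKB_SMALL_HEAD_SIZE */
	check(576);	/* 512 + L1_CACHE_BYTES, what the macro would pick */
	check(640);	/* the size reported in /proc/slabinfo upthread */
	return 0;
}
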
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ae0b2aa1f01e8060cc4fe69137e9bd98e44280cc..3e540b4924701cc57b6fbd1b668bab3b652ee94c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -89,6 +89,19 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 #ifdef CONFIG_SKB_EXTENSIONS
 static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #endif
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. */
+#define SKB_SMALL_HEAD_CACHE_SIZE				\
+	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
+		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
+		SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM					\
+	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
 
@@ -486,6 +499,21 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 	void *obj;
 
 	obj_size = SKB_HEAD_ALIGN(*size);
+	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+
+		/* skb_small_head_cache has non power of two size,
+		 * likely forcing SLUB to use order-3 pages.
+		 * We deliberately attempt a NOMEMALLOC allocation only.
+		 */
+		obj = kmem_cache_alloc_node(skb_small_head_cache,
+					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+					    node);
+		if (obj) {
+			*size = SKB_SMALL_HEAD_CACHE_SIZE;
+			goto out;
+		}
+	}
 	*size = obj_size = kmalloc_size_roundup(obj_size);
 	/*
 	 * Try a regular allocation, when that fails and we're not entitled
@@ -805,6 +833,14 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 	return page_pool_return_skb_page(virt_to_page(data));
 }
 
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+		kmem_cache_free(skb_small_head_cache, head);
+	else
+		kfree(head);
+}
+
 static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
@@ -814,7 +850,7 @@ static void skb_free_head(struct sk_buff *skb)
 			return;
 		skb_free_frag(head);
 	} else {
-		kfree(head);
+		skb_kfree_head(head, skb_end_offset(skb));
 	}
 }
 
@@ -1995,7 +2031,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	return 0;
 
 nofrags:
-	kfree(data);
+	skb_kfree_head(data, size);
 nodata:
 	return -ENOMEM;
 }
@@ -4633,6 +4669,12 @@ void __init skb_init(void)
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						NULL);
+	skb_small_head_cache = kmem_cache_create("skbuff_small_head",
+						SKB_SMALL_HEAD_CACHE_SIZE,
+						0,
+						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+						NULL);
+
 	skb_extensions_init();
 }
 
@@ -6297,7 +6339,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 	if (skb_cloned(skb)) {
 		/* drop the old head gracefully */
 		if (skb_orphan_frags(skb, gfp_mask)) {
-			kfree(data);
+			skb_kfree_head(data, size);
 			return -ENOMEM;
 		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6405,7 +6447,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 	memcpy((struct skb_shared_info *)(data + size),
 	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
 	if (skb_orphan_frags(skb, gfp_mask)) {
-		kfree(data);
+		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
 	shinfo = (struct skb_shared_info *)(data + size);
@@ -6441,7 +6483,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
 		if (skb_has_frag_list(skb))
 			kfree_skb_list(skb_shinfo(skb)->frag_list);
-		kfree(data);
+		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
 	skb_release_data(skb, SKB_CONSUMED);
Recent removal of ksize() in alloc_skb() increased
performance because we no longer read
the associated struct page.

We have an equivalent cost at kfree_skb() time.

kfree(skb->head) has to access a struct page,
often cold in cpu caches, to get the owning
struct kmem_cache.

Considering that many allocations are small,
we can have our own kmem_cache to avoid the cache line miss.

This also saves memory because these small heads
are no longer padded to 1024 bytes.

CONFIG_SLUB=y
$ grep skbuff_small_head /proc/slabinfo
skbuff_small_head 2907 2907 640 51 8 : tunables 0 0 0 : slabdata 57 57 0

CONFIG_SLAB=y
$ grep skbuff_small_head /proc/slabinfo
skbuff_small_head 607 624 640 6 1 : tunables 54 27 8 : slabdata 104 104 5

Note: after Kees Cook patches and this one, we might
be able to revert commit
dbae2b062824 ("net: skb: introduce and use a single page frag cache")
because GRO_MAX_HEAD is also small.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
---
 net/core/skbuff.c | 52 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 5 deletions(-)