
[net-next,2/3] net: skbuff: cache one skb_ext for use by GRO

Message ID 20230215034355.481925-3-kuba@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series net: skbuff: cache one skb_ext for use by GRO

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5337 this patch: 5337
netdev/cc_maintainers warning 1 maintainers not CCed: imagedong@tencent.com
netdev/build_clang success Errors and warnings before: 1087 this patch: 1087
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5552 this patch: 5552
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 122 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jakub Kicinski Feb. 15, 2023, 3:43 a.m. UTC
On the driver -> GRO path we can avoid thrashing the kmemcache
by holding onto one skb_ext.

Drivers usually report static data, so don't bother trying to
hold onto the skb_ext if the ext has contents which require
a destructor.

With a single flow and SW GRO, adding a tc_skb_ext to every
frame costs around 16.6% of performance (21.2 Gbps -> 17.6 Gbps;
yes, it's a relatively slow CPU). Using the cache reduces
the loss to 9.3% (-> 19.2 Gbps), although obviously in real
life the recycling will be less effective.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 79 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 5 deletions(-)

Comments

Paolo Abeni Feb. 15, 2023, 8:41 a.m. UTC | #1
On Tue, 2023-02-14 at 19:43 -0800, Jakub Kicinski wrote:
> On the driver -> GRO path we can avoid thrashing the kmemcache
> by holding onto one skb_ext.
> 
> Drivers usually report static data, so don't bother trying to
> hold onto the skb_ext if the ext has contents which require
> a destructor.
> 
> With a single flow and SW GRO, adding a tc_skb_ext to every
> frame costs around 16.6% of performance (21.2 Gbps -> 17.6 Gbps;
> yes, it's a relatively slow CPU). Using the cache reduces
> the loss to 9.3% (-> 19.2 Gbps), although obviously in real
> life the recycling will be less effective.
> 
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>  include/linux/skbuff.h |  1 +
>  net/core/skbuff.c      | 79 +++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 75 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index d5602b15c714..e68cb0a777b9 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4622,6 +4622,7 @@ struct skb_ext *__skb_ext_alloc(gfp_t flags);
>  void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
>  		    struct skb_ext *ext);
>  void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
> +void *napi_skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
>  void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
>  void __skb_ext_put(struct skb_ext *ext);
>  
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 6f0fc1f09536..feb5034b13ad 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -224,6 +224,9 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
>  struct napi_alloc_cache {
>  	struct page_frag_cache page;
>  	struct page_frag_1k page_small;
> +#ifdef CONFIG_SKB_EXTENSIONS
> +	struct skb_ext *ext;
> +#endif
>  	unsigned int skb_count;
>  	void *skb_cache[NAPI_SKB_CACHE_SIZE];
>  };
> @@ -1228,6 +1231,43 @@ static void napi_skb_cache_put(struct sk_buff *skb)
>  	}
>  }
>  
> +static bool skb_ext_needs_destruct(const struct skb_ext *ext)
> +{
> +	bool needs_destruct = false;
> +
> +#ifdef CONFIG_XFRM
> +	needs_destruct |= __skb_ext_exist(ext, SKB_EXT_SEC_PATH);
> +#endif
> +#ifdef CONFIG_MCTP_FLOWS
> +	needs_destruct |= __skb_ext_exist(ext, SKB_EXT_MCTP);
> +#endif
> +
> +	return needs_destruct;
> +}
> +
> +static void napi_skb_ext_put(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_SKB_EXTENSIONS
> +	struct skb_ext *ext;
> +
> +	if (!skb->active_extensions)
> +		return;
> +
> +	ext = skb->extensions;
> +	if (!skb_ext_needs_destruct(ext)) {
> +		struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> +
> +		if (refcount_read(&ext->refcnt) == 1 && !nc->ext) {
> +			kasan_poison_object_data(skbuff_ext_cache, ext);
> +			nc->ext = ext;
> +			return;
> +		}
> +	}
> +
> +	__skb_ext_put(ext);
> +#endif
> +}
> +
>  void __kfree_skb_defer(struct sk_buff *skb)

I'm wondering if napi_reuse_skb() should be touched, too? Even if it's not
directly used by the following patch...

Cheers,

Paolo
Edward Cree Feb. 15, 2023, 3:37 p.m. UTC | #2
On 15/02/2023 03:43, Jakub Kicinski wrote:
> On the driver -> GRO path we can avoid thrashing the kmemcache
> by holding onto one skb_ext.

Hmm, will one be enough if we're doing GRO_NORMAL batching?
As for e.g. UDP traffic, up to 8 skbs (by default) can have
overlapping lifetimes.
Alexander Lobakin Feb. 15, 2023, 4:17 p.m. UTC | #3
From: Edward Cree <ecree.xilinx@gmail.com>
Date: Wed, 15 Feb 2023 15:37:44 +0000

> On 15/02/2023 03:43, Jakub Kicinski wrote:
>> On the driver -> GRO path we can avoid thrashing the kmemcache
>> by holding onto one skb_ext.
> 
> Hmm, will one be enough if we're doing GRO_NORMAL batching?
> As for e.g. UDP traffic, up to 8 skbs (by default) can have
> overlapping lifetimes.
> 
I thought of an array of %NAPI_SKB_CACHE_SIZE to be honest. From what
I've tested, no cache (for any netstack-related object) is enough
if it can't serve one full NAPI poll :D
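
To sketch it: such an array cache could mirror the existing
napi_skb_cache_put() logic almost one to one. The field and helper
names below are made up for illustration, this is not from the patch:

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
#ifdef CONFIG_SKB_EXTENSIONS
	struct skb_ext *ext_cache[NAPI_SKB_CACHE_SIZE];
	unsigned int ext_count;
#endif
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static void napi_skb_ext_cache_put(struct skb_ext *ext)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 i;

	/* poison while parked in the cache, like the skb cache does */
	kasan_poison_object_data(skbuff_ext_cache, ext);
	nc->ext_cache[nc->ext_count++] = ext;

	if (unlikely(nc->ext_count == NAPI_SKB_CACHE_SIZE)) {
		/* cache full: bulk-free the older half back to the slab */
		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_unpoison_object_data(skbuff_ext_cache,
						   nc->ext_cache[i]);
		kmem_cache_free_bulk(skbuff_ext_cache, NAPI_SKB_CACHE_HALF,
				     (void **)nc->ext_cache + NAPI_SKB_CACHE_HALF);
		nc->ext_count = NAPI_SKB_CACHE_HALF;
	}
}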

+ agree with Paolo re napi_reuse_skb(), it's used only in the NAPI
context and recycles a lot o'stuff already, we can speed it up safely here.

Thanks,
Olek
Jakub Kicinski Feb. 15, 2023, 5:45 p.m. UTC | #4
On Wed, 15 Feb 2023 09:41:13 +0100 Paolo Abeni wrote:
> I'm wondering if napi_reuse_skb() should be touched, too? Even if it's not
> directly used by the following patch...

I didn't touch it because I (sadly) don't have access to any driver
using GRO frags to test :(  But I certainly can.

What about __kfree_skb_defer() and napi_consume_skb() (basically
the other two napi_skb_cache_put() callers)?
Jakub Kicinski Feb. 15, 2023, 5:52 p.m. UTC | #5
On Wed, 15 Feb 2023 17:17:53 +0100 Alexander Lobakin wrote:
> > On 15/02/2023 03:43, Jakub Kicinski wrote:  
> >> On the driver -> GRO path we can avoid thrashing the kmemcache
> >> by holding onto one skb_ext.  
> > 
> > Hmm, will one be enough if we're doing GRO_NORMAL batching?
> > As for e.g. UDP traffic, up to 8 skbs (by default) can have
> > overlapping lifetimes.
> >   
> I thought of an array of %NAPI_SKB_CACHE_SIZE to be honest. From what
> I've tested, no cache (for any netstack-related object) is enough
> if it can't serve one full NAPI poll :D

I was hoping to leave sizing of the cache until we have some data from
a production network (or at least representative packet traces).

NAPI_SKB_CACHE_SIZE kinda assumes we're not doing much GRO, right?
And the current patch feeds the cache exclusively from GRO...

> + agree with Paolo re napi_reuse_skb(), it's used only in the NAPI
> context and recycles a lot o'stuff already, we can speed it up safely here.

LMK what's your opinion on touching the other potential spots, too.
(in Paolo's subthread).
Alexander Lobakin Feb. 15, 2023, 6:01 p.m. UTC | #6
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Feb 2023 09:52:00 -0800

> On Wed, 15 Feb 2023 17:17:53 +0100 Alexander Lobakin wrote:
>>> On 15/02/2023 03:43, Jakub Kicinski wrote:  
>>>> On the driver -> GRO path we can avoid thrashing the kmemcache
>>>> by holding onto one skb_ext.  
>>>
>>> Hmm, will one be enough if we're doing GRO_NORMAL batching?
>>> As for e.g. UDP traffic, up to 8 skbs (by default) can have
>>> overlapping lifetimes.
>>>   
>> I thought of an array of %NAPI_SKB_CACHE_SIZE to be honest. From what
>> I've tested, no cache (for any netstack-related object) is enough
>> if it can't serve one full NAPI poll :D
> 
> I was hoping to leave sizing of the cache until we have some data from
> a production network (or at least representative packet traces).
> 
> NAPI_SKB_CACHE_SIZE kinda assumes we're not doing much GRO, right?

It assumes we GRO a lot :D

Imagine that you have 64 frames during one poll and the GRO layer
decides to coalesce them in batches of 16. Then only 4 skbs will be
used, the rest will go as frags (with "stolen heads") -> 60 of 64 skbs
will return to that skb cache and will then be reused by napi_build_skb().

> And the current patch feeds the cache exclusively from GRO...
> 
>> + agree with Paolo re napi_reuse_skb(), it's used only in the NAPI
>> context and recycles a lot o'stuff already, we can speed it up safely here.
> 
> LMK what's your opinion on touching the other potential spots, too.
> (in Paolo's subthread).
<went to take a look already>

Thanks,
Olek
Alexander Lobakin Feb. 15, 2023, 6:08 p.m. UTC | #7
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Feb 2023 09:45:42 -0800

> On Wed, 15 Feb 2023 09:41:13 +0100 Paolo Abeni wrote:
>> I'm wondering if napi_reuse_skb() should be touched, too? Even if it's not
>> directly used by the following patch...
> 
> I didn't touch it because I (sadly) don't have access to any driver
> using GRO frags to test :(  But I certainly can.
> 
> What about __kfree_skb_defer() and napi_consume_skb() (basically
> the other two napi_skb_cache_put() callers)?
>

Sounds good. Basically any caller of napi_skb_cache_put() can be
switched to recycle extensions.
But you certainly need to have a pool instead of just one pointer then,
since napi_consume_skb() will return a lot if exts are actively used :)
Having just one pointer will only make freeing extensions one step
longer (`caller -> skb cache -> kmem_cache_free()` 63 times per Tx
completion poll, for example, just to save one pointer for reuse on Rx
later).

Thanks,
Olek
Jakub Kicinski Feb. 15, 2023, 6:20 p.m. UTC | #8
On Wed, 15 Feb 2023 19:01:19 +0100 Alexander Lobakin wrote:
> > I was hoping to leave sizing of the cache until we have some data from
> > a production network (or at least representative packet traces).
> > 
> > NAPI_SKB_CACHE_SIZE kinda assumes we're not doing much GRO, right?  
> 
> It assumes we GRO a lot :D
> 
> Imagine that you have 64 frames during one poll and the GRO layer
> decides to coalesce them in batches of 16. Then only 4 skbs will be
> used, the rest will go as frags (with "stolen heads") -> 60 of 64 skbs
> will return to that skb cache and will then be reused by napi_build_skb().

Let's say 5 - for 4 resulting skbs GRO will need the 4 resulting and
one extra to shuttle between the driver and GRO (worst case).
With a cache of 1 I'm guaranteed to save 59 alloc calls, 92%, right?

That's why I'm saying - the larger cache would help workloads which
don't GRO as much. Am I missing the point or how GRO works?
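
Spelling the arithmetic out: without a cache those 64 frames cost 64
ext allocations; with at most 4 + 1 = 5 exts alive at any instant, even
a one-slot cache keeps the shuttle ext recycling, so 64 - 5 = 59 of the
64 allocations, i.e. 59/64 ~= 92%, are avoided.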
Paolo Abeni Feb. 15, 2023, 7:08 p.m. UTC | #9
On Wed, 2023-02-15 at 19:08 +0100, Alexander Lobakin wrote:
> From: Jakub Kicinski <kuba@kernel.org>
> Date: Wed, 15 Feb 2023 09:45:42 -0800
> 
> > On Wed, 15 Feb 2023 09:41:13 +0100 Paolo Abeni wrote:
> > > I'm wondering if napi_reuse_skb() should be touched, too? Even if it's not
> > > directly used by the following patch...
> > 
> > I didn't touch it because I (sadly) don't have access to any driver
> > using GRO frags to test :(  But I certainly can.
> > 
> > What about __kfree_skb_defer() and napi_consume_skb() (basically
> > the other two napi_skb_cache_put() callers)?
> > 
> 
> Sounds good. Basically any caller of napi_skb_cache_put() can be
> switched to recycle extensions.
> But you certainly need to have a pool instead of just one pointer then,
> since napi_consume_skb() will return a lot if exts are actively used :)

This could be also a point to (initially) exclude napi_consume_skb()
and keep the (initial) implementation as simple as possible.

Is the expected use-case more related to forwarded traffic, to local
traffic, or is it independent of such considerations?

Cheers,

Paolo
Alexander Lobakin Feb. 16, 2023, 12:04 p.m. UTC | #10
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Feb 2023 10:20:15 -0800

> On Wed, 15 Feb 2023 19:01:19 +0100 Alexander Lobakin wrote:
>>> I was hoping to leave sizing of the cache until we have some data from
>>> a production network (or at least representative packet traces).
>>>
>>> NAPI_SKB_CACHE_SIZE kinda assumes we're not doing much GRO, right?  
>>
>> It assumes we GRO a lot :D
>>
>> Imagine that you have 64 frames during one poll and the GRO layer
>> decides to coalesce them in batches of 16. Then only 4 skbs will be
>> used, the rest will go as frags (with "stolen heads") -> 60 of 64 skbs
>> will return to that skb cache and will then be reused by napi_build_skb().
> 
> Let's say 5 - for 4 resulting skbs GRO will need the 4 resulting and
> one extra to shuttle between the driver and GRO (worst case).
> With a cache of 1 I'm guaranteed to save 59 alloc calls, 92%, right?
> 
> That's why I'm saying - the larger cache would help workloads which
> don't GRO as much. Am I missing the point or how GRO works?

Maybe I'm missing something now :D

The driver receives 5 frames, so it allocates 5 skbs. GRO coalesces them
into one big skb: the first one remains an skb, the following 4 get
their data added as frags and are then moved to the NAPI cache
(%NAPI_GRO_FREE_STOLEN_HEAD).
After GRO decides it's done with this skb, it gets moved to the pending
list to be flushed soon. @gro_normal_batch is usually 8, so it means
there can be up to 8....
Oh wait, Eric changed this to count segments, not skbs :D
...there can be up to 2* such skbs waiting for a flush (the first one
sets the counter to 5, the second adds 5 more => flush happens). So you
would need at least 2* skb extensions cached anyway, otherwise there
will be new allocations.
This is not counting fraglists: when GRO decides to fraglist an skb, it
requires at least 1 more skb. UDP fraglisted GRO (I know almost nobody
uses it, but it does exist) doesn't use frags at all and requires 1
skb per segment.
You're right that a cache size of %NAPI_POLL_WEIGHT is needed only for
corner cases like a big @gro_normal_batch, fraglists, UDP fraglisted GRO
and so on; still, I think we shouldn't ignore them :) Also, this cache
can then be reused later to bulk-free extensions on Tx completion, just
like it's done for skbs.

* or fewer/more if customized by the user; for example, I set 16 on MIPS,
while x86_64 works better with 8.
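
Tracing that 2* bound with the default @gro_normal_batch of 8, now
counting segments: the first flushed GRO skb carries 5 segments ->
counter = 5, below the limit, so it sits on the pending list; the
second adds 5 more -> counter = 10 >= 8 -> the batch is flushed. So two
coalesced skbs, each possibly holding an ext, overlap in lifetime.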

Thanks,
Olek
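
For context, a caller of the new helper on the GRO path would look
roughly like the sketch below. This is a hypothetical user: the
record_tc_chain name is made up, and the real conversion of a GRO-path
tc_skb_ext user is left to a later patch in the series.

/* Hypothetical caller sketch -- not taken from this series. */
static int record_tc_chain(struct sk_buff *skb, u32 chain)
{
	struct tc_skb_ext *ext;

	/* Picks up the per-CPU cached skb_ext when one is available,
	 * otherwise falls back to kmem_cache_alloc(..., GFP_ATOMIC).
	 */
	ext = napi_skb_ext_add(skb, TC_SKB_EXT);
	if (!ext)
		return -ENOMEM;

	memset(ext, 0, sizeof(*ext));	/* ext data is not pre-zeroed */
	ext->chain = chain;
	return 0;
}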

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d5602b15c714..e68cb0a777b9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4622,6 +4622,7 @@  struct skb_ext *__skb_ext_alloc(gfp_t flags);
 void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
 		    struct skb_ext *ext);
 void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void *napi_skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
 void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
 void __skb_ext_put(struct skb_ext *ext);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6f0fc1f09536..feb5034b13ad 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -224,6 +224,9 @@  static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
 struct napi_alloc_cache {
 	struct page_frag_cache page;
 	struct page_frag_1k page_small;
+#ifdef CONFIG_SKB_EXTENSIONS
+	struct skb_ext *ext;
+#endif
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -1228,6 +1231,43 @@  static void napi_skb_cache_put(struct sk_buff *skb)
 	}
 }
 
+static bool skb_ext_needs_destruct(const struct skb_ext *ext)
+{
+	bool needs_destruct = false;
+
+#ifdef CONFIG_XFRM
+	needs_destruct |= __skb_ext_exist(ext, SKB_EXT_SEC_PATH);
+#endif
+#ifdef CONFIG_MCTP_FLOWS
+	needs_destruct |= __skb_ext_exist(ext, SKB_EXT_MCTP);
+#endif
+
+	return needs_destruct;
+}
+
+static void napi_skb_ext_put(struct sk_buff *skb)
+{
+#ifdef CONFIG_SKB_EXTENSIONS
+	struct skb_ext *ext;
+
+	if (!skb->active_extensions)
+		return;
+
+	ext = skb->extensions;
+	if (!skb_ext_needs_destruct(ext)) {
+		struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+		if (refcount_read(&ext->refcnt) == 1 && !nc->ext) {
+			kasan_poison_object_data(skbuff_ext_cache, ext);
+			nc->ext = ext;
+			return;
+		}
+	}
+
+	__skb_ext_put(ext);
+#endif
+}
+
 void __kfree_skb_defer(struct sk_buff *skb)
 {
 	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
@@ -1239,7 +1279,7 @@  void napi_skb_free_stolen_head(struct sk_buff *skb)
 	if (unlikely(skb->slow_gro)) {
 		nf_reset_ct(skb);
 		skb_dst_drop(skb);
-		skb_ext_put(skb);
+		napi_skb_ext_put(skb);
 		skb_orphan(skb);
 		skb->slow_gro = 0;
 	}
@@ -6599,6 +6639,12 @@  static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
 	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
 }
 
+static void skb_ext_init(struct skb_ext *new)
+{
+	memset(new->offset, 0, sizeof(new->offset));
+	refcount_set(&new->refcnt, 1);
+}
+
 /**
  * __skb_ext_alloc - allocate a new skb extensions storage
  *
@@ -6612,10 +6658,8 @@  struct skb_ext *__skb_ext_alloc(gfp_t flags)
 {
 	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
 
-	if (new) {
-		memset(new->offset, 0, sizeof(new->offset));
-		refcount_set(&new->refcnt, 1);
-	}
+	if (new)
+		skb_ext_init(new);
 
 	return new;
 }
@@ -6731,6 +6775,31 @@  void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
 }
 EXPORT_SYMBOL(skb_ext_add);
 
+void *napi_skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *new = NULL;
+
+	if (!skb->active_extensions) {
+		struct napi_alloc_cache *nc;
+
+		nc = this_cpu_ptr(&napi_alloc_cache);
+		new = nc->ext;
+		if (new) {
+			kasan_unpoison_object_data(skbuff_ext_cache, new);
+			nc->ext = NULL;
+		} else {
+			new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+			if (!new)
+				return NULL;
+		}
+
+		skb_ext_init(new);
+	}
+
+	return skb_ext_add_finalize(skb, id, new);
+}
+EXPORT_SYMBOL(napi_skb_ext_add);
+
 #ifdef CONFIG_XFRM
 static void skb_ext_put_sp(struct sec_path *sp)
 {