
[v5,net-next,1/3] net: introduce page_pool pointer in softnet_data percpu struct

Message ID b1432fc51c694f1be8daabb19773744fcee13cf1.1702563810.git.lorenzo@kernel.org (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series: add multi-buff support for xdp running in generic mode

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5200 this patch: 5200
netdev/cc_maintainers warning 4 maintainers not CCed: ast@kernel.org ilias.apalodimas@linaro.org daniel@iogearbox.net john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 1358 this patch: 1358
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5527 this patch: 5527
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 124 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Lorenzo Bianconi Dec. 14, 2023, 2:29 p.m. UTC
Allocate percpu page_pools in softnet_data.
Moreover, add a cpuid field in the page_pool struct in order to recycle
the page in the page_pool "hot" cache if napi_pp_put_page() is running
on the same cpu.
This is a preliminary patch to add multi-buff support for XDP running
in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/linux/netdevice.h       |  1 +
 include/net/page_pool/helpers.h |  5 +++++
 include/net/page_pool/types.h   |  1 +
 net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
 net/core/page_pool.c            |  5 +++++
 net/core/skbuff.c               |  5 +++--
 6 files changed, 53 insertions(+), 3 deletions(-)
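
The behavioral change, condensed from the skbuff.c hunk at the bottom of
this page (a summary sketch, not extra code in the patch):

    /* napi_pp_put_page(): direct recycling into the pool's lockless
     * cache is now also allowed when the pool's dedicated cpuid
     * matches the executing CPU, not only when the pool's NAPI
     * instance is owned by this CPU.
     */
    allow_direct = (napi && READ_ONCE(napi->list_owner) == cpuid) ||
                   pp->cpuid == cpuid;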

Comments

Paolo Abeni Dec. 19, 2023, 3:23 p.m. UTC | #1
On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
> Allocate percpu page_pools in softnet_data.
> Moreover, add a cpuid field in the page_pool struct in order to recycle
> the page in the page_pool "hot" cache if napi_pp_put_page() is running
> on the same cpu.
> This is a preliminary patch to add multi-buff support for XDP running
> in generic mode.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/linux/netdevice.h       |  1 +
>  include/net/page_pool/helpers.h |  5 +++++
>  include/net/page_pool/types.h   |  1 +
>  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>  net/core/page_pool.c            |  5 +++++
>  net/core/skbuff.c               |  5 +++--
>  6 files changed, 53 insertions(+), 3 deletions(-)

@Jesper, @Ilias: could you please have a look at the pp bits?

Thanks!

Paolo
Eric Dumazet Dec. 19, 2023, 4:29 p.m. UTC | #2
On Thu, Dec 14, 2023 at 3:30 PM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
>
> Allocate percpu page_pools in softnet_data.
> Moreover, add a cpuid field in the page_pool struct in order to recycle
> the page in the page_pool "hot" cache if napi_pp_put_page() is running
> on the same cpu.
> This is a preliminary patch to add multi-buff support for XDP running
> in generic mode.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/linux/netdevice.h       |  1 +
>  include/net/page_pool/helpers.h |  5 +++++
>  include/net/page_pool/types.h   |  1 +
>  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>  net/core/page_pool.c            |  5 +++++
>  net/core/skbuff.c               |  5 +++--
>  6 files changed, 53 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 1b935ee341b4..30b6a3f601fe 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3319,6 +3319,7 @@ struct softnet_data {
>         int                     defer_count;
>         int                     defer_ipi_scheduled;
>         struct sk_buff          *defer_list;
> +       struct page_pool        *page_pool;
>         call_single_data_t      defer_csd;
>  };

This field should be put elsewhere, not in this contended cache line.
Lorenzo Bianconi Dec. 19, 2023, 5:32 p.m. UTC | #3
> On Thu, Dec 14, 2023 at 3:30 PM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> >
> > Allocate percpu page_pools in softnet_data.
> > Moreover, add a cpuid field in the page_pool struct in order to recycle
> > the page in the page_pool "hot" cache if napi_pp_put_page() is running
> > on the same cpu.
> > This is a preliminary patch to add multi-buff support for XDP running
> > in generic mode.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  include/linux/netdevice.h       |  1 +
> >  include/net/page_pool/helpers.h |  5 +++++
> >  include/net/page_pool/types.h   |  1 +
> >  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
> >  net/core/page_pool.c            |  5 +++++
> >  net/core/skbuff.c               |  5 +++--
> >  6 files changed, 53 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 1b935ee341b4..30b6a3f601fe 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -3319,6 +3319,7 @@ struct softnet_data {
> >         int                     defer_count;
> >         int                     defer_ipi_scheduled;
> >         struct sk_buff          *defer_list;
> > +       struct page_pool        *page_pool;
> >         call_single_data_t      defer_csd;
> >  };
> 
> This field should be put elsewhere, not in this contended cache line.

ack, I think we could add a dedicated percpu pointer for it.

Regards,
Lorenzo
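
A dedicated per-CPU pointer, as suggested above, could look roughly like
the following sketch (the system_page_pool name is illustrative only and
not part of this patch):

    /* Sketch only: a per-CPU page_pool pointer living outside
     * struct softnet_data, so it stays off the contended defer_*
     * cache line Eric points at above.
     */
    static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool);

    /* hot-path lookup; softirq context assumed */
    struct page_pool *pp = this_cpu_read(system_page_pool);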
Jesper Dangaard Brouer Dec. 20, 2023, noon UTC | #4
On 19/12/2023 16.23, Paolo Abeni wrote:
> On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
>> Allocate percpu page_pools in softnet_data.
>> Moreover, add a cpuid field in the page_pool struct in order to recycle
>> the page in the page_pool "hot" cache if napi_pp_put_page() is running
>> on the same cpu.
>> This is a preliminary patch to add multi-buff support for XDP running
>> in generic mode.
>>
>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>> ---
>>   include/linux/netdevice.h       |  1 +
>>   include/net/page_pool/helpers.h |  5 +++++
>>   include/net/page_pool/types.h   |  1 +
>>   net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>>   net/core/page_pool.c            |  5 +++++
>>   net/core/skbuff.c               |  5 +++--
>>   6 files changed, 53 insertions(+), 3 deletions(-)
> 
> @Jesper, @Ilias: could you please have a look at the pp bits?
> 

I have some concerns... I'm still entertaining the idea, but we need to
be aware of the tradeoffs we are making.

(1)
Adding a PP to softnet_data means caching up to 256 pages per CPU in the
ptr_ring (plus likely 64 in the alloc cache). Fortunately, a PP starts
out empty, so as long as this PP isn't used, no pages get cached. But
once it is used, the PP doesn't have an MM shrinker that removes these
cached pages when the system is under MM pressure. I guess you can argue
that keeping this per netdev rx-queue would make memory usage even
higher. This is a tradeoff: we are trading memory (waste) for speed.
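
In concrete numbers, assuming 4 KiB pages, the worst case per CPU is
roughly:

    (256 ptr_ring entries + 64 alloc-cache entries) * 4 KiB
        = 320 pages * 4 KiB = 1.25 MiB per CPU

i.e. about 80 MiB pinned on a 64-CPU machine once every pool has been
filled.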


(2) (Question to Jakub I guess)
How does this connect with Jakub's PP netlink stats interface?
E.g. I find it very practical that it allows us to get PP stats per
netdev, but in this case there isn't a netdev.


(3) (Implicit locking)
PP has a lockless "alloc" side because it relies on the driver's NAPI
context. The places where the netstack accesses softnet_data provide
similar protection that we can rely on for PP, so this is likely correct
implementation-wise. But it will give people like Sebastian (Cc) more
gray hair when figuring out how PREEMPT_RT handles these cases.
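
The pattern in question is roughly the following; both the backlog
processing and napi_pp_put_page() run with BH disabled on the same CPU,
which is what keeps the lockless cache safe (a sketch, not code from
this patch):

    struct softnet_data *sd;

    local_bh_disable();
    sd = this_cpu_ptr(&softnet_data);
    /* No other softirq-level user can run on this CPU now, so sd
     * and a page_pool hanging off it can be touched without
     * further locking.
     */
    /* ... allocate from / recycle into sd->page_pool ... */
    local_bh_enable();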

(4)
The optimization is needed for the case where we need to re-allocate and
copy SKB fragments.  I think we should focus on avoiding this code path,
instead of optimizing it.  For UDP it should be fairly easy, but for TCP
this is harder.

--Jesper
Lorenzo Bianconi Jan. 17, 2024, 5:36 p.m. UTC | #5
> 
> 
> On 19/12/2023 16.23, Paolo Abeni wrote:
> > On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
> > > Allocate percpu page_pools in softnet_data.
> > > Moreover, add a cpuid field in the page_pool struct in order to recycle
> > > the page in the page_pool "hot" cache if napi_pp_put_page() is running
> > > on the same cpu.
> > > This is a preliminary patch to add multi-buff support for XDP running
> > > in generic mode.
> > > 
> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > ---
> > >   include/linux/netdevice.h       |  1 +
> > >   include/net/page_pool/helpers.h |  5 +++++
> > >   include/net/page_pool/types.h   |  1 +
> > >   net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
> > >   net/core/page_pool.c            |  5 +++++
> > >   net/core/skbuff.c               |  5 +++--
> > >   6 files changed, 53 insertions(+), 3 deletions(-)
> > 
> > @Jesper, @Ilias: could you please have a look at the pp bits?
> > 
> 
> I have some concerns... I'm still entertaining the idea, but we need to
> be aware of the tradeoffs we are making.
> 
> (1)
> Adding a PP to softnet_data means caching up to 256 pages per CPU in the
> ptr_ring (plus likely 64 in the alloc cache). Fortunately, a PP starts
> out empty, so as long as this PP isn't used, no pages get cached. But
> once it is used, the PP doesn't have an MM shrinker that removes these
> cached pages when the system is under MM pressure. I guess you can argue
> that keeping this per netdev rx-queue would make memory usage even
> higher. This is a tradeoff: we are trading memory (waste) for speed.
> 
> 
> (2) (Question to Jakub I guess)
> How does this connect with Jakub's PP netlink stats interface?
> E.g. I find it very practical that it allows us to get PP stats per
> netdev, but in this case there isn't a netdev.
> 
> 
> (3) (Implicit locking)
> PP has a lockless "alloc" side because it relies on the driver's NAPI
> context. The places where the netstack accesses softnet_data provide
> similar protection that we can rely on for PP, so this is likely correct
> implementation-wise. But it will give people like Sebastian (Cc) more
> gray hair when figuring out how PREEMPT_RT handles these cases.
> 
> (4)
> The optimization is needed for the case where we need to re-allocate and
> copy SKB fragments.  I think we should focus on avoiding this code path,
> instead of optimizing it.  For UDP it should be fairly easy, but for TCP
> this is harder.

Hi all,

I would like to resume this activity, and it seems to me there is no clear
direction about where we should add the page_pool (in a per_cpu pointer or
in the netdev_rx_queue struct) or whether we can rely on page_frag_cache
instead.

@Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Regards,
Lorenzo

> 
> --Jesper
Jakub Kicinski Jan. 18, 2024, 1:47 a.m. UTC | #6
On Wed, 17 Jan 2024 18:36:25 +0100 Lorenzo Bianconi wrote:
> I would like to resume this activity, and it seems to me there is no clear
> direction about where we should add the page_pool (in a per_cpu pointer or
> in the netdev_rx_queue struct) or whether we can rely on page_frag_cache
> instead.
> 
> @Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Let's try to summarize. We want skb reallocation without linearization
for XDP generic. We need some fast-ish way to get pages for the payload.

First, options for placing the allocator:
 - struct netdev_rx_queue
 - per-CPU

IMO per-CPU has better scaling properties - you're less likely to
increase the CPU count to infinity than spawn extra netdev queues.

The second question is:
 - page_frag_cache
 - page_pool

I like the page pool because we have an increasing amount of infra for
it, and page pool is already used in veth, which we can hopefully also
de-duplicate if we have a per-CPU one, one day. But I do agree that
it's not a perfect fit.

To answer Jesper's questions:
 ad1. cache size - we can lower the cache to match page_frag_cache, 
      so I think 8 entries? page frag cache can give us bigger frags 
      and therefore lower frag count, so that's a minus for using 
      page pool
 ad2. nl API - we can extend netlink to dump unbound page pools fairly
      easily, I didn't want to do it without a clear use case, but I
      don't think there are any blockers
 ad3. locking - a bit independent of allocator but fair point, we assume
      XDP generic or Rx path for now, so sirq context / bh locked out
 ad4. right, well, right, IDK what real workloads need, and whether 
      XDP generic should be optimized at all.. I personally lean
      towards "no"
 
Sorry if I haven't helped much to clarify the direction :)
I have no strong preference on question #2, I would prefer to not add
per-queue state for something that's in no way tied to the device
(question #1 -> per-CPU). 

You did good perf analysis of the options, could you share it here
again?
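
To make question #2 concrete, here is a minimal sketch of each
allocator's API (both as in net-next at the time; error handling
trimmed, names and sizes illustrative):

    /* Option A: page_frag_cache, the mechanism behind the per-CPU
     * netdev_alloc_cache/napi_alloc_cache. Hands out fragments of
     * a (possibly compound) page.
     */
    struct page_frag_cache nc = {};
    void *data = page_frag_alloc(&nc, fragsz, GFP_ATOMIC);
    /* ... use the fragment ... */
    page_frag_free(data);

    /* Option B: page_pool. Whole pages, with a recycling ptr_ring
     * and the growing stats/netlink infrastructure Jakub mentions.
     */
    struct page_pool_params pp_params = {
            .pool_size = 8,    /* small ring, per the ad1 suggestion */
            .nid = NUMA_NO_NODE,
    };
    struct page_pool *pp = page_pool_create(&pp_params);
    struct page *page = page_pool_dev_alloc_pages(pp);
    /* ... use the page ... */
    page_pool_put_full_page(pp, page, false);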
Lorenzo Bianconi Jan. 22, 2024, 11:19 a.m. UTC | #7
> On Wed, 17 Jan 2024 18:36:25 +0100 Lorenzo Bianconi wrote:
> > I would like to resume this activity, and it seems to me there is no clear
> > direction about where we should add the page_pool (in a per_cpu pointer or
> > in the netdev_rx_queue struct) or whether we can rely on page_frag_cache
> > instead.
> > 
> > @Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Hi Jakub,

> 
> Let's try to summarize. We want skb reallocation without linearization
> for XDP generic. We need some fast-ish way to get pages for the payload.

correct

> 
> First, options for placing the allocator:
>  - struct netdev_rx_queue
>  - per-CPU
> 
> IMO per-CPU has better scaling properties - you're less likely to
> increase the CPU count to infinity than spawn extra netdev queues.

ack

> 
> The second question is:
>  - page_frag_cache
>  - page_pool
> 
> I like the page pool because we have an increasing amount of infra for
> it, and page pool is already used in veth, which we can hopefully also
> de-duplicate if we have a per-CPU one, one day. But I do agree that
> it's not a perfect fit.
> 
> To answer Jesper's questions:
>  ad1. cache size - we can lower the cache to match page_frag_cache, 
>       so I think 8 entries? page frag cache can give us bigger frags 
>       and therefore lower frag count, so that's a minus for using 
>       page pool
>  ad2. nl API - we can extend netlink to dump unbound page pools fairly
>       easily, I didn't want to do it without a clear use case, but I
>       don't think there are any blockers
>  ad3. locking - a bit independent of allocator but fair point, we assume
>       XDP generic or Rx path for now, so sirq context / bh locked out
>  ad4. right, well, right, IDK what real workloads need, and whether 
>       XDP generic should be optimized at all.. I personally lean
>       towards "no"
>  
> Sorry if I haven't helped much to clarify the direction :)
> I have no strong preference on question #2, I would prefer to not add
> per-queue state for something that's in no way tied to the device
> (question #1 -> per-CPU). 

Relying on netdev_alloc_cache/napi_alloc_cache would have the upside of
reusing current infrastructure (IIRC my first revision used this approach).
The downside is we can't unify the code with the veth driver; it is the
other way around if we add per-cpu page_pools :). Anyway, I am fine with
having a per-cpu page_pool similar to netdev_alloc_cache/napi_alloc_cache.

@Jesper/Ilias: what do you think?
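
For reference, the existing per-CPU caches mentioned above are consumed
through wrappers such as napi_alloc_frag(); a minimal sketch, softirq
context assumed:

    /* Grab a fragment from the per-CPU napi_alloc_cache;
     * skb_free_frag() releases it when done.
     */
    void *data = napi_alloc_frag(fragsz);
    if (data)
            skb_free_frag(data);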

> 
> You did good perf analysis of the options, could you share it here
> again?
> 

copying them out from my previous tests:

v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11

- v00: iperf3 client (pinned on core 0)
- v11: iperf3 server (pinned on core 7)

net-next veth codebase (page_pool APIs):
=======================================
- MTU  1500: ~ 5.42 Gbps
- MTU  8000: ~ 14.1 Gbps
- MTU 64000: ~ 18.4 Gbps

net-next veth codebase + netdev_alloc_cache/napi_alloc_cache:
=============================================================
- MTU  1500: ~ 6.62 Gbps
- MTU  8000: ~ 14.7 Gbps
- MTU 64000: ~ 19.7 Gbps

xdp_generic codebase + netdev_alloc_cache/napi_alloc_cache:
===========================================================
- MTU  1500: ~ 6.41 Gbps
- MTU  8000: ~ 14.2 Gbps
- MTU 64000: ~ 19.8 Gbps

xdp_generic codebase + page_pool in netdev_rx_queue:
====================================================
- MTU  1500: ~ 5.75 Gbps
- MTU  8000: ~ 15.3 Gbps
- MTU 64000: ~ 21.2 Gbps

IIRC relying on a per-cpu page_pool gives similar results to adding them in
netdev_rx_queue, but I can test them again with an updated kernel.

Regards,
Lorenzo
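
A consumer of the pool introduced by the patch below (e.g. the
generic-XDP reallocation path in a later patch of this series, not shown
here) might look like this sketch; softirq context is assumed so the
lockless cache is safe to use:

    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    struct page *page;

    /* sd->page_pool is created in net_dev_init() with pool->cpuid
     * set to this CPU, so pages freed via napi_pp_put_page() on the
     * same CPU can be recycled directly into the hot cache.
     */
    page = page_pool_dev_alloc_pages(sd->page_pool);
    if (!page)
            return -ENOMEM;    /* or fall back to the page allocator */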

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1b935ee341b4..30b6a3f601fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3319,6 +3319,7 @@  struct softnet_data {
 	int			defer_count;
 	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
+	struct page_pool	*page_pool;
 	call_single_data_t	defer_csd;
 };
 
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 841e0a930bd7..6ae735804b40 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -401,4 +401,9 @@  static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
+static inline void page_pool_set_cpuid(struct page_pool *pool, int cpuid)
+{
+	pool->cpuid = cpuid;
+}
+
 #endif /* _NET_PAGE_POOL_HELPERS_H */
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 76481c465375..f63dadf2a6d4 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -128,6 +128,7 @@  struct page_pool_stats {
 struct page_pool {
 	struct page_pool_params_fast p;
 
+	int cpuid;
 	bool has_init_callback;
 
 	long frag_users;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0432b04cf9b0..d600e3a6ec2c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,8 @@ 
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -11670,12 +11672,33 @@  static void __init net_dev_struct_check(void)
  *
  */
 
+#define SD_PAGE_POOL_RING_SIZE	256
+static int net_sd_page_pool_alloc(struct softnet_data *sd, int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	struct page_pool_params page_pool_params = {
+		.pool_size = SD_PAGE_POOL_RING_SIZE,
+		.nid = NUMA_NO_NODE,
+	};
+
+	sd->page_pool = page_pool_create(&page_pool_params);
+	if (IS_ERR(sd->page_pool)) {
+		sd->page_pool = NULL;
+		return -ENOMEM;
+	}
+
+	page_pool_set_cpuid(sd->page_pool, cpuid);
+#endif
+	return 0;
+}
+
 /*
  *       This is called single threaded during boot, so no need
  *       to take the rtnl semaphore.
  */
 static int __init net_dev_init(void)
 {
+	struct softnet_data *sd;
 	int i, rc = -ENOMEM;
 
 	BUG_ON(!dev_boot_phase);
@@ -11701,10 +11724,10 @@  static int __init net_dev_init(void)
 
 	for_each_possible_cpu(i) {
 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
-		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		INIT_WORK(flush, flush_backlog);
 
+		sd = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
 #ifdef CONFIG_XFRM_OFFLOAD
@@ -11722,6 +11745,9 @@  static int __init net_dev_init(void)
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
+
+		if (net_sd_page_pool_alloc(sd, i))
+			goto out;
 	}
 
 	dev_boot_phase = 0;
@@ -11749,6 +11775,17 @@  static int __init net_dev_init(void)
 	WARN_ON(rc < 0);
 	rc = 0;
 out:
+	if (rc < 0) {
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (!sd->page_pool)
+				continue;
+
+			page_pool_destroy(sd->page_pool);
+			sd->page_pool = NULL;
+		}
+	}
+
 	return rc;
 }
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd5a72533f2b..275b8572a82b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -178,6 +178,11 @@  static int page_pool_init(struct page_pool *pool,
 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 
+	/* It is up to the consumer to set cpuid if we are using percpu
+	 * page_pool so initialize it to an invalid value.
+	 */
+	pool->cpuid = -1;
+
 	/* Validate only known flags were used */
 	if (pool->p.flags & ~(PP_FLAG_ALL))
 		return -EINVAL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b157efea5dea..4bc0a7f98241 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -918,9 +918,10 @@  bool napi_pp_put_page(struct page *page, bool napi_safe)
 	 */
 	if (napi_safe || in_softirq()) {
 		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
+		unsigned int cpuid = smp_processor_id();
 
-		allow_direct = napi &&
-			READ_ONCE(napi->list_owner) == smp_processor_id();
+		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
+		allow_direct |= (pp->cpuid == cpuid);
 	}
 
 	/* Driver set this to memory recycling info. Reset it on recycle.