
[v3,net-next,2/2] xdp: add multi-buff support for xdp running in generic mode

Message ID: c9ee1db92c8baa7806f8949186b43ffc13fa01ca.1701437962.git.lorenzo@kernel.org
State: Superseded
Delegated to: BPF
Series: add multi-buff support for xdp running in generic mode

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1123 this patch: 1123
netdev/cc_maintainers warning 3 maintainers not CCed: john.fastabend@gmail.com ast@kernel.org daniel@iogearbox.net
netdev/build_clang success Errors and warnings before: 1143 this patch: 1143
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1150 this patch: 1150
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-3 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-16 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-16 / build / build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Lorenzo Bianconi Dec. 1, 2023, 1:48 p.m. UTC
Similar to native xdp, do not always linearize the skb in the
netif_receive_generic_xdp routine, but create a non-linear xdp_buff to be
processed by the eBPF program. This allows adding multi-buffer support
for xdp running in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/core/dev.c | 151 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 126 insertions(+), 25 deletions(-)
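
To illustrate what this enables: a generic-XDP program can now see the full
non-linear packet, provided it opts in to multi-buff support (so that
prog->aux->xdp_has_frags is set). A minimal sketch of such a program follows;
it is not part of the patch and the names are illustrative:

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: a frags-aware program loaded from the "xdp.frags" section.
 * On a fragged xdp_buff, direct data/data_end access covers just the head,
 * so bpf_xdp_load_bytes() is used to read across fragment boundaries.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp.frags")
int xdp_mb_pass(struct xdp_md *ctx)
{
	__u8 eth[14];	/* enough for an Ethernet header */

	if (bpf_xdp_load_bytes(ctx, 0, eth, sizeof(eth)) < 0)
		return XDP_DROP;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";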

Comments

Jakub Kicinski Dec. 2, 2023, 3:48 a.m. UTC | #1
On Fri,  1 Dec 2023 14:48:26 +0100 Lorenzo Bianconi wrote:
> Similar to native xdp, do not always linearize the skb in the
> netif_receive_generic_xdp routine, but create a non-linear xdp_buff to be
> processed by the eBPF program. This allows adding multi-buffer support
> for xdp running in generic mode.

Hm. How close is the xdp generic code to veth?
I wonder if it'd make sense to create a page pool instance for each
core; we could then pass it into a common "reallocate skb into a
page-pool backed, fragged form" helper. Common between this code
and veth? Perhaps we could even get rid of the veth page pools
and use the per-CPU pools there?
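
For concreteness, a rough sketch of the shared helper being discussed
(hypothetical name; the copy logic mirrors veth_convert_skb_to_xdp_buff()
and the patch below, error paths trimmed):

/* Reallocate an skb into a page-pool backed, fragged form with
 * XDP_PACKET_HEADROOM; sketch only, not a posted patch.
 */
static struct sk_buff *skb_pp_realloc_fragged(struct page_pool *pool,
					      struct sk_buff *skb)
{
	u32 size = min_t(u32, skb->len,
			 SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM));
	u32 truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
	struct sk_buff *nskb;
	void *data;

	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return NULL;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return NULL;
	}

	skb_reserve(nskb, XDP_PACKET_HEADROOM);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	if (skb_copy_bits(skb, 0, nskb->data, size)) {
		consume_skb(nskb);
		return NULL;
	}
	skb_put(nskb, size);

	/* remaining bytes would be copied into page-pool frags here
	 * (page_pool_dev_alloc() + skb_add_rx_frag()), mirroring the
	 * frag loop in the patch below
	 */
	consume_skb(skb);
	return nskb;
}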
Lorenzo Bianconi Dec. 4, 2023, 3:43 p.m. UTC | #2
> On Fri,  1 Dec 2023 14:48:26 +0100 Lorenzo Bianconi wrote:
> > Similar to native xdp, do not always linearize the skb in the
> > netif_receive_generic_xdp routine, but create a non-linear xdp_buff to be
> > processed by the eBPF program. This allows adding multi-buffer support
> > for xdp running in generic mode.
> 
> Hm. How close is the xdp generic code to veth?

Actually they are quite close; the only difference is the use of the
page_pool vs the page_frag_cache APIs.

> I wonder if it'd make sense to create a page pool instance for each
> core; we could then pass it into a common "reallocate skb into a
> page-pool backed, fragged form" helper. Common between this code
> and veth? Perhaps we could even get rid of the veth page pools
> and use the per-CPU pools there?

yes, I was thinking about it actually.
I ran some preliminary tests to check if we are introducing any performance
penalties.
My setup relies on a couple of veth pairs and an eBPF program that performs
XDP_REDIRECT from one pair to the other. I am running the program in xdp
driver mode (not generic mode).

v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01    v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11

v00: iperf3 client
v11: iperf3 server

I ran the test with different MTU values (1500B, 8KB, 64KB).
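
(The redirect program itself is not posted here; it is along these lines,
with a placeholder peer ifindex:)

// SPDX-License-Identifier: GPL-2.0
/* Sketch of the kind of program used in this test (assumed, not the
 * actual one): bounce every frame to a fixed peer device.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define PEER_IFINDEX 4	/* placeholder: ifindex of the v10 peer */

SEC("xdp.frags")
int xdp_redirect_peer(struct xdp_md *ctx)
{
	/* bpf_redirect() returns XDP_REDIRECT on success */
	return bpf_redirect(PEER_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";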

net-next veth codebase:
=======================
- MTU  1500: iperf3 ~  4.37Gbps
- MTU  8000: iperf3 ~  9.75Gbps
- MTU 64000: iperf3 ~ 11.24Gbps

net-next veth codebase + page_frag_cache instead of page_pool:
==============================================================
- MTU  1500: iperf3 ~  4.99Gbps (+14%)
- MTU  8000: iperf3 ~  8.5Gbps  (-12%)
- MTU 64000: iperf3 ~ 11.9Gbps  ( +6%)

It seems there is no clear winner between page_pool and
page_frag_cache. What do you think?

Regards,
Lorenzo
Jakub Kicinski Dec. 4, 2023, 8:01 p.m. UTC | #3
On Mon, 4 Dec 2023 16:43:56 +0100 Lorenzo Bianconi wrote:
> yes, I was thinking about it actually.
> I ran some preliminary tests to check if we are introducing any performance
> penalties.
> My setup relies on a couple of veth pairs and an eBPF program that performs
> XDP_REDIRECT from one pair to the other. I am running the program in xdp
> driver mode (not generic mode).
> 
> v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01    v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11
> 
> v00: iperf3 client
> v11: iperf3 server
> 
> I ran the test with different MTU values (1500B, 8KB, 64KB).
> 
> net-next veth codebase:
> =======================
> - MTU  1500: iperf3 ~  4.37Gbps
> - MTU  8000: iperf3 ~  9.75Gbps
> - MTU 64000: iperf3 ~ 11.24Gbps
> 
> net-next veth codebase + page_frag_cache instead of page_pool:
> ==============================================================
> - MTU  1500: iperf3 ~  4.99Gbps (+14%)
> - MTU  8000: iperf3 ~  8.5Gbps  (-12%)
> - MTU 64000: iperf3 ~ 11.9Gbps  ( +6%)
> 
> It seems there is no clear winner between page_pool and
> page_frag_cache. What do you think?

Hm, interesting. Are the iperf processes running on different cores?
May be worth pinning (both same and different) to make sure the cache
effects are isolated.
Lorenzo Bianconi Dec. 5, 2023, 11:08 p.m. UTC | #4
> On Mon, 4 Dec 2023 16:43:56 +0100 Lorenzo Bianconi wrote:
> > yes, I was thinking about it actually.
> > I ran some preliminary tests to check if we are introducing any performance
> > penalties.
> > My setup relies on a couple of veth pairs and an eBPF program that performs
> > XDP_REDIRECT from one pair to the other. I am running the program in xdp
> > driver mode (not generic mode).
> > 
> > v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01    v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11
> > 
> > v00: iperf3 client
> > v11: iperf3 server
> > 
> > I ran the test with different MTU values (1500B, 8KB, 64KB).
> > 
> > net-next veth codebase:
> > =======================
> > - MTU  1500: iperf3 ~  4.37Gbps
> > - MTU  8000: iperf3 ~  9.75Gbps
> > - MTU 64000: iperf3 ~ 11.24Gbps
> > 
> > net-next veth codebase + page_frag_cache instead of page_pool:
> > ==============================================================
> > - MTU  1500: iperf3 ~  4.99Gbps (+14%)
> > - MTU  8000: iperf3 ~  8.5Gbps  (-12%)
> > - MTU 64000: iperf3 ~ 11.9Gbps  ( +6%)
> > 
> > It seems there is no clear winner between page_pool and
> > page_frag_cache. What do you think?
> 
> Hm, interesting. Are the iperf processes running on different cores?
> May be worth pinning (both same and different) to make sure the cache
> effects are isolated.

Hi Jakub,

Following your suggestion, I carried out some more tests today on both the
veth driver and the xdp_generic codebase (on a more powerful system).
Test setup:

v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11

- v00: iperf3 client (pinned on core 0)
- v11: iperf3 server (pinned on core 7)

net-next veth codebase (page_pool APIs):
=======================================
- MTU  1500: ~ 5.42 Gbps
- MTU  8000: ~ 14.1 Gbps
- MTU 64000: ~ 18.4 Gbps

net-next veth codebase + page_frag_cache APIs [0]:
=================================================
- MTU  1500: ~ 6.62 Gbps
- MTU  8000: ~ 14.7 Gbps
- MTU 64000: ~ 19.7 Gbps

xdp_generic codebase + page_frag_cache APIs (current proposed patch):
====================================================================
- MTU  1500: ~ 6.41 Gbps
- MTU  8000: ~ 14.2 Gbps
- MTU 64000: ~ 19.8 Gbps

xdp_generic codebase + page_frag_cache APIs [1]:
===============================================
- MTU  1500: ~ 5.75 Gbps
- MTU  8000: ~ 15.3 Gbps
- MTU 64000: ~ 21.2 Gbps

It seems the page_pool APIs work better for the xdp_generic codebase
(except in the MTU 1500 case) while the page_frag_cache APIs are better
for the veth driver. What do you think? Am I missing something?

Regards,
Lorenzo

[0] Here I have just used napi_alloc_frag() instead of
page_pool_dev_alloc_va()/page_pool_dev_alloc() in
veth_convert_skb_to_xdp_buff()
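
i.e. roughly this (sketch, the actual diff was not posted):

-		data = page_pool_dev_alloc_va(rq->page_pool, &truesize);
+		data = napi_alloc_frag(truesize);

with the matching free path swapped accordingly.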

[1] I developed this PoC to use page_pool APIs for xdp_generic code:

diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index cdcafb30d437..5115b61f38f1 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -21,6 +21,7 @@ struct netdev_rx_queue {
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
 #endif
+	struct page_pool		*page_pool;
 } ____cacheline_aligned_in_smp;
 
 /*
diff --git a/net/core/dev.c b/net/core/dev.c
index ed827b443d48..06fb568427c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,8 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -4964,6 +4966,7 @@ static int netif_skb_check_for_generic_xdp(struct sk_buff **pskb,
 	 */
 	if (skb_cloned(skb) || skb_shinfo(skb)->nr_frags ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		struct netdev_rx_queue *rxq = netif_get_rxqueue(skb);
 		u32 mac_len = skb->data - skb_mac_header(skb);
 		u32 size, truesize, len, max_head_size, off;
 		struct sk_buff *nskb;
@@ -4978,18 +4981,19 @@ static int netif_skb_check_for_generic_xdp(struct sk_buff **pskb,
 
 		size = min_t(u32, skb->len, max_head_size);
 		truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
-		data = napi_alloc_frag(truesize);
+		data = page_pool_dev_alloc_va(rxq->page_pool, &truesize);
 		if (!data)
 			return -ENOMEM;
 
 		nskb = napi_build_skb(data, truesize);
 		if (!nskb) {
-			skb_free_frag(data);
+			page_pool_free_va(rxq->page_pool, data, true);
 			return -ENOMEM;
 		}
 
 		skb_reserve(nskb, XDP_PACKET_HEADROOM);
 		skb_copy_header(nskb, skb);
+		skb_mark_for_recycle(nskb);
 
 		err = skb_copy_bits(skb, 0, nskb->data, size);
 		if (err) {
@@ -5005,18 +5009,21 @@ static int netif_skb_check_for_generic_xdp(struct sk_buff **pskb,
 		len = skb->len - off;
 		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
 			struct page *page;
+			u32 page_off;
 
 			size = min_t(u32, len, PAGE_SIZE);
-			data = napi_alloc_frag(size);
+			truesize = size;
+			page = page_pool_dev_alloc(rxq->page_pool, &page_off,
+						   &truesize);
-			if (!data) {
+			if (!page) {
 				consume_skb(nskb);
 				return -ENOMEM;
 			}
 
-			page = virt_to_head_page(data);
-			skb_add_rx_frag(nskb, i, page,
-					data - page_address(page), size, size);
-			err = skb_copy_bits(skb, off, data, size);
+			skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+			err = skb_copy_bits(skb, off,
+					    page_address(page) + page_off,
+					    size);
 			if (err) {
 				consume_skb(nskb);
 				return err;
@@ -10057,6 +10064,11 @@ EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
+	struct page_pool_params page_pool_params = {
+		.pool_size = 256,
+		.nid = NUMA_NO_NODE,
+		.dev = &dev->dev,
+	};
 	struct netdev_rx_queue *rx;
 	size_t sz = count * sizeof(*rx);
 	int err = 0;
@@ -10075,14 +10087,25 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 		/* XDP RX-queue setup */
 		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
 		if (err < 0)
-			goto err_rxq_info;
+			goto err_rxq;
+
+		/* rx queue page pool allocator */
+		rx[i].page_pool = page_pool_create(&page_pool_params);
+		if (IS_ERR(rx[i].page_pool)) {
+			rx[i].page_pool = NULL;
+			goto err_rxq;
+		}
 	}
 	return 0;
 
-err_rxq_info:
+err_rxq:
 	/* Rollback successful reg's and free other resources */
-	while (i--)
+	while (i--) {
 		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+		if (rx[i].page_pool)
+			page_pool_destroy(rx[i].page_pool);
+	}
+
 	kvfree(dev->_rx);
 	dev->_rx = NULL;
 	return err;
@@ -10096,8 +10119,11 @@ static void netif_free_rx_queues(struct net_device *dev)
 	if (!dev->_rx)
 		return;
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
+		if (dev->_rx[i].page_pool)
+			page_pool_destroy(dev->_rx[i].page_pool);
+	}
 
 	kvfree(dev->_rx);
 }
Jakub Kicinski Dec. 5, 2023, 11:58 p.m. UTC | #5
On Wed, 6 Dec 2023 00:08:15 +0100 Lorenzo Bianconi wrote:
> v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11
> 
> - v00: iperf3 client (pinned on core 0)
> - v11: iperf3 server (pinned on core 7)
> 
> net-next veth codebase (page_pool APIs):
> =======================================
> - MTU  1500: ~ 5.42 Gbps
> - MTU  8000: ~ 14.1 Gbps
> - MTU 64000: ~ 18.4 Gbps
> 
> net-next veth codebase + page_frag_cache APIs [0]:
> =================================================
> - MTU  1500: ~ 6.62 Gbps
> - MTU  8000: ~ 14.7 Gbps
> - MTU 64000: ~ 19.7 Gbps
> 
> xdp_generic codebase + page_frag_cache APIs (current proposed patch):
> ====================================================================
> - MTU  1500: ~ 6.41 Gbps
> - MTU  8000: ~ 14.2 Gbps
> - MTU 64000: ~ 19.8 Gbps
> 
> xdp_generic codebase + page_frag_cache APIs [1]:
> ===============================================

This one should say page pool?

> - MTU  1500: ~ 5.75 Gbps
> - MTU  8000: ~ 15.3 Gbps
> - MTU 64000: ~ 21.2 Gbps
> 
> It seems the page_pool APIs work better for the xdp_generic codebase
> (except in the MTU 1500 case) while the page_frag_cache APIs are better
> for the veth driver. What do you think? Am I missing something?

IDK the details of veth XDP very well but IIUC they are pretty much 
the same. Are there any clues in perf -C 0 / 7?

> [0] Here I have just used napi_alloc_frag() instead of
> page_pool_dev_alloc_va()/page_pool_dev_alloc() in
> veth_convert_skb_to_xdp_buff()
> 
> [1] I developed this PoC to use page_pool APIs for xdp_generic code:

Why not put the page pool in softnet_data?
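
A minimal sketch of that direction (assumed field and function names, not
a posted patch): one pool per CPU, created at init time and used by the
generic-XDP copy path instead of napi_alloc_frag():

/* hypothetical: struct softnet_data grows a "struct page_pool *page_pool"
 * member; one pool is created per possible CPU.
 */
static int netdev_init_percpu_page_pool(int cpu)
{
	struct page_pool_params params = {
		.pool_size = 256,
		.nid = cpu_to_mem(cpu),
	};
	struct softnet_data *sd = &per_cpu(softnet_data, cpu);

	sd->page_pool = page_pool_create(&params);	/* assumed field */
	return PTR_ERR_OR_ZERO(sd->page_pool);
}

This would presumably be called for_each_possible_cpu() from net_dev_init(),
with the copy path then allocating from this_cpu_ptr(&softnet_data)->page_pool.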
Jesper Dangaard Brouer Dec. 6, 2023, 12:41 p.m. UTC | #6
On 12/6/23 00:58, Jakub Kicinski wrote:
> On Wed, 6 Dec 2023 00:08:15 +0100 Lorenzo Bianconi wrote:
>> v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11
>>
>> - v00: iperf3 client (pinned on core 0)
>> - v11: iperf3 server (pinned on core 7)
>>
>> net-next veth codebase (page_pool APIs):
>> =======================================
>> - MTU  1500: ~ 5.42 Gbps
>> - MTU  8000: ~ 14.1 Gbps
>> - MTU 64000: ~ 18.4 Gbps
>>
>> net-next veth codebase + page_frag_cache APIs [0]:
>> =================================================
>> - MTU  1500: ~ 6.62 Gbps
>> - MTU  8000: ~ 14.7 Gbps
>> - MTU 64000: ~ 19.7 Gbps
>>
>> xdp_generic codebase + page_frag_cache APIs (current proposed patch):
>> ====================================================================
>> - MTU  1500: ~ 6.41 Gbps
>> - MTU  8000: ~ 14.2 Gbps
>> - MTU 64000: ~ 19.8 Gbps
>>
>> xdp_generic codebase + page_frag_cache APIs [1]:
>> ===============================================
> 
> This one should say page pool?
> 
>> - MTU  1500: ~ 5.75 Gbps
>> - MTU  8000: ~ 15.3 Gbps
>> - MTU 64000: ~ 21.2 Gbps
>>
>> It seems the page_pool APIs work better for the xdp_generic codebase
>> (except in the MTU 1500 case) while the page_frag_cache APIs are better
>> for the veth driver. What do you think? Am I missing something?
> 
> IDK the details of veth XDP very well but IIUC they are pretty much
> the same. Are there any clues in perf -C 0 / 7?
> 
>> [0] Here I have just used napi_alloc_frag() instead of
>> page_pool_dev_alloc_va()/page_pool_dev_alloc() in
>> veth_convert_skb_to_xdp_buff()
>>
>> [1] I developed this PoC to use page_pool APIs for xdp_generic code:
> 
> Why not put the page pool in softnet_data?

First I thought: cool that Jakub is suggesting softnet_data, which will
make page_pool (PP) even more central as the netstack's memory layer.

BUT then I realized that PP has a weakness, which is the return/free
path that needs to take a normal spin_lock, as it can be called from
any CPU (unlike the RX/alloc case).  Thus, I fear that making multiple
devices share a page_pool via softnet_data increases the chance of lock
contention when packets are "freed"/returned/recycled.

--Jesper

p.s. PP has the page_pool_put_page_bulk() API, but only XDP
(NIC-drivers) leverage this.
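
For reference, a sketch of how that bulk API amortizes the ptr_ring
spin_lock over a whole batch instead of taking it per page (illustrative
caller, not from a real driver):

static void recycle_batch(struct page_pool *pool, struct page **pages,
			  int count)
{
	/* one lock acquisition for the whole batch */
	page_pool_put_page_bulk(pool, (void **)pages, count);
}

Wiring something like this into the skb free path, rather than only
xdp_return_frame_bulk(), would be the missing piece.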
Lorenzo Bianconi Dec. 6, 2023, 1:51 p.m. UTC | #7
> 
> 
> On 12/6/23 00:58, Jakub Kicinski wrote:
> > On Wed, 6 Dec 2023 00:08:15 +0100 Lorenzo Bianconi wrote:
> > > v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11
> > > 
> > > - v00: iperf3 client (pinned on core 0)
> > > - v11: iperf3 server (pinned on core 7)
> > > 
> > > net-next veth codebase (page_pool APIs):
> > > =======================================
> > > - MTU  1500: ~ 5.42 Gbps
> > > - MTU  8000: ~ 14.1 Gbps
> > > - MTU 64000: ~ 18.4 Gbps
> > > 
> > > net-next veth codebase + page_frag_cache APIs [0]:
> > > =================================================
> > > - MTU  1500: ~ 6.62 Gbps
> > > - MTU  8000: ~ 14.7 Gbps
> > > - MTU 64000: ~ 19.7 Gbps
> > > 
> > > xdp_generic codebase + page_frag_cache APIs (current proposed patch):
> > > ====================================================================
> > > - MTU  1500: ~ 6.41 Gbps
> > > - MTU  8000: ~ 14.2 Gbps
> > > - MTU 64000: ~ 19.8 Gbps
> > > 
> > > xdp_generic codebase + page_frag_cache APIs [1]:
> > > ===============================================
> > 
> > This one should say page pool?

yep, sorry

> > 
> > > - MTU  1500: ~ 5.75 Gbps
> > > - MTU  8000: ~ 15.3 Gbps
> > > - MTU 64000: ~ 21.2 Gbps
> > > 
> > > It seems the page_pool APIs work better for the xdp_generic codebase
> > > (except in the MTU 1500 case) while the page_frag_cache APIs are better
> > > for the veth driver. What do you think? Am I missing something?
> > 
> > IDK the details of veth XDP very well but IIUC they are pretty much
> > the same. Are there any clues in perf -C 0 / 7?
> > 
> > > [0] Here I have just used napi_alloc_frag() instead of
> > > page_pool_dev_alloc_va()/page_pool_dev_alloc() in
> > > veth_convert_skb_to_xdp_buff()
> > > 
> > > [1] I developed this PoC to use page_pool APIs for xdp_generic code:
> > 
> > Why not put the page pool in softnet_data?
> 
> First I thought: cool that Jakub is suggesting softnet_data, which will
> make page_pool (PP) even more central as the netstack's memory layer.
> 
> BUT then I realized that PP has a weakness, which is the return/free
> path that needs to take a normal spin_lock, as it can be called from
> any CPU (unlike the RX/alloc case).  Thus, I fear that making multiple
> devices share a page_pool via softnet_data increases the chance of lock
> contention when packets are "freed"/returned/recycled.

yep, afaik skb_attempt_defer_free() is used just by the tcp stack so far
(e.g. we will have contention for udp).

Moreover, it seems the page_pool return path is not really optimized for the
percpu approach (we have a lot of atomic read/write operations, and page_pool
stats are already implemented as percpu variables).

Regards,
Lorenzo

> 
> --Jesper
> 
> p.s. PP has the page_pool_put_page_bulk() API, but only XDP (NIC-drivers)
> leverage this.
Jakub Kicinski Dec. 6, 2023, 4:03 p.m. UTC | #8
On Wed, 6 Dec 2023 13:41:49 +0100 Jesper Dangaard Brouer wrote:
> BUT then I realized that PP has a weakness, which is the return/free
> path that needs to take a normal spin_lock, as it can be called from
> any CPU (unlike the RX/alloc case).  Thus, I fear that making multiple
> devices share a page_pool via softnet_data increases the chance of lock
> contention when packets are "freed"/returned/recycled.

I was thinking we can add a pcpu CPU ID to the page pool so that
napi_pp_put_page() has a chance to realize that it's on the "right CPU"
and feed the cache directly.
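
A rough sketch of that idea (hypothetical field and check, not a posted
patch):

static bool pp_put_page_cpu_cached(struct page_pool *pool, struct page *page)
{
	/* pool->cpuid is the assumed new field recording the CPU that
	 * owns the pool's lockless alloc cache
	 */
	if (READ_ONCE(pool->cpuid) == smp_processor_id()) {
		page_pool_recycle_direct(pool, page);	/* no spin_lock */
		return true;
	}
	return false;	/* caller falls back to the locked ptr_ring path */
}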
Lorenzo Bianconi Dec. 9, 2023, 7:23 p.m. UTC | #9
> On Wed, 6 Dec 2023 13:41:49 +0100 Jesper Dangaard Brouer wrote:
> > BUT then I realized that PP has a weakness, which is the return/free
> > path that needs to take a normal spin_lock, as it can be called from
> > any CPU (unlike the RX/alloc case).  Thus, I fear that making multiple
> > devices share a page_pool via softnet_data increases the chance of lock
> > contention when packets are "freed"/returned/recycled.
> 
> I was thinking we can add a pcpu CPU ID to the page pool so that
> napi_pp_put_page() has a chance to realize that it's on the "right CPU"
> and feed the cache directly.

Are we going to use these page_pools just for virtual devices (e.g. veth) or
even for hw NICs? If we do not bind the page_pool to a netdevice I think we
can't rely on it to DMA map/unmap the buffer, right?
Moreover, are we going to rework page_pool stats first? It seems a bit weird to
have a percpu struct with a percpu pointer in it, right?

Regards,
Lorenzo
Jakub Kicinski Dec. 11, 2023, 5 p.m. UTC | #10
On Sat, 9 Dec 2023 20:23:09 +0100 Lorenzo Bianconi wrote:
> Are we going to use these page_pools just for virtual devices (e.g. veth) or
> even for hw NICs? If we do not bind the page_pool to a netdevice I think we
> can't rely on it to DMA map/unmap the buffer, right?

Right, I don't think it's particularly useful for HW NICs.
Maybe for allocating skb heads? We could possibly kill
struct page_frag_1k and use PP page / frag instead.
But not sure how Eric would react :)

> Moreover, are we going to rework page_pool stats first? It seems a bit weird to
> have a percpu struct with a percpu pointer in it, right?

The per-CPU stuff is for recycling, IIRC. Even if PP is for a single
CPU we can still end up freeing packets which used its pages anywhere
in the system.

I don't disagree that we may end up with a lot of stats on a large
system, but seems tangential to per-cpu page pools.
Paolo Abeni Dec. 12, 2023, 8:36 a.m. UTC | #11
On Mon, 2023-12-11 at 09:00 -0800, Jakub Kicinski wrote:
> On Sat, 9 Dec 2023 20:23:09 +0100 Lorenzo Bianconi wrote:
> > Are we going to use these page_pools just for virtual devices (e.g. veth) or
> > even for hw NICs? If we do not bind the page_pool to a netdevice I think we
> > can't rely on it to DMA map/unmap the buffer, right?
> 
> Right, I don't think it's particularly useful for HW NICs.
> Maybe for allocating skb heads? We could possibly kill
> struct page_frag_1k and use PP page / frag instead.
> But not sure how Eric would react :)

Side note here: we have a dedicated kmem_cache for typical skb head
allocation since commit bf9f1baa279f0758dc2297080360c5a616843927, where
Eric mentioned we could possibly remove the page_frag_1k after that
(on my todo list since forever, sorry).

Cheers,

Paolo

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index 4df68d7f04a2..ed827b443d48 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 			 skb_headlen(skb) + mac_len, true);
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
 
 	orig_data_end = xdp->data_end;
 	orig_data = xdp->data;
@@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 		skb->len += off; /* positive on grow, negative on shrink */
 	}
 
+	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+	 */
+	if (xdp_buff_has_frags(xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	/* check if XDP changed eth hdr such SKB needs update */
 	eth = (struct ethhdr *)xdp->data;
 	if ((orig_eth_type != eth->h_proto) ||
@@ -4915,54 +4929,141 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
-static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
-				     struct xdp_buff *xdp,
-				     struct bpf_prog *xdp_prog)
+static int netif_skb_check_for_generic_xdp(struct sk_buff **pskb,
+					   struct bpf_prog *prog)
 {
 	struct sk_buff *skb = *pskb;
-	u32 act = XDP_DROP;
-
-	/* Reinjected packets coming from act_mirred or similar should
-	 * not get XDP generic processing.
-	 */
-	if (skb_is_redirected(skb))
-		return XDP_PASS;
+	int err;
 
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
+	/* XDP does not support fraglist so we need to linearize
+	 * the skb.
 	 */
-	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
-	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags) {
 		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 		int troom = skb->tail + skb->data_len - skb->end;
 
 		/* In case we have to go down the path and also linearize,
 		 * then lets do the pskb_expand_head() work just once here.
 		 */
-		if (pskb_expand_head(skb,
-				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-			goto do_drop;
-		if (skb_linearize(skb))
-			goto do_drop;
+		err = pskb_expand_head(skb,
+				       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+		if (err)
+			return err;
+
+		err = skb_linearize(skb);
+		if (err)
+			return err;
+
+		return 0;
 	}
 
-	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+	 * bytes. This is the guarantee that also native XDP provides,
+	 * thus we need to do it here as well.
+	 */
+	if (skb_cloned(skb) || skb_shinfo(skb)->nr_frags ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		u32 mac_len = skb->data - skb_mac_header(skb);
+		u32 size, truesize, len, max_head_size, off;
+		struct sk_buff *nskb;
+		int i, head_off;
+		void *data;
+
+		__skb_push(skb, mac_len);
+		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
+						  XDP_PACKET_HEADROOM);
+		if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+			return -ENOMEM;
+
+		size = min_t(u32, skb->len, max_head_size);
+		truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
+		data = napi_alloc_frag(truesize);
+		if (!data)
+			return -ENOMEM;
+
+		nskb = napi_build_skb(data, truesize);
+		if (!nskb) {
+			skb_free_frag(data);
+			return -ENOMEM;
+		}
+
+		skb_reserve(nskb, XDP_PACKET_HEADROOM);
+		skb_copy_header(nskb, skb);
+
+		err = skb_copy_bits(skb, 0, nskb->data, size);
+		if (err) {
+			consume_skb(nskb);
+			return err;
+		}
+		skb_put(nskb, size);
+
+		head_off = skb_headroom(nskb) - skb_headroom(skb);
+		skb_headers_offset_update(nskb, head_off);
+
+		off = size;
+		len = skb->len - off;
+		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+			struct page *page;
+
+			size = min_t(u32, len, PAGE_SIZE);
+			data = napi_alloc_frag(size);
+			if (!data) {
+				consume_skb(nskb);
+				return -ENOMEM;
+			}
+
+			page = virt_to_head_page(data);
+			skb_add_rx_frag(nskb, i, page,
+					data - page_address(page), size, size);
+			err = skb_copy_bits(skb, off, data, size);
+			if (err) {
+				consume_skb(nskb);
+				return err;
+			}
+
+			len -= size;
+			off += size;
+		}
+
+		consume_skb(skb);
+		*pskb = nskb;
+		__skb_pull(nskb, mac_len);
+	}
+
+	return 0;
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
+				     struct xdp_buff *xdp,
+				     struct bpf_prog *xdp_prog)
+{
+	u32 act = XDP_DROP;
+
+	/* Reinjected packets coming from act_mirred or similar should
+	 * not get XDP generic processing.
+	 */
+	if (skb_is_redirected(*pskb))
+		return XDP_PASS;
+
+	if (netif_skb_check_for_generic_xdp(pskb, xdp_prog))
+		goto do_drop;
+
+	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
 	case XDP_PASS:
 		break;
 	default:
-		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_ABORTED:
-		trace_xdp_exception(skb->dev, xdp_prog, act);
+		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_DROP:
 	do_drop:
-		kfree_skb(skb);
+		kfree_skb(*pskb);
 		break;
 	}