[RFC,v4,15/16] io_uring/zcrx: add copy fallback

Message ID	20240312214430.2923019-16-dw@davidwei.uk (mailing list archive)
State	RFC
Headers	show Received: from mail-pf1-f170.google.com (mail-pf1-f170.google.com [209.85.210.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3E8E414600D for <netdev@vger.kernel.org>; Tue, 12 Mar 2024 21:44:49 +0000 (UTC) From: David Wei <dw@davidwei.uk> To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe <axboe@kernel.dk>, Pavel Begunkov <asml.silence@gmail.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jesper Dangaard Brouer <hawk@kernel.org>, David Ahern <dsahern@kernel.org>, Mina Almasry <almasrymina@google.com> Subject: [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback Date: Tue, 12 Mar 2024 14:44:29 -0700 Message-ID: <20240312214430.2923019-16-dw@davidwei.uk> In-Reply-To: <20240312214430.2923019-1-dw@davidwei.uk> References: <20240312214430.2923019-1-dw@davidwei.uk> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	Zero copy Rx using io_uring \| expand [RFC,v4,00/16] Zero copy Rx using io_uring [RFC,v4,01/16] net: generalise pp provider params passing [RFC,v4,02/16] io_uring: delayed cqe commit [RFC,v4,03/16] net: page_pool: add ->scrub mem provider callback [RFC,v4,04/16] io_uring: separate header for exported net bits [RFC,v4,05/16] io_uring: introduce interface queue [RFC,v4,06/16] io_uring: add mmap support for shared ifq ringbuffers [RFC,v4,07/16] netdev: add XDP_SETUP_ZC_RX command [RFC,v4,08/16] io_uring: setup ZC for an Rx queue when registering an ifq [RFC,v4,09/16] io_uring/zcrx: implement socket registration [RFC,v4,10/16] io_uring: add zero copy buf representation and pool [RFC,v4,11/16] io_uring: implement pp memory provider for zc rx [RFC,v4,12/16] io_uring/zcrx: implement PP_FLAG_DMA_* handling [RFC,v4,13/16] io_uring: add io_recvzc request [RFC,v4,14/16] net: execute custom callback from napi [RFC,v4,15/16] io_uring/zcrx: add copy fallback [RFC,v4,16/16] veth: add support for io_uring zc rx

Message ID

20240312214430.2923019-16-dw@davidwei.uk (mailing list archive)

State

RFC

Headers

From: David Wei <dw@davidwei.uk>
To: io-uring@vger.kernel.org,
	netdev@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>,
	Pavel Begunkov <asml.silence@gmail.com>,
	Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jesper Dangaard Brouer <hawk@kernel.org>,
	David Ahern <dsahern@kernel.org>,
	Mina Almasry <almasrymina@google.com>
Subject: [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback
Date: Tue, 12 Mar 2024 14:44:29 -0700
Message-ID: <20240312214430.2923019-16-dw@davidwei.uk>
In-Reply-To: <20240312214430.2923019-1-dw@davidwei.uk>
References: <20240312214430.2923019-1-dw@davidwei.uk>
Precedence: bulk
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Series

Zero copy Rx using io_uring | expand

Context	Check	Description
netdev/tree_selection	success	Guessing tree name failed - patch did not apply, async

Context

Check

Description

netdev/tree_selection

success

Guessing tree name failed - patch did not apply, async

Commit Message

David Wei March 12, 2024, 9:44 p.m. UTC

Currently, if the user fails to keep up with the network and doesn't
refill the buffer ring fast enough, the NIC/driver will start dropping
packets. That might be too punishing. Add a fallback path that allows
drivers to allocate normal pages when there is starvation;
zc_rx_recv_skb() will then detect them and copy them into the
user-specified buffers when those become available.

That should help with adoption and also help the user strike the right
balance: allocating just the right amount of zerocopy buffers while
still being resilient to sudden surges in traffic.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/zc_rx.c | 111 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 6 deletions(-)

diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index bb9251111735..d5f49590e682 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -8,6 +8,7 @@ 
 #include <linux/nospec.h>
 
 #include <net/page_pool/helpers.h>
+#include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/af_unix.h>
 
@@ -26,6 +27,11 @@  struct io_zc_rx_args {
 	struct socket		*sock;
 };
 
+struct io_zc_refill_data {
+	struct io_zc_rx_ifq *ifq;	/* in: queue whose ->pp is allocated from */
+	struct io_zc_rx_buf *buf;	/* out: stored by io_napi_refill() on success */
+};
+
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
 static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq,
@@ -648,6 +654,34 @@  const struct memory_provider_ops io_uring_pp_zc_ops = {
 };
 EXPORT_SYMBOL(io_uring_pp_zc_ops);
 
+static void io_napi_refill(void *data)
+{
+	struct io_zc_refill_data *rd = data;
+	struct io_zc_rx_ifq *ifq = rd->ifq;
+	netmem_ref netmem;
+
+	if (WARN_ON_ONCE(!ifq->pp))
+		return;
+	/* napi_execute() callback: take one netmem from ifq->pp for the caller. */
+	netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
+	if (!netmem)
+		return;
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return;
+	/* Success is the only path that writes rd->buf; failures leave it as is. */
+	rd->buf = io_niov_to_buf(netmem_to_net_iov(netmem));
+}
+
+static struct io_zc_rx_buf *io_zc_get_buf_task_safe(struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_refill_data rd = {	/* in/out args for io_napi_refill() */
+		.ifq = ifq,	/* .buf starts out NULL (omitted initializer) */
+	};
+	/* NOTE(review): ifq->pp is dereferenced before io_napi_refill() NULL-checks it. */
+	napi_execute(ifq->pp->p.napi, io_napi_refill, &rd);
+	return rd.buf;	/* NULL unless io_napi_refill() stored a buffer */
+}
+
 static bool zc_rx_queue_cqe(struct io_kiocb *req, struct io_zc_rx_buf *buf,
 			   struct io_zc_rx_ifq *ifq, int off, int len)
 {
@@ -669,6 +703,42 @@  static bool zc_rx_queue_cqe(struct io_kiocb *req, struct io_zc_rx_buf *buf,
 	return true;
 }
 
+/* Copy @len bytes from @data + @offset into zc buffers, one CQE per buffer.
+ * Returns bytes copied; -ENOMEM or -ENOSPC only if nothing was copied. */
+static ssize_t zc_rx_copy_chunk(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
+				void *data, unsigned int offset, size_t len)
+{
+	size_t copy_size, copied = 0;
+	struct io_zc_rx_buf *buf;
+	u8 *vaddr;
+
+	while (len) {
+		buf = io_zc_get_buf_task_safe(ifq);
+		if (!buf)
+			return copied ? copied : -ENOMEM;
+
+		vaddr = kmap_local_page(buf->page);
+		copy_size = min_t(size_t, PAGE_SIZE, len);
+		memcpy(vaddr, data + offset, copy_size);
+		kunmap_local(vaddr);
+		/* Copied data always starts at offset 0 of the fresh buffer. */
+		if (!zc_rx_queue_cqe(req, buf, ifq, 0, copy_size)) {
+			napi_pp_put_page(net_iov_to_netmem(&buf->niov), false);
+			/* Earlier CQEs are already posted; report their bytes. */
+			return copied ? copied : -ENOSPC;
+		}
+
+		io_zc_rx_get_buf_uref(buf);
+		napi_pp_put_page(net_iov_to_netmem(&buf->niov), false);
+
+		offset += copy_size;
+		len -= copy_size;
+		copied += copy_size;
+	}
+
+	return copied;
+}
+
 static int zc_rx_recv_frag(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
 			   const skb_frag_t *frag, int off, int len)
 {
@@ -688,7 +758,22 @@  static int zc_rx_recv_frag(struct io_kiocb *req, struct io_zc_rx_ifq *ifq,
 			return -ENOSPC;
 		io_zc_rx_get_buf_uref(buf);
 	} else {
-		return -EOPNOTSUPP;
+		struct page *page = skb_frag_page(frag);
+		u32 p_off, p_len, t, copied = 0;
+		u8 *vaddr;
+		int ret = 0;
+
+		skb_frag_foreach_page(frag, off, len,
+				      page, p_off, p_len, t) {
+			vaddr = kmap_local_page(page);
+			ret = zc_rx_copy_chunk(req, ifq, vaddr, p_off, p_len);
+			kunmap_local(vaddr);
+
+			if (ret < 0)
+				return copied ? copied : ret;
+			copied += ret;
+		}
+		len = copied;
 	}
 
 	return len;
@@ -702,15 +787,29 @@  zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 	struct io_zc_rx_ifq *ifq = args->ifq;
 	struct io_kiocb *req = args->req;
 	struct sk_buff *frag_iter;
-	unsigned start, start_off;
+	unsigned start, start_off = offset;
 	int i, copy, end, off;
 	int ret = 0;
 
-	start = skb_headlen(skb);
-	start_off = offset;
+	if (unlikely(offset < skb_headlen(skb))) {
+		ssize_t copied;
+		size_t to_copy;
 
-	if (offset < start)
-		return -EOPNOTSUPP;
+		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
+		copied = zc_rx_copy_chunk(req, ifq, skb->data, offset, to_copy);
+		if (copied < 0) {
+			ret = copied;
+			goto out;
+		}
+		offset += copied;
+		len -= copied;
+		if (!len)
+			goto out;
+		if (offset != skb_headlen(skb))
+			goto out;
+	}
+
+	start = skb_headlen(skb);
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		const skb_frag_t *frag;

[RFC,v4,15/16] io_uring/zcrx: add copy fallback

Checks

Commit Message

Patch