[15/20] io_uring/zcrx: add copy fallback

Message ID	20231107214045.2172393-16-dw@davidwei.uk (mailing list archive)
State	New
Headers	show Received: from lindbergh.monkeyblade.net (lindbergh.monkeyblade.net [23.128.96.19]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 450202B2EE for <io-uring@vger.kernel.org>; Tue, 7 Nov 2023 21:41:11 +0000 (UTC) From: David Wei <dw@davidwei.uk> To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe <axboe@kernel.dk>, Pavel Begunkov <asml.silence@gmail.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jesper Dangaard Brouer <hawk@kernel.org>, David Ahern <dsahern@kernel.org>, Mina Almasry <almasrymina@google.com>, Willem de Bruijn <willemdebruijn.kernel@gmail.com>, Dragos Tatulea <dtatulea@nvidia.com> Subject: [PATCH 15/20] io_uring/zcrx: add copy fallback Date: Tue, 7 Nov 2023 13:40:40 -0800 Message-Id: <20231107214045.2172393-16-dw@davidwei.uk> In-Reply-To: <20231107214045.2172393-1-dw@davidwei.uk> References: <20231107214045.2172393-1-dw@davidwei.uk> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	Zero copy Rx using io_uring \| expand [RFC,v2,00/20] Zero copy Rx using io_uring [01/20] io_uring: add interface queue [02/20] io_uring: add mmap support for shared ifq ringbuffers [03/20] netdev: add XDP_SETUP_ZC_RX command [04/20] io_uring: setup ZC for an Rx queue when registering an ifq [05/20] io_uring/zcrx: implement socket registration [06/20] io_uring: add ZC buf and pool [07/20] io_uring: add ZC pool API [08/20] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() [09/20] io_uring: allocate a uarg for freeing zero copy skbs [10/20] io_uring: delay ZC pool destruction [11/20] net: add data pool [12/20] io_uring: add io_recvzc request [13/20] io_uring/zcrx: propagate ifq down the stack [14/20] io_uring/zcrx: introduce io_zc_get_rbuf_cqe [15/20] io_uring/zcrx: add copy fallback [16/20] net: execute custom callback from napi [17/20] io_uring/zcrx: copy fallback to ring buffers [18/20] veth: add support for io_uring zc rx [19/20] bnxt: use data pool [20/20] io_uring/zcrx: add multi socket support per Rx queue

Message ID

20231107214045.2172393-16-dw@davidwei.uk (mailing list archive)

State

New

Headers

From: David Wei <dw@davidwei.uk>
To: io-uring@vger.kernel.org,
	netdev@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>,
	Pavel Begunkov <asml.silence@gmail.com>,
	Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jesper Dangaard Brouer <hawk@kernel.org>,
	David Ahern <dsahern@kernel.org>,
	Mina Almasry <almasrymina@google.com>,
	Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	Dragos Tatulea <dtatulea@nvidia.com>
Subject: [PATCH 15/20] io_uring/zcrx: add copy fallback
Date: Tue,  7 Nov 2023 13:40:40 -0800
Message-Id: <20231107214045.2172393-16-dw@davidwei.uk>
In-Reply-To: <20231107214045.2172393-1-dw@davidwei.uk>
References: <20231107214045.2172393-1-dw@davidwei.uk>
Precedence: bulk
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Series

Zero copy Rx using io_uring | expand

Commit Message

David Wei Nov. 7, 2023, 9:40 p.m. UTC

From: Pavel Begunkov <asml.silence@gmail.com>

Currently, if the user fails to keep up with the network and doesn't
refill the buffer ring fast enough, the NIC/driver will start dropping
packets. That might be too punishing, so let's fall back to a
non-zerocopy version by allowing the driver to do normal kernel
allocations. Later, when we're in the task context doing
zc_rx_recv_skb(), we'll detect such pages and copy them into
user-specified buffers.

This patch implements the second (copy) part. It'll facilitate adoption
and help the user strike a balance between allocating the right amount
of zerocopy buffers and being resilient to surges in traffic.

Note, due to technical reasons, for now we're only using buffers from
->freelist, which is unreliable and is likely to fail over time. It'll
be revised in later patches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/zc_rx.c | 115 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index c1502ec3e629..c2ed600f0951 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -498,6 +498,26 @@  static void io_zc_rx_refill_cache(struct io_zc_rx_ifq *ifq, int count)
 	pool->cache_count += filled;
 }
 
+/*
+ * Pop one buffer off the pool freelist.
+ *
+ * The freelist is accessed under pool->freelist_lock with bottom halves
+ * disabled (spin_lock_bh). On success the buffer's refcount is set to 1.
+ *
+ * Returns the buffer, or NULL when the freelist is empty.
+ */
+static struct io_zc_rx_buf *io_zc_get_buf_task_safe(struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_rx_pool *pool = ifq->pool;
+	struct io_zc_rx_buf *buf;
+	u32 pgid;
+
+	/* Unlocked peek: skip taking the lock when the list looks empty. */
+	if (!READ_ONCE(pool->free_count))
+		return NULL;
+
+	spin_lock_bh(&pool->freelist_lock);
+	/* Re-check under the lock; the unlocked read above may be stale. */
+	if (!pool->free_count) {
+		buf = NULL;
+		goto unlock;
+	}
+	pgid = pool->freelist[--pool->free_count];
+	buf = &pool->bufs[pgid];
+	atomic_set(&buf->refcount, 1);
+unlock:
+	spin_unlock_bh(&pool->freelist_lock);
+	return buf;
+}
+
 struct io_zc_rx_buf *io_zc_rx_get_buf(struct io_zc_rx_ifq *ifq)
 {
 	struct io_zc_rx_pool *pool = ifq->pool;
@@ -576,6 +596,11 @@  static struct io_zc_rx_ifq *io_zc_rx_ifq_skb(struct sk_buff *skb)
 	return NULL;
 }
 
+/*
+ * Step ifq->cached_cq_tail back by one entry. zc_rx_copy_chunk() calls
+ * this after io_zc_get_rbuf_cqe() succeeded but no buffer could be found
+ * for the CQE.
+ */
+static inline void io_zc_return_rbuf_cqe(struct io_zc_rx_ifq *ifq)
+{
+	ifq->cached_cq_tail -= 1;
+}
+
 static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
 {
 	struct io_uring_rbuf_cqe *cqe;
@@ -595,6 +620,51 @@  static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *
 	return cqe;
 }
 
+/*
+ * Copy @len bytes starting at @data + @offset into buffers taken from the
+ * pool freelist, filling one rbuf CQE per buffer used. Each buffer
+ * receives at most PAGE_SIZE bytes.
+ *
+ * Returns the number of bytes copied when at least one byte was copied
+ * (this may be less than @len if CQEs or buffers run out part-way).
+ * Otherwise returns 0 for an empty request, or a negative errno:
+ *   -ENOBUFS  io_zc_get_rbuf_cqe() returned NULL
+ *   -ENOMEM   io_zc_get_buf_task_safe() found the freelist empty
+ */
+static ssize_t zc_rx_copy_chunk(struct io_zc_rx_ifq *ifq, void *data,
+				unsigned int offset, size_t len)
+{
+	size_t copy_size, copied = 0;
+	struct io_uring_rbuf_cqe *cqe;
+	struct io_zc_rx_buf *buf;
+	unsigned int pgid;
+	int ret = 0;
+	u8 *vaddr;
+
+	/*
+	 * @offset is the read position in @data and @len the number of bytes
+	 * still to copy, so the loop is driven by @len alone.
+	 */
+	while (len) {
+		cqe = io_zc_get_rbuf_cqe(ifq);
+		if (!cqe) {
+			/* Must be negative: callers test the result with < 0. */
+			ret = -ENOBUFS;
+			break;
+		}
+		buf = io_zc_get_buf_task_safe(ifq);
+		if (!buf) {
+			/* Undo the CQE grab: this rewinds cached_cq_tail. */
+			io_zc_return_rbuf_cqe(ifq);
+			ret = -ENOMEM;
+			break;
+		}
+
+		vaddr = kmap_local_page(buf->page);
+		copy_size = min_t(size_t, PAGE_SIZE, len);
+		memcpy(vaddr, data + offset, copy_size);
+		kunmap_local(vaddr);
+
+		/* The low 32 bits of page_private() are used as the page id. */
+		pgid = page_private(buf->page) & 0xffffffff;
+		/*
+		 * NOTE(review): presumably this takes a user reference before
+		 * the reference set by io_zc_get_buf_task_safe() is dropped --
+		 * confirm against io_zc_rx_get_buf_uref()/io_zc_rx_put_buf().
+		 */
+		io_zc_rx_get_buf_uref(ifq->pool, pgid);
+		io_zc_rx_put_buf(ifq, buf);
+
+		/* Data always starts at the beginning of the destination page. */
+		cqe->region = 0;
+		cqe->off = pgid * PAGE_SIZE;
+		cqe->len = copy_size;
+		cqe->flags = 0;
+
+		offset += copy_size;
+		len -= copy_size;
+		copied += copy_size;
+	}
+
+	/* Partial progress takes precedence over the error that stopped us. */
+	if (copied)
+		return copied;
+	return ret;
+}
+
 static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
 			   int off, int len, bool zc_skb)
 {
@@ -618,9 +688,21 @@  static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
 		cqe->len = len;
 		cqe->flags = 0;
 	} else {
-		/* TODO: copy frags that aren't backed by zc pages */
-		WARN_ON_ONCE(1);
-		return -ENOMEM;
+		u32 p_off, p_len, t, copied = 0;
+		u8 *vaddr;
+		int ret = 0;
+
+		skb_frag_foreach_page(frag, off, len,
+				      page, p_off, p_len, t) {
+			vaddr = kmap_local_page(page);
+			ret = zc_rx_copy_chunk(ifq, vaddr, p_off, p_len);
+			kunmap_local(vaddr);
+
+			if (ret < 0)
+				return copied ? copied : ret;
+			copied += ret;
+		}
+		len = copied;
 	}
 
 	return len;
@@ -633,7 +715,7 @@  zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 	struct io_zc_rx_ifq *ifq = desc->arg.data;
 	struct io_zc_rx_ifq *skb_ifq;
 	struct sk_buff *frag_iter;
-	unsigned start, start_off;
+	unsigned start, start_off = offset;
 	int i, copy, end, off;
 	bool zc_skb = true;
 	int ret = 0;
@@ -643,14 +725,27 @@  zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 		zc_skb = false;
 		if (WARN_ON_ONCE(skb_ifq))
 			return -EFAULT;
-		pr_debug("non zerocopy pages are not supported\n");
-		return -EFAULT;
 	}
-	start = skb_headlen(skb);
-	start_off = offset;
 
-	// TODO: copy payload in skb linear data */
-	WARN_ON_ONCE(offset < start);
+	if (unlikely(offset < skb_headlen(skb))) {
+		ssize_t copied;
+		size_t to_copy;
+
+		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
+		copied = zc_rx_copy_chunk(ifq, skb->data, offset, to_copy);
+		if (copied < 0) {
+			ret = copied;
+			goto out;
+		}
+		offset += copied;
+		len -= copied;
+		if (!len)
+			goto out;
+		if (offset != skb_headlen(skb))
+			goto out;
+	}
+
+	start = skb_headlen(skb);
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		const skb_frag_t *frag;

[15/20] io_uring/zcrx: add copy fallback

Commit Message

Patch