[v6,14/15] io_uring/zcrx: add copy fallback

Message ID 20241016185252.3746190-15-dw@davidwei.uk (mailing list archive)
State New
Series io_uring zero copy rx

Commit Message

David Wei Oct. 16, 2024, 6:52 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

There are scenarios in which the zerocopy path might get a normal
in-kernel buffer: it could be a mis-steered packet or simply the linear
part of an skb. Another use case is to allow the driver to allocate
kernel pages when it's out of zc buffers, which makes it more resilient
to spikes in load and allows the user to choose the balance between the
amount of memory provided and performance.

At the moment we fail such requests. Instead, grab a buffer from the
page pool, copy the data there, and return it to the user in the usual
way. Because the refill ring is private to the napi instance our page
pool is running from, this is done by stopping the napi via the
napi_execute() helper. It grabs only one buffer at a time, which is
inefficient; improving that is left for follow-up patches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/zcrx.c | 133 +++++++++++++++++++++++++++++++++++++++++++++---
 io_uring/zcrx.h |   1 +
 2 files changed, 127 insertions(+), 7 deletions(-)
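
In outline, the fallback path added here works as follows. The sketch
below is a condensed rendering of io_zcrx_copy_chunk() and its helpers
from the diff further down (the function name is illustrative and the
surrounding skb iteration is omitted); it is not a drop-in
implementation:

/*
 * Condensed sketch of the copy fallback flow; see io_zcrx_copy_chunk()
 * in the diff below for the actual code.
 */
static ssize_t io_zcrx_copy_sketch(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				   void *data, unsigned int offset, size_t len)
{
	size_t copied = 0;

	while (len) {
		/* grab one zcrx buffer; the allocation runs in napi context
		 * via napi_execute(), keeping the private refill ring safe */
		struct net_iov *niov = io_zc_get_buf_task_safe(ifq);
		size_t chunk = min_t(size_t, PAGE_SIZE, len);
		u8 *vaddr;

		if (!niov)
			return copied ? copied : -ENOMEM;

		/* copy the in-kernel data into the user-mapped zcrx page */
		vaddr = kmap_local_page(io_zcrx_iov_page(niov));
		memcpy(vaddr, data + offset, chunk);
		kunmap_local(vaddr);

		/* complete to userspace exactly like a zerocopy buffer */
		if (!io_zcrx_queue_cqe(req, niov, ifq, 0, chunk)) {
			napi_pp_put_page(net_iov_to_netmem(niov));
			return copied ? copied : -ENOSPC;
		}
		io_zcrx_get_buf_uref(niov);
		napi_pp_put_page(net_iov_to_netmem(niov));

		offset += chunk;
		len -= chunk;
		copied += chunk;
	}
	return copied;
}

The two call sites in the diff feed their data through this loop: the
skb linear area in io_zcrx_recv_skb() and any non-net_iov frag in
io_zcrx_recv_frag().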

Comments

Paolo Abeni Oct. 21, 2024, 2:40 p.m. UTC | #1
On 10/16/24 20:52, David Wei wrote:
> @@ -540,6 +562,34 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
>  	.scrub			= io_pp_zc_scrub,
>  };
>  
> +static void io_napi_refill(void *data)
> +{
> +	struct io_zc_refill_data *rd = data;
> +	struct io_zcrx_ifq *ifq = rd->ifq;
> +	netmem_ref netmem;
> +
> +	if (WARN_ON_ONCE(!ifq->pp))
> +		return;
> +
> +	netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
> +	if (!netmem)
> +		return;
> +	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
> +		return;
> +
> +	rd->niov = netmem_to_net_iov(netmem);
> +}
> +
> +static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq)
> +{
> +	struct io_zc_refill_data rd = {
> +		.ifq = ifq,
> +	};
> +
> +	napi_execute(ifq->napi_id, io_napi_refill, &rd);

Under UDP flood the above has unbounded/unlimited execution time, unless
you set NAPI_STATE_PREFER_BUSY_POLL. Is the allocation scheme here
somehow preventing such an unlimited wait?

Thanks,

Paolo
David Wei Oct. 21, 2024, 6:31 p.m. UTC | #2
On 2024-10-21 07:40, Paolo Abeni wrote:
> On 10/16/24 20:52, David Wei wrote:
>> @@ -540,6 +562,34 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
>>  	.scrub			= io_pp_zc_scrub,
>>  };
>>  
>> +static void io_napi_refill(void *data)
>> +{
>> +	struct io_zc_refill_data *rd = data;
>> +	struct io_zcrx_ifq *ifq = rd->ifq;
>> +	netmem_ref netmem;
>> +
>> +	if (WARN_ON_ONCE(!ifq->pp))
>> +		return;
>> +
>> +	netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
>> +	if (!netmem)
>> +		return;
>> +	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
>> +		return;
>> +
>> +	rd->niov = netmem_to_net_iov(netmem);
>> +}
>> +
>> +static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq)
>> +{
>> +	struct io_zc_refill_data rd = {
>> +		.ifq = ifq,
>> +	};
>> +
>> +	napi_execute(ifq->napi_id, io_napi_refill, &rd);
> 
> Under UDP flood the above has unbounded/unlimited execution time, unless
> you set NAPI_STATE_PREFER_BUSY_POLL. Is the allocation scheme here
> somehow preventing such an unlimited wait?

Hi Paolo. Do you mean that under UDP flood, napi_execute() will have
unbounded execution time because napi_state_start_busy_polling() and
need_resched() will always return false? My understanding is that
need_resched() will eventually kick the caller task out of
napi_execute().

> 
> Thanks,
> 
> Paolo
>
Paolo Abeni Oct. 22, 2024, 7:48 a.m. UTC | #3
On 10/21/24 20:31, David Wei wrote:
> On 2024-10-21 07:40, Paolo Abeni wrote:
>> On 10/16/24 20:52, David Wei wrote:
>>> @@ -540,6 +562,34 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
>>>  	.scrub			= io_pp_zc_scrub,
>>>  };
>>>  
>>> +static void io_napi_refill(void *data)
>>> +{
>>> +	struct io_zc_refill_data *rd = data;
>>> +	struct io_zcrx_ifq *ifq = rd->ifq;
>>> +	netmem_ref netmem;
>>> +
>>> +	if (WARN_ON_ONCE(!ifq->pp))
>>> +		return;
>>> +
>>> +	netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
>>> +	if (!netmem)
>>> +		return;
>>> +	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
>>> +		return;
>>> +
>>> +	rd->niov = netmem_to_net_iov(netmem);
>>> +}
>>> +
>>> +static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq)
>>> +{
>>> +	struct io_zc_refill_data rd = {
>>> +		.ifq = ifq,
>>> +	};
>>> +
>>> +	napi_execute(ifq->napi_id, io_napi_refill, &rd);
>>
>> Under UDP flood the above has unbounded/unlimited execution time, unless
>> you set NAPI_STATE_PREFER_BUSY_POLL. Is the allocation scheme here
>> somehow preventing such an unlimited wait?
> 
> Hi Paolo. Do you mean that under UDP flood, napi_execute() will have
> unbounded execution time because napi_state_start_busy_polling() and
> need_resched() will always return false? My understanding is that
> need_resched() will eventually kick the caller task out of
> napi_execute().

Sorry for the short reply. Let's try to consolidate this discussion on
patch 8, which is closely related and has the relevant code more handy.

Thanks,

Paolo
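
The exit conditions at stake in this exchange follow the shape of the
existing busy-poll loop in net/core/dev.c. The fragment below is a
simplified illustration of the mainline napi_busy_loop() pattern only;
napi_execute() is introduced in patch 8 of this series, is not shown in
this patch, and whether it terminates promptly under flood is exactly
the question left open above.

	/*
	 * Simplified illustration of the mainline busy-poll loop shape
	 * (after napi_busy_loop() in net/core/dev.c); NOT the
	 * napi_execute() helper from patch 8.
	 */
	for (;;) {
		work = napi_poll(napi, budget);	/* drive the napi instance */

		if (!loop_end || loop_end(loop_end_arg, start_time))
			break;			/* caller's work is done */

		if (unlikely(need_resched())) {
			/* reschedule and re-check before spinning again;
			 * this is the exit David is relying on above */
			cond_resched();
			if (loop_end(loop_end_arg, start_time))
				break;
			continue;
		}
		cpu_relax();
	}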

Patch

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 3f4625730dbd..1f4db70e3370 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -5,6 +5,8 @@ 
 #include <linux/nospec.h>
 #include <linux/netdevice.h>
 #include <linux/io_uring.h>
+#include <linux/skbuff_ref.h>
+#include <net/busy_poll.h>
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
 #include <trace/events/page_pool.h>
@@ -28,6 +30,11 @@  struct io_zcrx_args {
 	struct socket		*sock;
 };
 
+struct io_zc_refill_data {
+	struct io_zcrx_ifq *ifq;
+	struct net_iov *niov;
+};
+
 static const struct memory_provider_ops io_uring_pp_zc_ops;
 
 static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
@@ -37,6 +44,13 @@  static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
 	return container_of(owner, struct io_zcrx_area, nia);
 }
 
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	return area->pages[net_iov_idx(niov)];
+}
+
 static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx)
 {
 	struct netdev_rx_queue *rxq;
@@ -59,6 +73,13 @@  static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx)
 	ret = netdev_rx_queue_restart(ifq->dev, ifq->if_rxq);
 	if (ret)
 		goto fail;
+
+	if (WARN_ON_ONCE(!ifq->pp)) {
+		ret = -EFAULT;
+		goto fail;
+	}
+	/* grab napi_id while still under rtnl */
+	ifq->napi_id = ifq->pp->p.napi->napi_id;
 	return 0;
 fail:
 	rxq->mp_params.mp_ops = NULL;
@@ -526,6 +547,7 @@  static void io_pp_zc_destroy(struct page_pool *pp)
 	page_pool_mp_release_area(pp, &ifq->area->nia);
 
 	ifq->pp = NULL;
+	ifq->napi_id = 0;
 
 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
 		return;
@@ -540,6 +562,34 @@  static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.scrub			= io_pp_zc_scrub,
 };
 
+static void io_napi_refill(void *data)
+{
+	struct io_zc_refill_data *rd = data;
+	struct io_zcrx_ifq *ifq = rd->ifq;
+	netmem_ref netmem;
+
+	if (WARN_ON_ONCE(!ifq->pp))
+		return;
+
+	netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
+	if (!netmem)
+		return;
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return;
+
+	rd->niov = netmem_to_net_iov(netmem);
+}
+
+static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq)
+{
+	struct io_zc_refill_data rd = {
+		.ifq = ifq,
+	};
+
+	napi_execute(ifq->napi_id, io_napi_refill, &rd);
+	return rd.niov;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off, int len)
 {
@@ -563,6 +613,45 @@  static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 	return true;
 }
 
+static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
+				  void *data, unsigned int offset, size_t len)
+{
+	size_t copy_size, copied = 0;
+	int ret = 0, off = 0;
+	struct page *page;
+	u8 *vaddr;
+
+	do {
+		struct net_iov *niov;
+
+		niov = io_zc_get_buf_task_safe(ifq);
+		if (!niov) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		page = io_zcrx_iov_page(niov);
+		vaddr = kmap_local_page(page);
+		copy_size = min_t(size_t, PAGE_SIZE, len);
+		memcpy(vaddr, data + offset, copy_size);
+		kunmap_local(vaddr);
+
+		if (!io_zcrx_queue_cqe(req, niov, ifq, off, copy_size)) {
+			napi_pp_put_page(net_iov_to_netmem(niov));
+			return -ENOSPC;
+		}
+
+		io_zcrx_get_buf_uref(niov);
+		napi_pp_put_page(net_iov_to_netmem(niov));
+
+		offset += copy_size;
+		len -= copy_size;
+		copied += copy_size;
+	} while (offset < len);
+
+	return copied ? copied : ret;
+}
+
 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			     const skb_frag_t *frag, int off, int len)
 {
@@ -570,8 +659,24 @@  static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 
 	off += skb_frag_off(frag);
 
-	if (unlikely(!skb_frag_is_net_iov(frag)))
-		return -EOPNOTSUPP;
+	if (unlikely(!skb_frag_is_net_iov(frag))) {
+		struct page *page = skb_frag_page(frag);
+		u32 p_off, p_len, t, copied = 0;
+		u8 *vaddr;
+		int ret = 0;
+
+		skb_frag_foreach_page(frag, off, len,
+				      page, p_off, p_len, t) {
+			vaddr = kmap_local_page(page);
+			ret = io_zcrx_copy_chunk(req, ifq, vaddr, p_off, p_len);
+			kunmap_local(vaddr);
+
+			if (ret < 0)
+				return copied ? copied : ret;
+			copied += ret;
+		}
+		return copied;
+	}
 
 	niov = netmem_to_net_iov(frag->netmem);
 	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
@@ -592,15 +697,29 @@  io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 	struct io_zcrx_ifq *ifq = args->ifq;
 	struct io_kiocb *req = args->req;
 	struct sk_buff *frag_iter;
-	unsigned start, start_off;
+	unsigned start, start_off = offset;
 	int i, copy, end, off;
 	int ret = 0;
 
-	start = skb_headlen(skb);
-	start_off = offset;
+	if (unlikely(offset < skb_headlen(skb))) {
+		ssize_t copied;
+		size_t to_copy;
 
-	if (offset < start)
-		return -EOPNOTSUPP;
+		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
+		copied = io_zcrx_copy_chunk(req, ifq, skb->data, offset, to_copy);
+		if (copied < 0) {
+			ret = copied;
+			goto out;
+		}
+		offset += copied;
+		len -= copied;
+		if (!len)
+			goto out;
+		if (offset != skb_headlen(skb))
+			goto out;
+	}
+
+	start = skb_headlen(skb);
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		const skb_frag_t *frag;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index d3f6b6cdd647..5d7920972e95 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -39,6 +39,7 @@  struct io_zcrx_ifq {
 
 	u32				if_rxq;
 	netdevice_tracker		netdev_tracker;
+	unsigned			napi_id;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)