[v1,12/15] io_uring: add OP_RECV_ZC command.

Message ID: 20221108050521.3198458-13-jonathan.lemon@gmail.com
Series: zero-copy RX for io_uring

Commit Message

Jonathan Lemon Nov. 8, 2022, 5:05 a.m. UTC
The recvzc opcode uses a metadata buffer, supplied either directly
via buf/len or indirectly from the buffer group.  This buffer is
filled with an array of io_uring_zctap_iov structures, which point
to the received data in user memory.

The opcode uses addr3 with an encoded format:

    addr3 = (readlen << 32) | (copy_bgid << 16);

Readlen specifies the maximum amount of data to read from the
socket.  The amount of data returned is also limited by the number
of iovs that the metadata area can hold.
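
For illustration, a minimal userspace sketch of filling an SQE under
this scheme (the helper name is hypothetical and not part of this
patch; sqe->addr/len carry the metadata buffer directly, or
IOSQE_BUFFER_SELECT can pick one from a buffer group instead):

    /* illustrative only: prepare a RECV_ZC SQE with the encoding above */
    static void prep_recv_zc(struct io_uring_sqe *sqe, int sockfd,
                             void *meta, unsigned int meta_len,
                             unsigned int readlen, unsigned int copy_bgid)
    {
            memset(sqe, 0, sizeof(*sqe));
            sqe->opcode = IORING_OP_RECV_ZC;
            sqe->fd = sockfd;
            sqe->addr = (unsigned long)meta;    /* metadata buffer */
            sqe->len = meta_len;                /* room for zctap_iov entries */
            sqe->addr3 = ((__u64)readlen << 32) | ((__u64)copy_bgid << 16);
    }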

As a fallback, if the desired skb data is not already present in
user memory, then a separate buffer is obtained from the copy_bgid
group, the data is copied into it, and the copy is described by a
returned iov entry.

This may happen due to system misconfiguration, imprecise header
splitting, running the RECV_ZC opcode without hardware zero-copy
support, or other network stack follies.
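
On completion the CQE result is the number of metadata bytes that
were filled in, so userspace can walk the buffer as an array.  A
sketch of the consumer side (io_uring_zctap_iov field names as used
in this series; region_base() and consume() are hypothetical
helpers):

    /* illustrative only: walk the returned iov array */
    struct io_uring_zctap_iov *zov = meta;
    unsigned int i, n = cqe->res / sizeof(*zov);

    for (i = 0; i < n; i++) {
            /* entries with bgid == copy_bgid describe copied fallback data */
            unsigned char *data = region_base(zov[i].bgid, zov[i].bid);

            consume(data + zov[i].off, zov[i].len);
    }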

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 include/uapi/linux/io_uring.h |   1 +
 io_uring/net.c                | 123 ++++++++++++
 io_uring/opdef.c              |  15 ++
 io_uring/zctap.c              | 340 ++++++++++++++++++++++++++++++++++
 io_uring/zctap.h              |  20 ++
 5 files changed, 499 insertions(+)

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 88f01bda12be..6d20dfbf5bb1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -215,6 +215,7 @@  enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_RECV_ZC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index 15dea91625e2..ef5ed6002751 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -16,6 +16,7 @@ 
 #include "net.h"
 #include "notif.h"
 #include "rsrc.h"
+#include "zctap.h"
 
 #if defined(CONFIG_NET)
 struct io_shutdown {
@@ -67,6 +68,12 @@  struct io_sr_msg {
 	struct io_kiocb 		*notif;
 };
 
+struct io_recvzc {
+	struct io_sr_msg		sr;
+	u32				datalen;
+	u16				copy_bgid;
+};
+
 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
 
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -908,6 +915,122 @@  int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	u64 recvzc_cmd;
+
+	/* use addr3 in order to leverage io_recvmsg_prep */
+	recvzc_cmd = READ_ONCE(sqe->addr3);
+
+	if (recvzc_cmd & 0xffff)
+		return -EINVAL;
+	zc->copy_bgid = (recvzc_cmd >> 16) & 0xffff;
+	zc->datalen = recvzc_cmd >> 32;
+
+	return io_recvmsg_prep(req, sqe);
+}
+
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	struct zctap_read_desc zrd;
+	struct msghdr msg;
+	struct socket *sock;
+	struct iovec iov;
+	unsigned int cflags;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	size_t len = zc->sr.len;
+
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->sr.flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+retry_multishot:
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+		zc->sr.buf = buf;
+	}
+
+	ret = import_single_range(READ, zc->sr.buf, len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		goto out_free;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_get_inq = 1;
+	msg.msg_flags = 0;
+	msg.msg_controllen = 0;
+	msg.msg_iocb = NULL;
+	msg.msg_ubuf = NULL;
+
+	flags = zc->sr.msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
+	zrd = (struct zctap_read_desc) {
+		.iov_limit = msg_data_left(&msg),
+		.recv_limit = zc->datalen,
+		.iter = &msg.msg_iter,
+		.ctx = req->ctx,
+		.copy_bgid = zc->copy_bgid,
+	};
+
+	ret = io_zctap_recv(sock, &zrd, &msg, flags);
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			zc->sr.len -= ret;
+			zc->sr.buf += ret;
+			zc->sr.done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+out_free:
+		req_set_fail(req);
+	}
+
+	if (ret > 0)
+		ret += zc->sr.done_io;
+	else if (zc->sr.done_io)
+		ret = zc->sr.done_io;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	cflags = io_put_kbuf(req, issue_flags);
+	if (msg.msg_inq)
+		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+	if (!io_recv_finish(req, &ret, cflags, ret <= 0))
+		goto retry_multishot;
+
+	return ret;
+}
+
 void io_send_zc_cleanup(struct io_kiocb *req)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 83dc0f9ad3b2..14b42811a78e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@ 
 #include "poll.h"
 #include "cancel.h"
 #include "rw.h"
+#include "zctap.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -521,6 +522,20 @@  const struct io_op_def io_op_defs[] = {
 		.fail			= io_sendrecv_fail,
 #else
 		.prep			= io_eopnotsupp_prep,
+#endif
+	},
+	[IORING_OP_RECV_ZC] = {
+		.name			= "RECV_ZC",
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.buffer_select		= 1,
+		.ioprio			= 1,
+#if defined(CONFIG_NET)
+		.prep			= io_recvzc_prep,
+		.issue			= io_recvzc,
+#else
+		.prep			= io_eopnotsupp_prep,
 #endif
 	},
 };
diff --git a/io_uring/zctap.c b/io_uring/zctap.c
index 10d74b8f7cef..096b3dd5a8a3 100644
--- a/io_uring/zctap.c
+++ b/io_uring/zctap.c
@@ -7,6 +7,7 @@ 
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/nospec.h>
+#include <net/tcp.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -53,6 +54,11 @@  static u64 zctap_page_info(const struct page *page)
 	return page_private(page);
 }
 
+static u16 zctap_page_region_id(const struct page *page)
+{
+	return (zctap_page_info(page) >> 16) & 0xffff;
+}
+
 static u16 zctap_page_id(const struct page *page)
 {
 	return zctap_page_info(page) & 0xffff;
@@ -72,6 +78,14 @@  static bool zctap_page_ours(struct page *page)
 #define IO_ZCTAP_UREF		0x10000
 #define IO_ZCTAP_KREF_MASK	(IO_ZCTAP_UREF - 1)
 
+static void io_zctap_get_buf_uref(struct ifq_region *ifr, u16 pgid)
+{
+	if (WARN_ON(pgid >= ifr->nr_pages))
+		return;
+
+	atomic_add(IO_ZCTAP_UREF, &ifr->buf[pgid].refcount);
+}
+
 /* return user refs back, indicate whether buffer is reusable */
 static bool io_zctap_put_buf_uref(struct io_zctap_buf *buf)
 {
@@ -396,6 +410,18 @@  static void io_zctap_ifq_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	}
 }
 
+static struct io_zctap_ifq *io_zctap_skb_ifq(struct sk_buff *skb)
+{
+	struct io_zctap_ifq_priv *priv;
+	struct ubuf_info *uarg = skb_zcopy(skb);
+
+	if (uarg && uarg->callback == io_zctap_ifq_callback) {
+		priv = container_of(uarg, struct io_zctap_ifq_priv, uarg);
+		return &priv->ifq;
+	}
+	return NULL;
+}
+
 static struct io_zctap_ifq *io_zctap_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zctap_ifq_priv *priv;
@@ -492,3 +518,317 @@  void io_unregister_zctap_all(struct io_ring_ctx *ctx)
 	for (i = 0; i < NR_ZCTAP_IFQS; i++)
 		io_unregister_zctap_ifq(ctx, i);
 }
+
+static int __zctap_get_user_buffer(struct zctap_read_desc *zrd, int len)
+{
+	if (!zrd->buflen) {
+		zrd->req = (struct io_kiocb) {
+			.ctx = zrd->ctx,
+			.buf_index = zrd->copy_bgid,
+		};
+
+		zrd->buf = (u8 *)io_zctap_buffer(&zrd->req, &zrd->buflen);
+		zrd->offset = 0;
+	}
+	return len > zrd->buflen ? zrd->buflen : len;
+}
+
+static int zctap_copy_data(struct zctap_read_desc *zrd, int len, u8 *kaddr)
+{
+	struct io_uring_zctap_iov zov;
+	u32 space;
+	int err;
+
+	space = zrd->iov_space + sizeof(zov);
+	if (space > zrd->iov_limit)
+		return 0;
+
+	len = __zctap_get_user_buffer(zrd, len);
+	if (!len)
+		return -ENOBUFS;
+
+	err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+	if (err)
+		return -EFAULT;
+
+	zov = (struct io_uring_zctap_iov) {
+		.off = zrd->offset,
+		.len = len,
+		.bgid = zrd->copy_bgid,
+		.bid = zrd->req.buf_index,
+	};
+
+	if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+		return -EFAULT;
+
+	zrd->offset += len;
+	zrd->buflen -= len;
+	zrd->iov_space = space;
+
+	return len;
+}
+
+static int zctap_copy_frag(struct zctap_read_desc *zrd, struct page *page,
+			   int off, int len, struct io_uring_zctap_iov *zov)
+{
+	u8 *kaddr;
+	int err;
+
+	len = __zctap_get_user_buffer(zrd, len);
+	if (!len)
+		return -ENOBUFS;
+
+	kaddr = kmap(page) + off;
+	err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+	kunmap(page);
+
+	if (err)
+		return -EFAULT;
+
+	*zov = (struct io_uring_zctap_iov) {
+		.off = zrd->offset,
+		.len = len,
+		.bgid = zrd->copy_bgid,
+		.bid = zrd->req.buf_index,
+	};
+
+	zrd->offset += len;
+	zrd->buflen -= len;
+
+	return len;
+}
+
+static int zctap_recv_frag(struct zctap_read_desc *zrd,
+			   struct io_zctap_ifq *ifq,
+			   const skb_frag_t *frag, int off, int len)
+{
+	struct io_uring_zctap_iov zov;
+	struct page *page;
+	u32 space;
+	int pgid;
+
+	space = zrd->iov_space + sizeof(zov);
+	if (space > zrd->iov_limit)
+		return 0;
+
+	page = skb_frag_page(frag);
+	off += skb_frag_off(frag);
+
+	if (likely(ifq && ifq->ctx == zrd->ctx && zctap_page_ours(page))) {
+		pgid = zctap_page_id(page);
+		io_zctap_get_buf_uref(ifq->region, pgid);
+		zov = (struct io_uring_zctap_iov) {
+			.off = off,
+			.len = len,
+			.bgid = zctap_page_region_id(page),
+			.bid = pgid,
+		};
+	} else {
+		len = zctap_copy_frag(zrd, page, off, len, &zov);
+		if (len <= 0)
+			return len;
+	}
+
+	if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+		return -EFAULT;
+
+	zrd->iov_space = space;
+
+	return len;
+}
+
+/* Our version of __skb_datagram_iter  -- should work for UDP also. */
+static int
+zctap_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+	       unsigned int offset, size_t len)
+{
+	struct zctap_read_desc *zrd = desc->arg.data;
+	struct io_zctap_ifq *ifq;
+	unsigned start, start_off;
+	struct sk_buff *frag_iter;
+	int i, copy, end, off;
+	int ret = 0;
+
+	if (zrd->iov_space >= zrd->iov_limit) {
+		desc->count = 0;
+		return 0;
+	}
+	if (len > zrd->recv_limit)
+		len = zrd->recv_limit;
+
+	start = skb_headlen(skb);
+	start_off = offset;
+
+	ifq = io_zctap_skb_ifq(skb);
+
+	if (offset < start) {
+		copy = start - offset;
+		if (copy > len)
+			copy = len;
+
+		/* copy out linear data */
+		ret = zctap_copy_data(zrd, copy, skb->data + offset);
+		if (ret < 0)
+			goto out;
+		offset += ret;
+		len -= ret;
+		if (len == 0 || ret != copy)
+			goto out;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag;
+
+		WARN_ON(start > offset + len);
+
+		frag = &skb_shinfo(skb)->frags[i];
+		end = start + skb_frag_size(frag);
+
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zctap_recv_frag(zrd, ifq, frag, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zctap_recv_skb(desc, frag_iter, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+out:
+	if (offset == start_off)
+		return ret;
+	return offset - start_off;
+}
+
+static int __io_zctap_tcp_read(struct sock *sk, struct zctap_read_desc *zrd)
+{
+	read_descriptor_t rd_desc = {
+		.arg.data = zrd,
+		.count = 1,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, zctap_recv_skb);
+}
+
+static int io_zctap_tcp_recvmsg(struct sock *sk, struct zctap_read_desc *zrd,
+				int flags, int *addr_len)
+{
+	size_t used;
+	long timeo;
+	int ret;
+
+	ret = used = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	while (zrd->recv_limit) {
+		ret = __io_zctap_tcp_read(sk, zrd);
+		if (ret < 0)
+			break;
+		if (!ret) {
+			if (used)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			sk_wait_data(sk, &timeo, NULL);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		zrd->recv_limit -= ret;
+		used += ret;
+
+		if (!timeo)
+			break;
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	/* XXX, handle timestamping */
+
+	if (used)
+		return used;
+
+	return ret;
+}
+
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+		  struct msghdr *msg, unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+	int addr_len = 0;
+	int ret;
+
+	if (flags & MSG_ERRQUEUE)
+		return -EOPNOTSUPP;
+
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->recvmsg != tcp_recvmsg)
+		return -EPROTONOSUPPORT;
+
+	sock_rps_record_flow(sk);
+
+	ret = io_zctap_tcp_recvmsg(sk, zrd, flags, &addr_len);
+	if (ret >= 0) {
+		msg->msg_namelen = addr_len;
+		ret = zrd->iov_space;
+	}
+	return ret;
+}
diff --git a/io_uring/zctap.h b/io_uring/zctap.h
index bb44f8e972e8..4db516707d19 100644
--- a/io_uring/zctap.h
+++ b/io_uring/zctap.h
@@ -2,10 +2,30 @@ 
 #ifndef IOU_ZCTAP_H
 #define IOU_ZCTAP_H
 
+struct zctap_read_desc {
+	struct io_ring_ctx *ctx;
+	struct iov_iter *iter;
+	u32 iov_space;
+	u32 iov_limit;
+	u32 recv_limit;
+
+	struct io_kiocb req;
+	u8 *buf;
+	size_t offset;
+	size_t buflen;
+
+	u16 copy_bgid;			/* XXX move to register ifq? */
+};
+
 int io_register_ifq(struct io_ring_ctx *ctx,
 		    struct io_uring_ifq_req __user *arg);
 void io_unregister_zctap_all(struct io_ring_ctx *ctx);
 
 int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id);
 
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+		  struct msghdr *msg, unsigned int flags);
+
 #endif