@@ -63,6 +63,10 @@ struct io_uring_sqe {
union {
__s32 splice_fd_in;
__u32 file_index;
+ struct {
+ /* notification slot index; read by the IORING_OP_SENDZC prep handler */
+ __u16 notification_idx;
+ __u16 __pad;
+ };
};
union {
struct {
@@ -194,6 +198,7 @@ enum io_uring_op {
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
IORING_OP_URING_CMD,
+ IORING_OP_SENDZC,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -13,6 +13,7 @@
#include "io_uring.h"
#include "kbuf.h"
#include "net.h"
+#include "notif.h"
#if defined(CONFIG_NET)
struct io_shutdown {
@@ -58,6 +59,15 @@ struct io_sr_msg {
unsigned int flags;
};
+/*
+ * Per-request command data for IORING_OP_SENDZC. Every field except @file
+ * is filled in by io_sendzc_prep() from the submission queue entry.
+ */
+struct io_sendzc {
+	/* NOTE(review): presumably has to stay the first member, as in the
+	 * other io_uring command structs - confirm. */
+	struct file *file;
+	/* user buffer to send, taken from sqe->addr */
+	void __user *buf;
+	/* number of bytes to send, taken from sqe->len */
+	size_t len;
+	/* notification slot index, taken from sqe->notification_idx */
+	u16 slot_idx;
+	/* sqe->msg_flags with MSG_NOSIGNAL (and possibly MSG_CMSG_COMPAT) added */
+	unsigned int msg_flags;
+	/* IORING_RECVSEND_* flags, taken from sqe->ioprio */
+	unsigned int flags;
+};
+
#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -652,6 +662,90 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+/*
+ * Prepare an IORING_OP_SENDZC request: validate the SQE and copy the send
+ * parameters into the request's io_sendzc command data.
+ *
+ * Returns 0 on success, or -EINVAL when sqe->addr2, sqe->__pad2[0] or
+ * sqe->addr3 is non-zero, or when sqe->ioprio carries any flag other than
+ * IORING_RECVSEND_POLL_FIRST.
+ */
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+
+	/* SQE fields this opcode does not use have to be zero */
+	if (READ_ONCE(sqe->addr2))
+		return -EINVAL;
+	if (READ_ONCE(sqe->__pad2[0]))
+		return -EINVAL;
+	if (READ_ONCE(sqe->addr3))
+		return -EINVAL;
+
+	/* the send flags travel in the ioprio field */
+	zc->flags = READ_ONCE(sqe->ioprio);
+	if ((zc->flags & IORING_RECVSEND_POLL_FIRST) != zc->flags)
+		return -EINVAL;
+
+	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	zc->len = READ_ONCE(sqe->len);
+	zc->slot_idx = READ_ONCE(sqe->notification_idx);
+
+	/* MSG_NOSIGNAL is always added to the caller's message flags */
+	zc->msg_flags = READ_ONCE(sqe->msg_flags);
+	zc->msg_flags |= MSG_NOSIGNAL;
+	if (zc->msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	return 0;
+}
+
+/*
+ * Issue an IORING_OP_SENDZC request: send zc->len bytes from the user buffer
+ * zc->buf on the request's socket with MSG_ZEROCOPY set, attaching the
+ * notification obtained from slot zc->slot_idx as the message's ubuf.
+ *
+ * Returns IOU_OK after storing the sock_sendmsg() result in the request.
+ * Otherwise returns:
+ *   -EAGAIN    IORING_RECVSEND_POLL_FIRST is set and the request has not
+ *              been polled yet, the request was issued with
+ *              IO_URING_F_UNLOCKED, or a nonblocking send returned -EAGAIN
+ *   -ENOTSOCK  the request's file is not a socket
+ *   -EINVAL    io_get_notif_slot() found no slot for zc->slot_idx
+ *   -ENOMEM    io_get_notif() returned no notification
+ *   the error from import_single_range(), or the sock_sendmsg() result when
+ *   it is below the required minimum (-ERESTARTSYS is reported as -EINTR).
+ */
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_notif_slot *notif_slot;
+	struct io_notif *notif;
+	struct msghdr msg;
+	struct iovec iov;
+	struct socket *sock;
+	unsigned msg_flags;
+	int ret, min_ret = 0;
+
+	/* poll-first: bounce the request until it has gone through polling */
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	/*
+	 * NOTE(review): presumably the notification slot handling below needs
+	 * the ring lock held, hence unlocked issue is refused - confirm.
+	 */
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return -EAGAIN;
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
+	if (!notif_slot)
+		return -EINVAL;
+	notif = io_get_notif(ctx, notif_slot);
+	if (!notif)
+		return -ENOMEM;
+
+	/* plain send: no destination address and no control data */
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+
+	ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		return ret;
+
+	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		msg_flags |= MSG_DONTWAIT;
+	/* with MSG_WAITALL anything short of the full buffer is a failure */
+	if (msg_flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
+	msg.msg_flags = msg_flags;
+	/* hand the notification's ubuf to the network stack */
+	msg.msg_ubuf = &notif->uarg;
+	msg.sg_from_iter = NULL;
+	ret = sock_sendmsg(sock, &msg);
+
+	if (unlikely(ret < min_ret)) {
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+			return -EAGAIN;
+		return ret == -ERESTARTSYS ? -EINTR : ret;
+	}
+
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = io_kiocb_to_cmd(req);
@@ -40,4 +40,8 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags);
int io_connect_prep_async(struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
+
+/* issue and prep handlers for IORING_OP_SENDZC */
+int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+
#endif
@@ -470,6 +470,21 @@ const struct io_op_def io_op_defs[] = {
.issue = io_uring_cmd,
.prep_async = io_uring_cmd_prep_async,
},
+ [IORING_OP_SENDZC] = {
+ .name = "SENDZC",
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollout = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ /* the real handlers exist only with CONFIG_NET; otherwise prep is io_eopnotsupp_prep and no issue handler is set */
+#if defined(CONFIG_NET)
+ .prep = io_sendzc_prep,
+ .issue = io_sendzc,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+
+ },
};
const char *io_uring_get_opcode(u8 opcode)
Add a new io_uring opcode, IORING_OP_SENDZC. The main distinction from
IORING_OP_SEND is that the user should specify a notification slot index in
sqe::notification_idx, and the buffers are safe to reuse only after the
notification used for the request has been flushed and has completed.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/uapi/linux/io_uring.h |  5 ++
 io_uring/net.c                | 94 +++++++++++++++++++++++++++++++++++
 io_uring/net.h                |  4 ++
 io_uring/opdef.c              | 15 ++++++
 4 files changed, 118 insertions(+)