@@ -22,6 +22,11 @@ enum io_uring_cmd_flags {
IO_URING_F_IOPOLL = (1 << 10),
};
+union io_uring_fused_cmd_data {
+ /* secondary list, fused cmd private, drivers must not touch it */
+ struct io_kiocb *__secondary;
+};
+
struct io_uring_cmd {
struct file *file;
const void *cmd;
@@ -33,7 +38,15 @@ struct io_uring_cmd {
};
u32 cmd_op;
u32 flags;
- u8 pdu[32]; /* available inline for free use */
+
+ /* for fused commands, the available pdu space is a bit smaller */
+ union {
+ struct {
+ union io_uring_fused_cmd_data data;
+ u8 pdu[24]; /* available inline for free use */
+ } fused;
+ u8 pdu[32]; /* available inline for free use */
+ };
};
#if defined(CONFIG_IO_URING)
@@ -407,6 +407,7 @@ enum {
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
+ REQ_F_FUSED_SECONDARY_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -476,6 +477,8 @@ enum {
REQ_F_CLEAR_POLLIN = BIT_ULL(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT_ULL(REQ_F_HASH_LOCKED_BIT),
+ /* secondary request in a fused cmd, won't be a uring cmd */
+ REQ_F_FUSED_SECONDARY = BIT_ULL(REQ_F_FUSED_SECONDARY_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -558,6 +561,9 @@ struct io_kiocb {
* REQ_F_BUFFER_RING is set.
*/
struct io_buffer_list *buf_list;
+
+ /* store fused command's primary request for the secondary */
+ struct io_kiocb *fused_primary_req;
};
union {
@@ -73,6 +73,8 @@ struct io_uring_sqe {
__u16 buf_index;
/* for grouped buffer selection */
__u16 buf_group;
+ /* how many secondary normal SQEs follow this fused SQE */
+ __u16 nr_secondary;
} __attribute__((packed));
/* personality to use, if used */
__u16 personality;
@@ -173,6 +175,16 @@ enum {
*/
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+/*
+ * Multiple requests are submitted as one whole request logically: the 1st
+ * one is the primary request and the others are secondary requests, whose
+ * count can be retrieved from the primary SQE.
+ *
+ * The primary request is responsible for submitting the secondary requests,
+ * and it won't be completed until all secondary requests are done.
+ */
+#define IORING_SETUP_FUSED_REQ (1U << 14)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -223,6 +235,7 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
+ IORING_OP_FUSED_CMD,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -476,6 +489,7 @@ struct io_uring_params {
#define IORING_FEAT_CQE_SKIP (1U << 11)
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
+#define IORING_FEAT_FUSED_REQ (1U << 14)
/*
* io_uring_register(2) opcodes and arguments
@@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+ cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o fused_cmd.o
obj-$(CONFIG_IO_WQ) += io-wq.o
new file mode 100644
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "opdef.h"
+#include "uring_cmd.h"
+#include "fused_cmd.h"
+
+static bool io_fused_secondary_valid(u8 op)
+{
+ if (op == IORING_OP_FUSED_CMD)
+ return false;
+
+ if (!io_issue_defs[op].fused_secondary)
+ return false;
+
+ return true;
+}
+
+int io_fused_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ const struct io_uring_sqe *secondary_sqe = NULL;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *secondary;
+ u8 secondary_op;
+ int ret;
+
+ if (!(ctx->flags & IORING_SETUP_FUSED_REQ))
+ return -EINVAL;
+
+ if (unlikely(sqe->__pad1))
+ return -EINVAL;
+
+ /*
+ * Only a single secondary request is supported; in the future we may
+ * extend this to multiple secondary requests, which for now can be
+ * covered by issuing multiple fused commands.
+ */
+ if (unlikely(sqe->nr_secondary != 1))
+ return -EINVAL;
+
+ ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
+ if (unlikely(ioucmd->flags))
+ return -EINVAL;
+
+ if (unlikely(!io_get_secondary_sqe(ctx, &secondary_sqe)))
+ return -EINVAL;
+
+ if (unlikely(!secondary_sqe))
+ return -EINVAL;
+
+ secondary_op = READ_ONCE(secondary_sqe->opcode);
+ if (unlikely(!io_fused_secondary_valid(secondary_op)))
+ return -EINVAL;
+
+ ioucmd->cmd = sqe->cmd;
+ ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
+
+ ret = -ENOMEM;
+ if (unlikely(!io_alloc_req(ctx, &secondary)))
+ goto fail;
+
+ ret = io_init_secondary_req(ctx, secondary, secondary_sqe,
+ REQ_F_FUSED_SECONDARY);
+ if (unlikely(ret))
+ goto fail_free_req;
+
+ ioucmd->fused.data.__secondary = secondary;
+
+ return 0;
+
+fail_free_req:
+ io_free_req(secondary);
+fail:
+ return ret;
+}
+
+int io_fused_cmd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ int ret = -EINVAL;
+
+ ret = io_uring_cmd(req, issue_flags);
+ if (ret != IOU_ISSUE_SKIP_COMPLETE)
+ io_free_req(ioucmd->fused.data.__secondary);
+
+ return ret;
+}
+
+/*
+ * Called after the secondary request is completed.
+ *
+ * Notifies the primary request, via the saved callback, that we are done.
+ */
+void io_fused_cmd_complete_secondary(struct io_kiocb *secondary)
+{
+ struct io_kiocb *req = secondary->fused_primary_req;
+ struct io_uring_cmd *ioucmd;
+
+ if (unlikely(!req || !(secondary->flags & REQ_F_FUSED_SECONDARY)))
+ return;
+
+ /* notify primary command that we are done */
+ secondary->fused_primary_req = NULL;
+ ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ ioucmd->fused.data.__secondary = NULL;
+
+ io_uring_cmd_complete_in_task(ioucmd, ioucmd->task_work_cb);
+}
new file mode 100644
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_FUSED_CMD_H
+#define IOU_FUSED_CMD_H
+
+int io_fused_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_fused_cmd(struct io_kiocb *req, unsigned int issue_flags);
+void io_fused_cmd_complete_secondary(struct io_kiocb *secondary);
+
+#endif
@@ -92,6 +92,7 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "fused_cmd.h"
#include "timeout.h"
#include "poll.h"
@@ -111,7 +112,7 @@
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
+ REQ_F_ASYNC_DATA | REQ_F_FUSED_SECONDARY)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
@@ -971,6 +972,9 @@ static void __io_req_complete_post(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_FUSED_SECONDARY)
+ io_fused_cmd_complete_secondary(req);
+
io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP))
io_fill_cqe_req(ctx, req);
@@ -1855,6 +1859,8 @@ static void io_clean_op(struct io_kiocb *req)
spin_lock(&req->ctx->completion_lock);
io_put_kbuf_comp(req);
spin_unlock(&req->ctx->completion_lock);
+ } else if (req->flags & REQ_F_FUSED_SECONDARY) {
+ io_fused_cmd_complete_secondary(req);
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -2163,8 +2169,8 @@ static void io_init_req_drain(struct io_kiocb *req)
}
}
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static inline int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe, u64 secondary_flags)
__must_hold(&ctx->uring_lock)
{
const struct io_issue_def *def;
@@ -2217,6 +2223,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
}
+ if (secondary_flags) {
+ if (!def->fused_secondary)
+ return -EINVAL;
+ req->flags |= secondary_flags;
+ }
+
if (!def->ioprio && sqe->ioprio)
return -EINVAL;
if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
@@ -2257,6 +2269,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
return def->prep(req, sqe);
}
+int io_init_secondary_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe, u64 flags)
+{
+ return io_init_req(ctx, req, sqe, flags);
+}
+
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
struct io_kiocb *req, int ret)
{
@@ -2301,7 +2319,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
struct io_submit_link *link = &ctx->submit_state.link;
int ret;
- ret = io_init_req(ctx, req, sqe);
+ ret = io_init_req(ctx, req, sqe, 0);
if (unlikely(ret))
return io_submit_fail_init(sqe, req, ret);
@@ -2396,7 +2414,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
+static inline bool io_get_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe **sqe)
{
unsigned head, mask = ctx->sq_entries - 1;
unsigned sq_idx = ctx->cached_sq_head++ & mask;
@@ -2425,6 +2444,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
return false;
}
+bool io_get_secondary_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe **sqe)
+{
+ return io_get_sqe(ctx, sqe);
+}
+
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
@@ -3855,7 +3880,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
- IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
+ IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
+ IORING_FEAT_FUSED_REQ;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3913,7 +3939,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
+ IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
+ IORING_SETUP_FUSED_REQ))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -78,6 +78,11 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
+bool io_get_secondary_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe **sqe);
+int io_init_secondary_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe, u64 flags);
+
#define io_lockdep_assert_cq_locked(ctx) \
do { \
if (ctx->flags & IORING_SETUP_IOPOLL) { \
@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "fused_cmd.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -428,6 +429,12 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_FUSED_CMD] = {
+ .needs_file = 1,
+ .plug = 1,
+ .prep = io_fused_cmd_prep,
+ .issue = io_fused_cmd,
+ },
};
@@ -648,6 +655,11 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
+ [IORING_OP_FUSED_CMD] = {
+ .name = "FUSED_CMD",
+ .async_size = uring_cmd_pdu_size(1),
+ .prep_async = io_uring_cmd_prep_async,
+ },
};
const char *io_uring_get_opcode(u8 opcode)
@@ -29,6 +29,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
+ /* can be a secondary op of a fused command */
+ unsigned fused_secondary : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
Multiple requests are submitted as one whole request logically: the 1st one
is the primary command (IORING_OP_FUSED_CMD) and the others are secondary
requests, whose count can be retrieved from the primary SQE.

The primary command is responsible for providing resources and for
submitting the secondary requests, which depend on the primary command's
resources, and the primary command won't be completed until all secondary
requests are done. The provided resource shares its lifetime with the
primary command, and that lifetime won't cross multiple OPs; this provides
a safe way for secondary OPs to use the resource.

Add a generic IORING_OP_FUSED_CMD for modeling this primary/secondary
relationship among requests. So far the motivation is supporting ublk zero
copy, but it is a generic framework which could be used for other use
cases.

This actually provides a generic interface for doing things efficiently:

1) offload complicated logic to userspace, such as a generic LVM

- one read IO request arrives at this LVM block device

- the kernel driver gets notified and starts to handle the read request

- the driver finds that the mapping for this IO isn't cached, so it wakes
  up the userspace daemon, which calls a KV store or a generic btree
  mapping API to get the mapped offset/len, then calls IORING_OP_FUSED_CMD
  to handle the read request in zero-copy style

- when IORING_OP_FUSED_CMD is completed, the read IO request is completed

2) decouple kernel subsystems

- subsystem A wants to use subsystem B's service

- subsystem B doesn't provide in-kernel APIs, since they may be hard to
  support, but generic syscalls are always provided for userspace

- subsystem A can notify a userspace proxy to call IORING_OP_FUSED_CMD for
  using subsystem B's service in a resource-sharing way, such as zero copy

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/io_uring.h       |  15 ++++-
 include/linux/io_uring_types.h |   6 ++
 include/uapi/linux/io_uring.h  |  14 ++++
 io_uring/Makefile              |   2 +-
 io_uring/fused_cmd.c           | 119 +++++++++++++++++++++++++++++++++
 io_uring/fused_cmd.h           |   9 +++
 io_uring/io_uring.c            |  41 ++++++++++--
 io_uring/io_uring.h            |   5 ++
 io_uring/opdef.c               |  12 ++++
 io_uring/opdef.h               |   2 +
 10 files changed, 216 insertions(+), 9 deletions(-)
 create mode 100644 io_uring/fused_cmd.c
 create mode 100644 io_uring/fused_cmd.h