diff mbox series

[1/4] io_uring: add support for NO_OFFLOAD

Message ID 20230420183135.119618-2-axboe@kernel.dk (mailing list archive)
State New
Headers show
Series Enable NO_OFFLOAD support | expand

Commit Message

Jens Axboe April 20, 2023, 6:31 p.m. UTC
Some applications don't necessarily care about io_uring not blocking for
request issue, they simply want to use io_uring for batched submission
of IO. However, io_uring will always do non-blocking issues, and for
some request types, there's simply no support for doing non-blocking
issue and hence they get punted to io-wq unconditionally. If the
application doesn't care about issue potentially blocking, this causes
a performance slowdown as thread offload is not nearly as efficient as
inline issue.

Add support for configuring the ring with IORING_SETUP_NO_OFFLOAD, and
add an IORING_ENTER_NO_OFFLOAD flag to io_uring_enter(2). If either one
of these is set, then io_uring will ignore the non-block issue attempt
for any file which we cannot poll for readiness. The simplified io_uring
issue model looks as follows:

1) Non-blocking issue is attempted for IO. If successful, we're done for
   now.

2) Case 1 failed. Now we have two options
  	a) We can poll the file. We arm poll, and we're done for now
	   until that triggers.
   	b) File cannot be polled, we punt to io-wq which then does a
	   blocking attempt.

If either of the NO_OFFLOAD flags are set, we should never hit case
2b. Instead, case 1 would issue the IO without the non-blocking flag
being set and perform an inline completion.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h      |  1 +
 include/uapi/linux/io_uring.h |  7 +++++
 io_uring/io_uring.c           | 52 +++++++++++++++++++++++++----------
 io_uring/io_uring.h           |  2 +-
 io_uring/sqpoll.c             |  6 ++--
 5 files changed, 50 insertions(+), 18 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 35b9328ca335..386d6b722481 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -13,6 +13,7 @@  enum io_uring_cmd_flags {
 	IO_URING_F_MULTISHOT		= 4,
 	/* executed by io-wq */
 	IO_URING_F_IOWQ			= 8,
+	IO_URING_F_NO_OFFLOAD		= 16,
 	/* int's last bit, sign checks are usually faster than a bit test */
 	IO_URING_F_NONBLOCK		= INT_MIN,
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0716cb17e436..ea903a677ce9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -173,6 +173,12 @@  enum {
  */
 #define IORING_SETUP_DEFER_TASKRUN	(1U << 13)
 
+/*
+ * Don't attempt non-blocking issue on file types that would otherwise
+ * punt to io-wq if they cannot be completed non-blocking.
+ */
+#define IORING_SETUP_NO_OFFLOAD		(1U << 14)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -443,6 +449,7 @@  struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAIT		(1U << 2)
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
+#define IORING_ENTER_NO_OFFLOAD		(1U << 5)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3d43df8f1e4e..fee3e461e149 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -147,7 +147,7 @@  static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 static void io_dismantle_req(struct io_kiocb *req);
 static void io_clean_op(struct io_kiocb *req);
-static void io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req, unsigned int issue_flags);
 static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 static __cold void io_fallback_tw(struct io_uring_task *tctx);
@@ -1471,7 +1471,7 @@  void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req, ts);
 	else
-		io_queue_sqe(req);
+		io_queue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 }
 
 void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1947,6 +1947,10 @@  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(!io_assign_file(req, def, issue_flags)))
 		return -EBADF;
 
+	if (issue_flags & IO_URING_F_NO_OFFLOAD &&
+	    (!req->file || !file_can_poll(req->file)))
+		issue_flags &= ~IO_URING_F_NONBLOCK;
+
 	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
 		creds = override_creds(req->creds);
 
@@ -2120,12 +2124,12 @@  static void io_queue_async(struct io_kiocb *req, int ret)
 		io_queue_linked_timeout(linked_timeout);
 }
 
-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	__must_hold(&req->ctx->uring_lock)
 {
 	int ret;
 
-	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+	ret = io_issue_sqe(req, issue_flags);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2337,7 +2341,8 @@  static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
 }
 
 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			 const struct io_uring_sqe *sqe)
+			 const struct io_uring_sqe *sqe,
+			 unsigned int aux_issue_flags)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
@@ -2385,7 +2390,8 @@  static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return 0;
 	}
 
-	io_queue_sqe(req);
+	io_queue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER|
+		     aux_issue_flags);
 	return 0;
 }
 
@@ -2466,7 +2472,8 @@  static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 	return false;
 }
 
-int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+		   unsigned int aux_issue_flags)
 	__must_hold(&ctx->uring_lock)
 {
 	unsigned int entries = io_sqring_entries(ctx);
@@ -2495,7 +2502,7 @@  int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		 * Continue submitting even for sqe failure if the
 		 * ring was setup with IORING_SETUP_SUBMIT_ALL
 		 */
-		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+		if (unlikely(io_submit_sqe(ctx, req, sqe, aux_issue_flags)) &&
 		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
 			left--;
 			break;
@@ -3524,7 +3531,8 @@  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
-			       IORING_ENTER_REGISTERED_RING)))
+			       IORING_ENTER_REGISTERED_RING |
+			       IORING_ENTER_NO_OFFLOAD)))
 		return -EINVAL;
 
 	/*
@@ -3575,12 +3583,18 @@  SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 
 		ret = to_submit;
 	} else if (to_submit) {
+		unsigned int aux_issue_flags = 0;
+
 		ret = io_uring_add_tctx_node(ctx);
 		if (unlikely(ret))
 			goto out;
 
+		if (flags & IORING_ENTER_NO_OFFLOAD ||
+		    ctx->flags & IORING_SETUP_NO_OFFLOAD)
+			aux_issue_flags = IO_URING_F_NO_OFFLOAD;
+
 		mutex_lock(&ctx->uring_lock);
-		ret = io_submit_sqes(ctx, to_submit);
+		ret = io_submit_sqes(ctx, to_submit, aux_issue_flags);
 		if (ret != to_submit) {
 			mutex_unlock(&ctx->uring_lock);
 			goto out;
@@ -3827,9 +3841,17 @@  static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	 * polling again, they can rely on io_sq_thread to do polling
 	 * work, which can reduce cpu usage and uring_lock contention.
 	 */
-	if (ctx->flags & IORING_SETUP_IOPOLL &&
-	    !(ctx->flags & IORING_SETUP_SQPOLL))
-		ctx->syscall_iopoll = 1;
+	ret = -EINVAL;
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		/*
+		 * Can't sanely block for issue for IOPOLL, nor does this
+		 * combination make any sense. Disallow it.
+		 */
+		if (ctx->flags & IORING_SETUP_NO_OFFLOAD)
+			goto err;
+		if (!(ctx->flags & IORING_SETUP_SQPOLL))
+			ctx->syscall_iopoll = 1;
+	}
 
 	ctx->compat = in_compat_syscall();
 	if (!capable(CAP_IPC_LOCK))
@@ -3839,7 +3861,6 @@  static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
 	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
 	 */
-	ret = -EINVAL;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
 		/* IPI related flags don't make sense with SQPOLL */
 		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
@@ -3969,7 +3990,8 @@  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
 			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
-			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
+			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
+			IORING_SETUP_NO_OFFLOAD))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 25515d69d205..fb3619ae0fd3 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -76,7 +76,7 @@  int io_uring_alloc_task_context(struct task_struct *task,
 				struct io_ring_ctx *ctx);
 
 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
-int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, unsigned int aux_issue_flags);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
 int io_req_prep_async(struct io_kiocb *req);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 9db4bc1f521a..9f2968a441ce 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -166,7 +166,7 @@  static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
-	unsigned int to_submit;
+	unsigned int to_submit, aux_issue_flags = 0;
 	int ret = 0;
 
 	to_submit = io_sqring_entries(ctx);
@@ -179,6 +179,8 @@  static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 
 		if (ctx->sq_creds != current_cred())
 			creds = override_creds(ctx->sq_creds);
+		if (ctx->flags & IORING_SETUP_NO_OFFLOAD)
+			aux_issue_flags = IO_URING_F_NO_OFFLOAD;
 
 		mutex_lock(&ctx->uring_lock);
 		if (!wq_list_empty(&ctx->iopoll_list))
@@ -190,7 +192,7 @@  static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		 */
 		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
 		    !(ctx->flags & IORING_SETUP_R_DISABLED))
-			ret = io_submit_sqes(ctx, to_submit);
+			ret = io_submit_sqes(ctx, to_submit, aux_issue_flags);
 		mutex_unlock(&ctx->uring_lock);
 
 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))