[for-next,v3,4/9] io_uring: allow defer completion for aux posted cqes

Message ID 20221124093559.3780686-5-dylany@meta.com
State New
Series io_uring: batch multishot completions

Commit Message

Dylan Yudaken Nov. 24, 2022, 9:35 a.m. UTC
Multishot ops cannot use the compl_reqs list as the request must stay in
the poll list, which means each completion has to be run on its own
without the benefit of batching.

Introduce batching infrastructure for small (i.e. 16 byte) CQEs only.
This restriction is fine, as there are currently no use cases posting
32 byte CQEs.

Keep a batch of up to 16 posted results in the ring context, and flush
it in the same way as compl_reqs.
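
To make the mechanism concrete, here is a rough caller-side sketch. It is
not part of this patch (the code that actually fills the cache arrives
later in the series) and the helper name is invented; it only illustrates
how a posted result could land in the new 16-entry cache and trigger a
flush once the cache is full:

static void io_defer_aux_cqe(struct io_ring_ctx *ctx, u64 user_data,
			     s32 res, u32 cflags)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	struct io_uring_cqe *cqe;

	/* no room left: write the cached CQEs out under the CQ lock */
	if (state->cqes_count == ARRAY_SIZE(state->cqes)) {
		io_cq_lock(ctx);
		__io_flush_post_cqes(ctx);
		io_cq_unlock_post(ctx);
	}

	/* stash a small (16 byte) CQE; it reaches the CQ ring at flush time */
	cqe = &state->cqes[state->cqes_count++];
	cqe->user_data = user_data;
	cqe->res = res;
	cqe->flags = cflags;
}

The cache lives in io_submit_state, so it is protected by ctx->uring_lock
and needs no extra locking until the flush.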

A batch size of 16 was chosen through experimentation on a microbenchmark
([1]), balanced against not growing the ring context too much. This
increases its size from 1216 to 1472 bytes.
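
(Sanity check on those numbers, assuming the new cqes_count field is
absorbed by existing padding: 16 cached CQEs * 16 bytes each = 256 bytes,
and 1216 + 256 = 1472.)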

[1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf
Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10
Gives results:
baseline	8309 k/s
8		18807 k/s
16		19338 k/s
32		20134 k/s

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Dylan Yudaken <dylany@meta.com>
---
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 27 ++++++++++++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

Comments

Pavel Begunkov Nov. 24, 2022, 4:04 p.m. UTC | #1
On 11/24/22 09:35, Dylan Yudaken wrote:
> Multishot ops cannot use the compl_reqs list as the request must stay in
> the poll list, which means each completion has to be run on its own
> without the benefit of batching.
> 
> Introduce batching infrastructure for small (i.e. 16 byte) CQEs only.
> This restriction is fine, as there are currently no use cases posting
> 32 byte CQEs.
> 
> Keep a batch of up to 16 posted results in the ring context, and flush
> it in the same way as compl_reqs.
> 
> A batch size of 16 was chosen through experimentation on a microbenchmark
> ([1]), balanced against not growing the ring context too much. This
> increases its size from 1216 to 1472 bytes.

It might be cleaner to defer io_cqring_ev_posted() instead of caching
cqes; the spinlocking is not a big problem, because we can globally get
rid of them.
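
One possible reading of that suggestion, as a purely hypothetical sketch
(the helper name is invented and this is not code from the tree): post
each aux CQE directly under completion_lock, and defer only the
wakeup/eventfd side to a single io_cqring_ev_posted() at the end of the
multishot batch.

static bool io_post_aux_cqe_lazy_wake(struct io_ring_ctx *ctx, u64 user_data,
				      s32 res, u32 cflags)
{
	bool filled;

	spin_lock(&ctx->completion_lock);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags, true);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	/* deliberately no io_cqring_ev_posted() here; the caller issues one
	 * notification after the whole batch */
	return filled;
}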


Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..accdfecee953 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -174,7 +174,9 @@  struct io_submit_state {
 	bool			plug_started;
 	bool			need_plug;
 	unsigned short		submit_nr;
+	unsigned int		cqes_count;
 	struct blk_plug		plug;
+	struct io_uring_cqe	cqes[16];
 };
 
 struct io_ev_fd {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cbd271b6188a..53b61b5cde80 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -167,7 +167,8 @@  EXPORT_SYMBOL(io_uring_get_socket);
 
 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 {
-	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
+	    ctx->submit_state.cqes_count)
 		__io_submit_flush_completions(ctx);
 }
 
@@ -802,6 +803,21 @@  bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 	return false;
 }
 
+static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_submit_state *state = &ctx->submit_state;
+	unsigned int i;
+
+	lockdep_assert_held(&ctx->uring_lock);
+	for (i = 0; i < state->cqes_count; i++) {
+		struct io_uring_cqe *cqe = &state->cqes[i];
+
+		io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true);
+	}
+	state->cqes_count = 0;
+}
+
 bool io_post_aux_cqe(struct io_ring_ctx *ctx,
 		     u64 user_data, s32 res, u32 cflags,
 		     bool allow_overflow)
@@ -1325,6 +1341,9 @@  static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_submit_state *state = &ctx->submit_state;
 
 	io_cq_lock(ctx);
+	/* must come first to preserve CQE ordering in failure cases */
+	if (state->cqes_count)
+		__io_flush_post_cqes(ctx);
 	wq_list_for_each(node, prev, &state->compl_reqs) {
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 					    comp_list);
@@ -1334,8 +1353,10 @@  static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	}
 	io_cq_unlock_post(ctx);
 
-	io_free_batch_list(ctx, state->compl_reqs.first);
-	INIT_WQ_LIST(&state->compl_reqs);
+	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
+		io_free_batch_list(ctx, state->compl_reqs.first);
+		INIT_WQ_LIST(&state->compl_reqs);
+	}
 }
 
 /*