diff mbox series

[1/3] io_uring: add support for marking commands as draining

Message ID 20190411150657.18480-2-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show
Series io_uring: add sync_file_range and drains | expand

Commit Message

Jens Axboe April 11, 2019, 3:06 p.m. UTC
There are no ordering constraints between the submission and completion
side of io_uring. But sometimes that would be useful to have. One common
example is doing an fsync, for instance, and have it ordered with
previous writes. Without support for that, the application must do this
tracking itself.

This adds a general SQE flag, IOSQE_IO_DRAIN. If a command is marked
with this flag, then it will not be issued before previous commands have
completed, and subsequent commands submitted after the drain will not be
issued before the drain is started.. If there are no pending commands,
setting this flag will not change the behavior of the issue of the
command.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 91 +++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 89 insertions(+), 3 deletions(-)

Comments

Andres Freund May 6, 2019, 6:03 p.m. UTC | #1
Hi,

On 2019-04-11 09:06:55 -0600, Jens Axboe wrote:
> There are no ordering constraints between the submission and completion
> side of io_uring. But sometimes that would be useful to have. One common
> example is doing an fsync, for instance, and have it ordered with
> previous writes. Without support for that, the application must do this
> tracking itself.

The facility seems useful for at least this postgres developer playing
with optionally using io_uring in parts of postgres. As you say, I'd
otherwise need to manually implement drains in userland.


> This adds a general SQE flag, IOSQE_IO_DRAIN. If a command is marked
> with this flag, then it will not be issued before previous commands have
> completed, and subsequent commands submitted after the drain will not be
> issued before the drain is started.. If there are no pending commands,
> setting this flag will not change the behavior of the issue of the
> command.

I think it'd be good if there were some documentation about how io_uring
interacts with writes done via a different io_uring queue, or
traditional write(2) et al.  And whether IOSQE_IO_DRAIN drain influences
that.

In none of the docs I read it's documented if an io_uring fsync
guarantees that a write(2) that finished before an IORING_OP_FSYNC op is
submitted is durable? Given the current implementation that clearly
seems to be the case, but it's not great to rely on the current
implementation as a user of data integrity operations.

Similarly, it'd be good if there were docs about how traditional
read/write/fsync and multiple io_uring queues interact in the face of
concurrent operations. For plain read/write we have posix providing some
baseline guarantees, but obviously doesn't mean anything for io_uring.

I suspect that most people's intuition will be "it's obvious", but also
that such intuitions are likely to differ between people.

Greetings,

Andres Freund
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 07d6ef195d05..a10fd5900a17 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -121,6 +121,8 @@  struct io_ring_ctx {
 		unsigned		sq_mask;
 		unsigned		sq_thread_idle;
 		struct io_uring_sqe	*sq_sqes;
+
+		struct list_head	defer_list;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
@@ -226,8 +228,11 @@  struct io_kiocb {
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_PREPPED		16	/* prep already done */
+#define REQ_F_IO_DRAIN		32	/* drain existing IO first */
+#define REQ_F_IO_DRAINED	64	/* drain done */
 	u64			user_data;
-	u64			error;
+	u32			error;
+	u32			sequence;
 
 	struct work_struct	work;
 };
@@ -255,6 +260,8 @@  struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+static void io_sq_wq_submit_work(struct work_struct *work);
+
 static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
@@ -306,10 +313,36 @@  static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->cancel_list);
+	INIT_LIST_HEAD(&ctx->defer_list);
 	return ctx;
 }
 
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
+				     struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+		return false;
+
+	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+}
+
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	if (list_empty(&ctx->defer_list))
+		return NULL;
+
+	req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
+	if (!io_sequence_defer(ctx, req)) {
+		list_del_init(&req->list);
+		return req;
+	}
+
+	return NULL;
+}
+
+static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
 
@@ -330,6 +363,18 @@  static void io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	__io_commit_cqring(ctx);
+
+	while ((req = io_get_deferred_req(ctx)) != NULL) {
+		req->flags |= REQ_F_IO_DRAINED;
+		queue_work(ctx->sqo_wq, &req->work);
+	}
+}
+
 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
@@ -1337,6 +1382,34 @@  static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ipt.error;
 }
 
+static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	struct io_uring_sqe *sqe_copy;
+
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+		return 0;
+
+	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+	if (!sqe_copy)
+		return -EAGAIN;
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		kfree(sqe_copy);
+		return 0;
+	}
+
+	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	req->submit.sqe = sqe_copy;
+
+	INIT_WORK(&req->work, io_sq_wq_submit_work);
+	list_add_tail(&req->list, &ctx->defer_list);
+	spin_unlock_irq(&ctx->completion_lock);
+	return -EIOCBQUEUED;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
@@ -1585,6 +1658,11 @@  static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	flags = READ_ONCE(s->sqe->flags);
 	fd = READ_ONCE(s->sqe->fd);
 
+	if (flags & IOSQE_IO_DRAIN) {
+		req->flags |= REQ_F_IO_DRAIN;
+		req->sequence = ctx->cached_sq_head - 1;
+	}
+
 	if (!io_op_needs_file(s->sqe)) {
 		req->file = NULL;
 		return 0;
@@ -1614,7 +1692,7 @@  static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	int ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
 		return -EINVAL;
 
 	req = io_get_req(ctx, state);
@@ -1625,6 +1703,13 @@  static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(ret))
 		goto out;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret == -EIOCBQUEUED)
+			ret = 0;
+		return ret;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
 		struct io_uring_sqe *sqe_copy;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e23408692118..a7a6384d0c70 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -38,6 +38,7 @@  struct io_uring_sqe {
  * sqe->flags
  */
 #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
+#define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
 
 /*
  * io_uring_setup() flags