@@ -285,6 +285,14 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+ if (rw->kiocb.dio_complete) {
+ long res = rw->kiocb.dio_complete(rw->kiocb.private);
+
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
+
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
@@ -300,9 +308,11 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
- if (__io_complete_rw_common(req, res))
- return;
- io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ if (!rw->kiocb.dio_complete) {
+ if (__io_complete_rw_common(req, res))
+ return;
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
req->io_task_work.func = io_req_rw_complete;
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
@@ -312,6 +322,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
+ if (rw->kiocb.dio_complete)
+ res = rw->kiocb.dio_complete(rw->kiocb.private);
+
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (unlikely(res != req->cqe.res)) {
@@ -914,7 +927,13 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
__sb_writers_release(file_inode(req->file)->i_sb,
SB_FREEZE_WRITE);
}
- kiocb->ki_flags |= IOCB_WRITE;
+
+ /*
+ * Set IOCB_DIO_DEFER, stating that our handler groks deferring the
+ * completion to task context.
+ */
+ kiocb->ki_flags |= IOCB_WRITE | IOCB_DIO_DEFER;
+ kiocb->dio_complete = NULL;
if (likely(req->file->f_op->write_iter))
ret2 = call_write_iter(req->file, kiocb, &s->iter);
If the filesystem dio handler understands IOCB_DIO_DEFER, we'll get a
kiocb->ki_complete() callback with kiocb->dio_complete set. In that
case, rather than complete the IO directly through task_work, queue up
an intermediate task_work handler that first processes this callback
and then immediately completes the request.

For XFS, this avoids a punt through a workqueue, which is a lot less
efficient and adds latency to lower queue depth (or sync) O_DIRECT
writes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rw.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)
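For reference, a rough sketch of what the filesystem side of this
handshake could look like. It assumes the companion fs patch that adds
IOCB_DIO_DEFER and kiocb->dio_complete; struct fs_dio, fs_dio_finish(),
and fs_dio_wq are made-up stand-ins for illustration (in practice iomap
fills this role for XFS), not real kernel API:

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/workqueue.h>

/*
 * Illustrative per-I/O state, standing in for something like iomap's
 * struct iomap_dio.
 */
struct fs_dio {
	struct kiocb		*iocb;
	struct work_struct	work;
	long			result;
};

static struct workqueue_struct *fs_dio_wq;

static long fs_dio_finish(struct fs_dio *dio)
{
	/* Final accounting (pagecache invalidation, etc) elided. */
	return dio->result;
}

/*
 * Invoked via kiocb->dio_complete from io_req_rw_complete() above,
 * i.e. in the issuer's task context rather than from IRQ context.
 */
static long fs_dio_deferred_complete(void *data)
{
	return fs_dio_finish(data);
}

static void fs_dio_bio_end_io(struct bio *bio)
{
	struct fs_dio *dio = bio->bi_private;
	struct kiocb *iocb = dio->iocb;

	if (iocb->ki_flags & IOCB_DIO_DEFER) {
		/*
		 * The issuer groks deferred completion: stash the state
		 * and callback in the kiocb and complete with a
		 * placeholder result. The real result is returned by
		 * ->dio_complete() later, from task context.
		 */
		iocb->private = dio;
		iocb->dio_complete = fs_dio_deferred_complete;
		iocb->ki_complete(iocb, 0);
	} else {
		/* Legacy path: punt the final completion to a workqueue. */
		queue_work(fs_dio_wq, &dio->work);
	}
	bio_put(bio);
}

Note that the 0 passed to ->ki_complete() is only a placeholder: with
dio_complete set, io_complete_rw() above skips io_req_set_res()
entirely, and the real result is fetched by io_req_rw_complete() when
it runs the callback in task context.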