diff mbox series

[19/22] aio: support kernel side submission for aio with SCQRING

Message ID 20181221192236.12866-20-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show
Series [01/22] fs: add an iopoll method to struct file_operations | expand

Commit Message

Jens Axboe Dec. 21, 2018, 7:22 p.m. UTC
Add support for backing the io_context with either a thread, or a
workqueue and letting those handle the submission for us. This can
be used to reduce overhead for submission, or to always make submission
async. The latter is particularly useful for buffered aio, which is
now fully async with this feature.

For polled IO, we could have the kernel side thread hammer on the SQ
ring and submit when it finds IO. This would mean that an application
would NEVER have to enter the kernel to do IO! Didn't add this yet,
but it would be trivial to add.

If an application sets IOCTX_FLAG_SCQTHREAD, the io_context gets a
single thread backing. If used with buffered IO, this will limit
the device queue depth to 1, but it will be async, IOs will simply
be serialized.

Or an application can set IOCTX_FLAG_SQWQ, in which case the io_context
gets a work queue backing. The concurrency level is the mininum of
twice the available CPUs, or the queue depth specific for the context.
For this mode, we attempt to do buffered reads inline, in case they are
cached. So we should only punt to a workqueue, if we would have to block
to get our data.

Tested with polling, no polling, fixedbufs, no fixedbufs, buffered,
O_DIRECT.

See this sample application for how to use it:

http://git.kernel.dk/cgit/fio/plain/t/aio-ring.c

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c                     | 438 ++++++++++++++++++++++++++++++++---
 include/uapi/linux/aio_abi.h |   3 +
 2 files changed, 414 insertions(+), 27 deletions(-)

Comments

hch@infradead.org Dec. 27, 2018, 1:57 p.m. UTC | #1
On Fri, Dec 21, 2018 at 12:22:33PM -0700, Jens Axboe wrote:
> If an application sets IOCTX_FLAG_SCQTHREAD, the io_context gets a
> single thread backing. If used with buffered IO, this will limit
> the device queue depth to 1, but it will be async, IOs will simply
> be serialized.
> 
> Or an application can set IOCTX_FLAG_SQWQ, in which case the io_context
> gets a work queue backing. The concurrency level is the mininum of
> twice the available CPUs, or the queue depth specific for the context.
> For this mode, we attempt to do buffered reads inline, in case they are
> cached. So we should only punt to a workqueue, if we would have to block
> to get our data.

I'm really worried about having this code present twice.  Do we have
strong use cases for both?  Can't we emulate IOCTX_FLAG_SCQTHREAD good
enough with a workqueue without concurrency management?
Jens Axboe Dec. 27, 2018, 8:34 p.m. UTC | #2
On 12/27/18 6:57 AM, Christoph Hellwig wrote:
> On Fri, Dec 21, 2018 at 12:22:33PM -0700, Jens Axboe wrote:
>> If an application sets IOCTX_FLAG_SCQTHREAD, the io_context gets a
>> single thread backing. If used with buffered IO, this will limit
>> the device queue depth to 1, but it will be async, IOs will simply
>> be serialized.
>>
>> Or an application can set IOCTX_FLAG_SQWQ, in which case the io_context
>> gets a work queue backing. The concurrency level is the mininum of
>> twice the available CPUs, or the queue depth specific for the context.
>> For this mode, we attempt to do buffered reads inline, in case they are
>> cached. So we should only punt to a workqueue, if we would have to block
>> to get our data.
> 
> I'm really worried about having this code present twice.  Do we have
> strong use cases for both?  Can't we emulate IOCTX_FLAG_SCQTHREAD good
> enough with a workqueue without concurrency management?

A work item is not a good fit for this. If we want the polled
submission, which I think is pretty nifty, then you need a higher
context than a wq item that comes and goes.
diff mbox series

Patch

diff --git a/fs/aio.c b/fs/aio.c
index c424aa2ed336..cd4a61642b46 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -25,6 +25,7 @@ 
 #include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmu_context.h>
@@ -44,6 +45,7 @@ 
 #include <linux/mount.h>
 #include <linux/sizes.h>
 #include <linux/nospec.h>
+#include <linux/sched/mm.h>
 
 #include <asm/kmap_types.h>
 #include <linux/uaccess.h>
@@ -116,6 +118,14 @@  struct aio_mapped_ubuf {
 	unsigned int nr_bvecs;
 };
 
+struct aio_sq_offload {
+	struct task_struct *thread;	/* if using a thread */
+	struct workqueue_struct *wq;	/* wq offload */
+	struct mm_struct *mm;
+	struct files_struct *files;
+	wait_queue_head_t wait;
+};
+
 struct kioctx {
 	struct percpu_ref	users;
 	atomic_t		dead;
@@ -158,6 +168,10 @@  struct kioctx {
 	struct aio_iocb_ring	sq_ring;
 	struct aio_mapped_range cq_ring;
 	int			cq_ring_overflow;
+	int			submit_eagain;
+
+	/* sq ring submitter thread, if used */
+	struct aio_sq_offload	sq_offload;
 
 	struct rcu_work		free_rwork;	/* see free_ioctx() */
 
@@ -252,6 +266,7 @@  struct aio_kiocb {
 	unsigned long		ki_flags;
 #define KIOCB_F_POLL_COMPLETED	0	/* polled IO has completed */
 #define KIOCB_F_POLL_EAGAIN	1	/* polled submission got EAGAIN */
+#define KIOCB_F_FORCE_NONBLOCK	2	/* inline submission attempt */
 
 	refcount_t		ki_refcnt;
 
@@ -1349,19 +1364,31 @@  static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 		unsigned int tail;
 
 		/*
-		 * If we can't get a cq entry, userspace overflowed the
-		 * submission (by quite a lot). Flag it as an overflow
-		 * condition, and next io_ring_enter(2) call will return
-		 * -EOVERFLOW.
+		 * Catch EAGAIN early if we've forced a nonblock attempt, as
+		 * we don't want to pass that back down to userspace through
+		 * the CQ ring. Just mark the ctx as such, so the caller will
+		 * see it and punt to workqueue. This is just for buffered
+		 * aio reads.
 		 */
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		ev = aio_peek_cqring(ctx, &tail);
-		if (ev) {
-			aio_fill_event(ev, iocb, res, res2);
-			aio_commit_cqring(ctx, tail);
-		} else
-			ctx->cq_ring_overflow = 1;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		if (res == -EAGAIN &&
+		    test_bit(KIOCB_F_FORCE_NONBLOCK, &iocb->ki_flags)) {
+			ctx->submit_eagain = 1;
+		} else {
+			/*
+			 * If we can't get a cq entry, userspace overflowed the
+			 * submission (by quite a lot). Flag it as an overflow
+			 * condition, and next io_ring_enter(2) call will return
+			 * -EOVERFLOW.
+			 */
+			spin_lock_irqsave(&ctx->completion_lock, flags);
+			ev = aio_peek_cqring(ctx, &tail);
+			if (ev) {
+				aio_fill_event(ev, iocb, res, res2);
+				aio_commit_cqring(ctx, tail);
+			} else
+				ctx->cq_ring_overflow = 1;
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		}
 	} else {
 		aio_ring_complete(ctx, iocb, res, res2);
 
@@ -1727,6 +1754,63 @@  static long read_events(struct kioctx *ctx, long min_nr, long nr,
 	return ret;
 }
 
+static int aio_sq_thread(void *);
+
+static int aio_sq_thread_start(struct kioctx *ctx)
+{
+	struct aio_sq_ring *ring = ctx->sq_ring.ring;
+	struct aio_sq_offload *aso = &ctx->sq_offload;
+	int ret;
+
+	memset(aso, 0, sizeof(*aso));
+	init_waitqueue_head(&aso->wait);
+
+	if (!(ctx->flags & IOCTX_FLAG_FIXEDBUFS))
+		aso->mm = current->mm;
+
+	ret = -EBADF;
+	aso->files = get_files_struct(current);
+	if (!aso->files)
+		goto err;
+
+	if (ctx->flags & IOCTX_FLAG_SQTHREAD) {
+		char name[32];
+
+		snprintf(name, sizeof(name), "aio-sq-%lu/%d", ctx->user_id,
+					ring->sq_thread_cpu);
+		aso->thread = kthread_create_on_cpu(aio_sq_thread, ctx,
+						ring->sq_thread_cpu, name);
+		if (IS_ERR(aso->thread)) {
+			ret = PTR_ERR(aso->thread);
+			aso->thread = NULL;
+			goto err;
+		}
+		wake_up_process(aso->thread);
+	} else if (ctx->flags & IOCTX_FLAG_SQWQ) {
+		int concurrency;
+
+		/* Do QD, or 2 * CPUS, whatever is smallest */
+		concurrency = min(ring->nr_events - 1, 2 * num_online_cpus());
+		aso->wq = alloc_workqueue("aio-sq-%lu",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						concurrency, ctx->user_id);
+		if (!aso->wq) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	if (aso->files) {
+		put_files_struct(aso->files);
+		aso->files = NULL;
+	}
+	if (aso->mm)
+		aso->mm = NULL;
+	return ret;
+}
+
 static void aio_unmap_range(struct aio_mapped_range *range)
 {
 	int i;
@@ -1772,6 +1856,20 @@  static int aio_map_range(struct aio_mapped_range *range, void __user *uaddr,
 
 static void aio_scqring_unmap(struct kioctx *ctx)
 {
+	struct aio_sq_offload *aso = &ctx->sq_offload;
+
+	if (aso->thread) {
+		kthread_park(aso->thread);
+		kthread_stop(aso->thread);
+		aso->thread = NULL;
+	} else if (aso->wq) {
+		destroy_workqueue(aso->wq);
+		aso->wq = NULL;
+	}
+	if (aso->files) {
+		put_files_struct(aso->files);
+		aso->files = NULL;
+	}
 	aio_unmap_range(&ctx->sq_ring.ring_range);
 	aio_unmap_range(&ctx->sq_ring.iocb_range);
 	aio_unmap_range(&ctx->cq_ring);
@@ -1833,6 +1931,9 @@  static int aio_scqring_map(struct kioctx *ctx,
 	kcq_ring->nr_events = cq_ring_size;
 	kcq_ring->head = kcq_ring->tail = 0;
 
+	if (ctx->flags & (IOCTX_FLAG_SQTHREAD | IOCTX_FLAG_SQWQ))
+		ret = aio_sq_thread_start(ctx);
+
 err:
 	if (ret) {
 		aio_unmap_range(&ctx->sq_ring.ring_range);
@@ -1978,7 +2079,8 @@  SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags,
 	long ret;
 
 	if (flags & ~(IOCTX_FLAG_IOPOLL | IOCTX_FLAG_SCQRING |
-		      IOCTX_FLAG_FIXEDBUFS))
+		      IOCTX_FLAG_FIXEDBUFS | IOCTX_FLAG_SQTHREAD |
+		      IOCTX_FLAG_SQWQ))
 		return -EINVAL;
 
 	ret = get_user(ctx, ctxp);
@@ -1999,8 +2101,9 @@  SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags,
 			if (ret)
 				goto err;
 		}
-	} else if (flags & IOCTX_FLAG_FIXEDBUFS) {
-		/* can only support fixed bufs with SQ/CQ ring */
+	} else if (flags & (IOCTX_FLAG_FIXEDBUFS | IOCTX_FLAG_SQTHREAD |
+		            IOCTX_FLAG_SQWQ)) {
+		/* These features only supported with SCQRING */
 		ret = -EINVAL;
 		goto err;
 	}
@@ -2210,7 +2313,7 @@  static struct file *aio_file_get(struct aio_submit_state *state, int fd)
 }
 
 static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb,
-		       struct aio_submit_state *state)
+		       struct aio_submit_state *state, bool force_nonblock)
 {
 	struct kioctx *ctx = kiocb->ki_ctx;
 	struct kiocb *req = &kiocb->rw;
@@ -2243,6 +2346,10 @@  static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb,
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
 		goto out_fput;
+	if (force_nonblock) {
+		req->ki_flags |= IOCB_NOWAIT;
+		set_bit(KIOCB_F_FORCE_NONBLOCK, &kiocb->ki_flags);
+	}
 
 	if (iocb->aio_flags & IOCB_FLAG_HIPRI) {
 		/* shares space in the union, and is rather pointless.. */
@@ -2411,7 +2518,7 @@  static void aio_iopoll_iocb_issued(struct aio_submit_state *state,
 
 static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb,
 			struct aio_submit_state *state, bool vectored,
-			bool compat, bool kaddr)
+			bool compat, bool kaddr, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *req = &kiocb->rw;
@@ -2419,7 +2526,7 @@  static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb,
 	struct file *file;
 	ssize_t ret;
 
-	ret = aio_prep_rw(kiocb, iocb, state);
+	ret = aio_prep_rw(kiocb, iocb, state, force_nonblock);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -2456,7 +2563,7 @@  static ssize_t aio_write(struct aio_kiocb *kiocb, const struct iocb *iocb,
 	struct file *file;
 	ssize_t ret;
 
-	ret = aio_prep_rw(kiocb, iocb, state);
+	ret = aio_prep_rw(kiocb, iocb, state, false);
 	if (ret)
 		return ret;
 	file = req->ki_filp;
@@ -2709,7 +2816,7 @@  static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 			   unsigned long ki_index,
 			   struct aio_submit_state *state, bool compat,
-			   bool kaddr)
+			   bool kaddr, bool force_nonblock)
 {
 	struct aio_kiocb *req;
 	ssize_t ret;
@@ -2770,13 +2877,15 @@  static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	ret = -EINVAL;
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
-		ret = aio_read(req, iocb, state, false, compat, kaddr);
+		ret = aio_read(req, iocb, state, false, compat, kaddr,
+				force_nonblock);
 		break;
 	case IOCB_CMD_PWRITE:
 		ret = aio_write(req, iocb, state, false, compat, kaddr);
 		break;
 	case IOCB_CMD_PREADV:
-		ret = aio_read(req, iocb, state, true, compat, kaddr);
+		ret = aio_read(req, iocb, state, true, compat, kaddr,
+				force_nonblock);
 		break;
 	case IOCB_CMD_PWRITEV:
 		ret = aio_write(req, iocb, state, true, compat, kaddr);
@@ -2836,7 +2945,8 @@  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
 		return -EFAULT;
 
-	return __io_submit_one(ctx, &iocb, ki_index, state, compat, false);
+	return __io_submit_one(ctx, &iocb, ki_index, state, compat, false,
+				false);
 }
 
 #ifdef CONFIG_BLOCK
@@ -2956,7 +3066,8 @@  static int aio_ring_submit(struct kioctx *ctx, unsigned int to_submit)
 		if (!iocb)
 			break;
 
-		ret = __io_submit_one(ctx, iocb, iocb_index, statep, false, kaddr);
+		ret = __io_submit_one(ctx, iocb, iocb_index, statep, false, kaddr,
+					false);
 		if (ret)
 			break;
 
@@ -3008,15 +3119,288 @@  static int aio_cqring_wait(struct kioctx *ctx, int min_events)
 	return ret;
 }
 
+static void aio_fill_cq_error(struct kioctx *ctx, const struct iocb *iocb,
+			      long ret)
+{
+	struct io_event *ev;
+	unsigned tail;
+
+	/*
+	 * Only really need the lock for non-polled IO, but this is an error
+	 * so not worth checking. Just lock it so we know kernel access to
+	 * the CQ ring is serialized.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	ev = aio_peek_cqring(ctx, &tail);
+	ev->obj = iocb->aio_data;
+	ev->data = 0;
+	ev->res = ret;
+	ev->res2 = 0;
+	aio_commit_cqring(ctx, tail);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	/*
+	 * for thread offload, app could already be sleeping in io_ring_enter()
+	 * before we get to flag the error. wake them up, if needed.
+	 */
+	if (ctx->flags & (IOCTX_FLAG_SQTHREAD | IOCTX_FLAG_SQWQ))
+		if (waitqueue_active(&ctx->wait))
+			wake_up(&ctx->wait);
+}
+
+struct iocb_submit {
+	const struct iocb *iocb;
+	unsigned int index;
+};
+
+static int aio_submit_iocbs(struct kioctx *ctx, struct iocb_submit *iocbs,
+			    unsigned int nr, struct mm_struct *cur_mm,
+			    bool mm_fault)
+{
+	struct aio_submit_state state, *statep = NULL;
+	int ret, i, submitted = 0;
+
+	if (nr > AIO_PLUG_THRESHOLD) {
+		aio_submit_state_start(&state, ctx, nr);
+		statep = &state;
+	}
+
+	for (i = 0; i < nr; i++) {
+		if (unlikely(mm_fault))
+			ret = -EFAULT;
+		else
+			ret = __io_submit_one(ctx, iocbs[i].iocb,
+						iocbs[i].index, statep, false,
+						!cur_mm, false);
+		if (!ret) {
+			submitted++;
+			continue;
+		}
+
+		aio_fill_cq_error(ctx, iocbs[i].iocb, ret);
+	}
+
+	if (statep)
+		aio_submit_state_end(&state);
+
+	return submitted;
+}
+
+/*
+ * sq thread only supports O_DIRECT or FIXEDBUFS IO
+ */
+static int aio_sq_thread(void *data)
+{
+	struct iocb_submit iocbs[AIO_IOPOLL_BATCH];
+	struct kioctx *ctx = data;
+	struct aio_sq_offload *aso = &ctx->sq_offload;
+	struct mm_struct *cur_mm = NULL;
+	struct files_struct *old_files;
+	mm_segment_t old_fs;
+	DEFINE_WAIT(wait);
+
+	old_files = current->files;
+	current->files = aso->files;
+
+	old_fs = get_fs();
+	set_fs(USER_DS);
+
+	while (!kthread_should_stop()) {
+		const struct iocb *iocb;
+		bool mm_fault = false;
+		unsigned nhead, index;
+		int i;
+
+		iocb = aio_peek_sqring(ctx, &index, &nhead);
+		if (!iocb) {
+			prepare_to_wait(&aso->wait, &wait, TASK_INTERRUPTIBLE);
+			iocb = aio_peek_sqring(ctx, &index, &nhead);
+			if (!iocb) {
+				/*
+				 * Drop cur_mm before scheduler. We can't hold
+				 * it for long periods, and it would also
+				 * introduce a deadlock with kill_ioctx().
+				 */
+				if (cur_mm) {
+					unuse_mm(cur_mm);
+					mmput(cur_mm);
+					cur_mm = NULL;
+				}
+				if (kthread_should_park())
+					kthread_parkme();
+				if (kthread_should_stop()) {
+					finish_wait(&aso->wait, &wait);
+					break;
+				}
+				if (signal_pending(current))
+					flush_signals(current);
+				schedule();
+			}
+			finish_wait(&aso->wait, &wait);
+			if (!iocb)
+				continue;
+		}
+
+		/* If ->mm is set, we're not doing FIXEDBUFS */
+		if (aso->mm && !cur_mm) {
+			mm_fault = !mmget_not_zero(aso->mm);
+			if (!mm_fault) {
+				use_mm(aso->mm);
+				cur_mm = aso->mm;
+			}
+		}
+
+		i = 0;
+		do {
+			if (i == ARRAY_SIZE(iocbs))
+				break;
+			iocbs[i].iocb = iocb;
+			iocbs[i].index = index;
+			++i;
+			aio_commit_sqring(ctx, nhead);
+		} while ((iocb = aio_peek_sqring(ctx, &index, &nhead)) != NULL);
+
+		aio_submit_iocbs(ctx, iocbs, i, cur_mm, mm_fault);
+	}
+	current->files = old_files;
+	set_fs(old_fs);
+	if (cur_mm) {
+		unuse_mm(cur_mm);
+		mmput(cur_mm);
+	}
+	return 0;
+}
+
+struct aio_io_work {
+	struct work_struct work;
+	struct kioctx *ctx;
+	struct iocb iocb;
+	unsigned iocb_index;
+};
+
+static void aio_sq_wq_submit_work(struct work_struct *work)
+{
+	struct aio_io_work *aiw = container_of(work, struct aio_io_work, work);
+	struct kioctx *ctx = aiw->ctx;
+	struct aio_sq_offload *aso = &ctx->sq_offload;
+	mm_segment_t old_fs = get_fs();
+	struct files_struct *old_files;
+	int ret;
+
+	old_files = current->files;
+	current->files = aso->files;
+
+	if (aso->mm) {
+		if (!mmget_not_zero(aso->mm)) {
+			ret = -EFAULT;
+			goto err;
+		}
+		use_mm(aso->mm);
+	}
+
+	set_fs(USER_DS);
+
+	ret = __io_submit_one(ctx, &aiw->iocb, aiw->iocb_index, NULL, false,
+				!aso->mm, false);
+
+	set_fs(old_fs);
+	if (aso->mm) {
+		unuse_mm(aso->mm);
+		mmput(aso->mm);
+	}
+
+err:
+	if (ret)
+		aio_fill_cq_error(ctx, &aiw->iocb, ret);
+	current->files = old_files;
+	kfree(aiw);
+}
+
+/*
+ * If this is a read, try a cached inline read first. If the IO is in the
+ * page cache, we can satisfy it without blocking and without having to
+ * punt to a threaded execution. This is much faster, particularly for
+ * lower queue depth IO, and it's always a lot more efficient.
+ */
+static bool aio_sq_try_inline(struct kioctx *ctx, const struct iocb *iocb,
+			      unsigned index)
+{
+	struct aio_sq_offload *aso = &ctx->sq_offload;
+	int ret;
+
+	if (iocb->aio_lio_opcode != IOCB_CMD_PREAD &&
+	    iocb->aio_lio_opcode != IOCB_CMD_PREADV)
+		return false;
+
+	ret = __io_submit_one(ctx, iocb, index, NULL, false, !aso->mm, true);
+	if (ret == -EAGAIN || ctx->submit_eagain) {
+		ctx->submit_eagain = 0;
+		return false;
+	}
+
+	/*
+	 * We're done - even if this was an error, return 0. The error will
+	 * be in the CQ ring for the application.
+	 */
+	return true;
+}
+
+static int aio_sq_wq_submit(struct kioctx *ctx, unsigned int to_submit)
+{
+	struct aio_io_work *work;
+	const struct iocb *iocb;
+	unsigned nhead, index;
+	int ret, queued;
+
+	ret = queued = 0;
+	while ((iocb = aio_peek_sqring(ctx, &index, &nhead)) != NULL) {
+		ret = aio_sq_try_inline(ctx, iocb, index);
+		if (!ret) {
+			work = kmalloc(sizeof(*work), GFP_KERNEL);
+			if (!work) {
+				ret = -ENOMEM;
+				break;
+			}
+			memcpy(&work->iocb, iocb, sizeof(*iocb));
+			aio_commit_sqring(ctx, nhead);
+			work->iocb_index = index;
+			INIT_WORK(&work->work, aio_sq_wq_submit_work);
+			work->ctx = ctx;
+			queue_work(ctx->sq_offload.wq, &work->work);
+		}
+		queued++;
+		if (queued == to_submit)
+			break;
+	}
+
+	return queued ? queued : ret;
+}
+
 static int __io_ring_enter(struct kioctx *ctx, unsigned int to_submit,
 			   unsigned int min_complete, unsigned int flags)
 {
 	int ret = 0;
 
 	if (flags & IORING_FLAG_SUBMIT) {
-		ret = aio_ring_submit(ctx, to_submit);
-		if (ret < 0)
-			return ret;
+		if (!to_submit)
+			return 0;
+
+		/*
+		 * Three options here:
+		 * 1) We have an sq thread, just wake it up to do submissions
+		 * 2) We have an sq wq, queue a work item for each iocb
+		 * 3) Submit directly
+		 */
+		if (ctx->flags & IOCTX_FLAG_SQTHREAD) {
+			wake_up(&ctx->sq_offload.wait);
+			ret = to_submit;
+		} else if (ctx->flags & IOCTX_FLAG_SQWQ) {
+			ret = aio_sq_wq_submit(ctx, to_submit);
+		} else {
+			ret = aio_ring_submit(ctx, to_submit);
+			if (ret < 0)
+				return ret;
+		}
 	}
 	if (flags & IORING_FLAG_GETEVENTS) {
 		unsigned int nr_events = 0;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 39d783175872..b09b1976e038 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -111,6 +111,8 @@  struct iocb {
 #define IOCTX_FLAG_IOPOLL	(1 << 0)	/* io_context is polled */
 #define IOCTX_FLAG_SCQRING	(1 << 1)	/* Use SQ/CQ rings */
 #define IOCTX_FLAG_FIXEDBUFS	(1 << 2)	/* IO buffers are fixed */
+#define IOCTX_FLAG_SQTHREAD	(1 << 3)	/* Use SQ thread */
+#define IOCTX_FLAG_SQWQ		(1 << 4)	/* Use SQ workqueue */
 
 struct aio_sq_ring {
 	union {
@@ -118,6 +120,7 @@  struct aio_sq_ring {
 			u32 head;	/* kernel consumer head */
 			u32 tail;	/* app producer tail */
 			u32 nr_events;	/* max events in ring */
+			u16 sq_thread_cpu;
 			u64 iocbs;	/* setup pointer to app iocbs */
 		};
 		u32 pad[16];