[19/22] aio: support kernel side submission for aio with SCQRING

Message ID	20181221192236.12866-20-axboe@kernel.dk (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-block-owner@kernel.org> From: Jens Axboe <axboe@kernel.dk> To: linux-fsdevel@vger.kernel.org, linux-aio@kvack.org, linux-block@vger.kernel.org Cc: hch@lst.de, viro@zeniv.linux.org.uk, Jens Axboe <axboe@kernel.dk> Subject: [PATCH 19/22] aio: support kernel side submission for aio with SCQRING Date: Fri, 21 Dec 2018 12:22:33 -0700 Message-Id: <20181221192236.12866-20-axboe@kernel.dk> In-Reply-To: <20181221192236.12866-1-axboe@kernel.dk> References: <20181221192236.12866-1-axboe@kernel.dk> Sender: linux-block-owner@vger.kernel.org Precedence: bulk
Series	[01/22] fs: add an iopoll method to struct file_operations \| expand [01/22] fs: add an iopoll method to struct file_operations [02/22] block: add bio_set_polled() helper [03/22] block: wire up block device iopoll method [04/22] block: use REQ_HIPRI_ASYNC for non-sync polled IO [05/22] block: use bio_set_polled() helper for O_DIRECT [06/22] iomap: wire up the iopoll method [07/22] aio: add io_setup2() system call [08/22] aio: support for IO polling [09/22] aio: add submission side request cache [10/22] fs: add fget_many() and fput_many() [11/22] aio: use fget/fput_many() for file references [12/22] aio: split iocb init from allocation [13/22] aio: batch aio_kiocb allocation [14/22] aio: split old ring complete out from aio_complete() [15/22] aio: pass in user index to __io_submit_one() [16/22] aio: add support for submission/completion rings [17/22] block: implement bio helper to add iter bvec pages to bio [18/22] aio: add support for pre-mapped user IO buffers [19/22] aio: support kernel side submission for aio with SCQRING [20/22] aio: enable polling for IOCTX_FLAG_SQTHREAD [21/22] aio: utilize io_event->res2 for CQ ring [22/22] aio: add my copyright

diff --git a/fs/aio.c b/fs/aio.c index c424aa2ed336..cd4a61642b46 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -25,6 +25,7 @@ #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mmu_context.h> @@ -44,6 +45,7 @@ #include <linux/mount.h> #include <linux/sizes.h> #include <linux/nospec.h> +#include <linux/sched/mm.h> #include <asm/kmap_types.h> #include <linux/uaccess.h> @@ -116,6 +118,14 @@ struct aio_mapped_ubuf { unsigned int nr_bvecs; }; +struct aio_sq_offload { + struct task_struct *thread; /* if using a thread */ + struct workqueue_struct *wq; /* wq offload */ + struct mm_struct *mm; + struct files_struct *files; + wait_queue_head_t wait; +}; + struct kioctx { struct percpu_ref users; atomic_t dead; @@ -158,6 +168,10 @@ struct kioctx { struct aio_iocb_ring sq_ring; struct aio_mapped_range cq_ring; int cq_ring_overflow; + int submit_eagain; + + /* sq ring submitter thread, if used */ + struct aio_sq_offload sq_offload; struct rcu_work free_rwork; /* see free_ioctx() */ @@ -252,6 +266,7 @@ struct aio_kiocb { unsigned long ki_flags; #define KIOCB_F_POLL_COMPLETED 0 /* polled IO has completed */ #define KIOCB_F_POLL_EAGAIN 1 /* polled submission got EAGAIN */ +#define KIOCB_F_FORCE_NONBLOCK 2 /* inline submission attempt */ refcount_t ki_refcnt; @@ -1349,19 +1364,31 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) unsigned int tail; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Flag it as an overflow - * condition, and next io_ring_enter(2) call will return - * -EOVERFLOW. + * Catch EAGAIN early if we've forced a nonblock attempt, as + * we don't want to pass that back down to userspace through + * the CQ ring. Just mark the ctx as such, so the caller will + * see it and punt to workqueue. This is just for buffered + * aio reads. */ - spin_lock_irqsave(&ctx->completion_lock, flags); - ev = aio_peek_cqring(ctx, &tail); - if (ev) { - aio_fill_event(ev, iocb, res, res2); - aio_commit_cqring(ctx, tail); - } else - ctx->cq_ring_overflow = 1; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (res == -EAGAIN && + test_bit(KIOCB_F_FORCE_NONBLOCK, &iocb->ki_flags)) { + ctx->submit_eagain = 1; + } else { + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Flag it as an overflow + * condition, and next io_ring_enter(2) call will return + * -EOVERFLOW. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + ev = aio_peek_cqring(ctx, &tail); + if (ev) { + aio_fill_event(ev, iocb, res, res2); + aio_commit_cqring(ctx, tail); + } else + ctx->cq_ring_overflow = 1; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } } else { aio_ring_complete(ctx, iocb, res, res2); @@ -1727,6 +1754,63 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, return ret; } +static int aio_sq_thread(void *); + +static int aio_sq_thread_start(struct kioctx *ctx) +{ + struct aio_sq_ring *ring = ctx->sq_ring.ring; + struct aio_sq_offload *aso = &ctx->sq_offload; + int ret; + + memset(aso, 0, sizeof(*aso)); + init_waitqueue_head(&aso->wait); + + if (!(ctx->flags & IOCTX_FLAG_FIXEDBUFS)) + aso->mm = current->mm; + + ret = -EBADF; + aso->files = get_files_struct(current); + if (!aso->files) + goto err; + + if (ctx->flags & IOCTX_FLAG_SQTHREAD) { + char name[32]; + + snprintf(name, sizeof(name), "aio-sq-%lu/%d", ctx->user_id, + ring->sq_thread_cpu); + aso->thread = kthread_create_on_cpu(aio_sq_thread, ctx, + ring->sq_thread_cpu, name); + if (IS_ERR(aso->thread)) { + ret = PTR_ERR(aso->thread); + aso->thread = NULL; + goto err; + } + wake_up_process(aso->thread); + } else if (ctx->flags & IOCTX_FLAG_SQWQ) { + int concurrency; + + /* Do QD, or 2 * CPUS, whatever is smallest */ + concurrency = min(ring->nr_events - 1, 2 * num_online_cpus()); + aso->wq = alloc_workqueue("aio-sq-%lu", + WQ_UNBOUND | WQ_FREEZABLE, + concurrency, ctx->user_id); + if (!aso->wq) { + ret = -ENOMEM; + goto err; + } + } + + return 0; +err: + if (aso->files) { + put_files_struct(aso->files); + aso->files = NULL; + } + if (aso->mm) + aso->mm = NULL; + return ret; +} + static void aio_unmap_range(struct aio_mapped_range *range) { int i; @@ -1772,6 +1856,20 @@ static int aio_map_range(struct aio_mapped_range *range, void __user *uaddr, static void aio_scqring_unmap(struct kioctx *ctx) { + struct aio_sq_offload *aso = &ctx->sq_offload; + + if (aso->thread) { + kthread_park(aso->thread); + kthread_stop(aso->thread); + aso->thread = NULL; + } else if (aso->wq) { + destroy_workqueue(aso->wq); + aso->wq = NULL; + } + if (aso->files) { + put_files_struct(aso->files); + aso->files = NULL; + } aio_unmap_range(&ctx->sq_ring.ring_range); aio_unmap_range(&ctx->sq_ring.iocb_range); aio_unmap_range(&ctx->cq_ring); @@ -1833,6 +1931,9 @@ static int aio_scqring_map(struct kioctx *ctx, kcq_ring->nr_events = cq_ring_size; kcq_ring->head = kcq_ring->tail = 0; + if (ctx->flags & (IOCTX_FLAG_SQTHREAD | IOCTX_FLAG_SQWQ)) + ret = aio_sq_thread_start(ctx); + err: if (ret) { aio_unmap_range(&ctx->sq_ring.ring_range); @@ -1978,7 +2079,8 @@ SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags, long ret; if (flags & ~(IOCTX_FLAG_IOPOLL | IOCTX_FLAG_SCQRING | - IOCTX_FLAG_FIXEDBUFS)) + IOCTX_FLAG_FIXEDBUFS | IOCTX_FLAG_SQTHREAD | + IOCTX_FLAG_SQWQ)) return -EINVAL; ret = get_user(ctx, ctxp); @@ -1999,8 +2101,9 @@ SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags, if (ret) goto err; } - } else if (flags & IOCTX_FLAG_FIXEDBUFS) { - /* can only support fixed bufs with SQ/CQ ring */ + } else if (flags & (IOCTX_FLAG_FIXEDBUFS | IOCTX_FLAG_SQTHREAD | + IOCTX_FLAG_SQWQ)) { + /* These features only supported with SCQRING */ ret = -EINVAL; goto err; } @@ -2210,7 +2313,7 @@ static struct file *aio_file_get(struct aio_submit_state *state, int fd) } static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb, - struct aio_submit_state *state) + struct aio_submit_state *state, bool force_nonblock) { struct kioctx *ctx = kiocb->ki_ctx; struct kiocb *req = &kiocb->rw; @@ -2243,6 +2346,10 @@ static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb, ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) goto out_fput; + if (force_nonblock) { + req->ki_flags |= IOCB_NOWAIT; + set_bit(KIOCB_F_FORCE_NONBLOCK, &kiocb->ki_flags); + } if (iocb->aio_flags & IOCB_FLAG_HIPRI) { /* shares space in the union, and is rather pointless.. */ @@ -2411,7 +2518,7 @@ static void aio_iopoll_iocb_issued(struct aio_submit_state *state, static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb, struct aio_submit_state *state, bool vectored, - bool compat, bool kaddr) + bool compat, bool kaddr, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *req = &kiocb->rw; @@ -2419,7 +2526,7 @@ static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb, struct file *file; ssize_t ret; - ret = aio_prep_rw(kiocb, iocb, state); + ret = aio_prep_rw(kiocb, iocb, state, force_nonblock); if (ret) return ret; file = req->ki_filp; @@ -2456,7 +2563,7 @@ static ssize_t aio_write(struct aio_kiocb *kiocb, const struct iocb *iocb, struct file *file; ssize_t ret; - ret = aio_prep_rw(kiocb, iocb, state); + ret = aio_prep_rw(kiocb, iocb, state, false); if (ret) return ret; file = req->ki_filp; @@ -2709,7 +2816,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, unsigned long ki_index, struct aio_submit_state *state, bool compat, - bool kaddr) + bool kaddr, bool force_nonblock) { struct aio_kiocb *req; ssize_t ret; @@ -2770,13 +2877,15 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, ret = -EINVAL; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(req, iocb, state, false, compat, kaddr); + ret = aio_read(req, iocb, state, false, compat, kaddr, + force_nonblock); break; case IOCB_CMD_PWRITE: ret = aio_write(req, iocb, state, false, compat, kaddr); break; case IOCB_CMD_PREADV: - ret = aio_read(req, iocb, state, true, compat, kaddr); + ret = aio_read(req, iocb, state, true, compat, kaddr, + force_nonblock); break; case IOCB_CMD_PWRITEV: ret = aio_write(req, iocb, state, true, compat, kaddr); @@ -2836,7 +2945,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; - return __io_submit_one(ctx, &iocb, ki_index, state, compat, false); + return __io_submit_one(ctx, &iocb, ki_index, state, compat, false, + false); } #ifdef CONFIG_BLOCK @@ -2956,7 +3066,8 @@ static int aio_ring_submit(struct kioctx *ctx, unsigned int to_submit) if (!iocb) break; - ret = __io_submit_one(ctx, iocb, iocb_index, statep, false, kaddr); + ret = __io_submit_one(ctx, iocb, iocb_index, statep, false, kaddr, + false); if (ret) break; @@ -3008,15 +3119,288 @@ static int aio_cqring_wait(struct kioctx *ctx, int min_events) return ret; } +static void aio_fill_cq_error(struct kioctx *ctx, const struct iocb *iocb, + long ret) +{ + struct io_event *ev; + unsigned tail; + + /* + * Only really need the lock for non-polled IO, but this is an error + * so not worth checking. Just lock it so we know kernel access to + * the CQ ring is serialized. + */ + spin_lock_irq(&ctx->completion_lock); + ev = aio_peek_cqring(ctx, &tail); + ev->obj = iocb->aio_data; + ev->data = 0; + ev->res = ret; + ev->res2 = 0; + aio_commit_cqring(ctx, tail); + spin_unlock_irq(&ctx->completion_lock); + + /* + * for thread offload, app could already be sleeping in io_ring_enter() + * before we get to flag the error. wake them up, if needed. + */ + if (ctx->flags & (IOCTX_FLAG_SQTHREAD | IOCTX_FLAG_SQWQ)) + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +struct iocb_submit { + const struct iocb *iocb; + unsigned int index; +}; + +static int aio_submit_iocbs(struct kioctx *ctx, struct iocb_submit *iocbs, + unsigned int nr, struct mm_struct *cur_mm, + bool mm_fault) +{ + struct aio_submit_state state, *statep = NULL; + int ret, i, submitted = 0; + + if (nr > AIO_PLUG_THRESHOLD) { + aio_submit_state_start(&state, ctx, nr); + statep = &state; + } + + for (i = 0; i < nr; i++) { + if (unlikely(mm_fault)) + ret = -EFAULT; + else + ret = __io_submit_one(ctx, iocbs[i].iocb, + iocbs[i].index, statep, false, + !cur_mm, false); + if (!ret) { + submitted++; + continue; + } + + aio_fill_cq_error(ctx, iocbs[i].iocb, ret); + } + + if (statep) + aio_submit_state_end(&state); + + return submitted; +} + +/* + * sq thread only supports O_DIRECT or FIXEDBUFS IO + */ +static int aio_sq_thread(void *data) +{ + struct iocb_submit iocbs[AIO_IOPOLL_BATCH]; + struct kioctx *ctx = data; + struct aio_sq_offload *aso = &ctx->sq_offload; + struct mm_struct *cur_mm = NULL; + struct files_struct *old_files; + mm_segment_t old_fs; + DEFINE_WAIT(wait); + + old_files = current->files; + current->files = aso->files; + + old_fs = get_fs(); + set_fs(USER_DS); + + while (!kthread_should_stop()) { + const struct iocb *iocb; + bool mm_fault = false; + unsigned nhead, index; + int i; + + iocb = aio_peek_sqring(ctx, &index, &nhead); + if (!iocb) { + prepare_to_wait(&aso->wait, &wait, TASK_INTERRUPTIBLE); + iocb = aio_peek_sqring(ctx, &index, &nhead); + if (!iocb) { + /* + * Drop cur_mm before scheduler. We can't hold + * it for long periods, and it would also + * introduce a deadlock with kill_ioctx(). + */ + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + cur_mm = NULL; + } + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) { + finish_wait(&aso->wait, &wait); + break; + } + if (signal_pending(current)) + flush_signals(current); + schedule(); + } + finish_wait(&aso->wait, &wait); + if (!iocb) + continue; + } + + /* If ->mm is set, we're not doing FIXEDBUFS */ + if (aso->mm && !cur_mm) { + mm_fault = !mmget_not_zero(aso->mm); + if (!mm_fault) { + use_mm(aso->mm); + cur_mm = aso->mm; + } + } + + i = 0; + do { + if (i == ARRAY_SIZE(iocbs)) + break; + iocbs[i].iocb = iocb; + iocbs[i].index = index; + ++i; + aio_commit_sqring(ctx, nhead); + } while ((iocb = aio_peek_sqring(ctx, &index, &nhead)) != NULL); + + aio_submit_iocbs(ctx, iocbs, i, cur_mm, mm_fault); + } + current->files = old_files; + set_fs(old_fs); + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + } + return 0; +} + +struct aio_io_work { + struct work_struct work; + struct kioctx *ctx; + struct iocb iocb; + unsigned iocb_index; +}; + +static void aio_sq_wq_submit_work(struct work_struct *work) +{ + struct aio_io_work *aiw = container_of(work, struct aio_io_work, work); + struct kioctx *ctx = aiw->ctx; + struct aio_sq_offload *aso = &ctx->sq_offload; + mm_segment_t old_fs = get_fs(); + struct files_struct *old_files; + int ret; + + old_files = current->files; + current->files = aso->files; + + if (aso->mm) { + if (!mmget_not_zero(aso->mm)) { + ret = -EFAULT; + goto err; + } + use_mm(aso->mm); + } + + set_fs(USER_DS); + + ret = __io_submit_one(ctx, &aiw->iocb, aiw->iocb_index, NULL, false, + !aso->mm, false); + + set_fs(old_fs); + if (aso->mm) { + unuse_mm(aso->mm); + mmput(aso->mm); + } + +err: + if (ret) + aio_fill_cq_error(ctx, &aiw->iocb, ret); + current->files = old_files; + kfree(aiw); +} + +/* + * If this is a read, try a cached inline read first. If the IO is in the + * page cache, we can satisfy it without blocking and without having to + * punt to a threaded execution. This is much faster, particularly for + * lower queue depth IO, and it's always a lot more efficient. + */ +static bool aio_sq_try_inline(struct kioctx *ctx, const struct iocb *iocb, + unsigned index) +{ + struct aio_sq_offload *aso = &ctx->sq_offload; + int ret; + + if (iocb->aio_lio_opcode != IOCB_CMD_PREAD && + iocb->aio_lio_opcode != IOCB_CMD_PREADV) + return false; + + ret = __io_submit_one(ctx, iocb, index, NULL, false, !aso->mm, true); + if (ret == -EAGAIN || ctx->submit_eagain) { + ctx->submit_eagain = 0; + return false; + } + + /* + * We're done - even if this was an error, return 0. The error will + * be in the CQ ring for the application. + */ + return true; +} + +static int aio_sq_wq_submit(struct kioctx *ctx, unsigned int to_submit) +{ + struct aio_io_work *work; + const struct iocb *iocb; + unsigned nhead, index; + int ret, queued; + + ret = queued = 0; + while ((iocb = aio_peek_sqring(ctx, &index, &nhead)) != NULL) { + ret = aio_sq_try_inline(ctx, iocb, index); + if (!ret) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + ret = -ENOMEM; + break; + } + memcpy(&work->iocb, iocb, sizeof(*iocb)); + aio_commit_sqring(ctx, nhead); + work->iocb_index = index; + INIT_WORK(&work->work, aio_sq_wq_submit_work); + work->ctx = ctx; + queue_work(ctx->sq_offload.wq, &work->work); + } + queued++; + if (queued == to_submit) + break; + } + + return queued ? queued : ret; +} + static int __io_ring_enter(struct kioctx *ctx, unsigned int to_submit, unsigned int min_complete, unsigned int flags) { int ret = 0; if (flags & IORING_FLAG_SUBMIT) { - ret = aio_ring_submit(ctx, to_submit); - if (ret < 0) - return ret; + if (!to_submit) + return 0; + + /* + * Three options here: + * 1) We have an sq thread, just wake it up to do submissions + * 2) We have an sq wq, queue a work item for each iocb + * 3) Submit directly + */ + if (ctx->flags & IOCTX_FLAG_SQTHREAD) { + wake_up(&ctx->sq_offload.wait); + ret = to_submit; + } else if (ctx->flags & IOCTX_FLAG_SQWQ) { + ret = aio_sq_wq_submit(ctx, to_submit); + } else { + ret = aio_ring_submit(ctx, to_submit); + if (ret < 0) + return ret; + } } if (flags & IORING_FLAG_GETEVENTS) { unsigned int nr_events = 0; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 39d783175872..b09b1976e038 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -111,6 +111,8 @@ struct iocb { #define IOCTX_FLAG_IOPOLL (1 << 0) /* io_context is polled */ #define IOCTX_FLAG_SCQRING (1 << 1) /* Use SQ/CQ rings */ #define IOCTX_FLAG_FIXEDBUFS (1 << 2) /* IO buffers are fixed */ +#define IOCTX_FLAG_SQTHREAD (1 << 3) /* Use SQ thread */ +#define IOCTX_FLAG_SQWQ (1 << 4) /* Use SQ workqueue */ struct aio_sq_ring { union { @@ -118,6 +120,7 @@ struct aio_sq_ring { u32 head; /* kernel consumer head */ u32 tail; /* app producer tail */ u32 nr_events; /* max events in ring */ + u16 sq_thread_cpu; u64 iocbs; /* setup pointer to app iocbs */ }; u32 pad[16];

[19/22] aio: support kernel side submission for aio with SCQRING

Commit Message

Comments

Patch