[8/9] io_uring/epoll: add support for IORING_OP_EPOLL_WAIT

Message ID	20250203163114.124077-9-axboe@kernel.dk (mailing list archive)
State	New
Headers	show Received: from mail-io1-f52.google.com (mail-io1-f52.google.com [209.85.166.52]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 58A5720B1F2 for <linux-fsdevel@vger.kernel.org>; Mon, 3 Feb 2025 16:31:32 +0000 (UTC) From: Jens Axboe <axboe@kernel.dk> To: io-uring@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org, brauner@kernel.org, Jens Axboe <axboe@kernel.dk> Subject: [PATCH 8/9] io_uring/epoll: add support for IORING_OP_EPOLL_WAIT Date: Mon, 3 Feb 2025 09:23:46 -0700 Message-ID: <20250203163114.124077-9-axboe@kernel.dk> In-Reply-To: <20250203163114.124077-1-axboe@kernel.dk> References: <20250203163114.124077-1-axboe@kernel.dk> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	io_uring epoll wait support \| expand [PATCHSET,0/9] io_uring epoll wait support [1/9] eventpoll: abstract out main epoll reaper into a function [2/9] eventpoll: add helper to remove wait entry from wait queue head [3/9] eventpoll: abstract out ep_try_send_events() helper [4/9] eventpoll: add struct wait_queue_entry argument to epoll_wait() [5/9] eventpoll: add ep_poll_queue() loop [6/9] io_uring/epoll: remove CONFIG_EPOLL guards [7/9] io_uring/poll: pull ownership handling into poll.h [8/9] io_uring/epoll: add support for IORING_OP_EPOLL_WAIT [9/9] io_uring/epoll: add multishot support for IORING_OP_EPOLL_WAIT

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3def525a1da3..ee56992d31d5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -370,6 +370,10 @@ struct io_ring_ctx { struct io_alloc_cache futex_cache; #endif +#ifdef CONFIG_EPOLL + struct hlist_head epoll_list; +#endif + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e11c82638527..a559e1e1544a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -278,6 +278,7 @@ enum io_uring_op { IORING_OP_FTRUNCATE, IORING_OP_BIND, IORING_OP_LISTEN, + IORING_OP_EPOLL_WAIT, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 484193567839..9cebd0145cb4 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -17,6 +17,7 @@ #include "timeout.h" #include "waitid.h" #include "futex.h" +#include "epoll.h" #include "cancel.h" struct io_cancel { @@ -128,6 +129,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, if (ret != -ENOENT) return ret; + ret = io_epoll_wait_cancel(ctx, cd, issue_flags); + if (ret != -ENOENT) + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 7848d9cc073d..2a9c679516c8 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "epoll.h" +#include "poll.h" struct io_epoll { struct file *file; @@ -20,6 +21,13 @@ struct io_epoll { struct epoll_event event; }; +struct io_epoll_wait { + struct file *file; + int maxevents; + struct epoll_event __user *events; + struct wait_queue_entry wait; +}; + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll); @@ -57,3 +65,163 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +static void __io_epoll_cancel(struct io_kiocb *req) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + epoll_wait_remove(req->file, &iew->wait); + hlist_del_init(&req->hash_node); + io_req_set_res(req, -ECANCELED, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); +} + +static void __io_epoll_wait_cancel(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + if (io_poll_get_ownership(req)) { + __io_epoll_cancel(req); + io_req_set_res(req, -ECANCELED, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + } +} + +bool io_epoll_wait_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->epoll_list, hash_node) { + if (!io_match_task_safe(req, tctx, cancel_all)) + continue; + __io_epoll_wait_cancel(req); + found = true; + } + + return found; +} + +int io_epoll_wait_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, &ctx->epoll_list, hash_node) { + if (!io_cancel_req_match(req, cd)) + continue; + __io_epoll_wait_cancel(req); + nr++; + } + io_ring_submit_unlock(ctx, issue_flags); + return nr ?: -ENOENT; +} + +static void io_epoll_retry(struct io_kiocb *req, struct io_tw_state *ts) +{ + int v; + + do { + v = atomic_read(&req->poll_refs); + if (unlikely(v != 1)) { + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return; + if (v & IO_POLL_CANCEL_FLAG) { + __io_epoll_cancel(req); + return; + } + } + v &= IO_POLL_REF_MASK; + } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); + + io_req_task_submit(req, ts); +} + +static int io_epoll_execute(struct io_kiocb *req) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (io_poll_get_ownership(req)) { + list_del_init_careful(&iew->wait.entry); + req->io_task_work.func = io_epoll_retry; + io_req_task_work_add(req); + return 1; + } + + return 0; +} + +static __cold int io_epoll_pollfree_wake(struct io_kiocb *req) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + io_poll_mark_cancelled(req); + list_del_init_careful(&iew->wait.entry); + io_epoll_execute(req); + return 1; +} + +static int io_epoll_wait_fn(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct io_kiocb *req = wait->private; + __poll_t mask = key_to_poll(key); + + if (unlikely(mask & POLLFREE)) + return io_epoll_pollfree_wake(req); + + return io_epoll_execute(req); +} + +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + iew->maxevents = READ_ONCE(sqe->len); + iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + iew->wait.flags = 0; + iew->wait.private = req; + iew->wait.func = io_epoll_wait_fn; + INIT_LIST_HEAD(&iew->wait.entry); + atomic_set(&req->poll_refs, 0); + return 0; +} + +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + hlist_add_head(&req->hash_node, &ctx->epoll_list); + io_ring_submit_unlock(ctx, issue_flags); + + /* + * Timeout is fake here, it doesn't indicate any kind of sleep time. + * It's just set to something that is non-zero, so that wait queue + * wakeup is armed if no events are available. + */ + ret = epoll_wait(req->file, iew->events, iew->maxevents, NULL, &iew->wait); + if (ret == -EIOCBQUEUED) + return IOU_ISSUE_SKIP_COMPLETE; + else if (ret < 0) + req_set_fail(req); + io_ring_submit_lock(ctx, issue_flags); + hlist_del_init(&req->hash_node); + io_ring_submit_unlock(ctx, issue_flags); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/epoll.h b/io_uring/epoll.h index 870cce11ba98..296940d89063 100644 --- a/io_uring/epoll.h +++ b/io_uring/epoll.h @@ -1,6 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 +#include "cancel.h" + #if defined(CONFIG_EPOLL) +int io_epoll_wait_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags); +bool io_epoll_wait_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + bool cancel_all); + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags); +#else +static inline bool io_epoll_wait_remove_all(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all) +{ + return false; +} +static inline int io_epoll_wait_cancel(struct io_ring_ctx *ctx, + struct io_cancel_data *cd, + unsigned int issue_flags) +{ + return 0; +} #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2f311aeb536f..a17abdbae7ee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "notif.h" #include "waitid.h" #include "futex.h" +#include "epoll.h" #include "napi.h" #include "uring_cmd.h" #include "msg_ring.h" @@ -358,6 +359,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); +#endif +#ifdef CONFIG_EPOLL + INIT_HLIST_HEAD(&ctx->epoll_list); #endif INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); @@ -3095,6 +3099,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_epoll_wait_remove_all(ctx, tctx, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, tctx, cancel_all); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e8baef4e5146..44553a657476 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -514,6 +514,17 @@ const struct io_issue_def io_issue_defs[] = { .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_EPOLL_WAIT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, +#if defined(CONFIG_EPOLL) + .prep = io_epoll_wait_prep, + .issue = io_epoll_wait, +#else + .prep = io_eopnotsupp_prep, #endif }, }; @@ -745,6 +756,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_EPOLL_WAIT] = { + .name = "EPOLL_WAIT", + }, }; const char *io_uring_get_opcode(u8 opcode)

[8/9] io_uring/epoll: add support for IORING_OP_EPOLL_WAIT

Commit Message

Patch