@@ -256,7 +256,7 @@ struct io_ring_ctx {
struct task_struct *submitter_task;
struct io_rings *rings;
- struct percpu_ref refs;
+ atomic_long_t refs;
clockid_t clockid;
enum tk_offsets clock_offset;
@@ -252,13 +252,6 @@ static __cold void io_kworker_tw_end(void)
current->flags |= PF_NO_TASKWORK;
}
-static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
-{
- struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
-
- complete(&ctx->ref_comp);
-}
-
static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -269,13 +262,13 @@ static __cold void io_fallback_req_func(struct work_struct *work)
io_kworker_tw_start();
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, ts);
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
io_kworker_tw_end();
}
@@ -333,10 +326,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
hash_bits = clamp(hash_bits, 1, 8);
if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
goto err;
- if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- 0, GFP_KERNEL))
- goto err;
+ io_ring_ref_init(ctx);
ctx->flags = p->flags;
ctx->hybrid_poll_time = LLONG_MAX;
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -360,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_futex_cache_init(ctx);
ret |= io_rsrc_cache_init(ctx);
if (ret)
- goto free_ref;
+ goto err;
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -386,9 +377,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->mmap_lock);
return ctx;
-
-free_ref:
- percpu_ref_exit(&ctx->refs);
err:
io_free_alloc_caches(ctx);
kvfree(ctx->cancel_table.hbs);
@@ -556,7 +544,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* worker for it).
*/
if (WARN_ON_ONCE(!same_thread_group(tctx->task, current) &&
- !percpu_ref_is_dying(&req->ctx->refs)))
+ !io_ring_ref_is_dying(req->ctx)))
atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -998,7 +986,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
ret = 1;
}
- percpu_ref_get_many(&ctx->refs, ret);
+ io_ring_ref_get_many(ctx, ret);
while (ret--) {
struct io_kiocb *req = reqs[ret];
@@ -1053,7 +1041,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw)
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
/*
@@ -1077,7 +1065,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
ctx_flush_and_put(ctx, ts);
ctx = req->ctx;
mutex_lock(&ctx->uring_lock);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
}
INDIRECT_CALL_2(req->io_task_work.func,
io_poll_task_func, io_req_rw_complete,
@@ -1106,10 +1094,10 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
if (sync && last_ctx != req->ctx) {
if (last_ctx) {
flush_delayed_work(&last_ctx->fallback_work);
- percpu_ref_put(&last_ctx->refs);
+ io_ring_ref_put(last_ctx);
}
last_ctx = req->ctx;
- percpu_ref_get(&last_ctx->refs);
+ io_ring_ref_get(last_ctx);
}
if (llist_add(&req->io_task_work.node,
&req->ctx->fallback_llist))
@@ -1118,7 +1106,7 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
if (last_ctx) {
flush_delayed_work(&last_ctx->fallback_work);
- percpu_ref_put(&last_ctx->refs);
+ io_ring_ref_put(last_ctx);
}
}
@@ -1255,7 +1243,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
return;
}
- if (!percpu_ref_is_dying(&ctx->refs) &&
+ if (!io_ring_ref_is_dying(ctx) &&
!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))
return;
@@ -2739,7 +2727,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
nr++;
}
if (nr)
- percpu_ref_put_many(&ctx->refs, nr);
+ io_ring_ref_put_many(ctx, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -2773,7 +2761,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
static_branch_dec(&io_key_has_sqarray);
- percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
if (ctx->hash_map)
@@ -2798,7 +2785,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
* might've been lost due to loose synchronisation.
*/
wake_up_all(&ctx->poll_wq);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
@@ -2816,9 +2803,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
* only need to sync with it, which is done by injecting a tw
*/
init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
out:
spin_unlock(&ctx->completion_lock);
}
@@ -3005,7 +2992,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
struct creds *creds;
mutex_lock(&ctx->uring_lock);
- percpu_ref_kill(&ctx->refs);
+ io_ring_ref_kill(ctx);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
@@ -13,6 +13,7 @@
#include "slist.h"
#include "filetable.h"
#include "opdef.h"
+#include "refs.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -143,7 +144,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
* Not from an SQE, as those cannot be submitted, but via
* updating tagged resources.
*/
- if (!percpu_ref_is_dying(&ctx->refs))
+ if (!io_ring_ref_is_dying(ctx))
lockdep_assert(current == ctx->submitter_task);
}
#endif
@@ -83,7 +83,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
}
if (req)
kmem_cache_free(req_cachep, req);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -95,7 +95,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
req->cqe.user_data = user_data;
io_req_set_res(req, res, cflags);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
req->ctx = ctx;
req->tctx = NULL;
req->io_task_work.func = io_msg_tw_complete;
@@ -52,4 +52,47 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
{
__io_req_set_refcount(req, 1);
}
+
+#define IO_RING_REF_DEAD (1UL << (BITS_PER_LONG - 1))
+#define IO_RING_REF_MASK (~IO_RING_REF_DEAD)
+
+static inline bool io_ring_ref_is_dying(struct io_ring_ctx *ctx)
+{
+ return atomic_long_read(&ctx->refs) & IO_RING_REF_DEAD;
+}
+
+static inline void io_ring_ref_put_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+ unsigned long refs;
+
+ refs = atomic_long_sub_return(nr_refs, &ctx->refs);
+ if (!(refs & IO_RING_REF_MASK))
+ complete(&ctx->ref_comp);
+}
+
+static inline void io_ring_ref_put(struct io_ring_ctx *ctx)
+{
+ io_ring_ref_put_many(ctx, 1);
+}
+
+static inline void io_ring_ref_kill(struct io_ring_ctx *ctx)
+{
+ atomic_long_xor(IO_RING_REF_DEAD, &ctx->refs);
+ io_ring_ref_put(ctx);
+}
+
+static inline void io_ring_ref_init(struct io_ring_ctx *ctx)
+{
+ atomic_long_set(&ctx->refs, 1);
+}
+
+static inline void io_ring_ref_get_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+ atomic_long_add(nr_refs, &ctx->refs);
+}
+
+static inline void io_ring_ref_get(struct io_ring_ctx *ctx)
+{
+ atomic_long_inc(&ctx->refs);
+}
#endif
@@ -637,7 +637,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* We don't quiesce the refs for register anymore and so it can't be
* dying as we're holding a file ref here.
*/
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
+ if (WARN_ON_ONCE(io_ring_ref_is_dying(ctx)))
return -ENXIO;
if (ctx->submitter_task && ctx->submitter_task != current)
@@ -496,7 +496,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
* Don't attempt to reissue from that path, just let it fail with
* -EAGAIN.
*/
- if (percpu_ref_is_dying(&ctx->refs))
+ if (io_ring_ref_is_dying(ctx))
return false;
io_meta_restore(io, &rw->kiocb);
@@ -184,7 +184,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
* Don't submit if refs are dying, good for io_uring_register(),
* but also it is relied upon by io_ring_exit_work()
*/
- if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+ if (to_submit && likely(!io_ring_ref_is_dying(ctx)) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
@@ -629,7 +629,7 @@ static int io_pp_zc_init(struct page_pool *pp)
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
- percpu_ref_get(&ifq->ctx->refs);
+ io_ring_ref_get(ifq->ctx);
return 0;
}
@@ -640,7 +640,7 @@ static void io_pp_zc_destroy(struct page_pool *pp)
if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
return;
- percpu_ref_put(&ifq->ctx->refs);
+ io_ring_ref_put(ifq->ctx);
}
static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
For the common cases, the io_uring ref counts are all batched and hence
need not be a percpu reference. This saves some memory, but beyond that
it also gets rid of needing a full RCU grace period when tearing down
the reference. With io_uring now waiting on cancelations and IO during
exit, that grace period slows the teardown considerably, up to 100x as
slow.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 47 ++++++++++++----------------------
 io_uring/io_uring.h            |  3 ++-
 io_uring/msg_ring.c            |  4 +--
 io_uring/refs.h                | 43 +++++++++++++++++++++++++++++++
 io_uring/register.c            |  2 +-
 io_uring/rw.c                  |  2 +-
 io_uring/sqpoll.c              |  2 +-
 io_uring/zcrx.c                |  4 +--
 9 files changed, 70 insertions(+), 39 deletions(-)
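
For review purposes, below is a minimal userspace sketch of the dead-bit
refcount scheme the new refs.h helpers implement: the top bit of a plain
atomic long marks the ring as dying, and the completion fires once the
counted references drop to zero. It uses C11 atomics and made-up ring_ref_*
names rather than the kernel's atomic_long_t API, so treat it as an
illustration of the idea, not as the kernel code itself.

/*
 * Standalone sketch of the dead-bit refcount scheme, built on C11
 * atomics. Names are illustrative, not the kernel helpers.
 */
#include <assert.h>
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RING_REF_DEAD	(1UL << (sizeof(unsigned long) * CHAR_BIT - 1))
#define RING_REF_MASK	(~RING_REF_DEAD)

struct ring {
	atomic_ulong refs;
	bool released;		/* stands in for complete(&ctx->ref_comp) */
};

static void ring_ref_init(struct ring *r)
{
	atomic_store(&r->refs, 1);	/* initial reference held by the creator */
	r->released = false;
}

static bool ring_ref_is_dying(struct ring *r)
{
	return atomic_load(&r->refs) & RING_REF_DEAD;
}

static void ring_ref_get_many(struct ring *r, unsigned long nr)
{
	atomic_fetch_add(&r->refs, nr);
}

static void ring_ref_put_many(struct ring *r, unsigned long nr)
{
	unsigned long refs = atomic_fetch_sub(&r->refs, nr) - nr;

	/* all counted references gone: only the DEAD marker (if any) remains */
	if (!(refs & RING_REF_MASK))
		r->released = true;
}

static void ring_ref_kill(struct ring *r)
{
	/* flag the ring as dying, then drop the initial reference */
	atomic_fetch_xor(&r->refs, RING_REF_DEAD);
	ring_ref_put_many(r, 1);
}

int main(void)
{
	struct ring r;

	ring_ref_init(&r);
	ring_ref_get_many(&r, 3);	/* e.g. a batch of requests */
	ring_ref_kill(&r);		/* exit path: mark dying, drop init ref */
	assert(ring_ref_is_dying(&r) && !r.released);
	ring_ref_put_many(&r, 3);	/* last put signals the "completion" */
	assert(r.released);
	printf("refs torn down without an RCU grace period\n");
	return 0;
}

The xor in ring_ref_kill() mirrors io_ring_ref_kill() above: it sets the
dying flag and then drops the initial reference taken at init time, so the
final put, wherever it happens, is what completes ref_comp.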