| Message ID | 20221031134126.82928-2-dylany@meta.com |
| --- | --- |
| State | New |
| Series | io_uring: retarget rsrc nodes periodically |
On 10/31/22 7:41 AM, Dylan Yudaken wrote:
> +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	percpu_ref_get(&ctx->refs);
> +	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
> +	ctx->rsrc_retarget_scheduled = true;
> +}

Can this ever be called with rsrc_retarget_work already pending? If so,
that would seem to leak a ctx ref.
On Mon, 2022-10-31 at 10:02 -0600, Jens Axboe wrote:
> On 10/31/22 7:41 AM, Dylan Yudaken wrote:
> > +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> > +	__must_hold(&ctx->uring_lock)
> > +{
> > +	percpu_ref_get(&ctx->refs);
> > +	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 *
> > HZ);
> > +	ctx->rsrc_retarget_scheduled = true;
> > +}
>
> Can this ever be called with rsrc_retarget_work already pending? If
> so,
> that would seem to leak a ctx ref.
>

Not in this patch. In the later patch it can (but only on shutdown) and
it checks for it.
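For illustration, here is a minimal userspace model of the guard being discussed: the scheduled flag, only touched under the ring lock, ensures the helper arms the work and takes the ctx reference at most once per cycle, and the work callback clears the flag before dropping that single reference. The names and plain counters below are stand-ins invented for the sketch, not the kernel's percpu_ref/delayed_work APIs.

```c
/*
 * Minimal userspace model of the scheduling guard (illustration only;
 * plain ints and bools stand in for percpu_ref and delayed_work).
 */
#include <stdbool.h>
#include <stdio.h>

struct ctx_model {
	int refs;			/* models percpu_ref on ctx->refs */
	bool work_pending;		/* models a pending delayed_work */
	bool retarget_scheduled;
};

/* Models io_rsrc_retarget_schedule(); called with the ring lock held. */
static void schedule_retarget(struct ctx_model *ctx)
{
	ctx->refs++;			/* percpu_ref_get(&ctx->refs) */
	ctx->work_pending = true;	/* mod_delayed_work(...) */
	ctx->retarget_scheduled = true;
}

/* Models the caller-side check done before scheduling. */
static void node_switch(struct ctx_model *ctx)
{
	if (!ctx->retarget_scheduled)
		schedule_retarget(ctx);
}

/* Models the work callback: clear the flag, then drop the one ref. */
static void retarget_work(struct ctx_model *ctx)
{
	ctx->work_pending = false;
	ctx->retarget_scheduled = false;
	/* ... retarget requests here ... */
	ctx->refs--;			/* percpu_ref_put(&ctx->refs) */
}

int main(void)
{
	struct ctx_model ctx = { .refs = 1 };

	node_switch(&ctx);	/* arms the work and takes one extra ref */
	node_switch(&ctx);	/* no-op: already scheduled, no second ref */
	retarget_work(&ctx);	/* runs once, drops the single extra ref */

	printf("refs=%d pending=%d\n", ctx.refs, ctx.work_pending);
	return 0;
}
```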
On 10/31/22 13:41, Dylan Yudaken wrote:
> rsrc node cleanup can be indefinitely delayed when there are long lived
> requests. For example if a file is located in the same rsrc node as a long
> lived socket with multishot poll, then even if unregistering the file it
> will not be closed while the poll request is still active.
>
> Introduce a timer when rsrc node is switched, so that periodically we can
> retarget these long lived requests to the newest nodes. That will allow
> the old nodes to be cleaned up, freeing resources.
>
> Signed-off-by: Dylan Yudaken <dylany@meta.com>
> ---
>  include/linux/io_uring_types.h |  2 +
>  io_uring/io_uring.c            |  1 +
>  io_uring/opdef.h               |  1 +
>  io_uring/rsrc.c                | 92 ++++++++++++++++++++++++++++++++++
>  io_uring/rsrc.h                |  1 +
>  5 files changed, 97 insertions(+)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f5b687a787a3..1d4eff4e632c 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -327,6 +327,8 @@ struct io_ring_ctx {
>  	struct llist_head		rsrc_put_llist;
>  	struct list_head		rsrc_ref_list;
>  	spinlock_t			rsrc_ref_lock;
> +	struct delayed_work		rsrc_retarget_work;
> +	bool				rsrc_retarget_scheduled;
>
>  	struct list_head		io_buffers_pages;
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 6cc16e39b27f..ea2260359c56 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -320,6 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>  	spin_lock_init(&ctx->rsrc_ref_lock);
>  	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
>  	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
> +	INIT_DELAYED_WORK(&ctx->rsrc_retarget_work, io_rsrc_retarget_work);
>  	init_llist_head(&ctx->rsrc_put_llist);
>  	init_llist_head(&ctx->work_llist);
>  	INIT_LIST_HEAD(&ctx->tctx_list);
> diff --git a/io_uring/opdef.h b/io_uring/opdef.h
> index 3efe06d25473..1b72b14cb5ab 100644
> --- a/io_uring/opdef.h
> +++ b/io_uring/opdef.h
> @@ -37,6 +37,7 @@ struct io_op_def {
>  	int (*prep_async)(struct io_kiocb *);
>  	void (*cleanup)(struct io_kiocb *);
>  	void (*fail)(struct io_kiocb *);
> +	bool (*can_retarget_rsrc)(struct io_kiocb *);

side note: need to be split at some moment into 2 tables depending on
hotness, we want better caching for ->issue and ->prep

>  };
>
>  extern const struct io_op_def io_op_defs[];
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 55d4ab96fb92..106210e0d5d5 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -15,6 +15,7 @@
>  #include "io_uring.h"
>  #include "openclose.h"
>  #include "rsrc.h"
> +#include "opdef.h"
>
>  struct io_rsrc_update {
>  	struct file *file;
> @@ -204,6 +205,95 @@ void io_rsrc_put_work(struct work_struct *work)
>  	}
>  }
>
> +
> +static unsigned int io_rsrc_retarget_req(struct io_ring_ctx *ctx,
> +					 struct io_kiocb *req)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	if (!req->rsrc_node ||
> +	    req->rsrc_node == ctx->rsrc_node)
> +		return 0;
> +	if (!io_op_defs[req->opcode].can_retarget_rsrc)
> +		return 0;
> +	if (!(*io_op_defs[req->opcode].can_retarget_rsrc)(req))
> +		return 0;

nit, there should be no need to deref fptr.

if (!io_op_defs[req->opcode].can_retarget_rsrc(req))
	...

> +
> +	io_rsrc_put_node(req->rsrc_node, 1);
> +	req->rsrc_node = ctx->rsrc_node;
> +	return 1;
> +}
> +
> +static unsigned int io_rsrc_retarget_table(struct io_ring_ctx *ctx,
> +					   struct io_hash_table *table)
> +{
> +	unsigned int nr_buckets = 1U << table->hash_bits;
> +	unsigned int refs = 0;
> +	struct io_kiocb *req;
> +	int i;
> +
> +	for (i = 0; i < nr_buckets; i++) {
> +		struct io_hash_bucket *hb = &table->hbs[i];
> +
> +		spin_lock(&hb->lock);
> +		hlist_for_each_entry(req, &hb->list, hash_node)
> +			refs += io_rsrc_retarget_req(ctx, req);
> +		spin_unlock(&hb->lock);
> +	}
> +	return refs;
> +}
> +
> +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	percpu_ref_get(&ctx->refs);
> +	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
> +	ctx->rsrc_retarget_scheduled = true;
> +}
> +
> +static void __io_rsrc_retarget_work(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	struct io_rsrc_node *node;
> +	unsigned int refs;
> +	bool any_waiting;
> +
> +	if (!ctx->rsrc_node)
> +		return;
> +
> +	spin_lock_irq(&ctx->rsrc_ref_lock);
> +	any_waiting = false;
> +	list_for_each_entry(node, &ctx->rsrc_ref_list, node) {
> +		if (!node->done) {
> +			any_waiting = true;
> +			break;
> +		}
> +	}
> +	spin_unlock_irq(&ctx->rsrc_ref_lock);
> +
> +	if (!any_waiting)
> +		return;
> +
> +	refs = io_rsrc_retarget_table(ctx, &ctx->cancel_table);
> +	refs += io_rsrc_retarget_table(ctx, &ctx->cancel_table_locked);
> +
> +	ctx->rsrc_cached_refs -= refs;
> +	while (unlikely(ctx->rsrc_cached_refs < 0))
> +		io_rsrc_refs_refill(ctx);

We can charge ->rsrc_cached_refs after setting up nodes in prep /
submission without underflowing the actual refs because we know that
the requests are not yet submitted and so won't consume refs.

This one looks more troublesome

> +}
> +
> +void io_rsrc_retarget_work(struct work_struct *work)
> +{
> +	struct io_ring_ctx *ctx;
> +
> +	ctx = container_of(work, struct io_ring_ctx, rsrc_retarget_work.work);
> +
> +	mutex_lock(&ctx->uring_lock);
> +	ctx->rsrc_retarget_scheduled = false;
> +	__io_rsrc_retarget_work(ctx);
> +	mutex_unlock(&ctx->uring_lock);
> +	percpu_ref_put(&ctx->refs);
> +}
> +
>  void io_wait_rsrc_data(struct io_rsrc_data *data)
>  {
>  	if (data && !atomic_dec_and_test(&data->refs))
> @@ -285,6 +375,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
>  		atomic_inc(&data_to_kill->refs);
>  		percpu_ref_kill(&rsrc_node->refs);
>  		ctx->rsrc_node = NULL;
> +		if (!ctx->rsrc_retarget_scheduled)
> +			io_rsrc_retarget_schedule(ctx);
>  	}
>
>  	if (!ctx->rsrc_node) {
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 81445a477622..2b94df8fd9e8 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -54,6 +54,7 @@ struct io_mapped_ubuf {
>  };
>
>  void io_rsrc_put_work(struct work_struct *work);
> +void io_rsrc_retarget_work(struct work_struct *work);
>  void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
>  void io_wait_rsrc_data(struct io_rsrc_data *data);
>  void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
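To make the function-pointer nit concrete: in C, a call through a function pointer stored in a struct does not need an explicit dereference; both spellings invoke the same function. A tiny standalone sketch follows, where the struct and names are illustrative only, not the io_uring definitions.

```c
/* Standalone sketch; the struct and names here are invented for illustration. */
#include <stdbool.h>
#include <stdio.h>

struct op_def_model {
	bool (*can_retarget_rsrc)(int req);
};

static bool always_true(int req)
{
	(void)req;
	return true;
}

static const struct op_def_model defs[] = {
	{ .can_retarget_rsrc = always_true },
};

int main(void)
{
	/* Explicit dereference, as written in the posted patch ... */
	bool a = (*defs[0].can_retarget_rsrc)(0);
	/* ... and the shorter equivalent form suggested in review. */
	bool b = defs[0].can_retarget_rsrc(0);

	printf("%d %d\n", a, b);
	return 0;
}
```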
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..1d4eff4e632c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,8 @@ struct io_ring_ctx {
 	struct llist_head		rsrc_put_llist;
 	struct list_head		rsrc_ref_list;
 	spinlock_t			rsrc_ref_lock;
+	struct delayed_work		rsrc_retarget_work;
+	bool				rsrc_retarget_scheduled;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cc16e39b27f..ea2260359c56 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -320,6 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->rsrc_ref_lock);
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+	INIT_DELAYED_WORK(&ctx->rsrc_retarget_work, io_rsrc_retarget_work);
 	init_llist_head(&ctx->rsrc_put_llist);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 3efe06d25473..1b72b14cb5ab 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -37,6 +37,7 @@ struct io_op_def {
 	int (*prep_async)(struct io_kiocb *);
 	void (*cleanup)(struct io_kiocb *);
 	void (*fail)(struct io_kiocb *);
+	bool (*can_retarget_rsrc)(struct io_kiocb *);
 };
 
 extern const struct io_op_def io_op_defs[];
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 55d4ab96fb92..106210e0d5d5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -15,6 +15,7 @@
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
+#include "opdef.h"
 
 struct io_rsrc_update {
 	struct file *file;
@@ -204,6 +205,95 @@ void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+
+static unsigned int io_rsrc_retarget_req(struct io_ring_ctx *ctx,
+					 struct io_kiocb *req)
+	__must_hold(&ctx->uring_lock)
+{
+	if (!req->rsrc_node ||
+	    req->rsrc_node == ctx->rsrc_node)
+		return 0;
+	if (!io_op_defs[req->opcode].can_retarget_rsrc)
+		return 0;
+	if (!(*io_op_defs[req->opcode].can_retarget_rsrc)(req))
+		return 0;
+
+	io_rsrc_put_node(req->rsrc_node, 1);
+	req->rsrc_node = ctx->rsrc_node;
+	return 1;
+}
+
+static unsigned int io_rsrc_retarget_table(struct io_ring_ctx *ctx,
+					   struct io_hash_table *table)
+{
+	unsigned int nr_buckets = 1U << table->hash_bits;
+	unsigned int refs = 0;
+	struct io_kiocb *req;
+	int i;
+
+	for (i = 0; i < nr_buckets; i++) {
+		struct io_hash_bucket *hb = &table->hbs[i];
+
+		spin_lock(&hb->lock);
+		hlist_for_each_entry(req, &hb->list, hash_node)
+			refs += io_rsrc_retarget_req(ctx, req);
+		spin_unlock(&hb->lock);
+	}
+	return refs;
+}
+
+static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	percpu_ref_get(&ctx->refs);
+	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
+	ctx->rsrc_retarget_scheduled = true;
+}
+
+static void __io_rsrc_retarget_work(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_rsrc_node *node;
+	unsigned int refs;
+	bool any_waiting;
+
+	if (!ctx->rsrc_node)
+		return;
+
+	spin_lock_irq(&ctx->rsrc_ref_lock);
+	any_waiting = false;
+	list_for_each_entry(node, &ctx->rsrc_ref_list, node) {
+		if (!node->done) {
+			any_waiting = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->rsrc_ref_lock);
+
+	if (!any_waiting)
+		return;
+
+	refs = io_rsrc_retarget_table(ctx, &ctx->cancel_table);
+	refs += io_rsrc_retarget_table(ctx, &ctx->cancel_table_locked);
+
+	ctx->rsrc_cached_refs -= refs;
+	while (unlikely(ctx->rsrc_cached_refs < 0))
+		io_rsrc_refs_refill(ctx);
+}
+
+void io_rsrc_retarget_work(struct work_struct *work)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = container_of(work, struct io_ring_ctx, rsrc_retarget_work.work);
+
+	mutex_lock(&ctx->uring_lock);
+	ctx->rsrc_retarget_scheduled = false;
+	__io_rsrc_retarget_work(ctx);
+	mutex_unlock(&ctx->uring_lock);
+	percpu_ref_put(&ctx->refs);
+}
+
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
 	if (data && !atomic_dec_and_test(&data->refs))
@@ -285,6 +375,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 		atomic_inc(&data_to_kill->refs);
 		percpu_ref_kill(&rsrc_node->refs);
 		ctx->rsrc_node = NULL;
+		if (!ctx->rsrc_retarget_scheduled)
+			io_rsrc_retarget_schedule(ctx);
 	}
 
 	if (!ctx->rsrc_node) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 81445a477622..2b94df8fd9e8 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -54,6 +54,7 @@ struct io_mapped_ubuf {
 };
 
 void io_rsrc_put_work(struct work_struct *work);
+void io_rsrc_retarget_work(struct work_struct *work);
 void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
rsrc node cleanup can be indefinitely delayed when there are long lived
requests. For example if a file is located in the same rsrc node as a long
lived socket with multishot poll, then even if unregistering the file it
will not be closed while the poll request is still active.

Introduce a timer when rsrc node is switched, so that periodically we can
retarget these long lived requests to the newest nodes. That will allow
the old nodes to be cleaned up, freeing resources.

Signed-off-by: Dylan Yudaken <dylany@meta.com>
---
 include/linux/io_uring_types.h |  2 +
 io_uring/io_uring.c            |  1 +
 io_uring/opdef.h               |  1 +
 io_uring/rsrc.c                | 92 ++++++++++++++++++++++++++++++++++
 io_uring/rsrc.h                |  1 +
 5 files changed, 97 insertions(+)
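As a rough illustration of what retargeting buys, the sketch below models refcounted rsrc nodes in plain C: a long-lived request pins the node it was prepared against and keeps it from being freed, while moving that reference to the newest node lets the old node go. All names here are invented for the example and are not kernel APIs.

```c
/*
 * Illustration only: refcounted "nodes" in plain C, with invented names.
 * A request keeps the node it was prepared against alive; retargeting
 * moves that pin to the newest node so the old node can be freed.
 */
#include <stdio.h>
#include <stdlib.h>

struct node_model {
	int refs;
	int id;
};

static struct node_model *node_new(int id, int refs)
{
	struct node_model *n = calloc(1, sizeof(*n));

	n->id = id;
	n->refs = refs;
	return n;
}

static void node_put(struct node_model *n)
{
	if (--n->refs == 0) {
		printf("node %d freed\n", n->id);
		free(n);
	}
}

struct req_model {
	struct node_model *rsrc_node;
};

/* Move a long-lived request's reference from its old node to the current one. */
static void retarget(struct req_model *req, struct node_model *cur)
{
	if (req->rsrc_node == cur)
		return;
	cur->refs++;
	node_put(req->rsrc_node);
	req->rsrc_node = cur;
}

int main(void)
{
	struct node_model *old = node_new(1, 1);	/* pinned only by the request */
	struct node_model *cur = node_new(2, 1);	/* the current node */
	struct req_model long_lived = { .rsrc_node = old };

	retarget(&long_lived, cur);	/* node 1 loses its last ref and is freed */

	node_put(long_lived.rsrc_node);	/* the request finally completes */
	node_put(cur);			/* the ring drops the current node */
	return 0;
}
```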