diff mbox series

[for-next,01/12] io_uring: infrastructure for retargeting rsrc nodes

Message ID 20221031134126.82928-2-dylany@meta.com (mailing list archive)
State New
Headers show
Series io_uring: retarget rsrc nodes periodically | expand

Commit Message

Dylan Yudaken Oct. 31, 2022, 1:41 p.m. UTC
rsrc node cleanup can be indefinitely delayed when there are long lived
requests. For example if a file is located in the same rsrc node as a long
lived socket with multishot poll, then even if unregistering the file it
will not be closed while the poll request is still active.

Introduce a timer when rsrc node is switched, so that periodically we can
retarget these long lived requests to the newest nodes. That will allow
the old nodes to be cleaned up, freeing resources.

Signed-off-by: Dylan Yudaken <dylany@meta.com>
---
 include/linux/io_uring_types.h |  2 +
 io_uring/io_uring.c            |  1 +
 io_uring/opdef.h               |  1 +
 io_uring/rsrc.c                | 92 ++++++++++++++++++++++++++++++++++
 io_uring/rsrc.h                |  1 +
 5 files changed, 97 insertions(+)

Comments

Jens Axboe Oct. 31, 2022, 4:02 p.m. UTC | #1
On 10/31/22 7:41 AM, Dylan Yudaken wrote:
> +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	percpu_ref_get(&ctx->refs);
> +	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
> +	ctx->rsrc_retarget_scheduled = true;
> +}

Can this ever be called with rsrc_retarget_work already pending? If so,
that would seem to leak a ctx ref.
Dylan Yudaken Oct. 31, 2022, 4:45 p.m. UTC | #2
On Mon, 2022-10-31 at 10:02 -0600, Jens Axboe wrote:
> On 10/31/22 7:41 AM, Dylan Yudaken wrote:
> > +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> > +       __must_hold(&ctx->uring_lock)
> > +{
> > +       percpu_ref_get(&ctx->refs);
> > +       mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 *
> > HZ);
> > +       ctx->rsrc_retarget_scheduled = true;
> > +}
> 
> Can this ever be called with rsrc_retarget_work already pending? If
> so,
> that would seem to leak a ctx ref.
> 

Not in this patch.
In the later patch it can (but only on shutdown) and it checks for it.
Pavel Begunkov Nov. 2, 2022, 11:20 a.m. UTC | #3
On 10/31/22 13:41, Dylan Yudaken wrote:
> rsrc node cleanup can be indefinitely delayed when there are long lived
> requests. For example if a file is located in the same rsrc node as a long
> lived socket with multishot poll, then even if unregistering the file it
> will not be closed while the poll request is still active.
> 
> Introduce a timer when rsrc node is switched, so that periodically we can
> retarget these long lived requests to the newest nodes. That will allow
> the old nodes to be cleaned up, freeing resources.
> 
> Signed-off-by: Dylan Yudaken <dylany@meta.com>
> ---
>   include/linux/io_uring_types.h |  2 +
>   io_uring/io_uring.c            |  1 +
>   io_uring/opdef.h               |  1 +
>   io_uring/rsrc.c                | 92 ++++++++++++++++++++++++++++++++++
>   io_uring/rsrc.h                |  1 +
>   5 files changed, 97 insertions(+)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f5b687a787a3..1d4eff4e632c 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -327,6 +327,8 @@ struct io_ring_ctx {
>   	struct llist_head		rsrc_put_llist;
>   	struct list_head		rsrc_ref_list;
>   	spinlock_t			rsrc_ref_lock;
> +	struct delayed_work		rsrc_retarget_work;
> +	bool				rsrc_retarget_scheduled;
>   
>   	struct list_head		io_buffers_pages;
>   
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 6cc16e39b27f..ea2260359c56 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -320,6 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>   	spin_lock_init(&ctx->rsrc_ref_lock);
>   	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
>   	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
> +	INIT_DELAYED_WORK(&ctx->rsrc_retarget_work, io_rsrc_retarget_work);
>   	init_llist_head(&ctx->rsrc_put_llist);
>   	init_llist_head(&ctx->work_llist);
>   	INIT_LIST_HEAD(&ctx->tctx_list);
> diff --git a/io_uring/opdef.h b/io_uring/opdef.h
> index 3efe06d25473..1b72b14cb5ab 100644
> --- a/io_uring/opdef.h
> +++ b/io_uring/opdef.h
> @@ -37,6 +37,7 @@ struct io_op_def {
>   	int (*prep_async)(struct io_kiocb *);
>   	void (*cleanup)(struct io_kiocb *);
>   	void (*fail)(struct io_kiocb *);
> +	bool (*can_retarget_rsrc)(struct io_kiocb *);

side note: need to be split at some moment into 2 tables depending
on hotness, we want better caching for ->issue and ->prep

>   };
>   
>   extern const struct io_op_def io_op_defs[];
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 55d4ab96fb92..106210e0d5d5 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -15,6 +15,7 @@
>   #include "io_uring.h"
>   #include "openclose.h"
>   #include "rsrc.h"
> +#include "opdef.h"
>   
>   struct io_rsrc_update {
>   	struct file			*file;
> @@ -204,6 +205,95 @@ void io_rsrc_put_work(struct work_struct *work)
>   	}
>   }
>   
> +
> +static unsigned int io_rsrc_retarget_req(struct io_ring_ctx *ctx,
> +					 struct io_kiocb *req)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	if (!req->rsrc_node ||
> +	     req->rsrc_node == ctx->rsrc_node)
> +		return 0;
> +	if (!io_op_defs[req->opcode].can_retarget_rsrc)
> +		return 0;
> +	if (!(*io_op_defs[req->opcode].can_retarget_rsrc)(req))
> +		return 0;

nit, there should be no need to deref fptr.

if (!io_op_defs[req->opcode].can_retarget_rsrc(req)) ...

> +
> +	io_rsrc_put_node(req->rsrc_node, 1);
> +	req->rsrc_node = ctx->rsrc_node;
> +	return 1;
> +}
> +
> +static unsigned int io_rsrc_retarget_table(struct io_ring_ctx *ctx,
> +				   struct io_hash_table *table)
> +{
> +	unsigned int nr_buckets = 1U << table->hash_bits;
> +	unsigned int refs = 0;
> +	struct io_kiocb *req;
> +	int i;
> +
> +	for (i = 0; i < nr_buckets; i++) {
> +		struct io_hash_bucket *hb = &table->hbs[i];
> +
> +		spin_lock(&hb->lock);
> +		hlist_for_each_entry(req, &hb->list, hash_node)
> +			refs += io_rsrc_retarget_req(ctx, req);
> +		spin_unlock(&hb->lock);
> +	}
> +	return refs;
> +}
> +
> +static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	percpu_ref_get(&ctx->refs);
> +	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
> +	ctx->rsrc_retarget_scheduled = true;
> +}
> +
> +static void __io_rsrc_retarget_work(struct io_ring_ctx *ctx)
> +	__must_hold(&ctx->uring_lock)
> +{
> +	struct io_rsrc_node *node;
> +	unsigned int refs;
> +	bool any_waiting;
> +
> +	if (!ctx->rsrc_node)
> +		return;
> +
> +	spin_lock_irq(&ctx->rsrc_ref_lock);
> +	any_waiting = false;
> +	list_for_each_entry(node, &ctx->rsrc_ref_list, node) {
> +		if (!node->done) {
> +			any_waiting = true;
> +			break;
> +		}
> +	}
> +	spin_unlock_irq(&ctx->rsrc_ref_lock);
> +
> +	if (!any_waiting)
> +		return;
> +
> +	refs = io_rsrc_retarget_table(ctx, &ctx->cancel_table);
> +	refs += io_rsrc_retarget_table(ctx, &ctx->cancel_table_locked);
> +
> +	ctx->rsrc_cached_refs -= refs;
> +	while (unlikely(ctx->rsrc_cached_refs < 0))
> +		io_rsrc_refs_refill(ctx);

We can charge ->rsrc_cached_refs after setting up nodes in prep / submission
without underflowing the actual refs because we know that the requests are
not yet submitted and so won't consume refs. This one looks more troublesome


> +}
> +
> +void io_rsrc_retarget_work(struct work_struct *work)
> +{
> +	struct io_ring_ctx *ctx;
> +
> +	ctx = container_of(work, struct io_ring_ctx, rsrc_retarget_work.work);
> +
> +	mutex_lock(&ctx->uring_lock);
> +	ctx->rsrc_retarget_scheduled = false;
> +	__io_rsrc_retarget_work(ctx);
> +	mutex_unlock(&ctx->uring_lock);
> +	percpu_ref_put(&ctx->refs);
> +}
> +
>   void io_wait_rsrc_data(struct io_rsrc_data *data)
>   {
>   	if (data && !atomic_dec_and_test(&data->refs))
> @@ -285,6 +375,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
>   		atomic_inc(&data_to_kill->refs);
>   		percpu_ref_kill(&rsrc_node->refs);
>   		ctx->rsrc_node = NULL;
> +		if (!ctx->rsrc_retarget_scheduled)
> +			io_rsrc_retarget_schedule(ctx);
>   	}
>   
>   	if (!ctx->rsrc_node) {
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 81445a477622..2b94df8fd9e8 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -54,6 +54,7 @@ struct io_mapped_ubuf {
>   };
>   
>   void io_rsrc_put_work(struct work_struct *work);
> +void io_rsrc_retarget_work(struct work_struct *work);
>   void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
>   void io_wait_rsrc_data(struct io_rsrc_data *data);
>   void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
diff mbox series

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..1d4eff4e632c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,8 @@  struct io_ring_ctx {
 	struct llist_head		rsrc_put_llist;
 	struct list_head		rsrc_ref_list;
 	spinlock_t			rsrc_ref_lock;
+	struct delayed_work		rsrc_retarget_work;
+	bool				rsrc_retarget_scheduled;
 
 	struct list_head		io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cc16e39b27f..ea2260359c56 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -320,6 +320,7 @@  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->rsrc_ref_lock);
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+	INIT_DELAYED_WORK(&ctx->rsrc_retarget_work, io_rsrc_retarget_work);
 	init_llist_head(&ctx->rsrc_put_llist);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 3efe06d25473..1b72b14cb5ab 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -37,6 +37,7 @@  struct io_op_def {
 	int (*prep_async)(struct io_kiocb *);
 	void (*cleanup)(struct io_kiocb *);
 	void (*fail)(struct io_kiocb *);
+	bool (*can_retarget_rsrc)(struct io_kiocb *);
 };
 
 extern const struct io_op_def io_op_defs[];
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 55d4ab96fb92..106210e0d5d5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -15,6 +15,7 @@ 
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
+#include "opdef.h"
 
 struct io_rsrc_update {
 	struct file			*file;
@@ -204,6 +205,95 @@  void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+
+static unsigned int io_rsrc_retarget_req(struct io_ring_ctx *ctx,
+					 struct io_kiocb *req)
+	__must_hold(&ctx->uring_lock)
+{
+	if (!req->rsrc_node ||
+	     req->rsrc_node == ctx->rsrc_node)
+		return 0;
+	if (!io_op_defs[req->opcode].can_retarget_rsrc)
+		return 0;
+	if (!(*io_op_defs[req->opcode].can_retarget_rsrc)(req))
+		return 0;
+
+	io_rsrc_put_node(req->rsrc_node, 1);
+	req->rsrc_node = ctx->rsrc_node;
+	return 1;
+}
+
+static unsigned int io_rsrc_retarget_table(struct io_ring_ctx *ctx,
+				   struct io_hash_table *table)
+{
+	unsigned int nr_buckets = 1U << table->hash_bits;
+	unsigned int refs = 0;
+	struct io_kiocb *req;
+	int i;
+
+	for (i = 0; i < nr_buckets; i++) {
+		struct io_hash_bucket *hb = &table->hbs[i];
+
+		spin_lock(&hb->lock);
+		hlist_for_each_entry(req, &hb->list, hash_node)
+			refs += io_rsrc_retarget_req(ctx, req);
+		spin_unlock(&hb->lock);
+	}
+	return refs;
+}
+
+static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	percpu_ref_get(&ctx->refs);
+	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
+	ctx->rsrc_retarget_scheduled = true;
+}
+
+static void __io_rsrc_retarget_work(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_rsrc_node *node;
+	unsigned int refs;
+	bool any_waiting;
+
+	if (!ctx->rsrc_node)
+		return;
+
+	spin_lock_irq(&ctx->rsrc_ref_lock);
+	any_waiting = false;
+	list_for_each_entry(node, &ctx->rsrc_ref_list, node) {
+		if (!node->done) {
+			any_waiting = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->rsrc_ref_lock);
+
+	if (!any_waiting)
+		return;
+
+	refs = io_rsrc_retarget_table(ctx, &ctx->cancel_table);
+	refs += io_rsrc_retarget_table(ctx, &ctx->cancel_table_locked);
+
+	ctx->rsrc_cached_refs -= refs;
+	while (unlikely(ctx->rsrc_cached_refs < 0))
+		io_rsrc_refs_refill(ctx);
+}
+
+void io_rsrc_retarget_work(struct work_struct *work)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = container_of(work, struct io_ring_ctx, rsrc_retarget_work.work);
+
+	mutex_lock(&ctx->uring_lock);
+	ctx->rsrc_retarget_scheduled = false;
+	__io_rsrc_retarget_work(ctx);
+	mutex_unlock(&ctx->uring_lock);
+	percpu_ref_put(&ctx->refs);
+}
+
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
 	if (data && !atomic_dec_and_test(&data->refs))
@@ -285,6 +375,8 @@  void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 		atomic_inc(&data_to_kill->refs);
 		percpu_ref_kill(&rsrc_node->refs);
 		ctx->rsrc_node = NULL;
+		if (!ctx->rsrc_retarget_scheduled)
+			io_rsrc_retarget_schedule(ctx);
 	}
 
 	if (!ctx->rsrc_node) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 81445a477622..2b94df8fd9e8 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -54,6 +54,7 @@  struct io_mapped_ubuf {
 };
 
 void io_rsrc_put_work(struct work_struct *work);
+void io_rsrc_retarget_work(struct work_struct *work);
 void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);