diff mbox series

[RFC] fuse: Abort connection if FUSE server get stuck

Message ID 20241219204149.11958-2-etmartin4313@gmail.com (mailing list archive)
State New
Headers show
Series [RFC] fuse: Abort connection if FUSE server get stuck | expand

Commit Message

Etienne Martineau Dec. 19, 2024, 8:41 p.m. UTC
Signed-off-by: Etienne Martineau <etmartin4313@gmail.com>
---
 fs/fuse/dev.c    | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h | 13 +++++++++
 fs/fuse/inode.c  |  7 +++++
 3 files changed, 88 insertions(+)

Comments

Joanne Koong Dec. 20, 2024, 9:33 p.m. UTC | #1
On Thu, Dec 19, 2024 at 12:42 PM Etienne Martineau
<etmartin4313@gmail.com> wrote:
>
> Signed-off-by: Etienne Martineau <etmartin4313@gmail.com>
> ---
>  fs/fuse/dev.c    | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/fuse/fuse_i.h | 13 +++++++++
>  fs/fuse/inode.c  |  7 +++++
>  3 files changed, 88 insertions(+)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 27ccae63495d..351787363444 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -21,6 +21,8 @@
>  #include <linux/swap.h>
>  #include <linux/splice.h>
>  #include <linux/sched.h>
> +#include <linux/completion.h>
> +#include <linux/sched/sysctl.h>
>
>  #define CREATE_TRACE_POINTS
>  #include "fuse_trace.h"
> @@ -307,11 +309,75 @@ const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
>  };
>  EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
>
> +static void fuse_timeout_checkpoint_in(struct fuse_req *req)
> +{
> +       int idx;
> +       struct fuse_conn *fc = req->fm->fc;
> +
> +       if (!sysctl_hung_task_timeout_secs)
> +               return;
> +       req->checkpoint_time = jiffies;
> +       idx = (req->checkpoint_time >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
> +       atomic_inc(&fc->timeout_distribution[idx]);
> +}
> +
> +static void fuse_timeout_checkpoint_out(struct fuse_req *req)
> +{
> +       int idx;
> +       struct fuse_conn *fc = req->fm->fc;
> +
> +       if (!sysctl_hung_task_timeout_secs)
> +               return;
> +       idx = (req->checkpoint_time >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
> +       atomic_dec(&fc->timeout_distribution[idx]);
> +}
> +
> +/*
> + * The fuse timeout logic maintain a time distribution sliding window
> + * made of FUSE_TIMEOUT_DISTRIBUTION entries where each entries span over a
> + * number of jiffies defined by FUSE_TIMEOUT_JIFFIES in base 2.
> + * Just before sending the request to the server, we increment the coresponding
> + * distribution entry and once the request is ack back, we decrement that same
> + * entry because we remember req->checkpoint_time.
> + * Now, a timeout can be detected by simply looking at the old entries in the
> + * distribution and see if there is something hanging past a certain point.
> + */
> +void fuse_check_timeout(struct work_struct *wk)
> +{
> +       int idx, x;
> +       struct timespec64 t;
> +       struct fuse_conn *fc = container_of(wk, struct fuse_conn, work.work);
> +
> +       if (!atomic_read(&fc->num_waiting))
> +               goto out;
> +
> +       /* Current position in the distribution */
> +       idx = (jiffies >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
> +
> +       /* Walk back and trigger abort when detecting old entries in the second half */
> +       for (x = 0; x < FUSE_TIMEOUT_DISTRIBUTION; x++,
> +                       idx == 0 ? idx = FUSE_TIMEOUT_DISTRIBUTION-1 : idx--) {
> +               /* First half, keep going */
> +               if (x < FUSE_TIMEOUT_DISTRIBUTION>>1)
> +                       continue;
> +               if (atomic_read(&fc->timeout_distribution[idx])) {
> +                       pr_info("fuse %u is stuck, aborting\n", fc->dev);
> +                       fuse_abort_conn(fc);
> +                       return;
> +               }
> +       }
> +out:
> +       if (sysctl_hung_task_timeout_secs)
> +               queue_delayed_work(system_wq, &fc->work,
> +                       sysctl_hung_task_timeout_secs * (HZ / 2));
> +}

I haven't looked too closely at the exact logic in this yet but at a
first glance, some things don't seem robust. For example, if the
fuse_check_timeout() gets rescheduled right after the "idx = ..." and
before the for loop, then it could falsely abort connections that
haven't actually expired. imo this implementation is a lot more
confusing to follow and harder to prove correctness for. don't see the
logic in [1] as being too slow as yes it grabs locks but it only
checks against the head of each list and the job runs every 15 or so
seconds with no overhead for queueing and dequeueing.


Thanks,
Joanne

[1] https://lore.kernel.org/linux-fsdevel/20241218222630.99920-2-joannelkoong@gmail.com/

> +
>  static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
>  {
>         req->in.h.len = sizeof(struct fuse_in_header) +
>                 fuse_len_args(req->args->in_numargs,
>                               (struct fuse_arg *) req->args->in_args);
> +       fuse_timeout_checkpoint_in(req);
>         trace_fuse_request_send(req);
>         fiq->ops->send_req(fiq, req);
>  }
> @@ -359,6 +425,7 @@ void fuse_request_end(struct fuse_req *req)
>         if (test_and_set_bit(FR_FINISHED, &req->flags))
>                 goto put_request;
>
> +       fuse_timeout_checkpoint_out(req);
>         trace_fuse_request_end(req);
>         /*
>          * test_and_set_bit() implies smp_mb() between bit
> @@ -2260,6 +2327,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
>                 LIST_HEAD(to_end);
>                 unsigned int i;
>
> +               cancel_delayed_work(&fc->work);
>                 /* Background queuing checks fc->connected under bg_lock */
>                 spin_lock(&fc->bg_lock);
>                 fc->connected = 0;
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 74744c6f2860..243d482a057d 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -44,6 +44,12 @@
>  /** Number of dentries for each connection in the control filesystem */
>  #define FUSE_CTL_NUM_DENTRIES 5
>
> +/** Request timeout handling */
> +#define FUSE_TIMEOUT_DISTRIBUTION_SHIFT 3
> +#define FUSE_TIMEOUT_DISTRIBUTION (1L << FUSE_TIMEOUT_DISTRIBUTION_SHIFT)
> +#define FUSE_TIMEOUT_JIFFIES (order_base_2( \
> +       (sysctl_hung_task_timeout_secs * HZ) >> FUSE_TIMEOUT_DISTRIBUTION_SHIFT))
> +
>  /** Maximum of max_pages received in init_out */
>  extern unsigned int fuse_max_pages_limit;
>
> @@ -438,6 +444,8 @@ struct fuse_req {
>
>         /** fuse_mount this request belongs to */
>         struct fuse_mount *fm;
> +
> +       unsigned long checkpoint_time;
>  };
>
>  struct fuse_iqueue;
> @@ -923,8 +931,12 @@ struct fuse_conn {
>         /** IDR for backing files ids */
>         struct idr backing_files_map;
>  #endif
> +
> +       atomic_t timeout_distribution[FUSE_TIMEOUT_DISTRIBUTION];
> +       struct delayed_work work;
>  };
>
> +
>  /*
>   * Represents a mounted filesystem, potentially a submount.
>   *
> @@ -1190,6 +1202,7 @@ void fuse_request_end(struct fuse_req *req);
>  /* Abort all requests */
>  void fuse_abort_conn(struct fuse_conn *fc);
>  void fuse_wait_aborted(struct fuse_conn *fc);
> +void fuse_check_timeout(struct work_struct *wk);
>
>  /**
>   * Invalidate inode attributes
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 3ce4f4e81d09..9f8fe17801c4 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -23,6 +23,8 @@
>  #include <linux/exportfs.h>
>  #include <linux/posix_acl.h>
>  #include <linux/pid_namespace.h>
> +#include <linux/completion.h>
> +#include <linux/sched/sysctl.h>
>  #include <uapi/linux/magic.h>
>
>  MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
> @@ -1002,6 +1004,7 @@ void fuse_conn_put(struct fuse_conn *fc)
>                 struct fuse_iqueue *fiq = &fc->iq;
>                 struct fuse_sync_bucket *bucket;
>
> +               cancel_delayed_work_sync(&fc->work);
>                 if (IS_ENABLED(CONFIG_FUSE_DAX))
>                         fuse_dax_conn_free(fc);
>                 if (fiq->ops->release)
> @@ -1785,6 +1788,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
>         fc->destroy = ctx->destroy;
>         fc->no_control = ctx->no_control;
>         fc->no_force_umount = ctx->no_force_umount;
> +       INIT_DELAYED_WORK(&fc->work, fuse_check_timeout);
> +       if (sysctl_hung_task_timeout_secs)
> +               queue_delayed_work(system_wq, &fc->work,
> +                       sysctl_hung_task_timeout_secs * (HZ / 2));
>
>         err = -ENOMEM;
>         root = fuse_get_root_inode(sb, ctx->rootmode);
> --
> 2.34.1
>
diff mbox series

Patch

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 27ccae63495d..351787363444 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -21,6 +21,8 @@ 
 #include <linux/swap.h>
 #include <linux/splice.h>
 #include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/sched/sysctl.h>
 
 #define CREATE_TRACE_POINTS
 #include "fuse_trace.h"
@@ -307,11 +309,75 @@  const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
 };
 EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
 
+static void fuse_timeout_checkpoint_in(struct fuse_req *req)
+{
+	int idx;
+	struct fuse_conn *fc = req->fm->fc;
+
+	if (!sysctl_hung_task_timeout_secs)
+		return;
+	req->checkpoint_time = jiffies;
+	idx = (req->checkpoint_time >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
+	atomic_inc(&fc->timeout_distribution[idx]);
+}
+
+static void fuse_timeout_checkpoint_out(struct fuse_req *req)
+{
+	int idx;
+	struct fuse_conn *fc = req->fm->fc;
+
+	if (!sysctl_hung_task_timeout_secs)
+		return;
+	idx = (req->checkpoint_time >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
+	atomic_dec(&fc->timeout_distribution[idx]);
+}
+
+/*
+ * The fuse timeout logic maintain a time distribution sliding window
+ * made of FUSE_TIMEOUT_DISTRIBUTION entries where each entries span over a
+ * number of jiffies defined by FUSE_TIMEOUT_JIFFIES in base 2.
+ * Just before sending the request to the server, we increment the coresponding
+ * distribution entry and once the request is ack back, we decrement that same
+ * entry because we remember req->checkpoint_time.
+ * Now, a timeout can be detected by simply looking at the old entries in the
+ * distribution and see if there is something hanging past a certain point.
+ */
+void fuse_check_timeout(struct work_struct *wk)
+{
+	int idx, x;
+	struct timespec64 t;
+	struct fuse_conn *fc = container_of(wk, struct fuse_conn, work.work);
+
+	if (!atomic_read(&fc->num_waiting))
+		goto out;
+
+	/* Current position in the distribution */
+	idx = (jiffies >> FUSE_TIMEOUT_JIFFIES) % FUSE_TIMEOUT_DISTRIBUTION;
+
+	/* Walk back and trigger abort when detecting old entries in the second half */
+	for (x = 0; x < FUSE_TIMEOUT_DISTRIBUTION; x++,
+			idx == 0 ? idx = FUSE_TIMEOUT_DISTRIBUTION-1 : idx--) {
+		/* First half, keep going */
+		if (x < FUSE_TIMEOUT_DISTRIBUTION>>1)
+			continue;
+		if (atomic_read(&fc->timeout_distribution[idx])) {
+			pr_info("fuse %u is stuck, aborting\n", fc->dev);
+			fuse_abort_conn(fc);
+			return;
+		}
+	}
+out:
+	if (sysctl_hung_task_timeout_secs)
+		queue_delayed_work(system_wq, &fc->work,
+			sysctl_hung_task_timeout_secs * (HZ / 2));
+}
+
 static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		fuse_len_args(req->args->in_numargs,
 			      (struct fuse_arg *) req->args->in_args);
+	fuse_timeout_checkpoint_in(req);
 	trace_fuse_request_send(req);
 	fiq->ops->send_req(fiq, req);
 }
@@ -359,6 +425,7 @@  void fuse_request_end(struct fuse_req *req)
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
 
+	fuse_timeout_checkpoint_out(req);
 	trace_fuse_request_end(req);
 	/*
 	 * test_and_set_bit() implies smp_mb() between bit
@@ -2260,6 +2327,7 @@  void fuse_abort_conn(struct fuse_conn *fc)
 		LIST_HEAD(to_end);
 		unsigned int i;
 
+		cancel_delayed_work(&fc->work);
 		/* Background queuing checks fc->connected under bg_lock */
 		spin_lock(&fc->bg_lock);
 		fc->connected = 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 74744c6f2860..243d482a057d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -44,6 +44,12 @@ 
 /** Number of dentries for each connection in the control filesystem */
 #define FUSE_CTL_NUM_DENTRIES 5
 
+/** Request timeout handling */
+#define FUSE_TIMEOUT_DISTRIBUTION_SHIFT 3
+#define FUSE_TIMEOUT_DISTRIBUTION (1L << FUSE_TIMEOUT_DISTRIBUTION_SHIFT)
+#define FUSE_TIMEOUT_JIFFIES (order_base_2( \
+	(sysctl_hung_task_timeout_secs * HZ) >> FUSE_TIMEOUT_DISTRIBUTION_SHIFT))
+
 /** Maximum of max_pages received in init_out */
 extern unsigned int fuse_max_pages_limit;
 
@@ -438,6 +444,8 @@  struct fuse_req {
 
 	/** fuse_mount this request belongs to */
 	struct fuse_mount *fm;
+
+	unsigned long checkpoint_time;
 };
 
 struct fuse_iqueue;
@@ -923,8 +931,12 @@  struct fuse_conn {
 	/** IDR for backing files ids */
 	struct idr backing_files_map;
 #endif
+
+	atomic_t timeout_distribution[FUSE_TIMEOUT_DISTRIBUTION];
+	struct delayed_work work;
 };
 
+
 /*
  * Represents a mounted filesystem, potentially a submount.
  *
@@ -1190,6 +1202,7 @@  void fuse_request_end(struct fuse_req *req);
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 void fuse_wait_aborted(struct fuse_conn *fc);
+void fuse_check_timeout(struct work_struct *wk);
 
 /**
  * Invalidate inode attributes
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3ce4f4e81d09..9f8fe17801c4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,8 @@ 
 #include <linux/exportfs.h>
 #include <linux/posix_acl.h>
 #include <linux/pid_namespace.h>
+#include <linux/completion.h>
+#include <linux/sched/sysctl.h>
 #include <uapi/linux/magic.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -1002,6 +1004,7 @@  void fuse_conn_put(struct fuse_conn *fc)
 		struct fuse_iqueue *fiq = &fc->iq;
 		struct fuse_sync_bucket *bucket;
 
+		cancel_delayed_work_sync(&fc->work);
 		if (IS_ENABLED(CONFIG_FUSE_DAX))
 			fuse_dax_conn_free(fc);
 		if (fiq->ops->release)
@@ -1785,6 +1788,10 @@  int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 	fc->destroy = ctx->destroy;
 	fc->no_control = ctx->no_control;
 	fc->no_force_umount = ctx->no_force_umount;
+	INIT_DELAYED_WORK(&fc->work, fuse_check_timeout);
+	if (sysctl_hung_task_timeout_secs)
+		queue_delayed_work(system_wq, &fc->work,
+			sysctl_hung_task_timeout_secs * (HZ / 2));
 
 	err = -ENOMEM;
 	root = fuse_get_root_inode(sb, ctx->rootmode);