diff mbox series

[v2] dm mpath: Add timeout mechanism for queue_if_no_path

Message ID 20200113224127.3367484-1-krisman@collabora.com (mailing list archive)
State Accepted, archived
Delegated to: Mike Snitzer
Headers show
Series [v2] dm mpath: Add timeout mechanism for queue_if_no_path | expand

Commit Message

Gabriel Krisman Bertazi Jan. 13, 2020, 10:41 p.m. UTC
From: Anatol Pomazau <anatol@google.com>

Add a configurable timeout mechanism to disable queue_if_no_path without
assistance from multipathd.  In reality, this reimplements the
no_path_retry mechanism from multipathd in kernel space, which is
interesting to prevent processes from hanging indefinitely in cases
where the daemon might be unable to respond, after a failure or for
whatever reason.

Despite replicating the policy configuration on kernel space, it is
quite an important case to prevent IOs from hanging forever, waiting for
userspace to behave correctly.

v2:
  - Use a module parameter instead of configuring per table
  - Simplify code

Co-developed-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Co-developed-by: Bharath Ravi <rbharath@google.com>
Signed-off-by: Bharath Ravi <rbharath@google.com>
Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Signed-off-by: Anatol Pomazau <anatol@google.com>
Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
---
 drivers/md/dm-mpath.c | 62 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

Comments

Mike Snitzer Jan. 14, 2020, 4:15 p.m. UTC | #1
On Mon, Jan 13 2020 at  5:41P -0500,
Gabriel Krisman Bertazi <krisman@collabora.com> wrote:

> From: Anatol Pomazau <anatol@google.com>
> 
> Add a configurable timeout mechanism to disable queue_if_no_path without
> assistance from multipathd.  In reality, this reimplements the
> no_path_retry mechanism from multipathd in kernel space, which is
> interesting to prevent processes from hanging indefinitely in cases
> where the daemon might be unable to respond, after a failure or for
> whatever reason.
> 
> Despite replicating the policy configuration on kernel space, it is
> quite an important case to prevent IOs from hanging forever, waiting for
> userspace to behave correctly.
> 
> v2:
>   - Use a module parameter instead of configuring per table
>   - Simplify code
> 
> Co-developed-by: Frank Mayhar <fmayhar@google.com>
> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> Co-developed-by: Bharath Ravi <rbharath@google.com>
> Signed-off-by: Bharath Ravi <rbharath@google.com>
> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
> Signed-off-by: Anatol Pomazau <anatol@google.com>
> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

All these tags seem rather excessive (especially in that you aren't cc
most of them on the submission).  Please review/scrub.  Just seems odd
that so many had a hand in this relatively small patch.

The Signed-off-by for anatol@google.com seems wrong, or did Anatol
shephard this patch at some point?  Or did you mean Reviewed-by?
Something else?

Anyway, if in the end you hold these tags to reflect the development
evolution of this patch then so be it ;)

I've reviewed the changes and found various things I felt were
worthwhile to modify.  Summary:

- included missing <linux/timer.h>
- added MODULE_PARM_DESC
- moved new functions to reduce needed forward declarations
- tweaked queue_if_no_path_timeout_work's DMWARN message
- added lockdep_assert_held to enable_nopath_timeout
- renamed static queue_if_no_path_timeout to queue_if_no_path_timeout_secs
- use READ_ONCE when accessing queue_if_no_path_timeout_secs

Think that was it.

Please review/apply//test and if you're OK with the end result I'll
get it staged for 5.6 inclusion.

Thanks,
Mike


diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index b09e3f925f47..2bc18c9c3abc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -20,6 +20,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/delay.h>
 #include <scsi/scsi_dh.h>
@@ -31,6 +32,8 @@
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
 #define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
 
+static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
+
 /* Path properties */
 struct pgpath {
 	struct list_head list;
@@ -104,10 +107,6 @@ struct dm_mpath_io {
 	size_t nr_bytes;
 };
 
-static unsigned long queue_if_no_path_timeout = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
-module_param_named(queue_if_no_path_timeout_secs,
-                  queue_if_no_path_timeout, ulong, 0644);
-
 typedef int (*action_fn) (struct pgpath *pgpath);
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -115,10 +114,7 @@ static void trigger_event(struct work_struct *work);
 static void activate_or_offline_path(struct pgpath *pgpath);
 static void activate_path_work(struct work_struct *work);
 static void process_queued_bios(struct work_struct *work);
-
 static void queue_if_no_path_timeout_work(struct timer_list *t);
-static void enable_nopath_timeout(struct multipath *m);
-static void disable_nopath_timeout(struct multipath *m);
 
 /*-----------------------------------------------
  * Multipath state flags.
@@ -730,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 	return 0;
 }
 
+/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+	struct multipath *m = from_timer(m, t, nopath_timer);
+	struct mapped_device *md = dm_table_get_md(m->ti->table);
+
+	DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
+	queue_if_no_path(m, false, false);
+}
+
+/*
+ * Enable the queue_if_no_path timeout if necessary.
+ * Called with m->lock held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+	unsigned long queue_if_no_path_timeout =
+		READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
+
+	lockdep_assert_held(&m->lock);
+
+	if (queue_if_no_path_timeout > 0 &&
+	    atomic_read(&m->nr_valid_paths) == 0 &&
+	    test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+		mod_timer(&m->nopath_timer,
+			  jiffies + queue_if_no_path_timeout);
+	}
+}
+
+static void disable_nopath_timeout(struct multipath *m)
+{
+	del_timer_sync(&m->nopath_timer);
+}
+
 /*
  * An event is triggered whenever a path is taken out of use.
  * Includes path failure and PG bypass.
@@ -1231,20 +1264,6 @@ static void multipath_dtr(struct dm_target *ti)
 	free_multipath(m);
 }
 
-/*
- * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
- * process any queued I/O.
- */
-static void queue_if_no_path_timeout_work(struct timer_list *t)
-{
-	struct multipath *m = from_timer(m, t, nopath_timer);
-	struct mapped_device *md = dm_table_get_md((m)->ti->table);
-
-	DMWARN("queue_if_no_path timeout on %s", dm_device_name(md));
-
-	queue_if_no_path(m, false, false);
-}
-
 /*
  * Take a path out of use.
  */
@@ -1352,25 +1371,6 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
 	return r;
 }
 
-/*
- * Enable the queue_if_no_path timeout if necessary.  Called with m->lock
- * held.
- */
-static void enable_nopath_timeout(struct multipath *m)
-{
-	if (queue_if_no_path_timeout > 0 &&
-	    atomic_read(&m->nr_valid_paths) == 0 &&
-	    test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-		mod_timer(&m->nopath_timer,
-			  jiffies + queue_if_no_path_timeout * HZ);
-	}
-}
-
-static void disable_nopath_timeout(struct multipath *m)
-{
-	del_timer_sync(&m->nopath_timer);
-}
-
 /*
  * Temporarily try to avoid having to use the specified PG
  */
@@ -2127,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
 module_init(dm_multipath_init);
 module_exit(dm_multipath_exit);
 
+module_param_named(queue_if_no_path_timeout_secs,
+		   queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
+
 MODULE_DESCRIPTION(DM_NAME " multipath target");
 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Gabriel Krisman Bertazi Jan. 14, 2020, 4:31 p.m. UTC | #2
Mike Snitzer <snitzer@redhat.com> writes:

> On Mon, Jan 13 2020 at  5:41P -0500,
> Gabriel Krisman Bertazi <krisman@collabora.com> wrote:
>
>> From: Anatol Pomazau <anatol@google.com>
>> 
>> Add a configurable timeout mechanism to disable queue_if_no_path without
>> assistance from multipathd.  In reality, this reimplements the
>> no_path_retry mechanism from multipathd in kernel space, which is
>> interesting to prevent processes from hanging indefinitely in cases
>> where the daemon might be unable to respond, after a failure or for
>> whatever reason.
>> 
>> Despite replicating the policy configuration on kernel space, it is
>> quite an important case to prevent IOs from hanging forever, waiting for
>> userspace to behave correctly.
>> 
>> v2:
>>   - Use a module parameter instead of configuring per table
>>   - Simplify code
>> 
>> Co-developed-by: Frank Mayhar <fmayhar@google.com>
>> Signed-off-by: Frank Mayhar <fmayhar@google.com>
>> Co-developed-by: Bharath Ravi <rbharath@google.com>
>> Signed-off-by: Bharath Ravi <rbharath@google.com>
>> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
>> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
>> Signed-off-by: Anatol Pomazau <anatol@google.com>
>> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>
> All these tags seem rather excessive (especially in that you aren't cc
> most of them on the submission).  Please review/scrub.  Just seems odd
> that so many had a hand in this relatively small patch.

Hey,

I removed some of the Cc's because those emails addresses were bouncing.
Still, I kept the lines for credits.  I got the bounces when sending v1.

> The Signed-off-by for anatol@google.com seems wrong, or did Anatol
> shephard this patch at some point?  Or did you mean Reviewed-by?
> Something else?

Anatol was the main author, as listed in the From: header.  This
seems correct with regard to the ordering rules of
Documentation/process/submitting-patches.rst, in particular the second
example, (Example of a patch submitted by a Co-developed-by: author::).

Am I missing something?

>
> Anyway, if in the end you hold these tags to reflect the development
> evolution of this patch then so be it ;)
>
> I've reviewed the changes and found various things I felt were
> worthwhile to modify.  Summary:
>
> - included missing <linux/timer.h>
> - added MODULE_PARM_DESC
> - moved new functions to reduce needed forward declarations
> - tweaked queue_if_no_path_timeout_work's DMWARN message
> - added lockdep_assert_held to enable_nopath_timeout
> - renamed static queue_if_no_path_timeout to queue_if_no_path_timeout_secs
> - use READ_ONCE when accessing queue_if_no_path_timeout_secs
>

The changes you made look good to me and your version of the patch
passes my testcase.
Mike Snitzer Jan. 14, 2020, 4:53 p.m. UTC | #3
On Tue, Jan 14 2020 at 11:31am -0500,
Gabriel Krisman Bertazi <krisman@collabora.com> wrote:

> Mike Snitzer <snitzer@redhat.com> writes:
> 
> > On Mon, Jan 13 2020 at  5:41P -0500,
> > Gabriel Krisman Bertazi <krisman@collabora.com> wrote:
> >
> >> From: Anatol Pomazau <anatol@google.com>
> >> 
> >> Add a configurable timeout mechanism to disable queue_if_no_path without
> >> assistance from multipathd.  In reality, this reimplements the
> >> no_path_retry mechanism from multipathd in kernel space, which is
> >> interesting to prevent processes from hanging indefinitely in cases
> >> where the daemon might be unable to respond, after a failure or for
> >> whatever reason.
> >> 
> >> Despite replicating the policy configuration on kernel space, it is
> >> quite an important case to prevent IOs from hanging forever, waiting for
> >> userspace to behave correctly.
> >> 
> >> v2:
> >>   - Use a module parameter instead of configuring per table
> >>   - Simplify code
> >> 
> >> Co-developed-by: Frank Mayhar <fmayhar@google.com>
> >> Signed-off-by: Frank Mayhar <fmayhar@google.com>
> >> Co-developed-by: Bharath Ravi <rbharath@google.com>
> >> Signed-off-by: Bharath Ravi <rbharath@google.com>
> >> Co-developed-by: Khazhismel Kumykov <khazhy@google.com>
> >> Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
> >> Signed-off-by: Anatol Pomazau <anatol@google.com>
> >> Co-developed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >
> > All these tags seem rather excessive (especially in that you aren't cc
> > most of them on the submission).  Please review/scrub.  Just seems odd
> > that so many had a hand in this relatively small patch.
> 
> Hey,
> 
> I removed some of the Cc's because those emails addresses were bouncing.
> Still, I kept the lines for credits.  I got the bounces when sending v1.

Hmm, if their emails bounce, not a lot of point detailing them.

> > The Signed-off-by for anatol@google.com seems wrong, or did Anatol
> > shephard this patch at some point?  Or did you mean Reviewed-by?
> > Something else?
> 
> Anatol was the main author, as listed in the From: header.  This
> seems correct with regard to the ordering rules of
> Documentation/process/submitting-patches.rst, in particular the second
> example, (Example of a patch submitted by a Co-developed-by: author::).
> 
> Am I missing something?

No, I missed that Anatol is main author.  I'd have noticed once I
applied the patch via 'git am' but...

> 
> >
> > Anyway, if in the end you hold these tags to reflect the development
> > evolution of this patch then so be it ;)
> >
> > I've reviewed the changes and found various things I felt were
> > worthwhile to modify.  Summary:
> >
> > - included missing <linux/timer.h>
> > - added MODULE_PARM_DESC
> > - moved new functions to reduce needed forward declarations
> > - tweaked queue_if_no_path_timeout_work's DMWARN message
> > - added lockdep_assert_held to enable_nopath_timeout
> > - renamed static queue_if_no_path_timeout to queue_if_no_path_timeout_secs
> > - use READ_ONCE when accessing queue_if_no_path_timeout_secs
> >
> 
> The changes you made look good to me and your version of the patch
> passes my testcase.

OK, thanks.

Mike

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
diff mbox series

Patch

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e0c32793c248..52d90900e85b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -29,6 +29,7 @@ 
 #define DM_MSG_PREFIX "multipath"
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
 
 /* Path properties */
 struct pgpath {
@@ -91,6 +92,8 @@  struct multipath {
 
 	struct work_struct process_queued_bios;
 	struct bio_list queued_bios;
+
+	struct timer_list nopath_timer;	/* Timeout for queue_if_no_path */
 };
 
 /*
@@ -101,6 +104,10 @@  struct dm_mpath_io {
 	size_t nr_bytes;
 };
 
+static unsigned long queue_if_no_path_timeout = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
+module_param_named(queue_if_no_path_timeout_secs,
+                  queue_if_no_path_timeout, ulong, 0644);
+
 typedef int (*action_fn) (struct pgpath *pgpath);
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -109,6 +116,10 @@  static void activate_or_offline_path(struct pgpath *pgpath);
 static void activate_path_work(struct work_struct *work);
 static void process_queued_bios(struct work_struct *work);
 
+static void queue_if_no_path_timeout_work(struct timer_list *t);
+static void enable_nopath_timeout(struct multipath *m);
+static void disable_nopath_timeout(struct multipath *m);
+
 /*-----------------------------------------------
  * Multipath state flags.
  *-----------------------------------------------*/
@@ -195,6 +206,8 @@  static struct multipath *alloc_multipath(struct dm_target *ti)
 
 		m->ti = ti;
 		ti->private = m;
+
+		timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
 	}
 
 	return m;
@@ -1090,6 +1103,7 @@  static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
+	unsigned long flags;
 
 	as.argc = argc;
 	as.argv = argv;
@@ -1154,6 +1168,10 @@  static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
+	spin_lock_irqsave(&m->lock, flags);
+	enable_nopath_timeout(m);
+	spin_unlock_irqrestore(&m->lock, flags);
+
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
@@ -1208,10 +1226,25 @@  static void multipath_dtr(struct dm_target *ti)
 {
 	struct multipath *m = ti->private;
 
+	disable_nopath_timeout(m);
 	flush_multipath_work(m);
 	free_multipath(m);
 }
 
+/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+	struct multipath *m = from_timer(m, t, nopath_timer);
+	struct mapped_device *md = dm_table_get_md((m)->ti->table);
+
+	DMWARN("queue_if_no_path timeout on %s", dm_device_name(md));
+
+	queue_if_no_path(m, false, false);
+}
+
 /*
  * Take a path out of use.
  */
@@ -1241,6 +1274,8 @@  static int fail_path(struct pgpath *pgpath)
 
 	schedule_work(&m->trigger_event);
 
+	enable_nopath_timeout(m);
+
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
 
@@ -1291,6 +1326,9 @@  static int reinstate_path(struct pgpath *pgpath)
 		process_queued_io_list(m);
 	}
 
+	if (pgpath->is_active)
+		disable_nopath_timeout(m);
+
 	return r;
 }
 
@@ -1314,6 +1352,25 @@  static int action_dev(struct multipath *m, struct dm_dev *dev,
 	return r;
 }
 
+/*
+ * Enable the queue_if_no_path timeout if necessary.  Called with m->lock
+ * held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+	if (queue_if_no_path_timeout > 0 &&
+	    atomic_read(&m->nr_valid_paths) == 0 &&
+	    test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+		mod_timer(&m->nopath_timer,
+			  jiffies + queue_if_no_path_timeout * HZ);
+	}
+}
+
+static void disable_nopath_timeout(struct multipath *m)
+{
+	del_timer_sync(&m->nopath_timer);
+}
+
 /*
  * Temporarily try to avoid having to use the specified PG
  */
@@ -1789,6 +1846,7 @@  static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
 	struct dm_dev *dev;
 	struct multipath *m = ti->private;
 	action_fn action;
+	unsigned long flags;
 
 	mutex_lock(&m->work_mutex);
 
@@ -1800,9 +1858,13 @@  static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
 	if (argc == 1) {
 		if (!strcasecmp(argv[0], "queue_if_no_path")) {
 			r = queue_if_no_path(m, true, false);
+			spin_lock_irqsave(&m->lock, flags);
+			enable_nopath_timeout(m);
+			spin_unlock_irqrestore(&m->lock, flags);
 			goto out;
 		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
 			r = queue_if_no_path(m, false, false);
+			disable_nopath_timeout(m);
 			goto out;
 		}
 	}