diff mbox series

softdog: Add options 'soft_reboot_target' and 'soft_active_on_boot'

Message ID 20200603103923.116113-1-woodylin@google.com (mailing list archive)
State Changes Requested
Headers show
Series softdog: Add options 'soft_reboot_target' and 'soft_active_on_boot' | expand

Commit Message

Woody Lin June 3, 2020, 10:39 a.m. UTC
Add module parameters 'soft_reboot_target' and 'soft_active_on_boot' for
customizing softdog configuration; config reboot target by assigning
soft_reboot_target, and set soft_active_on_boot to start up softdog
timer at module initialization stage.

Signed-off-by: Woody Lin <woodylin@google.com>
---
 drivers/watchdog/softdog.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

Comments

Guenter Roeck June 9, 2020, 5:18 p.m. UTC | #1
On 6/3/20 3:39 AM, Woody Lin wrote:
> Add module parameters 'soft_reboot_target' and 'soft_active_on_boot' for
> customizing softdog configuration; config reboot target by assigning
> soft_reboot_target, and set soft_active_on_boot to start up softdog
> timer at module initialization stage.
> 
> Signed-off-by: Woody Lin <woodylin@google.com>
> ---
>  drivers/watchdog/softdog.c | 38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
> 
> diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
> index 3e4885c1545e..f5027acddbbb 100644
> --- a/drivers/watchdog/softdog.c
> +++ b/drivers/watchdog/softdog.c
> @@ -20,11 +20,13 @@
>  #include <linux/hrtimer.h>
>  #include <linux/init.h>
>  #include <linux/kernel.h>
> +#include <linux/kthread.h>
>  #include <linux/module.h>
>  #include <linux/moduleparam.h>
>  #include <linux/reboot.h>
>  #include <linux/types.h>
>  #include <linux/watchdog.h>
> +#include <linux/workqueue.h>
>  
>  #define TIMER_MARGIN	60		/* Default is 60 seconds */
>  static unsigned int soft_margin = TIMER_MARGIN;	/* in seconds */
> @@ -49,9 +51,32 @@ module_param(soft_panic, int, 0);
>  MODULE_PARM_DESC(soft_panic,
>  	"Softdog action, set to 1 to panic, 0 to reboot (default=0)");
>  
> +static char *soft_reboot_target;
> +module_param(soft_reboot_target, charp, 0000);
> +MODULE_PARM_DESC(soft_reboot_target,
> +	"Softdog action, set reboot target (default=emergency)");
> +
> +static bool soft_active_on_boot;
> +module_param(soft_active_on_boot, bool, 0000);
> +MODULE_PARM_DESC(soft_active_on_boot,
> +	"Set to true to active Softdog on boot (default=false)");
> +
>  static struct hrtimer softdog_ticktock;
>  static struct hrtimer softdog_preticktock;
>  
> +static int reboot_kthread_fn(void *data)
> +{
> +	kernel_restart(soft_reboot_target);
> +	emergency_restart(); /* Should not reach here */
> +	return -EPERM;       /* Should not reach here */
> +}
> +
> +static void reboot_work_fn(struct work_struct *unused)
> +{
> +	if (IS_ERR(kthread_run(reboot_kthread_fn, NULL, "softdog_reboot")))
> +		emergency_restart();
> +}
> +
>  static enum hrtimer_restart softdog_fire(struct hrtimer *timer)
>  {
>  	module_put(THIS_MODULE);
> @@ -62,6 +87,12 @@ static enum hrtimer_restart softdog_fire(struct hrtimer *timer)
>  		panic("Software Watchdog Timer expired");
>  	} else {
>  		pr_crit("Initiating system reboot\n");
> +		if (soft_reboot_target != NULL) {
> +			static DECLARE_WORK(reboot_work, reboot_work_fn);
> +
> +			schedule_work(&reboot_work);
> +			return HRTIMER_NORESTART;
> +		}

So this adds a double indirection: It first schedules a worker,
then the worker creates a kernel thread, which finally calls
kernel_restart(). If the softdog hangs, it may well be because
of scheduling issues. If so, both the worker and the kernel thread
may never execute, and the system would not reboot even though
the softdog did fire.

Is this double indirection really necessary ?

Thanks,
Guenter

>  		emergency_restart();
>  		pr_crit("Reboot didn't ?????\n");
>  	}
> @@ -145,12 +176,19 @@ static int __init softdog_init(void)
>  		softdog_preticktock.function = softdog_pretimeout;
>  	}
>  
> +	if (soft_active_on_boot) {
> +		set_bit(WDOG_HW_RUNNING, &softdog_dev.status);
> +		set_bit(WDOG_ACTIVE, &softdog_dev.status);
> +	}
> +
>  	ret = watchdog_register_device(&softdog_dev);
>  	if (ret)
>  		return ret;
>  
>  	pr_info("initialized. soft_noboot=%d soft_margin=%d sec soft_panic=%d (nowayout=%d)\n",
>  		soft_noboot, softdog_dev.timeout, soft_panic, nowayout);
> +	pr_info("             soft_reboot_target=%s soft_active_on_boot=%d\n",
> +		soft_reboot_target, soft_active_on_boot);
>  
>  	return 0;
>  }
>
Woody Lin June 12, 2020, 7:01 a.m. UTC | #2
Hi Guenter,

Thanks for taking a look at this. Yes it's a double indirection, and reason to
do this is: I was worrying about if I use the system_wq to execute
'kernel_restart' (which calls to 'device_shutdown' and 'syscore_shutdown'), I
might end up blocking kernel modules' destructor callback function that also
using the system_wq for their cleaning up processes (although I didn't notice
there is a condition that a kernel module needs system_wq for its cleaning up).
To avoid resulting a blocking like this, I will need to execute
'kernel_restart' in a standalone context, that is: a kernel thread. But the
context to start up a kernel thread needs to be able to sleep, so this can't be
done in the execution context of the hrtimer callback function. Finally I have
to implement this as a double indirection: schedules a worker in hrtimer
callback function execution context, then creates a kernel thread in the worker
context, which is able to sleep.

And yes, thanks for catching this: both the worker and the kernel thread may
never be executed if there is a scheduling issue. I will upload v2 patch to
cover this by re-scheduling the same hrtimer with another TIMER_MARGIN (60
secs) expiration. So we can at least fallback to 'emergency_restart' when there
is a scheduling issue.

Sincerely,
Woody

On Wed, Jun 10, 2020 at 1:18 AM Guenter Roeck <linux@roeck-us.net> wrote:
> >               pr_crit("Initiating system reboot\n");
> > +             if (soft_reboot_target != NULL) {
> > +                     static DECLARE_WORK(reboot_work, reboot_work_fn);
> > +
> > +                     schedule_work(&reboot_work);
> > +                     return HRTIMER_NORESTART;
> > +             }
>
> So this adds a double indirection: It first schedules a worker,
> then the worker creates a kernel thread, which finally calls
> kernel_restart(). If the softdog hangs, it may well be because
> of scheduling issues. If so, both the worker and the kernel thread
> may never execute, and the system would not reboot even though
> the softdog did fire.
>
> Is this double indirection really necessary ?
>
> Thanks,
> Guenter
diff mbox series

Patch

diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
index 3e4885c1545e..f5027acddbbb 100644
--- a/drivers/watchdog/softdog.c
+++ b/drivers/watchdog/softdog.c
@@ -20,11 +20,13 @@ 
 #include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/reboot.h>
 #include <linux/types.h>
 #include <linux/watchdog.h>
+#include <linux/workqueue.h>
 
 #define TIMER_MARGIN	60		/* Default is 60 seconds */
 static unsigned int soft_margin = TIMER_MARGIN;	/* in seconds */
@@ -49,9 +51,32 @@  module_param(soft_panic, int, 0);
 MODULE_PARM_DESC(soft_panic,
 	"Softdog action, set to 1 to panic, 0 to reboot (default=0)");
 
+static char *soft_reboot_target;
+module_param(soft_reboot_target, charp, 0000);
+MODULE_PARM_DESC(soft_reboot_target,
+	"Softdog action, set reboot target (default=emergency)");
+
+static bool soft_active_on_boot;
+module_param(soft_active_on_boot, bool, 0000);
+MODULE_PARM_DESC(soft_active_on_boot,
+	"Set to true to active Softdog on boot (default=false)");
+
 static struct hrtimer softdog_ticktock;
 static struct hrtimer softdog_preticktock;
 
+static int reboot_kthread_fn(void *data)
+{
+	kernel_restart(soft_reboot_target);
+	emergency_restart(); /* Should not reach here */
+	return -EPERM;       /* Should not reach here */
+}
+
+static void reboot_work_fn(struct work_struct *unused)
+{
+	if (IS_ERR(kthread_run(reboot_kthread_fn, NULL, "softdog_reboot")))
+		emergency_restart();
+}
+
 static enum hrtimer_restart softdog_fire(struct hrtimer *timer)
 {
 	module_put(THIS_MODULE);
@@ -62,6 +87,12 @@  static enum hrtimer_restart softdog_fire(struct hrtimer *timer)
 		panic("Software Watchdog Timer expired");
 	} else {
 		pr_crit("Initiating system reboot\n");
+		if (soft_reboot_target != NULL) {
+			static DECLARE_WORK(reboot_work, reboot_work_fn);
+
+			schedule_work(&reboot_work);
+			return HRTIMER_NORESTART;
+		}
 		emergency_restart();
 		pr_crit("Reboot didn't ?????\n");
 	}
@@ -145,12 +176,19 @@  static int __init softdog_init(void)
 		softdog_preticktock.function = softdog_pretimeout;
 	}
 
+	if (soft_active_on_boot) {
+		set_bit(WDOG_HW_RUNNING, &softdog_dev.status);
+		set_bit(WDOG_ACTIVE, &softdog_dev.status);
+	}
+
 	ret = watchdog_register_device(&softdog_dev);
 	if (ret)
 		return ret;
 
 	pr_info("initialized. soft_noboot=%d soft_margin=%d sec soft_panic=%d (nowayout=%d)\n",
 		soft_noboot, softdog_dev.timeout, soft_panic, nowayout);
+	pr_info("             soft_reboot_target=%s soft_active_on_boot=%d\n",
+		soft_reboot_target, soft_active_on_boot);
 
 	return 0;
 }