[PATCHv2,3/4] kernel/watchdog: adapt the watchdog_hld interface for async model

Message ID 20210923140951.35902-4-kernelfans@gmail.com (mailing list archive)
State New, archived
Series: watchdog_hld cleanup and async model for arm64

Commit Message

Pingfan Liu Sept. 23, 2021, 2:09 p.m. UTC
When lockup_detector_init() calls watchdog_nmi_probe(), the PMU may not
be ready yet. E.g. on arm64, the PMU is not ready until
device_initcall(armv8_pmu_driver_init), and it is deeply integrated
with the driver model and cpuhp, so it is hard to move this
initialization before smp_init().

But it is easy to take the opposite approach and let watchdog_hld
acquire the PMU capability asynchronously.

The async model is achieved by extending watchdog_nmi_probe() with a
new -EBUSY return code, plus a re-initializing work_struct that waits
on a wait_queue_head.

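As an illustration, the arch side of this handshake could look like the
sketch below (the helper name is hypothetical; the actual arm64 wiring
is expected in patch 4/4 of this series):

#include <linux/nmi.h>

/*
 * Hypothetical arch-side hook: once the PMU driver has probed
 * successfully, flip the flag and wake the re-initializing work
 * blocked in wait_event().
 */
static void __init signal_hld_detector_ready(void)
{
	hld_detector_delay_initialized = true;
	wake_up(&hld_detector_wait);
}
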
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Sumit Garg <sumit.garg@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Wang Qing <wangqing@vivo.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Santosh Sivaraj <santosh@fossix.org>
Cc: linux-arm-kernel@lists.infradead.org
To: linux-kernel@vger.kernel.org
---
 include/linux/nmi.h |  3 +++
 kernel/watchdog.c   | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 2 deletions(-)

Comments

Petr Mladek Oct. 5, 2021, 7:03 a.m. UTC | #1
On Thu 2021-09-23 22:09:50, Pingfan Liu wrote:
> When lockup_detector_init() calls watchdog_nmi_probe(), the PMU may not
> be ready yet. E.g. on arm64, the PMU is not ready until
> device_initcall(armv8_pmu_driver_init), and it is deeply integrated
> with the driver model and cpuhp, so it is hard to move this
> initialization before smp_init().
> 
> But it is easy to take the opposite approach and let watchdog_hld
> acquire the PMU capability asynchronously.
> 
> The async model is achieved by extending watchdog_nmi_probe() with a
> new -EBUSY return code, plus a re-initializing work_struct that waits
> on a wait_queue_head.
> 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Sumit Garg <sumit.garg@linaro.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Masahiro Yamada <masahiroy@kernel.org>
> Cc: Sami Tolvanen <samitolvanen@google.com>
> Cc: Petr Mladek <pmladek@suse.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Wang Qing <wangqing@vivo.com>
> Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
> Cc: Santosh Sivaraj <santosh@fossix.org>
> Cc: linux-arm-kernel@lists.infradead.org
> To: linux-kernel@vger.kernel.org
> ---
>  include/linux/nmi.h |  3 +++
>  kernel/watchdog.c   | 37 +++++++++++++++++++++++++++++++++++--
>  2 files changed, 38 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index b7bcd63c36b4..270d440fe4b7 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -118,6 +118,9 @@ static inline int hardlockup_detector_perf_init(void) { return 0; }
>  
>  void watchdog_nmi_stop(void);
>  void watchdog_nmi_start(void);
> +
> +extern bool hld_detector_delay_initialized;
> +extern struct wait_queue_head hld_detector_wait;
>  int watchdog_nmi_probe(void);
>  void watchdog_nmi_enable(unsigned int cpu);
>  void watchdog_nmi_disable(unsigned int cpu);
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 6e6dd5f0bc3e..bd4ae1839b72 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -103,7 +103,10 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
>  	hardlockup_detector_perf_disable();
>  }
>  
> -/* Return 0, if a NMI watchdog is available. Error code otherwise */
> +/*
> + * Return 0 if an NMI watchdog is available. Return -EBUSY if it is not
> + * ready yet, and any other negative value if it is not supported.
> + */
>  int __weak __init watchdog_nmi_probe(void)
>  {
>  	return hardlockup_detector_perf_init();
> @@ -739,15 +742,45 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
>  }
>  #endif /* CONFIG_SYSCTL */
>  
> +static void lockup_detector_delay_init(struct work_struct *work);
> +bool hld_detector_delay_initialized __initdata;
> +
> +struct wait_queue_head hld_detector_wait __initdata =
> +		__WAIT_QUEUE_HEAD_INITIALIZER(hld_detector_wait);
> +
> +static struct work_struct detector_work __initdata =
> +		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
> +
> +static void __init lockup_detector_delay_init(struct work_struct *work)
> +{
> +	int ret;
> +
> +	wait_event(hld_detector_wait, hld_detector_delay_initialized);
> +	ret = watchdog_nmi_probe();
> +	if (!ret) {
> +		nmi_watchdog_available = true;
> +		lockup_detector_setup();

Is it really safe to call the entire lockup_detector_setup()
later?

It also manipulates the softlockup detector. And more importantly,
the original call happens before smp_init(), which means it ran
while only a single CPU was online.

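For reference, the deferred function reads roughly as follows in
kernel/watchdog.c of this era; note that it takes watchdog_mutex and
reconfigures the softlockup detector as well:

static __init void lockup_detector_setup(void)
{
	/*
	 * If sysctl is off and watchdog got disabled on the command line,
	 * nothing to do here.
	 */
	lockup_detector_update_enable();

	if (!IS_ENABLED(CONFIG_SYSCTL) &&
	    !(watchdog_enabled && watchdog_thresh))
		return;

	mutex_lock(&watchdog_mutex);
	lockup_detector_reconfigure();
	softlockup_initialized = true;
	mutex_unlock(&watchdog_mutex);
}
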
It seems that x86 has a similar problem with the hardlockup detector.
It later manipulates only the hardlockup detector, and it uses
cpus_read_lock() to prevent races with CPU hotplug; see
fixup_ht_bug().

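Abridged from arch/x86/events/intel/core.c, the pattern there touches
only the hardlockup detector, under cpus_read_lock():

static __init int fixup_ht_bug(void)
{
	...
	cpus_read_lock();

	hardlockup_detector_perf_stop();

	/* ... rewrite the PMU exclusive-scheduling constraints ... */

	hardlockup_detector_perf_restart();

	cpus_read_unlock();
	return 0;
}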

> +	} else {
> +		WARN_ON(ret == -EBUSY);
> +		pr_info("Perf NMI watchdog permanently disabled\n");
> +	}
> +}
> +
>  void __init lockup_detector_init(void)
>  {
> +	int ret;
> +
>  	if (tick_nohz_full_enabled())
>  		pr_info("Disabling watchdog on nohz_full cores by default\n");
>  
>  	cpumask_copy(&watchdog_cpumask,
>  		     housekeeping_cpumask(HK_FLAG_TIMER));
>  
> -	if (!watchdog_nmi_probe())
> +	ret = watchdog_nmi_probe();
> +	if (!ret)
>  		nmi_watchdog_available = true;
> +	else if (ret == -EBUSY)
> +		queue_work_on(smp_processor_id(), system_wq, &detector_work);

IMHO, this is not acceptable. It will block one worker until someone
wakes it. Only arm64 will have code to wake up the work, and only
when the PMU is successfully initialized. In all other cases, the
worker will stay blocked forever.

The right solution is to do it the other way. Queue the work
from arm64-specific code when armv8_pmu_driver_init() succeeds.

Also, I suggest flushing the work to make sure that it has finished
before the __init code gets removed.

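A sketch of the suggested flow, with a hypothetical initcall name (the
work would then no longer need to sleep in wait_event()):

/* arm64 side, run after the PMU driver has initialized successfully */
static int __init arm64_hld_late_setup(void)
{
	queue_work(system_wq, &detector_work);
	/* ensure the __init work has run before init memory is freed */
	flush_work(&detector_work);
	return 0;
}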

The open question is what code the work will call. As mentioned
above, I am not sure that lockup_detector_delay_init() is safe.
IMHO, we need to manipulate only the hardlockup detector, and we
have to serialize it against CPU hotplug.

Best Regards,
Petr
Pingfan Liu Oct. 8, 2021, 5:53 a.m. UTC | #2
On Tue, Oct 05, 2021 at 09:03:17AM +0200, Petr Mladek wrote:
[...]
> > +static void lockup_detector_delay_init(struct work_struct *work);
> > +bool hld_detector_delay_initialized __initdata;
> > +
> > +struct wait_queue_head hld_detector_wait __initdata =
> > +		__WAIT_QUEUE_HEAD_INITIALIZER(hld_detector_wait);
> > +
> > +static struct work_struct detector_work __initdata =
> > +		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
> > +
> > +static void __init lockup_detector_delay_init(struct work_struct *work)
> > +{
> > +	int ret;
> > +
> > +	wait_event(hld_detector_wait, hld_detector_delay_initialized);
> > +	ret = watchdog_nmi_probe();
> > +	if (!ret) {
> > +		nmi_watchdog_available = true;
> > +		lockup_detector_setup();
> 
> Is it really safe to call the entire lockup_detector_setup()
> later?
> 
> It also manipulates the softlockup detector. And more importantly,
> the original call happens before smp_init(), which means it ran
> while only a single CPU was online.
> 
For the race analysis, lockup_detector_reconfigure() takes centre
stage. Since proc_watchdog_update() can call
lockup_detector_reconfigure() to re-initialize both the soft and hard
lockup detectors, the race issue should already have been taken into
consideration.

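For reference, lockup_detector_reconfigure() in kernel/watchdog.c of
this era reads roughly as follows; the cpus_read_lock()/cpus_read_unlock()
pair is what serializes it against CPU hotplug:

static void lockup_detector_reconfigure(void)
{
	cpus_read_lock();
	watchdog_nmi_stop();

	softlockup_stop_all();
	set_sample_period();
	lockup_detector_update_enable();
	if (watchdog_enabled && watchdog_thresh)
		softlockup_start_all();

	watchdog_nmi_start();
	cpus_read_unlock();
	/*
	 * Must be called outside the cpus locked section to prevent
	 * recursive locking in the perf code.
	 */
	__lockup_detector_cleanup();
}
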
> It seems that x86 has a similar problem with the hardlockup detector.
> It later manipulates only the hardlockup detector, and it uses
> cpus_read_lock() to prevent races with CPU hotplug; see
> fixup_ht_bug().
> 
Yes. But hardlockup_detector_perf_{stop,start}() cannot meet the
requirement, since no perf_event has been created yet. So there is no
handy interface to re-initialize the hardlockup detector directly.

> 
> > +	} else {
> > +		WARN_ON(ret == -EBUSY);
> > +		pr_info("Perf NMI watchdog permanently disabled\n");
> > +	}
> > +}
> > +
> >  void __init lockup_detector_init(void)
> >  {
> > +	int ret;
> > +
> >  	if (tick_nohz_full_enabled())
> >  		pr_info("Disabling watchdog on nohz_full cores by default\n");
> >  
> >  	cpumask_copy(&watchdog_cpumask,
> >  		     housekeeping_cpumask(HK_FLAG_TIMER));
> >  
> > -	if (!watchdog_nmi_probe())
> > +	ret = watchdog_nmi_probe();
> > +	if (!ret)
> >  		nmi_watchdog_available = true;
> > +	else if (ret == -EBUSY)
> > +		queue_work_on(smp_processor_id(), system_wq, &detector_work);
> 
> IMHO, this is not acceptable. It will block one worker until someone
> wakes it. Only arm64 will have code to wake up the work, and only
> when the PMU is successfully initialized. In all other cases, the
> worker will stay blocked forever.
> 
What about treating -EBUSY and hld_detector_delay_initialized as a unit?
If watchdog_nmi_probe() returns -EBUSY, set the state of
hld_detector_delay_initialized to "waiting", and move it to "finished"
once the delayed probe has run.

And at the end of do_initcalls(), check that the state is "finished".
If not, throw a warning and wake up the worker.

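A hypothetical sketch of that state machine (the names and placement
are illustrative, not from this patch):

enum hld_delay_state { HLD_IDLE, HLD_WAITING, HLD_FINISHED };
static enum hld_delay_state hld_state __initdata = HLD_IDLE;

/* in lockup_detector_init(), on -EBUSY: */
	hld_state = HLD_WAITING;
	queue_work_on(smp_processor_id(), system_wq, &detector_work);

/* in lockup_detector_delay_init(), once the probe has been retried: */
	hld_state = HLD_FINISHED;

/* at the end of do_initcalls(): */
	if (hld_state == HLD_WAITING) {
		WARN(1, "watchdog_hld: PMU never became ready\n");
		wake_up(&hld_detector_wait);
	}
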
> The right solution is to do it the other way. Queue the work
> from arm64-specific code when armv8_pmu_driver_init() succeeds.
> 
Could it be better if the watchdog provided a common framework for
future extension instead of arch-specific code? The second argument is
to avoid the message "Perf NMI watchdog permanently disabled" when the
detector is enabled later. (Please see
lockup_detector_init()->watchdog_nmi_probe()->hardlockup_detector_perf_init();
with an arch-specific probe method, this can be avoided.)

> Also, I suggest flushing the work to make sure that it has finished
> before the __init code gets removed.
> 
Good point, and very interesting. I will look into it.

> 
> The open question is what code the work will call. As mentioned
> above, I am not sure that lockup_detector_delay_init() is safe.
> IMHO, we need to manipulate only the hardlockup detector, and we
> have to serialize it against CPU hotplug.
> 
As explained above, the race against CPU hotplug has already been
taken into consideration.

Thanks,

	Pingfan
Pingfan Liu Oct. 8, 2021, 3:10 p.m. UTC | #3
On Fri, Oct 08, 2021 at 01:53:45PM +0800, Pingfan Liu wrote:
> On Tue, Oct 05, 2021 at 09:03:17AM +0200, Petr Mladek wrote:
> [...]
> > > +static void lockup_detector_delay_init(struct work_struct *work);
> > > +bool hld_detector_delay_initialized __initdata;
> > > +
> > > +struct wait_queue_head hld_detector_wait __initdata =
> > > +		__WAIT_QUEUE_HEAD_INITIALIZER(hld_detector_wait);
> > > +
> > > +static struct work_struct detector_work __initdata =
> > > +		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
> > > +
> > > +static void __init lockup_detector_delay_init(struct work_struct *work)
> > > +{
> > > +	int ret;
> > > +
> > > +	wait_event(hld_detector_wait, hld_detector_delay_initialized);
> > > +	ret = watchdog_nmi_probe();
> > > +	if (!ret) {
> > > +		nmi_watchdog_available = true;
> > > +		lockup_detector_setup();
> > 
> > Is it really safe to call the entire lockup_detector_setup()
> > later?
> > 
> > It also manipulates the softlockup detector. And more importantly,
> > the original call happens before smp_init(), which means it ran
> > while only a single CPU was online.
> > 
> For the race analysis, lockup_detector_reconfigure() takes centre
> stage. Since proc_watchdog_update() can call
> lockup_detector_reconfigure() to re-initialize both the soft and hard
> lockup detectors, the race issue should already have been taken into
> consideration.
> 
> > It seems that x86 has a similar problem with the hardlockup detector.
> > It later manipulates only the hardlockup detector, and it uses
> > cpus_read_lock() to prevent races with CPU hotplug; see
> > fixup_ht_bug().
> > 
> Yes. But hardlockup_detector_perf_{stop,start}() cannot meet the
> requirement, since no perf_event has been created yet. So there is no
> handy interface to re-initialize the hardlockup detector directly.
> 
> > 
> > > +	} else {
> > > +		WARN_ON(ret == -EBUSY);
> > > +		pr_info("Perf NMI watchdog permanently disabled\n");
> > > +	}
> > > +}
> > > +
> > >  void __init lockup_detector_init(void)
> > >  {
> > > +	int ret;
> > > +
> > >  	if (tick_nohz_full_enabled())
> > >  		pr_info("Disabling watchdog on nohz_full cores by default\n");
> > >  
> > >  	cpumask_copy(&watchdog_cpumask,
> > >  		     housekeeping_cpumask(HK_FLAG_TIMER));
> > >  
> > > -	if (!watchdog_nmi_probe())
> > > +	ret = watchdog_nmi_probe();
> > > +	if (!ret)
> > >  		nmi_watchdog_available = true;
> > > +	else if (ret == -EBUSY)
> > > +		queue_work_on(smp_processor_id(), system_wq, &detector_work);
> > 
> > IMHO, this is not acceptable. It will block one worker until someone
> > wakes it. Only arm64 will have code to wake up the work, and only
> > when the PMU is successfully initialized. In all other cases, the
> > worker will stay blocked forever.
> > 
> What about treating -EBUSY and hld_detector_delay_initialized as a unit?
> If watchdog_nmi_probe() returns -EBUSY, set the state of
> hld_detector_delay_initialized to "waiting", and move it to "finished"
> once the delayed probe has run.
> 
> And at the end of do_initcalls(), check that the state is "finished".
> If not, throw a warning and wake up the worker.
> 
> > The right solution is to do it the other way. Queue the work
> > from arm64-specific code when armv8_pmu_driver_init() succeeds.
> > 
> Could it be better if the watchdog provided a common framework for
> future extension instead of arch-specific code? The second argument is
> to avoid the message "Perf NMI watchdog permanently disabled" when the
> detector is enabled later. (Please see
> lockup_detector_init()->watchdog_nmi_probe()->hardlockup_detector_perf_init();
> with an arch-specific probe method, this can be avoided.)
> 
Sorry for the poor expression; I did not explain the second point
completely.

If an arch-specific watchdog_nmi_probe() is used to avoid the
misleading message "Perf NMI watchdog permanently disabled", it should
still return -EBUSY. And the watchdog level should know how to handle
that error, namely by calling
queue_work_on(smp_processor_id(), system_wq, &detector_work).

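A hypothetical arch-specific probe along those lines (not part of this
series) could override the __weak default:

int __init watchdog_nmi_probe(void)
{
	/* PMU not up yet: report "not ready" rather than "unsupported" */
	if (!hld_detector_delay_initialized)
		return -EBUSY;

	return hardlockup_detector_perf_init();
}
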
Thanks,

	Pingfan

> > Also, I suggest flushing the work to make sure that it has finished
> > before the __init code gets removed.
> > 
> Good point, and very interesting. I will look into it.
> 
> > 
> > The open question is what code the work will call. As mentioned
> > above, I am not sure that lockup_detector_delay_init() is safe.
> > IMHO, we need to manipulate only the hardlockup detector, and we
> > have to serialize it against CPU hotplug.
> > 
> As explained above, the race against CPU hotplug has already been
> taken into consideration.
> 
> Thanks,
> 
> 	Pingfan
>

Patch

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b7bcd63c36b4..270d440fe4b7 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -118,6 +118,9 @@  static inline int hardlockup_detector_perf_init(void) { return 0; }
 
 void watchdog_nmi_stop(void);
 void watchdog_nmi_start(void);
+
+extern bool hld_detector_delay_initialized;
+extern struct wait_queue_head hld_detector_wait;
 int watchdog_nmi_probe(void);
 void watchdog_nmi_enable(unsigned int cpu);
 void watchdog_nmi_disable(unsigned int cpu);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e6dd5f0bc3e..bd4ae1839b72 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -103,7 +103,10 @@  void __weak watchdog_nmi_disable(unsigned int cpu)
 	hardlockup_detector_perf_disable();
 }
 
-/* Return 0, if a NMI watchdog is available. Error code otherwise */
+/*
+ * Return 0 if an NMI watchdog is available. Return -EBUSY if it is not
+ * ready yet, and any other negative value if it is not supported.
+ */
 int __weak __init watchdog_nmi_probe(void)
 {
 	return hardlockup_detector_perf_init();
@@ -739,15 +742,45 @@  int proc_watchdog_cpumask(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_SYSCTL */
 
+static void lockup_detector_delay_init(struct work_struct *work);
+bool hld_detector_delay_initialized __initdata;
+
+struct wait_queue_head hld_detector_wait __initdata =
+		__WAIT_QUEUE_HEAD_INITIALIZER(hld_detector_wait);
+
+static struct work_struct detector_work __initdata =
+		__WORK_INITIALIZER(detector_work, lockup_detector_delay_init);
+
+static void __init lockup_detector_delay_init(struct work_struct *work)
+{
+	int ret;
+
+	wait_event(hld_detector_wait, hld_detector_delay_initialized);
+	ret = watchdog_nmi_probe();
+	if (!ret) {
+		nmi_watchdog_available = true;
+		lockup_detector_setup();
+	} else {
+		WARN_ON(ret == -EBUSY);
+		pr_info("Perf NMI watchdog permanently disabled\n");
+	}
+}
+
 void __init lockup_detector_init(void)
 {
+	int ret;
+
 	if (tick_nohz_full_enabled())
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
 
 	cpumask_copy(&watchdog_cpumask,
 		     housekeeping_cpumask(HK_FLAG_TIMER));
 
-	if (!watchdog_nmi_probe())
+	ret = watchdog_nmi_probe();
+	if (!ret)
 		nmi_watchdog_available = true;
+	else if (ret == -EBUSY)
+		queue_work_on(smp_processor_id(), system_wq, &detector_work);
+
 	lockup_detector_setup();
 }