diff mbox series

[v3,6/8] remoteproc: Introduce "panic" callback in ops

Message ID 20200211005059.1377279-7-bjorn.andersson@linaro.org (mailing list archive)
State New, archived
Headers show
Series remoteproc: qcom: post mortem debug support | expand

Commit Message

Bjorn Andersson Feb. 11, 2020, 12:50 a.m. UTC
Introduce a "panic" function in the remoteproc ops table, to allow
remoteproc instances to perform operations needed in order to aid in
post mortem system debugging, such as flushing caches etc, when the
kernel panics. The function can return a number of milliseconds needed
by the remote to "settle" and the core will wait the longest returned
duration before returning from the panic handler.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---

Changes since v2:
- Replace per-rproc notifier callback with one generic
- Move the mdelay() from the individual drivers to the core and sleep the
  longest returned duration. Drivers that don't need a delay can return 0.
- Unregister the notifier on exit

 drivers/remoteproc/remoteproc_core.c | 46 ++++++++++++++++++++++++++++
 include/linux/remoteproc.h           |  3 ++
 2 files changed, 49 insertions(+)

Comments

Arnaud POULIQUEN Feb. 13, 2020, 4 p.m. UTC | #1
On 2/11/20 1:50 AM, Bjorn Andersson wrote:
> Introduce a "panic" function in the remoteproc ops table, to allow
> remoteproc instances to perform operations needed in order to aid in
> post mortem system debugging, such as flushing caches etc, when the
> kernel panics. The function can return a number of milliseconds needed
> by the remote to "settle" and the core will wait the longest returned
> duration before returning from the panic handler.
> 
> Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
> ---
> 
> Changes since v2:
> - Replace per-rproc notifier callback with one generic
> - Move the mdelay() from the individual drivers to the core and sleep the
>   longest returned duration. Drivers that don't need a delay can return 0.
> - Unregister the notifier on exit
> 
>  drivers/remoteproc/remoteproc_core.c | 46 ++++++++++++++++++++++++++++
>  include/linux/remoteproc.h           |  3 ++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
> index 097f33e4f1f3..8b6932027d36 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -16,6 +16,7 @@
>  
>  #define pr_fmt(fmt)    "%s: " fmt, __func__
>  
> +#include <linux/delay.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/device.h>
> @@ -43,6 +44,7 @@
>  
>  static DEFINE_MUTEX(rproc_list_mutex);
>  static LIST_HEAD(rproc_list);
> +static struct notifier_block rproc_panic_nb;
>  
>  typedef int (*rproc_handle_resource_t)(struct rproc *rproc,
>  				 void *, int offset, int avail);
> @@ -2216,10 +2218,53 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type)
>  }
>  EXPORT_SYMBOL(rproc_report_crash);
>  
> +static int rproc_panic_handler(struct notifier_block *nb, unsigned long event,
> +			       void *ptr)
> +{
> +	unsigned int longest = 0;
> +	struct rproc *rproc;
> +	unsigned int d;
> +	int locked;
> +
> +	locked = mutex_trylock(&rproc_list_mutex);
> +	if (!locked) {
> +		pr_err("Failed to acquire rproc list lock, won't call panic functions\n");
> +		return NOTIFY_DONE;
> +	}
As a consequence, the panic is not handled for any rproc instance if the mutex is already locked.
It seems to me that the first solution, with its delay side effect, is safer...

> +
> +	list_for_each_entry(rproc, &rproc_list, node) {
> +		if (!rproc->ops->panic || rproc->state != RPROC_RUNNING)
> +			continue;
> +
> +		d = rproc->ops->panic(rproc);
> +		if (d > longest)
> +			longest = d;
> +	}
> +
> +	mutex_unlock(&rproc_list_mutex);
> +
> +	/* Delay panic for the longest requested duration */
> +	mdelay(longest);
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static void __init rproc_init_panic(void)
> +{
> +	rproc_panic_nb.notifier_call = rproc_panic_handler;
> +	atomic_notifier_chain_register(&panic_notifier_list, &rproc_panic_nb);
> +}
> +
> +static void __exit rproc_exit_panic(void)
> +{
> +	atomic_notifier_chain_unregister(&panic_notifier_list, &rproc_panic_nb);
> +}
> +
>  static int __init remoteproc_init(void)
>  {
>  	rproc_init_sysfs();
>  	rproc_init_debugfs();
> +	rproc_init_panic();
>  
>  	return 0;
>  }
> @@ -2229,6 +2274,7 @@ static void __exit remoteproc_exit(void)
>  {
>  	ida_destroy(&rproc_dev_index);
>  
> +	rproc_exit_panic();
>  	rproc_exit_debugfs();
>  	rproc_exit_sysfs();
>  }
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> index 16ad66683ad0..14f05f26cbcd 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -369,6 +369,8 @@ enum rsc_handling_status {
>   *			expects to find it
>   * @sanity_check:	sanity check the fw image
>   * @get_boot_addr:	get boot address to entry point specified in firmware
> + * @panic:	optional callback to react to system panic, core will delay
> + *		panic at least the returned number of milliseconds
>   */
>  struct rproc_ops {
>  	int (*start)(struct rproc *rproc);
> @@ -383,6 +385,7 @@ struct rproc_ops {
>  	int (*load)(struct rproc *rproc, const struct firmware *fw);
>  	int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
>  	u32 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
> +	unsigned int (*panic)(struct rproc *rproc);
>  };
>  
>  /**
>
Stephen Boyd Feb. 14, 2020, 2:41 a.m. UTC | #2
Quoting Bjorn Andersson (2020-02-10 16:50:57)
> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
> index 097f33e4f1f3..8b6932027d36 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -2216,10 +2218,53 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type)
>  }
>  EXPORT_SYMBOL(rproc_report_crash);
>  
> +static int rproc_panic_handler(struct notifier_block *nb, unsigned long event,
> +                              void *ptr)
> +{
> +       unsigned int longest = 0;
> +       struct rproc *rproc;
> +       unsigned int d;
> +       int locked;
> +
> +       locked = mutex_trylock(&rproc_list_mutex);
> +       if (!locked) {
> +               pr_err("Failed to acquire rproc list lock, won't call panic functions\n");
> +               return NOTIFY_DONE;
> +       }
> +
> +       list_for_each_entry(rproc, &rproc_list, node) {
> +               if (!rproc->ops->panic || rproc->state != RPROC_RUNNING)
> +                       continue;
> +
> +               d = rproc->ops->panic(rproc);
> +               if (d > longest)
> +                       longest = d;

Could be

	longest = max(longest, d);

> +       }
> +
> +       mutex_unlock(&rproc_list_mutex);
> +
> +       /* Delay panic for the longest requested duration */
> +       mdelay(longest);

Is this to flush caches? Maybe indicate that in the comment.

> +
> +       return NOTIFY_DONE;
> +}
> +
> +static void __init rproc_init_panic(void)
> +{
> +       rproc_panic_nb.notifier_call = rproc_panic_handler;
> +       atomic_notifier_chain_register(&panic_notifier_list, &rproc_panic_nb);

This is an atomic notifier, but the notifier function takes a mutex,
which sleeps. It should use spinlocks, and never sleep, given that panic
can be called from anywhere.

> +}
> +
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> index 16ad66683ad0..14f05f26cbcd 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -369,6 +369,8 @@ enum rsc_handling_status {
>   *                     expects to find it
>   * @sanity_check:      sanity check the fw image
>   * @get_boot_addr:     get boot address to entry point specified in firmware
> + * @panic:     optional callback to react to system panic, core will delay
> + *             panic at least the returned number of milliseconds
>   */
>  struct rproc_ops {
>         int (*start)(struct rproc *rproc);
> @@ -383,6 +385,7 @@ struct rproc_ops {
>         int (*load)(struct rproc *rproc, const struct firmware *fw);
>         int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
>         u32 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
> +       unsigned int (*panic)(struct rproc *rproc);

Maybe this should be unsigned long to match other "timeouts" in the kernel.
Bjorn Andersson Feb. 14, 2020, 4:37 a.m. UTC | #3
On Thu 13 Feb 18:41 PST 2020, Stephen Boyd wrote:

> Quoting Bjorn Andersson (2020-02-10 16:50:57)
> > diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
> > index 097f33e4f1f3..8b6932027d36 100644
> > --- a/drivers/remoteproc/remoteproc_core.c
> > +++ b/drivers/remoteproc/remoteproc_core.c
> > @@ -2216,10 +2218,53 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type)
> >  }
> >  EXPORT_SYMBOL(rproc_report_crash);
> >  
> > +static int rproc_panic_handler(struct notifier_block *nb, unsigned long event,
> > +                              void *ptr)
> > +{
> > +       unsigned int longest = 0;
> > +       struct rproc *rproc;
> > +       unsigned int d;
> > +       int locked;
> > +
> > +       locked = mutex_trylock(&rproc_list_mutex);
> > +       if (!locked) {
> > +               pr_err("Failed to acquire rproc list lock, won't call panic functions\n");
> > +               return NOTIFY_DONE;
> > +       }
> > +
> > +       list_for_each_entry(rproc, &rproc_list, node) {
> > +               if (!rproc->ops->panic || rproc->state != RPROC_RUNNING)
> > +                       continue;
> > +
> > +               d = rproc->ops->panic(rproc);
> > +               if (d > longest)
> > +                       longest = d;
> 
> Could be
> 
> 	longest = max(longest, d);
> 

I like this better and now I have an excuse to change to it.

> > +       }
> > +
> > +       mutex_unlock(&rproc_list_mutex);
> > +
> > +       /* Delay panic for the longest requested duration */
> > +       mdelay(longest);
> 
> Is this to flush caches? Maybe indicate that in the comment.
> 

Here, in the core, it's for whatever the individual drivers might need
it for, but "flushing caches" is likely the main purpose.

That said, the Qualcomm implementation is, as you can see, to issue a
generic "stop request", so flushing caches will not be the only thing
that happens.

> > +
> > +       return NOTIFY_DONE;
> > +}
> > +
> > +static void __init rproc_init_panic(void)
> > +{
> > +       rproc_panic_nb.notifier_call = rproc_panic_handler;
> > +       atomic_notifier_chain_register(&panic_notifier_list, &rproc_panic_nb);
> 
> This is an atomic notifier, but the notifier function takes a mutex,
> which sleeps. It should use spinlocks, and never sleep, given that panic
> can be called from anywhere.
> 

Given that we're only trylocking I was expecting there not to be a
sleep. But if that's the case I'll have to revisit this.

If I rework rproc_get_by_phandle() slightly I should be able to rely on
RCU instead of the mutex for the two readers, which would also resolve
Arnaud's concern that a panic occurring while the list is being updated
would cause the panic handling to be skipped.

> > +}
> > +
> > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > index 16ad66683ad0..14f05f26cbcd 100644
> > --- a/include/linux/remoteproc.h
> > +++ b/include/linux/remoteproc.h
> > @@ -369,6 +369,8 @@ enum rsc_handling_status {
> >   *                     expects to find it
> >   * @sanity_check:      sanity check the fw image
> >   * @get_boot_addr:     get boot address to entry point specified in firmware
> > + * @panic:     optional callback to react to system panic, core will delay
> > + *             panic at least the returned number of milliseconds
> >   */
> >  struct rproc_ops {
> >         int (*start)(struct rproc *rproc);
> > @@ -383,6 +385,7 @@ struct rproc_ops {
> >         int (*load)(struct rproc *rproc, const struct firmware *fw);
> >         int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
> >         u32 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
> > +       unsigned int (*panic)(struct rproc *rproc);
> 
> Maybe should be unsigned long to match other "timeouts" in the kernel.

Sounds good.

Thanks,
Bjorn
diff mbox series

Patch

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 097f33e4f1f3..8b6932027d36 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -16,6 +16,7 @@ 
 
 #define pr_fmt(fmt)    "%s: " fmt, __func__
 
+#include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
@@ -43,6 +44,7 @@ 
 
 static DEFINE_MUTEX(rproc_list_mutex);
 static LIST_HEAD(rproc_list);
+static struct notifier_block rproc_panic_nb;
 
 typedef int (*rproc_handle_resource_t)(struct rproc *rproc,
 				 void *, int offset, int avail);
@@ -2216,10 +2218,53 @@  void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type)
 }
 EXPORT_SYMBOL(rproc_report_crash);
 
+static int rproc_panic_handler(struct notifier_block *nb, unsigned long event,
+			       void *ptr)
+{
+	unsigned int longest = 0;
+	struct rproc *rproc;
+	unsigned int d;
+	int locked;
+
+	locked = mutex_trylock(&rproc_list_mutex);
+	if (!locked) {
+		pr_err("Failed to acquire rproc list lock, won't call panic functions\n");
+		return NOTIFY_DONE;
+	}
+
+	list_for_each_entry(rproc, &rproc_list, node) {
+		if (!rproc->ops->panic || rproc->state != RPROC_RUNNING)
+			continue;
+
+		d = rproc->ops->panic(rproc);
+		if (d > longest)
+			longest = d;
+	}
+
+	mutex_unlock(&rproc_list_mutex);
+
+	/* Delay panic for the longest requested duration */
+	mdelay(longest);
+
+	return NOTIFY_DONE;
+}
+
+static void __init rproc_init_panic(void)
+{
+	rproc_panic_nb.notifier_call = rproc_panic_handler;
+	atomic_notifier_chain_register(&panic_notifier_list, &rproc_panic_nb);
+}
+
+static void __exit rproc_exit_panic(void)
+{
+	atomic_notifier_chain_unregister(&panic_notifier_list, &rproc_panic_nb);
+}
+
 static int __init remoteproc_init(void)
 {
 	rproc_init_sysfs();
 	rproc_init_debugfs();
+	rproc_init_panic();
 
 	return 0;
 }
@@ -2229,6 +2274,7 @@  static void __exit remoteproc_exit(void)
 {
 	ida_destroy(&rproc_dev_index);
 
+	rproc_exit_panic();
 	rproc_exit_debugfs();
 	rproc_exit_sysfs();
 }
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 16ad66683ad0..14f05f26cbcd 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -369,6 +369,8 @@  enum rsc_handling_status {
  *			expects to find it
  * @sanity_check:	sanity check the fw image
  * @get_boot_addr:	get boot address to entry point specified in firmware
+ * @panic:	optional callback to react to system panic, core will delay
+ *		panic at least the returned number of milliseconds
  */
 struct rproc_ops {
 	int (*start)(struct rproc *rproc);
@@ -383,6 +385,7 @@  struct rproc_ops {
 	int (*load)(struct rproc *rproc, const struct firmware *fw);
 	int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
 	u32 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
+	unsigned int (*panic)(struct rproc *rproc);
 };
 
 /**