diff mbox series

[v1,1/3] remoteproc: qcom: q6v5: Send subdevice notifications before panic

Message ID 842a6b6307d26874959d29f2065aad544ff0b86c.1662995608.git.quic_gokukris@quicinc.com (mailing list archive)
State Not Applicable
Headers show
Series Handle coprocessor crash | expand

Commit Message

Gokul krishna Krishnakumar Sept. 12, 2022, 3:30 p.m. UTC
Subdevice notifications after a remoteproc has crashed are useful to any
clients that might want to preserve data pertaining to the driver after the
remoteproc crashed. Sending subdevice notifications before triggering a
kernel panic gives these drivers the time to do collect this information.

Change-Id: Id6e55fb038b70f54ff5854d2adff72b74b6a9570
Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
---
 drivers/remoteproc/qcom_q6v5.c | 31 +++++++++++++++++++++++++++++++
 drivers/remoteproc/qcom_q6v5.h |  2 ++
 2 files changed, 33 insertions(+)

Comments

Peng Fan Sept. 23, 2022, 7:03 a.m. UTC | #1
> Subject: [PATCH v1 1/3] remoteproc: qcom: q6v5: Send subdevice
> notifications before panic
> 
> Subdevice notifications after a remoteproc has crashed are useful to any
> clients that might want to preserve data pertaining to the driver after the
> remoteproc crashed. Sending subdevice notifications before triggering a
> kernel panic gives these drivers the time to do collect this information.
> 
> Change-Id: Id6e55fb038b70f54ff5854d2adff72b74b6a9570

Change-Id should be removed.

> Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
> ---
>  drivers/remoteproc/qcom_q6v5.c | 31
> +++++++++++++++++++++++++++++++  drivers/remoteproc/qcom_q6v5.h |
> 2 ++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5.c
> b/drivers/remoteproc/qcom_q6v5.c index 497acfb..89f5384 100644
> --- a/drivers/remoteproc/qcom_q6v5.c
> +++ b/drivers/remoteproc/qcom_q6v5.c
> @@ -15,6 +15,7 @@
>  #include <linux/soc/qcom/smem.h>
>  #include <linux/soc/qcom/smem_state.h>
>  #include <linux/remoteproc.h>
> +#include <linux/delay.h>

Seq the head file order.

>  #include "qcom_common.h"
>  #include "qcom_q6v5.h"
> 
> @@ -94,6 +95,29 @@ int qcom_q6v5_unprepare(struct qcom_q6v5
> *q6v5)  }  EXPORT_SYMBOL_GPL(qcom_q6v5_unprepare);
> 
> +static void qcom_q6v5_crash_handler_work(struct work_struct *work) {
> +	struct qcom_q6v5 *q6v5 = container_of(work, struct qcom_q6v5,
> crash_handler);
> +	struct rproc *rproc = q6v5->rproc;
> +	struct rproc_subdev *subdev;
> +
> +	mutex_lock(&rproc->lock);
> +
> +	list_for_each_entry_reverse(subdev, &rproc->subdevs, node) {
> +		if (subdev->stop)
> +			subdev->stop(subdev, true);
> +	}
> +
> +	mutex_unlock(&rproc->lock);
> +
> +	/*
> +	 * Temporary workaround until ramdump userspace application
> calls
> +	 * sync() and fclose() on attempting the dump.
> +	 */
> +	msleep(100);

I have no knowledge on your system setup, but just guess this is
not reliable.

> +	panic("Panicking, remoteproc %s crashed\n", q6v5->rproc->name); }

It is remotecore crash, why use panic to crash the Linux side?

> +
>  static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)  {
>  	struct qcom_q6v5 *q6v5 = data;
> @@ -113,6 +137,9 @@ static irqreturn_t q6v5_wdog_interrupt(int irq, void
> *data)
>  		dev_err(q6v5->dev, "watchdog without message\n");
> 
>  	q6v5->running = false;
> +	if (q6v5->rproc->recovery_disabled)
> +		schedule_work(&q6v5->crash_handler);
> +
>  	rproc_report_crash(q6v5->rproc, RPROC_WATCHDOG);
> 
>  	return IRQ_HANDLED;
> @@ -134,6 +161,9 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void
> *data)
>  		dev_err(q6v5->dev, "fatal error without message\n");
> 
>  	q6v5->running = false;
> +	if (q6v5->rproc->recovery_disabled)
> +		schedule_work(&q6v5->crash_handler);
> +
>  	rproc_report_crash(q6v5->rproc, RPROC_FATAL_ERROR);
> 
>  	return IRQ_HANDLED;
> @@ -354,6 +384,7 @@ int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct
> platform_device *pdev,
>  	if (IS_ERR(q6v5->path))
>  		return dev_err_probe(&pdev->dev, PTR_ERR(q6v5->path),
>  				     "failed to acquire interconnect path\n");
> +	INIT_WORK(&q6v5->crash_handler,
> qcom_q6v5_crash_handler_work);
> 
>  	return 0;
>  }
> diff --git a/drivers/remoteproc/qcom_q6v5.h
> b/drivers/remoteproc/qcom_q6v5.h index 5a859c4..b1654be 100644
> --- a/drivers/remoteproc/qcom_q6v5.h
> +++ b/drivers/remoteproc/qcom_q6v5.h
> @@ -29,6 +29,8 @@ struct qcom_q6v5 {
>  	int handover_irq;
>  	int stop_irq;
> 
> +	struct work_struct crash_handler;
> +
>  	bool handover_issued;
> 
>  	struct completion start_done;
> --
> 2.7.4
Regards,
Peng.
Bjorn Andersson Dec. 28, 2022, 4:20 p.m. UTC | #2
On Mon, Sep 19, 2022 at 09:00:38AM -0700, Gokul krishna Krishnakumar wrote:

Looking back, I don't think I gave you proper feedback on this feature.
Sorry about that Gokul.

> Subdevice notifications after a remoteproc has crashed are useful to any
> clients that might want to preserve data pertaining to the driver after the
> remoteproc crashed. Sending subdevice notifications before triggering a
> kernel panic gives these drivers the time to do collect this information.

The elephant in the room here is the call to panic(), this deserves more
attention in the commit messages.

> 
> Change-Id: Id6e55fb038b70f54ff5854d2adff72b74b6a9570

Please remember to remove your Gerrit Change-Id when posting to the
list.

> Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
> ---
>  drivers/remoteproc/qcom_q6v5.c | 31 +++++++++++++++++++++++++++++++
>  drivers/remoteproc/qcom_q6v5.h |  2 ++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c
> index 497acfb..89f5384 100644
> --- a/drivers/remoteproc/qcom_q6v5.c
> +++ b/drivers/remoteproc/qcom_q6v5.c
> @@ -15,6 +15,7 @@
>  #include <linux/soc/qcom/smem.h>
>  #include <linux/soc/qcom/smem_state.h>
>  #include <linux/remoteproc.h>
> +#include <linux/delay.h>
>  #include "qcom_common.h"
>  #include "qcom_q6v5.h"
>  
> @@ -94,6 +95,29 @@ int qcom_q6v5_unprepare(struct qcom_q6v5 *q6v5)
>  }
>  EXPORT_SYMBOL_GPL(qcom_q6v5_unprepare);
>  
> +static void qcom_q6v5_crash_handler_work(struct work_struct *work)
> +{
> +	struct qcom_q6v5 *q6v5 = container_of(work, struct qcom_q6v5, crash_handler);
> +	struct rproc *rproc = q6v5->rproc;
> +	struct rproc_subdev *subdev;
> +
> +	mutex_lock(&rproc->lock);
> +
> +	list_for_each_entry_reverse(subdev, &rproc->subdevs, node) {
> +		if (subdev->stop)
> +			subdev->stop(subdev, true);
> +	}
> +
> +	mutex_unlock(&rproc->lock);
> +
> +	/*
> +	 * Temporary workaround until ramdump userspace application calls
> +	 * sync() and fclose() on attempting the dump.
> +	 */

I'm not able to see how this would happen, you only schedule_work() if
rproc->recovery_disabled and the coredump generation will only happen if
!rproc->recovery_disabled.

Also, the reason I can see for invoking panic() here would be to gather
a full system coredump to do post mortem analysis of all systems
involved. So why would you capture a coredump as well? (This needs to be
documented clearly...)

> +	msleep(100);
> +	panic("Panicking, remoteproc %s crashed\n", q6v5->rproc->name);

As you might know, calling panic() is frowned upon. But I can see the
benefit of being able to do full system post mortem analysis.

I do however think that your patch indicates a shortcoming (to you) in
the remoteproc_core's recovery logic. So please fix that shortcoming,
rather than circumventing it in your driver.

I.e. make it possible to get rproc_crash_handler_work() behave like you
want it to, with a good motivation to why you want that.

Regards,
Bjorn

> +}
> +
>  static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
>  {
>  	struct qcom_q6v5 *q6v5 = data;
> @@ -113,6 +137,9 @@ static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
>  		dev_err(q6v5->dev, "watchdog without message\n");
>  
>  	q6v5->running = false;
> +	if (q6v5->rproc->recovery_disabled)
> +		schedule_work(&q6v5->crash_handler);
> +
>  	rproc_report_crash(q6v5->rproc, RPROC_WATCHDOG);
>  
>  	return IRQ_HANDLED;
> @@ -134,6 +161,9 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void *data)
>  		dev_err(q6v5->dev, "fatal error without message\n");
>  
>  	q6v5->running = false;
> +	if (q6v5->rproc->recovery_disabled)
> +		schedule_work(&q6v5->crash_handler);
> +
>  	rproc_report_crash(q6v5->rproc, RPROC_FATAL_ERROR);
>  
>  	return IRQ_HANDLED;
> @@ -354,6 +384,7 @@ int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
>  	if (IS_ERR(q6v5->path))
>  		return dev_err_probe(&pdev->dev, PTR_ERR(q6v5->path),
>  				     "failed to acquire interconnect path\n");
> +	INIT_WORK(&q6v5->crash_handler, qcom_q6v5_crash_handler_work);
>  
>  	return 0;
>  }
> diff --git a/drivers/remoteproc/qcom_q6v5.h b/drivers/remoteproc/qcom_q6v5.h
> index 5a859c4..b1654be 100644
> --- a/drivers/remoteproc/qcom_q6v5.h
> +++ b/drivers/remoteproc/qcom_q6v5.h
> @@ -29,6 +29,8 @@ struct qcom_q6v5 {
>  	int handover_irq;
>  	int stop_irq;
>  
> +	struct work_struct crash_handler;
> +
>  	bool handover_issued;
>  
>  	struct completion start_done;
> -- 
> 2.7.4
>
diff mbox series

Patch

diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c
index 497acfb..89f5384 100644
--- a/drivers/remoteproc/qcom_q6v5.c
+++ b/drivers/remoteproc/qcom_q6v5.c
@@ -15,6 +15,7 @@ 
 #include <linux/soc/qcom/smem.h>
 #include <linux/soc/qcom/smem_state.h>
 #include <linux/remoteproc.h>
+#include <linux/delay.h>
 #include "qcom_common.h"
 #include "qcom_q6v5.h"
 
@@ -94,6 +95,29 @@  int qcom_q6v5_unprepare(struct qcom_q6v5 *q6v5)
 }
 EXPORT_SYMBOL_GPL(qcom_q6v5_unprepare);
 
+static void qcom_q6v5_crash_handler_work(struct work_struct *work)
+{
+	struct qcom_q6v5 *q6v5 = container_of(work, struct qcom_q6v5, crash_handler);
+	struct rproc *rproc = q6v5->rproc;
+	struct rproc_subdev *subdev;
+
+	mutex_lock(&rproc->lock);
+
+	list_for_each_entry_reverse(subdev, &rproc->subdevs, node) {
+		if (subdev->stop)
+			subdev->stop(subdev, true);
+	}
+
+	mutex_unlock(&rproc->lock);
+
+	/*
+	 * Temporary workaround until ramdump userspace application calls
+	 * sync() and fclose() on attempting the dump.
+	 */
+	msleep(100);
+	panic("Panicking, remoteproc %s crashed\n", q6v5->rproc->name);
+}
+
 static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
 {
 	struct qcom_q6v5 *q6v5 = data;
@@ -113,6 +137,9 @@  static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
 		dev_err(q6v5->dev, "watchdog without message\n");
 
 	q6v5->running = false;
+	if (q6v5->rproc->recovery_disabled)
+		schedule_work(&q6v5->crash_handler);
+
 	rproc_report_crash(q6v5->rproc, RPROC_WATCHDOG);
 
 	return IRQ_HANDLED;
@@ -134,6 +161,9 @@  static irqreturn_t q6v5_fatal_interrupt(int irq, void *data)
 		dev_err(q6v5->dev, "fatal error without message\n");
 
 	q6v5->running = false;
+	if (q6v5->rproc->recovery_disabled)
+		schedule_work(&q6v5->crash_handler);
+
 	rproc_report_crash(q6v5->rproc, RPROC_FATAL_ERROR);
 
 	return IRQ_HANDLED;
@@ -354,6 +384,7 @@  int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
 	if (IS_ERR(q6v5->path))
 		return dev_err_probe(&pdev->dev, PTR_ERR(q6v5->path),
 				     "failed to acquire interconnect path\n");
+	INIT_WORK(&q6v5->crash_handler, qcom_q6v5_crash_handler_work);
 
 	return 0;
 }
diff --git a/drivers/remoteproc/qcom_q6v5.h b/drivers/remoteproc/qcom_q6v5.h
index 5a859c4..b1654be 100644
--- a/drivers/remoteproc/qcom_q6v5.h
+++ b/drivers/remoteproc/qcom_q6v5.h
@@ -29,6 +29,8 @@  struct qcom_q6v5 {
 	int handover_irq;
 	int stop_irq;
 
+	struct work_struct crash_handler;
+
 	bool handover_issued;
 
 	struct completion start_done;