diff mbox series

[V2] remoteproc: support self recovery after rproc crash

Message ID 20220126085120.3397450-1-peng.fan@oss.nxp.com (mailing list archive)
State Superseded
Headers show
Series [V2] remoteproc: support self recovery after rproc crash | expand

Commit Message

Peng Fan (OSS) Jan. 26, 2022, 8:51 a.m. UTC
From: Peng Fan <peng.fan@nxp.com>

Current logic only support main processor to stop/start the remote
processor after rproc crash. However to SoC, such as i.MX8QM/QXP, the
remote processor could do self recovery after crash and trigger watchdog
reboot. It does not need main processor to load image, stop/start M4
core.

This patch add a new flag to indicate whether the SoC has self recovery
capability. And introduce two functions: rproc_self_recovery,
rproc_assisted_recovery for the two cases. Assisted recovery is as
before, let main processor to help recovery, while self recovery is
recover itself withou help. To self recovery, we only do detach and
attach.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
---

V2:
 Nothing change in V2.
 Only move this patch out from
 https://patchwork.kernel.org/project/linux-remoteproc/list/?series=604364

 drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++++++--------
 include/linux/remoteproc.h           |  2 +
 2 files changed, 49 insertions(+), 19 deletions(-)

Comments

Peng Fan Feb. 7, 2022, 1:31 a.m. UTC | #1
> Subject: [PATCH V2] remoteproc: support self recovery after rproc crash

Any comments?

Thanks,
Peng.

> 
> From: Peng Fan <peng.fan@nxp.com>
> 
> Current logic only support main processor to stop/start the remote processor
> after rproc crash. However to SoC, such as i.MX8QM/QXP, the remote
> processor could do self recovery after crash and trigger watchdog reboot. It
> does not need main processor to load image, stop/start M4 core.
> 
> This patch add a new flag to indicate whether the SoC has self recovery
> capability. And introduce two functions: rproc_self_recovery,
> rproc_assisted_recovery for the two cases. Assisted recovery is as before, let
> main processor to help recovery, while self recovery is recover itself withou
> help. To self recovery, we only do detach and attach.
> 
> Signed-off-by: Peng Fan <peng.fan@nxp.com>
> ---
> 
> V2:
>  Nothing change in V2.
>  Only move this patch out from
>  https://patchwork.kernel.org/project/linux-remoteproc/list/?series=604364
> 
>  drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++++++--------
>  include/linux/remoteproc.h           |  2 +
>  2 files changed, 49 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/remoteproc/remoteproc_core.c
> b/drivers/remoteproc/remoteproc_core.c
> index 69f51acf235e..4bd5544dab8f 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
>  	return 0;
>  }
> 
> +static int rproc_self_recovery(struct rproc *rproc) {
> +	int ret;
> +
> +	mutex_unlock(&rproc->lock);
> +	ret = rproc_detach(rproc);
> +	mutex_lock(&rproc->lock);
> +	if (ret)
> +		return ret;
> +
> +	if (atomic_inc_return(&rproc->power) > 1)
> +		return 0;
> +	return rproc_attach(rproc);
> +}
> +
> +static int rproc_assisted_recovery(struct rproc *rproc) {
> +	const struct firmware *firmware_p;
> +	struct device *dev = &rproc->dev;
> +	int ret;
> +
> +	ret = rproc_stop(rproc, true);
> +	if (ret)
> +		return ret;
> +
> +	/* generate coredump */
> +	rproc->ops->coredump(rproc);
> +
> +	/* load firmware */
> +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> +	if (ret < 0) {
> +		dev_err(dev, "request_firmware failed: %d\n", ret);
> +		return ret;
> +	}
> +
> +	/* boot the remote processor up again */
> +	ret = rproc_start(rproc, firmware_p);
> +
> +	release_firmware(firmware_p);
> +
> +	return ret;
> +}
> +
>  /**
>   * rproc_trigger_recovery() - recover a remoteproc
>   * @rproc: the remote processor
> @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
>   */
>  int rproc_trigger_recovery(struct rproc *rproc)  {
> -	const struct firmware *firmware_p;
>  	struct device *dev = &rproc->dev;
>  	int ret;
> 
> @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc *rproc)
> 
>  	dev_err(dev, "recovering %s\n", rproc->name);
> 
> -	ret = rproc_stop(rproc, true);
> -	if (ret)
> -		goto unlock_mutex;
> -
> -	/* generate coredump */
> -	rproc->ops->coredump(rproc);
> -
> -	/* load firmware */
> -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> -	if (ret < 0) {
> -		dev_err(dev, "request_firmware failed: %d\n", ret);
> -		goto unlock_mutex;
> -	}
> -
> -	/* boot the remote processor up again */
> -	ret = rproc_start(rproc, firmware_p);
> -
> -	release_firmware(firmware_p);
> +	if (rproc->self_recovery)
> +		ret = rproc_self_recovery(rproc);
> +	else
> +		ret = rproc_assisted_recovery(rproc);
> 
>  unlock_mutex:
>  	mutex_unlock(&rproc->lock);
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index
> e0600e1e5c17..b32ef46f8aa4 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -529,6 +529,7 @@ struct rproc_dump_segment {
>   * @elf_machine: firmware ELF machine
>   * @cdev: character device of the rproc
>   * @cdev_put_on_release: flag to indicate if remoteproc should be
> shutdown on @char_dev release
> + * @self_recovery: flag to indicate if remoteproc support self recovery
>   */
>  struct rproc {
>  	struct list_head node;
> @@ -568,6 +569,7 @@ struct rproc {
>  	u16 elf_machine;
>  	struct cdev cdev;
>  	bool cdev_put_on_release;
> +	bool self_recovery;
>  };
> 
>  /**
> --
> 2.25.1
Mathieu Poirier Feb. 7, 2022, 5:34 p.m. UTC | #2
On Mon, Feb 07, 2022 at 01:31:07AM +0000, Peng Fan wrote:
> > Subject: [PATCH V2] remoteproc: support self recovery after rproc crash
> 
> Any comments?

Well... At this time there is two patchsets on the mailing list that are
introducing serious changes to the subsystem - yours and Arnaud's virtio
refactoring work:

. [PATCH V2] remoteproc: support self recovery after rproc crash
. [RFC PATCH v3 0/4] remoteproc: restructure the remoteproc VirtIO device

Both patchsets have ramifications for NXP, ST and TI.  As such I am expecting
you, Arnaud and Hari to review those before I start looking at them.

> 
> Thanks,
> Peng.
> 
> > 
> > From: Peng Fan <peng.fan@nxp.com>
> > 
> > Current logic only support main processor to stop/start the remote processor
> > after rproc crash. However to SoC, such as i.MX8QM/QXP, the remote
> > processor could do self recovery after crash and trigger watchdog reboot. It
> > does not need main processor to load image, stop/start M4 core.
> > 
> > This patch add a new flag to indicate whether the SoC has self recovery
> > capability. And introduce two functions: rproc_self_recovery,
> > rproc_assisted_recovery for the two cases. Assisted recovery is as before, let
> > main processor to help recovery, while self recovery is recover itself withou
> > help. To self recovery, we only do detach and attach.
> > 
> > Signed-off-by: Peng Fan <peng.fan@nxp.com>
> > ---
> > 
> > V2:
> >  Nothing change in V2.
> >  Only move this patch out from
> >  https://patchwork.kernel.org/project/linux-remoteproc/list/?series=604364
> > 
> >  drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++++++--------
> >  include/linux/remoteproc.h           |  2 +
> >  2 files changed, 49 insertions(+), 19 deletions(-)
> > 
> > diff --git a/drivers/remoteproc/remoteproc_core.c
> > b/drivers/remoteproc/remoteproc_core.c
> > index 69f51acf235e..4bd5544dab8f 100644
> > --- a/drivers/remoteproc/remoteproc_core.c
> > +++ b/drivers/remoteproc/remoteproc_core.c
> > @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
> >  	return 0;
> >  }
> > 
> > +static int rproc_self_recovery(struct rproc *rproc) {
> > +	int ret;
> > +
> > +	mutex_unlock(&rproc->lock);
> > +	ret = rproc_detach(rproc);
> > +	mutex_lock(&rproc->lock);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (atomic_inc_return(&rproc->power) > 1)
> > +		return 0;
> > +	return rproc_attach(rproc);
> > +}
> > +
> > +static int rproc_assisted_recovery(struct rproc *rproc) {
> > +	const struct firmware *firmware_p;
> > +	struct device *dev = &rproc->dev;
> > +	int ret;
> > +
> > +	ret = rproc_stop(rproc, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* generate coredump */
> > +	rproc->ops->coredump(rproc);
> > +
> > +	/* load firmware */
> > +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> > +	if (ret < 0) {
> > +		dev_err(dev, "request_firmware failed: %d\n", ret);
> > +		return ret;
> > +	}
> > +
> > +	/* boot the remote processor up again */
> > +	ret = rproc_start(rproc, firmware_p);
> > +
> > +	release_firmware(firmware_p);
> > +
> > +	return ret;
> > +}
> > +
> >  /**
> >   * rproc_trigger_recovery() - recover a remoteproc
> >   * @rproc: the remote processor
> > @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
> >   */
> >  int rproc_trigger_recovery(struct rproc *rproc)  {
> > -	const struct firmware *firmware_p;
> >  	struct device *dev = &rproc->dev;
> >  	int ret;
> > 
> > @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc *rproc)
> > 
> >  	dev_err(dev, "recovering %s\n", rproc->name);
> > 
> > -	ret = rproc_stop(rproc, true);
> > -	if (ret)
> > -		goto unlock_mutex;
> > -
> > -	/* generate coredump */
> > -	rproc->ops->coredump(rproc);
> > -
> > -	/* load firmware */
> > -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> > -	if (ret < 0) {
> > -		dev_err(dev, "request_firmware failed: %d\n", ret);
> > -		goto unlock_mutex;
> > -	}
> > -
> > -	/* boot the remote processor up again */
> > -	ret = rproc_start(rproc, firmware_p);
> > -
> > -	release_firmware(firmware_p);
> > +	if (rproc->self_recovery)
> > +		ret = rproc_self_recovery(rproc);
> > +	else
> > +		ret = rproc_assisted_recovery(rproc);
> > 
> >  unlock_mutex:
> >  	mutex_unlock(&rproc->lock);
> > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index
> > e0600e1e5c17..b32ef46f8aa4 100644
> > --- a/include/linux/remoteproc.h
> > +++ b/include/linux/remoteproc.h
> > @@ -529,6 +529,7 @@ struct rproc_dump_segment {
> >   * @elf_machine: firmware ELF machine
> >   * @cdev: character device of the rproc
> >   * @cdev_put_on_release: flag to indicate if remoteproc should be
> > shutdown on @char_dev release
> > + * @self_recovery: flag to indicate if remoteproc support self recovery
> >   */
> >  struct rproc {
> >  	struct list_head node;
> > @@ -568,6 +569,7 @@ struct rproc {
> >  	u16 elf_machine;
> >  	struct cdev cdev;
> >  	bool cdev_put_on_release;
> > +	bool self_recovery;
> >  };
> > 
> >  /**
> > --
> > 2.25.1
>
Arnaud POULIQUEN Feb. 14, 2022, 6:41 p.m. UTC | #3
Hi Peng,

On 1/26/22 09:51, Peng Fan (OSS) wrote:
> From: Peng Fan <peng.fan@nxp.com>
> 
> Current logic only support main processor to stop/start the remote
> processor after rproc crash. However to SoC, such as i.MX8QM/QXP, the
> remote processor could do self recovery after crash and trigger watchdog
> reboot. It does not need main processor to load image, stop/start M4
> core.


On stm32mp1 platform the remote processor watchdog generates an early interrupt
that could be used to detach and reattach before the reset of the remote processor.
I need to test race condition,but I suppose that this should works if the resource
table is not reinitialized by the remote processor firmware.

Another option for the stm32mp1 is that remoteproc manages the reset of the 
remote processor.
For instance this allows to save a core-dump before manually resetting the remote
processor.
But looks like this use case can be handled later, as mentioned below. 

> 
> This patch add a new flag to indicate whether the SoC has self recovery
> capability. And introduce two functions: rproc_self_recovery,
> rproc_assisted_recovery for the two cases. Assisted recovery is as
> before, let main processor to help recovery, while self recovery is
> recover itself withou help. To self recovery, we only do detach and
> attach.


> 
> Signed-off-by: Peng Fan <peng.fan@nxp.com>
> ---
> 
> V2:
>  Nothing change in V2.
>  Only move this patch out from
>  https://patchwork.kernel.org/project/linux-remoteproc/list/?series=604364
> 
>  drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++++++--------
>  include/linux/remoteproc.h           |  2 +
>  2 files changed, 49 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
> index 69f51acf235e..4bd5544dab8f 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
>  	return 0;
>  }
>  
> +static int rproc_self_recovery(struct rproc *rproc)
> +{
> +	int ret;
> +
> +	mutex_unlock(&rproc->lock);
> +	ret = rproc_detach(rproc);
> +	mutex_lock(&rproc->lock);
> +	if (ret)
> +		return ret;

Here we would want to perform a core dump and manually reset the
co-processor.
I suppose that a new rproc ops could be called here in a next step.

> +
> +	if (atomic_inc_return(&rproc->power) > 1)
> +		return 0;

Do you identify a use case that needs to test rproc->power to
skip the attach?
If yes could you add a comment to describe it?

> +	return rproc_attach(rproc);
> +}
> +
> +static int rproc_assisted_recovery(struct rproc *rproc)
> +{
> +	const struct firmware *firmware_p;
> +	struct device *dev = &rproc->dev;
> +	int ret;
> +
> +	ret = rproc_stop(rproc, true);
> +	if (ret)
> +		return ret;
> +
> +	/* generate coredump */
> +	rproc->ops->coredump(rproc);
> +
> +	/* load firmware */
> +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> +	if (ret < 0) {
> +		dev_err(dev, "request_firmware failed: %d\n", ret);
> +		return ret;
> +	}
> +
> +	/* boot the remote processor up again */
> +	ret = rproc_start(rproc, firmware_p);
> +
> +	release_firmware(firmware_p);
> +
> +	return ret;
> +}
> +
>  /**
>   * rproc_trigger_recovery() - recover a remoteproc
>   * @rproc: the remote processor
> @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
>   */
>  int rproc_trigger_recovery(struct rproc *rproc)
>  {
> -	const struct firmware *firmware_p;
>  	struct device *dev = &rproc->dev;
>  	int ret;
>  
> @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc *rproc)
>  
>  	dev_err(dev, "recovering %s\n", rproc->name);
>  
> -	ret = rproc_stop(rproc, true);
> -	if (ret)
> -		goto unlock_mutex;
> -
> -	/* generate coredump */
> -	rproc->ops->coredump(rproc);
> -
> -	/* load firmware */
> -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> -	if (ret < 0) {
> -		dev_err(dev, "request_firmware failed: %d\n", ret);
> -		goto unlock_mutex;
> -	}
> -
> -	/* boot the remote processor up again */
> -	ret = rproc_start(rproc, firmware_p);
> -
> -	release_firmware(firmware_p);
> +	if (rproc->self_recovery)
> +		ret = rproc_self_recovery(rproc);

If some platforms have to manually reset the remote processor (without
reloading the firmware) the name could not be relevant...

Following comments are only suggestions that needs to be commented by maintainers

What about rproc_attach_recovery ?

> +	else
> +		ret = rproc_assisted_recovery(rproc);

and rproc_firmware_recovery ?


>  
>  unlock_mutex:
>  	mutex_unlock(&rproc->lock);
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> index e0600e1e5c17..b32ef46f8aa4 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -529,6 +529,7 @@ struct rproc_dump_segment {
>   * @elf_machine: firmware ELF machine
>   * @cdev: character device of the rproc
>   * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release
> + * @self_recovery: flag to indicate if remoteproc support self recovery
>   */
>  struct rproc {
>  	struct list_head node;
> @@ -568,6 +569,7 @@ struct rproc {
>  	u16 elf_machine;
>  	struct cdev cdev;
>  	bool cdev_put_on_release;
> +	bool self_recovery;

This bool seems needed because we have lost the previous state before crash. 
I wonder if a new rproc->state such as RPROC_REBOOT could avoid this boolean.


I will try to test you patch on stm32mp1 next week

Regards,
Arnaud

>  };
>  
>  /**
Peng Fan Feb. 15, 2022, 8:42 a.m. UTC | #4
> Subject: Re: [PATCH V2] remoteproc: support self recovery after rproc crash
> 
> Hi Peng,
> 
> On 1/26/22 09:51, Peng Fan (OSS) wrote:
> > From: Peng Fan <peng.fan@nxp.com>
> >
> > Current logic only support main processor to stop/start the remote
> > processor after rproc crash. However to SoC, such as i.MX8QM/QXP, the
> > remote processor could do self recovery after crash and trigger
> > watchdog reboot. It does not need main processor to load image,
> > stop/start M4 core.
> 
> 
> On stm32mp1 platform the remote processor watchdog generates an early
> interrupt that could be used to detach and reattach before the reset of the
> remote processor.
> I need to test race condition,but I suppose that this should works if the
> resource table is not reinitialized by the remote processor firmware.

In i.MX8QM/QXP partition setup, resource table will be reinitialized by
remote firmware.

> 
> Another option for the stm32mp1 is that remoteproc manages the reset of
> the remote processor.
> For instance this allows to save a core-dump before manually resetting the
> remote processor.
> But looks like this use case can be handled later, as mentioned below.
> 
> >
> > This patch add a new flag to indicate whether the SoC has self
> > recovery capability. And introduce two functions: rproc_self_recovery,
> > rproc_assisted_recovery for the two cases. Assisted recovery is as
> > before, let main processor to help recovery, while self recovery is
> > recover itself withou help. To self recovery, we only do detach and
> > attach.
> 
> 
> >
> > Signed-off-by: Peng Fan <peng.fan@nxp.com>
> > ---
> >
> > V2:
> >  Nothing change in V2.
> >  Only move this patch out from
> >
> > https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatc
> >
> hwork.kernel.org%2Fproject%2Flinux-remoteproc%2Flist%2F%3Fseries%3D6
> 04
> >
> 364&amp;data=04%7C01%7Cpeng.fan%40nxp.com%7C9e8a4ea774124a896f
> ed08d9ef
> >
> e9ac6c%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C637804609
> 168765154
> > %7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzI
> iLCJBTiI6I
> >
> k1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=ewUl7diAOfkomSQMiPDQ
> o5A6c2Hklgo
> > 8xYbMBk5A4Ic%3D&amp;reserved=0
> >
> >  drivers/remoteproc/remoteproc_core.c | 66
> ++++++++++++++++++++--------
> >  include/linux/remoteproc.h           |  2 +
> >  2 files changed, 49 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/remoteproc/remoteproc_core.c
> > b/drivers/remoteproc/remoteproc_core.c
> > index 69f51acf235e..4bd5544dab8f 100644
> > --- a/drivers/remoteproc/remoteproc_core.c
> > +++ b/drivers/remoteproc/remoteproc_core.c
> > @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
> >  	return 0;
> >  }
> >
> > +static int rproc_self_recovery(struct rproc *rproc) {
> > +	int ret;
> > +
> > +	mutex_unlock(&rproc->lock);
> > +	ret = rproc_detach(rproc);
> > +	mutex_lock(&rproc->lock);
> > +	if (ret)
> > +		return ret;
> 
> Here we would want to perform a core dump and manually reset the
> co-processor.

It is self recovery, not needed manual reset from main processor.

> I suppose that a new rproc ops could be called here in a next step.

Not very sure, but core dump could be added if needed.

> 
> > +
> > +	if (atomic_inc_return(&rproc->power) > 1)
> > +		return 0;
> 
> Do you identify a use case that needs to test rproc->power to skip the attach?
> If yes could you add a comment to describe it?

Just to avoid some error path. I think only when power is 1,
and self recovery could attach again.

> 
> > +	return rproc_attach(rproc);
> > +}
> > +
> > +static int rproc_assisted_recovery(struct rproc *rproc) {
> > +	const struct firmware *firmware_p;
> > +	struct device *dev = &rproc->dev;
> > +	int ret;
> > +
> > +	ret = rproc_stop(rproc, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* generate coredump */
> > +	rproc->ops->coredump(rproc);
> > +
> > +	/* load firmware */
> > +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> > +	if (ret < 0) {
> > +		dev_err(dev, "request_firmware failed: %d\n", ret);
> > +		return ret;
> > +	}
> > +
> > +	/* boot the remote processor up again */
> > +	ret = rproc_start(rproc, firmware_p);
> > +
> > +	release_firmware(firmware_p);
> > +
> > +	return ret;
> > +}
> > +
> >  /**
> >   * rproc_trigger_recovery() - recover a remoteproc
> >   * @rproc: the remote processor
> > @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
> >   */
> >  int rproc_trigger_recovery(struct rproc *rproc)  {
> > -	const struct firmware *firmware_p;
> >  	struct device *dev = &rproc->dev;
> >  	int ret;
> >
> > @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc
> > *rproc)
> >
> >  	dev_err(dev, "recovering %s\n", rproc->name);
> >
> > -	ret = rproc_stop(rproc, true);
> > -	if (ret)
> > -		goto unlock_mutex;
> > -
> > -	/* generate coredump */
> > -	rproc->ops->coredump(rproc);
> > -
> > -	/* load firmware */
> > -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> > -	if (ret < 0) {
> > -		dev_err(dev, "request_firmware failed: %d\n", ret);
> > -		goto unlock_mutex;
> > -	}
> > -
> > -	/* boot the remote processor up again */
> > -	ret = rproc_start(rproc, firmware_p);
> > -
> > -	release_firmware(firmware_p);
> > +	if (rproc->self_recovery)
> > +		ret = rproc_self_recovery(rproc);
> 
> If some platforms have to manually reset the remote processor (without
> reloading the firmware) the name could not be relevant...
> 
> Following comments are only suggestions that needs to be commented by
> maintainers
> 
> What about rproc_attach_recovery ?

Looks better.

> 
> > +	else
> > +		ret = rproc_assisted_recovery(rproc);
> 
> and rproc_firmware_recovery ?

Yeah, better.

> 
> 
> >
> >  unlock_mutex:
> >  	mutex_unlock(&rproc->lock);
> > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > index e0600e1e5c17..b32ef46f8aa4 100644
> > --- a/include/linux/remoteproc.h
> > +++ b/include/linux/remoteproc.h
> > @@ -529,6 +529,7 @@ struct rproc_dump_segment {
> >   * @elf_machine: firmware ELF machine
> >   * @cdev: character device of the rproc
> >   * @cdev_put_on_release: flag to indicate if remoteproc should be
> > shutdown on @char_dev release
> > + * @self_recovery: flag to indicate if remoteproc support self
> > + recovery
> >   */
> >  struct rproc {
> >  	struct list_head node;
> > @@ -568,6 +569,7 @@ struct rproc {
> >  	u16 elf_machine;
> >  	struct cdev cdev;
> >  	bool cdev_put_on_release;
> > +	bool self_recovery;
> 
> This bool seems needed because we have lost the previous state before crash.
> I wonder if a new rproc->state such as RPROC_REBOOT could avoid this
> boolean.

REBOOT not able to differentiable self recovery or firmware recovery?
Anyway I'll check to add a BIT instead a bool.

> 
> 
> I will try to test you patch on stm32mp1 next week

Thanks,
Peng.

> 
> Regards,
> Arnaud
> 
> >  };
> >
> >  /**
Arnaud POULIQUEN Feb. 24, 2022, 2:08 p.m. UTC | #5
Hi Peng,

On 2/14/22 19:41, Arnaud POULIQUEN wrote:
> Hi Peng,
> 
> On 1/26/22 09:51, Peng Fan (OSS) wrote:
>> From: Peng Fan <peng.fan@nxp.com>
>>
>> Current logic only support main processor to stop/start the remote
>> processor after rproc crash. However to SoC, such as i.MX8QM/QXP, the
>> remote processor could do self recovery after crash and trigger watchdog
>> reboot. It does not need main processor to load image, stop/start M4
>> core.
> 
> 
> On stm32mp1 platform the remote processor watchdog generates an early interrupt
> that could be used to detach and reattach before the reset of the remote processor.
> I need to test race condition,but I suppose that this should works if the resource
> table is not reinitialized by the remote processor firmware.
> 
> Another option for the stm32mp1 is that remoteproc manages the reset of the 
> remote processor.
> For instance this allows to save a core-dump before manually resetting the remote
> processor.
> But looks like this use case can be handled later, as mentioned below. 
> 
>>
>> This patch add a new flag to indicate whether the SoC has self recovery
>> capability. And introduce two functions: rproc_self_recovery,
>> rproc_assisted_recovery for the two cases. Assisted recovery is as
>> before, let main processor to help recovery, while self recovery is
>> recover itself withou help. To self recovery, we only do detach and
>> attach.
> 
> 
>>
>> Signed-off-by: Peng Fan <peng.fan@nxp.com>
>> ---
>>
>> V2:
>>  Nothing change in V2.
>>  Only move this patch out from
>>  https://patchwork.kernel.org/project/linux-remoteproc/list/?series=604364
>>
>>  drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++++++--------
>>  include/linux/remoteproc.h           |  2 +
>>  2 files changed, 49 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
>> index 69f51acf235e..4bd5544dab8f 100644
>> --- a/drivers/remoteproc/remoteproc_core.c
>> +++ b/drivers/remoteproc/remoteproc_core.c
>> @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
>>  	return 0;
>>  }
>>  
>> +static int rproc_self_recovery(struct rproc *rproc)
>> +{
>> +	int ret;
>> +
>> +	mutex_unlock(&rproc->lock);
>> +	ret = rproc_detach(rproc);
>> +	mutex_lock(&rproc->lock);
>> +	if (ret)
>> +		return ret;
> 
> Here we would want to perform a core dump and manually reset the
> co-processor.
> I suppose that a new rproc ops could be called here in a next step.
> 
>> +
>> +	if (atomic_inc_return(&rproc->power) > 1)
>> +		return 0;
> 
> Do you identify a use case that needs to test rproc->power to
> skip the attach?
> If yes could you add a comment to describe it?
> 
>> +	return rproc_attach(rproc);
>> +}
>> +
>> +static int rproc_assisted_recovery(struct rproc *rproc)
>> +{
>> +	const struct firmware *firmware_p;
>> +	struct device *dev = &rproc->dev;
>> +	int ret;
>> +
>> +	ret = rproc_stop(rproc, true);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* generate coredump */
>> +	rproc->ops->coredump(rproc);
>> +
>> +	/* load firmware */
>> +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
>> +	if (ret < 0) {
>> +		dev_err(dev, "request_firmware failed: %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	/* boot the remote processor up again */
>> +	ret = rproc_start(rproc, firmware_p);
>> +
>> +	release_firmware(firmware_p);
>> +
>> +	return ret;
>> +}
>> +
>>  /**
>>   * rproc_trigger_recovery() - recover a remoteproc
>>   * @rproc: the remote processor
>> @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
>>   */
>>  int rproc_trigger_recovery(struct rproc *rproc)
>>  {
>> -	const struct firmware *firmware_p;
>>  	struct device *dev = &rproc->dev;
>>  	int ret;
>>  
>> @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc *rproc)
>>  
>>  	dev_err(dev, "recovering %s\n", rproc->name);
>>  
>> -	ret = rproc_stop(rproc, true);
>> -	if (ret)
>> -		goto unlock_mutex;
>> -
>> -	/* generate coredump */
>> -	rproc->ops->coredump(rproc);
>> -
>> -	/* load firmware */
>> -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
>> -	if (ret < 0) {
>> -		dev_err(dev, "request_firmware failed: %d\n", ret);
>> -		goto unlock_mutex;
>> -	}
>> -
>> -	/* boot the remote processor up again */
>> -	ret = rproc_start(rproc, firmware_p);
>> -
>> -	release_firmware(firmware_p);
>> +	if (rproc->self_recovery)
>> +		ret = rproc_self_recovery(rproc);
> 
> If some platforms have to manually reset the remote processor (without
> reloading the firmware) the name could not be relevant...
> 
> Following comments are only suggestions that needs to be commented by maintainers
> 
> What about rproc_attach_recovery ?
> 
>> +	else
>> +		ret = rproc_assisted_recovery(rproc);
> 
> and rproc_firmware_recovery ?
> 
> 
>>  
>>  unlock_mutex:
>>  	mutex_unlock(&rproc->lock);
>> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
>> index e0600e1e5c17..b32ef46f8aa4 100644
>> --- a/include/linux/remoteproc.h
>> +++ b/include/linux/remoteproc.h
>> @@ -529,6 +529,7 @@ struct rproc_dump_segment {
>>   * @elf_machine: firmware ELF machine
>>   * @cdev: character device of the rproc
>>   * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release
>> + * @self_recovery: flag to indicate if remoteproc support self recovery
>>   */
>>  struct rproc {
>>  	struct list_head node;
>> @@ -568,6 +569,7 @@ struct rproc {
>>  	u16 elf_machine;
>>  	struct cdev cdev;
>>  	bool cdev_put_on_release;
>> +	bool self_recovery;
> 
> This bool seems needed because we have lost the previous state before crash. 
> I wonder if a new rproc->state such as RPROC_REBOOT could avoid this boolean.
> 
> 
> I will try to test you patch on stm32mp1 next week

I performed few tests on the stm32mp1 with your patch.
Thanks to the resetting of the resource tables on detachment, this works quite well.

Regards,
Arnaud

> 
> Regards,
> Arnaud
> 
>>  };
>>  
>>  /**
Peng Fan Feb. 25, 2022, 2:10 a.m. UTC | #6
> Subject: Re: [PATCH V2] remoteproc: support self recovery after rproc crash
> 
> Hi Peng,
> 
> On 2/14/22 19:41, Arnaud POULIQUEN wrote:
> > Hi Peng,
> >
> > On 1/26/22 09:51, Peng Fan (OSS) wrote:
> >> From: Peng Fan <peng.fan@nxp.com>
> >>
> >> Current logic only support main processor to stop/start the remote
> >> processor after rproc crash. However to SoC, such as i.MX8QM/QXP, the
> >> remote processor could do self recovery after crash and trigger
> >> watchdog reboot. It does not need main processor to load image,
> >> stop/start M4 core.
> >
> >
> > On stm32mp1 platform the remote processor watchdog generates an early
> > interrupt that could be used to detach and reattach before the reset of the
> remote processor.
> > I need to test race condition,but I suppose that this should works if
> > the resource table is not reinitialized by the remote processor firmware.
> >
> > Another option for the stm32mp1 is that remoteproc manages the reset
> > of the remote processor.
> > For instance this allows to save a core-dump before manually resetting
> > the remote processor.
> > But looks like this use case can be handled later, as mentioned below.
> >
> >>
> >> This patch add a new flag to indicate whether the SoC has self
> >> recovery capability. And introduce two functions:
> >> rproc_self_recovery, rproc_assisted_recovery for the two cases.
> >> Assisted recovery is as before, let main processor to help recovery,
> >> while self recovery is recover itself withou help. To self recovery,
> >> we only do detach and attach.
> >
> >
> >>
> >> Signed-off-by: Peng Fan <peng.fan@nxp.com>
> >> ---
> >>
> >> V2:
> >>  Nothing change in V2.
> >>  Only move this patch out from
> >>
> >> https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpat
> >>
> chwork.kernel.org%2Fproject%2Flinux-remoteproc%2Flist%2F%3Fseries%3D
> 6
> >>
> 04364&amp;data=04%7C01%7Cpeng.fan%40nxp.com%7C3d617c5ddb8c42b
> 315f808d
> >>
> 9f79f2ac7%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C6378130
> 8526949
> >>
> 0105%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2lu
> MzIiLCJB
> >>
> TiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=VKKrqOnFz%2BoXjr%2
> FGMKpm4
> >> yyqmRfApeYY0l2V8A0yy4Y%3D&amp;reserved=0
> >>
> >>  drivers/remoteproc/remoteproc_core.c | 66
> ++++++++++++++++++++--------
> >>  include/linux/remoteproc.h           |  2 +
> >>  2 files changed, 49 insertions(+), 19 deletions(-)
> >>
> >> diff --git a/drivers/remoteproc/remoteproc_core.c
> >> b/drivers/remoteproc/remoteproc_core.c
> >> index 69f51acf235e..4bd5544dab8f 100644
> >> --- a/drivers/remoteproc/remoteproc_core.c
> >> +++ b/drivers/remoteproc/remoteproc_core.c
> >> @@ -1887,6 +1887,49 @@ static int __rproc_detach(struct rproc *rproc)
> >>  	return 0;
> >>  }
> >>
> >> +static int rproc_self_recovery(struct rproc *rproc) {
> >> +	int ret;
> >> +
> >> +	mutex_unlock(&rproc->lock);
> >> +	ret = rproc_detach(rproc);
> >> +	mutex_lock(&rproc->lock);
> >> +	if (ret)
> >> +		return ret;
> >
> > Here we would want to perform a core dump and manually reset the
> > co-processor.
> > I suppose that a new rproc ops could be called here in a next step.
> >
> >> +
> >> +	if (atomic_inc_return(&rproc->power) > 1)
> >> +		return 0;
> >
> > Do you identify a use case that needs to test rproc->power to skip the
> > attach?
> > If yes could you add a comment to describe it?
> >
> >> +	return rproc_attach(rproc);
> >> +}
> >> +
> >> +static int rproc_assisted_recovery(struct rproc *rproc) {
> >> +	const struct firmware *firmware_p;
> >> +	struct device *dev = &rproc->dev;
> >> +	int ret;
> >> +
> >> +	ret = rproc_stop(rproc, true);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	/* generate coredump */
> >> +	rproc->ops->coredump(rproc);
> >> +
> >> +	/* load firmware */
> >> +	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> >> +	if (ret < 0) {
> >> +		dev_err(dev, "request_firmware failed: %d\n", ret);
> >> +		return ret;
> >> +	}
> >> +
> >> +	/* boot the remote processor up again */
> >> +	ret = rproc_start(rproc, firmware_p);
> >> +
> >> +	release_firmware(firmware_p);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >>  /**
> >>   * rproc_trigger_recovery() - recover a remoteproc
> >>   * @rproc: the remote processor
> >> @@ -1901,7 +1944,6 @@ static int __rproc_detach(struct rproc *rproc)
> >>   */
> >>  int rproc_trigger_recovery(struct rproc *rproc)  {
> >> -	const struct firmware *firmware_p;
> >>  	struct device *dev = &rproc->dev;
> >>  	int ret;
> >>
> >> @@ -1915,24 +1957,10 @@ int rproc_trigger_recovery(struct rproc
> >> *rproc)
> >>
> >>  	dev_err(dev, "recovering %s\n", rproc->name);
> >>
> >> -	ret = rproc_stop(rproc, true);
> >> -	if (ret)
> >> -		goto unlock_mutex;
> >> -
> >> -	/* generate coredump */
> >> -	rproc->ops->coredump(rproc);
> >> -
> >> -	/* load firmware */
> >> -	ret = request_firmware(&firmware_p, rproc->firmware, dev);
> >> -	if (ret < 0) {
> >> -		dev_err(dev, "request_firmware failed: %d\n", ret);
> >> -		goto unlock_mutex;
> >> -	}
> >> -
> >> -	/* boot the remote processor up again */
> >> -	ret = rproc_start(rproc, firmware_p);
> >> -
> >> -	release_firmware(firmware_p);
> >> +	if (rproc->self_recovery)
> >> +		ret = rproc_self_recovery(rproc);
> >
> > If some platforms have to manually reset the remote processor (without
> > reloading the firmware) the name could not be relevant...
> >
> > Following comments are only suggestions that needs to be commented by
> > maintainers
> >
> > What about rproc_attach_recovery ?
> >
> >> +	else
> >> +		ret = rproc_assisted_recovery(rproc);
> >
> > and rproc_firmware_recovery ?
> >
> >
> >>
> >>  unlock_mutex:
> >>  	mutex_unlock(&rproc->lock);
> >> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> >> index e0600e1e5c17..b32ef46f8aa4 100644
> >> --- a/include/linux/remoteproc.h
> >> +++ b/include/linux/remoteproc.h
> >> @@ -529,6 +529,7 @@ struct rproc_dump_segment {
> >>   * @elf_machine: firmware ELF machine
> >>   * @cdev: character device of the rproc
> >>   * @cdev_put_on_release: flag to indicate if remoteproc should be
> >> shutdown on @char_dev release
> >> + * @self_recovery: flag to indicate if remoteproc support self
> >> + recovery
> >>   */
> >>  struct rproc {
> >>  	struct list_head node;
> >> @@ -568,6 +569,7 @@ struct rproc {
> >>  	u16 elf_machine;
> >>  	struct cdev cdev;
> >>  	bool cdev_put_on_release;
> >> +	bool self_recovery;
> >
> > This bool seems needed because we have lost the previous state before
> crash.
> > I wonder if a new rproc->state such as RPROC_REBOOT could avoid this
> boolean.
> >
> >
> > I will try to test you patch on stm32mp1 next week
> 
> I performed few tests on the stm32mp1 with your patch.
> Thanks to the resetting of the resource tables on detachment, this works
> quite well.

Thanks very much for you testing this patch. I'll try to address your
previous comments and send out v3.

Thanks,
Peng.

> 
> Regards,
> Arnaud
> 
> >
> > Regards,
> > Arnaud
> >
> >>  };
> >>
> >>  /**
diff mbox series

Patch

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 69f51acf235e..4bd5544dab8f 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1887,6 +1887,49 @@  static int __rproc_detach(struct rproc *rproc)
 	return 0;
 }
 
+static int rproc_self_recovery(struct rproc *rproc)
+{
+	int ret;
+
+	mutex_unlock(&rproc->lock);
+	ret = rproc_detach(rproc);
+	mutex_lock(&rproc->lock);
+	if (ret)
+		return ret;
+
+	if (atomic_inc_return(&rproc->power) > 1)
+		return 0;
+	return rproc_attach(rproc);
+}
+
+static int rproc_assisted_recovery(struct rproc *rproc)
+{
+	const struct firmware *firmware_p;
+	struct device *dev = &rproc->dev;
+	int ret;
+
+	ret = rproc_stop(rproc, true);
+	if (ret)
+		return ret;
+
+	/* generate coredump */
+	rproc->ops->coredump(rproc);
+
+	/* load firmware */
+	ret = request_firmware(&firmware_p, rproc->firmware, dev);
+	if (ret < 0) {
+		dev_err(dev, "request_firmware failed: %d\n", ret);
+		return ret;
+	}
+
+	/* boot the remote processor up again */
+	ret = rproc_start(rproc, firmware_p);
+
+	release_firmware(firmware_p);
+
+	return ret;
+}
+
 /**
  * rproc_trigger_recovery() - recover a remoteproc
  * @rproc: the remote processor
@@ -1901,7 +1944,6 @@  static int __rproc_detach(struct rproc *rproc)
  */
 int rproc_trigger_recovery(struct rproc *rproc)
 {
-	const struct firmware *firmware_p;
 	struct device *dev = &rproc->dev;
 	int ret;
 
@@ -1915,24 +1957,10 @@  int rproc_trigger_recovery(struct rproc *rproc)
 
 	dev_err(dev, "recovering %s\n", rproc->name);
 
-	ret = rproc_stop(rproc, true);
-	if (ret)
-		goto unlock_mutex;
-
-	/* generate coredump */
-	rproc->ops->coredump(rproc);
-
-	/* load firmware */
-	ret = request_firmware(&firmware_p, rproc->firmware, dev);
-	if (ret < 0) {
-		dev_err(dev, "request_firmware failed: %d\n", ret);
-		goto unlock_mutex;
-	}
-
-	/* boot the remote processor up again */
-	ret = rproc_start(rproc, firmware_p);
-
-	release_firmware(firmware_p);
+	if (rproc->self_recovery)
+		ret = rproc_self_recovery(rproc);
+	else
+		ret = rproc_assisted_recovery(rproc);
 
 unlock_mutex:
 	mutex_unlock(&rproc->lock);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index e0600e1e5c17..b32ef46f8aa4 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -529,6 +529,7 @@  struct rproc_dump_segment {
  * @elf_machine: firmware ELF machine
  * @cdev: character device of the rproc
  * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release
+ * @self_recovery: flag to indicate if remoteproc support self recovery
  */
 struct rproc {
 	struct list_head node;
@@ -568,6 +569,7 @@  struct rproc {
 	u16 elf_machine;
 	struct cdev cdev;
 	bool cdev_put_on_release;
+	bool self_recovery;
 };
 
 /**