diff mbox series

[v3,03/14] remoteproc: Add new operation and flags for synchronistation

Message ID 20200424200135.28825-4-mathieu.poirier@linaro.org (mailing list archive)
State New, archived
Headers show
Series remoteproc: Add support for synchronisaton with rproc | expand

Commit Message

Mathieu Poirier April 24, 2020, 8:01 p.m. UTC
Add a new sync_ops to support use cases where the remoteproc
core is synchronising with the remote processor.  Exactly when to use
the synchronisation operations is directed by the flags in structure
rproc_sync_flags.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

Comments

Arnaud POULIQUEN April 28, 2020, 4:38 p.m. UTC | #1
On 4/24/20 10:01 PM, Mathieu Poirier wrote:
> Add a new sync_ops to support use cases where the remoteproc
> core is synchronising with the remote processor.  Exactly when to use
> the synchronisation operations is directed by the flags in structure
> rproc_sync_flags.
> 
> Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> ---
>  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
>  1 file changed, 24 insertions(+)
> 
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> index ac4082f12e8b..ceb3b2bba824 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -353,6 +353,23 @@ enum rsc_handling_status {
>  	RSC_IGNORED	= 1,
>  };
>  
> +/**
> + * struct rproc_sync_flags - platform specific flags indicating which
> + *			      rproc_ops to use at specific times during
> + *			      the rproc lifecycle.
> + * @on_init: true if synchronising with the remote processor at
> + *	     initialisation time
> + * @after_stop: true if synchronising with the remote processor after it was
> + *		stopped from the cmmand line
typo command
> + * @after_crash: true if synchronising with the remote processor after
> + *		 it has crashed
> + */
> +struct rproc_sync_flags {
> +	bool on_init;
> +	bool after_stop;
> +	bool after_crash;
> +};
> +
how about a bit field instead (just a proposition)?
Platform driver would set the sync flag and rproc_set_sync_flag could be a 
simple mask instead of a switch case.

Is it possible to split this patch in a different ways because difficult to understand as
rproc_sync_flags seems not used before 
[PATCH v3 09/14] remoteproc: Deal with synchronisation when crashing

Thanks
Arnaud  

>  /**
>   * struct rproc_ops - platform-specific device handlers
>   * @start:	power on the device and boot it
> @@ -459,6 +476,9 @@ struct rproc_dump_segment {
>   * @firmware: name of firmware file to be loaded
>   * @priv: private data which belongs to the platform-specific rproc module
>   * @ops: platform-specific start/stop rproc handlers
> + * @sync_ops: platform-specific start/stop rproc handlers when
> + *	      synchronising with a remote processor.
> + * @sync_flags: Determine the rproc_ops to choose in specific states.
>   * @dev: virtual device for refcounting and common remoteproc behavior
>   * @power: refcount of users who need this rproc powered up
>   * @state: state of the device
> @@ -482,6 +502,7 @@ struct rproc_dump_segment {
>   * @table_sz: size of @cached_table
>   * @has_iommu: flag to indicate if remote processor is behind an MMU
>   * @auto_boot: flag to indicate if remote processor should be auto-started
> + * @sync_with_rproc: true if currently synchronising with the rproc
>   * @dump_segments: list of segments in the firmware
>   * @nb_vdev: number of vdev currently handled by rproc
>   */
> @@ -492,6 +513,8 @@ struct rproc {
>  	const char *firmware;
>  	void *priv;
>  	struct rproc_ops *ops;
> +	struct rproc_ops *sync_ops;
> +	struct rproc_sync_flags sync_flags;
>  	struct device dev;
>  	atomic_t power;
>  	unsigned int state;
> @@ -515,6 +538,7 @@ struct rproc {
>  	size_t table_sz;
>  	bool has_iommu;
>  	bool auto_boot;
> +	bool sync_with_rproc;
>  	struct list_head dump_segments;
>  	int nb_vdev;
>  	u8 elf_class;
>
Mathieu Poirier April 30, 2020, 7:49 p.m. UTC | #2
On Tue, Apr 28, 2020 at 06:38:41PM +0200, Arnaud POULIQUEN wrote:
> 
> 
> On 4/24/20 10:01 PM, Mathieu Poirier wrote:
> > Add a new sync_ops to support use cases where the remoteproc
> > core is synchronising with the remote processor.  Exactly when to use
> > the synchronisation operations is directed by the flags in structure
> > rproc_sync_flags.
> > 
> > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > ---
> >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> >  1 file changed, 24 insertions(+)
> > 
> > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > index ac4082f12e8b..ceb3b2bba824 100644
> > --- a/include/linux/remoteproc.h
> > +++ b/include/linux/remoteproc.h
> > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> >  	RSC_IGNORED	= 1,
> >  };
> >  
> > +/**
> > + * struct rproc_sync_flags - platform specific flags indicating which
> > + *			      rproc_ops to use at specific times during
> > + *			      the rproc lifecycle.
> > + * @on_init: true if synchronising with the remote processor at
> > + *	     initialisation time
> > + * @after_stop: true if synchronising with the remote processor after it was
> > + *		stopped from the cmmand line
> typo command
> > + * @after_crash: true if synchronising with the remote processor after
> > + *		 it has crashed
> > + */
> > +struct rproc_sync_flags {
> > +	bool on_init;
> > +	bool after_stop;
> > +	bool after_crash;
> > +};
> > +
> how about a bit field instead (just a proposition)?
> Platform driver would set the sync flag and rproc_set_sync_flag could be a 
> simple mask instead of a switch case.

I opted for a structure over bit fields because I thought it would be easier to
read/understand.  Both approaches are valid and I have to particular preference
other than, in my own view, a structure is easier to understand.  

I'll wait a little to see what other people think.  If nobody objects the next
revision will have bit fields.

> 
> Is it possible to split this patch in a different ways because difficult to understand as
> rproc_sync_flags seems not used before 
> [PATCH v3 09/14] remoteproc: Deal with synchronisation when crashing

Certainly

> 
> Thanks
> Arnaud  
> 
> >  /**
> >   * struct rproc_ops - platform-specific device handlers
> >   * @start:	power on the device and boot it
> > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> >   * @firmware: name of firmware file to be loaded
> >   * @priv: private data which belongs to the platform-specific rproc module
> >   * @ops: platform-specific start/stop rproc handlers
> > + * @sync_ops: platform-specific start/stop rproc handlers when
> > + *	      synchronising with a remote processor.
> > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> >   * @dev: virtual device for refcounting and common remoteproc behavior
> >   * @power: refcount of users who need this rproc powered up
> >   * @state: state of the device
> > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> >   * @table_sz: size of @cached_table
> >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > + * @sync_with_rproc: true if currently synchronising with the rproc
> >   * @dump_segments: list of segments in the firmware
> >   * @nb_vdev: number of vdev currently handled by rproc
> >   */
> > @@ -492,6 +513,8 @@ struct rproc {
> >  	const char *firmware;
> >  	void *priv;
> >  	struct rproc_ops *ops;
> > +	struct rproc_ops *sync_ops;
> > +	struct rproc_sync_flags sync_flags;
> >  	struct device dev;
> >  	atomic_t power;
> >  	unsigned int state;
> > @@ -515,6 +538,7 @@ struct rproc {
> >  	size_t table_sz;
> >  	bool has_iommu;
> >  	bool auto_boot;
> > +	bool sync_with_rproc;
> >  	struct list_head dump_segments;
> >  	int nb_vdev;
> >  	u8 elf_class;
> >
Bjorn Andersson May 6, 2020, 12:22 a.m. UTC | #3
On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:

> Add a new sync_ops to support use cases where the remoteproc
> core is synchronising with the remote processor.  Exactly when to use
> the synchronisation operations is directed by the flags in structure
> rproc_sync_flags.
> 

I'm sorry, but no matter how many times I read these patches I have to
translate "synchronising" to "remote controlled", and given the number
of comments clarifying this makes me feel that we could perhaps come up
with a better name?

> Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> ---
>  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
>  1 file changed, 24 insertions(+)
> 
> diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> index ac4082f12e8b..ceb3b2bba824 100644
> --- a/include/linux/remoteproc.h
> +++ b/include/linux/remoteproc.h
> @@ -353,6 +353,23 @@ enum rsc_handling_status {
>  	RSC_IGNORED	= 1,
>  };
>  
> +/**
> + * struct rproc_sync_flags - platform specific flags indicating which
> + *			      rproc_ops to use at specific times during
> + *			      the rproc lifecycle.
> + * @on_init: true if synchronising with the remote processor at
> + *	     initialisation time
> + * @after_stop: true if synchronising with the remote processor after it was
> + *		stopped from the cmmand line
> + * @after_crash: true if synchronising with the remote processor after
> + *		 it has crashed
> + */
> +struct rproc_sync_flags {
> +	bool on_init;

This indirectly splits the RPROC_OFFLINE state in an "offline" and
"already-booted" state. Wouldn't it be clearer to represent this with a
new RPROC_ALREADY_BOOTED state?

> +	bool after_stop;

What does it mean when this is true? That Linux can shut the remote core
down, but someone else will start it?

> +	bool after_crash;

Similarly what is the expected steps to be taken by the core when this
is true? Should rproc_report_crash() simply stop/start the subdevices
and upon one of the ops somehow tell the remote controller that it can
proceed with the recovery?

> +};
> +
>  /**
>   * struct rproc_ops - platform-specific device handlers
>   * @start:	power on the device and boot it
> @@ -459,6 +476,9 @@ struct rproc_dump_segment {
>   * @firmware: name of firmware file to be loaded
>   * @priv: private data which belongs to the platform-specific rproc module
>   * @ops: platform-specific start/stop rproc handlers
> + * @sync_ops: platform-specific start/stop rproc handlers when
> + *	      synchronising with a remote processor.
> + * @sync_flags: Determine the rproc_ops to choose in specific states.
>   * @dev: virtual device for refcounting and common remoteproc behavior
>   * @power: refcount of users who need this rproc powered up
>   * @state: state of the device
> @@ -482,6 +502,7 @@ struct rproc_dump_segment {
>   * @table_sz: size of @cached_table
>   * @has_iommu: flag to indicate if remote processor is behind an MMU
>   * @auto_boot: flag to indicate if remote processor should be auto-started
> + * @sync_with_rproc: true if currently synchronising with the rproc
>   * @dump_segments: list of segments in the firmware
>   * @nb_vdev: number of vdev currently handled by rproc
>   */
> @@ -492,6 +513,8 @@ struct rproc {
>  	const char *firmware;
>  	void *priv;
>  	struct rproc_ops *ops;
> +	struct rproc_ops *sync_ops;

Do we really need two rproc_ops, given that both are coming from the
platform driver and the sync_flags will define which one to look at?

Can't the platform driver just provide an ops table that works with the
flags it passes?

Regards,
Bjorn

> +	struct rproc_sync_flags sync_flags;
>  	struct device dev;
>  	atomic_t power;
>  	unsigned int state;
> @@ -515,6 +538,7 @@ struct rproc {
>  	size_t table_sz;
>  	bool has_iommu;
>  	bool auto_boot;
> +	bool sync_with_rproc;
>  	struct list_head dump_segments;
>  	int nb_vdev;
>  	u8 elf_class;
> -- 
> 2.20.1
>
Mathieu Poirier May 8, 2020, 9:01 p.m. UTC | #4
On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> 
> > Add a new sync_ops to support use cases where the remoteproc
> > core is synchronising with the remote processor.  Exactly when to use
> > the synchronisation operations is directed by the flags in structure
> > rproc_sync_flags.
> > 
> 
> I'm sorry, but no matter how many times I read these patches I have to
> translate "synchronising" to "remote controlled", and given the number
> of comments clarifying this makes me feel that we could perhaps come up
> with a better name?

"remote controlled" as in "someone else is managing the remote processor" ?  It
could also mean the remoteproc core is "remote controlling" the remote
processor, exactly what it currently does today...

How about "autonomous", as in the remote processor doesn't need us to boot or
switch it off.  I'm open to any other suggestions.

> 
> > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > ---
> >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> >  1 file changed, 24 insertions(+)
> > 
> > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > index ac4082f12e8b..ceb3b2bba824 100644
> > --- a/include/linux/remoteproc.h
> > +++ b/include/linux/remoteproc.h
> > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> >  	RSC_IGNORED	= 1,
> >  };
> >  
> > +/**
> > + * struct rproc_sync_flags - platform specific flags indicating which
> > + *			      rproc_ops to use at specific times during
> > + *			      the rproc lifecycle.
> > + * @on_init: true if synchronising with the remote processor at
> > + *	     initialisation time
> > + * @after_stop: true if synchronising with the remote processor after it was
> > + *		stopped from the cmmand line
> > + * @after_crash: true if synchronising with the remote processor after
> > + *		 it has crashed
> > + */
> > +struct rproc_sync_flags {
> > +	bool on_init;
> 
> This indirectly splits the RPROC_OFFLINE state in an "offline" and
> "already-booted" state. Wouldn't it be clearer to represent this with a
> new RPROC_ALREADY_BOOTED state?
> 

I suggested that at some point in the past but it was in a different context.  I
will revisit to see how doing so could apply here.

> > +	bool after_stop;
> 
> What does it mean when this is true? That Linux can shut the remote core
> down, but someone else will start it?

It tells the remoteproc core how to interact with the remote processor after the
latter has been switched off.  For example, we could want to boot the remote
processor from the boot loader so that minimal functionality can be provided
while the kernel boots.  Once the kernel and user space are in place, the remote
processor is explicitly stopped and booted once again, but this time with a
firmware image that offers full functionality.

It could also be that the remoteproc core can stop the remote processor, but the
remote processor will automatically reboot itself.  In that case the remoteproc
core will simply synchronise with the remote processor, as it does when .on_init
== true.

> 
> > +	bool after_crash;
> 
> Similarly what is the expected steps to be taken by the core when this
> is true? Should rproc_report_crash() simply stop/start the subdevices
> and upon one of the ops somehow tell the remote controller that it can
> proceed with the recovery?

The exact same sequence of steps will be carried out as they are today, except
that if after_crash == true, the remoteproc core won't be switching the remote
processor on, exactly as it would do when on_init == true.

These flags are there to indicate how to set rproc::sync_with_rproc after
different events, that is when the remoteproc core boots, when the remoteproc
has been stopped or when it has crashed.

> 
> > +};
> > +
> >  /**
> >   * struct rproc_ops - platform-specific device handlers
> >   * @start:	power on the device and boot it
> > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> >   * @firmware: name of firmware file to be loaded
> >   * @priv: private data which belongs to the platform-specific rproc module
> >   * @ops: platform-specific start/stop rproc handlers
> > + * @sync_ops: platform-specific start/stop rproc handlers when
> > + *	      synchronising with a remote processor.
> > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> >   * @dev: virtual device for refcounting and common remoteproc behavior
> >   * @power: refcount of users who need this rproc powered up
> >   * @state: state of the device
> > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> >   * @table_sz: size of @cached_table
> >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > + * @sync_with_rproc: true if currently synchronising with the rproc
> >   * @dump_segments: list of segments in the firmware
> >   * @nb_vdev: number of vdev currently handled by rproc
> >   */
> > @@ -492,6 +513,8 @@ struct rproc {
> >  	const char *firmware;
> >  	void *priv;
> >  	struct rproc_ops *ops;
> > +	struct rproc_ops *sync_ops;
> 
> Do we really need two rproc_ops, given that both are coming from the
> platform driver and the sync_flags will define which one to look at?
> 
> Can't the platform driver just provide an ops table that works with the
> flags it passes?

That is the approach Loic took in a previous patchset [1] and that was rejected.
It also lead to all of the platform drivers testing rproc->flag before carring
different actions, something you indicated could be done in the core.  This
patch does exactly that, i.e move the testing of rproc->flag to the core and
calls the right function based on that.

The end result is the same and I'm happy with one or the other, I will need to
know which one.

The advantage with the approach I'm proposing is that everything is controlled
in the core, i.e what ops is called and when to set rproc->flag based on
different states the remote processor transitions through.

Thanks,
Mathieu


[1]. https://patchwork.kernel.org/patch/11265869/

> 
> Regards,
> Bjorn
> 
> > +	struct rproc_sync_flags sync_flags;
> >  	struct device dev;
> >  	atomic_t power;
> >  	unsigned int state;
> > @@ -515,6 +538,7 @@ struct rproc {
> >  	size_t table_sz;
> >  	bool has_iommu;
> >  	bool auto_boot;
> > +	bool sync_with_rproc;
> >  	struct list_head dump_segments;
> >  	int nb_vdev;
> >  	u8 elf_class;
> > -- 
> > 2.20.1
> >
Bjorn Andersson May 14, 2020, 1:32 a.m. UTC | #5
On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:

> On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> > 
> > > Add a new sync_ops to support use cases where the remoteproc
> > > core is synchronising with the remote processor.  Exactly when to use
> > > the synchronisation operations is directed by the flags in structure
> > > rproc_sync_flags.
> > > 
> > 
> > I'm sorry, but no matter how many times I read these patches I have to
> > translate "synchronising" to "remote controlled", and given the number
> > of comments clarifying this makes me feel that we could perhaps come up
> > with a better name?
> 
> "remote controlled" as in "someone else is managing the remote processor" ?
> It could also mean the remoteproc core is "remote controlling" the
> remote processor, exactly what it currently does today...
> 

You're right and this would certainly not help the confusion.

> How about "autonomous", as in the remote processor doesn't need us to boot or
> switch it off.  I'm open to any other suggestions.
> 
> > 
> > > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > > ---
> > >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> > >  1 file changed, 24 insertions(+)
> > > 
> > > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > > index ac4082f12e8b..ceb3b2bba824 100644
> > > --- a/include/linux/remoteproc.h
> > > +++ b/include/linux/remoteproc.h
> > > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> > >  	RSC_IGNORED	= 1,
> > >  };
> > >  
> > > +/**
> > > + * struct rproc_sync_flags - platform specific flags indicating which
> > > + *			      rproc_ops to use at specific times during
> > > + *			      the rproc lifecycle.
> > > + * @on_init: true if synchronising with the remote processor at
> > > + *	     initialisation time
> > > + * @after_stop: true if synchronising with the remote processor after it was
> > > + *		stopped from the cmmand line
> > > + * @after_crash: true if synchronising with the remote processor after
> > > + *		 it has crashed
> > > + */
> > > +struct rproc_sync_flags {
> > > +	bool on_init;
> > 
> > This indirectly splits the RPROC_OFFLINE state in an "offline" and
> > "already-booted" state. Wouldn't it be clearer to represent this with a
> > new RPROC_ALREADY_BOOTED state?
> > 
> 
> I suggested that at some point in the past but it was in a different context.  I
> will revisit to see how doing so could apply here.
> 

How about we introduce a new state named DETACHED and make the platform
drivers specify that the remote processor is in either OFFLINE (as
today) or DETACHED during initialization.

Then on_init = true would be the action of going from DETACHED to
RUNNING, which would involve the following actions:

1) find resource table
2) prepare device (?)
3) handle resources
4) allocate carveouts (?)
5) prepare subdevices
6) "attach"
7) start subdevices

on_init = false would represent the transition from OFFLINE to RUNNING,
which today involve the following actions:

1) request firmware
2) prepare device
3) parse fw
4) handle resources
5) allocate carveouts
6) load segments
7) find resource table
8) prepare subdevices
9) "boot"
10) start subdevices

> > > +	bool after_stop;
> > 
> > What does it mean when this is true? That Linux can shut the remote core
> > down, but someone else will start it?
> 
> It tells the remoteproc core how to interact with the remote processor after the
> latter has been switched off.

Understood.

> For example, we could want to boot the remote
> processor from the boot loader so that minimal functionality can be provided
> while the kernel boots.  Once the kernel and user space are in place, the remote
> processor is explicitly stopped and booted once again, but this time with a
> firmware image that offers full functionality.
> 

This would be the { on_init = true, after_stop = false } use case, with
the new state would relate to the journey of DETACHED -> RUNNING ->
OFFLINE.

As such the next boot would represent above OFFLINE -> RUNNING case,
which we already support today.

> It could also be that the remoteproc core can stop the remote processor, but the
> remote processor will automatically reboot itself.  In that case the remoteproc
> core will simply synchronise with the remote processor, as it does when .on_init
> == true.
> 

I've not been able to come up with a reasonable use case for the {
on_init = ture, after_stop = true } scenario.

But Wendy previously talked about the need to "detach" Linux from a
running remote processor, by somehow just letting it know that the
communication is down - to allow Linux to be rebooted while the remote
was running. So if we support a transition from RUNNING to DETACHED
using a sequence of something like:

1) stop subdevices
2) "detach"
3) unprepare subdevices
4) release carveouts (?)
5) unprepare device (?)

Then perhaps the after_stop could naturally be the transition from
DETACHED to RUNNING, either with or without a reboot of the system
in between?

> > 
> > > +	bool after_crash;
> > 
> > Similarly what is the expected steps to be taken by the core when this
> > is true? Should rproc_report_crash() simply stop/start the subdevices
> > and upon one of the ops somehow tell the remote controller that it can
> > proceed with the recovery?
> 
> The exact same sequence of steps will be carried out as they are today, except
> that if after_crash == true, the remoteproc core won't be switching the remote
> processor on, exactly as it would do when on_init == true.
> 

Just to make sure we're on the same page:

after_crash = false is what we have today, and would mean:

1) stop subdevices
2) power off
3) unprepare subdevices
4) generate coredump
5) request firmware
6) load segments
7) find resource table
8) prepare subdevices
9) "boot"
10) start subdevices

after_crash = true would mean:

1) stop subdevices
2) "detach"
3) unprepare subdevices
4) prepare subdevices
5) "attach"
6) start subdevices

State diagram wise both of these would represent the transition RUNNING
-> CRASHED -> RUNNING, but somehow the platform driver needs to be able
to specify which of these sequences to perform. Per your naming
suggestion above, this does sound like a "autonomous_recovery" boolean
to me.

> These flags are there to indicate how to set rproc::sync_with_rproc after
> different events, that is when the remoteproc core boots, when the remoteproc
> has been stopped or when it has crashed.
> 

Right, that was clear from your patches. Sorry that my reply didn't
convey the information that I had understood this.

> > 
> > > +};
> > > +
> > >  /**
> > >   * struct rproc_ops - platform-specific device handlers
> > >   * @start:	power on the device and boot it
> > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > >   * @firmware: name of firmware file to be loaded
> > >   * @priv: private data which belongs to the platform-specific rproc module
> > >   * @ops: platform-specific start/stop rproc handlers
> > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > + *	      synchronising with a remote processor.
> > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > >   * @power: refcount of users who need this rproc powered up
> > >   * @state: state of the device
> > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > >   * @table_sz: size of @cached_table
> > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > >   * @dump_segments: list of segments in the firmware
> > >   * @nb_vdev: number of vdev currently handled by rproc
> > >   */
> > > @@ -492,6 +513,8 @@ struct rproc {
> > >  	const char *firmware;
> > >  	void *priv;
> > >  	struct rproc_ops *ops;
> > > +	struct rproc_ops *sync_ops;
> > 
> > Do we really need two rproc_ops, given that both are coming from the
> > platform driver and the sync_flags will define which one to look at?
> > 
> > Can't the platform driver just provide an ops table that works with the
> > flags it passes?
> 
> That is the approach Loic took in a previous patchset [1] and that was rejected.
> It also lead to all of the platform drivers testing rproc->flag before carring
> different actions, something you indicated could be done in the core.  This
> patch does exactly that, i.e move the testing of rproc->flag to the core and
> calls the right function based on that.
> 

I think I see what you mean, as we use "start" for both syncing and
starting the core, a { on_init = true, after_stop = false } setup either
needs two tables or force conditionals on the platform driver.

> The end result is the same and I'm happy with one or the other, I will need to
> know which one.
> 

How about adding a new ops named "attach" to rproc_ops, which the
platform driver can specify if it supports attaching an already running
processor?

> The advantage with the approach I'm proposing is that everything is controlled
> in the core, i.e what ops is called and when to set rproc->flag based on
> different states the remote processor transitions through.
> 

I still think keeping things in the core is the right thing to do.


Please let me know what you think!

PS. If we agree on this the three transitions becomes somewhat
independent, so I think it makes sense to first land support for the
DETACHED -> RUNNING transition (and the stm32 series), then follow up
with RUNNING -> DETACHED and autonomous recovery separately.

Regards,
Bjorn

> Thanks,
> Mathieu
> 
> 
> [1]. https://patchwork.kernel.org/patch/11265869/
> 
> > 
> > Regards,
> > Bjorn
> > 
> > > +	struct rproc_sync_flags sync_flags;
> > >  	struct device dev;
> > >  	atomic_t power;
> > >  	unsigned int state;
> > > @@ -515,6 +538,7 @@ struct rproc {
> > >  	size_t table_sz;
> > >  	bool has_iommu;
> > >  	bool auto_boot;
> > > +	bool sync_with_rproc;
> > >  	struct list_head dump_segments;
> > >  	int nb_vdev;
> > >  	u8 elf_class;
> > > -- 
> > > 2.20.1
> > >
Mathieu Poirier May 15, 2020, 7:24 p.m. UTC | #6
Good day Bjorn,

On Wed, May 13, 2020 at 06:32:24PM -0700, Bjorn Andersson wrote:
> On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:
> 
> > On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> > > 
> > > > Add a new sync_ops to support use cases where the remoteproc
> > > > core is synchronising with the remote processor.  Exactly when to use
> > > > the synchronisation operations is directed by the flags in structure
> > > > rproc_sync_flags.
> > > > 
> > > 
> > > I'm sorry, but no matter how many times I read these patches I have to
> > > translate "synchronising" to "remote controlled", and given the number
> > > of comments clarifying this makes me feel that we could perhaps come up
> > > with a better name?
> > 
> > "remote controlled" as in "someone else is managing the remote processor" ?
> > It could also mean the remoteproc core is "remote controlling" the
> > remote processor, exactly what it currently does today...
> > 
> 
> You're right and this would certainly not help the confusion.
> 
> > How about "autonomous", as in the remote processor doesn't need us to boot or
> > switch it off.  I'm open to any other suggestions.
> > 
> > > 
> > > > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > > > ---
> > > >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> > > >  1 file changed, 24 insertions(+)
> > > > 
> > > > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > > > index ac4082f12e8b..ceb3b2bba824 100644
> > > > --- a/include/linux/remoteproc.h
> > > > +++ b/include/linux/remoteproc.h
> > > > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> > > >  	RSC_IGNORED	= 1,
> > > >  };
> > > >  
> > > > +/**
> > > > + * struct rproc_sync_flags - platform specific flags indicating which
> > > > + *			      rproc_ops to use at specific times during
> > > > + *			      the rproc lifecycle.
> > > > + * @on_init: true if synchronising with the remote processor at
> > > > + *	     initialisation time
> > > > + * @after_stop: true if synchronising with the remote processor after it was
> > > > + *		stopped from the cmmand line
> > > > + * @after_crash: true if synchronising with the remote processor after
> > > > + *		 it has crashed
> > > > + */
> > > > +struct rproc_sync_flags {
> > > > +	bool on_init;
> > > 
> > > This indirectly splits the RPROC_OFFLINE state in an "offline" and
> > > "already-booted" state. Wouldn't it be clearer to represent this with a
> > > new RPROC_ALREADY_BOOTED state?
> > > 
> > 
> > I suggested that at some point in the past but it was in a different context.  I
> > will revisit to see how doing so could apply here.
> > 
> 
> How about we introduce a new state named DETACHED and make the platform
> drivers specify that the remote processor is in either OFFLINE (as
> today) or DETACHED during initialization.

That is certainly an idea that is growing on me.  Up to now I used the on_init
flag to express duality in the OFFLINE state.  But based on the comments that came
back from yourself, Arnaud and Suman it is clear that my approach is anything
but clear.  As such I am eager to try something else.

> 
> Then on_init = true would be the action of going from DETACHED to
> RUNNING, which would involve the following actions:
> 
> 1) find resource table
> 2) prepare device (?)
> 3) handle resources
> 4) allocate carveouts (?)
> 5) prepare subdevices
> 6) "attach"
> 7) start subdevices
> 
> on_init = false would represent the transition from OFFLINE to RUNNING,
> which today involve the following actions:
> 
> 1) request firmware
> 2) prepare device
> 3) parse fw
> 4) handle resources
> 5) allocate carveouts
> 6) load segments
> 7) find resource table
> 8) prepare subdevices
> 9) "boot"
> 10) start subdevices

If we add a DETACHED state I don't see a scenario where we need the on_init
variable.  When DETACHED is set by the platform we know the MCU is running and
it becomes a matter of when the core attach to it, i.e at initialisation time or
once the kernel has finished booting, and that is already taken care of by the
auto_boot variable.  

The steps you have outlined above to describe the transitions are accurate.

> 
> > > > +	bool after_stop;
> > > 
> > > What does it mean when this is true? That Linux can shut the remote core
> > > down, but someone else will start it?
> > 
> > It tells the remoteproc core how to interact with the remote processor after the
> > latter has been switched off.
> 
> Understood.
> 
> > For example, we could want to boot the remote
> > processor from the boot loader so that minimal functionality can be provided
> > while the kernel boots.  Once the kernel and user space are in place, the remote
> > processor is explicitly stopped and booted once again, but this time with a
> > firmware image that offers full functionality.
> > 
> 
> This would be the { on_init = true, after_stop = false } use case, with
> the new state would relate to the journey of DETACHED -> RUNNING ->
> OFFLINE.

Yes

> 
> As such the next boot would represent above OFFLINE -> RUNNING case,
> which we already support today.

Correct.  This is the level of functionality sought by ST and TI.  Xilinx seems to
have the same requirements as well.

> 
> > It could also be that the remoteproc core can stop the remote processor, but the
> > remote processor will automatically reboot itself.  In that case the remoteproc
> > core will simply synchronise with the remote processor, as it does when .on_init
> > == true.
> > 
> 
> I've not been able to come up with a reasonable use case for the {
> on_init = ture, after_stop = true } scenario.

That one is a little trickier - see the next comment.

> 
> But Wendy previously talked about the need to "detach" Linux from a
> running remote processor, by somehow just letting it know that the
> communication is down - to allow Linux to be rebooted while the remote
> was running. So if we support a transition from RUNNING to DETACHED
> using a sequence of something like:
> 
> 1) stop subdevices
> 2) "detach"
> 3) unprepare subdevices
> 4) release carveouts (?)
> 5) unprepare device (?)
> 
> Then perhaps the after_stop could naturally be the transition from
> DETACHED to RUNNING, either with or without a reboot of the system
> in between?

I see two scenarios for after_stop == true:

1) A "detach" scenario as you mentioned above.  In this case the stop() function
would inform (using a mechanism that is platform specific) the MCU that the core
is shutting down.  In this case the MCU would put itself back in "waiting mode",
waiting for the core to show signs of life again.  On the core side this would
be a DETACHED to RUNNING transition.  Wheter the application processor reboots
or not should not be relevant to the MCU.

2) An "MCU reboot in autonomous mode" scenario.  Here the stop() function would
switch off the MCU.  From there the MCU could automatically restarts itself or
be restarted by some other entity.  In this scenario I would expect the start()
function to block until the MCU is ready to proceed with the rest of the
remoteproc core initialisation steps.

From a remoteproc core perspective, both are handled by a DETACHED -> RUNNING
transition.  This is the functionality NXP is looking for.   

> 
> > > 
> > > > +	bool after_crash;
> > > 
> > > Similarly what is the expected steps to be taken by the core when this
> > > is true? Should rproc_report_crash() simply stop/start the subdevices
> > > and upon one of the ops somehow tell the remote controller that it can
> > > proceed with the recovery?
> > 
> > The exact same sequence of steps will be carried out as they are today, except
> > that if after_crash == true, the remoteproc core won't be switching the remote
> > processor on, exactly as it would do when on_init == true.
> > 
> 
> Just to make sure we're on the same page:
> 
> after_crash = false is what we have today, and would mean:
> 
> 1) stop subdevices
> 2) power off
> 3) unprepare subdevices
> 4) generate coredump
> 5) request firmware
> 6) load segments
> 7) find resource table
> 8) prepare subdevices
> 9) "boot"
> 10) start subdevices

Exactly

> 
> after_crash = true would mean:
> 
> 1) stop subdevices
> 2) "detach"
> 3) unprepare subdevices
> 4) prepare subdevices
> 5) "attach"
> 6) start subdevices
>

Yes
 
> State diagram wise both of these would represent the transition RUNNING
> -> CRASHED -> RUNNING, but somehow the platform driver needs to be able
> to specify which of these sequences to perform. Per your naming
> suggestion above, this does sound like a "autonomous_recovery" boolean
> to me.

Right, semantically "rproc->autonomous" would apply quite well.

In function rproc_crash_handler_work(), a call to rproc_set_sync_flag() has been
strategically placed to set the value of rproc->autonomous based on
"after_crash".  From there the core knows which rproc_ops to use.  Here too we
have to rely on the rproc_ops provided by the platform to do the right thing
based on the scenario to enact.

> 
> > These flags are there to indicate how to set rproc::sync_with_rproc after
> > different events, that is when the remoteproc core boots, when the remoteproc
> > has been stopped or when it has crashed.
> > 
> 
> Right, that was clear from your patches. Sorry that my reply didn't
> convey the information that I had understood this.
> 
> > > 
> > > > +};
> > > > +
> > > >  /**
> > > >   * struct rproc_ops - platform-specific device handlers
> > > >   * @start:	power on the device and boot it
> > > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > > >   * @firmware: name of firmware file to be loaded
> > > >   * @priv: private data which belongs to the platform-specific rproc module
> > > >   * @ops: platform-specific start/stop rproc handlers
> > > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > > + *	      synchronising with a remote processor.
> > > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > > >   * @power: refcount of users who need this rproc powered up
> > > >   * @state: state of the device
> > > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > > >   * @table_sz: size of @cached_table
> > > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > > >   * @dump_segments: list of segments in the firmware
> > > >   * @nb_vdev: number of vdev currently handled by rproc
> > > >   */
> > > > @@ -492,6 +513,8 @@ struct rproc {
> > > >  	const char *firmware;
> > > >  	void *priv;
> > > >  	struct rproc_ops *ops;
> > > > +	struct rproc_ops *sync_ops;
> > > 
> > > Do we really need two rproc_ops, given that both are coming from the
> > > platform driver and the sync_flags will define which one to look at?
> > > 
> > > Can't the platform driver just provide an ops table that works with the
> > > flags it passes?
> > 
> > That is the approach Loic took in a previous patchset [1] and that was rejected.
> > It also lead to all of the platform drivers testing rproc->flag before carring
> > different actions, something you indicated could be done in the core.  This
> > patch does exactly that, i.e move the testing of rproc->flag to the core and
> > calls the right function based on that.
> > 
> 
> I think I see what you mean, as we use "start" for both syncing and
> starting the core, a { on_init = true, after_stop = false } setup either
> needs two tables or force conditionals on the platform driver.
> 
> > The end result is the same and I'm happy with one or the other, I will need to
> > know which one.
> > 
> 
> How about adding a new ops named "attach" to rproc_ops, which the
> platform driver can specify if it supports attaching an already running
> processor?

Using "attach_ops" works for me.  But would "autonomous_ops", to correlate with
rproc::autonomous, add clarity?  Either way work equally well for me. 

> 
> > The advantage with the approach I'm proposing is that everything is controlled
> > in the core, i.e what ops is called and when to set rproc->flag based on
> > different states the remote processor transitions through.
> > 
> 
> I still think keeping things in the core is the right thing to do.
>

Let's continue down that path then.
 
> 
> Please let me know what you think!

From the above conversion I believe our views are pretty much aligned.

> 
> PS. If we agree on this the three transitions becomes somewhat
> independent, so I think it makes sense to first land support for the
> DETACHED -> RUNNING transition (and the stm32 series), then follow up
> with RUNNING -> DETACHED and autonomous recovery separately.

We can certainly proceed that way.

Thanks for the time,
Mathieu

> 
> Regards,
> Bjorn
> 
> > Thanks,
> > Mathieu
> > 
> > 
> > [1]. https://patchwork.kernel.org/patch/11265869/
> > 
> > > 
> > > Regards,
> > > Bjorn
> > > 
> > > > +	struct rproc_sync_flags sync_flags;
> > > >  	struct device dev;
> > > >  	atomic_t power;
> > > >  	unsigned int state;
> > > > @@ -515,6 +538,7 @@ struct rproc {
> > > >  	size_t table_sz;
> > > >  	bool has_iommu;
> > > >  	bool auto_boot;
> > > > +	bool sync_with_rproc;
> > > >  	struct list_head dump_segments;
> > > >  	int nb_vdev;
> > > >  	u8 elf_class;
> > > > -- 
> > > > 2.20.1
> > > >
Bjorn Andersson May 19, 2020, 12:55 a.m. UTC | #7
On Fri 15 May 12:24 PDT 2020, Mathieu Poirier wrote:

> Good day Bjorn,
> 
> On Wed, May 13, 2020 at 06:32:24PM -0700, Bjorn Andersson wrote:
> > On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:
> > 
> > > On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > > > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> > > > 
> > > > > Add a new sync_ops to support use cases where the remoteproc
> > > > > core is synchronising with the remote processor.  Exactly when to use
> > > > > the synchronisation operations is directed by the flags in structure
> > > > > rproc_sync_flags.
> > > > > 
> > > > 
> > > > I'm sorry, but no matter how many times I read these patches I have to
> > > > translate "synchronising" to "remote controlled", and given the number
> > > > of comments clarifying this makes me feel that we could perhaps come up
> > > > with a better name?
> > > 
> > > "remote controlled" as in "someone else is managing the remote processor" ?
> > > It could also mean the remoteproc core is "remote controlling" the
> > > remote processor, exactly what it currently does today...
> > > 
> > 
> > You're right and this would certainly not help the confusion.
> > 
> > > How about "autonomous", as in the remote processor doesn't need us to boot or
> > > switch it off.  I'm open to any other suggestions.
> > > 
> > > > 
> > > > > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > > > > ---
> > > > >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> > > > >  1 file changed, 24 insertions(+)
> > > > > 
> > > > > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > > > > index ac4082f12e8b..ceb3b2bba824 100644
> > > > > --- a/include/linux/remoteproc.h
> > > > > +++ b/include/linux/remoteproc.h
> > > > > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> > > > >  	RSC_IGNORED	= 1,
> > > > >  };
> > > > >  
> > > > > +/**
> > > > > + * struct rproc_sync_flags - platform specific flags indicating which
> > > > > + *			      rproc_ops to use at specific times during
> > > > > + *			      the rproc lifecycle.
> > > > > + * @on_init: true if synchronising with the remote processor at
> > > > > + *	     initialisation time
> > > > > + * @after_stop: true if synchronising with the remote processor after it was
> > > > > + *		stopped from the cmmand line
> > > > > + * @after_crash: true if synchronising with the remote processor after
> > > > > + *		 it has crashed
> > > > > + */
> > > > > +struct rproc_sync_flags {
> > > > > +	bool on_init;
> > > > 
> > > > This indirectly splits the RPROC_OFFLINE state in an "offline" and
> > > > "already-booted" state. Wouldn't it be clearer to represent this with a
> > > > new RPROC_ALREADY_BOOTED state?
> > > > 
> > > 
> > > I suggested that at some point in the past but it was in a different context.  I
> > > will revisit to see how doing so could apply here.
> > > 
> > 
> > How about we introduce a new state named DETACHED and make the platform
> > drivers specify that the remote processor is in either OFFLINE (as
> > today) or DETACHED during initialization.
> 
> That is certainly an idea that is growing on me.  Up to now I used the on_init
> flag to express duality in the OFFLINE state.  But based on the comments that came
> back from yourself, Arnaud and Suman it is clear that my approach is anything
> but clear.  As such I am eager to try something else.
> 
> > 
> > Then on_init = true would be the action of going from DETACHED to
> > RUNNING, which would involve the following actions:
> > 
> > 1) find resource table
> > 2) prepare device (?)
> > 3) handle resources
> > 4) allocate carveouts (?)
> > 5) prepare subdevices
> > 6) "attach"
> > 7) start subdevices
> > 
> > on_init = false would represent the transition from OFFLINE to RUNNING,
> > which today involve the following actions:
> > 
> > 1) request firmware
> > 2) prepare device
> > 3) parse fw
> > 4) handle resources
> > 5) allocate carveouts
> > 6) load segments
> > 7) find resource table
> > 8) prepare subdevices
> > 9) "boot"
> > 10) start subdevices
> 
> If we add a DETACHED state I don't see a scenario where we need the on_init
> variable.  When DETACHED is set by the platform we know the MCU is running and
> it becomes a matter of when the core attach to it, i.e at initialisation time or
> once the kernel has finished booting, and that is already taken care of by the
> auto_boot variable.  
> 
> The steps you have outlined above to describe the transitions are accurate.
> 

Thanks for confirming.

I think it would be helpful if we had this properly documented in the
driver, to facilitate reasoning about the various transitions. I'll try
to write down my notes in a patch and send it out.

> > 
> > > > > +	bool after_stop;
> > > > 
> > > > What does it mean when this is true? That Linux can shut the remote core
> > > > down, but someone else will start it?
> > > 
> > > It tells the remoteproc core how to interact with the remote processor after the
> > > latter has been switched off.
> > 
> > Understood.
> > 
> > > For example, we could want to boot the remote
> > > processor from the boot loader so that minimal functionality can be provided
> > > while the kernel boots.  Once the kernel and user space are in place, the remote
> > > processor is explicitly stopped and booted once again, but this time with a
> > > firmware image that offers full functionality.
> > > 
> > 
> > This would be the { on_init = true, after_stop = false } use case, with
> > the new state would relate to the journey of DETACHED -> RUNNING ->
> > OFFLINE.
> 
> Yes
> 
> > 
> > As such the next boot would represent above OFFLINE -> RUNNING case,
> > which we already support today.
> 
> Correct.  This is the level of functionality sought by ST and TI.  Xilinx seems to
> have the same requirements as well.
> 

Good.

> > 
> > > It could also be that the remoteproc core can stop the remote processor, but the
> > > remote processor will automatically reboot itself.  In that case the remoteproc
> > > core will simply synchronise with the remote processor, as it does when .on_init
> > > == true.
> > > 
> > 
> > I've not been able to come up with a reasonable use case for the {
> > on_init = ture, after_stop = true } scenario.
> 
> That one is a little trickier - see the next comment.
> 
> > 
> > But Wendy previously talked about the need to "detach" Linux from a
> > running remote processor, by somehow just letting it know that the
> > communication is down - to allow Linux to be rebooted while the remote
> > was running. So if we support a transition from RUNNING to DETACHED
> > using a sequence of something like:
> > 
> > 1) stop subdevices
> > 2) "detach"
> > 3) unprepare subdevices
> > 4) release carveouts (?)
> > 5) unprepare device (?)
> > 
> > Then perhaps the after_stop could naturally be the transition from
> > DETACHED to RUNNING, either with or without a reboot of the system
> > in between?
> 
> I see two scenarios for after_stop == true:
> 
> 1) A "detach" scenario as you mentioned above.  In this case the stop() function
> would inform (using a mechanism that is platform specific) the MCU that the core
> is shutting down.  In this case the MCU would put itself back in "waiting mode",
> waiting for the core to show signs of life again.  On the core side this would
> be a DETACHED to RUNNING transition.  Wheter the application processor reboots
> or not should not be relevant to the MCU.
> 

Right and after reading the stm32 patches, for drivers with a way to
"detach" the remote, i.e. put it back in DETACHED state, a
rmmod/modprobe should conceptually fit very well.

> 2) An "MCU reboot in autonomous mode" scenario.  Here the stop() function would
> switch off the MCU.  From there the MCU could automatically restarts itself or
> be restarted by some other entity.  In this scenario I would expect the start()
> function to block until the MCU is ready to proceed with the rest of the
> remoteproc core initialisation steps.
> 

Presumably though the NXP driver wouldn't have a mechanism to "start"
the core, only to "attach" to it. And that would wait for it to be up
and running again.

> From a remoteproc core perspective, both are handled by a DETACHED -> RUNNING
> transition.  This is the functionality NXP is looking for.   
> 

Agreed.

> > 
> > > > 
> > > > > +	bool after_crash;
> > > > 
> > > > Similarly what is the expected steps to be taken by the core when this
> > > > is true? Should rproc_report_crash() simply stop/start the subdevices
> > > > and upon one of the ops somehow tell the remote controller that it can
> > > > proceed with the recovery?
> > > 
> > > The exact same sequence of steps will be carried out as they are today, except
> > > that if after_crash == true, the remoteproc core won't be switching the remote
> > > processor on, exactly as it would do when on_init == true.
> > > 
> > 
> > Just to make sure we're on the same page:
> > 
> > after_crash = false is what we have today, and would mean:
> > 
> > 1) stop subdevices
> > 2) power off
> > 3) unprepare subdevices
> > 4) generate coredump
> > 5) request firmware
> > 6) load segments
> > 7) find resource table
> > 8) prepare subdevices
> > 9) "boot"
> > 10) start subdevices
> 
> Exactly
> 
> > 
> > after_crash = true would mean:
> > 
> > 1) stop subdevices
> > 2) "detach"
> > 3) unprepare subdevices
> > 4) prepare subdevices
> > 5) "attach"
> > 6) start subdevices
> >
> 
> Yes
>  
> > State diagram wise both of these would represent the transition RUNNING
> > -> CRASHED -> RUNNING, but somehow the platform driver needs to be able
> > to specify which of these sequences to perform. Per your naming
> > suggestion above, this does sound like a "autonomous_recovery" boolean
> > to me.
> 
> Right, semantically "rproc->autonomous" would apply quite well.
> 
> In function rproc_crash_handler_work(), a call to rproc_set_sync_flag() has been
> strategically placed to set the value of rproc->autonomous based on
> "after_crash".  From there the core knows which rproc_ops to use.  Here too we
> have to rely on the rproc_ops provided by the platform to do the right thing
> based on the scenario to enact.
> 

Do you think that autonomous_recovery would be something that changes
for a given remoteproc instance? I envisioned it as something that you
know at registration time, but perhaps I'm missing some details here.

> > 
> > > These flags are there to indicate how to set rproc::sync_with_rproc after
> > > different events, that is when the remoteproc core boots, when the remoteproc
> > > has been stopped or when it has crashed.
> > > 
> > 
> > Right, that was clear from your patches. Sorry that my reply didn't
> > convey the information that I had understood this.
> > 
> > > > 
> > > > > +};
> > > > > +
> > > > >  /**
> > > > >   * struct rproc_ops - platform-specific device handlers
> > > > >   * @start:	power on the device and boot it
> > > > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > > > >   * @firmware: name of firmware file to be loaded
> > > > >   * @priv: private data which belongs to the platform-specific rproc module
> > > > >   * @ops: platform-specific start/stop rproc handlers
> > > > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > > > + *	      synchronising with a remote processor.
> > > > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > > > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > > > >   * @power: refcount of users who need this rproc powered up
> > > > >   * @state: state of the device
> > > > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > > > >   * @table_sz: size of @cached_table
> > > > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > > > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > > > >   * @dump_segments: list of segments in the firmware
> > > > >   * @nb_vdev: number of vdev currently handled by rproc
> > > > >   */
> > > > > @@ -492,6 +513,8 @@ struct rproc {
> > > > >  	const char *firmware;
> > > > >  	void *priv;
> > > > >  	struct rproc_ops *ops;
> > > > > +	struct rproc_ops *sync_ops;
> > > > 
> > > > Do we really need two rproc_ops, given that both are coming from the
> > > > platform driver and the sync_flags will define which one to look at?
> > > > 
> > > > Can't the platform driver just provide an ops table that works with the
> > > > flags it passes?
> > > 
> > > That is the approach Loic took in a previous patchset [1] and that was rejected.
> > > It also lead to all of the platform drivers testing rproc->flag before carring
> > > different actions, something you indicated could be done in the core.  This
> > > patch does exactly that, i.e move the testing of rproc->flag to the core and
> > > calls the right function based on that.
> > > 
> > 
> > I think I see what you mean, as we use "start" for both syncing and
> > starting the core, a { on_init = true, after_stop = false } setup either
> > needs two tables or force conditionals on the platform driver.
> > 
> > > The end result is the same and I'm happy with one or the other, I will need to
> > > know which one.
> > > 
> > 
> > How about adding a new ops named "attach" to rproc_ops, which the
> > platform driver can specify if it supports attaching an already running
> > processor?
> 
> Using "attach_ops" works for me.  But would "autonomous_ops", to correlate with
> rproc::autonomous, add clarity?  Either way work equally well for me. 
> 

What I meant was that we add a function "attach" to the existing
rproc_ops. In the case of OFFLINE->RUNNING we continue to call
rproc->ops->start() and DETACHED->RUNNING we call this
rproc->ops->attach().

As I thought about this I saw that the "autonomous" part would only
apply to the scenario where the remote recovers from crashes by itself
(and we just need to be in sync with that). But I've not yet fully
thought through the NXP case of a stopped remote processor restarting by
itself.

> > 
> > > The advantage with the approach I'm proposing is that everything is controlled
> > > in the core, i.e what ops is called and when to set rproc->flag based on
> > > different states the remote processor transitions through.
> > > 
> > 
> > I still think keeping things in the core is the right thing to do.
> >
> 
> Let's continue down that path then.
>  
> > 
> > Please let me know what you think!
> 
> From the above conversion I believe our views are pretty much aligned.
> 

I share this belief and am looking forward to v4.

Regards,
Bjorn

> > 
> > PS. If we agree on this the three transitions becomes somewhat
> > independent, so I think it makes sense to first land support for the
> > DETACHED -> RUNNING transition (and the stm32 series), then follow up
> > with RUNNING -> DETACHED and autonomous recovery separately.
> 
> We can certainly proceed that way.
> 
> Thanks for the time,
> Mathieu
> 
> > 
> > Regards,
> > Bjorn
> > 
> > > Thanks,
> > > Mathieu
> > > 
> > > 
> > > [1]. https://patchwork.kernel.org/patch/11265869/
> > > 
> > > > 
> > > > Regards,
> > > > Bjorn
> > > > 
> > > > > +	struct rproc_sync_flags sync_flags;
> > > > >  	struct device dev;
> > > > >  	atomic_t power;
> > > > >  	unsigned int state;
> > > > > @@ -515,6 +538,7 @@ struct rproc {
> > > > >  	size_t table_sz;
> > > > >  	bool has_iommu;
> > > > >  	bool auto_boot;
> > > > > +	bool sync_with_rproc;
> > > > >  	struct list_head dump_segments;
> > > > >  	int nb_vdev;
> > > > >  	u8 elf_class;
> > > > > -- 
> > > > > 2.20.1
> > > > >
Mathieu Poirier May 20, 2020, 10:06 p.m. UTC | #8
On Mon, May 18, 2020 at 05:55:00PM -0700, Bjorn Andersson wrote:
> On Fri 15 May 12:24 PDT 2020, Mathieu Poirier wrote:
> 
> > Good day Bjorn,
> > 
> > On Wed, May 13, 2020 at 06:32:24PM -0700, Bjorn Andersson wrote:
> > > On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:
> > > 
> > > > On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > > > > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> > > > > 
> > > > > > Add a new sync_ops to support use cases where the remoteproc
> > > > > > core is synchronising with the remote processor.  Exactly when to use
> > > > > > the synchronisation operations is directed by the flags in structure
> > > > > > rproc_sync_flags.
> > > > > > 
> > > > > 
> > > > > I'm sorry, but no matter how many times I read these patches I have to
> > > > > translate "synchronising" to "remote controlled", and given the number
> > > > > of comments clarifying this makes me feel that we could perhaps come up
> > > > > with a better name?
> > > > 
> > > > "remote controlled" as in "someone else is managing the remote processor" ?
> > > > It could also mean the remoteproc core is "remote controlling" the
> > > > remote processor, exactly what it currently does today...
> > > > 
> > > 
> > > You're right and this would certainly not help the confusion.
> > > 
> > > > How about "autonomous", as in the remote processor doesn't need us to boot or
> > > > switch it off.  I'm open to any other suggestions.
> > > > 
> > > > > 
> > > > > > Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
> > > > > > ---
> > > > > >  include/linux/remoteproc.h | 24 ++++++++++++++++++++++++
> > > > > >  1 file changed, 24 insertions(+)
> > > > > > 
> > > > > > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
> > > > > > index ac4082f12e8b..ceb3b2bba824 100644
> > > > > > --- a/include/linux/remoteproc.h
> > > > > > +++ b/include/linux/remoteproc.h
> > > > > > @@ -353,6 +353,23 @@ enum rsc_handling_status {
> > > > > >  	RSC_IGNORED	= 1,
> > > > > >  };
> > > > > >  
> > > > > > +/**
> > > > > > + * struct rproc_sync_flags - platform specific flags indicating which
> > > > > > + *			      rproc_ops to use at specific times during
> > > > > > + *			      the rproc lifecycle.
> > > > > > + * @on_init: true if synchronising with the remote processor at
> > > > > > + *	     initialisation time
> > > > > > + * @after_stop: true if synchronising with the remote processor after it was
> > > > > > + *		stopped from the cmmand line
> > > > > > + * @after_crash: true if synchronising with the remote processor after
> > > > > > + *		 it has crashed
> > > > > > + */
> > > > > > +struct rproc_sync_flags {
> > > > > > +	bool on_init;
> > > > > 
> > > > > This indirectly splits the RPROC_OFFLINE state in an "offline" and
> > > > > "already-booted" state. Wouldn't it be clearer to represent this with a
> > > > > new RPROC_ALREADY_BOOTED state?
> > > > > 
> > > > 
> > > > I suggested that at some point in the past but it was in a different context.  I
> > > > will revisit to see how doing so could apply here.
> > > > 
> > > 
> > > How about we introduce a new state named DETACHED and make the platform
> > > drivers specify that the remote processor is in either OFFLINE (as
> > > today) or DETACHED during initialization.
> > 
> > That is certainly an idea that is growing on me.  Up to now I used the on_init
> > flag to express duality in the OFFLINE state.  But based on the comments that came
> > back from yourself, Arnaud and Suman it is clear that my approach is anything
> > but clear.  As such I am eager to try something else.
> > 
> > > 
> > > Then on_init = true would be the action of going from DETACHED to
> > > RUNNING, which would involve the following actions:
> > > 
> > > 1) find resource table
> > > 2) prepare device (?)
> > > 3) handle resources
> > > 4) allocate carveouts (?)
> > > 5) prepare subdevices
> > > 6) "attach"
> > > 7) start subdevices
> > > 
> > > on_init = false would represent the transition from OFFLINE to RUNNING,
> > > which today involve the following actions:
> > > 
> > > 1) request firmware
> > > 2) prepare device
> > > 3) parse fw
> > > 4) handle resources
> > > 5) allocate carveouts
> > > 6) load segments
> > > 7) find resource table
> > > 8) prepare subdevices
> > > 9) "boot"
> > > 10) start subdevices
> > 
> > If we add a DETACHED state I don't see a scenario where we need the on_init
> > variable.  When DETACHED is set by the platform we know the MCU is running and
> > it becomes a matter of when the core attach to it, i.e at initialisation time or
> > once the kernel has finished booting, and that is already taken care of by the
> > auto_boot variable.  
> > 
> > The steps you have outlined above to describe the transitions are accurate.
> > 
> 
> Thanks for confirming.
> 
> I think it would be helpful if we had this properly documented in the
> driver, to facilitate reasoning about the various transitions. I'll try
> to write down my notes in a patch and send it out.
> 
> > > 
> > > > > > +	bool after_stop;
> > > > > 
> > > > > What does it mean when this is true? That Linux can shut the remote core
> > > > > down, but someone else will start it?
> > > > 
> > > > It tells the remoteproc core how to interact with the remote processor after the
> > > > latter has been switched off.
> > > 
> > > Understood.
> > > 
> > > > For example, we could want to boot the remote
> > > > processor from the boot loader so that minimal functionality can be provided
> > > > while the kernel boots.  Once the kernel and user space are in place, the remote
> > > > processor is explicitly stopped and booted once again, but this time with a
> > > > firmware image that offers full functionality.
> > > > 
> > > 
> > > This would be the { on_init = true, after_stop = false } use case, with
> > > the new state would relate to the journey of DETACHED -> RUNNING ->
> > > OFFLINE.
> > 
> > Yes
> > 
> > > 
> > > As such the next boot would represent above OFFLINE -> RUNNING case,
> > > which we already support today.
> > 
> > Correct.  This is the level of functionality sought by ST and TI.  Xilinx seems to
> > have the same requirements as well.
> > 
> 
> Good.
> 
> > > 
> > > > It could also be that the remoteproc core can stop the remote processor, but the
> > > > remote processor will automatically reboot itself.  In that case the remoteproc
> > > > core will simply synchronise with the remote processor, as it does when .on_init
> > > > == true.
> > > > 
> > > 
> > > I've not been able to come up with a reasonable use case for the {
> > > on_init = ture, after_stop = true } scenario.
> > 
> > That one is a little trickier - see the next comment.
> > 
> > > 
> > > But Wendy previously talked about the need to "detach" Linux from a
> > > running remote processor, by somehow just letting it know that the
> > > communication is down - to allow Linux to be rebooted while the remote
> > > was running. So if we support a transition from RUNNING to DETACHED
> > > using a sequence of something like:
> > > 
> > > 1) stop subdevices
> > > 2) "detach"
> > > 3) unprepare subdevices
> > > 4) release carveouts (?)
> > > 5) unprepare device (?)
> > > 
> > > Then perhaps the after_stop could naturally be the transition from
> > > DETACHED to RUNNING, either with or without a reboot of the system
> > > in between?
> > 
> > I see two scenarios for after_stop == true:
> > 
> > 1) A "detach" scenario as you mentioned above.  In this case the stop() function
> > would inform (using a mechanism that is platform specific) the MCU that the core
> > is shutting down.  In this case the MCU would put itself back in "waiting mode",
> > waiting for the core to show signs of life again.  On the core side this would
> > be a DETACHED to RUNNING transition.  Wheter the application processor reboots
> > or not should not be relevant to the MCU.
> > 
> 
> Right and after reading the stm32 patches, for drivers with a way to
> "detach" the remote, i.e. put it back in DETACHED state, a
> rmmod/modprobe should conceptually fit very well.
> 
> > 2) An "MCU reboot in autonomous mode" scenario.  Here the stop() function would
> > switch off the MCU.  From there the MCU could automatically restarts itself or
> > be restarted by some other entity.  In this scenario I would expect the start()
> > function to block until the MCU is ready to proceed with the rest of the
> > remoteproc core initialisation steps.
> > 
> 
> Presumably though the NXP driver wouldn't have a mechanism to "start"
> the core, only to "attach" to it. And that would wait for it to be up
> and running again.
> 
> > From a remoteproc core perspective, both are handled by a DETACHED -> RUNNING
> > transition.  This is the functionality NXP is looking for.   
> > 
> 
> Agreed.
> 
> > > 
> > > > > 
> > > > > > +	bool after_crash;
> > > > > 
> > > > > Similarly what is the expected steps to be taken by the core when this
> > > > > is true? Should rproc_report_crash() simply stop/start the subdevices
> > > > > and upon one of the ops somehow tell the remote controller that it can
> > > > > proceed with the recovery?
> > > > 
> > > > The exact same sequence of steps will be carried out as they are today, except
> > > > that if after_crash == true, the remoteproc core won't be switching the remote
> > > > processor on, exactly as it would do when on_init == true.
> > > > 
> > > 
> > > Just to make sure we're on the same page:
> > > 
> > > after_crash = false is what we have today, and would mean:
> > > 
> > > 1) stop subdevices
> > > 2) power off
> > > 3) unprepare subdevices
> > > 4) generate coredump
> > > 5) request firmware
> > > 6) load segments
> > > 7) find resource table
> > > 8) prepare subdevices
> > > 9) "boot"
> > > 10) start subdevices
> > 
> > Exactly
> > 
> > > 
> > > after_crash = true would mean:
> > > 
> > > 1) stop subdevices
> > > 2) "detach"
> > > 3) unprepare subdevices
> > > 4) prepare subdevices
> > > 5) "attach"
> > > 6) start subdevices
> > >
> > 
> > Yes
> >  
> > > State diagram wise both of these would represent the transition RUNNING
> > > -> CRASHED -> RUNNING, but somehow the platform driver needs to be able
> > > to specify which of these sequences to perform. Per your naming
> > > suggestion above, this does sound like a "autonomous_recovery" boolean
> > > to me.
> > 
> > Right, semantically "rproc->autonomous" would apply quite well.
> > 
> > In function rproc_crash_handler_work(), a call to rproc_set_sync_flag() has been
> > strategically placed to set the value of rproc->autonomous based on
> > "after_crash".  From there the core knows which rproc_ops to use.  Here too we
> > have to rely on the rproc_ops provided by the platform to do the right thing
> > based on the scenario to enact.
> > 
> 
> Do you think that autonomous_recovery would be something that changes
> for a given remoteproc instance? I envisioned it as something that you
> know at registration time, but perhaps I'm missing some details here.

I don't envision any of the transision flags to change once they are set by the
platform.   The same applies to the new rproc_ops, it can be set only once.
Otherwise combination of possible scenarios becomes too hard to manage, leading
to situations where the core and MCU get out of sync and can't talk to each
other.

> 
> > > 
> > > > These flags are there to indicate how to set rproc::sync_with_rproc after
> > > > different events, that is when the remoteproc core boots, when the remoteproc
> > > > has been stopped or when it has crashed.
> > > > 
> > > 
> > > Right, that was clear from your patches. Sorry that my reply didn't
> > > convey the information that I had understood this.
> > > 
> > > > > 
> > > > > > +};
> > > > > > +
> > > > > >  /**
> > > > > >   * struct rproc_ops - platform-specific device handlers
> > > > > >   * @start:	power on the device and boot it
> > > > > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > > > > >   * @firmware: name of firmware file to be loaded
> > > > > >   * @priv: private data which belongs to the platform-specific rproc module
> > > > > >   * @ops: platform-specific start/stop rproc handlers
> > > > > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > > > > + *	      synchronising with a remote processor.
> > > > > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > > > > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > > > > >   * @power: refcount of users who need this rproc powered up
> > > > > >   * @state: state of the device
> > > > > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > > > > >   * @table_sz: size of @cached_table
> > > > > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > > > > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > > > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > > > > >   * @dump_segments: list of segments in the firmware
> > > > > >   * @nb_vdev: number of vdev currently handled by rproc
> > > > > >   */
> > > > > > @@ -492,6 +513,8 @@ struct rproc {
> > > > > >  	const char *firmware;
> > > > > >  	void *priv;
> > > > > >  	struct rproc_ops *ops;
> > > > > > +	struct rproc_ops *sync_ops;
> > > > > 
> > > > > Do we really need two rproc_ops, given that both are coming from the
> > > > > platform driver and the sync_flags will define which one to look at?
> > > > > 
> > > > > Can't the platform driver just provide an ops table that works with the
> > > > > flags it passes?
> > > > 
> > > > That is the approach Loic took in a previous patchset [1] and that was rejected.
> > > > It also lead to all of the platform drivers testing rproc->flag before carring
> > > > different actions, something you indicated could be done in the core.  This
> > > > patch does exactly that, i.e move the testing of rproc->flag to the core and
> > > > calls the right function based on that.
> > > > 
> > > 
> > > I think I see what you mean, as we use "start" for both syncing and
> > > starting the core, a { on_init = true, after_stop = false } setup either
> > > needs two tables or force conditionals on the platform driver.
> > > 
> > > > The end result is the same and I'm happy with one or the other, I will need to
> > > > know which one.
> > > > 
> > > 
> > > How about adding a new ops named "attach" to rproc_ops, which the
> > > platform driver can specify if it supports attaching an already running
> > > processor?
> > 
> > Using "attach_ops" works for me.  But would "autonomous_ops", to correlate with
> > rproc::autonomous, add clarity?  Either way work equally well for me. 
> > 
> 
> What I meant was that we add a function "attach" to the existing
> rproc_ops. In the case of OFFLINE->RUNNING we continue to call
> rproc->ops->start() and DETACHED->RUNNING we call this
> rproc->ops->attach().

If I read the above properly we'd end up with:

struct rproc_ops {
	int (*start)(struct rproc *rproc);
	int (*stop)(struct rproc *rproc);
	int (*attach)(struct rproc *rproc);
	int (*detach)(struct rproc *rproc);
        ...
        ...
};

But wed'd have to deal with other operations that are common to both scenarios
such as parse_fw() and find_loaded_rsc_table().  

So far lot of improvement have already been suggested on this revision.  I
suggest to spin off a new patchset that only handles the DETACHED->RUNNING
scenario and split common functions such as rproc_fw_boot().  From there we can
see if other refinements (such as what you suggest above) are mandated.

One last thing...  Upon reflecting on all this I think using "attach" is better
than "autonomous", the latter is heavy to drag around.

Thanks,
Mathieu

> 
> As I thought about this I saw that the "autonomous" part would only
> apply to the scenario where the remote recovers from crashes by itself
> (and we just need to be in sync with that). But I've not yet fully
> thought through the NXP case of a stopped remote processor restarting by
> itself.
> 
> > > 
> > > > The advantage with the approach I'm proposing is that everything is controlled
> > > > in the core, i.e what ops is called and when to set rproc->flag based on
> > > > different states the remote processor transitions through.
> > > > 
> > > 
> > > I still think keeping things in the core is the right thing to do.
> > >
> > 
> > Let's continue down that path then.
> >  
> > > 
> > > Please let me know what you think!
> > 
> > From the above conversion I believe our views are pretty much aligned.
> > 
> 
> I share this belief and am looking forward to v4.
> 
> Regards,
> Bjorn
> 
> > > 
> > > PS. If we agree on this the three transitions becomes somewhat
> > > independent, so I think it makes sense to first land support for the
> > > DETACHED -> RUNNING transition (and the stm32 series), then follow up
> > > with RUNNING -> DETACHED and autonomous recovery separately.
> > 
> > We can certainly proceed that way.
> > 
> > Thanks for the time,
> > Mathieu
> > 
> > > 
> > > Regards,
> > > Bjorn
> > > 
> > > > Thanks,
> > > > Mathieu
> > > > 
> > > > 
> > > > [1]. https://patchwork.kernel.org/patch/11265869/
> > > > 
> > > > > 
> > > > > Regards,
> > > > > Bjorn
> > > > > 
> > > > > > +	struct rproc_sync_flags sync_flags;
> > > > > >  	struct device dev;
> > > > > >  	atomic_t power;
> > > > > >  	unsigned int state;
> > > > > > @@ -515,6 +538,7 @@ struct rproc {
> > > > > >  	size_t table_sz;
> > > > > >  	bool has_iommu;
> > > > > >  	bool auto_boot;
> > > > > > +	bool sync_with_rproc;
> > > > > >  	struct list_head dump_segments;
> > > > > >  	int nb_vdev;
> > > > > >  	u8 elf_class;
> > > > > > -- 
> > > > > > 2.20.1
> > > > > >
Bjorn Andersson May 21, 2020, 5:21 a.m. UTC | #9
On Wed 20 May 15:06 PDT 2020, Mathieu Poirier wrote:

> On Mon, May 18, 2020 at 05:55:00PM -0700, Bjorn Andersson wrote:
> > On Fri 15 May 12:24 PDT 2020, Mathieu Poirier wrote:
> > 
> > > Good day Bjorn,
> > > 
> > > On Wed, May 13, 2020 at 06:32:24PM -0700, Bjorn Andersson wrote:
> > > > On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:
> > > > 
> > > > > On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > > > > > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
[..]
> > > > > > > +	bool after_crash;
> > > > > > 
> > > > > > Similarly what is the expected steps to be taken by the core when this
> > > > > > is true? Should rproc_report_crash() simply stop/start the subdevices
> > > > > > and upon one of the ops somehow tell the remote controller that it can
> > > > > > proceed with the recovery?
> > > > > 
> > > > > The exact same sequence of steps will be carried out as they are today, except
> > > > > that if after_crash == true, the remoteproc core won't be switching the remote
> > > > > processor on, exactly as it would do when on_init == true.
> > > > > 
> > > > 
> > > > Just to make sure we're on the same page:
> > > > 
> > > > after_crash = false is what we have today, and would mean:
> > > > 
> > > > 1) stop subdevices
> > > > 2) power off
> > > > 3) unprepare subdevices
> > > > 4) generate coredump
> > > > 5) request firmware
> > > > 6) load segments
> > > > 7) find resource table
> > > > 8) prepare subdevices
> > > > 9) "boot"
> > > > 10) start subdevices
> > > 
> > > Exactly
> > > 
> > > > 
> > > > after_crash = true would mean:
> > > > 
> > > > 1) stop subdevices
> > > > 2) "detach"
> > > > 3) unprepare subdevices
> > > > 4) prepare subdevices
> > > > 5) "attach"
> > > > 6) start subdevices
> > > >
> > > 
> > > Yes
> > >  
> > > > State diagram wise both of these would represent the transition RUNNING
> > > > -> CRASHED -> RUNNING, but somehow the platform driver needs to be able
> > > > to specify which of these sequences to perform. Per your naming
> > > > suggestion above, this does sound like a "autonomous_recovery" boolean
> > > > to me.
> > > 
> > > Right, semantically "rproc->autonomous" would apply quite well.
> > > 
> > > In function rproc_crash_handler_work(), a call to rproc_set_sync_flag() has been
> > > strategically placed to set the value of rproc->autonomous based on
> > > "after_crash".  From there the core knows which rproc_ops to use.  Here too we
> > > have to rely on the rproc_ops provided by the platform to do the right thing
> > > based on the scenario to enact.
> > > 
> > 
> > Do you think that autonomous_recovery would be something that changes
> > for a given remoteproc instance? I envisioned it as something that you
> > know at registration time, but perhaps I'm missing some details here.
> 
> I don't envision any of the transision flags to change once they are set by the
> platform.   The same applies to the new rproc_ops, it can be set only once.
> Otherwise combination of possible scenarios becomes too hard to manage, leading
> to situations where the core and MCU get out of sync and can't talk to each
> other.
> 

Sounds good, I share this expectation, just wanted to check with you.

> > 
> > > > 
> > > > > These flags are there to indicate how to set rproc::sync_with_rproc after
> > > > > different events, that is when the remoteproc core boots, when the remoteproc
> > > > > has been stopped or when it has crashed.
> > > > > 
> > > > 
> > > > Right, that was clear from your patches. Sorry that my reply didn't
> > > > convey the information that I had understood this.
> > > > 
> > > > > > 
> > > > > > > +};
> > > > > > > +
> > > > > > >  /**
> > > > > > >   * struct rproc_ops - platform-specific device handlers
> > > > > > >   * @start:	power on the device and boot it
> > > > > > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > > > > > >   * @firmware: name of firmware file to be loaded
> > > > > > >   * @priv: private data which belongs to the platform-specific rproc module
> > > > > > >   * @ops: platform-specific start/stop rproc handlers
> > > > > > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > > > > > + *	      synchronising with a remote processor.
> > > > > > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > > > > > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > > > > > >   * @power: refcount of users who need this rproc powered up
> > > > > > >   * @state: state of the device
> > > > > > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > > > > > >   * @table_sz: size of @cached_table
> > > > > > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > > > > > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > > > > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > > > > > >   * @dump_segments: list of segments in the firmware
> > > > > > >   * @nb_vdev: number of vdev currently handled by rproc
> > > > > > >   */
> > > > > > > @@ -492,6 +513,8 @@ struct rproc {
> > > > > > >  	const char *firmware;
> > > > > > >  	void *priv;
> > > > > > >  	struct rproc_ops *ops;
> > > > > > > +	struct rproc_ops *sync_ops;
> > > > > > 
> > > > > > Do we really need two rproc_ops, given that both are coming from the
> > > > > > platform driver and the sync_flags will define which one to look at?
> > > > > > 
> > > > > > Can't the platform driver just provide an ops table that works with the
> > > > > > flags it passes?
> > > > > 
> > > > > That is the approach Loic took in a previous patchset [1] and that was rejected.
> > > > > It also lead to all of the platform drivers testing rproc->flag before carring
> > > > > different actions, something you indicated could be done in the core.  This
> > > > > patch does exactly that, i.e move the testing of rproc->flag to the core and
> > > > > calls the right function based on that.
> > > > > 
> > > > 
> > > > I think I see what you mean, as we use "start" for both syncing and
> > > > starting the core, a { on_init = true, after_stop = false } setup either
> > > > needs two tables or force conditionals on the platform driver.
> > > > 
> > > > > The end result is the same and I'm happy with one or the other, I will need to
> > > > > know which one.
> > > > > 
> > > > 
> > > > How about adding a new ops named "attach" to rproc_ops, which the
> > > > platform driver can specify if it supports attaching an already running
> > > > processor?
> > > 
> > > Using "attach_ops" works for me.  But would "autonomous_ops", to correlate with
> > > rproc::autonomous, add clarity?  Either way work equally well for me. 
> > > 
> > 
> > What I meant was that we add a function "attach" to the existing
> > rproc_ops. In the case of OFFLINE->RUNNING we continue to call
> > rproc->ops->start() and DETACHED->RUNNING we call this
> > rproc->ops->attach().
> 
> If I read the above properly we'd end up with:
> 
> struct rproc_ops {
> 	int (*start)(struct rproc *rproc);
> 	int (*stop)(struct rproc *rproc);
> 	int (*attach)(struct rproc *rproc);
> 	int (*detach)(struct rproc *rproc);
>         ...
>         ...
> };

Yes, that's what I meant.

> 
> But wed'd have to deal with other operations that are common to both scenarios
> such as parse_fw() and find_loaded_rsc_table().  
> 

I would prefer that we don't parse_fw(NULL), perhaps we can turn that
upside down and have the platform_driver just provide the information to
the core as it learns about it during probe?

> So far lot of improvement have already been suggested on this revision.  I
> suggest to spin off a new patchset that only handles the DETACHED->RUNNING
> scenario and split common functions such as rproc_fw_boot().  From there we can
> see if other refinements (such as what you suggest above) are mandated.
> 

As far as I can see, if we take the approach of introducing the DETACHED
state we can add the various transitions piecemeal. So I'm definitely in
favour of starting off with DETACHED->RUNNING, then figure out
"autonomous recovery" and RUNNING->DETACHED in two subsequent series.

> One last thing...  Upon reflecting on all this I think using "attach" is better
> than "autonomous", the latter is heavy to drag around.
> 

For the action of going from DETACHED->RUNNING I too find "attach" to
better represent what's going on. The part where I think we need
something more is to communicate if it's Linux that's in charge or not
for taking the remote processor through RUNNING->CRASHED->RUNNING. For
that the word "autonomous" makes sense to me, but let's bring that up
again after landing this first piece(s).

Regards,
Bjorn
Mathieu Poirier May 21, 2020, 9:55 p.m. UTC | #10
On Wed, 20 May 2020 at 23:22, Bjorn Andersson
<bjorn.andersson@linaro.org> wrote:
>
> On Wed 20 May 15:06 PDT 2020, Mathieu Poirier wrote:
>
> > On Mon, May 18, 2020 at 05:55:00PM -0700, Bjorn Andersson wrote:
> > > On Fri 15 May 12:24 PDT 2020, Mathieu Poirier wrote:
> > >
> > > > Good day Bjorn,
> > > >
> > > > On Wed, May 13, 2020 at 06:32:24PM -0700, Bjorn Andersson wrote:
> > > > > On Fri 08 May 14:01 PDT 2020, Mathieu Poirier wrote:
> > > > >
> > > > > > On Tue, May 05, 2020 at 05:22:53PM -0700, Bjorn Andersson wrote:
> > > > > > > On Fri 24 Apr 13:01 PDT 2020, Mathieu Poirier wrote:
> [..]
> > > > > > > > + bool after_crash;
> > > > > > >
> > > > > > > Similarly what is the expected steps to be taken by the core when this
> > > > > > > is true? Should rproc_report_crash() simply stop/start the subdevices
> > > > > > > and upon one of the ops somehow tell the remote controller that it can
> > > > > > > proceed with the recovery?
> > > > > >
> > > > > > The exact same sequence of steps will be carried out as they are today, except
> > > > > > that if after_crash == true, the remoteproc core won't be switching the remote
> > > > > > processor on, exactly as it would do when on_init == true.
> > > > > >
> > > > >
> > > > > Just to make sure we're on the same page:
> > > > >
> > > > > after_crash = false is what we have today, and would mean:
> > > > >
> > > > > 1) stop subdevices
> > > > > 2) power off
> > > > > 3) unprepare subdevices
> > > > > 4) generate coredump
> > > > > 5) request firmware
> > > > > 6) load segments
> > > > > 7) find resource table
> > > > > 8) prepare subdevices
> > > > > 9) "boot"
> > > > > 10) start subdevices
> > > >
> > > > Exactly
> > > >
> > > > >
> > > > > after_crash = true would mean:
> > > > >
> > > > > 1) stop subdevices
> > > > > 2) "detach"
> > > > > 3) unprepare subdevices
> > > > > 4) prepare subdevices
> > > > > 5) "attach"
> > > > > 6) start subdevices
> > > > >
> > > >
> > > > Yes
> > > >
> > > > > State diagram wise both of these would represent the transition RUNNING
> > > > > -> CRASHED -> RUNNING, but somehow the platform driver needs to be able
> > > > > to specify which of these sequences to perform. Per your naming
> > > > > suggestion above, this does sound like a "autonomous_recovery" boolean
> > > > > to me.
> > > >
> > > > Right, semantically "rproc->autonomous" would apply quite well.
> > > >
> > > > In function rproc_crash_handler_work(), a call to rproc_set_sync_flag() has been
> > > > strategically placed to set the value of rproc->autonomous based on
> > > > "after_crash".  From there the core knows which rproc_ops to use.  Here too we
> > > > have to rely on the rproc_ops provided by the platform to do the right thing
> > > > based on the scenario to enact.
> > > >
> > >
> > > Do you think that autonomous_recovery would be something that changes
> > > for a given remoteproc instance? I envisioned it as something that you
> > > know at registration time, but perhaps I'm missing some details here.
> >
> > I don't envision any of the transision flags to change once they are set by the
> > platform.   The same applies to the new rproc_ops, it can be set only once.
> > Otherwise combination of possible scenarios becomes too hard to manage, leading
> > to situations where the core and MCU get out of sync and can't talk to each
> > other.
> >
>
> Sounds good, I share this expectation, just wanted to check with you.
>
> > >
> > > > >
> > > > > > These flags are there to indicate how to set rproc::sync_with_rproc after
> > > > > > different events, that is when the remoteproc core boots, when the remoteproc
> > > > > > has been stopped or when it has crashed.
> > > > > >
> > > > >
> > > > > Right, that was clear from your patches. Sorry that my reply didn't
> > > > > convey the information that I had understood this.
> > > > >
> > > > > > >
> > > > > > > > +};
> > > > > > > > +
> > > > > > > >  /**
> > > > > > > >   * struct rproc_ops - platform-specific device handlers
> > > > > > > >   * @start:       power on the device and boot it
> > > > > > > > @@ -459,6 +476,9 @@ struct rproc_dump_segment {
> > > > > > > >   * @firmware: name of firmware file to be loaded
> > > > > > > >   * @priv: private data which belongs to the platform-specific rproc module
> > > > > > > >   * @ops: platform-specific start/stop rproc handlers
> > > > > > > > + * @sync_ops: platform-specific start/stop rproc handlers when
> > > > > > > > + *             synchronising with a remote processor.
> > > > > > > > + * @sync_flags: Determine the rproc_ops to choose in specific states.
> > > > > > > >   * @dev: virtual device for refcounting and common remoteproc behavior
> > > > > > > >   * @power: refcount of users who need this rproc powered up
> > > > > > > >   * @state: state of the device
> > > > > > > > @@ -482,6 +502,7 @@ struct rproc_dump_segment {
> > > > > > > >   * @table_sz: size of @cached_table
> > > > > > > >   * @has_iommu: flag to indicate if remote processor is behind an MMU
> > > > > > > >   * @auto_boot: flag to indicate if remote processor should be auto-started
> > > > > > > > + * @sync_with_rproc: true if currently synchronising with the rproc
> > > > > > > >   * @dump_segments: list of segments in the firmware
> > > > > > > >   * @nb_vdev: number of vdev currently handled by rproc
> > > > > > > >   */
> > > > > > > > @@ -492,6 +513,8 @@ struct rproc {
> > > > > > > >   const char *firmware;
> > > > > > > >   void *priv;
> > > > > > > >   struct rproc_ops *ops;
> > > > > > > > + struct rproc_ops *sync_ops;
> > > > > > >
> > > > > > > Do we really need two rproc_ops, given that both are coming from the
> > > > > > > platform driver and the sync_flags will define which one to look at?
> > > > > > >
> > > > > > > Can't the platform driver just provide an ops table that works with the
> > > > > > > flags it passes?
> > > > > >
> > > > > > That is the approach Loic took in a previous patchset [1] and that was rejected.
> > > > > > It also lead to all of the platform drivers testing rproc->flag before carring
> > > > > > different actions, something you indicated could be done in the core.  This
> > > > > > patch does exactly that, i.e move the testing of rproc->flag to the core and
> > > > > > calls the right function based on that.
> > > > > >
> > > > >
> > > > > I think I see what you mean, as we use "start" for both syncing and
> > > > > starting the core, a { on_init = true, after_stop = false } setup either
> > > > > needs two tables or force conditionals on the platform driver.
> > > > >
> > > > > > The end result is the same and I'm happy with one or the other, I will need to
> > > > > > know which one.
> > > > > >
> > > > >
> > > > > How about adding a new ops named "attach" to rproc_ops, which the
> > > > > platform driver can specify if it supports attaching an already running
> > > > > processor?
> > > >
> > > > Using "attach_ops" works for me.  But would "autonomous_ops", to correlate with
> > > > rproc::autonomous, add clarity?  Either way work equally well for me.
> > > >
> > >
> > > What I meant was that we add a function "attach" to the existing
> > > rproc_ops. In the case of OFFLINE->RUNNING we continue to call
> > > rproc->ops->start() and DETACHED->RUNNING we call this
> > > rproc->ops->attach().
> >
> > If I read the above properly we'd end up with:
> >
> > struct rproc_ops {
> >       int (*start)(struct rproc *rproc);
> >       int (*stop)(struct rproc *rproc);
> >       int (*attach)(struct rproc *rproc);
> >       int (*detach)(struct rproc *rproc);
> >         ...
> >         ...
> > };
>
> Yes, that's what I meant.
>
> >
> > But wed'd have to deal with other operations that are common to both scenarios
> > such as parse_fw() and find_loaded_rsc_table().
> >
>
> I would prefer that we don't parse_fw(NULL), perhaps we can turn that
> upside down and have the platform_driver just provide the information to
> the core as it learns about it during probe?

I need to think about that...  We may have to discuss this again in a
not so distant future.

>
> > So far lot of improvement have already been suggested on this revision.  I
> > suggest to spin off a new patchset that only handles the DETACHED->RUNNING
> > scenario and split common functions such as rproc_fw_boot().  From there we can
> > see if other refinements (such as what you suggest above) are mandated.
> >
>
> As far as I can see, if we take the approach of introducing the DETACHED
> state we can add the various transitions piecemeal. So I'm definitely in
> favour of starting off with DETACHED->RUNNING, then figure out
> "autonomous recovery" and RUNNING->DETACHED in two subsequent series.
>
> > One last thing...  Upon reflecting on all this I think using "attach" is better
> > than "autonomous", the latter is heavy to drag around.
> >
>
> For the action of going from DETACHED->RUNNING I too find "attach" to
> better represent what's going on. The part where I think we need
> something more is to communicate if it's Linux that's in charge or not
> for taking the remote processor through RUNNING->CRASHED->RUNNING. For
> that the word "autonomous" makes sense to me, but let's bring that up
> again after landing this first piece(s).

I agree.

>
> Regards,
> Bjorn
diff mbox series

Patch

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index ac4082f12e8b..ceb3b2bba824 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -353,6 +353,23 @@  enum rsc_handling_status {
 	RSC_IGNORED	= 1,
 };
 
+/**
+ * struct rproc_sync_flags - platform specific flags indicating which
+ *			      rproc_ops to use at specific times during
+ *			      the rproc lifecycle.
+ * @on_init: true if synchronising with the remote processor at
+ *	     initialisation time
+ * @after_stop: true if synchronising with the remote processor after it was
+ *		stopped from the cmmand line
+ * @after_crash: true if synchronising with the remote processor after
+ *		 it has crashed
+ */
+struct rproc_sync_flags {
+	bool on_init;
+	bool after_stop;
+	bool after_crash;
+};
+
 /**
  * struct rproc_ops - platform-specific device handlers
  * @start:	power on the device and boot it
@@ -459,6 +476,9 @@  struct rproc_dump_segment {
  * @firmware: name of firmware file to be loaded
  * @priv: private data which belongs to the platform-specific rproc module
  * @ops: platform-specific start/stop rproc handlers
+ * @sync_ops: platform-specific start/stop rproc handlers when
+ *	      synchronising with a remote processor.
+ * @sync_flags: Determine the rproc_ops to choose in specific states.
  * @dev: virtual device for refcounting and common remoteproc behavior
  * @power: refcount of users who need this rproc powered up
  * @state: state of the device
@@ -482,6 +502,7 @@  struct rproc_dump_segment {
  * @table_sz: size of @cached_table
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  * @auto_boot: flag to indicate if remote processor should be auto-started
+ * @sync_with_rproc: true if currently synchronising with the rproc
  * @dump_segments: list of segments in the firmware
  * @nb_vdev: number of vdev currently handled by rproc
  */
@@ -492,6 +513,8 @@  struct rproc {
 	const char *firmware;
 	void *priv;
 	struct rproc_ops *ops;
+	struct rproc_ops *sync_ops;
+	struct rproc_sync_flags sync_flags;
 	struct device dev;
 	atomic_t power;
 	unsigned int state;
@@ -515,6 +538,7 @@  struct rproc {
 	size_t table_sz;
 	bool has_iommu;
 	bool auto_boot;
+	bool sync_with_rproc;
 	struct list_head dump_segments;
 	int nb_vdev;
 	u8 elf_class;