diff mbox series

[v5,07/19] media: vsp1: dl: Support one-shot entries in the display list

Message ID 20190221103212.28764-8-laurent.pinchart+renesas@ideasonboard.com (mailing list archive)
State New, archived
Headers show
Series R-Car DU display writeback support | expand

Commit Message

Laurent Pinchart Feb. 21, 2019, 10:32 a.m. UTC
One-shot entries are used as an alternative to committing a complete new
display list when a couple of registers need to be written for one frame
and then reset to another value for all subsequent frames. This will be
used to implement writeback support that will need to enable writeback
for the duration of a single frame.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
---
 drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
 drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
 2 files changed, 81 insertions(+)

Comments

Kieran Bingham Feb. 21, 2019, 1:16 p.m. UTC | #1
Hi Laurent,

On 21/02/2019 10:32, Laurent Pinchart wrote:
> One-shot entries are used as an alternative to committing a complete new
> display list when a couple of registers need to be written for one frame
> and then reset to another value for all subsequent frames. This will be
> used to implement writeback support that will need to enable writeback
> for the duration of a single frame.
> 
> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>

Thanks for adding the documentation, and the new name _oneshot() sounds
fine to me.


Reviewed-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>

> ---
>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> index 886b3a69d329..7b4d252bfde7 100644
> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
>  
>  	unsigned int num_entries;
>  	unsigned int max_entries;
> +
> +	unsigned int num_patches;
> +	struct {
> +		struct vsp1_dl_entry *entry;
> +		u32 data;
> +	} patches[2];
>  };
>  
>  /**
> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
>  		return;
>  
>  	dlb->num_entries = 0;
> +	dlb->num_patches = 0;
>  
>  	spin_lock_irqsave(&dlb->pool->lock, flags);
>  	list_add_tail(&dlb->free, &dlb->pool->free);
> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
>  	dlb->num_entries++;
>  }
>  
> +/**
> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> + *	single frame
> + * @dlb: The body
> + * @reg: The register address
> + * @value: The register value
> + * @reset_value: The value to reset the register to at the next vblank
> + *
> + * Display lists in continuous mode are re-used by the hardware for successive
> + * frames until a new display list is committed. Changing the VSP configuration
> + * normally requires creating and committing a new display list. This function
> + * offers an alternative race-free way by writing a @value to the @register in
> + * the display list body for a single frame, specifying in @reset_value the
> + * value to reset the register to one vblank after the display list is
> + * committed.
> + *
> + * The maximum number of one-shot entries is limited to 2 per display list body,
> + * and one-shot entries are counted in the total number of entries specified
> + * when the body is allocated by vsp1_dl_body_alloc().
> + */
> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> +				u32 reset_value)
> +{
> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> +		return;
> +
> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> +		      "DLB patches size exceeded (max %zu)",
> +		      ARRAY_SIZE(dlb->patches)))
> +		return;
> +
> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> +	dlb->patches[dlb->num_patches].data = reset_value;
> +	dlb->num_patches++;
> +
> +	dlb->entries[dlb->num_entries].addr = reg;
> +	dlb->entries[dlb->num_entries].data = value;
> +	dlb->num_entries++;
> +}
> +
>  /* -----------------------------------------------------------------------------
>   * Display List Extended Command Management
>   */
> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
>  	 * has at least one body, thus we reinitialise the entries list.
>  	 */
>  	dl->body0->num_entries = 0;
> +	dl->body0->num_patches = 0;
>  
>  	list_add_tail(&dl->list, &dl->dlm->free);
>  }
> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
>   * Display List Manager
>   */
>  
> +/**
> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> + *	interrupt
> + * @dlm: the display list manager
> + *
> + * Apply all one-shot patches registered for the active display list.
> + */
> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> +{
> +	struct vsp1_dl_body *dlb;
> +	struct vsp1_dl_list *dl;
> +	unsigned int i;
> +
> +	spin_lock(&dlm->lock);
> +
> +	dl = dlm->active;
> +	if (!dl)
> +		goto done;
> +
> +	list_for_each_entry(dlb, &dl->bodies, list) {
> +		for (i = 0; i < dlb->num_patches; ++i)
> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> +		dlb->num_patches = 0;
> +	}
> +
> +done:
> +	spin_unlock(&dlm->lock);
> +}
> +
>  /**
>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
>   * @dlm: the display list manager
> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> index e0fdb145e6ed..f845607abc4c 100644
> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
>  					unsigned int prealloc);
>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
>  
> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
>  
>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> +				u32 reset_value);
>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
>  
>
Brian Starkey Feb. 22, 2019, 2:30 p.m. UTC | #2
Hi Laurent,

On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> One-shot entries are used as an alternative to committing a complete new
> display list when a couple of registers need to be written for one frame
> and then reset to another value for all subsequent frames. This will be
> used to implement writeback support that will need to enable writeback
> for the duration of a single frame.
> 
> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> ---
>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
>  2 files changed, 81 insertions(+)
> 
> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> index 886b3a69d329..7b4d252bfde7 100644
> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
>  
>  	unsigned int num_entries;
>  	unsigned int max_entries;
> +
> +	unsigned int num_patches;
> +	struct {
> +		struct vsp1_dl_entry *entry;
> +		u32 data;
> +	} patches[2];
>  };
>  
>  /**
> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
>  		return;
>  
>  	dlb->num_entries = 0;
> +	dlb->num_patches = 0;
>  
>  	spin_lock_irqsave(&dlb->pool->lock, flags);
>  	list_add_tail(&dlb->free, &dlb->pool->free);
> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
>  	dlb->num_entries++;
>  }
>  
> +/**
> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> + *	single frame
> + * @dlb: The body
> + * @reg: The register address
> + * @value: The register value
> + * @reset_value: The value to reset the register to at the next vblank
> + *
> + * Display lists in continuous mode are re-used by the hardware for successive
> + * frames until a new display list is committed. Changing the VSP configuration
> + * normally requires creating and committing a new display list. This function
> + * offers an alternative race-free way by writing a @value to the @register in
> + * the display list body for a single frame, specifying in @reset_value the
> + * value to reset the register to one vblank after the display list is
> + * committed.
> + *
> + * The maximum number of one-shot entries is limited to 2 per display list body,
> + * and one-shot entries are counted in the total number of entries specified
> + * when the body is allocated by vsp1_dl_body_alloc().
> + */
> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> +				u32 reset_value)
> +{
> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> +		return;
> +
> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> +		      "DLB patches size exceeded (max %zu)",
> +		      ARRAY_SIZE(dlb->patches)))
> +		return;
> +
> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> +	dlb->patches[dlb->num_patches].data = reset_value;
> +	dlb->num_patches++;
> +
> +	dlb->entries[dlb->num_entries].addr = reg;
> +	dlb->entries[dlb->num_entries].data = value;
> +	dlb->num_entries++;
> +}
> +
>  /* -----------------------------------------------------------------------------
>   * Display List Extended Command Management
>   */
> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
>  	 * has at least one body, thus we reinitialise the entries list.
>  	 */
>  	dl->body0->num_entries = 0;
> +	dl->body0->num_patches = 0;
>  
>  	list_add_tail(&dl->list, &dl->dlm->free);
>  }
> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
>   * Display List Manager
>   */
>  
> +/**
> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> + *	interrupt
> + * @dlm: the display list manager
> + *
> + * Apply all one-shot patches registered for the active display list.
> + */
> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> +{
> +	struct vsp1_dl_body *dlb;
> +	struct vsp1_dl_list *dl;
> +	unsigned int i;
> +
> +	spin_lock(&dlm->lock);
> +
> +	dl = dlm->active;
> +	if (!dl)
> +		goto done;
> +
> +	list_for_each_entry(dlb, &dl->bodies, list) {
> +		for (i = 0; i < dlb->num_patches; ++i)
> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> +		dlb->num_patches = 0;
> +	}
> +
> +done:
> +	spin_unlock(&dlm->lock);
> +}
> +

We've got some HW which doesn't support one-shot writeback, and use a
similar trick to try and disable writeback immediately after the flip.

We ran into issues where the "start" interrupt wouldn't run in time to
make sure the writeback disable was committed before the next frame.
We have to keep track of whether the disable really happened in time,
before we release the output buffer.

Might you have a similar problem here?

Thanks,
-Brian

>  /**
>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
>   * @dlm: the display list manager
> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> index e0fdb145e6ed..f845607abc4c 100644
> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
>  					unsigned int prealloc);
>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
>  
> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
>  
>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> +				u32 reset_value);
>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
>  
> -- 
> Regards,
> 
> Laurent Pinchart
>
Laurent Pinchart Feb. 22, 2019, 2:46 p.m. UTC | #3
Hi Brian,

On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > One-shot entries are used as an alternative to committing a complete new
> > display list when a couple of registers need to be written for one frame
> > and then reset to another value for all subsequent frames. This will be
> > used to implement writeback support that will need to enable writeback
> > for the duration of a single frame.
> > 
> > Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > ---
> >  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> >  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> >  2 files changed, 81 insertions(+)
> > 
> > diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > index 886b3a69d329..7b4d252bfde7 100644
> > --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> >  
> >  	unsigned int num_entries;
> >  	unsigned int max_entries;
> > +
> > +	unsigned int num_patches;
> > +	struct {
> > +		struct vsp1_dl_entry *entry;
> > +		u32 data;
> > +	} patches[2];
> >  };
> >  
> >  /**
> > @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> >  		return;
> >  
> >  	dlb->num_entries = 0;
> > +	dlb->num_patches = 0;
> >  
> >  	spin_lock_irqsave(&dlb->pool->lock, flags);
> >  	list_add_tail(&dlb->free, &dlb->pool->free);
> > @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> >  	dlb->num_entries++;
> >  }
> >  
> > +/**
> > + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > + *	single frame
> > + * @dlb: The body
> > + * @reg: The register address
> > + * @value: The register value
> > + * @reset_value: The value to reset the register to at the next vblank
> > + *
> > + * Display lists in continuous mode are re-used by the hardware for successive
> > + * frames until a new display list is committed. Changing the VSP configuration
> > + * normally requires creating and committing a new display list. This function
> > + * offers an alternative race-free way by writing a @value to the @register in
> > + * the display list body for a single frame, specifying in @reset_value the
> > + * value to reset the register to one vblank after the display list is
> > + * committed.
> > + *
> > + * The maximum number of one-shot entries is limited to 2 per display list body,
> > + * and one-shot entries are counted in the total number of entries specified
> > + * when the body is allocated by vsp1_dl_body_alloc().
> > + */
> > +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > +				u32 reset_value)
> > +{
> > +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > +		return;
> > +
> > +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > +		      "DLB patches size exceeded (max %zu)",
> > +		      ARRAY_SIZE(dlb->patches)))
> > +		return;
> > +
> > +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > +	dlb->patches[dlb->num_patches].data = reset_value;
> > +	dlb->num_patches++;
> > +
> > +	dlb->entries[dlb->num_entries].addr = reg;
> > +	dlb->entries[dlb->num_entries].data = value;
> > +	dlb->num_entries++;
> > +}
> > +
> >  /* -----------------------------------------------------------------------------
> >   * Display List Extended Command Management
> >   */
> > @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> >  	 * has at least one body, thus we reinitialise the entries list.
> >  	 */
> >  	dl->body0->num_entries = 0;
> > +	dl->body0->num_patches = 0;
> >  
> >  	list_add_tail(&dl->list, &dl->dlm->free);
> >  }
> > @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> >   * Display List Manager
> >   */
> >  
> > +/**
> > + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > + *	interrupt
> > + * @dlm: the display list manager
> > + *
> > + * Apply all one-shot patches registered for the active display list.
> > + */
> > +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > +{
> > +	struct vsp1_dl_body *dlb;
> > +	struct vsp1_dl_list *dl;
> > +	unsigned int i;
> > +
> > +	spin_lock(&dlm->lock);
> > +
> > +	dl = dlm->active;
> > +	if (!dl)
> > +		goto done;
> > +
> > +	list_for_each_entry(dlb, &dl->bodies, list) {
> > +		for (i = 0; i < dlb->num_patches; ++i)
> > +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > +		dlb->num_patches = 0;
> > +	}
> > +
> > +done:
> > +	spin_unlock(&dlm->lock);
> > +}
> > +
> 
> We've got some HW which doesn't support one-shot writeback, and use a
> similar trick to try and disable writeback immediately after the flip.
> 
> We ran into issues where the "start" interrupt wouldn't run in time to
> make sure the writeback disable was committed before the next frame.
> We have to keep track of whether the disable really happened in time,
> before we release the output buffer.
> 
> Might you have a similar problem here?

We may, but there's no provision at the hardware level to check if the
configuration updated happened in time. I could add some safety checks
but I believe they would be racy in the best case :-(

Note that we have the duration of a complete frame to disable writeback,
as we receive an interrupt when the frame starts, and have until vblank
to update the configuration. It's thus slightly better than having to
disable writeback between vblank and the start of the next frame.

> >  /**
> >   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> >   * @dlm: the display list manager
> > diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > index e0fdb145e6ed..f845607abc4c 100644
> > --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> >  					unsigned int prealloc);
> >  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> >  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> >  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> >  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> >  
> > @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> >  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> >  
> >  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > +				u32 reset_value);
> >  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> >  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> >
Brian Starkey Feb. 22, 2019, 3:06 p.m. UTC | #4
On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> Hi Brian,
> 
> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> > On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > > One-shot entries are used as an alternative to committing a complete new
> > > display list when a couple of registers need to be written for one frame
> > > and then reset to another value for all subsequent frames. This will be
> > > used to implement writeback support that will need to enable writeback
> > > for the duration of a single frame.
> > > 
> > > Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > > ---
> > >  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> > >  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> > >  2 files changed, 81 insertions(+)
> > > 
> > > diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > > index 886b3a69d329..7b4d252bfde7 100644
> > > --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > > +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > > @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> > >  
> > >  	unsigned int num_entries;
> > >  	unsigned int max_entries;
> > > +
> > > +	unsigned int num_patches;
> > > +	struct {
> > > +		struct vsp1_dl_entry *entry;
> > > +		u32 data;
> > > +	} patches[2];
> > >  };
> > >  
> > >  /**
> > > @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> > >  		return;
> > >  
> > >  	dlb->num_entries = 0;
> > > +	dlb->num_patches = 0;
> > >  
> > >  	spin_lock_irqsave(&dlb->pool->lock, flags);
> > >  	list_add_tail(&dlb->free, &dlb->pool->free);
> > > @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> > >  	dlb->num_entries++;
> > >  }
> > >  
> > > +/**
> > > + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > > + *	single frame
> > > + * @dlb: The body
> > > + * @reg: The register address
> > > + * @value: The register value
> > > + * @reset_value: The value to reset the register to at the next vblank
> > > + *
> > > + * Display lists in continuous mode are re-used by the hardware for successive
> > > + * frames until a new display list is committed. Changing the VSP configuration
> > > + * normally requires creating and committing a new display list. This function
> > > + * offers an alternative race-free way by writing a @value to the @register in
> > > + * the display list body for a single frame, specifying in @reset_value the
> > > + * value to reset the register to one vblank after the display list is
> > > + * committed.
> > > + *
> > > + * The maximum number of one-shot entries is limited to 2 per display list body,
> > > + * and one-shot entries are counted in the total number of entries specified
> > > + * when the body is allocated by vsp1_dl_body_alloc().
> > > + */
> > > +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > > +				u32 reset_value)
> > > +{
> > > +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > > +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > > +		return;
> > > +
> > > +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > > +		      "DLB patches size exceeded (max %zu)",
> > > +		      ARRAY_SIZE(dlb->patches)))
> > > +		return;
> > > +
> > > +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > > +	dlb->patches[dlb->num_patches].data = reset_value;
> > > +	dlb->num_patches++;
> > > +
> > > +	dlb->entries[dlb->num_entries].addr = reg;
> > > +	dlb->entries[dlb->num_entries].data = value;
> > > +	dlb->num_entries++;
> > > +}
> > > +
> > >  /* -----------------------------------------------------------------------------
> > >   * Display List Extended Command Management
> > >   */
> > > @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> > >  	 * has at least one body, thus we reinitialise the entries list.
> > >  	 */
> > >  	dl->body0->num_entries = 0;
> > > +	dl->body0->num_patches = 0;
> > >  
> > >  	list_add_tail(&dl->list, &dl->dlm->free);
> > >  }
> > > @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> > >   * Display List Manager
> > >   */
> > >  
> > > +/**
> > > + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > > + *	interrupt
> > > + * @dlm: the display list manager
> > > + *
> > > + * Apply all one-shot patches registered for the active display list.
> > > + */
> > > +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > > +{
> > > +	struct vsp1_dl_body *dlb;
> > > +	struct vsp1_dl_list *dl;
> > > +	unsigned int i;
> > > +
> > > +	spin_lock(&dlm->lock);
> > > +
> > > +	dl = dlm->active;
> > > +	if (!dl)
> > > +		goto done;
> > > +
> > > +	list_for_each_entry(dlb, &dl->bodies, list) {
> > > +		for (i = 0; i < dlb->num_patches; ++i)
> > > +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > > +		dlb->num_patches = 0;
> > > +	}
> > > +
> > > +done:
> > > +	spin_unlock(&dlm->lock);
> > > +}
> > > +
> > 
> > We've got some HW which doesn't support one-shot writeback, and use a
> > similar trick to try and disable writeback immediately after the flip.
> > 
> > We ran into issues where the "start" interrupt wouldn't run in time to
> > make sure the writeback disable was committed before the next frame.
> > We have to keep track of whether the disable really happened in time,
> > before we release the output buffer.
> > 
> > Might you have a similar problem here?
> 
> We may, but there's no provision at the hardware level to check if the
> configuration updated happened in time. I could add some safety checks
> but I believe they would be racy in the best case :-(

We managed to find (what I believe to be...) a non-racy way, but it
will of course depend a lot on the HW behaviour, so I'll leave it to
your best judgement.

We basically have a "configuration committed" interrupt which we can
use to set a flag indicating writeback was disabled.

> 
> Note that we have the duration of a complete frame to disable writeback,
> as we receive an interrupt when the frame starts, and have until vblank
> to update the configuration. It's thus slightly better than having to
> disable writeback between vblank and the start of the next frame.

Yeah... we have a whole frame too. I'm struggling to find our wiki
page with the data, but anecdotally there's some (out-of-tree) drivers
which keep interrupts masked for a _really long_ time. It's nice if
you don't have to care about those :-)

-Brian

> 
> > >  /**
> > >   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> > >   * @dlm: the display list manager
> > > diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > > index e0fdb145e6ed..f845607abc4c 100644
> > > --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > > +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > > @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> > >  					unsigned int prealloc);
> > >  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> > >  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > > +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> > >  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> > >  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> > >  
> > > @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> > >  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> > >  
> > >  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > > +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > > +				u32 reset_value);
> > >  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> > >  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> > >  
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Laurent Pinchart March 5, 2019, 11:14 p.m. UTC | #5
Hi Brian,

On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> > On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> >> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> >>> One-shot entries are used as an alternative to committing a complete new
> >>> display list when a couple of registers need to be written for one frame
> >>> and then reset to another value for all subsequent frames. This will be
> >>> used to implement writeback support that will need to enable writeback
> >>> for the duration of a single frame.
> >>> 
> >>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> >>> ---
> >>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> >>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> >>>  2 files changed, 81 insertions(+)
> >>> 
> >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> >>> index 886b3a69d329..7b4d252bfde7 100644
> >>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> >>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> >>>  
> >>>  	unsigned int num_entries;
> >>>  	unsigned int max_entries;
> >>> +
> >>> +	unsigned int num_patches;
> >>> +	struct {
> >>> +		struct vsp1_dl_entry *entry;
> >>> +		u32 data;
> >>> +	} patches[2];
> >>>  };
> >>>  
> >>>  /**
> >>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> >>>  		return;
> >>>  
> >>>  	dlb->num_entries = 0;
> >>> +	dlb->num_patches = 0;
> >>>  
> >>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> >>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> >>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> >>>  	dlb->num_entries++;
> >>>  }
> >>>  
> >>> +/**
> >>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> >>> + *	single frame
> >>> + * @dlb: The body
> >>> + * @reg: The register address
> >>> + * @value: The register value
> >>> + * @reset_value: The value to reset the register to at the next vblank
> >>> + *
> >>> + * Display lists in continuous mode are re-used by the hardware for successive
> >>> + * frames until a new display list is committed. Changing the VSP configuration
> >>> + * normally requires creating and committing a new display list. This function
> >>> + * offers an alternative race-free way by writing a @value to the @register in
> >>> + * the display list body for a single frame, specifying in @reset_value the
> >>> + * value to reset the register to one vblank after the display list is
> >>> + * committed.
> >>> + *
> >>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> >>> + * and one-shot entries are counted in the total number of entries specified
> >>> + * when the body is allocated by vsp1_dl_body_alloc().
> >>> + */
> >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>> +				u32 reset_value)
> >>> +{
> >>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> >>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> >>> +		return;
> >>> +
> >>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> >>> +		      "DLB patches size exceeded (max %zu)",
> >>> +		      ARRAY_SIZE(dlb->patches)))
> >>> +		return;
> >>> +
> >>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> >>> +	dlb->patches[dlb->num_patches].data = reset_value;
> >>> +	dlb->num_patches++;
> >>> +
> >>> +	dlb->entries[dlb->num_entries].addr = reg;
> >>> +	dlb->entries[dlb->num_entries].data = value;
> >>> +	dlb->num_entries++;
> >>> +}
> >>> +
> >>>  /* -----------------------------------------------------------------------------
> >>>   * Display List Extended Command Management
> >>>   */
> >>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> >>>  	 * has at least one body, thus we reinitialise the entries list.
> >>>  	 */
> >>>  	dl->body0->num_entries = 0;
> >>> +	dl->body0->num_patches = 0;
> >>>  
> >>>  	list_add_tail(&dl->list, &dl->dlm->free);
> >>>  }
> >>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> >>>   * Display List Manager
> >>>   */
> >>>  
> >>> +/**
> >>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> >>> + *	interrupt
> >>> + * @dlm: the display list manager
> >>> + *
> >>> + * Apply all one-shot patches registered for the active display list.
> >>> + */
> >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> >>> +{
> >>> +	struct vsp1_dl_body *dlb;
> >>> +	struct vsp1_dl_list *dl;
> >>> +	unsigned int i;
> >>> +
> >>> +	spin_lock(&dlm->lock);
> >>> +
> >>> +	dl = dlm->active;
> >>> +	if (!dl)
> >>> +		goto done;
> >>> +
> >>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> >>> +		for (i = 0; i < dlb->num_patches; ++i)
> >>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> >>> +		dlb->num_patches = 0;
> >>> +	}
> >>> +
> >>> +done:
> >>> +	spin_unlock(&dlm->lock);
> >>> +}
> >>> +
> >> 
> >> We've got some HW which doesn't support one-shot writeback, and use a
> >> similar trick to try and disable writeback immediately after the flip.
> >> 
> >> We ran into issues where the "start" interrupt wouldn't run in time to
> >> make sure the writeback disable was committed before the next frame.
> >> We have to keep track of whether the disable really happened in time,
> >> before we release the output buffer.
> >> 
> >> Might you have a similar problem here?
> > 
> > We may, but there's no provision at the hardware level to check if the
> > configuration updated happened in time. I could add some safety checks
> > but I believe they would be racy in the best case :-(
> 
> We managed to find (what I believe to be...) a non-racy way, but it
> will of course depend a lot on the HW behaviour, so I'll leave it to
> your best judgement.
> 
> We basically have a "configuration committed" interrupt which we can
> use to set a flag indicating writeback was disabled.

The way my hardware is operated is, roughly, as follows:

- The driver prepares a list of values to write to registers and stores
  them in DMA-able memory. This doesn't involve the hardware, and so is
  completely asynchronous to hardware operation.

- The driver then sets a register with the pointer to the registers
  list.

- At the end of the current frame, the hardware reads the memory address
  of the registers list and DMAs values to register. This operation is
  fast and occurs fully during vertical blanking.

- The hardware then waits for the start of the frame, and begins
  processing the framebuffers to send the frame to the display.

- If no new registers list is provided for the next frame, the current
  list is reused.

Two interrupts are provided, one at the start of the frame and one at
the end of the frame. The driver uses the end of frame interrupt to
signal vertical blanking and page flip completion to DRM.

The end of frame interrupt is also used to schedule atomic commits. The
hardware queue depth for atomic commits is just one, so the driver has
to wait until the previous commit has been processed before writing the
new registers list address to the hardware. To solve the race between
the end of frame interrupt and the address write, the hardware provides
a status bit that is set to 1 when the address is written, and reset to
0 when the hardware starts processing the registers list.

We thus have three states for an atomic commit:

- active, where the corresponding registers list address has been
  written to the hardware, and processed

- queued, where the corresponding registers list address has been
  written to the hardware but not processed yet

- pending, where the corresponding registers list address hasn't been
  written to the hardware yet

The status bit mentioned above allows us to tell if a list exists in the
queued state.

At frame end time, if the status bit is set, we have potentially lost
the race between writing the new registers list and the frame end
interrupt, so we wait for one more vblank. Otherwise, if a list was
queued, we move it to the active state, and retire the active list. If a
list was pending, we write its address to the hardware, and move it to
the queued state.

To schedule writeback I ended up using the frame start interrupt.
Writeback has a specific need in that it requires enabling the memory
write interface for a single frame, and that is not something the
hardware supports directly. I can't either queue a new registers list
with the write interface disabled right after the list with the
writeback request became active, as any atomic commit would then be
delayed by an extra frame. I thus ended up modifying the registers list
in place at frame start time, which is right after the active register
list has been DMA'ed to hardware registers. We have the duration of a
whole frame to do so, before the hardware transfers the same list again
at the beginning of the next frame.

There is a race condition there, as the in-place modification of the
active list could be done after the end of the frame if the interrupt
latency gets too large. I don't see how I could solve that, as I could
write the value after the hardware starts processing the active list at
frame end time, but before the interrupt is notified.

> > Note that we have the duration of a complete frame to disable writeback,
> > as we receive an interrupt when the frame starts, and have until vblank
> > to update the configuration. It's thus slightly better than having to
> > disable writeback between vblank and the start of the next frame.
> 
> Yeah... we have a whole frame too. I'm struggling to find our wiki
> page with the data, but anecdotally there's some (out-of-tree) drivers
> which keep interrupts masked for a _really long_ time. It's nice if
> you don't have to care about those :-)

I only provide two options: not my problem, or give me the source code
and documentation and let me upstream a clean version of the out-of-tree
drivers :-)

> >>>  /**
> >>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> >>>   * @dlm: the display list manager
> >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> >>> index e0fdb145e6ed..f845607abc4c 100644
> >>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> >>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> >>>  					unsigned int prealloc);
> >>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> >>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> >>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> >>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> >>>  
> >>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> >>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> >>>  
> >>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>> +				u32 reset_value);
> >>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> >>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> >>>
Brian Starkey March 6, 2019, 11:05 a.m. UTC | #6
Hi,

On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> Hi Brian,
> 
> On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> > On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> > > On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> > >> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > >>> One-shot entries are used as an alternative to committing a complete new
> > >>> display list when a couple of registers need to be written for one frame
> > >>> and then reset to another value for all subsequent frames. This will be
> > >>> used to implement writeback support that will need to enable writeback
> > >>> for the duration of a single frame.
> > >>> 
> > >>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > >>> ---
> > >>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> > >>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> > >>>  2 files changed, 81 insertions(+)
> > >>> 
> > >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> index 886b3a69d329..7b4d252bfde7 100644
> > >>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> > >>>  
> > >>>  	unsigned int num_entries;
> > >>>  	unsigned int max_entries;
> > >>> +
> > >>> +	unsigned int num_patches;
> > >>> +	struct {
> > >>> +		struct vsp1_dl_entry *entry;
> > >>> +		u32 data;
> > >>> +	} patches[2];
> > >>>  };
> > >>>  
> > >>>  /**
> > >>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> > >>>  		return;
> > >>>  
> > >>>  	dlb->num_entries = 0;
> > >>> +	dlb->num_patches = 0;
> > >>>  
> > >>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> > >>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> > >>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> > >>>  	dlb->num_entries++;
> > >>>  }
> > >>>  
> > >>> +/**
> > >>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > >>> + *	single frame
> > >>> + * @dlb: The body
> > >>> + * @reg: The register address
> > >>> + * @value: The register value
> > >>> + * @reset_value: The value to reset the register to at the next vblank
> > >>> + *
> > >>> + * Display lists in continuous mode are re-used by the hardware for successive
> > >>> + * frames until a new display list is committed. Changing the VSP configuration
> > >>> + * normally requires creating and committing a new display list. This function
> > >>> + * offers an alternative race-free way by writing a @value to the @register in
> > >>> + * the display list body for a single frame, specifying in @reset_value the
> > >>> + * value to reset the register to one vblank after the display list is
> > >>> + * committed.
> > >>> + *
> > >>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> > >>> + * and one-shot entries are counted in the total number of entries specified
> > >>> + * when the body is allocated by vsp1_dl_body_alloc().
> > >>> + */
> > >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>> +				u32 reset_value)
> > >>> +{
> > >>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > >>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > >>> +		return;
> > >>> +
> > >>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > >>> +		      "DLB patches size exceeded (max %zu)",
> > >>> +		      ARRAY_SIZE(dlb->patches)))
> > >>> +		return;
> > >>> +
> > >>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > >>> +	dlb->patches[dlb->num_patches].data = reset_value;
> > >>> +	dlb->num_patches++;
> > >>> +
> > >>> +	dlb->entries[dlb->num_entries].addr = reg;
> > >>> +	dlb->entries[dlb->num_entries].data = value;
> > >>> +	dlb->num_entries++;
> > >>> +}
> > >>> +
> > >>>  /* -----------------------------------------------------------------------------
> > >>>   * Display List Extended Command Management
> > >>>   */
> > >>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> > >>>  	 * has at least one body, thus we reinitialise the entries list.
> > >>>  	 */
> > >>>  	dl->body0->num_entries = 0;
> > >>> +	dl->body0->num_patches = 0;
> > >>>  
> > >>>  	list_add_tail(&dl->list, &dl->dlm->free);
> > >>>  }
> > >>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> > >>>   * Display List Manager
> > >>>   */
> > >>>  
> > >>> +/**
> > >>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > >>> + *	interrupt
> > >>> + * @dlm: the display list manager
> > >>> + *
> > >>> + * Apply all one-shot patches registered for the active display list.
> > >>> + */
> > >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > >>> +{
> > >>> +	struct vsp1_dl_body *dlb;
> > >>> +	struct vsp1_dl_list *dl;
> > >>> +	unsigned int i;
> > >>> +
> > >>> +	spin_lock(&dlm->lock);
> > >>> +
> > >>> +	dl = dlm->active;
> > >>> +	if (!dl)
> > >>> +		goto done;
> > >>> +
> > >>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> > >>> +		for (i = 0; i < dlb->num_patches; ++i)
> > >>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > >>> +		dlb->num_patches = 0;
> > >>> +	}
> > >>> +
> > >>> +done:
> > >>> +	spin_unlock(&dlm->lock);
> > >>> +}
> > >>> +
> > >> 
> > >> We've got some HW which doesn't support one-shot writeback, and use a
> > >> similar trick to try and disable writeback immediately after the flip.
> > >> 
> > >> We ran into issues where the "start" interrupt wouldn't run in time to
> > >> make sure the writeback disable was committed before the next frame.
> > >> We have to keep track of whether the disable really happened in time,
> > >> before we release the output buffer.
> > >> 
> > >> Might you have a similar problem here?
> > > 
> > > We may, but there's no provision at the hardware level to check if the
> > > configuration updated happened in time. I could add some safety checks
> > > but I believe they would be racy in the best case :-(
> > 
> > We managed to find (what I believe to be...) a non-racy way, but it
> > will of course depend a lot on the HW behaviour, so I'll leave it to
> > your best judgement.
> > 
> > We basically have a "configuration committed" interrupt which we can
> > use to set a flag indicating writeback was disabled.
> 
> The way my hardware is operated is, roughly, as follows:

Thanks for the detailed explanation.

> 
> - The driver prepares a list of values to write to registers and stores
>   them in DMA-able memory. This doesn't involve the hardware, and so is
>   completely asynchronous to hardware operation.
> 
> - The driver then sets a register with the pointer to the registers
>   list.
> 
> - At the end of the current frame, the hardware reads the memory address
>   of the registers list and DMAs values to register. This operation is
>   fast and occurs fully during vertical blanking.
> 
> - The hardware then waits for the start of the frame, and begins
>   processing the framebuffers to send the frame to the display.
> 
> - If no new registers list is provided for the next frame, the current
>   list is reused.

Does it have to be fetched again, or the HW can use the copy that's in
registers already? Does it have to fetch a register list every frame
when the display is active, or you can stop it fetching somehow?

> 
> Two interrupts are provided, one at the start of the frame and one at
> the end of the frame. The driver uses the end of frame interrupt to
> signal vertical blanking and page flip completion to DRM.

Just for my understanding - you signal the page flip at the point
where the HW fetches the register list containing the scene being
flipped to? Or you signal it after that scene has been on-screen for
one frame?

> 
> The end of frame interrupt is also used to schedule atomic commits. The
> hardware queue depth for atomic commits is just one, so the driver has
> to wait until the previous commit has been processed before writing the
> new registers list address to the hardware. To solve the race between
> the end of frame interrupt and the address write, the hardware provides
> a status bit that is set to 1 when the address is written, and reset to
> 0 when the hardware starts processing the registers list.
> 
> We thus have three states for an atomic commit:
> 
> - active, where the corresponding registers list address has been
>   written to the hardware, and processed
> 
> - queued, where the corresponding registers list address has been
>   written to the hardware but not processed yet
> 
> - pending, where the corresponding registers list address hasn't been
>   written to the hardware yet
> 
> The status bit mentioned above allows us to tell if a list exists in the
> queued state.
> 
> At frame end time, if the status bit is set, we have potentially lost
> the race between writing the new registers list and the frame end
> interrupt, so we wait for one more vblank. Otherwise, if a list was
> queued, we move it to the active state, and retire the active list. If a
> list was pending, we write its address to the hardware, and move it to
> the queued state.
> 
> To schedule writeback I ended up using the frame start interrupt.
> Writeback has a specific need in that it requires enabling the memory
> write interface for a single frame, and that is not something the
> hardware supports directly. I can't either queue a new registers list
> with the write interface disabled right after the list with the
> writeback request became active, as any atomic commit would then be
> delayed by an extra frame. I thus ended up modifying the registers list
> in place at frame start time, which is right after the active register
> list has been DMA'ed to hardware registers. We have the duration of a
> whole frame to do so, before the hardware transfers the same list again
> at the beginning of the next frame.
> 
> There is a race condition there, as the in-place modification of the
> active list could be done after the end of the frame if the interrupt
> latency gets too large. I don't see how I could solve that, as I could
> write the value after the hardware starts processing the active list at
> frame end time, but before the interrupt is notified.

So you can be totally safe by making a copy of the register list,
disabling writeback there, and signalling writeback completion when
the HW status bit tells you that new register list has become active.
The issue is that it effectively halves the maximum update rate when
writeback is enabled?

It sounds like the HW doesn't really give you a way to "cancel" a
queued register list, which is a bit unfortunate. You don't happen to
have a DMA engine trigger or something you could use to do the
register list modification at a guaranteed time do you?

Are you always going to be protected by an IOMMU, preventing the
writeback from trashing physical memory? If that's not the case, then
the race can have pretty dire consequences.

> 
> > > Note that we have the duration of a complete frame to disable writeback,
> > > as we receive an interrupt when the frame starts, and have until vblank
> > > to update the configuration. It's thus slightly better than having to
> > > disable writeback between vblank and the start of the next frame.
> > 
> > Yeah... we have a whole frame too. I'm struggling to find our wiki
> > page with the data, but anecdotally there's some (out-of-tree) drivers
> > which keep interrupts masked for a _really long_ time. It's nice if
> > you don't have to care about those :-)
> 
> I only provide two options: not my problem, or give me the source code
> and documentation and let me upstream a clean version of the out-of-tree
> drivers :-)

This was a case of drivers "in-the-wild" - I don't know the specifics,
but they weren't Arm drivers. I was just sharing as a data point.

Thanks,
-Brian

> 
> > >>>  /**
> > >>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> > >>>   * @dlm: the display list manager
> > >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> index e0fdb145e6ed..f845607abc4c 100644
> > >>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> > >>>  					unsigned int prealloc);
> > >>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> > >>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> > >>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> > >>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> > >>>  
> > >>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> > >>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> > >>>  
> > >>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>> +				u32 reset_value);
> > >>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> > >>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> > >>>  
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Liviu Dudau March 6, 2019, 2:20 p.m. UTC | #7
On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> Hi Brian,
> 
> On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> > On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> > > On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> > >> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > >>> One-shot entries are used as an alternative to committing a complete new
> > >>> display list when a couple of registers need to be written for one frame
> > >>> and then reset to another value for all subsequent frames. This will be
> > >>> used to implement writeback support that will need to enable writeback
> > >>> for the duration of a single frame.
> > >>> 
> > >>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > >>> ---
> > >>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> > >>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> > >>>  2 files changed, 81 insertions(+)
> > >>> 
> > >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> index 886b3a69d329..7b4d252bfde7 100644
> > >>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> > >>>  
> > >>>  	unsigned int num_entries;
> > >>>  	unsigned int max_entries;
> > >>> +
> > >>> +	unsigned int num_patches;
> > >>> +	struct {
> > >>> +		struct vsp1_dl_entry *entry;
> > >>> +		u32 data;
> > >>> +	} patches[2];
> > >>>  };
> > >>>  
> > >>>  /**
> > >>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> > >>>  		return;
> > >>>  
> > >>>  	dlb->num_entries = 0;
> > >>> +	dlb->num_patches = 0;
> > >>>  
> > >>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> > >>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> > >>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> > >>>  	dlb->num_entries++;
> > >>>  }
> > >>>  
> > >>> +/**
> > >>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > >>> + *	single frame
> > >>> + * @dlb: The body
> > >>> + * @reg: The register address
> > >>> + * @value: The register value
> > >>> + * @reset_value: The value to reset the register to at the next vblank
> > >>> + *
> > >>> + * Display lists in continuous mode are re-used by the hardware for successive
> > >>> + * frames until a new display list is committed. Changing the VSP configuration
> > >>> + * normally requires creating and committing a new display list. This function
> > >>> + * offers an alternative race-free way by writing a @value to the @register in
> > >>> + * the display list body for a single frame, specifying in @reset_value the
> > >>> + * value to reset the register to one vblank after the display list is
> > >>> + * committed.
> > >>> + *
> > >>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> > >>> + * and one-shot entries are counted in the total number of entries specified
> > >>> + * when the body is allocated by vsp1_dl_body_alloc().
> > >>> + */
> > >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>> +				u32 reset_value)
> > >>> +{
> > >>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > >>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > >>> +		return;
> > >>> +
> > >>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > >>> +		      "DLB patches size exceeded (max %zu)",
> > >>> +		      ARRAY_SIZE(dlb->patches)))
> > >>> +		return;
> > >>> +
> > >>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > >>> +	dlb->patches[dlb->num_patches].data = reset_value;
> > >>> +	dlb->num_patches++;
> > >>> +
> > >>> +	dlb->entries[dlb->num_entries].addr = reg;
> > >>> +	dlb->entries[dlb->num_entries].data = value;
> > >>> +	dlb->num_entries++;
> > >>> +}
> > >>> +
> > >>>  /* -----------------------------------------------------------------------------
> > >>>   * Display List Extended Command Management
> > >>>   */
> > >>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> > >>>  	 * has at least one body, thus we reinitialise the entries list.
> > >>>  	 */
> > >>>  	dl->body0->num_entries = 0;
> > >>> +	dl->body0->num_patches = 0;
> > >>>  
> > >>>  	list_add_tail(&dl->list, &dl->dlm->free);
> > >>>  }
> > >>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> > >>>   * Display List Manager
> > >>>   */
> > >>>  
> > >>> +/**
> > >>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > >>> + *	interrupt
> > >>> + * @dlm: the display list manager
> > >>> + *
> > >>> + * Apply all one-shot patches registered for the active display list.
> > >>> + */
> > >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > >>> +{
> > >>> +	struct vsp1_dl_body *dlb;
> > >>> +	struct vsp1_dl_list *dl;
> > >>> +	unsigned int i;
> > >>> +
> > >>> +	spin_lock(&dlm->lock);
> > >>> +
> > >>> +	dl = dlm->active;
> > >>> +	if (!dl)
> > >>> +		goto done;
> > >>> +
> > >>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> > >>> +		for (i = 0; i < dlb->num_patches; ++i)
> > >>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > >>> +		dlb->num_patches = 0;
> > >>> +	}
> > >>> +
> > >>> +done:
> > >>> +	spin_unlock(&dlm->lock);
> > >>> +}
> > >>> +
> > >> 
> > >> We've got some HW which doesn't support one-shot writeback, and use a
> > >> similar trick to try and disable writeback immediately after the flip.
> > >> 
> > >> We ran into issues where the "start" interrupt wouldn't run in time to
> > >> make sure the writeback disable was committed before the next frame.
> > >> We have to keep track of whether the disable really happened in time,
> > >> before we release the output buffer.
> > >> 
> > >> Might you have a similar problem here?
> > > 
> > > We may, but there's no provision at the hardware level to check if the
> > > configuration updated happened in time. I could add some safety checks
> > > but I believe they would be racy in the best case :-(
> > 
> > We managed to find (what I believe to be...) a non-racy way, but it
> > will of course depend a lot on the HW behaviour, so I'll leave it to
> > your best judgement.
> > 
> > We basically have a "configuration committed" interrupt which we can
> > use to set a flag indicating writeback was disabled.
> 
> The way my hardware is operated is, roughly, as follows:
> 
> - The driver prepares a list of values to write to registers and stores
>   them in DMA-able memory. This doesn't involve the hardware, and so is
>   completely asynchronous to hardware operation.
> 
> - The driver then sets a register with the pointer to the registers
>   list.
> 
> - At the end of the current frame, the hardware reads the memory address
>   of the registers list and DMAs values to register. This operation is
>   fast and occurs fully during vertical blanking.
> 
> - The hardware then waits for the start of the frame, and begins
>   processing the framebuffers to send the frame to the display.
> 
> - If no new registers list is provided for the next frame, the current
>   list is reused.

Does this mean the hardware goes back to step 3 and re-reads the memory
using DMA?

> 
> Two interrupts are provided, one at the start of the frame and one at
> the end of the frame. The driver uses the end of frame interrupt to
> signal vertical blanking and page flip completion to DRM.
> 
> The end of frame interrupt is also used to schedule atomic commits. The
> hardware queue depth for atomic commits is just one, so the driver has
> to wait until the previous commit has been processed before writing the
> new registers list address to the hardware. To solve the race between
> the end of frame interrupt and the address write, the hardware provides
> a status bit that is set to 1 when the address is written, and reset to
> 0 when the hardware starts processing the registers list.

For my understanding, the following simplication is then true: status = 0
means driver is allowed to update the pointer register, status = 1 means
currently programmed pointer is used to fetch the register values, right?
What happens if you update the pointer while status = 1? DMA transfer gets
interrupted and a new one is scheduled? Does the DMA transfer get cancelled?

> 
> We thus have three states for an atomic commit:
> 
> - active, where the corresponding registers list address has been
>   written to the hardware, and processed
> 
> - queued, where the corresponding registers list address has been
>   written to the hardware but not processed yet
> 
> - pending, where the corresponding registers list address hasn't been
>   written to the hardware yet
> 
> The status bit mentioned above allows us to tell if a list exists in the
> queued state.
> 
> At frame end time, if the status bit is set, we have potentially lost
> the race between writing the new registers list and the frame end
> interrupt, so we wait for one more vblank. Otherwise, if a list was
> queued, we move it to the active state, and retire the active list. If a
> list was pending, we write its address to the hardware, and move it to
> the queued state.
> 
> To schedule writeback I ended up using the frame start interrupt.
> Writeback has a specific need in that it requires enabling the memory
> write interface for a single frame, and that is not something the
> hardware supports directly. I can't either queue a new registers list
> with the write interface disabled right after the list with the
> writeback request became active, as any atomic commit would then be
> delayed by an extra frame. I thus ended up modifying the registers list
> in place at frame start time, which is right after the active register
> list has been DMA'ed to hardware registers. We have the duration of a
> whole frame to do so, before the hardware transfers the same list again
> at the beginning of the next frame.

Yes, that is like to what we do for DP500 in mali-dp, where we have a
similar situation. HW will continue to stream to the given buffer until
stopped, but the writeback API is one-shot mode, so on vblank (when we
know the writeback will start at the beginning of the new frame) we
program the memwrite to stop, but that only takes effect when we also
program the GO bit (CONFIG_VALID in our case).

One problem we had to take care for DP500 was when the next commit (the
queued one in your case) also contains a writeback buffer. In that case,
we don't stop the writeback but "re-start" it.

> 
> There is a race condition there, as the in-place modification of the
> active list could be done after the end of the frame if the interrupt
> latency gets too large. I don't see how I could solve that, as I could
> write the value after the hardware starts processing the active list at
> frame end time, but before the interrupt is notified.

I think that would be a better idea if the HW finishes to transfer
before the end of frame interrupt is signalled. Otherwise if your
interrupt handler is scheduled too fast, you might overwrite the active
frame values?

Best regards,
Liviu

> 
> > > Note that we have the duration of a complete frame to disable writeback,
> > > as we receive an interrupt when the frame starts, and have until vblank
> > > to update the configuration. It's thus slightly better than having to
> > > disable writeback between vblank and the start of the next frame.
> > 
> > Yeah... we have a whole frame too. I'm struggling to find our wiki
> > page with the data, but anecdotally there's some (out-of-tree) drivers
> > which keep interrupts masked for a _really long_ time. It's nice if
> > you don't have to care about those :-)
> 
> I only provide two options: not my problem, or give me the source code
> and documentation and let me upstream a clean version of the out-of-tree
> drivers :-)
> 
> > >>>  /**
> > >>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> > >>>   * @dlm: the display list manager
> > >>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> index e0fdb145e6ed..f845607abc4c 100644
> > >>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> > >>>  					unsigned int prealloc);
> > >>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> > >>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > >>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> > >>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> > >>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> > >>>  
> > >>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> > >>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> > >>>  
> > >>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > >>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>> +				u32 reset_value);
> > >>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> > >>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> > >>>  
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Laurent Pinchart March 6, 2019, 6:01 p.m. UTC | #8
Hi Liviu,

On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> > On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> >> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> >>> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> >>>> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> >>>>> One-shot entries are used as an alternative to committing a complete new
> >>>>> display list when a couple of registers need to be written for one frame
> >>>>> and then reset to another value for all subsequent frames. This will be
> >>>>> used to implement writeback support that will need to enable writeback
> >>>>> for the duration of a single frame.
> >>>>> 
> >>>>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> >>>>> ---
> >>>>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> >>>>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> >>>>>  2 files changed, 81 insertions(+)
> >>>>> 
> >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> index 886b3a69d329..7b4d252bfde7 100644
> >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> >>>>>  
> >>>>>  	unsigned int num_entries;
> >>>>>  	unsigned int max_entries;
> >>>>> +
> >>>>> +	unsigned int num_patches;
> >>>>> +	struct {
> >>>>> +		struct vsp1_dl_entry *entry;
> >>>>> +		u32 data;
> >>>>> +	} patches[2];
> >>>>>  };
> >>>>>  
> >>>>>  /**
> >>>>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> >>>>>  		return;
> >>>>>  
> >>>>>  	dlb->num_entries = 0;
> >>>>> +	dlb->num_patches = 0;
> >>>>>  
> >>>>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> >>>>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> >>>>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> >>>>>  	dlb->num_entries++;
> >>>>>  }
> >>>>>  
> >>>>> +/**
> >>>>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> >>>>> + *	single frame
> >>>>> + * @dlb: The body
> >>>>> + * @reg: The register address
> >>>>> + * @value: The register value
> >>>>> + * @reset_value: The value to reset the register to at the next vblank
> >>>>> + *
> >>>>> + * Display lists in continuous mode are re-used by the hardware for successive
> >>>>> + * frames until a new display list is committed. Changing the VSP configuration
> >>>>> + * normally requires creating and committing a new display list. This function
> >>>>> + * offers an alternative race-free way by writing a @value to the @register in
> >>>>> + * the display list body for a single frame, specifying in @reset_value the
> >>>>> + * value to reset the register to one vblank after the display list is
> >>>>> + * committed.
> >>>>> + *
> >>>>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> >>>>> + * and one-shot entries are counted in the total number of entries specified
> >>>>> + * when the body is allocated by vsp1_dl_body_alloc().
> >>>>> + */
> >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>> +				u32 reset_value)
> >>>>> +{
> >>>>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> >>>>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> >>>>> +		return;
> >>>>> +
> >>>>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> >>>>> +		      "DLB patches size exceeded (max %zu)",
> >>>>> +		      ARRAY_SIZE(dlb->patches)))
> >>>>> +		return;
> >>>>> +
> >>>>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> >>>>> +	dlb->patches[dlb->num_patches].data = reset_value;
> >>>>> +	dlb->num_patches++;
> >>>>> +
> >>>>> +	dlb->entries[dlb->num_entries].addr = reg;
> >>>>> +	dlb->entries[dlb->num_entries].data = value;
> >>>>> +	dlb->num_entries++;
> >>>>> +}
> >>>>> +
> >>>>>  /* -----------------------------------------------------------------------------
> >>>>>   * Display List Extended Command Management
> >>>>>   */
> >>>>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> >>>>>  	 * has at least one body, thus we reinitialise the entries list.
> >>>>>  	 */
> >>>>>  	dl->body0->num_entries = 0;
> >>>>> +	dl->body0->num_patches = 0;
> >>>>>  
> >>>>>  	list_add_tail(&dl->list, &dl->dlm->free);
> >>>>>  }
> >>>>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> >>>>>   * Display List Manager
> >>>>>   */
> >>>>>  
> >>>>> +/**
> >>>>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> >>>>> + *	interrupt
> >>>>> + * @dlm: the display list manager
> >>>>> + *
> >>>>> + * Apply all one-shot patches registered for the active display list.
> >>>>> + */
> >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> >>>>> +{
> >>>>> +	struct vsp1_dl_body *dlb;
> >>>>> +	struct vsp1_dl_list *dl;
> >>>>> +	unsigned int i;
> >>>>> +
> >>>>> +	spin_lock(&dlm->lock);
> >>>>> +
> >>>>> +	dl = dlm->active;
> >>>>> +	if (!dl)
> >>>>> +		goto done;
> >>>>> +
> >>>>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> >>>>> +		for (i = 0; i < dlb->num_patches; ++i)
> >>>>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> >>>>> +		dlb->num_patches = 0;
> >>>>> +	}
> >>>>> +
> >>>>> +done:
> >>>>> +	spin_unlock(&dlm->lock);
> >>>>> +}
> >>>>> +
> >>>> 
> >>>> We've got some HW which doesn't support one-shot writeback, and use a
> >>>> similar trick to try and disable writeback immediately after the flip.
> >>>> 
> >>>> We ran into issues where the "start" interrupt wouldn't run in time to
> >>>> make sure the writeback disable was committed before the next frame.
> >>>> We have to keep track of whether the disable really happened in time,
> >>>> before we release the output buffer.
> >>>> 
> >>>> Might you have a similar problem here?
> >>> 
> >>> We may, but there's no provision at the hardware level to check if the
> >>> configuration updated happened in time. I could add some safety checks
> >>> but I believe they would be racy in the best case :-(
> >> 
> >> We managed to find (what I believe to be...) a non-racy way, but it
> >> will of course depend a lot on the HW behaviour, so I'll leave it to
> >> your best judgement.
> >> 
> >> We basically have a "configuration committed" interrupt which we can
> >> use to set a flag indicating writeback was disabled.
> > 
> > The way my hardware is operated is, roughly, as follows:
> > 
> > - The driver prepares a list of values to write to registers and stores
> >   them in DMA-able memory. This doesn't involve the hardware, and so is
> >   completely asynchronous to hardware operation.
> > 
> > - The driver then sets a register with the pointer to the registers
> >   list.
> > 
> > - At the end of the current frame, the hardware reads the memory address
> >   of the registers list and DMAs values to register. This operation is
> >   fast and occurs fully during vertical blanking.
> > 
> > - The hardware then waits for the start of the frame, and begins
> >   processing the framebuffers to send the frame to the display.
> > 
> > - If no new registers list is provided for the next frame, the current
> >   list is reused.
> 
> Does this mean the hardware goes back to step 3 and re-reads the memory
> using DMA?

That's correct. At frame end the hardware will always read and apply a
registers list, either a new one or the old one if no new list is
programmed.

> > Two interrupts are provided, one at the start of the frame and one at
> > the end of the frame. The driver uses the end of frame interrupt to
> > signal vertical blanking and page flip completion to DRM.
> > 
> > The end of frame interrupt is also used to schedule atomic commits. The
> > hardware queue depth for atomic commits is just one, so the driver has
> > to wait until the previous commit has been processed before writing the
> > new registers list address to the hardware. To solve the race between
> > the end of frame interrupt and the address write, the hardware provides
> > a status bit that is set to 1 when the address is written, and reset to
> > 0 when the hardware starts processing the registers list.
> 
> For my understanding, the following simplication is then true: status = 0
> means driver is allowed to update the pointer register, status = 1 means
> currently programmed pointer is used to fetch the register values, right?

Almost. status = 1 means that the pointer has been programmed by the
CPU, but not taken into account by the hardware yet. status = 0 means
that the pointer hasn't been written to by the CPU since the last time
the hardware read it.

> What happens if you update the pointer while status = 1? DMA transfer gets
> interrupted and a new one is scheduled? Does the DMA transfer get cancelled?

You should not do that, as it's racy. The hardware will use either the
old value of the pointer or new new one, based on the timings of the
reprogrammation, and you can't tell which one is used. Nothing can
cancel the DMA, when it's time to start the DMA the hardware will
transfer the value of the pointer to an internal register and use that
for DMA. Reprogramming the pointer during the DMA will not affect the
DMA, the new value will be taken into account for the next DMA.

> > We thus have three states for an atomic commit:
> > 
> > - active, where the corresponding registers list address has been
> >   written to the hardware, and processed
> > 
> > - queued, where the corresponding registers list address has been
> >   written to the hardware but not processed yet
> > 
> > - pending, where the corresponding registers list address hasn't been
> >   written to the hardware yet
> > 
> > The status bit mentioned above allows us to tell if a list exists in the
> > queued state.
> > 
> > At frame end time, if the status bit is set, we have potentially lost
> > the race between writing the new registers list and the frame end
> > interrupt, so we wait for one more vblank. Otherwise, if a list was
> > queued, we move it to the active state, and retire the active list. If a
> > list was pending, we write its address to the hardware, and move it to
> > the queued state.
> > 
> > To schedule writeback I ended up using the frame start interrupt.
> > Writeback has a specific need in that it requires enabling the memory
> > write interface for a single frame, and that is not something the
> > hardware supports directly. I can't either queue a new registers list
> > with the write interface disabled right after the list with the
> > writeback request became active, as any atomic commit would then be
> > delayed by an extra frame. I thus ended up modifying the registers list
> > in place at frame start time, which is right after the active register
> > list has been DMA'ed to hardware registers. We have the duration of a
> > whole frame to do so, before the hardware transfers the same list again
> > at the beginning of the next frame.
> 
> Yes, that is like to what we do for DP500 in mali-dp, where we have a
> similar situation. HW will continue to stream to the given buffer until
> stopped, but the writeback API is one-shot mode, so on vblank (when we
> know the writeback will start at the beginning of the new frame) we
> program the memwrite to stop, but that only takes effect when we also
> program the GO bit (CONFIG_VALID in our case).
> 
> One problem we had to take care for DP500 was when the next commit (the
> queued one in your case) also contains a writeback buffer. In that case,
> we don't stop the writeback but "re-start" it.
> 
> > There is a race condition there, as the in-place modification of the
> > active list could be done after the end of the frame if the interrupt
> > latency gets too large. I don't see how I could solve that, as I could
> > write the value after the hardware starts processing the active list at
> > frame end time, but before the interrupt is notified.
> 
> I think that would be a better idea if the HW finishes to transfer
> before the end of frame interrupt is signalled. Otherwise if your
> interrupt handler is scheduled too fast, you might overwrite the active
> frame values?

That can't happen, as I patch the active registers list in-place in the
frame start interrupt handler, and frame start occurs after the hardware
finishes the DMA of the registers list.

> >>> Note that we have the duration of a complete frame to disable writeback,
> >>> as we receive an interrupt when the frame starts, and have until vblank
> >>> to update the configuration. It's thus slightly better than having to
> >>> disable writeback between vblank and the start of the next frame.
> >> 
> >> Yeah... we have a whole frame too. I'm struggling to find our wiki
> >> page with the data, but anecdotally there's some (out-of-tree) drivers
> >> which keep interrupts masked for a _really long_ time. It's nice if
> >> you don't have to care about those :-)
> > 
> > I only provide two options: not my problem, or give me the source code
> > and documentation and let me upstream a clean version of the out-of-tree
> > drivers :-)
> > 
> >>>>>  /**
> >>>>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> >>>>>   * @dlm: the display list manager
> >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> index e0fdb145e6ed..f845607abc4c 100644
> >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> >>>>>  					unsigned int prealloc);
> >>>>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> >>>>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> >>>>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> >>>>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> >>>>>  
> >>>>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> >>>>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> >>>>>  
> >>>>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>> +				u32 reset_value);
> >>>>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> >>>>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> >>>>>
Laurent Pinchart March 6, 2019, 6:22 p.m. UTC | #9
Hi Brian,

On Wed, Mar 06, 2019 at 11:05:05AM +0000, Brian Starkey wrote:
> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> > On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> >> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> >>> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> >>>> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> >>>>> One-shot entries are used as an alternative to committing a complete new
> >>>>> display list when a couple of registers need to be written for one frame
> >>>>> and then reset to another value for all subsequent frames. This will be
> >>>>> used to implement writeback support that will need to enable writeback
> >>>>> for the duration of a single frame.
> >>>>> 
> >>>>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> >>>>> ---
> >>>>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> >>>>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> >>>>>  2 files changed, 81 insertions(+)
> >>>>> 
> >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> index 886b3a69d329..7b4d252bfde7 100644
> >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> >>>>>  
> >>>>>  	unsigned int num_entries;
> >>>>>  	unsigned int max_entries;
> >>>>> +
> >>>>> +	unsigned int num_patches;
> >>>>> +	struct {
> >>>>> +		struct vsp1_dl_entry *entry;
> >>>>> +		u32 data;
> >>>>> +	} patches[2];
> >>>>>  };
> >>>>>  
> >>>>>  /**
> >>>>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> >>>>>  		return;
> >>>>>  
> >>>>>  	dlb->num_entries = 0;
> >>>>> +	dlb->num_patches = 0;
> >>>>>  
> >>>>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> >>>>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> >>>>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> >>>>>  	dlb->num_entries++;
> >>>>>  }
> >>>>>  
> >>>>> +/**
> >>>>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> >>>>> + *	single frame
> >>>>> + * @dlb: The body
> >>>>> + * @reg: The register address
> >>>>> + * @value: The register value
> >>>>> + * @reset_value: The value to reset the register to at the next vblank
> >>>>> + *
> >>>>> + * Display lists in continuous mode are re-used by the hardware for successive
> >>>>> + * frames until a new display list is committed. Changing the VSP configuration
> >>>>> + * normally requires creating and committing a new display list. This function
> >>>>> + * offers an alternative race-free way by writing a @value to the @register in
> >>>>> + * the display list body for a single frame, specifying in @reset_value the
> >>>>> + * value to reset the register to one vblank after the display list is
> >>>>> + * committed.
> >>>>> + *
> >>>>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> >>>>> + * and one-shot entries are counted in the total number of entries specified
> >>>>> + * when the body is allocated by vsp1_dl_body_alloc().
> >>>>> + */
> >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>> +				u32 reset_value)
> >>>>> +{
> >>>>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> >>>>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> >>>>> +		return;
> >>>>> +
> >>>>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> >>>>> +		      "DLB patches size exceeded (max %zu)",
> >>>>> +		      ARRAY_SIZE(dlb->patches)))
> >>>>> +		return;
> >>>>> +
> >>>>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> >>>>> +	dlb->patches[dlb->num_patches].data = reset_value;
> >>>>> +	dlb->num_patches++;
> >>>>> +
> >>>>> +	dlb->entries[dlb->num_entries].addr = reg;
> >>>>> +	dlb->entries[dlb->num_entries].data = value;
> >>>>> +	dlb->num_entries++;
> >>>>> +}
> >>>>> +
> >>>>>  /* -----------------------------------------------------------------------------
> >>>>>   * Display List Extended Command Management
> >>>>>   */
> >>>>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> >>>>>  	 * has at least one body, thus we reinitialise the entries list.
> >>>>>  	 */
> >>>>>  	dl->body0->num_entries = 0;
> >>>>> +	dl->body0->num_patches = 0;
> >>>>>  
> >>>>>  	list_add_tail(&dl->list, &dl->dlm->free);
> >>>>>  }
> >>>>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> >>>>>   * Display List Manager
> >>>>>   */
> >>>>>  
> >>>>> +/**
> >>>>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> >>>>> + *	interrupt
> >>>>> + * @dlm: the display list manager
> >>>>> + *
> >>>>> + * Apply all one-shot patches registered for the active display list.
> >>>>> + */
> >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> >>>>> +{
> >>>>> +	struct vsp1_dl_body *dlb;
> >>>>> +	struct vsp1_dl_list *dl;
> >>>>> +	unsigned int i;
> >>>>> +
> >>>>> +	spin_lock(&dlm->lock);
> >>>>> +
> >>>>> +	dl = dlm->active;
> >>>>> +	if (!dl)
> >>>>> +		goto done;
> >>>>> +
> >>>>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> >>>>> +		for (i = 0; i < dlb->num_patches; ++i)
> >>>>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> >>>>> +		dlb->num_patches = 0;
> >>>>> +	}
> >>>>> +
> >>>>> +done:
> >>>>> +	spin_unlock(&dlm->lock);
> >>>>> +}
> >>>>> +
> >>>> 
> >>>> We've got some HW which doesn't support one-shot writeback, and use a
> >>>> similar trick to try and disable writeback immediately after the flip.
> >>>> 
> >>>> We ran into issues where the "start" interrupt wouldn't run in time to
> >>>> make sure the writeback disable was committed before the next frame.
> >>>> We have to keep track of whether the disable really happened in time,
> >>>> before we release the output buffer.
> >>>> 
> >>>> Might you have a similar problem here?
> >>> 
> >>> We may, but there's no provision at the hardware level to check if the
> >>> configuration updated happened in time. I could add some safety checks
> >>> but I believe they would be racy in the best case :-(
> >> 
> >> We managed to find (what I believe to be...) a non-racy way, but it
> >> will of course depend a lot on the HW behaviour, so I'll leave it to
> >> your best judgement.
> >> 
> >> We basically have a "configuration committed" interrupt which we can
> >> use to set a flag indicating writeback was disabled.
> > 
> > The way my hardware is operated is, roughly, as follows:
> 
> Thanks for the detailed explanation.
> 
> > - The driver prepares a list of values to write to registers and stores
> >   them in DMA-able memory. This doesn't involve the hardware, and so is
> >   completely asynchronous to hardware operation.
> > 
> > - The driver then sets a register with the pointer to the registers
> >   list.
> > 
> > - At the end of the current frame, the hardware reads the memory address
> >   of the registers list and DMAs values to register. This operation is
> >   fast and occurs fully during vertical blanking.
> > 
> > - The hardware then waits for the start of the frame, and begins
> >   processing the framebuffers to send the frame to the display.
> > 
> > - If no new registers list is provided for the next frame, the current
> >   list is reused.
> 
> Does it have to be fetched again, or the HW can use the copy that's in
> registers already? Does it have to fetch a register list every frame
> when the display is active, or you can stop it fetching somehow?

It wouldn't need to be fetched again, but the hardware always fetches
the a list at frame end, regardless of whether a new list is available
or not. I could replace the list with an empty one, but it wouldn't be
very useful.

> > Two interrupts are provided, one at the start of the frame and one at
> > the end of the frame. The driver uses the end of frame interrupt to
> > signal vertical blanking and page flip completion to DRM.
> 
> Just for my understanding - you signal the page flip at the point
> where the HW fetches the register list containing the scene being
> flipped to? Or you signal it after that scene has been on-screen for
> one frame?

I signal the flip when the hardware fetches the new list corresponding
to the frame being flipped to. At that point the addresses of the
previous framebuffers are still in the hardware registers, but they are
in the process of being overwritten with the new ones before the
hardware is started again, so I have a guarantee that the framebuffers
have been effectively flipped out.

> > The end of frame interrupt is also used to schedule atomic commits. The
> > hardware queue depth for atomic commits is just one, so the driver has
> > to wait until the previous commit has been processed before writing the
> > new registers list address to the hardware. To solve the race between
> > the end of frame interrupt and the address write, the hardware provides
> > a status bit that is set to 1 when the address is written, and reset to
> > 0 when the hardware starts processing the registers list.
> > 
> > We thus have three states for an atomic commit:
> > 
> > - active, where the corresponding registers list address has been
> >   written to the hardware, and processed
> > 
> > - queued, where the corresponding registers list address has been
> >   written to the hardware but not processed yet
> > 
> > - pending, where the corresponding registers list address hasn't been
> >   written to the hardware yet
> > 
> > The status bit mentioned above allows us to tell if a list exists in the
> > queued state.
> > 
> > At frame end time, if the status bit is set, we have potentially lost
> > the race between writing the new registers list and the frame end
> > interrupt, so we wait for one more vblank. Otherwise, if a list was
> > queued, we move it to the active state, and retire the active list. If a
> > list was pending, we write its address to the hardware, and move it to
> > the queued state.
> > 
> > To schedule writeback I ended up using the frame start interrupt.
> > Writeback has a specific need in that it requires enabling the memory
> > write interface for a single frame, and that is not something the
> > hardware supports directly. I can't either queue a new registers list
> > with the write interface disabled right after the list with the
> > writeback request became active, as any atomic commit would then be
> > delayed by an extra frame. I thus ended up modifying the registers list
> > in place at frame start time, which is right after the active register
> > list has been DMA'ed to hardware registers. We have the duration of a
> > whole frame to do so, before the hardware transfers the same list again
> > at the beginning of the next frame.
> > 
> > There is a race condition there, as the in-place modification of the
> > active list could be done after the end of the frame if the interrupt
> > latency gets too large. I don't see how I could solve that, as I could
> > write the value after the hardware starts processing the active list at
> > frame end time, but before the interrupt is notified.
> 
> So you can be totally safe by making a copy of the register list,
> disabling writeback there, and signalling writeback completion when
> the HW status bit tells you that new register list has become active.
> The issue is that it effectively halves the maximum update rate when
> writeback is enabled?

Correct. I don't think that's acceptable.

> It sounds like the HW doesn't really give you a way to "cancel" a
> queued register list, which is a bit unfortunate.

I can always queue a new one, but I have no way of telling if the newly
queued list raced with the frame end interrupt.

There's another register I'm not using that contains a shadow copy of
the registers list DMA address. It mirrors the address programmed by the
driver when there is no DMA of the registers list in progress, and
contains the address the of registers list being DMA'ed when a DMA is in
progress. I don't think I can do much with it, as reading it either
before or after reprogramming (to check if I can reprogram or if the
reprogram has been taken into account) is racy.

> You don't happen to have a DMA engine trigger or something you could
> use to do the register list modification at a guaranteed time do you?

Not that I know of, unfortunately.

> Are you always going to be protected by an IOMMU, preventing the
> writeback from trashing physical memory? If that's not the case, then
> the race can have pretty dire consequences.

If the IOMMU is enabled in DT, yes. It's a system-level decision.

> >>> Note that we have the duration of a complete frame to disable writeback,
> >>> as we receive an interrupt when the frame starts, and have until vblank
> >>> to update the configuration. It's thus slightly better than having to
> >>> disable writeback between vblank and the start of the next frame.
> >> 
> >> Yeah... we have a whole frame too. I'm struggling to find our wiki
> >> page with the data, but anecdotally there's some (out-of-tree) drivers
> >> which keep interrupts masked for a _really long_ time. It's nice if
> >> you don't have to care about those :-)
> > 
> > I only provide two options: not my problem, or give me the source code
> > and documentation and let me upstream a clean version of the out-of-tree
> > drivers :-)
> 
> This was a case of drivers "in-the-wild" - I don't know the specifics,
> but they weren't Arm drivers. I was just sharing as a data point.

All those pesky wild drivers that need taming... I've seen my fair share
of monstruosities in that area too. I've even come across with a system
full of hacks that allowed userspace to disable interrupts (I'm not
kidding you).

> >>>>>  /**
> >>>>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> >>>>>   * @dlm: the display list manager
> >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> index e0fdb145e6ed..f845607abc4c 100644
> >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> >>>>>  					unsigned int prealloc);
> >>>>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> >>>>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> >>>>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> >>>>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> >>>>>  
> >>>>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> >>>>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> >>>>>  
> >>>>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>> +				u32 reset_value);
> >>>>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> >>>>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> >>>>>
Liviu Dudau March 7, 2019, 11:52 a.m. UTC | #10
On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> Hi Liviu,
> 
> On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> > On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> > > On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> > >> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> > >>> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> > >>>> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > >>>>> One-shot entries are used as an alternative to committing a complete new
> > >>>>> display list when a couple of registers need to be written for one frame
> > >>>>> and then reset to another value for all subsequent frames. This will be
> > >>>>> used to implement writeback support that will need to enable writeback
> > >>>>> for the duration of a single frame.
> > >>>>> 
> > >>>>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > >>>>> ---
> > >>>>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> > >>>>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> > >>>>>  2 files changed, 81 insertions(+)
> > >>>>> 
> > >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>> index 886b3a69d329..7b4d252bfde7 100644
> > >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> > >>>>>  
> > >>>>>  	unsigned int num_entries;
> > >>>>>  	unsigned int max_entries;
> > >>>>> +
> > >>>>> +	unsigned int num_patches;
> > >>>>> +	struct {
> > >>>>> +		struct vsp1_dl_entry *entry;
> > >>>>> +		u32 data;
> > >>>>> +	} patches[2];
> > >>>>>  };
> > >>>>>  
> > >>>>>  /**
> > >>>>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> > >>>>>  		return;
> > >>>>>  
> > >>>>>  	dlb->num_entries = 0;
> > >>>>> +	dlb->num_patches = 0;
> > >>>>>  
> > >>>>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> > >>>>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> > >>>>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> > >>>>>  	dlb->num_entries++;
> > >>>>>  }
> > >>>>>  
> > >>>>> +/**
> > >>>>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > >>>>> + *	single frame
> > >>>>> + * @dlb: The body
> > >>>>> + * @reg: The register address
> > >>>>> + * @value: The register value
> > >>>>> + * @reset_value: The value to reset the register to at the next vblank
> > >>>>> + *
> > >>>>> + * Display lists in continuous mode are re-used by the hardware for successive
> > >>>>> + * frames until a new display list is committed. Changing the VSP configuration
> > >>>>> + * normally requires creating and committing a new display list. This function
> > >>>>> + * offers an alternative race-free way by writing a @value to the @register in
> > >>>>> + * the display list body for a single frame, specifying in @reset_value the
> > >>>>> + * value to reset the register to one vblank after the display list is
> > >>>>> + * committed.
> > >>>>> + *
> > >>>>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> > >>>>> + * and one-shot entries are counted in the total number of entries specified
> > >>>>> + * when the body is allocated by vsp1_dl_body_alloc().
> > >>>>> + */
> > >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>>>> +				u32 reset_value)
> > >>>>> +{
> > >>>>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > >>>>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > >>>>> +		return;
> > >>>>> +
> > >>>>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > >>>>> +		      "DLB patches size exceeded (max %zu)",
> > >>>>> +		      ARRAY_SIZE(dlb->patches)))
> > >>>>> +		return;
> > >>>>> +
> > >>>>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > >>>>> +	dlb->patches[dlb->num_patches].data = reset_value;
> > >>>>> +	dlb->num_patches++;
> > >>>>> +
> > >>>>> +	dlb->entries[dlb->num_entries].addr = reg;
> > >>>>> +	dlb->entries[dlb->num_entries].data = value;
> > >>>>> +	dlb->num_entries++;
> > >>>>> +}
> > >>>>> +
> > >>>>>  /* -----------------------------------------------------------------------------
> > >>>>>   * Display List Extended Command Management
> > >>>>>   */
> > >>>>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> > >>>>>  	 * has at least one body, thus we reinitialise the entries list.
> > >>>>>  	 */
> > >>>>>  	dl->body0->num_entries = 0;
> > >>>>> +	dl->body0->num_patches = 0;
> > >>>>>  
> > >>>>>  	list_add_tail(&dl->list, &dl->dlm->free);
> > >>>>>  }
> > >>>>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> > >>>>>   * Display List Manager
> > >>>>>   */
> > >>>>>  
> > >>>>> +/**
> > >>>>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > >>>>> + *	interrupt
> > >>>>> + * @dlm: the display list manager
> > >>>>> + *
> > >>>>> + * Apply all one-shot patches registered for the active display list.
> > >>>>> + */
> > >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > >>>>> +{
> > >>>>> +	struct vsp1_dl_body *dlb;
> > >>>>> +	struct vsp1_dl_list *dl;
> > >>>>> +	unsigned int i;
> > >>>>> +
> > >>>>> +	spin_lock(&dlm->lock);
> > >>>>> +
> > >>>>> +	dl = dlm->active;
> > >>>>> +	if (!dl)
> > >>>>> +		goto done;
> > >>>>> +
> > >>>>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> > >>>>> +		for (i = 0; i < dlb->num_patches; ++i)
> > >>>>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > >>>>> +		dlb->num_patches = 0;
> > >>>>> +	}
> > >>>>> +
> > >>>>> +done:
> > >>>>> +	spin_unlock(&dlm->lock);
> > >>>>> +}
> > >>>>> +
> > >>>> 
> > >>>> We've got some HW which doesn't support one-shot writeback, and use a
> > >>>> similar trick to try and disable writeback immediately after the flip.
> > >>>> 
> > >>>> We ran into issues where the "start" interrupt wouldn't run in time to
> > >>>> make sure the writeback disable was committed before the next frame.
> > >>>> We have to keep track of whether the disable really happened in time,
> > >>>> before we release the output buffer.
> > >>>> 
> > >>>> Might you have a similar problem here?
> > >>> 
> > >>> We may, but there's no provision at the hardware level to check if the
> > >>> configuration updated happened in time. I could add some safety checks
> > >>> but I believe they would be racy in the best case :-(
> > >> 
> > >> We managed to find (what I believe to be...) a non-racy way, but it
> > >> will of course depend a lot on the HW behaviour, so I'll leave it to
> > >> your best judgement.
> > >> 
> > >> We basically have a "configuration committed" interrupt which we can
> > >> use to set a flag indicating writeback was disabled.
> > > 
> > > The way my hardware is operated is, roughly, as follows:
> > > 
> > > - The driver prepares a list of values to write to registers and stores
> > >   them in DMA-able memory. This doesn't involve the hardware, and so is
> > >   completely asynchronous to hardware operation.
> > > 
> > > - The driver then sets a register with the pointer to the registers
> > >   list.
> > > 
> > > - At the end of the current frame, the hardware reads the memory address
> > >   of the registers list and DMAs values to register. This operation is
> > >   fast and occurs fully during vertical blanking.
> > > 
> > > - The hardware then waits for the start of the frame, and begins
> > >   processing the framebuffers to send the frame to the display.
> > > 
> > > - If no new registers list is provided for the next frame, the current
> > >   list is reused.
> > 
> > Does this mean the hardware goes back to step 3 and re-reads the memory
> > using DMA?
> 
> That's correct. At frame end the hardware will always read and apply a
> registers list, either a new one or the old one if no new list is
> programmed.
> 
> > > Two interrupts are provided, one at the start of the frame and one at
> > > the end of the frame. The driver uses the end of frame interrupt to
> > > signal vertical blanking and page flip completion to DRM.
> > > 
> > > The end of frame interrupt is also used to schedule atomic commits. The
> > > hardware queue depth for atomic commits is just one, so the driver has
> > > to wait until the previous commit has been processed before writing the
> > > new registers list address to the hardware. To solve the race between
> > > the end of frame interrupt and the address write, the hardware provides
> > > a status bit that is set to 1 when the address is written, and reset to
> > > 0 when the hardware starts processing the registers list.
> > 
> > For my understanding, the following simplication is then true: status = 0
> > means driver is allowed to update the pointer register, status = 1 means
> > currently programmed pointer is used to fetch the register values, right?
> 
> Almost. status = 1 means that the pointer has been programmed by the
> CPU, but not taken into account by the hardware yet. status = 0 means
> that the pointer hasn't been written to by the CPU since the last time
> the hardware read it.

Right, so CPU writes to the pointer register and that sets status to 1,
and when the HW has copied that value into internal DMA registers it clears
it, correct?

> 
> > What happens if you update the pointer while status = 1? DMA transfer gets
> > interrupted and a new one is scheduled? Does the DMA transfer get cancelled?
> 
> You should not do that, as it's racy. The hardware will use either the
> old value of the pointer or new new one, based on the timings of the
> reprogrammation, and you can't tell which one is used. Nothing can
> cancel the DMA, when it's time to start the DMA the hardware will
> transfer the value of the pointer to an internal register and use that
> for DMA. Reprogramming the pointer during the DMA will not affect the
> DMA, the new value will be taken into account for the next DMA.

It looks to me like you can do the in-place update of the writeback
framebuffer address at the end of the current frame (vblank time(?)). If
you wait until status = 0 then you know the next frame parameters are
being "internalised", so you can set in the commit you're about to queue
the disabling of the writeback if that commit doesn't have a fresh
writeback request (phew, that's a mouthful). I don't think you need to
wait until the next start of frame interrupt to do that. Also, vblank
time is probably the time you want to signal the completion of the
previous writeback anyway, right?

> 
> > > We thus have three states for an atomic commit:
> > > 
> > > - active, where the corresponding registers list address has been
> > >   written to the hardware, and processed
> > > 
> > > - queued, where the corresponding registers list address has been
> > >   written to the hardware but not processed yet
> > > 
> > > - pending, where the corresponding registers list address hasn't been
> > >   written to the hardware yet
> > > 
> > > The status bit mentioned above allows us to tell if a list exists in the
> > > queued state.
> > > 
> > > At frame end time, if the status bit is set, we have potentially lost
> > > the race between writing the new registers list and the frame end
> > > interrupt, so we wait for one more vblank. Otherwise, if a list was
> > > queued, we move it to the active state, and retire the active list. If a
> > > list was pending, we write its address to the hardware, and move it to
> > > the queued state.

It looks to me like the moving of the pending state into queued state is your
opportunity to also in-place modify the writeback registers if the state does
not have its own writeback request.

Best regards,
Liviu

> > > 
> > > To schedule writeback I ended up using the frame start interrupt.
> > > Writeback has a specific need in that it requires enabling the memory
> > > write interface for a single frame, and that is not something the
> > > hardware supports directly. I can't either queue a new registers list
> > > with the write interface disabled right after the list with the
> > > writeback request became active, as any atomic commit would then be
> > > delayed by an extra frame. I thus ended up modifying the registers list
> > > in place at frame start time, which is right after the active register
> > > list has been DMA'ed to hardware registers. We have the duration of a
> > > whole frame to do so, before the hardware transfers the same list again
> > > at the beginning of the next frame.
> > 
> > Yes, that is like to what we do for DP500 in mali-dp, where we have a
> > similar situation. HW will continue to stream to the given buffer until
> > stopped, but the writeback API is one-shot mode, so on vblank (when we
> > know the writeback will start at the beginning of the new frame) we
> > program the memwrite to stop, but that only takes effect when we also
> > program the GO bit (CONFIG_VALID in our case).
> > 
> > One problem we had to take care for DP500 was when the next commit (the
> > queued one in your case) also contains a writeback buffer. In that case,
> > we don't stop the writeback but "re-start" it.
> > 
> > > There is a race condition there, as the in-place modification of the
> > > active list could be done after the end of the frame if the interrupt
> > > latency gets too large. I don't see how I could solve that, as I could
> > > write the value after the hardware starts processing the active list at
> > > frame end time, but before the interrupt is notified.
> > 
> > I think that would be a better idea if the HW finishes to transfer
> > before the end of frame interrupt is signalled. Otherwise if your
> > interrupt handler is scheduled too fast, you might overwrite the active
> > frame values?
> 
> That can't happen, as I patch the active registers list in-place in the
> frame start interrupt handler, and frame start occurs after the hardware
> finishes the DMA of the registers list.
> 
> > >>> Note that we have the duration of a complete frame to disable writeback,
> > >>> as we receive an interrupt when the frame starts, and have until vblank
> > >>> to update the configuration. It's thus slightly better than having to
> > >>> disable writeback between vblank and the start of the next frame.
> > >> 
> > >> Yeah... we have a whole frame too. I'm struggling to find our wiki
> > >> page with the data, but anecdotally there's some (out-of-tree) drivers
> > >> which keep interrupts masked for a _really long_ time. It's nice if
> > >> you don't have to care about those :-)
> > > 
> > > I only provide two options: not my problem, or give me the source code
> > > and documentation and let me upstream a clean version of the out-of-tree
> > > drivers :-)
> > > 
> > >>>>>  /**
> > >>>>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> > >>>>>   * @dlm: the display list manager
> > >>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>> index e0fdb145e6ed..f845607abc4c 100644
> > >>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> > >>>>>  					unsigned int prealloc);
> > >>>>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> > >>>>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > >>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> > >>>>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> > >>>>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> > >>>>>  
> > >>>>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> > >>>>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> > >>>>>  
> > >>>>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > >>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>>>> +				u32 reset_value);
> > >>>>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> > >>>>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> > >>>>>  
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Brian Starkey March 7, 2019, 12:28 p.m. UTC | #11
On Wed, Mar 06, 2019 at 08:22:44PM +0200, Laurent Pinchart wrote:

[snip]

> 
> I can always queue a new one, but I have no way of telling if the newly
> queued list raced with the frame end interrupt.
> 
> There's another register I'm not using that contains a shadow copy of
> the registers list DMA address. It mirrors the address programmed by the
> driver when there is no DMA of the registers list in progress, and
> contains the address the of registers list being DMA'ed when a DMA is in
> progress. I don't think I can do much with it, as reading it either
> before or after reprogramming (to check if I can reprogram or if the
> reprogram has been taken into account) is racy.
> 

When you say it mirrors the address programmed, is that latched when
the update is accepted, or it will update the moment you program the
address register?

I assume the latter or you would have thought of this yourself (that
seems like a really strange/not-useful behaviour!). But if it is the
former you could:

 - In writeback start-of-frame, create a copy of the register list,
   disabling writeback.
 - Write the address of this copy to the register

If/when an atomic commit comes in before you service the next
end-of-frame:

 - Write the address of the new register list
 - Check the status register. If the "pending" bit is zero, you know
   you had a potential race.
    - Check the DMA address register. If it corresponds to the new
      scene, the new commit won the race, otherwise it's been delayed
      by a frame.

> > You don't happen to have a DMA engine trigger or something you could
> > use to do the register list modification at a guaranteed time do you?
> 
> Not that I know of, unfortunately.
> 
> > Are you always going to be protected by an IOMMU, preventing the
> > writeback from trashing physical memory? If that's not the case, then
> > the race can have pretty dire consequences.
> 
> If the IOMMU is enabled in DT, yes. It's a system-level decision.
> 

Well, it's your driver at the end of the day. But for me, a known
race-condition which would cause random memory corruption sounds like
a really Bad Thing. Halving frame-rate on systems with no IOMMU seems
preferable to me.

Cheers,
-Brian
Laurent Pinchart March 7, 2019, 1:48 p.m. UTC | #12
Hi Liviu,

On Thu, Mar 07, 2019 at 11:52:18AM +0000, Liviu Dudau wrote:
> On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> > On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> >> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> >>> On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> >>>> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> >>>>> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> >>>>>> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> >>>>>>> One-shot entries are used as an alternative to committing a complete new
> >>>>>>> display list when a couple of registers need to be written for one frame
> >>>>>>> and then reset to another value for all subsequent frames. This will be
> >>>>>>> used to implement writeback support that will need to enable writeback
> >>>>>>> for the duration of a single frame.
> >>>>>>> 
> >>>>>>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> >>>>>>> ---
> >>>>>>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> >>>>>>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> >>>>>>>  2 files changed, 81 insertions(+)
> >>>>>>> 
> >>>>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>>>> index 886b3a69d329..7b4d252bfde7 100644
> >>>>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> >>>>>>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> >>>>>>>  
> >>>>>>>  	unsigned int num_entries;
> >>>>>>>  	unsigned int max_entries;
> >>>>>>> +
> >>>>>>> +	unsigned int num_patches;
> >>>>>>> +	struct {
> >>>>>>> +		struct vsp1_dl_entry *entry;
> >>>>>>> +		u32 data;
> >>>>>>> +	} patches[2];
> >>>>>>>  };
> >>>>>>>  
> >>>>>>>  /**
> >>>>>>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> >>>>>>>  		return;
> >>>>>>>  
> >>>>>>>  	dlb->num_entries = 0;
> >>>>>>> +	dlb->num_patches = 0;
> >>>>>>>  
> >>>>>>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> >>>>>>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> >>>>>>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> >>>>>>>  	dlb->num_entries++;
> >>>>>>>  }
> >>>>>>>  
> >>>>>>> +/**
> >>>>>>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> >>>>>>> + *	single frame
> >>>>>>> + * @dlb: The body
> >>>>>>> + * @reg: The register address
> >>>>>>> + * @value: The register value
> >>>>>>> + * @reset_value: The value to reset the register to at the next vblank
> >>>>>>> + *
> >>>>>>> + * Display lists in continuous mode are re-used by the hardware for successive
> >>>>>>> + * frames until a new display list is committed. Changing the VSP configuration
> >>>>>>> + * normally requires creating and committing a new display list. This function
> >>>>>>> + * offers an alternative race-free way by writing a @value to the @register in
> >>>>>>> + * the display list body for a single frame, specifying in @reset_value the
> >>>>>>> + * value to reset the register to one vblank after the display list is
> >>>>>>> + * committed.
> >>>>>>> + *
> >>>>>>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> >>>>>>> + * and one-shot entries are counted in the total number of entries specified
> >>>>>>> + * when the body is allocated by vsp1_dl_body_alloc().
> >>>>>>> + */
> >>>>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>>>> +				u32 reset_value)
> >>>>>>> +{
> >>>>>>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> >>>>>>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> >>>>>>> +		return;
> >>>>>>> +
> >>>>>>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> >>>>>>> +		      "DLB patches size exceeded (max %zu)",
> >>>>>>> +		      ARRAY_SIZE(dlb->patches)))
> >>>>>>> +		return;
> >>>>>>> +
> >>>>>>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> >>>>>>> +	dlb->patches[dlb->num_patches].data = reset_value;
> >>>>>>> +	dlb->num_patches++;
> >>>>>>> +
> >>>>>>> +	dlb->entries[dlb->num_entries].addr = reg;
> >>>>>>> +	dlb->entries[dlb->num_entries].data = value;
> >>>>>>> +	dlb->num_entries++;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>  /* -----------------------------------------------------------------------------
> >>>>>>>   * Display List Extended Command Management
> >>>>>>>   */
> >>>>>>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> >>>>>>>  	 * has at least one body, thus we reinitialise the entries list.
> >>>>>>>  	 */
> >>>>>>>  	dl->body0->num_entries = 0;
> >>>>>>> +	dl->body0->num_patches = 0;
> >>>>>>>  
> >>>>>>>  	list_add_tail(&dl->list, &dl->dlm->free);
> >>>>>>>  }
> >>>>>>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> >>>>>>>   * Display List Manager
> >>>>>>>   */
> >>>>>>>  
> >>>>>>> +/**
> >>>>>>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> >>>>>>> + *	interrupt
> >>>>>>> + * @dlm: the display list manager
> >>>>>>> + *
> >>>>>>> + * Apply all one-shot patches registered for the active display list.
> >>>>>>> + */
> >>>>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> >>>>>>> +{
> >>>>>>> +	struct vsp1_dl_body *dlb;
> >>>>>>> +	struct vsp1_dl_list *dl;
> >>>>>>> +	unsigned int i;
> >>>>>>> +
> >>>>>>> +	spin_lock(&dlm->lock);
> >>>>>>> +
> >>>>>>> +	dl = dlm->active;
> >>>>>>> +	if (!dl)
> >>>>>>> +		goto done;
> >>>>>>> +
> >>>>>>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> >>>>>>> +		for (i = 0; i < dlb->num_patches; ++i)
> >>>>>>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> >>>>>>> +		dlb->num_patches = 0;
> >>>>>>> +	}
> >>>>>>> +
> >>>>>>> +done:
> >>>>>>> +	spin_unlock(&dlm->lock);
> >>>>>>> +}
> >>>>>>> +
> >>>>>> 
> >>>>>> We've got some HW which doesn't support one-shot writeback, and use a
> >>>>>> similar trick to try and disable writeback immediately after the flip.
> >>>>>> 
> >>>>>> We ran into issues where the "start" interrupt wouldn't run in time to
> >>>>>> make sure the writeback disable was committed before the next frame.
> >>>>>> We have to keep track of whether the disable really happened in time,
> >>>>>> before we release the output buffer.
> >>>>>> 
> >>>>>> Might you have a similar problem here?
> >>>>> 
> >>>>> We may, but there's no provision at the hardware level to check if the
> >>>>> configuration updated happened in time. I could add some safety checks
> >>>>> but I believe they would be racy in the best case :-(
> >>>> 
> >>>> We managed to find (what I believe to be...) a non-racy way, but it
> >>>> will of course depend a lot on the HW behaviour, so I'll leave it to
> >>>> your best judgement.
> >>>> 
> >>>> We basically have a "configuration committed" interrupt which we can
> >>>> use to set a flag indicating writeback was disabled.
> >>> 
> >>> The way my hardware is operated is, roughly, as follows:
> >>> 
> >>> - The driver prepares a list of values to write to registers and stores
> >>>   them in DMA-able memory. This doesn't involve the hardware, and so is
> >>>   completely asynchronous to hardware operation.
> >>> 
> >>> - The driver then sets a register with the pointer to the registers
> >>>   list.
> >>> 
> >>> - At the end of the current frame, the hardware reads the memory address
> >>>   of the registers list and DMAs values to register. This operation is
> >>>   fast and occurs fully during vertical blanking.
> >>> 
> >>> - The hardware then waits for the start of the frame, and begins
> >>>   processing the framebuffers to send the frame to the display.
> >>> 
> >>> - If no new registers list is provided for the next frame, the current
> >>>   list is reused.
> >> 
> >> Does this mean the hardware goes back to step 3 and re-reads the memory
> >> using DMA?
> > 
> > That's correct. At frame end the hardware will always read and apply a
> > registers list, either a new one or the old one if no new list is
> > programmed.
> > 
> >>> Two interrupts are provided, one at the start of the frame and one at
> >>> the end of the frame. The driver uses the end of frame interrupt to
> >>> signal vertical blanking and page flip completion to DRM.
> >>> 
> >>> The end of frame interrupt is also used to schedule atomic commits. The
> >>> hardware queue depth for atomic commits is just one, so the driver has
> >>> to wait until the previous commit has been processed before writing the
> >>> new registers list address to the hardware. To solve the race between
> >>> the end of frame interrupt and the address write, the hardware provides
> >>> a status bit that is set to 1 when the address is written, and reset to
> >>> 0 when the hardware starts processing the registers list.
> >> 
> >> For my understanding, the following simplication is then true: status = 0
> >> means driver is allowed to update the pointer register, status = 1 means
> >> currently programmed pointer is used to fetch the register values, right?
> > 
> > Almost. status = 1 means that the pointer has been programmed by the
> > CPU, but not taken into account by the hardware yet. status = 0 means
> > that the pointer hasn't been written to by the CPU since the last time
> > the hardware read it.
> 
> Right, so CPU writes to the pointer register and that sets status to 1,
> and when the HW has copied that value into internal DMA registers it clears
> it, correct?

Yes, that's correct.

> >> What happens if you update the pointer while status = 1? DMA transfer gets
> >> interrupted and a new one is scheduled? Does the DMA transfer get cancelled?
> > 
> > You should not do that, as it's racy. The hardware will use either the
> > old value of the pointer or new new one, based on the timings of the
> > reprogrammation, and you can't tell which one is used. Nothing can
> > cancel the DMA, when it's time to start the DMA the hardware will
> > transfer the value of the pointer to an internal register and use that
> > for DMA. Reprogramming the pointer during the DMA will not affect the
> > DMA, the new value will be taken into account for the next DMA.
> 
> It looks to me like you can do the in-place update of the writeback
> framebuffer address at the end of the current frame (vblank time(?)). If
> you wait until status = 0 then you know the next frame parameters are
> being "internalised", so you can set in the commit you're about to queue
> the disabling of the writeback if that commit doesn't have a fresh
> writeback request (phew, that's a mouthful).

My problem is that there may not be a next commit, if userspace hasn't
queued one. Otherwise this is what I would do. My initial implementation
was queuing two commits for writeback, one with writeback enabled, and
immediately after a second one copied from the first but with writeback
disabled. This halved the frame rate, as the next commit from userspace
had to first wait for completion of the writeback disabling commit. The
real issue here is that I would need to queue the writeback disabling
commit right after the writeback enabling commit to make sure it gets
processed for the next frame, but once its queued any userspace commit
would need to wait one extra frame as a queued commit can be processed
at any time and thus can't be replaced. That's why I decided to modify
the registers list in place instead of queueing a new commit to disable
writeback.

> I don't think you need to wait until the next start of frame interrupt
> to do that. Also, vblank time is probably the time you want to signal
> the completion of the previous writeback anyway, right?

Correct, that's when I signal it.

> >>> We thus have three states for an atomic commit:
> >>> 
> >>> - active, where the corresponding registers list address has been
> >>>   written to the hardware, and processed
> >>> 
> >>> - queued, where the corresponding registers list address has been
> >>>   written to the hardware but not processed yet
> >>> 
> >>> - pending, where the corresponding registers list address hasn't been
> >>>   written to the hardware yet
> >>> 
> >>> The status bit mentioned above allows us to tell if a list exists in the
> >>> queued state.
> >>> 
> >>> At frame end time, if the status bit is set, we have potentially lost
> >>> the race between writing the new registers list and the frame end
> >>> interrupt, so we wait for one more vblank. Otherwise, if a list was
> >>> queued, we move it to the active state, and retire the active list. If a
> >>> list was pending, we write its address to the hardware, and move it to
> >>> the queued state.
> 
> It looks to me like the moving of the pending state into queued state is your
> opportunity to also in-place modify the writeback registers if the state does
> not have its own writeback request.

Moving from pending to queued means the pointer has been given to the
hardware, but not processed yet. I need to wait until the commit that
enables writeback is fully processed before modifying it in-place to
disable writeback, and that's at the frame start following the move from
the queued state to the active state.

> >>> To schedule writeback I ended up using the frame start interrupt.
> >>> Writeback has a specific need in that it requires enabling the memory
> >>> write interface for a single frame, and that is not something the
> >>> hardware supports directly. I can't either queue a new registers list
> >>> with the write interface disabled right after the list with the
> >>> writeback request became active, as any atomic commit would then be
> >>> delayed by an extra frame. I thus ended up modifying the registers list
> >>> in place at frame start time, which is right after the active register
> >>> list has been DMA'ed to hardware registers. We have the duration of a
> >>> whole frame to do so, before the hardware transfers the same list again
> >>> at the beginning of the next frame.
> >> 
> >> Yes, that is like to what we do for DP500 in mali-dp, where we have a
> >> similar situation. HW will continue to stream to the given buffer until
> >> stopped, but the writeback API is one-shot mode, so on vblank (when we
> >> know the writeback will start at the beginning of the new frame) we
> >> program the memwrite to stop, but that only takes effect when we also
> >> program the GO bit (CONFIG_VALID in our case).
> >> 
> >> One problem we had to take care for DP500 was when the next commit (the
> >> queued one in your case) also contains a writeback buffer. In that case,
> >> we don't stop the writeback but "re-start" it.
> >> 
> >>> There is a race condition there, as the in-place modification of the
> >>> active list could be done after the end of the frame if the interrupt
> >>> latency gets too large. I don't see how I could solve that, as I could
> >>> write the value after the hardware starts processing the active list at
> >>> frame end time, but before the interrupt is notified.
> >> 
> >> I think that would be a better idea if the HW finishes to transfer
> >> before the end of frame interrupt is signalled. Otherwise if your
> >> interrupt handler is scheduled too fast, you might overwrite the active
> >> frame values?
> > 
> > That can't happen, as I patch the active registers list in-place in the
> > frame start interrupt handler, and frame start occurs after the hardware
> > finishes the DMA of the registers list.
> > 
> >>>>> Note that we have the duration of a complete frame to disable writeback,
> >>>>> as we receive an interrupt when the frame starts, and have until vblank
> >>>>> to update the configuration. It's thus slightly better than having to
> >>>>> disable writeback between vblank and the start of the next frame.
> >>>> 
> >>>> Yeah... we have a whole frame too. I'm struggling to find our wiki
> >>>> page with the data, but anecdotally there's some (out-of-tree) drivers
> >>>> which keep interrupts masked for a _really long_ time. It's nice if
> >>>> you don't have to care about those :-)
> >>> 
> >>> I only provide two options: not my problem, or give me the source code
> >>> and documentation and let me upstream a clean version of the out-of-tree
> >>> drivers :-)
> >>> 
> >>>>>>>  /**
> >>>>>>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> >>>>>>>   * @dlm: the display list manager
> >>>>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>>>> index e0fdb145e6ed..f845607abc4c 100644
> >>>>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> >>>>>>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> >>>>>>>  					unsigned int prealloc);
> >>>>>>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> >>>>>>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> >>>>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> >>>>>>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> >>>>>>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> >>>>>>>  
> >>>>>>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> >>>>>>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> >>>>>>>  
> >>>>>>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> >>>>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> >>>>>>> +				u32 reset_value);
> >>>>>>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> >>>>>>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> >>>>>>>
Liviu Dudau March 7, 2019, 4:31 p.m. UTC | #13
On Thu, Mar 07, 2019 at 03:48:23PM +0200, Laurent Pinchart wrote:
> Hi Liviu,
> 
> On Thu, Mar 07, 2019 at 11:52:18AM +0000, Liviu Dudau wrote:
> > On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> > > On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> > >> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> > >>> On Fri, Feb 22, 2019 at 03:06:19PM +0000, Brian Starkey wrote:
> > >>>> On Fri, Feb 22, 2019 at 04:46:29PM +0200, Laurent Pinchart wrote:
> > >>>>> On Fri, Feb 22, 2019 at 02:30:03PM +0000, Brian Starkey wrote:
> > >>>>>> On Thu, Feb 21, 2019 at 12:32:00PM +0200, Laurent Pinchart wrote:
> > >>>>>>> One-shot entries are used as an alternative to committing a complete new
> > >>>>>>> display list when a couple of registers need to be written for one frame
> > >>>>>>> and then reset to another value for all subsequent frames. This will be
> > >>>>>>> used to implement writeback support that will need to enable writeback
> > >>>>>>> for the duration of a single frame.
> > >>>>>>> 
> > >>>>>>> Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
> > >>>>>>> ---
> > >>>>>>>  drivers/media/platform/vsp1/vsp1_dl.c | 78 +++++++++++++++++++++++++++
> > >>>>>>>  drivers/media/platform/vsp1/vsp1_dl.h |  3 ++
> > >>>>>>>  2 files changed, 81 insertions(+)
> > >>>>>>> 
> > >>>>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>>>> index 886b3a69d329..7b4d252bfde7 100644
> > >>>>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.c
> > >>>>>>> @@ -115,6 +115,12 @@ struct vsp1_dl_body {
> > >>>>>>>  
> > >>>>>>>  	unsigned int num_entries;
> > >>>>>>>  	unsigned int max_entries;
> > >>>>>>> +
> > >>>>>>> +	unsigned int num_patches;
> > >>>>>>> +	struct {
> > >>>>>>> +		struct vsp1_dl_entry *entry;
> > >>>>>>> +		u32 data;
> > >>>>>>> +	} patches[2];
> > >>>>>>>  };
> > >>>>>>>  
> > >>>>>>>  /**
> > >>>>>>> @@ -361,6 +367,7 @@ void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
> > >>>>>>>  		return;
> > >>>>>>>  
> > >>>>>>>  	dlb->num_entries = 0;
> > >>>>>>> +	dlb->num_patches = 0;
> > >>>>>>>  
> > >>>>>>>  	spin_lock_irqsave(&dlb->pool->lock, flags);
> > >>>>>>>  	list_add_tail(&dlb->free, &dlb->pool->free);
> > >>>>>>> @@ -388,6 +395,47 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
> > >>>>>>>  	dlb->num_entries++;
> > >>>>>>>  }
> > >>>>>>>  
> > >>>>>>> +/**
> > >>>>>>> + * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
> > >>>>>>> + *	single frame
> > >>>>>>> + * @dlb: The body
> > >>>>>>> + * @reg: The register address
> > >>>>>>> + * @value: The register value
> > >>>>>>> + * @reset_value: The value to reset the register to at the next vblank
> > >>>>>>> + *
> > >>>>>>> + * Display lists in continuous mode are re-used by the hardware for successive
> > >>>>>>> + * frames until a new display list is committed. Changing the VSP configuration
> > >>>>>>> + * normally requires creating and committing a new display list. This function
> > >>>>>>> + * offers an alternative race-free way by writing a @value to the @register in
> > >>>>>>> + * the display list body for a single frame, specifying in @reset_value the
> > >>>>>>> + * value to reset the register to one vblank after the display list is
> > >>>>>>> + * committed.
> > >>>>>>> + *
> > >>>>>>> + * The maximum number of one-shot entries is limited to 2 per display list body,
> > >>>>>>> + * and one-shot entries are counted in the total number of entries specified
> > >>>>>>> + * when the body is allocated by vsp1_dl_body_alloc().
> > >>>>>>> + */
> > >>>>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>>>>>> +				u32 reset_value)
> > >>>>>>> +{
> > >>>>>>> +	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
> > >>>>>>> +		      "DLB size exceeded (max %u)", dlb->max_entries))
> > >>>>>>> +		return;
> > >>>>>>> +
> > >>>>>>> +	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
> > >>>>>>> +		      "DLB patches size exceeded (max %zu)",
> > >>>>>>> +		      ARRAY_SIZE(dlb->patches)))
> > >>>>>>> +		return;
> > >>>>>>> +
> > >>>>>>> +	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
> > >>>>>>> +	dlb->patches[dlb->num_patches].data = reset_value;
> > >>>>>>> +	dlb->num_patches++;
> > >>>>>>> +
> > >>>>>>> +	dlb->entries[dlb->num_entries].addr = reg;
> > >>>>>>> +	dlb->entries[dlb->num_entries].data = value;
> > >>>>>>> +	dlb->num_entries++;
> > >>>>>>> +}
> > >>>>>>> +
> > >>>>>>>  /* -----------------------------------------------------------------------------
> > >>>>>>>   * Display List Extended Command Management
> > >>>>>>>   */
> > >>>>>>> @@ -652,6 +700,7 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
> > >>>>>>>  	 * has at least one body, thus we reinitialise the entries list.
> > >>>>>>>  	 */
> > >>>>>>>  	dl->body0->num_entries = 0;
> > >>>>>>> +	dl->body0->num_patches = 0;
> > >>>>>>>  
> > >>>>>>>  	list_add_tail(&dl->list, &dl->dlm->free);
> > >>>>>>>  }
> > >>>>>>> @@ -930,6 +979,35 @@ void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
> > >>>>>>>   * Display List Manager
> > >>>>>>>   */
> > >>>>>>>  
> > >>>>>>> +/**
> > >>>>>>> + * vsp1_dlm_irq_display_start - Display list handler for the display start
> > >>>>>>> + *	interrupt
> > >>>>>>> + * @dlm: the display list manager
> > >>>>>>> + *
> > >>>>>>> + * Apply all one-shot patches registered for the active display list.
> > >>>>>>> + */
> > >>>>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
> > >>>>>>> +{
> > >>>>>>> +	struct vsp1_dl_body *dlb;
> > >>>>>>> +	struct vsp1_dl_list *dl;
> > >>>>>>> +	unsigned int i;
> > >>>>>>> +
> > >>>>>>> +	spin_lock(&dlm->lock);
> > >>>>>>> +
> > >>>>>>> +	dl = dlm->active;
> > >>>>>>> +	if (!dl)
> > >>>>>>> +		goto done;
> > >>>>>>> +
> > >>>>>>> +	list_for_each_entry(dlb, &dl->bodies, list) {
> > >>>>>>> +		for (i = 0; i < dlb->num_patches; ++i)
> > >>>>>>> +			dlb->patches[i].entry->data = dlb->patches[i].data;
> > >>>>>>> +		dlb->num_patches = 0;
> > >>>>>>> +	}
> > >>>>>>> +
> > >>>>>>> +done:
> > >>>>>>> +	spin_unlock(&dlm->lock);
> > >>>>>>> +}
> > >>>>>>> +
> > >>>>>> 
> > >>>>>> We've got some HW which doesn't support one-shot writeback, and use a
> > >>>>>> similar trick to try and disable writeback immediately after the flip.
> > >>>>>> 
> > >>>>>> We ran into issues where the "start" interrupt wouldn't run in time to
> > >>>>>> make sure the writeback disable was committed before the next frame.
> > >>>>>> We have to keep track of whether the disable really happened in time,
> > >>>>>> before we release the output buffer.
> > >>>>>> 
> > >>>>>> Might you have a similar problem here?
> > >>>>> 
> > >>>>> We may, but there's no provision at the hardware level to check if the
> > >>>>> configuration updated happened in time. I could add some safety checks
> > >>>>> but I believe they would be racy in the best case :-(
> > >>>> 
> > >>>> We managed to find (what I believe to be...) a non-racy way, but it
> > >>>> will of course depend a lot on the HW behaviour, so I'll leave it to
> > >>>> your best judgement.
> > >>>> 
> > >>>> We basically have a "configuration committed" interrupt which we can
> > >>>> use to set a flag indicating writeback was disabled.
> > >>> 
> > >>> The way my hardware is operated is, roughly, as follows:
> > >>> 
> > >>> - The driver prepares a list of values to write to registers and stores
> > >>>   them in DMA-able memory. This doesn't involve the hardware, and so is
> > >>>   completely asynchronous to hardware operation.
> > >>> 
> > >>> - The driver then sets a register with the pointer to the registers
> > >>>   list.
> > >>> 
> > >>> - At the end of the current frame, the hardware reads the memory address
> > >>>   of the registers list and DMAs values to register. This operation is
> > >>>   fast and occurs fully during vertical blanking.
> > >>> 
> > >>> - The hardware then waits for the start of the frame, and begins
> > >>>   processing the framebuffers to send the frame to the display.
> > >>> 
> > >>> - If no new registers list is provided for the next frame, the current
> > >>>   list is reused.
> > >> 
> > >> Does this mean the hardware goes back to step 3 and re-reads the memory
> > >> using DMA?
> > > 
> > > That's correct. At frame end the hardware will always read and apply a
> > > registers list, either a new one or the old one if no new list is
> > > programmed.
> > > 
> > >>> Two interrupts are provided, one at the start of the frame and one at
> > >>> the end of the frame. The driver uses the end of frame interrupt to
> > >>> signal vertical blanking and page flip completion to DRM.
> > >>> 
> > >>> The end of frame interrupt is also used to schedule atomic commits. The
> > >>> hardware queue depth for atomic commits is just one, so the driver has
> > >>> to wait until the previous commit has been processed before writing the
> > >>> new registers list address to the hardware. To solve the race between
> > >>> the end of frame interrupt and the address write, the hardware provides
> > >>> a status bit that is set to 1 when the address is written, and reset to
> > >>> 0 when the hardware starts processing the registers list.
> > >> 
> > >> For my understanding, the following simplication is then true: status = 0
> > >> means driver is allowed to update the pointer register, status = 1 means
> > >> currently programmed pointer is used to fetch the register values, right?
> > > 
> > > Almost. status = 1 means that the pointer has been programmed by the
> > > CPU, but not taken into account by the hardware yet. status = 0 means
> > > that the pointer hasn't been written to by the CPU since the last time
> > > the hardware read it.
> > 
> > Right, so CPU writes to the pointer register and that sets status to 1,
> > and when the HW has copied that value into internal DMA registers it clears
> > it, correct?
> 
> Yes, that's correct.
> 
> > >> What happens if you update the pointer while status = 1? DMA transfer gets
> > >> interrupted and a new one is scheduled? Does the DMA transfer get cancelled?
> > > 
> > > You should not do that, as it's racy. The hardware will use either the
> > > old value of the pointer or new new one, based on the timings of the
> > > reprogrammation, and you can't tell which one is used. Nothing can
> > > cancel the DMA, when it's time to start the DMA the hardware will
> > > transfer the value of the pointer to an internal register and use that
> > > for DMA. Reprogramming the pointer during the DMA will not affect the
> > > DMA, the new value will be taken into account for the next DMA.
> > 
> > It looks to me like you can do the in-place update of the writeback
> > framebuffer address at the end of the current frame (vblank time(?)). If
> > you wait until status = 0 then you know the next frame parameters are
> > being "internalised", so you can set in the commit you're about to queue
> > the disabling of the writeback if that commit doesn't have a fresh
> > writeback request (phew, that's a mouthful).
> 
> My problem is that there may not be a next commit, if userspace hasn't
> queued one. Otherwise this is what I would do. My initial implementation
> was queuing two commits for writeback, one with writeback enabled, and
> immediately after a second one copied from the first but with writeback
> disabled. This halved the frame rate, as the next commit from userspace
> had to first wait for completion of the writeback disabling commit. The
> real issue here is that I would need to queue the writeback disabling
> commit right after the writeback enabling commit to make sure it gets
> processed for the next frame, but once its queued any userspace commit
> would need to wait one extra frame as a queued commit can be processed
> at any time and thus can't be replaced. That's why I decided to modify
> the registers list in place instead of queueing a new commit to disable
> writeback.
> 
> > I don't think you need to wait until the next start of frame interrupt
> > to do that. Also, vblank time is probably the time you want to signal
> > the completion of the previous writeback anyway, right?
> 
> Correct, that's when I signal it.
> 
> > >>> We thus have three states for an atomic commit:
> > >>> 
> > >>> - active, where the corresponding registers list address has been
> > >>>   written to the hardware, and processed
> > >>> 
> > >>> - queued, where the corresponding registers list address has been
> > >>>   written to the hardware but not processed yet
> > >>> 
> > >>> - pending, where the corresponding registers list address hasn't been
> > >>>   written to the hardware yet
> > >>> 
> > >>> The status bit mentioned above allows us to tell if a list exists in the
> > >>> queued state.
> > >>> 
> > >>> At frame end time, if the status bit is set, we have potentially lost
> > >>> the race between writing the new registers list and the frame end
> > >>> interrupt, so we wait for one more vblank. Otherwise, if a list was
> > >>> queued, we move it to the active state, and retire the active list. If a
> > >>> list was pending, we write its address to the hardware, and move it to
> > >>> the queued state.
> > 
> > It looks to me like the moving of the pending state into queued state is your
> > opportunity to also in-place modify the writeback registers if the state does
> > not have its own writeback request.
> 
> Moving from pending to queued means the pointer has been given to the
> hardware, but not processed yet. I need to wait until the commit that
> enables writeback is fully processed before modifying it in-place to
> disable writeback, and that's at the frame start following the move from
> the queued state to the active state.

I'm not attempting to (re)write your driver, only to explain my thinking process in
a way that is easiest for me:

1. driver prepares a new commit that might have a writeback and sets the
pointer register to the new address. It then marks the commit as queued.

(optional) 2. driver receives a new commit that is marked as pending

3. end-of-frame interrupt arrives
     a. HW reads the new address and programs a DMA transfer to update registers.
     b. driver reads the status bit and waits until status == 0.
     c. driver marks queued commit as active and looks if it has any pending commits
          - if yes, it looks at the pending commit if it has a writeback request
	        - if no, it updates the pending commit to write the register(s) that disable writeback
		- moves pending commit to queued
	  - if no, then it copies active commit and disables writeback. (*)
     d. driver signals vblank and any previous writebacks that might have been programmed
        by the previously active commit

4. ....
5. profit!


(*) depending on how the HW behaves, it might be enough to create a stub
    commit that only disables the writeback, with no other update.

This way writeback commits will always be followed by another commit, even if it is a dummy one
that only disables writeback.

Best regards,
Liviu

> 
> > >>> To schedule writeback I ended up using the frame start interrupt.
> > >>> Writeback has a specific need in that it requires enabling the memory
> > >>> write interface for a single frame, and that is not something the
> > >>> hardware supports directly. I can't either queue a new registers list
> > >>> with the write interface disabled right after the list with the
> > >>> writeback request became active, as any atomic commit would then be
> > >>> delayed by an extra frame. I thus ended up modifying the registers list
> > >>> in place at frame start time, which is right after the active register
> > >>> list has been DMA'ed to hardware registers. We have the duration of a
> > >>> whole frame to do so, before the hardware transfers the same list again
> > >>> at the beginning of the next frame.
> > >> 
> > >> Yes, that is like to what we do for DP500 in mali-dp, where we have a
> > >> similar situation. HW will continue to stream to the given buffer until
> > >> stopped, but the writeback API is one-shot mode, so on vblank (when we
> > >> know the writeback will start at the beginning of the new frame) we
> > >> program the memwrite to stop, but that only takes effect when we also
> > >> program the GO bit (CONFIG_VALID in our case).
> > >> 
> > >> One problem we had to take care for DP500 was when the next commit (the
> > >> queued one in your case) also contains a writeback buffer. In that case,
> > >> we don't stop the writeback but "re-start" it.
> > >> 
> > >>> There is a race condition there, as the in-place modification of the
> > >>> active list could be done after the end of the frame if the interrupt
> > >>> latency gets too large. I don't see how I could solve that, as I could
> > >>> write the value after the hardware starts processing the active list at
> > >>> frame end time, but before the interrupt is notified.
> > >> 
> > >> I think that would be a better idea if the HW finishes to transfer
> > >> before the end of frame interrupt is signalled. Otherwise if your
> > >> interrupt handler is scheduled too fast, you might overwrite the active
> > >> frame values?
> > > 
> > > That can't happen, as I patch the active registers list in-place in the
> > > frame start interrupt handler, and frame start occurs after the hardware
> > > finishes the DMA of the registers list.
> > > 
> > >>>>> Note that we have the duration of a complete frame to disable writeback,
> > >>>>> as we receive an interrupt when the frame starts, and have until vblank
> > >>>>> to update the configuration. It's thus slightly better than having to
> > >>>>> disable writeback between vblank and the start of the next frame.
> > >>>> 
> > >>>> Yeah... we have a whole frame too. I'm struggling to find our wiki
> > >>>> page with the data, but anecdotally there's some (out-of-tree) drivers
> > >>>> which keep interrupts masked for a _really long_ time. It's nice if
> > >>>> you don't have to care about those :-)
> > >>> 
> > >>> I only provide two options: not my problem, or give me the source code
> > >>> and documentation and let me upstream a clean version of the out-of-tree
> > >>> drivers :-)
> > >>> 
> > >>>>>>>  /**
> > >>>>>>>   * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
> > >>>>>>>   * @dlm: the display list manager
> > >>>>>>> diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>>>> index e0fdb145e6ed..f845607abc4c 100644
> > >>>>>>> --- a/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>>>> +++ b/drivers/media/platform/vsp1/vsp1_dl.h
> > >>>>>>> @@ -54,6 +54,7 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
> > >>>>>>>  					unsigned int prealloc);
> > >>>>>>>  void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
> > >>>>>>>  void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
> > >>>>>>> +void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
> > >>>>>>>  unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
> > >>>>>>>  struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
> > >>>>>>>  
> > >>>>>>> @@ -71,6 +72,8 @@ struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
> > >>>>>>>  void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
> > >>>>>>>  
> > >>>>>>>  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
> > >>>>>>> +void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
> > >>>>>>> +				u32 reset_value);
> > >>>>>>>  int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
> > >>>>>>>  int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);
> > >>>>>>>  
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Laurent Pinchart March 8, 2019, 12:24 p.m. UTC | #14
Hi Brian,

On Thu, Mar 07, 2019 at 12:28:08PM +0000, Brian Starkey wrote:
> On Wed, Mar 06, 2019 at 08:22:44PM +0200, Laurent Pinchart wrote:
> 
> [snip]
> 
> > I can always queue a new one, but I have no way of telling if the newly
> > queued list raced with the frame end interrupt.
> > 
> > There's another register I'm not using that contains a shadow copy of
> > the registers list DMA address. It mirrors the address programmed by the
> > driver when there is no DMA of the registers list in progress, and
> > contains the address the of registers list being DMA'ed when a DMA is in
> > progress. I don't think I can do much with it, as reading it either
> > before or after reprogramming (to check if I can reprogram or if the
> > reprogram has been taken into account) is racy.
> 
> When you say it mirrors the address programmed, is that latched when
> the update is accepted, or it will update the moment you program the
> address register?

It is latched when the update is processed, at the next vblank following
the address programming. The timing diagram below shows vblank, the UPD
bit I use for synchronization, the registers list address register
exposed to the CPU, and the internal address used to DMA the registers
list. The address register is written four times to show how the
hardware reacts, but in practice there will be a single write per frame,
right after the beginning of vblank.

                                      DMA starts
                                      |     DMA ends
                                      |     |
                                      V     V
                                      ___________
VBLANK         ______________________|           |________
                     ________________     ________________
UPD            _____|                |___|
               _____ ______ _____________ ________ _______
ADDR register  __A__X__B___X______C______X___D____X___E___
               ______________________ ____________________
ADDR internal  ___________A__________X__________C_________

I can reprogram the address any number of times I want before the
vblank, but the update bit mechanism only lets me protect the race
related to the first write. For any subsequent write I won't be able to
tell whether it was complete before the hardware started the frame, or
arrived too late.

> I assume the latter or you would have thought of this yourself (that
> seems like a really strange/not-useful behaviour!). But if it is the
> former you could:
> 
>  - In writeback start-of-frame, create a copy of the register list,
>    disabling writeback.
>  - Write the address of this copy to the register
> 
> If/when an atomic commit comes in before you service the next
> end-of-frame:
> 
>  - Write the address of the new register list
>  - Check the status register. If the "pending" bit is zero, you know
>    you had a potential race.
>     - Check the DMA address register. If it corresponds to the new
>       scene, the new commit won the race, otherwise it's been delayed
>       by a frame.

The issue here is that there's a potential race if the pending (UPD) bit
is one too. If the commit arrives just before vblank, but the address is
written just after vblank, after the UPD bit has been reset to 0 by the
hardware, but before the vblank interrupt is processed, then the commit
won't be applied yet, and I will have no way to know. Compare the two
following scenarios:

		   [1]       [2] [3]           [4]       [5]
                    |         |   |             |         |
                    |         V   V             |         V
                    V          __________       V          __________
VBLANK         _______________|          |________________|          |__
                     _________     _______________________
UPD            _____|         |___|                       |_____________
               _____ _____________ _____________ _______________________
ADDR register  __A__X______B______X______C______X___________D___________
               ___________________________________________ _____________
ADDR internal  _______A_______X______B____________________X______D______

[1] Atomic commit, registers list address write, with writeback enabled
[2] DMA starts for first atomic commit
[3] Writeback disable registers list address write
[4] Next atomic commit, with writeback disabled
[5] DMA starts for second atomic commit

		   [1]       [2] [3]                     [4][5]
                    |         |   |                       |  |
                    |         V   V                       V  V
                    V          __________                  __________
VBLANK         _______________|          |________________|          |__
                     _________     _______________________    __________
UPD            _____|         |___|                       |__|
               _____ _____________ __________________________ __________
ADDR register  __A__X______B______X_____________C____________X_____D____
               ___________________________________________ _____________
ADDR internal  _______A_______X______B____________________X______C______

[1] Atomic commit, registers list address write, with writeback enabled
[2] DMA starts for first atomic commit
[3] Writeback disable registers list address write
[4] DMA starts for writeback disable registers list (3)
[5] Next atomic commit, with writeback disabled, performed right after
    vblank but befrore the vblank interrupt is serviced

The UPD bit is 1 after writing the ADDR register the second time in both
cases. Furthermore, if [4] and [5] are very close in the second case,
the UPD bit may read 1 just before [5] if the read comes before [4]:

	read UPD bit;
	/* VBLANK [4] */
	write ADDR register;

I thus can't rely on UPD = 1 before the write meaning that the write was
performed before vblank, and I can't rely either on the UPD bit after
write, as it's 1 in both cases.

I initially thought I could protect against the race using the following
procedure.

- Single atomic commit, no IRQ delay

		   [1]       [2]        [3]              [4]
                    |         |          |                |
                    |         V          V                V
                    V          __________                  __________
VBLANK         _______________|          |________________|          |__
                     _________    
UPD            _____|         |_________________________________________
               _____ ___________________________________________________
ADDR register  __A__X___________________B_______________________________
               _______________ _________________________________________
ADDR internal  _______A_______X______B__________________________________

[1] Atomic commit, registers list address write, with writeback enabled
[2] DMA starts for first atomic commit
[3] Frame start, disable writeback in-place in registers list B
[4] DMA starts for "patched" registers list, disables writeback

- Two atomic commits, no IRQ delay

		   [1]       [2]  [3]   [4]              [5]
                    |         |    |     |                |
                    |         V    V     V                V
                    V          __________                  __________
VBLANK         _______________|          |________________|          |__
                     _________      ______________________
UPD            _____|         |____|                      |_____________
               _____ ______________ ____________________________________
ADDR register  __A__X______B_______X_________________C__________________
               _______________ ___________________________ _____________
ADDR internal  _______A_______X_____________B_____________X______C______

[1] Atomic commit, registers list address write, with writeback enabled
[2] DMA starts for first atomic commit
[3] Next atomic commit, registers list address write, with writeback
    disabled
[4] Frame start, disable writeback in-place in registers list B
[5] DMA starts for second atomic commit, disables writeback

[3] and [4] can happen in any order, as long as they both come before
[5]. If [3] comes after [5], we're back to the previous case (Single
atomic commit, no IRQ delay).

- Single atomic commit, IRQ delay

		   [1]       [2]        [3]              [4] [5]
                    |         |          |                |   |
                    |         V          V                V   V
                    V          __________                  __________
VBLANK         _______________|          |________________|          |__
                     _________    
UPD            _____|         |_________________________________________
               _____ ___________________________________________________
ADDR register  __A__X___________________B_______________________________
               _______________ _________________________________________
ADDR internal  _______A_______X______B__________________________________

[1] Atomic commit, registers list address write, with writeback enabled
[2] DMA starts for first atomic commit
[3] Frame start, IRQ is delayed to [5]
[4] DMA starts for unmodified registers list, writeback still enable
[5] disable writeback in-place in registers list B, too late

Here I need to detect that [5] was delayed after [4], and thus delay the
completion of the writeback job by one frame. This could be done by
checking the vblank interrupt status bit, if it is set then vblank
occurred and raced [5]. However, the same issue can also happen when no
race occurred if processing of the vblank interrupt for [2] is delayed
until [3]. Both the vblank interrupt and the frame start interrupt
status bits will be set, indicate a potential race.

The time between [2] and [3] is very short compared to the time between
[3] and [4] and to interrupt latency in general, so we would have lots
of false positives.

> >> You don't happen to have a DMA engine trigger or something you could
> >> use to do the register list modification at a guaranteed time do you?
> > 
> > Not that I know of, unfortunately.
> > 
> >> Are you always going to be protected by an IOMMU, preventing the
> >> writeback from trashing physical memory? If that's not the case, then
> >> the race can have pretty dire consequences.
> > 
> > If the IOMMU is enabled in DT, yes. It's a system-level decision.
> 
> Well, it's your driver at the end of the day. But for me, a known
> race-condition which would cause random memory corruption sounds like
> a really Bad Thing. Halving frame-rate on systems with no IOMMU seems
> preferable to me.

It is a really bad thing. I think the decision should be taken by
Renesas though.
Laurent Pinchart March 8, 2019, 12:46 p.m. UTC | #15
Hi Liviu,

On Thu, Mar 07, 2019 at 04:31:40PM +0000, Liviu Dudau wrote:
> On Thu, Mar 07, 2019 at 03:48:23PM +0200, Laurent Pinchart wrote:
> > On Thu, Mar 07, 2019 at 11:52:18AM +0000, Liviu Dudau wrote:
> >> On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> >>> On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> >>>> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:

[snip]

> >>>>> We thus have three states for an atomic commit:
> >>>>> 
> >>>>> - active, where the corresponding registers list address has been
> >>>>>   written to the hardware, and processed
> >>>>> 
> >>>>> - queued, where the corresponding registers list address has been
> >>>>>   written to the hardware but not processed yet
> >>>>> 
> >>>>> - pending, where the corresponding registers list address hasn't been
> >>>>>   written to the hardware yet
> >>>>> 
> >>>>> The status bit mentioned above allows us to tell if a list exists in the
> >>>>> queued state.
> >>>>> 
> >>>>> At frame end time, if the status bit is set, we have potentially lost
> >>>>> the race between writing the new registers list and the frame end
> >>>>> interrupt, so we wait for one more vblank. Otherwise, if a list was
> >>>>> queued, we move it to the active state, and retire the active list. If a
> >>>>> list was pending, we write its address to the hardware, and move it to
> >>>>> the queued state.
> >> 
> >> It looks to me like the moving of the pending state into queued state is your
> >> opportunity to also in-place modify the writeback registers if the state does
> >> not have its own writeback request.
> > 
> > Moving from pending to queued means the pointer has been given to the
> > hardware, but not processed yet. I need to wait until the commit that
> > enables writeback is fully processed before modifying it in-place to
> > disable writeback, and that's at the frame start following the move from
> > the queued state to the active state.
> 
> I'm not attempting to (re)write your driver, only to explain my thinking process in
> a way that is easiest for me:

I wouldn't mind if you attempted to rewrite the driver if it ended up in
a better state :-)

> 1. driver prepares a new commit that might have a writeback and sets the
> pointer register to the new address. It then marks the commit as queued.
> 
> (optional) 2. driver receives a new commit that is marked as pending
> 
> 3. end-of-frame interrupt arrives
>      a. HW reads the new address and programs a DMA transfer to update registers.
>      b. driver reads the status bit and waits until status == 0.

Step b is not needed as the status bit is set to 0 as soon as the
hardware starts the DMA.

>      c. driver marks queued commit as active and looks if it has any pending commits
>           - if yes, it looks at the pending commit if it has a writeback request
> 	        - if no, it updates the pending commit to write the register(s) that disable writeback
> 		- moves pending commit to queued
> 	  - if no, then it copies active commit and disables writeback. (*)

Pending commits are very rare. Userspace will usually queue the next
commit when it receives notification of completion of the previous
commit, so the next commit will arrive after vblank, with a single
commit per frame interval. At that point there will be an active commit,
and no queued commit, and the new commit will become the queued commit.
The "no" case will thus be hit in the vast majority of cases, preventing
the next userspace commit to be queued for the same frame, delaying it
by one frame, and thus halving the frame rate :-(

>      d. driver signals vblank and any previous writebacks that might have been programmed
>         by the previously active commit
> 
> 4. ....
> 5. profit!
> 
> 
> (*) depending on how the HW behaves, it might be enough to create a stub
>     commit that only disables the writeback, with no other update.

A stub is enough, there's no need to reprogram everything.

> This way writeback commits will always be followed by another commit,
> even if it is a dummy one that only disables writeback.
Liviu Dudau March 8, 2019, 3:02 p.m. UTC | #16
On Fri, Mar 08, 2019 at 02:46:08PM +0200, Laurent Pinchart wrote:
> Hi Liviu,
> 
> On Thu, Mar 07, 2019 at 04:31:40PM +0000, Liviu Dudau wrote:
> > On Thu, Mar 07, 2019 at 03:48:23PM +0200, Laurent Pinchart wrote:
> > > On Thu, Mar 07, 2019 at 11:52:18AM +0000, Liviu Dudau wrote:
> > >> On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> > >>> On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> > >>>> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> 
> [snip]
> 
> > >>>>> We thus have three states for an atomic commit:
> > >>>>> 
> > >>>>> - active, where the corresponding registers list address has been
> > >>>>>   written to the hardware, and processed
> > >>>>> 
> > >>>>> - queued, where the corresponding registers list address has been
> > >>>>>   written to the hardware but not processed yet
> > >>>>> 
> > >>>>> - pending, where the corresponding registers list address hasn't been
> > >>>>>   written to the hardware yet
> > >>>>> 
> > >>>>> The status bit mentioned above allows us to tell if a list exists in the
> > >>>>> queued state.
> > >>>>> 
> > >>>>> At frame end time, if the status bit is set, we have potentially lost
> > >>>>> the race between writing the new registers list and the frame end
> > >>>>> interrupt, so we wait for one more vblank. Otherwise, if a list was
> > >>>>> queued, we move it to the active state, and retire the active list. If a
> > >>>>> list was pending, we write its address to the hardware, and move it to
> > >>>>> the queued state.
> > >> 
> > >> It looks to me like the moving of the pending state into queued state is your
> > >> opportunity to also in-place modify the writeback registers if the state does
> > >> not have its own writeback request.
> > > 
> > > Moving from pending to queued means the pointer has been given to the
> > > hardware, but not processed yet. I need to wait until the commit that
> > > enables writeback is fully processed before modifying it in-place to
> > > disable writeback, and that's at the frame start following the move from
> > > the queued state to the active state.
> > 
> > I'm not attempting to (re)write your driver, only to explain my thinking process in
> > a way that is easiest for me:
> 
> I wouldn't mind if you attempted to rewrite the driver if it ended up in
> a better state :-)
> 
> > 1. driver prepares a new commit that might have a writeback and sets the
> > pointer register to the new address. It then marks the commit as queued.
> > 
> > (optional) 2. driver receives a new commit that is marked as pending
> > 
> > 3. end-of-frame interrupt arrives
> >      a. HW reads the new address and programs a DMA transfer to update registers.
> >      b. driver reads the status bit and waits until status == 0.
> 
> Step b is not needed as the status bit is set to 0 as soon as the
> hardware starts the DMA.

OK, however because I'm not sure when the driver's interrupt handler is called,
I was trying to be safe and make sure the driver can proceed.

> 
> >      c. driver marks queued commit as active and looks if it has any pending commits
> >           - if yes, it looks at the pending commit if it has a writeback request
> > 	        - if no, it updates the pending commit to write the register(s) that disable writeback
> > 		- moves pending commit to queued
> > 	  - if no, then it copies active commit and disables writeback. (*)
> 
> Pending commits are very rare. Userspace will usually queue the next
> commit when it receives notification of completion of the previous
> commit, so the next commit will arrive after vblank, with a single
> commit per frame interval. At that point there will be an active commit,
> and no queued commit, and the new commit will become the queued commit.
> The "no" case will thus be hit in the vast majority of cases, preventing
> the next userspace commit to be queued for the same frame, delaying it
> by one frame, and thus halving the frame rate :-(

I'll guess you're referring to the (*) case of "no", but I'm not sure how the next
userspace commit will be delayed ..... wait a bit .... a light bulb turned on
somewhere in the darkness of my mind .... see below.

> 
> >      d. driver signals vblank and any previous writebacks that might have been programmed
> >         by the previously active commit
> > 
> > 4. ....
> > 5. profit!
> > 
> > 
> > (*) depending on how the HW behaves, it might be enough to create a stub
> >     commit that only disables the writeback, with no other update.
> 
> A stub is enough, there's no need to reprogram everything.

Right, in that case, can you not look at the queued commit when the next userspace
commit comes in and if it is the stub commit overwrite it with your new commit as
if you're at step c. ?

Best regards,
Liviu

> 
> > This way writeback commits will always be followed by another commit,
> > even if it is a dummy one that only disables writeback.
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Laurent Pinchart March 13, 2019, 12:56 a.m. UTC | #17
Hi Liviu,

On Fri, Mar 08, 2019 at 03:02:39PM +0000, Liviu Dudau wrote:
> On Fri, Mar 08, 2019 at 02:46:08PM +0200, Laurent Pinchart wrote:
> > On Thu, Mar 07, 2019 at 04:31:40PM +0000, Liviu Dudau wrote:
> >> On Thu, Mar 07, 2019 at 03:48:23PM +0200, Laurent Pinchart wrote:
> >>> On Thu, Mar 07, 2019 at 11:52:18AM +0000, Liviu Dudau wrote:
> >>>> On Wed, Mar 06, 2019 at 08:01:53PM +0200, Laurent Pinchart wrote:
> >>>>> On Wed, Mar 06, 2019 at 02:20:51PM +0000, Liviu Dudau wrote:
> >>>>>> On Wed, Mar 06, 2019 at 01:14:40AM +0200, Laurent Pinchart wrote:
> > 
> > [snip]
> > 
> >>>>>>> We thus have three states for an atomic commit:
> >>>>>>> 
> >>>>>>> - active, where the corresponding registers list address has been
> >>>>>>>   written to the hardware, and processed
> >>>>>>> 
> >>>>>>> - queued, where the corresponding registers list address has been
> >>>>>>>   written to the hardware but not processed yet
> >>>>>>> 
> >>>>>>> - pending, where the corresponding registers list address hasn't been
> >>>>>>>   written to the hardware yet
> >>>>>>> 
> >>>>>>> The status bit mentioned above allows us to tell if a list exists in the
> >>>>>>> queued state.
> >>>>>>> 
> >>>>>>> At frame end time, if the status bit is set, we have potentially lost
> >>>>>>> the race between writing the new registers list and the frame end
> >>>>>>> interrupt, so we wait for one more vblank. Otherwise, if a list was
> >>>>>>> queued, we move it to the active state, and retire the active list. If a
> >>>>>>> list was pending, we write its address to the hardware, and move it to
> >>>>>>> the queued state.
> >>>> 
> >>>> It looks to me like the moving of the pending state into queued state is your
> >>>> opportunity to also in-place modify the writeback registers if the state does
> >>>> not have its own writeback request.
> >>> 
> >>> Moving from pending to queued means the pointer has been given to the
> >>> hardware, but not processed yet. I need to wait until the commit that
> >>> enables writeback is fully processed before modifying it in-place to
> >>> disable writeback, and that's at the frame start following the move from
> >>> the queued state to the active state.
> >> 
> >> I'm not attempting to (re)write your driver, only to explain my thinking process in
> >> a way that is easiest for me:
> > 
> > I wouldn't mind if you attempted to rewrite the driver if it ended up in
> > a better state :-)
> > 
> >> 1. driver prepares a new commit that might have a writeback and sets the
> >> pointer register to the new address. It then marks the commit as queued.
> >> 
> >> (optional) 2. driver receives a new commit that is marked as pending
> >> 
> >> 3. end-of-frame interrupt arrives
> >>      a. HW reads the new address and programs a DMA transfer to update registers.
> >>      b. driver reads the status bit and waits until status == 0.
> > 
> > Step b is not needed as the status bit is set to 0 as soon as the
> > hardware starts the DMA.
> 
> OK, however because I'm not sure when the driver's interrupt handler
> is called, I was trying to be safe and make sure the driver can
> proceed.
> 
> >>      c. driver marks queued commit as active and looks if it has any pending commits
> >>           - if yes, it looks at the pending commit if it has a writeback request
> >> 	        - if no, it updates the pending commit to write the register(s) that disable writeback
> >> 		- moves pending commit to queued
> >> 	  - if no, then it copies active commit and disables writeback. (*)
> > 
> > Pending commits are very rare. Userspace will usually queue the next
> > commit when it receives notification of completion of the previous
> > commit, so the next commit will arrive after vblank, with a single
> > commit per frame interval. At that point there will be an active commit,
> > and no queued commit, and the new commit will become the queued commit.
> > The "no" case will thus be hit in the vast majority of cases, preventing
> > the next userspace commit to be queued for the same frame, delaying it
> > by one frame, and thus halving the frame rate :-(
> 
> I'll guess you're referring to the (*) case of "no", but I'm not sure how the next
> userspace commit will be delayed ..... wait a bit .... a light bulb turned on
> somewhere in the darkness of my mind .... see below.
> 
> >>      d. driver signals vblank and any previous writebacks that might have been programmed
> >>         by the previously active commit
> >> 
> >> 4. ....
> >> 5. profit!
> >> 
> >> 
> >> (*) depending on how the HW behaves, it might be enough to create a stub
> >>     commit that only disables the writeback, with no other update.
> > 
> > A stub is enough, there's no need to reprogram everything.
> 
> Right, in that case, can you not look at the queued commit when the next userspace
> commit comes in and if it is the stub commit overwrite it with your new commit as
> if you're at step c. ?

I can't safely overwrite a queued commit, that's my core issue. Once a
commit is queued to the hardware, it can start being processed at any
time, and that could race with overwriting the commit. The hardware
doesn't give me a way to know if the overwrite took place before the
queued commit was processed, or afterwards.

I have however found a way to fix my issue in what I believe is a safe
way, and I've posted a v6 of the series. In a nutshell, the hardware
allows chaining registers lists. I can thus create a registers lists
with writeback enabled, and a second one that disables the writeback.
The two lists are chained, and the first one is queued to the hardware.
When the hardware finishes processing it, it automatically moves to the
chained list for the next frame, without software intervention. As the
chained list is not queued separately it doesn't interfere with the
mechanism that allows me to test if a list is queued before queuing the
next commit, so I can queue the next commit and it will take over the
chained list if it hasn't been processed yet.

> >> This way writeback commits will always be followed by another commit,
> >> even if it is a dummy one that only disables writeback.
Brian Starkey March 18, 2019, 4:59 p.m. UTC | #18
Hi Laurent,

Sorry for the delay, I was travelling last week and didn't find a
chance to digest your diagrams (thanks for all the detail!)

On Fri, Mar 08, 2019 at 02:24:40PM +0200, Laurent Pinchart wrote:
> Hi Brian,
> 
> On Thu, Mar 07, 2019 at 12:28:08PM +0000, Brian Starkey wrote:
> > On Wed, Mar 06, 2019 at 08:22:44PM +0200, Laurent Pinchart wrote:
> > 
> > [snip]
> > 
> > > I can always queue a new one, but I have no way of telling if the newly
> > > queued list raced with the frame end interrupt.
> > > 
> > > There's another register I'm not using that contains a shadow copy of
> > > the registers list DMA address. It mirrors the address programmed by the
> > > driver when there is no DMA of the registers list in progress, and
> > > contains the address the of registers list being DMA'ed when a DMA is in
> > > progress. I don't think I can do much with it, as reading it either
> > > before or after reprogramming (to check if I can reprogram or if the
> > > reprogram has been taken into account) is racy.
> > 
> > When you say it mirrors the address programmed, is that latched when
> > the update is accepted, or it will update the moment you program the
> > address register?
> 
> It is latched when the update is processed, at the next vblank following
> the address programming. The timing diagram below shows vblank, the UPD
> bit I use for synchronization, the registers list address register
> exposed to the CPU, and the internal address used to DMA the registers
> list. The address register is written four times to show how the
> hardware reacts, but in practice there will be a single write per frame,
> right after the beginning of vblank.
> 
>                                       DMA starts
>                                       |     DMA ends
>                                       |     |
>                                       V     V
>                                       ___________
> VBLANK         ______________________|           |________
>                      ________________     ________________
> UPD            _____|                |___|
>                _____ ______ _____________ ________ _______
> ADDR register  __A__X__B___X______C______X___D____X___E___
>                ______________________ ____________________
> ADDR internal  ___________A__________X__________C_________
> 
> I can reprogram the address any number of times I want before the
> vblank, but the update bit mechanism only lets me protect the race
> related to the first write. For any subsequent write I won't be able to
> tell whether it was complete before the hardware started the frame, or
> arrived too late.
> 
> > I assume the latter or you would have thought of this yourself (that
> > seems like a really strange/not-useful behaviour!). But if it is the
> > former you could:
> > 
> >  - In writeback start-of-frame, create a copy of the register list,
> >    disabling writeback.
> >  - Write the address of this copy to the register
> > 
> > If/when an atomic commit comes in before you service the next
> > end-of-frame:
> > 
> >  - Write the address of the new register list
> >  - Check the status register. If the "pending" bit is zero, you know
> >    you had a potential race.
> >     - Check the DMA address register. If it corresponds to the new
> >       scene, the new commit won the race, otherwise it's been delayed
> >       by a frame.
> 
> The issue here is that there's a potential race if the pending (UPD) bit
> is one too. If the commit arrives just before vblank, but the address is
> written just after vblank, after the UPD bit has been reset to 0 by the
> hardware, but before the vblank interrupt is processed, then the commit
> won't be applied yet, and I will have no way to know. Compare the two
> following scenarios:
> 
> 		   [1]       [2] [3]           [4]       [5]
>                     |         |   |             |         |
>                     |         V   V             |         V
>                     V          __________       V          __________
> VBLANK         _______________|          |________________|          |__
>                      _________     _______________________
> UPD            _____|         |___|                       |_____________
>                _____ _____________ _____________ _______________________
> ADDR register  __A__X______B______X______C______X___________D___________
>                ___________________________________________ _____________
> ADDR internal  _______A_______X______B____________________X______D______
> 
> [1] Atomic commit, registers list address write, with writeback enabled
> [2] DMA starts for first atomic commit
> [3] Writeback disable registers list address write
> [4] Next atomic commit, with writeback disabled
> [5] DMA starts for second atomic commit
> 
> 		   [1]       [2] [3]                     [4][5]
>                     |         |   |                       |  |
>                     |         V   V                       V  V
>                     V          __________                  __________
> VBLANK         _______________|          |________________|          |__
>                      _________     _______________________    __________
> UPD            _____|         |___|                       |__|
>                _____ _____________ __________________________ __________
> ADDR register  __A__X______B______X_____________C____________X_____D____
>                ___________________________________________ _____________
> ADDR internal  _______A_______X______B____________________X______C______
> 
> [1] Atomic commit, registers list address write, with writeback enabled
> [2] DMA starts for first atomic commit
> [3] Writeback disable registers list address write
> [4] DMA starts for writeback disable registers list (3)
> [5] Next atomic commit, with writeback disabled, performed right after
>     vblank but befrore the vblank interrupt is serviced
> 
> The UPD bit is 1 after writing the ADDR register the second time in both
> cases. Furthermore, if [4] and [5] are very close in the second case,
> the UPD bit may read 1 just before [5] if the read comes before [4]:
> 
> 	read UPD bit;
> 	/* VBLANK [4] */
> 	write ADDR register;
> 
> I thus can't rely on UPD = 1 before the write meaning that the write was
> performed before vblank, and I can't rely either on the UPD bit after
> write, as it's 1 in both cases.

My mistake, I got the UPD bit the wrong way around. I'm still not
entirely sure why you can't use "ADDR internal" to determine which
side won the race. It shows 'B' in the first case, and 'C' in the
second.

When a new commit comes, unconditionally:
 - Write new address
 - Read status

 if status.UPD == 0 --> You know for sure your new commit was just
                        latched.
 if status.UPD == 1 --> You need to check ADDR internal to see which
			of these three happened:

    1) Your first case happened. We're somewhere in the middle of the
       frame. ADDR internal will show 'B', and you know commit 'D' is
       going on-screen at the next vblank.

    2) Your second case happened. The new commit raced with the
       latching of writeback-disable and "lost". ADDR internal will
       show 'C', and the new commit is delayed by a frame

    3) (Teeny tiny small window) In-between reading status and ADDR
       internal, the new commit was latched. ADDR internal will show
       'D'. You know the new commit "won" so treat it the same as if
       UPD == 0 (which it will be, now).

Anyway, it's all moot now that you've found the chained lists thing -
that sounds ideal. I'll take a look at the new series shortly.

Thanks,
-Brian

> 
> I initially thought I could protect against the race using the following
> procedure.
> 
> - Single atomic commit, no IRQ delay
> 
> 		   [1]       [2]        [3]              [4]
>                     |         |          |                |
>                     |         V          V                V
>                     V          __________                  __________
> VBLANK         _______________|          |________________|          |__
>                      _________    
> UPD            _____|         |_________________________________________
>                _____ ___________________________________________________
> ADDR register  __A__X___________________B_______________________________
>                _______________ _________________________________________
> ADDR internal  _______A_______X______B__________________________________
> 
> [1] Atomic commit, registers list address write, with writeback enabled
> [2] DMA starts for first atomic commit
> [3] Frame start, disable writeback in-place in registers list B
> [4] DMA starts for "patched" registers list, disables writeback
> 
> - Two atomic commits, no IRQ delay
> 
> 		   [1]       [2]  [3]   [4]              [5]
>                     |         |    |     |                |
>                     |         V    V     V                V
>                     V          __________                  __________
> VBLANK         _______________|          |________________|          |__
>                      _________      ______________________
> UPD            _____|         |____|                      |_____________
>                _____ ______________ ____________________________________
> ADDR register  __A__X______B_______X_________________C__________________
>                _______________ ___________________________ _____________
> ADDR internal  _______A_______X_____________B_____________X______C______
> 
> [1] Atomic commit, registers list address write, with writeback enabled
> [2] DMA starts for first atomic commit
> [3] Next atomic commit, registers list address write, with writeback
>     disabled
> [4] Frame start, disable writeback in-place in registers list B
> [5] DMA starts for second atomic commit, disables writeback
> 
> [3] and [4] can happen in any order, as long as they both come before
> [5]. If [3] comes after [5], we're back to the previous case (Single
> atomic commit, no IRQ delay).
> 
> - Single atomic commit, IRQ delay
> 
> 		   [1]       [2]        [3]              [4] [5]
>                     |         |          |                |   |
>                     |         V          V                V   V
>                     V          __________                  __________
> VBLANK         _______________|          |________________|          |__
>                      _________    
> UPD            _____|         |_________________________________________
>                _____ ___________________________________________________
> ADDR register  __A__X___________________B_______________________________
>                _______________ _________________________________________
> ADDR internal  _______A_______X______B__________________________________
> 
> [1] Atomic commit, registers list address write, with writeback enabled
> [2] DMA starts for first atomic commit
> [3] Frame start, IRQ is delayed to [5]
> [4] DMA starts for unmodified registers list, writeback still enable
> [5] disable writeback in-place in registers list B, too late
> 
> Here I need to detect that [5] was delayed after [4], and thus delay the
> completion of the writeback job by one frame. This could be done by
> checking the vblank interrupt status bit, if it is set then vblank
> occurred and raced [5]. However, the same issue can also happen when no
> race occurred if processing of the vblank interrupt for [2] is delayed
> until [3]. Both the vblank interrupt and the frame start interrupt
> status bits will be set, indicate a potential race.
> 
> The time between [2] and [3] is very short compared to the time between
> [3] and [4] and to interrupt latency in general, so we would have lots
> of false positives.
> 
> > >> You don't happen to have a DMA engine trigger or something you could
> > >> use to do the register list modification at a guaranteed time do you?
> > > 
> > > Not that I know of, unfortunately.
> > > 
> > >> Are you always going to be protected by an IOMMU, preventing the
> > >> writeback from trashing physical memory? If that's not the case, then
> > >> the race can have pretty dire consequences.
> > > 
> > > If the IOMMU is enabled in DT, yes. It's a system-level decision.
> > 
> > Well, it's your driver at the end of the day. But for me, a known
> > race-condition which would cause random memory corruption sounds like
> > a really Bad Thing. Halving frame-rate on systems with no IOMMU seems
> > preferable to me.
> 
> It is a really bad thing. I think the decision should be taken by
> Renesas though.
> 
> -- 
> Regards,
> 
> Laurent Pinchart
Laurent Pinchart March 19, 2019, 10 a.m. UTC | #19
Hi Brian,

On Mon, Mar 18, 2019 at 04:59:21PM +0000, Brian Starkey wrote:
> Hi Laurent,
> 
> Sorry for the delay, I was travelling last week and didn't find a
> chance to digest your diagrams (thanks for all the detail!)

No worries, and thank you for the time you're spending on this. In the
meantime I've found a solution that solves the race, using an entirely
different mechanism. It's all explained in v6 of the patch series.

> On Fri, Mar 08, 2019 at 02:24:40PM +0200, Laurent Pinchart wrote:
> > On Thu, Mar 07, 2019 at 12:28:08PM +0000, Brian Starkey wrote:
> > > On Wed, Mar 06, 2019 at 08:22:44PM +0200, Laurent Pinchart wrote:
> > > 
> > > [snip]
> > > 
> > > > I can always queue a new one, but I have no way of telling if the newly
> > > > queued list raced with the frame end interrupt.
> > > > 
> > > > There's another register I'm not using that contains a shadow copy of
> > > > the registers list DMA address. It mirrors the address programmed by the
> > > > driver when there is no DMA of the registers list in progress, and
> > > > contains the address the of registers list being DMA'ed when a DMA is in
> > > > progress. I don't think I can do much with it, as reading it either
> > > > before or after reprogramming (to check if I can reprogram or if the
> > > > reprogram has been taken into account) is racy.
> > > 
> > > When you say it mirrors the address programmed, is that latched when
> > > the update is accepted, or it will update the moment you program the
> > > address register?
> > 
> > It is latched when the update is processed, at the next vblank following
> > the address programming. The timing diagram below shows vblank, the UPD
> > bit I use for synchronization, the registers list address register
> > exposed to the CPU, and the internal address used to DMA the registers
> > list. The address register is written four times to show how the
> > hardware reacts, but in practice there will be a single write per frame,
> > right after the beginning of vblank.
> > 
> >                                       DMA starts
> >                                       |     DMA ends
> >                                       |     |
> >                                       V     V
> >                                       ___________
> > VBLANK         ______________________|           |________
> >                      ________________     ________________
> > UPD            _____|                |___|
> >                _____ ______ _____________ ________ _______
> > ADDR register  __A__X__B___X______C______X___D____X___E___
> >                ______________________ ____________________
> > ADDR internal  ___________A__________X__________C_________
> > 
> > I can reprogram the address any number of times I want before the
> > vblank, but the update bit mechanism only lets me protect the race
> > related to the first write. For any subsequent write I won't be able to
> > tell whether it was complete before the hardware started the frame, or
> > arrived too late.
> > 
> > > I assume the latter or you would have thought of this yourself (that
> > > seems like a really strange/not-useful behaviour!). But if it is the
> > > former you could:
> > > 
> > >  - In writeback start-of-frame, create a copy of the register list,
> > >    disabling writeback.
> > >  - Write the address of this copy to the register
> > > 
> > > If/when an atomic commit comes in before you service the next
> > > end-of-frame:
> > > 
> > >  - Write the address of the new register list
> > >  - Check the status register. If the "pending" bit is zero, you know
> > >    you had a potential race.
> > >     - Check the DMA address register. If it corresponds to the new
> > >       scene, the new commit won the race, otherwise it's been delayed
> > >       by a frame.
> > 
> > The issue here is that there's a potential race if the pending (UPD) bit
> > is one too. If the commit arrives just before vblank, but the address is
> > written just after vblank, after the UPD bit has been reset to 0 by the
> > hardware, but before the vblank interrupt is processed, then the commit
> > won't be applied yet, and I will have no way to know. Compare the two
> > following scenarios:
> > 
> > 		   [1]       [2] [3]           [4]       [5]
> >                     |         |   |             |         |
> >                     |         V   V             |         V
> >                     V          __________       V          __________
> > VBLANK         _______________|          |________________|          |__
> >                      _________     _______________________
> > UPD            _____|         |___|                       |_____________
> >                _____ _____________ _____________ _______________________
> > ADDR register  __A__X______B______X______C______X___________D___________
> >                ___________________________________________ _____________
> > ADDR internal  _______A_______X______B____________________X______D______
> > 
> > [1] Atomic commit, registers list address write, with writeback enabled
> > [2] DMA starts for first atomic commit
> > [3] Writeback disable registers list address write
> > [4] Next atomic commit, with writeback disabled
> > [5] DMA starts for second atomic commit
> > 
> > 		   [1]       [2] [3]                     [4][5]
> >                     |         |   |                       |  |
> >                     |         V   V                       V  V
> >                     V          __________                  __________
> > VBLANK         _______________|          |________________|          |__
> >                      _________     _______________________    __________
> > UPD            _____|         |___|                       |__|
> >                _____ _____________ __________________________ __________
> > ADDR register  __A__X______B______X_____________C____________X_____D____
> >                ___________________________________________ _____________
> > ADDR internal  _______A_______X______B____________________X______C______
> > 
> > [1] Atomic commit, registers list address write, with writeback enabled
> > [2] DMA starts for first atomic commit
> > [3] Writeback disable registers list address write
> > [4] DMA starts for writeback disable registers list (3)
> > [5] Next atomic commit, with writeback disabled, performed right after
> >     vblank but befrore the vblank interrupt is serviced
> > 
> > The UPD bit is 1 after writing the ADDR register the second time in both
> > cases. Furthermore, if [4] and [5] are very close in the second case,
> > the UPD bit may read 1 just before [5] if the read comes before [4]:
> > 
> > 	read UPD bit;
> > 	/* VBLANK [4] */
> > 	write ADDR register;
> > 
> > I thus can't rely on UPD = 1 before the write meaning that the write was
> > performed before vblank, and I can't rely either on the UPD bit after
> > write, as it's 1 in both cases.
> 
> My mistake, I got the UPD bit the wrong way around. I'm still not
> entirely sure why you can't use "ADDR internal" to determine which
> side won the race. It shows 'B' in the first case, and 'C' in the
> second.

Because ADDR internal isn't available to the CPU :-(

> When a new commit comes, unconditionally:
>  - Write new address
>  - Read status
> 
>  if status.UPD == 0 --> You know for sure your new commit was just
>                         latched.
>  if status.UPD == 1 --> You need to check ADDR internal to see which
> 			of these three happened:
> 
>     1) Your first case happened. We're somewhere in the middle of the
>        frame. ADDR internal will show 'B', and you know commit 'D' is
>        going on-screen at the next vblank.
> 
>     2) Your second case happened. The new commit raced with the
>        latching of writeback-disable and "lost". ADDR internal will
>        show 'C', and the new commit is delayed by a frame
> 
>     3) (Teeny tiny small window) In-between reading status and ADDR
>        internal, the new commit was latched. ADDR internal will show
>        'D'. You know the new commit "won" so treat it the same as if
>        UPD == 0 (which it will be, now).
> 
> Anyway, it's all moot now that you've found the chained lists thing -
> that sounds ideal. I'll take a look at the new series shortly.

It's a neat hardware feature, yes. We were already using it for a
different purpose, I should have thought about it for writeback too from
the very beginning.

Please note I've sent a pull request for v7 as it has been fully
reviewed. Nonetheless, if you find issues, I can fix them on top.

> > I initially thought I could protect against the race using the following
> > procedure.
> > 
> > - Single atomic commit, no IRQ delay
> > 
> > 		   [1]       [2]        [3]              [4]
> >                     |         |          |                |
> >                     |         V          V                V
> >                     V          __________                  __________
> > VBLANK         _______________|          |________________|          |__
> >                      _________    
> > UPD            _____|         |_________________________________________
> >                _____ ___________________________________________________
> > ADDR register  __A__X___________________B_______________________________
> >                _______________ _________________________________________
> > ADDR internal  _______A_______X______B__________________________________
> > 
> > [1] Atomic commit, registers list address write, with writeback enabled
> > [2] DMA starts for first atomic commit
> > [3] Frame start, disable writeback in-place in registers list B
> > [4] DMA starts for "patched" registers list, disables writeback
> > 
> > - Two atomic commits, no IRQ delay
> > 
> > 		   [1]       [2]  [3]   [4]              [5]
> >                     |         |    |     |                |
> >                     |         V    V     V                V
> >                     V          __________                  __________
> > VBLANK         _______________|          |________________|          |__
> >                      _________      ______________________
> > UPD            _____|         |____|                      |_____________
> >                _____ ______________ ____________________________________
> > ADDR register  __A__X______B_______X_________________C__________________
> >                _______________ ___________________________ _____________
> > ADDR internal  _______A_______X_____________B_____________X______C______
> > 
> > [1] Atomic commit, registers list address write, with writeback enabled
> > [2] DMA starts for first atomic commit
> > [3] Next atomic commit, registers list address write, with writeback
> >     disabled
> > [4] Frame start, disable writeback in-place in registers list B
> > [5] DMA starts for second atomic commit, disables writeback
> > 
> > [3] and [4] can happen in any order, as long as they both come before
> > [5]. If [3] comes after [5], we're back to the previous case (Single
> > atomic commit, no IRQ delay).
> > 
> > - Single atomic commit, IRQ delay
> > 
> > 		   [1]       [2]        [3]              [4] [5]
> >                     |         |          |                |   |
> >                     |         V          V                V   V
> >                     V          __________                  __________
> > VBLANK         _______________|          |________________|          |__
> >                      _________    
> > UPD            _____|         |_________________________________________
> >                _____ ___________________________________________________
> > ADDR register  __A__X___________________B_______________________________
> >                _______________ _________________________________________
> > ADDR internal  _______A_______X______B__________________________________
> > 
> > [1] Atomic commit, registers list address write, with writeback enabled
> > [2] DMA starts for first atomic commit
> > [3] Frame start, IRQ is delayed to [5]
> > [4] DMA starts for unmodified registers list, writeback still enable
> > [5] disable writeback in-place in registers list B, too late
> > 
> > Here I need to detect that [5] was delayed after [4], and thus delay the
> > completion of the writeback job by one frame. This could be done by
> > checking the vblank interrupt status bit, if it is set then vblank
> > occurred and raced [5]. However, the same issue can also happen when no
> > race occurred if processing of the vblank interrupt for [2] is delayed
> > until [3]. Both the vblank interrupt and the frame start interrupt
> > status bits will be set, indicate a potential race.
> > 
> > The time between [2] and [3] is very short compared to the time between
> > [3] and [4] and to interrupt latency in general, so we would have lots
> > of false positives.
> > 
> > > >> You don't happen to have a DMA engine trigger or something you could
> > > >> use to do the register list modification at a guaranteed time do you?
> > > > 
> > > > Not that I know of, unfortunately.
> > > > 
> > > >> Are you always going to be protected by an IOMMU, preventing the
> > > >> writeback from trashing physical memory? If that's not the case, then
> > > >> the race can have pretty dire consequences.
> > > > 
> > > > If the IOMMU is enabled in DT, yes. It's a system-level decision.
> > > 
> > > Well, it's your driver at the end of the day. But for me, a known
> > > race-condition which would cause random memory corruption sounds like
> > > a really Bad Thing. Halving frame-rate on systems with no IOMMU seems
> > > preferable to me.
> > 
> > It is a really bad thing. I think the decision should be taken by
> > Renesas though.
diff mbox series

Patch

diff --git a/drivers/media/platform/vsp1/vsp1_dl.c b/drivers/media/platform/vsp1/vsp1_dl.c
index 886b3a69d329..7b4d252bfde7 100644
--- a/drivers/media/platform/vsp1/vsp1_dl.c
+++ b/drivers/media/platform/vsp1/vsp1_dl.c
@@ -115,6 +115,12 @@  struct vsp1_dl_body {
 
 	unsigned int num_entries;
 	unsigned int max_entries;
+
+	unsigned int num_patches;
+	struct {
+		struct vsp1_dl_entry *entry;
+		u32 data;
+	} patches[2];
 };
 
 /**
@@ -361,6 +367,7 @@  void vsp1_dl_body_put(struct vsp1_dl_body *dlb)
 		return;
 
 	dlb->num_entries = 0;
+	dlb->num_patches = 0;
 
 	spin_lock_irqsave(&dlb->pool->lock, flags);
 	list_add_tail(&dlb->free, &dlb->pool->free);
@@ -388,6 +395,47 @@  void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data)
 	dlb->num_entries++;
 }
 
+/**
+ * vsp1_dl_body_write_oneshot - Write a register to a display list body for a
+ *	single frame
+ * @dlb: The body
+ * @reg: The register address
+ * @value: The register value
+ * @reset_value: The value to reset the register to at the next vblank
+ *
+ * Display lists in continuous mode are re-used by the hardware for successive
+ * frames until a new display list is committed. Changing the VSP configuration
+ * normally requires creating and committing a new display list. This function
+ * offers an alternative race-free way by writing a @value to the @register in
+ * the display list body for a single frame, specifying in @reset_value the
+ * value to reset the register to one vblank after the display list is
+ * committed.
+ *
+ * The maximum number of one-shot entries is limited to 2 per display list body,
+ * and one-shot entries are counted in the total number of entries specified
+ * when the body is allocated by vsp1_dl_body_alloc().
+ */
+void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
+				u32 reset_value)
+{
+	if (WARN_ONCE(dlb->num_entries >= dlb->max_entries,
+		      "DLB size exceeded (max %u)", dlb->max_entries))
+		return;
+
+	if (WARN_ONCE(dlb->num_patches >= ARRAY_SIZE(dlb->patches),
+		      "DLB patches size exceeded (max %zu)",
+		      ARRAY_SIZE(dlb->patches)))
+		return;
+
+	dlb->patches[dlb->num_patches].entry = &dlb->entries[dlb->num_entries];
+	dlb->patches[dlb->num_patches].data = reset_value;
+	dlb->num_patches++;
+
+	dlb->entries[dlb->num_entries].addr = reg;
+	dlb->entries[dlb->num_entries].data = value;
+	dlb->num_entries++;
+}
+
 /* -----------------------------------------------------------------------------
  * Display List Extended Command Management
  */
@@ -652,6 +700,7 @@  static void __vsp1_dl_list_put(struct vsp1_dl_list *dl)
 	 * has at least one body, thus we reinitialise the entries list.
 	 */
 	dl->body0->num_entries = 0;
+	dl->body0->num_patches = 0;
 
 	list_add_tail(&dl->list, &dl->dlm->free);
 }
@@ -930,6 +979,35 @@  void vsp1_dl_list_commit(struct vsp1_dl_list *dl, unsigned int dl_flags)
  * Display List Manager
  */
 
+/**
+ * vsp1_dlm_irq_display_start - Display list handler for the display start
+ *	interrupt
+ * @dlm: the display list manager
+ *
+ * Apply all one-shot patches registered for the active display list.
+ */
+void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm)
+{
+	struct vsp1_dl_body *dlb;
+	struct vsp1_dl_list *dl;
+	unsigned int i;
+
+	spin_lock(&dlm->lock);
+
+	dl = dlm->active;
+	if (!dl)
+		goto done;
+
+	list_for_each_entry(dlb, &dl->bodies, list) {
+		for (i = 0; i < dlb->num_patches; ++i)
+			dlb->patches[i].entry->data = dlb->patches[i].data;
+		dlb->num_patches = 0;
+	}
+
+done:
+	spin_unlock(&dlm->lock);
+}
+
 /**
  * vsp1_dlm_irq_frame_end - Display list handler for the frame end interrupt
  * @dlm: the display list manager
diff --git a/drivers/media/platform/vsp1/vsp1_dl.h b/drivers/media/platform/vsp1/vsp1_dl.h
index e0fdb145e6ed..f845607abc4c 100644
--- a/drivers/media/platform/vsp1/vsp1_dl.h
+++ b/drivers/media/platform/vsp1/vsp1_dl.h
@@ -54,6 +54,7 @@  struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1,
 					unsigned int prealloc);
 void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm);
 void vsp1_dlm_reset(struct vsp1_dl_manager *dlm);
+void vsp1_dlm_irq_display_start(struct vsp1_dl_manager *dlm);
 unsigned int vsp1_dlm_irq_frame_end(struct vsp1_dl_manager *dlm);
 struct vsp1_dl_body *vsp1_dlm_dl_body_get(struct vsp1_dl_manager *dlm);
 
@@ -71,6 +72,8 @@  struct vsp1_dl_body *vsp1_dl_body_get(struct vsp1_dl_body_pool *pool);
 void vsp1_dl_body_put(struct vsp1_dl_body *dlb);
 
 void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 reg, u32 data);
+void vsp1_dl_body_write_oneshot(struct vsp1_dl_body *dlb, u32 reg, u32 value,
+				u32 reset_value);
 int vsp1_dl_list_add_body(struct vsp1_dl_list *dl, struct vsp1_dl_body *dlb);
 int vsp1_dl_list_add_chain(struct vsp1_dl_list *head, struct vsp1_dl_list *dl);