diff mbox series

media: ti-vpe: cal: fix DMA memory corruption

Message ID 20200313082639.7743-1-tomi.valkeinen@ti.com (mailing list archive)
State New, archived
Headers show
Series media: ti-vpe: cal: fix DMA memory corruption | expand

Commit Message

Tomi Valkeinen March 13, 2020, 8:26 a.m. UTC
When the CAL driver stops streaming, it will shut everything down
without waiting for the current frame to finish. This leaves the CAL DMA
in a slightly undefined state, and when the CAL DMA is enabled again the
next time the stream is started, the old DMA transfer will continue.

It is not clear whether the old DMA transfer continues with the exact
settings of the original transfer or with a mix of old and new
settings, but in any case the end result is memory corruption, as the
destination memory address is no longer valid.

I could not find any way to ensure that any old DMA transfer would be
discarded, except perhaps full CAL reset. But we cannot do a full reset
when one port is getting enabled, as that would reset both ports.

This patch tries to make sure that the DMA transfer is finished properly
when the stream is being stopped. I say "tries" because, as mentioned
above, I don't see a way to force the DMA transfer to finish. I believe
this fixes the corruptions for normal cases, but if for some reason the
DMA of the final frame stalled for a long time, resulting in a timeout in
the code waiting for the DMA to finish, we would again end up with an
unfinished DMA transfer. However, I don't know what could cause such a
timeout.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: stable@vger.kernel.org
---
 drivers/media/platform/ti-vpe/cal.c | 32 +++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

Comments

Laurent Pinchart March 13, 2020, 2:03 p.m. UTC | #1
Hi Tomi,

Thank you for the patch.

On Fri, Mar 13, 2020 at 10:26:39AM +0200, Tomi Valkeinen wrote:
> When the CAL driver stops streaming, it will shut everything down
> without waiting for the current frame to finish. This leaves the CAL DMA
> in a slightly undefined state, and when CAL DMA is enabled when the
> stream is started the next time, the old DMA transfer will continue.
> 
> It is not clear if the old DMA transfer continues with the exact
> settings of the original transfer, or is it a mix of old and new
> settings, but in any case the end result is memory corruption as the
> destination memory address is no longer valid.
> 
> I could not find any way to ensure that any old DMA transfer would be
> discarded, except perhaps full CAL reset. But we cannot do a full reset
> when one port is getting enabled, as that would reset both ports.
> 
> This patch tries to make sure that the DMA transfer is finished properly
> when the stream is being stopped. I say "tries", as, as mentioned above,
> I don't see a way to force the DMA transfer to finish. I believe this
> fixes the corruptions for normal cases, but if for some reason the DMA
> of the final frame would stall a lot, resulting in timeout in the code
> waiting for the DMA to finish, we'll again end up with unfinished DMA
> transfer. However, I don't know what could cause such a timeout.
> 
> Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
> Cc: stable@vger.kernel.org
> ---
>  drivers/media/platform/ti-vpe/cal.c | 32 +++++++++++++++++++++++++++++
>  1 file changed, 32 insertions(+)
> 
> diff --git a/drivers/media/platform/ti-vpe/cal.c b/drivers/media/platform/ti-vpe/cal.c
> index be54806180a5..b857cab120ad 100644
> --- a/drivers/media/platform/ti-vpe/cal.c
> +++ b/drivers/media/platform/ti-vpe/cal.c
> @@ -414,6 +414,8 @@ struct cal_ctx {
>  	struct cal_buffer	*cur_frm;
>  	/* Pointer pointing to next v4l2_buffer */
>  	struct cal_buffer	*next_frm;
> +
> +	bool dma_act;
>  };
>  
>  static const struct cal_fmt *find_format_by_pix(struct cal_ctx *ctx,
> @@ -944,6 +946,7 @@ static void csi2_lane_config(struct cal_ctx *ctx)
>  
>  static void csi2_ppi_enable(struct cal_ctx *ctx)
>  {
> +	reg_write(ctx->dev, CAL_CSI2_PPI_CTRL(ctx->csi2_port), BIT(3));
>  	reg_write_field(ctx->dev, CAL_CSI2_PPI_CTRL(ctx->csi2_port),
>  			CAL_GEN_ENABLE, CAL_CSI2_PPI_CTRL_IF_EN_MASK);
>  }
> @@ -1206,15 +1209,25 @@ static irqreturn_t cal_irq(int irq_cal, void *data)
>  		if (isportirqset(irqst2, 1)) {
>  			ctx = dev->ctx[0];
>  
> +			spin_lock(&ctx->slock);
> +			ctx->dma_act = false;
> +
>  			if (ctx->cur_frm != ctx->next_frm)
>  				cal_process_buffer_complete(ctx);
> +
> +			spin_unlock(&ctx->slock);
>  		}
>  
>  		if (isportirqset(irqst2, 2)) {
>  			ctx = dev->ctx[1];
>  
> +			spin_lock(&ctx->slock);
> +			ctx->dma_act = false;
> +
>  			if (ctx->cur_frm != ctx->next_frm)
>  				cal_process_buffer_complete(ctx);
> +
> +			spin_unlock(&ctx->slock);
>  		}
>  	}
>  
> @@ -1230,6 +1243,7 @@ static irqreturn_t cal_irq(int irq_cal, void *data)
>  			dma_q = &ctx->vidq;
>  
>  			spin_lock(&ctx->slock);
> +			ctx->dma_act = true;
>  			if (!list_empty(&dma_q->active) &&
>  			    ctx->cur_frm == ctx->next_frm)
>  				cal_schedule_next_buffer(ctx);
> @@ -1241,6 +1255,7 @@ static irqreturn_t cal_irq(int irq_cal, void *data)
>  			dma_q = &ctx->vidq;
>  
>  			spin_lock(&ctx->slock);
> +			ctx->dma_act = true;
>  			if (!list_empty(&dma_q->active) &&
>  			    ctx->cur_frm == ctx->next_frm)
>  				cal_schedule_next_buffer(ctx);
> @@ -1713,10 +1728,27 @@ static void cal_stop_streaming(struct vb2_queue *vq)
>  	struct cal_ctx *ctx = vb2_get_drv_priv(vq);
>  	struct cal_dmaqueue *dma_q = &ctx->vidq;
>  	struct cal_buffer *buf, *tmp;
> +	unsigned long timeout;
>  	unsigned long flags;
>  	int ret;
> +	bool dma_act;
>  
>  	csi2_ppi_disable(ctx);
> +
> +	/* wait for stream and dma to finish */
> +	dma_act = true;
> +	timeout = jiffies + msecs_to_jiffies(500);
> +	while (dma_act && time_before(jiffies, timeout)) {
> +		msleep(50);
> +
> +		spin_lock_irqsave(&ctx->slock, flags);
> +		dma_act = ctx->dma_act;
> +		spin_unlock_irqrestore(&ctx->slock, flags);
> +	}

Waiting for the transfer to complete seems to be a good idea, but how
about using a wait queue instead of such a loop ? That would allow
better usage of CPU time and faster reaction time, and shouldn't be
difficult to implement. You may also want to replace dma_act with a
state if needed (in case you need to express running/stopping/stopped
states), and I would rename it to running if you just need a boolean.

> +
> +	if (dma_act)
> +		ctx_err(ctx, "failed to disable dma cleanly\n");
> +
>  	disable_irqs(ctx);
>  	csi2_phy_deinit(ctx);
>
Tomi Valkeinen March 13, 2020, 2:18 p.m. UTC | #2
On 13/03/2020 16:03, Laurent Pinchart wrote:

>> +	/* wait for stream and dma to finish */
>> +	dma_act = true;
>> +	timeout = jiffies + msecs_to_jiffies(500);
>> +	while (dma_act && time_before(jiffies, timeout)) {
>> +		msleep(50);
>> +
>> +		spin_lock_irqsave(&ctx->slock, flags);
>> +		dma_act = ctx->dma_act;
>> +		spin_unlock_irqrestore(&ctx->slock, flags);
>> +	}
> 
> Waiting for the transfer to complete seems to be a good idea, but how
> about using a wait queue instead of such a loop ? That would allow
> better usage of CPU time and faster reaction time, and shouldn't be
> difficult to implement. You may also want to replace dma_act with a
> state if needed (in case you need to express running/stopping/stopped
> states), and I would rename it to running if you just need a boolean.

Maybe, but I wasn't sure how to implement it safely.

So, when we call csi2_ppi_disable() (just above the wait code above), the HW will stop the DMA after 
the next frame has ended.

But there's no way to know in the irq handler if the DMA transfer that just ended was the last one 
or not. And I don't see how I could set a "disabling" flag before calling csi2_ppi_disable(), as I 
think that would always be racy with the irq handler.

So I went with a safe way: call csi2_ppi_disable(), then wait a bit so that we are sure that either 
1) the last frame is ongoing or 2) the last frame has finished (rather than the previous-to-last frame 
being ongoing or finished). Then see if the DMA is active. If yes, we loop until it ends.

I think the loop could be replaced with a wait queue, but we still need the initial sleep to ensure 
we don't end the wait when the previous-to-last frame DMA has been finished.

  Tomi
Laurent Pinchart March 16, 2020, 10:28 a.m. UTC | #3
Hi Tomi,

On Fri, Mar 13, 2020 at 04:18:13PM +0200, Tomi Valkeinen wrote:
> On 13/03/2020 16:03, Laurent Pinchart wrote:
> 
> >> +	/* wait for stream and dma to finish */
> >> +	dma_act = true;
> >> +	timeout = jiffies + msecs_to_jiffies(500);
> >> +	while (dma_act && time_before(jiffies, timeout)) {
> >> +		msleep(50);
> >> +
> >> +		spin_lock_irqsave(&ctx->slock, flags);
> >> +		dma_act = ctx->dma_act;
> >> +		spin_unlock_irqrestore(&ctx->slock, flags);
> >> +	}
> > 
> > Waiting for the transfer to complete seems to be a good idea, but how
> > about using a wait queue instead of such a loop ? That would allow
> > better usage of CPU time and faster reaction time, and shouldn't be
> > difficult to implement. You may also want to replace dma_act with a
> > state if needed (in case you need to express running/stopping/stopped
> > states), and I would rename it to running if you just need a boolean.
> 
> Maybe, but I wasn't sure how to implement it safely.
> 
> So, when we call csi2_ppi_disable() (just above the wait code above), the HW will stop the DMA after 
> the next frame has ended.
> 
> But there's no way to know in the irq handler if the DMA transfer that just ended was the last one 
> or not. And I don't see how I could set a "disabling" flag before calling csi2_ppi_disable(), as I 
> think that would always be racy with the irq handler.
> 
> So I went with a safe way: call csi2_ppi_disable(), then wait a bit so that we are sure that either 
> 1) the last frame is on going 2) the last frame has finished (instead of the previous-to-last frame 
> is on going or finished). Then see if the DMA is active. If yes, we loop for it to end.
> 
> I think the loop could be replaced with a wait queue, but we still need the initial sleep to ensure 
> we don't end the wait when the previous-to-last frame DMA has been finished.

I think you can solve this by introducing a new enum state field with
RUNNING, STOPPING and STOPPED values, protected by a spinlock. Here's
what I have in the VSP1 driver for instance:

/*
 * Report whether the pipeline is currently in the VSP1_PIPELINE_STOPPED state.
 *
 * pipe->state is sampled while holding pipe->irqlock (taken with
 * spin_lock_irqsave), so the read is serialized against other holders of
 * that lock.
 *
 * Returns true if pipe->state equals VSP1_PIPELINE_STOPPED, false otherwise.
 */
bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
{
	unsigned long flags;
	bool stopped;

	spin_lock_irqsave(&pipe->irqlock, flags);
	stopped = pipe->state == VSP1_PIPELINE_STOPPED;
	spin_unlock_irqrestore(&pipe->irqlock, flags);

	return stopped;
}

int vsp1_pipeline_stop(struct vsp1_pipeline *pipe)
{
	...
	spin_lock_irqsave(&pipe->irqlock, flags);
	if (pipe->state == VSP1_PIPELINE_RUNNING)
		pipe->state = VSP1_PIPELINE_STOPPING;
	spin_unlock_irqrestore(&pipe->irqlock, flags);

	ret = wait_event_timeout(pipe->wq, vsp1_pipeline_stopped(pipe),
				 msecs_to_jiffies(500));
	ret = ret == 0 ? -ETIMEDOUT : 0;
	...
}

and in the interrupt handler:

	state = pipe->state;
	pipe->state = VSP1_PIPELINE_STOPPED;

	/*
	 * If a stop has been requested, mark the pipeline as stopped and
	 * return. Otherwise restart the pipeline if ready.
	 */
	if (state == VSP1_PIPELINE_STOPPING)
		wake_up(&pipe->wq);
	else if (vsp1_pipeline_ready(pipe))
		vsp1_video_pipeline_run(pipe);
Tomi Valkeinen March 16, 2020, 11:02 a.m. UTC | #4
On 16/03/2020 12:28, Laurent Pinchart wrote:
> Hi Tomi,
> 
> On Fri, Mar 13, 2020 at 04:18:13PM +0200, Tomi Valkeinen wrote:
>> On 13/03/2020 16:03, Laurent Pinchart wrote:
>>
>>>> +	/* wait for stream and dma to finish */
>>>> +	dma_act = true;
>>>> +	timeout = jiffies + msecs_to_jiffies(500);
>>>> +	while (dma_act && time_before(jiffies, timeout)) {
>>>> +		msleep(50);
>>>> +
>>>> +		spin_lock_irqsave(&ctx->slock, flags);
>>>> +		dma_act = ctx->dma_act;
>>>> +		spin_unlock_irqrestore(&ctx->slock, flags);
>>>> +	}
>>>
>>> Waiting for the transfer to complete seems to be a good idea, but how
>>> about using a wait queue instead of such a loop ? That would allow
>>> better usage of CPU time and faster reaction time, and shouldn't be
>>> difficult to implement. You may also want to replace dma_act with a
>>> state if needed (in case you need to express running/stopping/stopped
>>> states), and I would rename it to running if you just need a boolean.
>>
>> Maybe, but I wasn't sure how to implement it safely.
>>
>> So, when we call csi2_ppi_disable() (just above the wait code above), the HW will stop the DMA after
>> the next frame has ended.
>>
>> But there's no way to know in the irq handler if the DMA transfer that just ended was the last one
>> or not. And I don't see how I could set a "disabling" flag before calling csi2_ppi_disable(), as I
>> think that would always be racy with the irq handler.
>>
>> So I went with a safe way: call csi2_ppi_disable(), then wait a bit so that we are sure that either
>> 1) the last frame is on going 2) the last frame has finished (instead of the previous-to-last frame
>> is on going or finished). Then see if the DMA is active. If yes, we loop for it to end.
>>
>> I think the loop could be replaced with a wait queue, but we still need the initial sleep to ensure
>> we don't end the wait when the previous-to-last frame DMA has been finished.
> 
> I think you can solve this by introducing a new enum state field with
> RUNNING, STOPPING and STOPPED values, protected by a spinlock. Here's
> what I have in the VSP1 driver for instance:

I'm still unsure if such approach would work.

When we're handling DMA-stopped irq, we cannot know if the HW will start transferring a new frame 
right afterwards. Even if we track the call to csi2_ppi_disable(), which disables the interface, we 
cannot know if that call actually happened in time.

So, a sequence like this (H = hardware, 1 = thread 1, 2 = thread 2):

H  Frame ends
H  DMA stops
H  DMA stopped HW irq
1  ISR starts running
H  Frame starts
2  csi2_ppi_disable()
H  DMA starts
1  ISR sees DMA-stopped irq and PPI is disabled, so last frame.
    But PPI disable was called after frame start, so DMA is
    running again, and we'll get DMA started irq soon

  Tomi
diff mbox series

Patch

diff --git a/drivers/media/platform/ti-vpe/cal.c b/drivers/media/platform/ti-vpe/cal.c
index be54806180a5..b857cab120ad 100644
--- a/drivers/media/platform/ti-vpe/cal.c
+++ b/drivers/media/platform/ti-vpe/cal.c
@@ -414,6 +414,8 @@  struct cal_ctx {
 	struct cal_buffer	*cur_frm;
 	/* Pointer pointing to next v4l2_buffer */
 	struct cal_buffer	*next_frm;
+
+	bool dma_act;
 };
 
 static const struct cal_fmt *find_format_by_pix(struct cal_ctx *ctx,
@@ -944,6 +946,7 @@  static void csi2_lane_config(struct cal_ctx *ctx)
 
 static void csi2_ppi_enable(struct cal_ctx *ctx)
 {
+	reg_write(ctx->dev, CAL_CSI2_PPI_CTRL(ctx->csi2_port), BIT(3));
 	reg_write_field(ctx->dev, CAL_CSI2_PPI_CTRL(ctx->csi2_port),
 			CAL_GEN_ENABLE, CAL_CSI2_PPI_CTRL_IF_EN_MASK);
 }
@@ -1206,15 +1209,25 @@  static irqreturn_t cal_irq(int irq_cal, void *data)
 		if (isportirqset(irqst2, 1)) {
 			ctx = dev->ctx[0];
 
+			spin_lock(&ctx->slock);
+			ctx->dma_act = false;
+
 			if (ctx->cur_frm != ctx->next_frm)
 				cal_process_buffer_complete(ctx);
+
+			spin_unlock(&ctx->slock);
 		}
 
 		if (isportirqset(irqst2, 2)) {
 			ctx = dev->ctx[1];
 
+			spin_lock(&ctx->slock);
+			ctx->dma_act = false;
+
 			if (ctx->cur_frm != ctx->next_frm)
 				cal_process_buffer_complete(ctx);
+
+			spin_unlock(&ctx->slock);
 		}
 	}
 
@@ -1230,6 +1243,7 @@  static irqreturn_t cal_irq(int irq_cal, void *data)
 			dma_q = &ctx->vidq;
 
 			spin_lock(&ctx->slock);
+			ctx->dma_act = true;
 			if (!list_empty(&dma_q->active) &&
 			    ctx->cur_frm == ctx->next_frm)
 				cal_schedule_next_buffer(ctx);
@@ -1241,6 +1255,7 @@  static irqreturn_t cal_irq(int irq_cal, void *data)
 			dma_q = &ctx->vidq;
 
 			spin_lock(&ctx->slock);
+			ctx->dma_act = true;
 			if (!list_empty(&dma_q->active) &&
 			    ctx->cur_frm == ctx->next_frm)
 				cal_schedule_next_buffer(ctx);
@@ -1713,10 +1728,27 @@  static void cal_stop_streaming(struct vb2_queue *vq)
 	struct cal_ctx *ctx = vb2_get_drv_priv(vq);
 	struct cal_dmaqueue *dma_q = &ctx->vidq;
 	struct cal_buffer *buf, *tmp;
+	unsigned long timeout;
 	unsigned long flags;
 	int ret;
+	bool dma_act;
 
 	csi2_ppi_disable(ctx);
+
+	/* wait for stream and dma to finish */
+	dma_act = true;
+	timeout = jiffies + msecs_to_jiffies(500);
+	while (dma_act && time_before(jiffies, timeout)) {
+		msleep(50);
+
+		spin_lock_irqsave(&ctx->slock, flags);
+		dma_act = ctx->dma_act;
+		spin_unlock_irqrestore(&ctx->slock, flags);
+	}
+
+	if (dma_act)
+		ctx_err(ctx, "failed to disable dma cleanly\n");
+
 	disable_irqs(ctx);
 	csi2_phy_deinit(ctx);