diff mbox series

[v1,2/2] drm: Clear the fence pointer when writeback job signaled

Message ID 1564571048-15029-3-git-send-email-lowry.li@arm.com (mailing list archive)
State New, archived
Headers show
Series Free the writeback_job when it with an empty fb | expand

Commit Message

Lowry Li (Arm Technology China) July 31, 2019, 11:04 a.m. UTC
When signaling the completion of a writeback job, clear the out_fence
pointer after releasing the fence.

If a fence is left over in drm_writeback_cleanup_job(), release it.

Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
---
 drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

Comments

Liviu Dudau July 31, 2019, 1:15 p.m. UTC | #1
Hi Lowry,

On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> During it signals the completion of a writeback job, after releasing
> the out_fence, we'd clear the pointer.
> 
> Check if fence left over in drm_writeback_cleanup_job(), release it.
> 
> Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> ---
>  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index ff138b6..43d9e3b 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
>  	if (job->fb)
>  		drm_framebuffer_put(job->fb);
>  
> +	if (job->out_fence)
> +		dma_fence_put(job->out_fence);
> +
>  	kfree(job);
>  }

This change looks good.

>  EXPORT_SYMBOL(drm_writeback_cleanup_job);
> @@ -366,25 +369,29 @@ static void cleanup_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct drm_writeback_job *job;
> +	struct dma_fence *out_fence;
>  
>  	spin_lock_irqsave(&wb_connector->job_lock, flags);
>  	job = list_first_entry_or_null(&wb_connector->job_queue,
>  				       struct drm_writeback_job,
>  				       list_entry);
> -	if (job) {
> +	if (job)
>  		list_del(&job->list_entry);
> -		if (job->out_fence) {
> -			if (status)
> -				dma_fence_set_error(job->out_fence, status);
> -			dma_fence_signal(job->out_fence);
> -			dma_fence_put(job->out_fence);

*Here*

> -		}
> -	}
> +
>  	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
>  
>  	if (WARN_ON(!job))
>  		return;
>  
> +	out_fence = job->out_fence;
> +	if (out_fence) {
> +		if (status)
> +			dma_fence_set_error(out_fence, status);
> +		dma_fence_signal(out_fence);
> +		dma_fence_put(out_fence);
> +		job->out_fence = NULL;
> +	}
> +

I don't get the point of this change. Why not just add job->out_fence = NULL
where *Here* is?

Best regards,
Liviu 

>  	INIT_WORK(&job->cleanup_work, cleanup_work);
>  	queue_work(system_long_wq, &job->cleanup_work);
>  }
> -- 
> 1.9.1
>
Brian Starkey July 31, 2019, 1:20 p.m. UTC | #2
Hi Lowry,

Thanks for this cleanup.

On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> During it signals the completion of a writeback job, after releasing
> the out_fence, we'd clear the pointer.
> 
> Check if fence left over in drm_writeback_cleanup_job(), release it.
> 
> Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> ---
>  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index ff138b6..43d9e3b 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
>  	if (job->fb)
>  		drm_framebuffer_put(job->fb);
>  
> +	if (job->out_fence)

I'm thinking it might be a good idea to signal the fence with an error
here, if it's not already signaled. Otherwise, if there's someone
waiting (which there shouldn't be), they're going to be waiting a very
long time :-)

Thanks,
-Brian

> +		dma_fence_put(job->out_fence);
> +
>  	kfree(job);
>  }
Lowry Li (Arm Technology China) Aug. 1, 2019, 6:31 a.m. UTC | #3
Hi Liviu,

On Wed, Jul 31, 2019 at 01:15:25PM +0000, Liviu Dudau wrote:
> Hi Lowry,
> 
> On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > During it signals the completion of a writeback job, after releasing
> > the out_fence, we'd clear the pointer.
> > 
> > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > 
> > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > ---
> >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> >  1 file changed, 15 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > index ff138b6..43d9e3b 100644
> > --- a/drivers/gpu/drm/drm_writeback.c
> > +++ b/drivers/gpu/drm/drm_writeback.c
> > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> >  	if (job->fb)
> >  		drm_framebuffer_put(job->fb);
> >  
> > +	if (job->out_fence)
> > +		dma_fence_put(job->out_fence);
> > +
> >  	kfree(job);
> >  }
> 
> This change looks good.
> 
> >  EXPORT_SYMBOL(drm_writeback_cleanup_job);
> > @@ -366,25 +369,29 @@ static void cleanup_work(struct work_struct *work)
> >  {
> >  	unsigned long flags;
> >  	struct drm_writeback_job *job;
> > +	struct dma_fence *out_fence;
> >  
> >  	spin_lock_irqsave(&wb_connector->job_lock, flags);
> >  	job = list_first_entry_or_null(&wb_connector->job_queue,
> >  				       struct drm_writeback_job,
> >  				       list_entry);
> > -	if (job) {
> > +	if (job)
> >  		list_del(&job->list_entry);
> > -		if (job->out_fence) {
> > -			if (status)
> > -				dma_fence_set_error(job->out_fence, status);
> > -			dma_fence_signal(job->out_fence);
> > -			dma_fence_put(job->out_fence);
> 
> *Here*
> 
> > -		}
> > -	}
> > +
> >  	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
> >  
> >  	if (WARN_ON(!job))
> >  		return;
> >  
> > +	out_fence = job->out_fence;
> > +	if (out_fence) {
> > +		if (status)
> > +			dma_fence_set_error(out_fence, status);
> > +		dma_fence_signal(out_fence);
> > +		dma_fence_put(out_fence);
> > +		job->out_fence = NULL;
> > +	}
> > +
> 
> I don't get the point of this change. Why not just add job->out_fence = NULL
> where *Here* is?
>
> Best regards,
> Liviu 
Besides setting the pointer to NULL, I also refined the code by moving
the fence operations out of the locked section.

Best regards,
Lowry 
> >  	INIT_WORK(&job->cleanup_work, cleanup_work);
> >  	queue_work(system_long_wq, &job->cleanup_work);
> >  }
> > -- 
> > 1.9.1
> > 
> 
> -- 
> ====================
> | I would like to |
> | fix the world,  |
> | but they're not |
> | giving me the   |
>  \ source code!  /
>   ---------------
>     ¯\_(ツ)_/¯
Lowry Li (Arm Technology China) Aug. 1, 2019, 6:34 a.m. UTC | #4
Hi Brian,

On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> Hi Lowry,
> 
> Thanks for this cleanup.
> 
> On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > During it signals the completion of a writeback job, after releasing
> > the out_fence, we'd clear the pointer.
> > 
> > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > 
> > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > ---
> >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> >  1 file changed, 15 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > index ff138b6..43d9e3b 100644
> > --- a/drivers/gpu/drm/drm_writeback.c
> > +++ b/drivers/gpu/drm/drm_writeback.c
> > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> >  	if (job->fb)
> >  		drm_framebuffer_put(job->fb);
> >  
> > +	if (job->out_fence)
> 
> I'm thinking it might be a good idea to signal the fence with an error
> here, if it's not already signaled. Otherwise, if there's someone
> waiting (which there shouldn't be), they're going to be waiting a very
> long time :-)
> 
> Thanks,
> -Brian
> 
This path is reached when atomic_check fails or for a test-only commit.
In both cases the commit has been dropped and this is only a cleanup, so
it is better not to treat it as an error case :)

From userspace's point of view, the commit has either failed or was
test-only, so the writeback fence should not be signaled.

Best regards,
Lowry
> > +		dma_fence_put(job->out_fence);
> > +
> >  	kfree(job);
> >  }
Liviu Dudau Aug. 1, 2019, 9:58 a.m. UTC | #5
On Thu, Aug 01, 2019 at 06:31:13AM +0000, Lowry Li (Arm Technology China) wrote:
> Hi Liviu,
> 
> On Wed, Jul 31, 2019 at 01:15:25PM +0000, Liviu Dudau wrote:
> > Hi Lowry,
> > 
> > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > During it signals the completion of a writeback job, after releasing
> > > the out_fence, we'd clear the pointer.
> > > 
> > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > 
> > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > ---
> > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > index ff138b6..43d9e3b 100644
> > > --- a/drivers/gpu/drm/drm_writeback.c
> > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > >  	if (job->fb)
> > >  		drm_framebuffer_put(job->fb);
> > >  
> > > +	if (job->out_fence)
> > > +		dma_fence_put(job->out_fence);
> > > +
> > >  	kfree(job);
> > >  }
> > 
> > This change looks good.
> > 
> > >  EXPORT_SYMBOL(drm_writeback_cleanup_job);
> > > @@ -366,25 +369,29 @@ static void cleanup_work(struct work_struct *work)
> > >  {
> > >  	unsigned long flags;
> > >  	struct drm_writeback_job *job;
> > > +	struct dma_fence *out_fence;
> > >  
> > >  	spin_lock_irqsave(&wb_connector->job_lock, flags);
> > >  	job = list_first_entry_or_null(&wb_connector->job_queue,
> > >  				       struct drm_writeback_job,
> > >  				       list_entry);
> > > -	if (job) {
> > > +	if (job)
> > >  		list_del(&job->list_entry);
> > > -		if (job->out_fence) {
> > > -			if (status)
> > > -				dma_fence_set_error(job->out_fence, status);
> > > -			dma_fence_signal(job->out_fence);
> > > -			dma_fence_put(job->out_fence);
> > 
> > *Here*
> > 
> > > -		}
> > > -	}
> > > +
> > >  	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
> > >  
> > >  	if (WARN_ON(!job))
> > >  		return;
> > >  
> > > +	out_fence = job->out_fence;
> > > +	if (out_fence) {
> > > +		if (status)
> > > +			dma_fence_set_error(out_fence, status);
> > > +		dma_fence_signal(out_fence);
> > > +		dma_fence_put(out_fence);
> > > +		job->out_fence = NULL;
> > > +	}
> > > +
> > 
> > I don't get the point of this change. Why not just add job->out_fence = NULL
> > where *Here* is?
> >
> > Best regards,
> > Liviu 
> Besides setting NULL, also did a refine by moving the fence operation
> out of the lock block.

OK, now it makes sense. May I suggest you add that to the commit message?

Otherwise, Acked-by: Liviu Dudau <liviu.dudau@arm.com>

Best regards,
Liviu

> 
> Best regards,
> Lowry 
> > >  	INIT_WORK(&job->cleanup_work, cleanup_work);
> > >  	queue_work(system_long_wq, &job->cleanup_work);
> > >  }
> > > -- 
> > > 1.9.1
> > > 
> > 
> > -- 
> > ====================
> > | I would like to |
> > | fix the world,  |
> > | but they're not |
> > | giving me the   |
> >  \ source code!  /
> >   ---------------
> >     ¯\_(ツ)_/¯
> 
> -- 
> Regards,
> Lowry
Brian Starkey Aug. 2, 2019, 9:29 a.m. UTC | #6
Hi Lowry,

On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> Hi Brian,
> 
> On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > Hi Lowry,
> > 
> > Thanks for this cleanup.
> > 
> > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > During it signals the completion of a writeback job, after releasing
> > > the out_fence, we'd clear the pointer.
> > > 
> > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > 
> > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > ---
> > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > index ff138b6..43d9e3b 100644
> > > --- a/drivers/gpu/drm/drm_writeback.c
> > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > >  	if (job->fb)
> > >  		drm_framebuffer_put(job->fb);
> > >  
> > > +	if (job->out_fence)
> > 
> > I'm thinking it might be a good idea to signal the fence with an error
> > here, if it's not already signaled. Otherwise, if there's someone
> > waiting (which there shouldn't be), they're going to be waiting a very
> > long time :-)
> > 
> > Thanks,
> > -Brian
> > 
> Here it happened at atomic_check failed and test only commit. For both
> cases, the commit has been dropped and it's only a clean up. So here better
> not be treated as an error case:)

If anyone else has a reference on the fence, then IMO it absolutely is
an error to reach this point without the fence being signaled -
because it means that the fence will never be signaled.

I don't think the API gives you a way to check if this is the last
reference, so it's safest to just make sure the fence is signalled
before dropping the reference.

It just feels wrong to me to have the possibility of a dangling fence
which is never going to get signalled; and it's an easy defensive step
to make sure it can never happen.

I know it _shouldn't_ happen, but we often put in handling for cases
which shouldn't happen, because they frequently do happen :-)

> 
> Since for userspace, it should have been failed or a test only case, so
> writebace fence should not be signaled.

It's not only userspace that can wait on fences (and in fact this
fence will never even reach userspace if the commit fails), the driver
may have taken a copy to use for "something".

Cheers,
-Brian

> 
> Best regards,
> Lowry
> > > +		dma_fence_put(job->out_fence);
> > > +
> > >  	kfree(job);
> > >  }
> 
> -- 
> Regards,
> Lowry
Daniel Vetter Aug. 2, 2019, 9:43 a.m. UTC | #7
On Fri, Aug 2, 2019 at 11:29 AM Brian Starkey <Brian.Starkey@arm.com> wrote:
>
> Hi Lowry,
>
> On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> > Hi Brian,
> >
> > On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > > Hi Lowry,
> > >
> > > Thanks for this cleanup.
> > >
> > > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > During it signals the completion of a writeback job, after releasing
> > > > the out_fence, we'd clear the pointer.
> > > >
> > > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > >
> > > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > > ---
> > > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > > index ff138b6..43d9e3b 100644
> > > > --- a/drivers/gpu/drm/drm_writeback.c
> > > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > > >   if (job->fb)
> > > >           drm_framebuffer_put(job->fb);
> > > >
> > > > + if (job->out_fence)
> > >
> > > I'm thinking it might be a good idea to signal the fence with an error
> > > here, if it's not already signaled. Otherwise, if there's someone
> > > waiting (which there shouldn't be), they're going to be waiting a very
> > > long time :-)
> > >
> > > Thanks,
> > > -Brian
> > >
> > Here it happened at atomic_check failed and test only commit. For both
> > cases, the commit has been dropped and it's only a clean up. So here better
> > not be treated as an error case:)
>
> If anyone else has a reference on the fence, then IMO it absolutely is
> an error to reach this point without the fence being signaled -
> because it means that the fence will never be signaled.
>
> I don't think the API gives you a way to check if this is the last
> reference, so it's safest to just make sure the fence is signalled
> before dropping the reference.
>
> It just feels wrong to me to have the possibility of a dangling fence
> which is never going to get signalled; and it's an easy defensive step
> to make sure it can never happen.
>
> I know it _shouldn't_ happen, but we often put in handling for cases
> which shouldn't happen, because they frequently do happen :-)

We're not as paranoid with the vblank fences either, so not sure why
we need to be this paranoid with writeback fences. If your driver
grabs anything from the atomic state in ->atomic_check it's buggy
anyway.

If you want to fix this properly I think we need to move the call to
prepare_signalling() in between atomic_check and atomic_commit. Then I
think it makes sense to also force-complete the fence on error ...
-Daniel

> > Since for userspace, it should have been failed or a test only case, so
> > writebace fence should not be signaled.
>
> It's not only userspace that can wait on fences (and in fact this
> fence will never even reach userspace if the commit fails), the driver
> may have taken a copy to use for "something".
>
> Cheers,
> -Brian
>
> >
> > Best regards,
> > Lowry
> > > > +         dma_fence_put(job->out_fence);
> > > > +
> > > >   kfree(job);
> > > >  }
> >
> > --
> > Regards,
> > Lowry
Daniel Vetter Aug. 2, 2019, 9:45 a.m. UTC | #8
On Fri, Aug 2, 2019 at 11:43 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Fri, Aug 2, 2019 at 11:29 AM Brian Starkey <Brian.Starkey@arm.com> wrote:
> >
> > Hi Lowry,
> >
> > On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> > > Hi Brian,
> > >
> > > On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > > > Hi Lowry,
> > > >
> > > > Thanks for this cleanup.
> > > >
> > > > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > > During it signals the completion of a writeback job, after releasing
> > > > > the out_fence, we'd clear the pointer.
> > > > >
> > > > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > > >
> > > > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > > > ---
> > > > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > > > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > > > index ff138b6..43d9e3b 100644
> > > > > --- a/drivers/gpu/drm/drm_writeback.c
> > > > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > > > >   if (job->fb)
> > > > >           drm_framebuffer_put(job->fb);
> > > > >
> > > > > + if (job->out_fence)
> > > >
> > > > I'm thinking it might be a good idea to signal the fence with an error
> > > > here, if it's not already signaled. Otherwise, if there's someone
> > > > waiting (which there shouldn't be), they're going to be waiting a very
> > > > long time :-)
> > > >
> > > > Thanks,
> > > > -Brian
> > > >
> > > Here it happened at atomic_check failed and test only commit. For both
> > > cases, the commit has been dropped and it's only a clean up. So here better
> > > not be treated as an error case:)
> >
> > If anyone else has a reference on the fence, then IMO it absolutely is
> > an error to reach this point without the fence being signaled -
> > because it means that the fence will never be signaled.
> >
> > I don't think the API gives you a way to check if this is the last
> > reference, so it's safest to just make sure the fence is signalled
> > before dropping the reference.
> >
> > It just feels wrong to me to have the possibility of a dangling fence
> > which is never going to get signalled; and it's an easy defensive step
> > to make sure it can never happen.
> >
> > I know it _shouldn't_ happen, but we often put in handling for cases
> > which shouldn't happen, because they frequently do happen :-)
>
> We're not as paranoid with the vblank fences either, so not sure why
> we need to be this paranoid with writeback fences. If your driver
> grabs anything from the atomic state in ->atomic_check it's buggy
> anyway.
>
> If you want to fix this properly I think we need to move the call to
> prepare_signalling() in between atomic_check and atomic_commit. Then I
> think it makes sense to also force-complete the fence on error ...
>
> > > Since for userspace, it should have been failed or a test only case, so
> > > writebace fence should not be signaled.
> >
> > It's not only userspace that can wait on fences (and in fact this
> > fence will never even reach userspace if the commit fails), the driver
> > may have taken a copy to use for "something".

I forgot to add: you can check this by looking at the fence reference
count. A WARN_ON if that's more than 1 on cleanup (but also for the
out fences) could be a nice addition.
-Daniel
Brian Starkey Aug. 2, 2019, 10:09 a.m. UTC | #9
Hi Daniel,

On Fri, Aug 02, 2019 at 11:45:13AM +0200, Daniel Vetter wrote:
> On Fri, Aug 2, 2019 at 11:43 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Fri, Aug 2, 2019 at 11:29 AM Brian Starkey <Brian.Starkey@arm.com> wrote:
> > >
> > > Hi Lowry,
> > >
> > > On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > Hi Brian,
> > > >
> > > > On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > > > > Hi Lowry,
> > > > >
> > > > > Thanks for this cleanup.
> > > > >
> > > > > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > > > During it signals the completion of a writeback job, after releasing
> > > > > > the out_fence, we'd clear the pointer.
> > > > > >
> > > > > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > > > >
> > > > > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > > > > ---
> > > > > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > > > > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > > > > index ff138b6..43d9e3b 100644
> > > > > > --- a/drivers/gpu/drm/drm_writeback.c
> > > > > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > > > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > > > > >   if (job->fb)
> > > > > >           drm_framebuffer_put(job->fb);
> > > > > >
> > > > > > + if (job->out_fence)
> > > > >
> > > > > I'm thinking it might be a good idea to signal the fence with an error
> > > > > here, if it's not already signaled. Otherwise, if there's someone
> > > > > waiting (which there shouldn't be), they're going to be waiting a very
> > > > > long time :-)
> > > > >
> > > > > Thanks,
> > > > > -Brian
> > > > >
> > > > Here it happened at atomic_check failed and test only commit. For both
> > > > cases, the commit has been dropped and it's only a clean up. So here better
> > > > not be treated as an error case:)
> > >
> > > If anyone else has a reference on the fence, then IMO it absolutely is
> > > an error to reach this point without the fence being signaled -
> > > because it means that the fence will never be signaled.
> > >
> > > I don't think the API gives you a way to check if this is the last
> > > reference, so it's safest to just make sure the fence is signalled
> > > before dropping the reference.
> > >
> > > It just feels wrong to me to have the possibility of a dangling fence
> > > which is never going to get signalled; and it's an easy defensive step
> > > to make sure it can never happen.
> > >
> > > I know it _shouldn't_ happen, but we often put in handling for cases
> > > which shouldn't happen, because they frequently do happen :-)
> >
> > We're not as paranoid with the vblank fences either, so not sure why
> > we need to be this paranoid with writeback fences. If your driver
> > grabs anything from the atomic state in ->atomic_check it's buggy
> > anyway.
> >
> > If you want to fix this properly I think we need to move the call to
> > prepare_signalling() in between atomic_check and atomic_commit. Then I
> > think it makes sense to also force-complete the fence on error ...

Well, fair enough. I'm struggling with "that's too paranoid" vs "fix
it properly" though? Is it a "problem" worth fixing or not?

It seems natural to me to do the fence cleanup in the cleanup function
for the object which owns the fence.

> >
> > > > Since for userspace, it should have been failed or a test only case, so
> > > > writebace fence should not be signaled.
> > >
> > > It's not only userspace that can wait on fences (and in fact this
> > > fence will never even reach userspace if the commit fails), the driver
> > > may have taken a copy to use for "something".
> 
> I forgot to add: you can check this by looking at the fence reference
> count. A WARN_ON if that's more than 1 on cleanup (but also for the
> out fences) could be a nice addition.

Do we really want to be looking at the fence internals directly like
that?

Cheers,
-Brian

> -Daniel
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
James Qian Wang Aug. 2, 2019, 10:09 a.m. UTC | #10
On Fri, Aug 02, 2019 at 05:29:20PM +0800, Brian Starkey wrote:
> Hi Lowry,
> 
> On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> > Hi Brian,
> > 
> > On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > > Hi Lowry,
> > > 
> > > Thanks for this cleanup.
> > > 
> > > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > During it signals the completion of a writeback job, after releasing
> > > > the out_fence, we'd clear the pointer.
> > > > 
> > > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > > 
> > > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > > ---
> > > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > > index ff138b6..43d9e3b 100644
> > > > --- a/drivers/gpu/drm/drm_writeback.c
> > > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > > >  	if (job->fb)
> > > >  		drm_framebuffer_put(job->fb);
> > > >  
> > > > +	if (job->out_fence)
> > > 
> > > I'm thinking it might be a good idea to signal the fence with an error
> > > here, if it's not already signaled. Otherwise, if there's someone
> > > waiting (which there shouldn't be), they're going to be waiting a very
> > > long time :-)
> > > 
> > > Thanks,
> > > -Brian
> > > 
> > Here it happened at atomic_check failed and test only commit. For both
> > cases, the commit has been dropped and it's only a clean up. So here better
> > not be treated as an error case:)
> 
> If anyone else has a reference on the fence, then IMO it absolutely is
> an error to reach this point without the fence being signaled -
> because it means that the fence will never be signaled.
> 
> I don't think the API gives you a way to check if this is the last
> reference, so it's safest to just make sure the fence is signalled
> before dropping the reference.
> 
> It just feels wrong to me to have the possibility of a dangling fence
> which is never going to get signalled; and it's an easy defensive step
> to make sure it can never happen.
> 
> I know it _shouldn't_ happen, but we often put in handling for cases
> which shouldn't happen, because they frequently do happen :-)
> 
> > 
> > Since for userspace, it should have been failed or a test only case, so
> > writebace fence should not be signaled.
> 
> It's not only userspace that can wait on fences (and in fact this
> fence will never even reach userspace if the commit fails), the driver
> may have taken a copy to use for "something".
>

Maybe we can add cancellation of the writeback fence to
complete_signaling() for the atomic_check-failed cleanup, as is done for
the crtc->out_fence.

Then, if we still get a fence at this point, that must be an error, so
we signal it and WARN.

Thanks
James

> Cheers,
> -Brian
> 
> > 
> > Best regards,
> > Lowry
> > > > +		dma_fence_put(job->out_fence);
> > > > +
> > > >  	kfree(job);
> > > >  }
> > 
> > -- 
> > Regards
> > Lowry
Daniel Vetter Aug. 2, 2019, 2:06 p.m. UTC | #11
On Fri, Aug 02, 2019 at 10:09:05AM +0000, Brian Starkey wrote:
> Hi Daniel,
> 
> On Fri, Aug 02, 2019 at 11:45:13AM +0200, Daniel Vetter wrote:
> > On Fri, Aug 2, 2019 at 11:43 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Fri, Aug 2, 2019 at 11:29 AM Brian Starkey <Brian.Starkey@arm.com> wrote:
> > > >
> > > > Hi Lowry,
> > > >
> > > > On Thu, Aug 01, 2019 at 06:34:08AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > > Hi Brian,
> > > > >
> > > > > On Wed, Jul 31, 2019 at 09:20:04PM +0800, Brian Starkey wrote:
> > > > > > Hi Lowry,
> > > > > >
> > > > > > Thanks for this cleanup.
> > > > > >
> > > > > > On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> > > > > > > During it signals the completion of a writeback job, after releasing
> > > > > > > the out_fence, we'd clear the pointer.
> > > > > > >
> > > > > > > Check if fence left over in drm_writeback_cleanup_job(), release it.
> > > > > > >
> > > > > > > Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> > > > > > > ---
> > > > > > >  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
> > > > > > >  1 file changed, 15 insertions(+), 8 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> > > > > > > index ff138b6..43d9e3b 100644
> > > > > > > --- a/drivers/gpu/drm/drm_writeback.c
> > > > > > > +++ b/drivers/gpu/drm/drm_writeback.c
> > > > > > > @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
> > > > > > >   if (job->fb)
> > > > > > >           drm_framebuffer_put(job->fb);
> > > > > > >
> > > > > > > + if (job->out_fence)
> > > > > >
> > > > > > I'm thinking it might be a good idea to signal the fence with an error
> > > > > > here, if it's not already signaled. Otherwise, if there's someone
> > > > > > waiting (which there shouldn't be), they're going to be waiting a very
> > > > > > long time :-)
> > > > > >
> > > > > > Thanks,
> > > > > > -Brian
> > > > > >
> > > > > Here it happened at atomic_check failed and test only commit. For both
> > > > > cases, the commit has been dropped and it's only a clean up. So here better
> > > > > not be treated as an error case:)
> > > >
> > > > If anyone else has a reference on the fence, then IMO it absolutely is
> > > > an error to reach this point without the fence being signaled -
> > > > because it means that the fence will never be signaled.
> > > >
> > > > I don't think the API gives you a way to check if this is the last
> > > > reference, so it's safest to just make sure the fence is signalled
> > > > before dropping the reference.
> > > >
> > > > It just feels wrong to me to have the possibility of a dangling fence
> > > > which is never going to get signalled; and it's an easy defensive step
> > > > to make sure it can never happen.
> > > >
> > > > I know it _shouldn't_ happen, but we often put in handling for cases
> > > > which shouldn't happen, because they frequently do happen :-)
> > >
> > > We're not as paranoid with the vblank fences either, so not sure why
> > > we need to be this paranoid with writeback fences. If your driver
> > > grabs anything from the atomic state in ->atomic_check it's buggy
> > > anyway.
> > >
> > > If you want to fix this properly I think we need to move the call to
> > > prepare_signaling() in between atomic_check and atomic_commit. Then I
> > > think it makes sense to also force-complete the fence on error ...
> 
> Well, fair enough. I'm struggling with "that's too paranoid" vs "fix
> it properly" though? Is it a "problem" worth fixing or not?

Up to you to decide that.

> It seems natural to me to do the fence cleanup in the cleanup function
> for the object which owns the fence.
> 
> > >
> > > > > > From userspace's point of view the commit either failed or was test-only,
> > > > > > so the writeback fence should not be signaled.
> > > >
> > > > It's not only userspace that can wait on fences (and in fact this
> > > > fence will never even reach userspace if the commit fails), the driver
> > > > may have taken a copy to use for "something".
> > 
> > I forgot to add: you can check this by looking at the fence reference
> > count. A WARN_ON if that's more than 1 on cleanup (but also for the
> > out fences) could be a nice addition.
> 
> Do we really want to be looking at the fence internals directly like
> that?

Wrap it up in a helper like dma_fence_release_private or whatever, which
combines the check and (hopefully final) _put(). Might need a better name.
-Daniel

> 
> Cheers,
> -Brian
> 
> > -Daniel
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Brian Starkey Aug. 5, 2019, 1:10 p.m. UTC | #12
Hi Lowry,

Based on Daniel's input, this patch looks fine:

Reviewed-by: Brian Starkey <brian.starkey@arm.com>

I think there's some opportunity for improvement around
prepare_signaling/complete_signaling, but that can be treated as
separate from fixing this bug.

Thanks,
-Brian

On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> During it signals the completion of a writeback job, after releasing
> the out_fence, we'd clear the pointer.
> 
> Check if fence left over in drm_writeback_cleanup_job(), release it.
> 
> Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> ---
>  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index ff138b6..43d9e3b 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
>  	if (job->fb)
>  		drm_framebuffer_put(job->fb);
>  
> +	if (job->out_fence)
> +		dma_fence_put(job->out_fence);
> +
>  	kfree(job);
>  }
>  EXPORT_SYMBOL(drm_writeback_cleanup_job);
> @@ -366,25 +369,29 @@ static void cleanup_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct drm_writeback_job *job;
> +	struct dma_fence *out_fence;
>  
>  	spin_lock_irqsave(&wb_connector->job_lock, flags);
>  	job = list_first_entry_or_null(&wb_connector->job_queue,
>  				       struct drm_writeback_job,
>  				       list_entry);
> -	if (job) {
> +	if (job)
>  		list_del(&job->list_entry);
> -		if (job->out_fence) {
> -			if (status)
> -				dma_fence_set_error(job->out_fence, status);
> -			dma_fence_signal(job->out_fence);
> -			dma_fence_put(job->out_fence);
> -		}
> -	}
> +
>  	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
>  
>  	if (WARN_ON(!job))
>  		return;
>  
> +	out_fence = job->out_fence;
> +	if (out_fence) {
> +		if (status)
> +			dma_fence_set_error(out_fence, status);
> +		dma_fence_signal(out_fence);
> +		dma_fence_put(out_fence);
> +		job->out_fence = NULL;
> +	}
> +
>  	INIT_WORK(&job->cleanup_work, cleanup_work);
>  	queue_work(system_long_wq, &job->cleanup_work);
>  }
> -- 
> 1.9.1
>
James Qian Wang Sept. 23, 2019, 7:25 a.m. UTC | #13
On Wed, Jul 31, 2019 at 11:04:45AM +0000, Lowry Li (Arm Technology China) wrote:
> During it signals the completion of a writeback job, after releasing
> the out_fence, we'd clear the pointer.
> 
> Check if fence left over in drm_writeback_cleanup_job(), release it.
> 
> Signed-off-by: Lowry Li (Arm Technology China) <lowry.li@arm.com>
> Reviewed-by: Brian Starkey <brian.starkey@arm.com>

Looks good to me.

Reviewed-by: James Qian Wang (Arm Technology China) <james.qian.wang@arm.com>

will push it to drm-misc-fixes

James

> ---
>  drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index ff138b6..43d9e3b 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job)
>  	if (job->fb)
>  		drm_framebuffer_put(job->fb);
>  
> +	if (job->out_fence)
> +		dma_fence_put(job->out_fence);
> +
>  	kfree(job);
>  }
>  EXPORT_SYMBOL(drm_writeback_cleanup_job);
> @@ -366,25 +369,29 @@ static void cleanup_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct drm_writeback_job *job;
> +	struct dma_fence *out_fence;
>  
>  	spin_lock_irqsave(&wb_connector->job_lock, flags);
>  	job = list_first_entry_or_null(&wb_connector->job_queue,
>  				       struct drm_writeback_job,
>  				       list_entry);
> -	if (job) {
> +	if (job)
>  		list_del(&job->list_entry);
> -		if (job->out_fence) {
> -			if (status)
> -				dma_fence_set_error(job->out_fence, status);
> -			dma_fence_signal(job->out_fence);
> -			dma_fence_put(job->out_fence);
> -		}
> -	}
> +
>  	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
>  
>  	if (WARN_ON(!job))
>  		return;
>  
> +	out_fence = job->out_fence;
> +	if (out_fence) {
> +		if (status)
> +			dma_fence_set_error(out_fence, status);
> +		dma_fence_signal(out_fence);
> +		dma_fence_put(out_fence);
> +		job->out_fence = NULL;
> +	}
> +
>  	INIT_WORK(&job->cleanup_work, cleanup_work);
>  	queue_work(system_long_wq, &job->cleanup_work);
>  }
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index ff138b6..43d9e3b 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -324,6 +324,9 @@  void drm_writeback_cleanup_job(struct drm_writeback_job *job)
 	if (job->fb)
 		drm_framebuffer_put(job->fb);
 
+	if (job->out_fence)
+		dma_fence_put(job->out_fence);
+
 	kfree(job);
 }
 EXPORT_SYMBOL(drm_writeback_cleanup_job);
@@ -366,25 +369,29 @@  static void cleanup_work(struct work_struct *work)
 {
 	unsigned long flags;
 	struct drm_writeback_job *job;
+	struct dma_fence *out_fence;
 
 	spin_lock_irqsave(&wb_connector->job_lock, flags);
 	job = list_first_entry_or_null(&wb_connector->job_queue,
 				       struct drm_writeback_job,
 				       list_entry);
-	if (job) {
+	if (job)
 		list_del(&job->list_entry);
-		if (job->out_fence) {
-			if (status)
-				dma_fence_set_error(job->out_fence, status);
-			dma_fence_signal(job->out_fence);
-			dma_fence_put(job->out_fence);
-		}
-	}
+
 	spin_unlock_irqrestore(&wb_connector->job_lock, flags);
 
 	if (WARN_ON(!job))
 		return;
 
+	out_fence = job->out_fence;
+	if (out_fence) {
+		if (status)
+			dma_fence_set_error(out_fence, status);
+		dma_fence_signal(out_fence);
+		dma_fence_put(out_fence);
+		job->out_fence = NULL;
+	}
+
 	INIT_WORK(&job->cleanup_work, cleanup_work);
 	queue_work(system_long_wq, &job->cleanup_work);
 }