diff mbox series

[2/2] drm/panthor: Fix sync-only jobs

Message ID 20240628145536.778349-3-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Headers show
Series drm/panthor: Fix support for sync-only jobs | expand

Commit Message

Boris Brezillon June 28, 2024, 2:55 p.m. UTC
A sync-only job is meant to provide a synchronization point on a
queue, so we can't return a NULL fence there, we have to add a signal
operation to the command stream which executes after all other
previously submitted jobs are done.

Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
 include/uapi/drm/panthor_drm.h          |  5 +++
 2 files changed, 38 insertions(+), 8 deletions(-)

Comments

Liviu Dudau June 28, 2024, 9:34 p.m. UTC | #1
On Fri, Jun 28, 2024 at 04:55:36PM +0200, Boris Brezillon wrote:
> A sync-only job is meant to provide a synchronization point on a
> queue, so we can't return a NULL fence there, we have to add a signal
> operation to the command stream which executes after all other
> previously submitted jobs are done.
> 
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Took me a bit longer to read, let's blame Friday.

> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
>  include/uapi/drm/panthor_drm.h          |  5 +++
>  2 files changed, 38 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 79ffcbc41d78..951ff7e63ea8 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -458,6 +458,16 @@ struct panthor_queue {
>  		/** @seqno: Sequence number of the last initialized fence. */
>  		atomic64_t seqno;
>  
> +		/**
> +		 * @last_fence: Fence of the last submitted job.
> +		 *
> +		 * We return this fence when we get an empty command stream.
> +		 * This way, we are guaranteed that all earlier jobs have completed
> +		 * when drm_sched_job::s_fence::finished without having to feed
> +		 * the CS ring buffer with a dummy job that only signals the fence.
> +		 */
> +		struct dma_fence *last_fence;
> +
>  		/**
>  		 * @in_flight_jobs: List containing all in-flight jobs.
>  		 *
> @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>  	panthor_kernel_bo_destroy(queue->ringbuf);
>  	panthor_kernel_bo_destroy(queue->iface.mem);
>  
> +	/* Release the last_fence we were holding, if any. */
> +	dma_fence_put(queue->fence_ctx.last_fence);
> +
>  	kfree(queue);
>  }
>  
> @@ -2865,11 +2878,14 @@ queue_run_job(struct drm_sched_job *sched_job)
>  	static_assert(sizeof(call_instrs) % 64 == 0,
>  		      "call_instrs is not aligned on a cacheline");
>  
> -	/* Stream size is zero, nothing to do => return a NULL fence and let
> -	 * drm_sched signal the parent.
> +	/* Stream size is zero, nothing to do except making sure all previously
> +	 * submitted jobs are done before we signal the
> +	 * drm_sched_job::s_fence::finished fence.
>  	 */
> -	if (!job->call_info.size)
> -		return NULL;
> +	if (!job->call_info.size) {
> +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> +		return job->done_fence;

What happens if the last job's done_fence was cancelled or timed out? Is the
sync job's done_fence going to be signalled with the same error?

Now that we're returning a fence here, should the job be also added into the
in_flight_jobs?

If you're happy with depending on the previous job's done_fence and not
track the sync job in in_flight_jobs, then you can have my

Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

Best regards,
Liviu

> +	}
>  
>  	ret = pm_runtime_resume_and_get(ptdev->base.dev);
>  	if (drm_WARN_ON(&ptdev->base, ret))
> @@ -2928,6 +2944,10 @@ queue_run_job(struct drm_sched_job *sched_job)
>  		}
>  	}
>  
> +	/* Update the last fence. */
> +	dma_fence_put(queue->fence_ctx.last_fence);
> +	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> +
>  	done_fence = dma_fence_get(job->done_fence);
>  
>  out_unlock:
> @@ -3378,10 +3398,15 @@ panthor_job_create(struct panthor_file *pfile,
>  		goto err_put_job;
>  	}
>  
> -	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> -	if (!job->done_fence) {
> -		ret = -ENOMEM;
> -		goto err_put_job;
> +	/* Empty command streams don't need a fence, they'll pick the one from
> +	 * the previously submitted job.
> +	 */
> +	if (job->call_info.size) {
> +		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> +		if (!job->done_fence) {
> +			ret = -ENOMEM;
> +			goto err_put_job;
> +		}
>  	}
>  
>  	ret = drm_sched_job_init(&job->base,
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index aaed8e12ad0b..926b1deb1116 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -802,6 +802,9 @@ struct drm_panthor_queue_submit {
>  	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
>  	 *
>  	 * Can be zero if stream_addr is zero too.
> +	 *
> +	 * When the stream size is zero, the queue submit serves as a
> +	 * synchronization point.
>  	 */
>  	__u32 stream_size;
>  
> @@ -822,6 +825,8 @@ struct drm_panthor_queue_submit {
>  	 * ensure the GPU doesn't get garbage when reading the indirect command
>  	 * stream buffers. If you want the cache flush to happen
>  	 * unconditionally, pass a zero here.
> +	 *
> +	 * Ignored when stream_size is zero.
>  	 */
>  	__u32 latest_flush;
>  
> -- 
> 2.45.0
>
Boris Brezillon June 29, 2024, 8:52 a.m. UTC | #2
On Fri, 28 Jun 2024 22:34:57 +0100
Liviu Dudau <liviu.dudau@arm.com> wrote:

> On Fri, Jun 28, 2024 at 04:55:36PM +0200, Boris Brezillon wrote:
> > A sync-only job is meant to provide a synchronization point on a
> > queue, so we can't return a NULL fence there, we have to add a signal
> > operation to the command stream which executes after all other
> > previously submitted jobs are done.
> > 
> > Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>  
> 
> Took me a bit longer to read, lets blame Friday.
> 
> > ---
> >  drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
> >  include/uapi/drm/panthor_drm.h          |  5 +++
> >  2 files changed, 38 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index 79ffcbc41d78..951ff7e63ea8 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -458,6 +458,16 @@ struct panthor_queue {
> >  		/** @seqno: Sequence number of the last initialized fence. */
> >  		atomic64_t seqno;
> >  
> > +		/**
> > +		 * @last_fence: Fence of the last submitted job.
> > +		 *
> > +		 * We return this fence when we get an empty command stream.
> > +		 * This way, we are guaranteed that all earlier jobs have completed
> > +		 * when drm_sched_job::s_fence::finished without having to feed
> > +		 * the CS ring buffer with a dummy job that only signals the fence.
> > +		 */
> > +		struct dma_fence *last_fence;
> > +
> >  		/**
> >  		 * @in_flight_jobs: List containing all in-flight jobs.
> >  		 *
> > @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> >  	panthor_kernel_bo_destroy(queue->ringbuf);
> >  	panthor_kernel_bo_destroy(queue->iface.mem);
> >  
> > +	/* Release the last_fence we were holding, if any. */
> > +	dma_fence_put(queue->fence_ctx.last_fence);
> > +
> >  	kfree(queue);
> >  }
> >  
> > @@ -2865,11 +2878,14 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  	static_assert(sizeof(call_instrs) % 64 == 0,
> >  		      "call_instrs is not aligned on a cacheline");
> >  
> > -	/* Stream size is zero, nothing to do => return a NULL fence and let
> > -	 * drm_sched signal the parent.
> > +	/* Stream size is zero, nothing to do except making sure all previously
> > +	 * submitted jobs are done before we signal the
> > +	 * drm_sched_job::s_fence::finished fence.
> >  	 */
> > -	if (!job->call_info.size)
> > -		return NULL;
> > +	if (!job->call_info.size) {
> > +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> > +		return job->done_fence;  
> 
> What happens if the last job's done_fence was cancelled or timed out? Is the
> sync job's done_fence going to be signalled with the same error?

It's the same object, so yes, the job will also be considered faulty
(the error propagated to the job::s_fence::finished fence). I guess
synchronization jobs are not supposed to fail/timeout in theory, because
they don't do anything, but I don't think that's an issue in
practice, because dma_fence errors are never propagated to user-space
(only the queue status is).

> 
> Now that we're returning a fence here, should the job be also added into the
> in_flight_jobs?

Yeah, that's done on purpose, such that we don't end up signalling the
same dma_fence object twice (which is forbidden). This makes me realize
I should probably drop the 'is_cs_empty()' check in
group_sync_upd_work(), since we're not supposed to have a job with an
empty CS in the in_flight_jobs list.

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 951ff7e63ea8..8bf01b7b1596 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2797,9 +2797,6 @@ static void group_sync_upd_work(struct work_struct *work)
 
                spin_lock(&queue->fence_ctx.lock);
                list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
-                       if (!job->call_info.size)
-                               continue;
-
                        if (syncobj->seqno < job->done_fence->seqno)
                                break;


> 
> If you're happy with depending on the previous job's done_fence and not
> track the sync job in in_flight_jobs, then you can have my
> 
> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
> 
> Best regards,
> Liviu
> 
> > +	}
> >  
> >  	ret = pm_runtime_resume_and_get(ptdev->base.dev);
> >  	if (drm_WARN_ON(&ptdev->base, ret))
> > @@ -2928,6 +2944,10 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  		}
> >  	}
> >  
> > +	/* Update the last fence. */
> > +	dma_fence_put(queue->fence_ctx.last_fence);
> > +	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> > +
> >  	done_fence = dma_fence_get(job->done_fence);
> >  
> >  out_unlock:
> > @@ -3378,10 +3398,15 @@ panthor_job_create(struct panthor_file *pfile,
> >  		goto err_put_job;
> >  	}
> >  
> > -	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> > -	if (!job->done_fence) {
> > -		ret = -ENOMEM;
> > -		goto err_put_job;
> > +	/* Empty command streams don't need a fence, they'll pick the one from
> > +	 * the previously submitted job.
> > +	 */
> > +	if (job->call_info.size) {
> > +		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> > +		if (!job->done_fence) {
> > +			ret = -ENOMEM;
> > +			goto err_put_job;
> > +		}
> >  	}
> >  
> >  	ret = drm_sched_job_init(&job->base,
> > diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> > index aaed8e12ad0b..926b1deb1116 100644
> > --- a/include/uapi/drm/panthor_drm.h
> > +++ b/include/uapi/drm/panthor_drm.h
> > @@ -802,6 +802,9 @@ struct drm_panthor_queue_submit {
> >  	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
> >  	 *
> >  	 * Can be zero if stream_addr is zero too.
> > +	 *
> > +	 * When the stream size is zero, the queue submit serves as a
> > +	 * synchronization point.
> >  	 */
> >  	__u32 stream_size;
> >  
> > @@ -822,6 +825,8 @@ struct drm_panthor_queue_submit {
> >  	 * ensure the GPU doesn't get garbage when reading the indirect command
> >  	 * stream buffers. If you want the cache flush to happen
> >  	 * unconditionally, pass a zero here.
> > +	 *
> > +	 * Ignored when stream_size is zero.
> >  	 */
> >  	__u32 latest_flush;
> >  
> > -- 
> > 2.45.0
> >   
>
Liviu Dudau June 29, 2024, 12:10 p.m. UTC | #3
On Sat, Jun 29, 2024 at 10:52:56AM +0200, Boris Brezillon wrote:
> On Fri, 28 Jun 2024 22:34:57 +0100
> Liviu Dudau <liviu.dudau@arm.com> wrote:
> 
> > On Fri, Jun 28, 2024 at 04:55:36PM +0200, Boris Brezillon wrote:
> > > A sync-only job is meant to provide a synchronization point on a
> > > queue, so we can't return a NULL fence there, we have to add a signal
> > > operation to the command stream which executes after all other
> > > previously submitted jobs are done.
> > > 
> > > Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>  
> > 
> > Took me a bit longer to read, lets blame Friday.
> > 
> > > ---
> > >  drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
> > >  include/uapi/drm/panthor_drm.h          |  5 +++
> > >  2 files changed, 38 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > > index 79ffcbc41d78..951ff7e63ea8 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > > @@ -458,6 +458,16 @@ struct panthor_queue {
> > >  		/** @seqno: Sequence number of the last initialized fence. */
> > >  		atomic64_t seqno;
> > >  
> > > +		/**
> > > +		 * @last_fence: Fence of the last submitted job.
> > > +		 *
> > > +		 * We return this fence when we get an empty command stream.
> > > +		 * This way, we are guaranteed that all earlier jobs have completed
> > > +		 * when drm_sched_job::s_fence::finished without having to feed
> > > +		 * the CS ring buffer with a dummy job that only signals the fence.
> > > +		 */
> > > +		struct dma_fence *last_fence;
> > > +
> > >  		/**
> > >  		 * @in_flight_jobs: List containing all in-flight jobs.
> > >  		 *
> > > @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> > >  	panthor_kernel_bo_destroy(queue->ringbuf);
> > >  	panthor_kernel_bo_destroy(queue->iface.mem);
> > >  
> > > +	/* Release the last_fence we were holding, if any. */
> > > +	dma_fence_put(queue->fence_ctx.last_fence);
> > > +
> > >  	kfree(queue);
> > >  }
> > >  
> > > @@ -2865,11 +2878,14 @@ queue_run_job(struct drm_sched_job *sched_job)
> > >  	static_assert(sizeof(call_instrs) % 64 == 0,
> > >  		      "call_instrs is not aligned on a cacheline");
> > >  
> > > -	/* Stream size is zero, nothing to do => return a NULL fence and let
> > > -	 * drm_sched signal the parent.
> > > +	/* Stream size is zero, nothing to do except making sure all previously
> > > +	 * submitted jobs are done before we signal the
> > > +	 * drm_sched_job::s_fence::finished fence.
> > >  	 */
> > > -	if (!job->call_info.size)
> > > -		return NULL;
> > > +	if (!job->call_info.size) {
> > > +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> > > +		return job->done_fence;  
> > 
> > What happens if the last job's done_fence was cancelled or timed out? Is the
> > sync job's done_fence going to be signalled with the same error?
> 
> It's the same object, so yes, the job will also be considered faulty
> (the error propagated to the job::s_fence::finished fence). I guess
> synchronization jobs are not supposed to fail/timeout in theory, because
> they don't do anything, but I don't think that's an issue in
> practice, because dma_fence errors are never propagated to user-space
> (only the queue status is).
> 
> > 
> > Now that we're returning a fence here, should the job be also added into the
> > in_flight_jobs?
> 
> Yeah, that's done on purpose, such that we don't end up signalling the
> same dma_fence object twice (which is forbidden).

That's the thing I was trying to figure out on Friday: how do we stop the fence
returned as last_fence for the sync job from being signalled after the job's
done_fence has also been signalled. I can't say that I found a good answer, if
you can point me to what I've missed it will be appreciated.

> This makes me realize
> I should probably drop the 'is_cs_empty()' check in
> group_sync_upd_work(), since we're not supposed to have a job with an
> empty CS in the in_flight_jobs list.
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 951ff7e63ea8..8bf01b7b1596 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2797,9 +2797,6 @@ static void group_sync_upd_work(struct work_struct *work)
>  
>                 spin_lock(&queue->fence_ctx.lock);
>                 list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> -                       if (!job->call_info.size)
> -                               continue;
> -
>                         if (syncobj->seqno < job->done_fence->seqno)
>                                 break;

Looks good to me.

Best regards,
Liviu

> 
> 
> > 
> > If you're happy with depending on the previous job's done_fence and not
> > track the sync job in in_flight_jobs, then you can have my
> > 
> > Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
> > 
> > Best regards,
> > Liviu
> > 
> > > +	}
> > >  
> > >  	ret = pm_runtime_resume_and_get(ptdev->base.dev);
> > >  	if (drm_WARN_ON(&ptdev->base, ret))
> > > @@ -2928,6 +2944,10 @@ queue_run_job(struct drm_sched_job *sched_job)
> > >  		}
> > >  	}
> > >  
> > > +	/* Update the last fence. */
> > > +	dma_fence_put(queue->fence_ctx.last_fence);
> > > +	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> > > +
> > >  	done_fence = dma_fence_get(job->done_fence);
> > >  
> > >  out_unlock:
> > > @@ -3378,10 +3398,15 @@ panthor_job_create(struct panthor_file *pfile,
> > >  		goto err_put_job;
> > >  	}
> > >  
> > > -	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> > > -	if (!job->done_fence) {
> > > -		ret = -ENOMEM;
> > > -		goto err_put_job;
> > > +	/* Empty command streams don't need a fence, they'll pick the one from
> > > +	 * the previously submitted job.
> > > +	 */
> > > +	if (job->call_info.size) {
> > > +		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> > > +		if (!job->done_fence) {
> > > +			ret = -ENOMEM;
> > > +			goto err_put_job;
> > > +		}
> > >  	}
> > >  
> > >  	ret = drm_sched_job_init(&job->base,
> > > diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> > > index aaed8e12ad0b..926b1deb1116 100644
> > > --- a/include/uapi/drm/panthor_drm.h
> > > +++ b/include/uapi/drm/panthor_drm.h
> > > @@ -802,6 +802,9 @@ struct drm_panthor_queue_submit {
> > >  	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
> > >  	 *
> > >  	 * Can be zero if stream_addr is zero too.
> > > +	 *
> > > +	 * When the stream size is zero, the queue submit serves as a
> > > +	 * synchronization point.
> > >  	 */
> > >  	__u32 stream_size;
> > >  
> > > @@ -822,6 +825,8 @@ struct drm_panthor_queue_submit {
> > >  	 * ensure the GPU doesn't get garbage when reading the indirect command
> > >  	 * stream buffers. If you want the cache flush to happen
> > >  	 * unconditionally, pass a zero here.
> > > +	 *
> > > +	 * Ignored when stream_size is zero.
> > >  	 */
> > >  	__u32 latest_flush;
> > >  
> > > -- 
> > > 2.45.0
> > >   
> > 
>
Boris Brezillon July 1, 2024, 6:59 a.m. UTC | #4
On Sat, 29 Jun 2024 13:10:35 +0100
Liviu Dudau <liviu.dudau@arm.com> wrote:

> On Sat, Jun 29, 2024 at 10:52:56AM +0200, Boris Brezillon wrote:
> > On Fri, 28 Jun 2024 22:34:57 +0100
> > Liviu Dudau <liviu.dudau@arm.com> wrote:
> >   
> > > On Fri, Jun 28, 2024 at 04:55:36PM +0200, Boris Brezillon wrote:  
> > > > A sync-only job is meant to provide a synchronization point on a
> > > > queue, so we can't return a NULL fence there, we have to add a signal
> > > > operation to the command stream which executes after all other
> > > > previously submitted jobs are done.
> > > > 
> > > > Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>    
> > > 
> > > Took me a bit longer to read, lets blame Friday.
> > >   
> > > > ---
> > > >  drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
> > > >  include/uapi/drm/panthor_drm.h          |  5 +++
> > > >  2 files changed, 38 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > > > index 79ffcbc41d78..951ff7e63ea8 100644
> > > > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > > > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > > > @@ -458,6 +458,16 @@ struct panthor_queue {
> > > >  		/** @seqno: Sequence number of the last initialized fence. */
> > > >  		atomic64_t seqno;
> > > >  
> > > > +		/**
> > > > +		 * @last_fence: Fence of the last submitted job.
> > > > +		 *
> > > > +		 * We return this fence when we get an empty command stream.
> > > > +		 * This way, we are guaranteed that all earlier jobs have completed
> > > > +		 * when drm_sched_job::s_fence::finished without having to feed
> > > > +		 * the CS ring buffer with a dummy job that only signals the fence.
> > > > +		 */
> > > > +		struct dma_fence *last_fence;
> > > > +
> > > >  		/**
> > > >  		 * @in_flight_jobs: List containing all in-flight jobs.
> > > >  		 *
> > > > @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> > > >  	panthor_kernel_bo_destroy(queue->ringbuf);
> > > >  	panthor_kernel_bo_destroy(queue->iface.mem);
> > > >  
> > > > +	/* Release the last_fence we were holding, if any. */
> > > > +	dma_fence_put(queue->fence_ctx.last_fence);
> > > > +
> > > >  	kfree(queue);
> > > >  }
> > > >  
> > > > @@ -2865,11 +2878,14 @@ queue_run_job(struct drm_sched_job *sched_job)
> > > >  	static_assert(sizeof(call_instrs) % 64 == 0,
> > > >  		      "call_instrs is not aligned on a cacheline");
> > > >  
> > > > -	/* Stream size is zero, nothing to do => return a NULL fence and let
> > > > -	 * drm_sched signal the parent.
> > > > +	/* Stream size is zero, nothing to do except making sure all previously
> > > > +	 * submitted jobs are done before we signal the
> > > > +	 * drm_sched_job::s_fence::finished fence.
> > > >  	 */
> > > > -	if (!job->call_info.size)
> > > > -		return NULL;
> > > > +	if (!job->call_info.size) {
> > > > +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> > > > +		return job->done_fence;    
> > > 
> > > What happens if the last job's done_fence was cancelled or timed out? Is the
> > > sync job's done_fence going to be signalled with the same error?  
> > 
> > It's the same object, so yes, the job will also be considered faulty
> > (the error propagated to the job::s_fence::finished fence). I guess
> > synchronization jobs are not supposed to fail/timeout in theory, because
> > they don't do anything, but I don't think that's an issue in
> > practice, because dma_fence errors are never propagated to user-space
> > (only the queue status is).
> >   
> > > 
> > > Now that we're returning a fence here, should the job be also added into the
> > > in_flight_jobs?  
> > 
> > Yeah, that's done on purpose, such that we don't end up signalling the
> > same dma_fence object twice (which is forbidden).  
> 
> That's the thing I was trying to figure out on Friday: how do we stop the fence
> returned as last_fence for the sync job to be signalled after the job's done_fence
> has also been signalled. I can't say that I found a good answer, if you can point
> me to what I've missed it will be appreciated.

Well, there's only one job that's added to the in_flight list for a
given done_fence object, so there's no risk of signaling such a fence
more than once (even in case of a timeout). Now, let's say the
previous 'real' job is done when this 'dummy/sync' job is queued; the
queue::fence_ctx::last_fence object will be signalled too, since this
is essentially the same object, and when we return a signalled fence to
the drm_sched layer in our ::run_job() callback, it just signals the
drm_sched_job::s_fence::finished fence immediately, just like when
you return a NULL fence.
Steven Price July 1, 2024, 9:03 a.m. UTC | #5
On 28/06/2024 15:55, Boris Brezillon wrote:
> A sync-only job is meant to provide a synchronization point on a
> queue, so we can't return a NULL fence there, we have to add a signal
> operation to the command stream which executes after all other
> previously submitted jobs are done.
> 
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Looks correct to me.

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 41 ++++++++++++++++++++-----
>  include/uapi/drm/panthor_drm.h          |  5 +++
>  2 files changed, 38 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 79ffcbc41d78..951ff7e63ea8 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -458,6 +458,16 @@ struct panthor_queue {
>  		/** @seqno: Sequence number of the last initialized fence. */
>  		atomic64_t seqno;
>  
> +		/**
> +		 * @last_fence: Fence of the last submitted job.
> +		 *
> +		 * We return this fence when we get an empty command stream.
> +		 * This way, we are guaranteed that all earlier jobs have completed
> +		 * when drm_sched_job::s_fence::finished without having to feed
> +		 * the CS ring buffer with a dummy job that only signals the fence.
> +		 */
> +		struct dma_fence *last_fence;
> +
>  		/**
>  		 * @in_flight_jobs: List containing all in-flight jobs.
>  		 *
> @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>  	panthor_kernel_bo_destroy(queue->ringbuf);
>  	panthor_kernel_bo_destroy(queue->iface.mem);
>  
> +	/* Release the last_fence we were holding, if any. */
> +	dma_fence_put(queue->fence_ctx.last_fence);
> +
>  	kfree(queue);
>  }
>  
> @@ -2865,11 +2878,14 @@ queue_run_job(struct drm_sched_job *sched_job)
>  	static_assert(sizeof(call_instrs) % 64 == 0,
>  		      "call_instrs is not aligned on a cacheline");
>  
> -	/* Stream size is zero, nothing to do => return a NULL fence and let
> -	 * drm_sched signal the parent.
> +	/* Stream size is zero, nothing to do except making sure all previously
> +	 * submitted jobs are done before we signal the
> +	 * drm_sched_job::s_fence::finished fence.
>  	 */
> -	if (!job->call_info.size)
> -		return NULL;
> +	if (!job->call_info.size) {
> +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> +		return job->done_fence;
> +	}
>  
>  	ret = pm_runtime_resume_and_get(ptdev->base.dev);
>  	if (drm_WARN_ON(&ptdev->base, ret))
> @@ -2928,6 +2944,10 @@ queue_run_job(struct drm_sched_job *sched_job)
>  		}
>  	}
>  
> +	/* Update the last fence. */
> +	dma_fence_put(queue->fence_ctx.last_fence);
> +	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> +
>  	done_fence = dma_fence_get(job->done_fence);
>  
>  out_unlock:
> @@ -3378,10 +3398,15 @@ panthor_job_create(struct panthor_file *pfile,
>  		goto err_put_job;
>  	}
>  
> -	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> -	if (!job->done_fence) {
> -		ret = -ENOMEM;
> -		goto err_put_job;
> +	/* Empty command streams don't need a fence, they'll pick the one from
> +	 * the previously submitted job.
> +	 */
> +	if (job->call_info.size) {
> +		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
> +		if (!job->done_fence) {
> +			ret = -ENOMEM;
> +			goto err_put_job;
> +		}
>  	}
>  
>  	ret = drm_sched_job_init(&job->base,
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index aaed8e12ad0b..926b1deb1116 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -802,6 +802,9 @@ struct drm_panthor_queue_submit {
>  	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
>  	 *
>  	 * Can be zero if stream_addr is zero too.
> +	 *
> +	 * When the stream size is zero, the queue submit serves as a
> +	 * synchronization point.
>  	 */
>  	__u32 stream_size;
>  
> @@ -822,6 +825,8 @@ struct drm_panthor_queue_submit {
>  	 * ensure the GPU doesn't get garbage when reading the indirect command
>  	 * stream buffers. If you want the cache flush to happen
>  	 * unconditionally, pass a zero here.
> +	 *
> +	 * Ignored when stream_size is zero.
>  	 */
>  	__u32 latest_flush;
>
kernel test robot July 1, 2024, 11:02 a.m. UTC | #6
Hi Boris,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm/drm-next]
[also build test WARNING on drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip linus/master v6.10-rc6 next-20240628]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Boris-Brezillon/drm-panthor-Don-t-check-the-array-stride-on-empty-uobj-arrays/20240630-032128
base:   git://anongit.freedesktop.org/drm/drm drm-next
patch link:    https://lore.kernel.org/r/20240628145536.778349-3-boris.brezillon%40collabora.com
patch subject: [PATCH 2/2] drm/panthor: Fix sync-only jobs
config: arm-randconfig-003-20240701
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 326ba38a991250a8587a399a260b0f7af2c9166a)
reproduce (this is a W=1 build):

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407011848.1VqjPD9w-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'runnable' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'idle' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'waiting' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'has_ref' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'in_progress' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:319: warning: Excess struct member 'stopped_groups' description in 'panthor_scheduler'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'mem' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'input' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'output' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'input_fw_va' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'output_fw_va' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'gpu_va' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'ref' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'gt' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'sync64' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'bo' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'offset' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'kmap' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'lock' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'id' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'seqno' description in 'panthor_queue'
>> drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'last_fence' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:479: warning: Excess struct member 'in_flight_jobs' description in 'panthor_queue'
   drivers/gpu/drm/panthor/panthor_sched.c:777: warning: Excess struct member 'start' description in 'panthor_job'
   drivers/gpu/drm/panthor/panthor_sched.c:777: warning: Excess struct member 'size' description in 'panthor_job'
   drivers/gpu/drm/panthor/panthor_sched.c:777: warning: Excess struct member 'latest_flush' description in 'panthor_job'
   drivers/gpu/drm/panthor/panthor_sched.c:777: warning: Excess struct member 'start' description in 'panthor_job'
   drivers/gpu/drm/panthor/panthor_sched.c:777: warning: Excess struct member 'end' description in 'panthor_job'
   drivers/gpu/drm/panthor/panthor_sched.c:1697: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_fw_events'
   drivers/gpu/drm/panthor/panthor_sched.c:1697: warning: Function parameter or struct member 'events' not described in 'panthor_sched_report_fw_events'
   drivers/gpu/drm/panthor/panthor_sched.c:2591: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_mmu_fault'


vim +479 drivers/gpu/drm/panthor/panthor_sched.c

de85488138247d Boris Brezillon 2024-02-29  394  
de85488138247d Boris Brezillon 2024-02-29  395  	/** @ringbuf: Command stream ring-buffer. */
de85488138247d Boris Brezillon 2024-02-29  396  	struct panthor_kernel_bo *ringbuf;
de85488138247d Boris Brezillon 2024-02-29  397  
de85488138247d Boris Brezillon 2024-02-29  398  	/** @iface: Firmware interface. */
de85488138247d Boris Brezillon 2024-02-29  399  	struct {
de85488138247d Boris Brezillon 2024-02-29  400  		/** @mem: FW memory allocated for this interface. */
de85488138247d Boris Brezillon 2024-02-29  401  		struct panthor_kernel_bo *mem;
de85488138247d Boris Brezillon 2024-02-29  402  
de85488138247d Boris Brezillon 2024-02-29  403  		/** @input: Input interface. */
de85488138247d Boris Brezillon 2024-02-29  404  		struct panthor_fw_ringbuf_input_iface *input;
de85488138247d Boris Brezillon 2024-02-29  405  
de85488138247d Boris Brezillon 2024-02-29  406  		/** @output: Output interface. */
de85488138247d Boris Brezillon 2024-02-29  407  		const struct panthor_fw_ringbuf_output_iface *output;
de85488138247d Boris Brezillon 2024-02-29  408  
de85488138247d Boris Brezillon 2024-02-29  409  		/** @input_fw_va: FW virtual address of the input interface buffer. */
de85488138247d Boris Brezillon 2024-02-29  410  		u32 input_fw_va;
de85488138247d Boris Brezillon 2024-02-29  411  
de85488138247d Boris Brezillon 2024-02-29  412  		/** @output_fw_va: FW virtual address of the output interface buffer. */
de85488138247d Boris Brezillon 2024-02-29  413  		u32 output_fw_va;
de85488138247d Boris Brezillon 2024-02-29  414  	} iface;
de85488138247d Boris Brezillon 2024-02-29  415  
de85488138247d Boris Brezillon 2024-02-29  416  	/**
de85488138247d Boris Brezillon 2024-02-29  417  	 * @syncwait: Stores information about the synchronization object this
de85488138247d Boris Brezillon 2024-02-29  418  	 * queue is waiting on.
de85488138247d Boris Brezillon 2024-02-29  419  	 */
de85488138247d Boris Brezillon 2024-02-29  420  	struct {
de85488138247d Boris Brezillon 2024-02-29  421  		/** @gpu_va: GPU address of the synchronization object. */
de85488138247d Boris Brezillon 2024-02-29  422  		u64 gpu_va;
de85488138247d Boris Brezillon 2024-02-29  423  
de85488138247d Boris Brezillon 2024-02-29  424  		/** @ref: Reference value to compare against. */
de85488138247d Boris Brezillon 2024-02-29  425  		u64 ref;
de85488138247d Boris Brezillon 2024-02-29  426  
de85488138247d Boris Brezillon 2024-02-29  427  		/** @gt: True if this is a greater-than test. */
de85488138247d Boris Brezillon 2024-02-29  428  		bool gt;
de85488138247d Boris Brezillon 2024-02-29  429  
de85488138247d Boris Brezillon 2024-02-29  430  		/** @sync64: True if this is a 64-bit sync object. */
de85488138247d Boris Brezillon 2024-02-29  431  		bool sync64;
de85488138247d Boris Brezillon 2024-02-29  432  
de85488138247d Boris Brezillon 2024-02-29  433  		/** @bo: Buffer object holding the synchronization object. */
de85488138247d Boris Brezillon 2024-02-29  434  		struct drm_gem_object *obj;
de85488138247d Boris Brezillon 2024-02-29  435  
de85488138247d Boris Brezillon 2024-02-29  436  		/** @offset: Offset of the synchronization object inside @bo. */
de85488138247d Boris Brezillon 2024-02-29  437  		u64 offset;
de85488138247d Boris Brezillon 2024-02-29  438  
de85488138247d Boris Brezillon 2024-02-29  439  		/**
de85488138247d Boris Brezillon 2024-02-29  440  		 * @kmap: Kernel mapping of the buffer object holding the
de85488138247d Boris Brezillon 2024-02-29  441  		 * synchronization object.
de85488138247d Boris Brezillon 2024-02-29  442  		 */
de85488138247d Boris Brezillon 2024-02-29  443  		void *kmap;
de85488138247d Boris Brezillon 2024-02-29  444  	} syncwait;
de85488138247d Boris Brezillon 2024-02-29  445  
de85488138247d Boris Brezillon 2024-02-29  446  	/** @fence_ctx: Fence context fields. */
de85488138247d Boris Brezillon 2024-02-29  447  	struct {
de85488138247d Boris Brezillon 2024-02-29  448  		/** @lock: Used to protect access to all fences allocated by this context. */
de85488138247d Boris Brezillon 2024-02-29  449  		spinlock_t lock;
de85488138247d Boris Brezillon 2024-02-29  450  
de85488138247d Boris Brezillon 2024-02-29  451  		/**
de85488138247d Boris Brezillon 2024-02-29  452  		 * @id: Fence context ID.
de85488138247d Boris Brezillon 2024-02-29  453  		 *
de85488138247d Boris Brezillon 2024-02-29  454  		 * Allocated with dma_fence_context_alloc().
de85488138247d Boris Brezillon 2024-02-29  455  		 */
de85488138247d Boris Brezillon 2024-02-29  456  		u64 id;
de85488138247d Boris Brezillon 2024-02-29  457  
de85488138247d Boris Brezillon 2024-02-29  458  		/** @seqno: Sequence number of the last initialized fence. */
de85488138247d Boris Brezillon 2024-02-29  459  		atomic64_t seqno;
de85488138247d Boris Brezillon 2024-02-29  460  
966c853e5c43b4 Boris Brezillon 2024-06-28  461  		/**
966c853e5c43b4 Boris Brezillon 2024-06-28  462  		 * @last_fence: Fence of the last submitted job.
966c853e5c43b4 Boris Brezillon 2024-06-28  463  		 *
966c853e5c43b4 Boris Brezillon 2024-06-28  464  		 * We return this fence when we get an empty command stream.
966c853e5c43b4 Boris Brezillon 2024-06-28  465  		 * This way, we are guaranteed that all earlier jobs have completed
966c853e5c43b4 Boris Brezillon 2024-06-28  466  		 * when drm_sched_job::s_fence::finished is signaled, without having to
966c853e5c43b4 Boris Brezillon 2024-06-28  467  		 * feed the CS ring buffer with a dummy job that only signals the fence.
966c853e5c43b4 Boris Brezillon 2024-06-28  468  		 */
966c853e5c43b4 Boris Brezillon 2024-06-28  469  		struct dma_fence *last_fence;
966c853e5c43b4 Boris Brezillon 2024-06-28  470  
de85488138247d Boris Brezillon 2024-02-29  471  		/**
de85488138247d Boris Brezillon 2024-02-29  472  		 * @in_flight_jobs: List containing all in-flight jobs.
de85488138247d Boris Brezillon 2024-02-29  473  		 *
de85488138247d Boris Brezillon 2024-02-29  474  		 * Used to keep track and signal panthor_job::done_fence when the
de85488138247d Boris Brezillon 2024-02-29  475  		 * synchronization object attached to the queue is signaled.
de85488138247d Boris Brezillon 2024-02-29  476  		 */
de85488138247d Boris Brezillon 2024-02-29  477  		struct list_head in_flight_jobs;
de85488138247d Boris Brezillon 2024-02-29  478  	} fence_ctx;
de85488138247d Boris Brezillon 2024-02-29 @479  };
de85488138247d Boris Brezillon 2024-02-29  480
Boris Brezillon July 2, 2024, 3:13 p.m. UTC | #7
On Fri, 28 Jun 2024 16:55:36 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> -	/* Stream size is zero, nothing to do => return a NULL fence and let
> -	 * drm_sched signal the parent.
> +	/* Stream size is zero, nothing to do except making sure all previously
> +	 * submitted jobs are done before we signal the
> +	 * drm_sched_job::s_fence::finished fence.
>  	 */
> -	if (!job->call_info.size)
> -		return NULL;
> +	if (!job->call_info.size) {
> +		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> +		return job->done_fence;

Should be

		return dma_fence_get(job->done_fence);

to keep the get/put balanced.

> +	}
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 79ffcbc41d78..951ff7e63ea8 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -458,6 +458,16 @@  struct panthor_queue {
 		/** @seqno: Sequence number of the last initialized fence. */
 		atomic64_t seqno;
 
+		/**
+		 * @last_fence: Fence of the last submitted job.
+		 *
+		 * We return this fence when we get an empty command stream.
+		 * This way, we are guaranteed that all earlier jobs have completed
+		 * when drm_sched_job::s_fence::finished is signaled, without having to
+		 * feed the CS ring buffer with a dummy job that only signals the fence.
+		 */
+		struct dma_fence *last_fence;
+
 		/**
 		 * @in_flight_jobs: List containing all in-flight jobs.
 		 *
@@ -829,6 +839,9 @@  static void group_free_queue(struct panthor_group *group, struct panthor_queue *
 	panthor_kernel_bo_destroy(queue->ringbuf);
 	panthor_kernel_bo_destroy(queue->iface.mem);
 
+	/* Release the last_fence we were holding, if any. */
+	dma_fence_put(queue->fence_ctx.last_fence);
+
 	kfree(queue);
 }
 
@@ -2865,11 +2878,14 @@  queue_run_job(struct drm_sched_job *sched_job)
 	static_assert(sizeof(call_instrs) % 64 == 0,
 		      "call_instrs is not aligned on a cacheline");
 
-	/* Stream size is zero, nothing to do => return a NULL fence and let
-	 * drm_sched signal the parent.
+	/* Stream size is zero, nothing to do except making sure all previously
+	 * submitted jobs are done before we signal the
+	 * drm_sched_job::s_fence::finished fence.
 	 */
-	if (!job->call_info.size)
-		return NULL;
+	if (!job->call_info.size) {
+		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
+		return job->done_fence;
+	}
 
 	ret = pm_runtime_resume_and_get(ptdev->base.dev);
 	if (drm_WARN_ON(&ptdev->base, ret))
@@ -2928,6 +2944,10 @@  queue_run_job(struct drm_sched_job *sched_job)
 		}
 	}
 
+	/* Update the last fence. */
+	dma_fence_put(queue->fence_ctx.last_fence);
+	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
+
 	done_fence = dma_fence_get(job->done_fence);
 
 out_unlock:
@@ -3378,10 +3398,15 @@  panthor_job_create(struct panthor_file *pfile,
 		goto err_put_job;
 	}
 
-	job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
-	if (!job->done_fence) {
-		ret = -ENOMEM;
-		goto err_put_job;
+	/* Empty command streams don't need a fence, they'll pick the one from
+	 * the previously submitted job.
+	 */
+	if (job->call_info.size) {
+		job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL);
+		if (!job->done_fence) {
+			ret = -ENOMEM;
+			goto err_put_job;
+		}
 	}
 
 	ret = drm_sched_job_init(&job->base,
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index aaed8e12ad0b..926b1deb1116 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -802,6 +802,9 @@  struct drm_panthor_queue_submit {
 	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
 	 *
 	 * Can be zero if stream_addr is zero too.
+	 *
+	 * When the stream size is zero, the queue submit serves as a
+	 * synchronization point.
 	 */
 	__u32 stream_size;
 
@@ -822,6 +825,8 @@  struct drm_panthor_queue_submit {
 	 * ensure the GPU doesn't get garbage when reading the indirect command
 	 * stream buffers. If you want the cache flush to happen
 	 * unconditionally, pass a zero here.
+	 *
+	 * Ignored when stream_size is zero.
 	 */
 	__u32 latest_flush;