
[v6,5/7] drm/sched: Split free_job into own work item

Message ID 20231017150958.838613-6-matthew.brost@intel.com (mailing list archive)
State New, archived
Series DRM scheduler changes for Xe

Commit Message

Matthew Brost Oct. 17, 2023, 3:09 p.m. UTC
Rather than calling free_job and run_job in the same work item, have a
dedicated work item for each. This aligns with the design and intended use
of work queues.

v2:
   - Test for DMA_FENCE_FLAG_TIMESTAMP_BIT before setting
     timestamp in free_job() work item (Danilo)
v3:
  - Drop forward dec of drm_sched_select_entity (Boris)
  - Return in drm_sched_run_job_work if entity NULL (Boris)
v4:
  - Replace dequeue with peek and invert logic (Luben)
  - Wrap to 100 lines (Luben)
  - Update comments for *_queue / *_queue_if_ready functions (Luben)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 287 +++++++++++++++----------
 include/drm/gpu_scheduler.h            |   8 +-
 2 files changed, 178 insertions(+), 117 deletions(-)

Comments

Luben Tuikov Oct. 19, 2023, 1:25 a.m. UTC | #1
Hi,

On 2023-10-17 11:09, Matthew Brost wrote:
> Rather than call free_job and run_job in same work item have a dedicated
> work item for each. This aligns with the design and intended use of work
> queues.
> 
> v2:
>    - Test for DMA_FENCE_FLAG_TIMESTAMP_BIT before setting
>      timestamp in free_job() work item (Danilo)
> v3:
>   - Drop forward dec of drm_sched_select_entity (Boris)
>   - Return in drm_sched_run_job_work if entity NULL (Boris)
> v4:
>   - Replace dequeue with peek and invert logic (Luben)
>   - Wrap to 100 lines (Luben)
>   - Update comments for *_queue / *_queue_if_ready functions (Luben)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 287 +++++++++++++++----------
>  include/drm/gpu_scheduler.h            |   8 +-
>  2 files changed, 178 insertions(+), 117 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 273e0fbc4eab..b1b8d9f96da5 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>   *
>   * @rq: scheduler run queue to check.
> + * @peek: Just find, don't set to current.

The "peek" rename is good--thanks!

>   *
>   * Try to find a ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool peek)
>  {
>  	struct drm_sched_entity *entity;
>  
> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>  	if (entity) {
>  		list_for_each_entry_continue(entity, &rq->entities, list) {
>  			if (drm_sched_entity_is_ready(entity)) {
> -				rq->current_entity = entity;
> -				reinit_completion(&entity->entity_idle);
> +				if (!peek) {
> +					rq->current_entity = entity;
> +					reinit_completion(&entity->entity_idle);
> +				}
>  				spin_unlock(&rq->lock);
>  				return entity;
>  			}
> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>  	list_for_each_entry(entity, &rq->entities, list) {
>  
>  		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (!peek) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>  			spin_unlock(&rq->lock);
>  			return entity;
>  		}
> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>   *
>   * @rq: scheduler run queue to check.
> + * @peek: Just find, don't set to current.
>   *
>   * Find oldest waiting ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool peek)
>  {
>  	struct rb_node *rb;
>  
> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>  
>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>  		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (!peek) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>  			break;
>  		}
>  	}
> @@ -282,13 +290,98 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>  }
>  
>  /**
> - * drm_sched_wqueue_enqueue - enqueue scheduler submission
> + * drm_sched_run_job_queue - enqueue run-job work

Hmm... so this removes the change from patch 1 in this series, and as such
obviates patch 1.

Do you want to go with "run_job_queue" and the names introduced here?

In this case, we can have that in patch 1 instead, and this patch
would only split the "free job" into its own work item.

> + * @sched: scheduler instance
> + */
> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> +{
> +	if (!READ_ONCE(sched->pause_submit))
> +		queue_work(sched->submit_wq, &sched->work_run_job);
> +}
> +
> +/**
> + * drm_sched_can_queue -- Can we queue more to the hardware?
> + * @sched: scheduler instance
> + *
> + * Return true if we can push more jobs to the hw, otherwise false.
> + */
> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> +{
> +	return atomic_read(&sched->hw_rq_count) <
> +		sched->hw_submission_limit;
> +}
> +
> +/**
> + * drm_sched_select_entity - Select next entity to process
> + *
> + * @sched: scheduler instance
> + * @peek: Just find, don't set to current.
> + *
> + * Returns the entity to process or NULL if none are found.
> + */
> +static struct drm_sched_entity *
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool peek)
> +{
> +	struct drm_sched_entity *entity;
> +	int i;
> +
> +	if (!drm_sched_can_queue(sched))
> +		return NULL;
> +
> +	if (sched->single_entity) {
> +		if (!READ_ONCE(sched->single_entity->stopped) &&
> +		    drm_sched_entity_is_ready(sched->single_entity))
> +			return sched->single_entity;
> +
> +		return NULL;
> +	}
> +
> +	/* Kernel run queue has higher priority than normal run queue*/
> +	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> +		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i], peek) :
> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i], peek);
> +		if (entity)
> +			break;
> +	}
> +
> +	return entity;
> +}

Can you shed some light on why you need the "peek" capability, i.e. to just
see the entity without actually arming it?

In a single-entity scheduling, surely peeking at the single entity of
a scheduler, can be done easier.

I'm asking as none of these functions were intended as a peek-only/-capable
solution, and if you need it, this shows that the infrastructure lacks
something which you need.

It's not designed for "peek", as you can see above: it gets passed-through
the function stack until it reaches core functionality to be used in logic.

(It just makes the code too complicated with too many special cases, "if's"
and if we keep doing this, it would become very hard to understand,
maintain, and contribute to in a few years.)
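
For comparison -- purely as an untested illustration of what a dedicated
check could look like, not something in this patch -- an "is there runnable
work?" query that never arms an entity could be a separate predicate instead
of a flag threaded through entity selection. drm_sched_has_runnable_job() is
a made-up name; everything it touches is already visible in this patch:

static bool drm_sched_has_runnable_job(struct drm_gpu_scheduler *sched)
{
        int i;

        /* No point looking if the hardware queue is already full. */
        if (!drm_sched_can_queue(sched))
                return false;

        if (sched->single_entity)
                return !READ_ONCE(sched->single_entity->stopped) &&
                       drm_sched_entity_is_ready(sched->single_entity);

        /* Kernel run queue has higher priority than normal run queue. */
        for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
                struct drm_sched_rq *rq = &sched->sched_rq[i];
                struct drm_sched_entity *entity;
                bool found = false;

                /* Readiness check only; rq->current_entity and the
                 * entity_idle completion are left untouched. */
                spin_lock(&rq->lock);
                list_for_each_entry(entity, &rq->entities, list) {
                        if (drm_sched_entity_is_ready(entity)) {
                                found = true;
                                break;
                        }
                }
                spin_unlock(&rq->lock);

                if (found)
                        return true;
        }

        return false;
}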

> +
> +/**
> + * drm_sched_run_job_queue_if_ready - enqueue run-job work if ready
> + * @sched: scheduler instance
> + */

"if ready" here makes perfect sense, but in the "free job" case,
it should be "if done". See below.

> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	if (drm_sched_select_entity(sched, false))
> +		drm_sched_run_job_queue(sched);
> +}
> +
> +/**
> + * drm_sched_free_job_queue - enqueue free-job work
>   * @sched: scheduler instance
>   */
> -static void drm_sched_wqueue_enqueue(struct drm_gpu_scheduler *sched)
> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>  {
>  	if (!READ_ONCE(sched->pause_submit))
> -		queue_work(sched->submit_wq, &sched->work_submit);
> +		queue_work(sched->submit_wq, &sched->work_free_job);
> +}
> +
> +/**
> + * drm_sched_free_job_queue_if_ready - enqueue free-job work if ready

This should be "if done". Please change this to "if done".

> + * @sched: scheduler instance
> + */
> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)

This should be "if_done". Please change this to "if_done".

A "job" is _done_ when it's on the pending list and its fence has been signalled,
and we free a job only when it's done, not "ready".

> +{
> +	struct drm_sched_job *job;
> +
> +	spin_lock(&sched->job_list_lock);
> +	job = list_first_entry_or_null(&sched->pending_list,
> +				       struct drm_sched_job, list);
> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> +		drm_sched_free_job_queue(sched);
> +	spin_unlock(&sched->job_list_lock);
>  }
>  
>  /**
> @@ -310,7 +403,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>  	dma_fence_get(&s_fence->finished);
>  	drm_sched_fence_finished(s_fence, result);
>  	dma_fence_put(&s_fence->finished);
> -	drm_sched_wqueue_enqueue(sched);
> +	drm_sched_free_job_queue(sched);
>  }
>  
>  /**
> @@ -576,7 +669,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>  				drm_sched_job_done(s_job, fence->error);
>  			else if (r)
>  				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> -					  r);
> +					      r);
>  		} else
>  			drm_sched_job_done(s_job, -ECANCELED);
>  	}
> @@ -885,18 +978,6 @@ void drm_sched_job_cleanup(struct drm_sched_job *job)
>  }
>  EXPORT_SYMBOL(drm_sched_job_cleanup);
>  
> -/**
> - * drm_sched_can_queue -- Can we queue more to the hardware?
> - * @sched: scheduler instance
> - *
> - * Return true if we can push more jobs to the hw, otherwise false.
> - */
> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> -{
> -	return atomic_read(&sched->hw_rq_count) <
> -		sched->hw_submission_limit;
> -}
> -
>  /**
>   * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>   * @sched: scheduler instance
> @@ -906,43 +987,7 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>  void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>  {
>  	if (drm_sched_can_queue(sched))
> -		drm_sched_wqueue_enqueue(sched);
> -}
> -
> -/**
> - * drm_sched_select_entity - Select next entity to process
> - *
> - * @sched: scheduler instance
> - *
> - * Returns the entity to process or NULL if none are found.
> - */
> -static struct drm_sched_entity *
> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> -{
> -	struct drm_sched_entity *entity;
> -	int i;
> -
> -	if (!drm_sched_can_queue(sched))
> -		return NULL;
> -
> -	if (sched->single_entity) {
> -		if (!READ_ONCE(sched->single_entity->stopped) &&
> -		    drm_sched_entity_is_ready(sched->single_entity))
> -			return sched->single_entity;
> -
> -		return NULL;
> -	}
> -
> -	/* Kernel run queue has higher priority than normal run queue*/
> -	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> -		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> -		if (entity)
> -			break;
> -	}
> -
> -	return entity;
> +		drm_sched_run_job_queue(sched);
>  }
>  
>  /**
> @@ -974,8 +1019,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>  						typeof(*next), list);
>  
>  		if (next) {
> -			next->s_fence->scheduled.timestamp =
> -				dma_fence_timestamp(&job->s_fence->finished);
> +			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
> +				     &next->s_fence->scheduled.flags))
> +				next->s_fence->scheduled.timestamp =
> +					dma_fence_timestamp(&job->s_fence->finished);
>  			/* start TO timer for next job */
>  			drm_sched_start_timeout(sched);
>  		}
> @@ -1025,74 +1072,83 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>  EXPORT_SYMBOL(drm_sched_pick_best);
>  
>  /**
> - * drm_sched_main - main scheduler thread
> + * drm_sched_free_job_work - worker to call free_job
>   *
> - * @param: scheduler instance
> + * @w: free job work
>   */
> -static void drm_sched_main(struct work_struct *w)
> +static void drm_sched_free_job_work(struct work_struct *w)
>  {
>  	struct drm_gpu_scheduler *sched =
> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> -	struct drm_sched_entity *entity;
> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>  	struct drm_sched_job *cleanup_job;
> -	int r;
>  
>  	if (READ_ONCE(sched->pause_submit))
>  		return;
>  
>  	cleanup_job = drm_sched_get_cleanup_job(sched);
> -	entity = drm_sched_select_entity(sched);
> -
> -	if (!entity && !cleanup_job)
> -		return;	/* No more work */
> -
> -	if (cleanup_job)
> +	if (cleanup_job) {
>  		sched->ops->free_job(cleanup_job);
>  
> -	if (entity) {
> -		struct dma_fence *fence;
> -		struct drm_sched_fence *s_fence;
> -		struct drm_sched_job *sched_job;
> -
> -		sched_job = drm_sched_entity_pop_job(entity);
> -		if (!sched_job) {
> -			complete_all(&entity->entity_idle);
> -			if (!cleanup_job)
> -				return;	/* No more work */
> -			goto again;
> -		}
> +		drm_sched_free_job_queue_if_ready(sched);
> +		drm_sched_run_job_queue_if_ready(sched);
> +	}
> +}
>  
> -		s_fence = sched_job->s_fence;
> +/**
> + * drm_sched_run_job_work - worker to call run_job
> + *
> + * @w: run job work
> + */
> +static void drm_sched_run_job_work(struct work_struct *w)
> +{
> +	struct drm_gpu_scheduler *sched =
> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> +	struct drm_sched_entity *entity;
> +	struct dma_fence *fence;
> +	struct drm_sched_fence *s_fence;
> +	struct drm_sched_job *sched_job;
> +	int r;
>  
> -		atomic_inc(&sched->hw_rq_count);
> -		drm_sched_job_begin(sched_job);
> +	if (READ_ONCE(sched->pause_submit))
> +		return;
> +
> +	entity = drm_sched_select_entity(sched, true);
> +	if (!entity)
> +		return;
>  
> -		trace_drm_run_job(sched_job, entity);
> -		fence = sched->ops->run_job(sched_job);
> +	sched_job = drm_sched_entity_pop_job(entity);
> +	if (!sched_job) {
>  		complete_all(&entity->entity_idle);
> -		drm_sched_fence_scheduled(s_fence, fence);
> +		return;	/* No more work */
> +	}
>  
> -		if (!IS_ERR_OR_NULL(fence)) {
> -			/* Drop for original kref_init of the fence */
> -			dma_fence_put(fence);
> +	s_fence = sched_job->s_fence;
>  
> -			r = dma_fence_add_callback(fence, &sched_job->cb,
> -						   drm_sched_job_done_cb);
> -			if (r == -ENOENT)
> -				drm_sched_job_done(sched_job, fence->error);
> -			else if (r)
> -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> -					  r);
> -		} else {
> -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> -					   PTR_ERR(fence) : 0);
> -		}
> +	atomic_inc(&sched->hw_rq_count);
> +	drm_sched_job_begin(sched_job);
>  
> -		wake_up(&sched->job_scheduled);
> +	trace_drm_run_job(sched_job, entity);
> +	fence = sched->ops->run_job(sched_job);
> +	complete_all(&entity->entity_idle);
> +	drm_sched_fence_scheduled(s_fence, fence);
> +
> +	if (!IS_ERR_OR_NULL(fence)) {
> +		/* Drop for original kref_init of the fence */
> +		dma_fence_put(fence);
> +
> +		r = dma_fence_add_callback(fence, &sched_job->cb,
> +					   drm_sched_job_done_cb);
> +		if (r == -ENOENT)
> +			drm_sched_job_done(sched_job, fence->error);
> +		else if (r)
> +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> +	} else {
> +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> +				   PTR_ERR(fence) : 0);
>  	}
>  
> -again:
> -	drm_sched_wqueue_enqueue(sched);
> +	wake_up(&sched->job_scheduled);
> +	drm_sched_run_job_queue_if_ready(sched);
>  }
>  
>  /**
> @@ -1159,7 +1215,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  	spin_lock_init(&sched->job_list_lock);
>  	atomic_set(&sched->hw_rq_count, 0);
>  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> -	INIT_WORK(&sched->work_submit, drm_sched_main);
> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>  	atomic_set(&sched->_score, 0);
>  	atomic64_set(&sched->job_id_count, 0);
>  	sched->pause_submit = false;
> @@ -1282,7 +1339,8 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
>  void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
>  {
>  	WRITE_ONCE(sched->pause_submit, true);
> -	cancel_work_sync(&sched->work_submit);
> +	cancel_work_sync(&sched->work_run_job);
> +	cancel_work_sync(&sched->work_free_job);
>  }
>  EXPORT_SYMBOL(drm_sched_wqueue_stop);
>  
> @@ -1294,6 +1352,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
>  void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
>  {
>  	WRITE_ONCE(sched->pause_submit, false);
> -	queue_work(sched->submit_wq, &sched->work_submit);
> +	queue_work(sched->submit_wq, &sched->work_run_job);
> +	queue_work(sched->submit_wq, &sched->work_free_job);
>  }
>  EXPORT_SYMBOL(drm_sched_wqueue_start);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index e67b53c3546b..625ffe040de3 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> - * @submit_wq: workqueue used to queue @work_submit
> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>   * @timeout_wq: workqueue used to queue @work_tdr
> - * @work_submit: schedules jobs and cleans up entities
> + * @work_run_job: schedules jobs

Perhaps a more descriptive explanation could be had,

	@work_run_job: work which calls run_job op of each scheduler.

> + * @work_free_job: cleans up jobs
>   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>   *            timeout interval is over.
>   * @pending_list: the list of jobs which are currently in the job queue.
> @@ -519,7 +520,8 @@ struct drm_gpu_scheduler {
>  	atomic64_t			job_id_count;
>  	struct workqueue_struct		*submit_wq;
>  	struct workqueue_struct		*timeout_wq;
> -	struct work_struct		work_submit;
> +	struct work_struct		work_run_job;
> +	struct work_struct		work_free_job;
>  	struct delayed_work		work_tdr;
>  	struct list_head		pending_list;
>  	spinlock_t			job_list_lock;
Matthew Brost Oct. 19, 2023, 4:55 p.m. UTC | #2
On Wed, Oct 18, 2023 at 09:25:36PM -0400, Luben Tuikov wrote:
> Hi,
> 
> On 2023-10-17 11:09, Matthew Brost wrote:
> > Rather than call free_job and run_job in same work item have a dedicated
> > work item for each. This aligns with the design and intended use of work
> > queues.
> > 
> > v2:
> >    - Test for DMA_FENCE_FLAG_TIMESTAMP_BIT before setting
> >      timestamp in free_job() work item (Danilo)
> > v3:
> >   - Drop forward dec of drm_sched_select_entity (Boris)
> >   - Return in drm_sched_run_job_work if entity NULL (Boris)
> > v4:
> >   - Replace dequeue with peek and invert logic (Luben)
> >   - Wrap to 100 lines (Luben)
> >   - Update comments for *_queue / *_queue_if_ready functions (Luben)
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/scheduler/sched_main.c | 287 +++++++++++++++----------
> >  include/drm/gpu_scheduler.h            |   8 +-
> >  2 files changed, 178 insertions(+), 117 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index 273e0fbc4eab..b1b8d9f96da5 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> >   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> >   *
> >   * @rq: scheduler run queue to check.
> > + * @peek: Just find, don't set to current.
> 
> The "peek" rename is good--thanks!
> 
> >   *
> >   * Try to find a ready entity, returns NULL if none found.
> >   */
> >  static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool peek)
> >  {
> >  	struct drm_sched_entity *entity;
> >  
> > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >  	if (entity) {
> >  		list_for_each_entry_continue(entity, &rq->entities, list) {
> >  			if (drm_sched_entity_is_ready(entity)) {
> > -				rq->current_entity = entity;
> > -				reinit_completion(&entity->entity_idle);
> > +				if (!peek) {
> > +					rq->current_entity = entity;
> > +					reinit_completion(&entity->entity_idle);
> > +				}
> >  				spin_unlock(&rq->lock);
> >  				return entity;
> >  			}
> > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >  	list_for_each_entry(entity, &rq->entities, list) {
> >  
> >  		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (!peek) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >  			spin_unlock(&rq->lock);
> >  			return entity;
> >  		}
> > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> >   *
> >   * @rq: scheduler run queue to check.
> > + * @peek: Just find, don't set to current.
> >   *
> >   * Find oldest waiting ready entity, returns NULL if none found.
> >   */
> >  static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool peek)
> >  {
> >  	struct rb_node *rb;
> >  
> > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >  
> >  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> >  		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (!peek) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >  			break;
> >  		}
> >  	}
> > @@ -282,13 +290,98 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >  }
> >  
> >  /**
> > - * drm_sched_wqueue_enqueue - enqueue scheduler submission
> > + * drm_sched_run_job_queue - enqueue run-job work
> 
> Hmm... so this removes the change from patch 1 in this series, and as such
> obviates patch 1.
> 
> Do you want to go with "run_job_queue" and the names introduced here?
> 

Yes, I like the run_job_queue name here.

> In this case, we can have that in patch 1 instead, and this patch
> would only split the "free job" into its own work item.
>

Sure, so s/drm_sched_wqueue_enqueue/drm_sched_run_job_queue in patch #1?
 
> > + * @sched: scheduler instance
> > + */
> > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (!READ_ONCE(sched->pause_submit))
> > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > +}
> > +
> > +/**
> > + * drm_sched_can_queue -- Can we queue more to the hardware?
> > + * @sched: scheduler instance
> > + *
> > + * Return true if we can push more jobs to the hw, otherwise false.
> > + */
> > +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > +{
> > +	return atomic_read(&sched->hw_rq_count) <
> > +		sched->hw_submission_limit;
> > +}
> > +
> > +/**
> > + * drm_sched_select_entity - Select next entity to process
> > + *
> > + * @sched: scheduler instance
> > + * @peek: Just find, don't set to current.
> > + *
> > + * Returns the entity to process or NULL if none are found.
> > + */
> > +static struct drm_sched_entity *
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool peek)
> > +{
> > +	struct drm_sched_entity *entity;
> > +	int i;
> > +
> > +	if (!drm_sched_can_queue(sched))
> > +		return NULL;
> > +
> > +	if (sched->single_entity) {
> > +		if (!READ_ONCE(sched->single_entity->stopped) &&
> > +		    drm_sched_entity_is_ready(sched->single_entity))
> > +			return sched->single_entity;
> > +
> > +		return NULL;
> > +	}
> > +
> > +	/* Kernel run queue has higher priority than normal run queue*/
> > +	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > +		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i], peek) :
> > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i], peek);
> > +		if (entity)
> > +			break;
> > +	}
> > +
> > +	return entity;
> > +}
> 
> Can you shed some light on why you need the "peek" capability, i.e. to just
> see the entity without actually arming it?
> 
> In a single-entity scheduling, surely peeking at the single entity of
> a scheduler, can be done easier.
> 
> I'm asking as none of these functions were intended as a peek-only/-capable
> solution, and if you need it, this shows that the infrastructure lacks
> something which you need.
> 
> It's not designed for "peek", as you can see above: it gets passed-through
> the function stack until it reaches core functionality to be used in logic.
> 
> (It just makes the code too complicated with too many special cases, "if's"
> and if we keep doing this, it would become very hard to understand,
> maintain, and contribute to in a few years.)
>

This question made me realize that the callers of this function inverted
the peek arguments. Will fix in next rev.

The peek is needed for drm_sched_run_job_queue_if_ready, which checks if a
job is ready before queuing the run-job worker. This is called from the
free-job worker (the hw submission count decrement can make a new job ready)
or from the run-job worker (another job may be in the queue).

If we want to blindly queue the job in these cases then this can be
removed.

Or alternatively we could remove the peek argument and blindly reinit the
idle completion when selecting an entity. I think this is fine if I am
reading the code correctly. If you agree, I'd lean towards this option.
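
For illustration, option (1) would make the "if ready" helper trivial --
something like the sketch below, which only uses functions already added in
this patch:

static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
{
        /*
         * Don't peek at all: unconditionally (re)queue the run-job work.
         * drm_sched_run_job_work() already returns early when
         * drm_sched_select_entity() finds nothing runnable, so the only
         * cost is an occasional no-op pass through the worker.
         */
        drm_sched_run_job_queue(sched);
}

Not proposing that as-is, just showing what "blindly queue" would mean here.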
 
> > +
> > +/**
> > + * drm_sched_run_job_queue_if_ready - enqueue run-job work if ready
> > + * @sched: scheduler instance
> > + */
> 
> "if ready" here makes perfect sense, but in the "free job" case,
> it should be "if done". See below.
> 

Yes, agree that if done for job makes more sense.
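
For reference, the free-side helper would keep exactly the body it has in
this patch and just take the _if_done name -- a sketch of the rename, nothing
more:

static void drm_sched_free_job_queue_if_done(struct drm_gpu_scheduler *sched)
{
        struct drm_sched_job *job;

        spin_lock(&sched->job_list_lock);
        /* Jobs are freed in order, so only the oldest pending job matters. */
        job = list_first_entry_or_null(&sched->pending_list,
                                       struct drm_sched_job, list);
        /* "Done" == on the pending list with its finished fence signaled. */
        if (job && dma_fence_is_signaled(&job->s_fence->finished))
                drm_sched_free_job_queue(sched);
        spin_unlock(&sched->job_list_lock);
}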

> > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (drm_sched_select_entity(sched, false))
> > +		drm_sched_run_job_queue(sched);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue - enqueue free-job work
> >   * @sched: scheduler instance
> >   */
> > -static void drm_sched_wqueue_enqueue(struct drm_gpu_scheduler *sched)
> > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> >  {
> >  	if (!READ_ONCE(sched->pause_submit))
> > -		queue_work(sched->submit_wq, &sched->work_submit);
> > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue_if_ready - enqueue free-job work if ready
> 
> This should be "if done". Please change this to "if done".
> 
> > + * @sched: scheduler instance
> > + */
> > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> 
> This should be "if_done". Please change this to "if_done".
> 
> A "job" is _done_ when it's on the pending list and its fence has been signalled,
> and we free a job only when it's done, not "ready".
> 

Agree on all of this.

> > +{
> > +	struct drm_sched_job *job;
> > +
> > +	spin_lock(&sched->job_list_lock);
> > +	job = list_first_entry_or_null(&sched->pending_list,
> > +				       struct drm_sched_job, list);
> > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > +		drm_sched_free_job_queue(sched);
> > +	spin_unlock(&sched->job_list_lock);
> >  }
> >  
> >  /**
> > @@ -310,7 +403,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> >  	dma_fence_get(&s_fence->finished);
> >  	drm_sched_fence_finished(s_fence, result);
> >  	dma_fence_put(&s_fence->finished);
> > -	drm_sched_wqueue_enqueue(sched);
> > +	drm_sched_free_job_queue(sched);
> >  }
> >  
> >  /**
> > @@ -576,7 +669,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
> >  				drm_sched_job_done(s_job, fence->error);
> >  			else if (r)
> >  				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > -					  r);
> > +					      r);
> >  		} else
> >  			drm_sched_job_done(s_job, -ECANCELED);
> >  	}
> > @@ -885,18 +978,6 @@ void drm_sched_job_cleanup(struct drm_sched_job *job)
> >  }
> >  EXPORT_SYMBOL(drm_sched_job_cleanup);
> >  
> > -/**
> > - * drm_sched_can_queue -- Can we queue more to the hardware?
> > - * @sched: scheduler instance
> > - *
> > - * Return true if we can push more jobs to the hw, otherwise false.
> > - */
> > -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > -{
> > -	return atomic_read(&sched->hw_rq_count) <
> > -		sched->hw_submission_limit;
> > -}
> > -
> >  /**
> >   * drm_sched_wakeup_if_can_queue - Wake up the scheduler
> >   * @sched: scheduler instance
> > @@ -906,43 +987,7 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> >  void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> >  {
> >  	if (drm_sched_can_queue(sched))
> > -		drm_sched_wqueue_enqueue(sched);
> > -}
> > -
> > -/**
> > - * drm_sched_select_entity - Select next entity to process
> > - *
> > - * @sched: scheduler instance
> > - *
> > - * Returns the entity to process or NULL if none are found.
> > - */
> > -static struct drm_sched_entity *
> > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > -{
> > -	struct drm_sched_entity *entity;
> > -	int i;
> > -
> > -	if (!drm_sched_can_queue(sched))
> > -		return NULL;
> > -
> > -	if (sched->single_entity) {
> > -		if (!READ_ONCE(sched->single_entity->stopped) &&
> > -		    drm_sched_entity_is_ready(sched->single_entity))
> > -			return sched->single_entity;
> > -
> > -		return NULL;
> > -	}
> > -
> > -	/* Kernel run queue has higher priority than normal run queue*/
> > -	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > -		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > -		if (entity)
> > -			break;
> > -	}
> > -
> > -	return entity;
> > +		drm_sched_run_job_queue(sched);
> >  }
> >  
> >  /**
> > @@ -974,8 +1019,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
> >  						typeof(*next), list);
> >  
> >  		if (next) {
> > -			next->s_fence->scheduled.timestamp =
> > -				dma_fence_timestamp(&job->s_fence->finished);
> > +			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
> > +				     &next->s_fence->scheduled.flags))
> > +				next->s_fence->scheduled.timestamp =
> > +					dma_fence_timestamp(&job->s_fence->finished);
> >  			/* start TO timer for next job */
> >  			drm_sched_start_timeout(sched);
> >  		}
> > @@ -1025,74 +1072,83 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> >  EXPORT_SYMBOL(drm_sched_pick_best);
> >  
> >  /**
> > - * drm_sched_main - main scheduler thread
> > + * drm_sched_free_job_work - worker to call free_job
> >   *
> > - * @param: scheduler instance
> > + * @w: free job work
> >   */
> > -static void drm_sched_main(struct work_struct *w)
> > +static void drm_sched_free_job_work(struct work_struct *w)
> >  {
> >  	struct drm_gpu_scheduler *sched =
> > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > -	struct drm_sched_entity *entity;
> > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> >  	struct drm_sched_job *cleanup_job;
> > -	int r;
> >  
> >  	if (READ_ONCE(sched->pause_submit))
> >  		return;
> >  
> >  	cleanup_job = drm_sched_get_cleanup_job(sched);
> > -	entity = drm_sched_select_entity(sched);
> > -
> > -	if (!entity && !cleanup_job)
> > -		return;	/* No more work */
> > -
> > -	if (cleanup_job)
> > +	if (cleanup_job) {
> >  		sched->ops->free_job(cleanup_job);
> >  
> > -	if (entity) {
> > -		struct dma_fence *fence;
> > -		struct drm_sched_fence *s_fence;
> > -		struct drm_sched_job *sched_job;
> > -
> > -		sched_job = drm_sched_entity_pop_job(entity);
> > -		if (!sched_job) {
> > -			complete_all(&entity->entity_idle);
> > -			if (!cleanup_job)
> > -				return;	/* No more work */
> > -			goto again;
> > -		}
> > +		drm_sched_free_job_queue_if_ready(sched);
> > +		drm_sched_run_job_queue_if_ready(sched);
> > +	}
> > +}
> >  
> > -		s_fence = sched_job->s_fence;
> > +/**
> > + * drm_sched_run_job_work - worker to call run_job
> > + *
> > + * @w: run job work
> > + */
> > +static void drm_sched_run_job_work(struct work_struct *w)
> > +{
> > +	struct drm_gpu_scheduler *sched =
> > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > +	struct drm_sched_entity *entity;
> > +	struct dma_fence *fence;
> > +	struct drm_sched_fence *s_fence;
> > +	struct drm_sched_job *sched_job;
> > +	int r;
> >  
> > -		atomic_inc(&sched->hw_rq_count);
> > -		drm_sched_job_begin(sched_job);
> > +	if (READ_ONCE(sched->pause_submit))
> > +		return;
> > +
> > +	entity = drm_sched_select_entity(sched, true);
> > +	if (!entity)
> > +		return;
> >  
> > -		trace_drm_run_job(sched_job, entity);
> > -		fence = sched->ops->run_job(sched_job);
> > +	sched_job = drm_sched_entity_pop_job(entity);
> > +	if (!sched_job) {
> >  		complete_all(&entity->entity_idle);
> > -		drm_sched_fence_scheduled(s_fence, fence);
> > +		return;	/* No more work */
> > +	}
> >  
> > -		if (!IS_ERR_OR_NULL(fence)) {
> > -			/* Drop for original kref_init of the fence */
> > -			dma_fence_put(fence);
> > +	s_fence = sched_job->s_fence;
> >  
> > -			r = dma_fence_add_callback(fence, &sched_job->cb,
> > -						   drm_sched_job_done_cb);
> > -			if (r == -ENOENT)
> > -				drm_sched_job_done(sched_job, fence->error);
> > -			else if (r)
> > -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > -					  r);
> > -		} else {
> > -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > -					   PTR_ERR(fence) : 0);
> > -		}
> > +	atomic_inc(&sched->hw_rq_count);
> > +	drm_sched_job_begin(sched_job);
> >  
> > -		wake_up(&sched->job_scheduled);
> > +	trace_drm_run_job(sched_job, entity);
> > +	fence = sched->ops->run_job(sched_job);
> > +	complete_all(&entity->entity_idle);
> > +	drm_sched_fence_scheduled(s_fence, fence);
> > +
> > +	if (!IS_ERR_OR_NULL(fence)) {
> > +		/* Drop for original kref_init of the fence */
> > +		dma_fence_put(fence);
> > +
> > +		r = dma_fence_add_callback(fence, &sched_job->cb,
> > +					   drm_sched_job_done_cb);
> > +		if (r == -ENOENT)
> > +			drm_sched_job_done(sched_job, fence->error);
> > +		else if (r)
> > +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> > +	} else {
> > +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > +				   PTR_ERR(fence) : 0);
> >  	}
> >  
> > -again:
> > -	drm_sched_wqueue_enqueue(sched);
> > +	wake_up(&sched->job_scheduled);
> > +	drm_sched_run_job_queue_if_ready(sched);
> >  }
> >  
> >  /**
> > @@ -1159,7 +1215,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >  	spin_lock_init(&sched->job_list_lock);
> >  	atomic_set(&sched->hw_rq_count, 0);
> >  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> >  	atomic_set(&sched->_score, 0);
> >  	atomic64_set(&sched->job_id_count, 0);
> >  	sched->pause_submit = false;
> > @@ -1282,7 +1339,8 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
> >  void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
> >  {
> >  	WRITE_ONCE(sched->pause_submit, true);
> > -	cancel_work_sync(&sched->work_submit);
> > +	cancel_work_sync(&sched->work_run_job);
> > +	cancel_work_sync(&sched->work_free_job);
> >  }
> >  EXPORT_SYMBOL(drm_sched_wqueue_stop);
> >  
> > @@ -1294,6 +1352,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
> >  void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
> >  {
> >  	WRITE_ONCE(sched->pause_submit, false);
> > -	queue_work(sched->submit_wq, &sched->work_submit);
> > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > +	queue_work(sched->submit_wq, &sched->work_free_job);
> >  }
> >  EXPORT_SYMBOL(drm_sched_wqueue_start);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index e67b53c3546b..625ffe040de3 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> >   *                 finished.
> >   * @hw_rq_count: the number of jobs currently in the hardware queue.
> >   * @job_id_count: used to assign unique id to the each job.
> > - * @submit_wq: workqueue used to queue @work_submit
> > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> >   * @timeout_wq: workqueue used to queue @work_tdr
> > - * @work_submit: schedules jobs and cleans up entities
> > + * @work_run_job: schedules jobs
> 
> Perhaps a more descriptive explanation could be had,
> 
> 	@work_run_job: work which calls run_job op of each scheduler.
>

Got it.
 
> > + * @work_free_job: cleans up jobs
> >   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> >   *            timeout interval is over.
> >   * @pending_list: the list of jobs which are currently in the job queue.
> > @@ -519,7 +520,8 @@ struct drm_gpu_scheduler {
> >  	atomic64_t			job_id_count;
> >  	struct workqueue_struct		*submit_wq;
> >  	struct workqueue_struct		*timeout_wq;
> > -	struct work_struct		work_submit;
> > +	struct work_struct		work_run_job;
> > +	struct work_struct		work_free_job;
> >  	struct delayed_work		work_tdr;
> >  	struct list_head		pending_list;
> >  	spinlock_t			job_list_lock;
> 
> -- 

Thanks for the review.

Matt

> Regards,
> Luben
>
Luben Tuikov Oct. 22, 2023, 1:27 a.m. UTC | #3
Hi,

On 2023-10-19 12:55, Matthew Brost wrote:
> On Wed, Oct 18, 2023 at 09:25:36PM -0400, Luben Tuikov wrote:
>> Hi,
>>
>> On 2023-10-17 11:09, Matthew Brost wrote:
>>> Rather than call free_job and run_job in same work item have a dedicated
>>> work item for each. This aligns with the design and intended use of work
>>> queues.
>>>
>>> v2:
>>>    - Test for DMA_FENCE_FLAG_TIMESTAMP_BIT before setting
>>>      timestamp in free_job() work item (Danilo)
>>> v3:
>>>   - Drop forward dec of drm_sched_select_entity (Boris)
>>>   - Return in drm_sched_run_job_work if entity NULL (Boris)
>>> v4:
>>>   - Replace dequeue with peek and invert logic (Luben)
>>>   - Wrap to 100 lines (Luben)
>>>   - Update comments for *_queue / *_queue_if_ready functions (Luben)
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>  drivers/gpu/drm/scheduler/sched_main.c | 287 +++++++++++++++----------
>>>  include/drm/gpu_scheduler.h            |   8 +-
>>>  2 files changed, 178 insertions(+), 117 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 273e0fbc4eab..b1b8d9f96da5 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>   *
>>>   * @rq: scheduler run queue to check.
>>> + * @peek: Just find, don't set to current.
>>
>> The "peek" rename is good--thanks!
>>
>>>   *
>>>   * Try to find a ready entity, returns NULL if none found.
>>>   */
>>>  static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool peek)
>>>  {
>>>  	struct drm_sched_entity *entity;
>>>  
>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>  	if (entity) {
>>>  		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>  			if (drm_sched_entity_is_ready(entity)) {
>>> -				rq->current_entity = entity;
>>> -				reinit_completion(&entity->entity_idle);
>>> +				if (!peek) {
>>> +					rq->current_entity = entity;
>>> +					reinit_completion(&entity->entity_idle);
>>> +				}
>>>  				spin_unlock(&rq->lock);
>>>  				return entity;
>>>  			}
>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>  	list_for_each_entry(entity, &rq->entities, list) {
>>>  
>>>  		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (!peek) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>  			spin_unlock(&rq->lock);
>>>  			return entity;
>>>  		}
>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>   *
>>>   * @rq: scheduler run queue to check.
>>> + * @peek: Just find, don't set to current.
>>>   *
>>>   * Find oldest waiting ready entity, returns NULL if none found.
>>>   */
>>>  static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool peek)
>>>  {
>>>  	struct rb_node *rb;
>>>  
>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>  
>>>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>  		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (!peek) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>  			break;
>>>  		}
>>>  	}
>>> @@ -282,13 +290,98 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>  }
>>>  
>>>  /**
>>> - * drm_sched_wqueue_enqueue - enqueue scheduler submission
>>> + * drm_sched_run_job_queue - enqueue run-job work
>>
>> Hmm... so this removes the change from patch 1 in this series, and as such
>> obviates patch 1.
>>
>> Do you want to go with "run_job_queue" and the names introduced here?
>>
> 
> Yes, I like the run_job_queue name here.
> 
>> In this case, we can have that in patch 1 instead, and this patch
>> would only split the "free job" into its own work item.
>>
> 
> Sure, so s/drm_sched_wqueue_enqueue/drm_sched_run_job_queue in patch #1?

Yes. That'll create less thrashing about here and essentialize the changes
this patch is doing here (i.e. minimize the number of changes).

I think "run_job_queue" is a good enough name, and please add lucid comments,
so that it's easy to understand--kind of like you're telling a story. Thanks!

Generally, patch series shouldn't introduce a change (like in patch #1) which
the same patch series further changes yet again to something else (like in this
patch here). It should change one thing only once.

>  
>>> + * @sched: scheduler instance
>>> + */
>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (!READ_ONCE(sched->pause_submit))
>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_can_queue -- Can we queue more to the hardware?
>>> + * @sched: scheduler instance
>>> + *
>>> + * Return true if we can push more jobs to the hw, otherwise false.
>>> + */
>>> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	return atomic_read(&sched->hw_rq_count) <
>>> +		sched->hw_submission_limit;
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_select_entity - Select next entity to process
>>> + *
>>> + * @sched: scheduler instance
>>> + * @peek: Just find, don't set to current.
>>> + *
>>> + * Returns the entity to process or NULL if none are found.
>>> + */
>>> +static struct drm_sched_entity *
>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool peek)
>>> +{
>>> +	struct drm_sched_entity *entity;
>>> +	int i;
>>> +
>>> +	if (!drm_sched_can_queue(sched))
>>> +		return NULL;
>>> +
>>> +	if (sched->single_entity) {
>>> +		if (!READ_ONCE(sched->single_entity->stopped) &&
>>> +		    drm_sched_entity_is_ready(sched->single_entity))
>>> +			return sched->single_entity;
>>> +
>>> +		return NULL;
>>> +	}
>>> +
>>> +	/* Kernel run queue has higher priority than normal run queue*/
>>> +	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>> +		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i], peek) :
>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i], peek);
>>> +		if (entity)
>>> +			break;
>>> +	}
>>> +
>>> +	return entity;
>>> +}
>>
>> Can you shed some light on why you need the "peek" capability, i.e. to just
>> see the entity without actually arming it?
>>
>> In a single-entity scheduling, surely peeking at the single entity of
>> a scheduler, can be done easier.
>>
>> I'm asking as none of these functions were intended as a peek-only/-capable
>> solution, and if you need it, this shows that the infrastructure lacks
>> something which you need.
>>
>> It's not designed for "peek", as you can see above: it gets passed-through
>> the function stack until it reaches core functionality to be used in logic.
>>
>> (It just makes the code too complicated with too many special cases, "if's"
>> and if we keep doing this, it would become very hard to understand,
>> maintain, and contribute to in a few years.)
>>
> 
> This question made me realize that the callers of this function inverted
> the peek arguments. Will fix in next rev.
> 
> The peek is needed for drm_sched_run_job_queue_if_ready, which checks if a
> job is ready before queuing the run-job worker. This is called from the
> free-job worker (the hw submission count decrement can make a new job ready)
> or from the run-job worker (another job may be in the queue).

Yes, we kind of want "queue if ready" to be "atomic": instead of thrashing
about like,
	if run_if_ready(peek),
		j = run_if_ready(!peek),
			run_job(j).
we want it to become
	j = run_if_ready(void).
	if (j) run_job(j).

> If we want to blindly queue the job in these cases then this can be
> removed.

Yes, it's what we're doing now.

> Or alternatively we could remove the peek argument and blindly reinit the
> idle completion when selecting an entity. I think this is fine if I am
> reading the code correctly. If you agree, I'd lean towards this option.

Yes please--it's what we have now, so no need to change it. select_entity()
really just gives back the entity whose jobs we can (should) run next.

So, yes, let's leave it as is, without a "peek". Thanks!
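
That is, roughly (a sketch of the agreed direction, not what this revision
does):

static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
{
        /*
         * select_entity() keeps its current behaviour (no peek flag);
         * per the discussion above, re-initializing the entity's idle
         * completion here is acceptable, and the run-job worker simply
         * selects again before popping a job.
         */
        if (drm_sched_select_entity(sched))
                drm_sched_run_job_queue(sched);
}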

>  
>>> +
>>> +/**
>>> + * drm_sched_run_job_queue_if_ready - enqueue run-job work if ready
>>> + * @sched: scheduler instance
>>> + */
>>
>> "if ready" here makes perfect sense, but in the "free job" case,
>> it should be "if done". See below.
>>
> 
> Yes, agree that if done for job makes more sense.
> 
>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (drm_sched_select_entity(sched, false))
>>> +		drm_sched_run_job_queue(sched);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue - enqueue free-job work
>>>   * @sched: scheduler instance
>>>   */
>>> -static void drm_sched_wqueue_enqueue(struct drm_gpu_scheduler *sched)
>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>>  {
>>>  	if (!READ_ONCE(sched->pause_submit))
>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue_if_ready - enqueue free-job work if ready
>>
>> This should be "if done". Please change this to "if done".
>>
>>> + * @sched: scheduler instance
>>> + */
>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>
>> This should be "if_done". Please change this to "if_done".
>>
>> A "job" is _done_ when it's on the pending list and its fence has been signalled,
>> and we free a job only when it's done, not "ready".
>>
> 
> Agree on all of this.

Great! :-)

Regards,
Luben

> 
>>> +{
>>> +	struct drm_sched_job *job;
>>> +
>>> +	spin_lock(&sched->job_list_lock);
>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>> +				       struct drm_sched_job, list);
>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>> +		drm_sched_free_job_queue(sched);
>>> +	spin_unlock(&sched->job_list_lock);
>>>  }
>>>  
>>>  /**
>>> @@ -310,7 +403,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>  	dma_fence_get(&s_fence->finished);
>>>  	drm_sched_fence_finished(s_fence, result);
>>>  	dma_fence_put(&s_fence->finished);
>>> -	drm_sched_wqueue_enqueue(sched);
>>> +	drm_sched_free_job_queue(sched);
>>>  }
>>>  
>>>  /**
>>> @@ -576,7 +669,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>>  				drm_sched_job_done(s_job, fence->error);
>>>  			else if (r)
>>>  				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
>>> -					  r);
>>> +					      r);
>>>  		} else
>>>  			drm_sched_job_done(s_job, -ECANCELED);
>>>  	}
>>> @@ -885,18 +978,6 @@ void drm_sched_job_cleanup(struct drm_sched_job *job)
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_job_cleanup);
>>>  
>>> -/**
>>> - * drm_sched_can_queue -- Can we queue more to the hardware?
>>> - * @sched: scheduler instance
>>> - *
>>> - * Return true if we can push more jobs to the hw, otherwise false.
>>> - */
>>> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>> -{
>>> -	return atomic_read(&sched->hw_rq_count) <
>>> -		sched->hw_submission_limit;
>>> -}
>>> -
>>>  /**
>>>   * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>>>   * @sched: scheduler instance
>>> @@ -906,43 +987,7 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>  void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>  {
>>>  	if (drm_sched_can_queue(sched))
>>> -		drm_sched_wqueue_enqueue(sched);
>>> -}
>>> -
>>> -/**
>>> - * drm_sched_select_entity - Select next entity to process
>>> - *
>>> - * @sched: scheduler instance
>>> - *
>>> - * Returns the entity to process or NULL if none are found.
>>> - */
>>> -static struct drm_sched_entity *
>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>> -{
>>> -	struct drm_sched_entity *entity;
>>> -	int i;
>>> -
>>> -	if (!drm_sched_can_queue(sched))
>>> -		return NULL;
>>> -
>>> -	if (sched->single_entity) {
>>> -		if (!READ_ONCE(sched->single_entity->stopped) &&
>>> -		    drm_sched_entity_is_ready(sched->single_entity))
>>> -			return sched->single_entity;
>>> -
>>> -		return NULL;
>>> -	}
>>> -
>>> -	/* Kernel run queue has higher priority than normal run queue*/
>>> -	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>> -		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>> -		if (entity)
>>> -			break;
>>> -	}
>>> -
>>> -	return entity;
>>> +		drm_sched_run_job_queue(sched);
>>>  }
>>>  
>>>  /**
>>> @@ -974,8 +1019,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>>  						typeof(*next), list);
>>>  
>>>  		if (next) {
>>> -			next->s_fence->scheduled.timestamp =
>>> -				dma_fence_timestamp(&job->s_fence->finished);
>>> +			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
>>> +				     &next->s_fence->scheduled.flags))
>>> +				next->s_fence->scheduled.timestamp =
>>> +					dma_fence_timestamp(&job->s_fence->finished);
>>>  			/* start TO timer for next job */
>>>  			drm_sched_start_timeout(sched);
>>>  		}
>>> @@ -1025,74 +1072,83 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>  EXPORT_SYMBOL(drm_sched_pick_best);
>>>  
>>>  /**
>>> - * drm_sched_main - main scheduler thread
>>> + * drm_sched_free_job_work - worker to call free_job
>>>   *
>>> - * @param: scheduler instance
>>> + * @w: free job work
>>>   */
>>> -static void drm_sched_main(struct work_struct *w)
>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>  {
>>>  	struct drm_gpu_scheduler *sched =
>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>> -	struct drm_sched_entity *entity;
>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>  	struct drm_sched_job *cleanup_job;
>>> -	int r;
>>>  
>>>  	if (READ_ONCE(sched->pause_submit))
>>>  		return;
>>>  
>>>  	cleanup_job = drm_sched_get_cleanup_job(sched);
>>> -	entity = drm_sched_select_entity(sched);
>>> -
>>> -	if (!entity && !cleanup_job)
>>> -		return;	/* No more work */
>>> -
>>> -	if (cleanup_job)
>>> +	if (cleanup_job) {
>>>  		sched->ops->free_job(cleanup_job);
>>>  
>>> -	if (entity) {
>>> -		struct dma_fence *fence;
>>> -		struct drm_sched_fence *s_fence;
>>> -		struct drm_sched_job *sched_job;
>>> -
>>> -		sched_job = drm_sched_entity_pop_job(entity);
>>> -		if (!sched_job) {
>>> -			complete_all(&entity->entity_idle);
>>> -			if (!cleanup_job)
>>> -				return;	/* No more work */
>>> -			goto again;
>>> -		}
>>> +		drm_sched_free_job_queue_if_ready(sched);
>>> +		drm_sched_run_job_queue_if_ready(sched);
>>> +	}
>>> +}
>>>  
>>> -		s_fence = sched_job->s_fence;
>>> +/**
>>> + * drm_sched_run_job_work - worker to call run_job
>>> + *
>>> + * @w: run job work
>>> + */
>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>> +{
>>> +	struct drm_gpu_scheduler *sched =
>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>> +	struct drm_sched_entity *entity;
>>> +	struct dma_fence *fence;
>>> +	struct drm_sched_fence *s_fence;
>>> +	struct drm_sched_job *sched_job;
>>> +	int r;
>>>  
>>> -		atomic_inc(&sched->hw_rq_count);
>>> -		drm_sched_job_begin(sched_job);
>>> +	if (READ_ONCE(sched->pause_submit))
>>> +		return;
>>> +
>>> +	entity = drm_sched_select_entity(sched, true);
>>> +	if (!entity)
>>> +		return;
>>>  
>>> -		trace_drm_run_job(sched_job, entity);
>>> -		fence = sched->ops->run_job(sched_job);
>>> +	sched_job = drm_sched_entity_pop_job(entity);
>>> +	if (!sched_job) {
>>>  		complete_all(&entity->entity_idle);
>>> -		drm_sched_fence_scheduled(s_fence, fence);
>>> +		return;	/* No more work */
>>> +	}
>>>  
>>> -		if (!IS_ERR_OR_NULL(fence)) {
>>> -			/* Drop for original kref_init of the fence */
>>> -			dma_fence_put(fence);
>>> +	s_fence = sched_job->s_fence;
>>>  
>>> -			r = dma_fence_add_callback(fence, &sched_job->cb,
>>> -						   drm_sched_job_done_cb);
>>> -			if (r == -ENOENT)
>>> -				drm_sched_job_done(sched_job, fence->error);
>>> -			else if (r)
>>> -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
>>> -					  r);
>>> -		} else {
>>> -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
>>> -					   PTR_ERR(fence) : 0);
>>> -		}
>>> +	atomic_inc(&sched->hw_rq_count);
>>> +	drm_sched_job_begin(sched_job);
>>>  
>>> -		wake_up(&sched->job_scheduled);
>>> +	trace_drm_run_job(sched_job, entity);
>>> +	fence = sched->ops->run_job(sched_job);
>>> +	complete_all(&entity->entity_idle);
>>> +	drm_sched_fence_scheduled(s_fence, fence);
>>> +
>>> +	if (!IS_ERR_OR_NULL(fence)) {
>>> +		/* Drop for original kref_init of the fence */
>>> +		dma_fence_put(fence);
>>> +
>>> +		r = dma_fence_add_callback(fence, &sched_job->cb,
>>> +					   drm_sched_job_done_cb);
>>> +		if (r == -ENOENT)
>>> +			drm_sched_job_done(sched_job, fence->error);
>>> +		else if (r)
>>> +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
>>> +	} else {
>>> +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
>>> +				   PTR_ERR(fence) : 0);
>>>  	}
>>>  
>>> -again:
>>> -	drm_sched_wqueue_enqueue(sched);
>>> +	wake_up(&sched->job_scheduled);
>>> +	drm_sched_run_job_queue_if_ready(sched);
>>>  }
>>>  
>>>  /**
>>> @@ -1159,7 +1215,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>  	spin_lock_init(&sched->job_list_lock);
>>>  	atomic_set(&sched->hw_rq_count, 0);
>>>  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>  	atomic_set(&sched->_score, 0);
>>>  	atomic64_set(&sched->job_id_count, 0);
>>>  	sched->pause_submit = false;
>>> @@ -1282,7 +1339,8 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
>>>  void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
>>>  {
>>>  	WRITE_ONCE(sched->pause_submit, true);
>>> -	cancel_work_sync(&sched->work_submit);
>>> +	cancel_work_sync(&sched->work_run_job);
>>> +	cancel_work_sync(&sched->work_free_job);
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_wqueue_stop);
>>>  
>>> @@ -1294,6 +1352,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
>>>  void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
>>>  {
>>>  	WRITE_ONCE(sched->pause_submit, false);
>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>  }
>>>  EXPORT_SYMBOL(drm_sched_wqueue_start);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index e67b53c3546b..625ffe040de3 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>   *                 finished.
>>>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>   * @job_id_count: used to assign unique id to the each job.
>>> - * @submit_wq: workqueue used to queue @work_submit
>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>   * @timeout_wq: workqueue used to queue @work_tdr
>>> - * @work_submit: schedules jobs and cleans up entities
>>> + * @work_run_job: schedules jobs
>>
>> Perhaps a more descriptive explanation could be had,
>>
>> 	@work_run_job: work which calls run_job op of each scheduler.
>>
> 
> Got it.
>  
>>> + * @work_free_job: cleans up jobs
>>>   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>   *            timeout interval is over.
>>>   * @pending_list: the list of jobs which are currently in the job queue.
>>> @@ -519,7 +520,8 @@ struct drm_gpu_scheduler {
>>>  	atomic64_t			job_id_count;
>>>  	struct workqueue_struct		*submit_wq;
>>>  	struct workqueue_struct		*timeout_wq;
>>> -	struct work_struct		work_submit;
>>> +	struct work_struct		work_run_job;
>>> +	struct work_struct		work_free_job;
>>>  	struct delayed_work		work_tdr;
>>>  	struct list_head		pending_list;
>>>  	spinlock_t			job_list_lock;
>>
>> -- 
> 
> Thanks for the review.
> 
> Matt
> 
>> Regards,
>> Luben
>>
Boris Brezillon Oct. 23, 2023, 12:16 p.m. UTC | #4
Hi,

On Tue, 17 Oct 2023 08:09:56 -0700
Matthew Brost <matthew.brost@intel.com> wrote:

> +static void drm_sched_run_job_work(struct work_struct *w)
> +{
> +	struct drm_gpu_scheduler *sched =
> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> +	struct drm_sched_entity *entity;
> +	struct dma_fence *fence;
> +	struct drm_sched_fence *s_fence;
> +	struct drm_sched_job *sched_job;
> +	int r;
>  
> -		atomic_inc(&sched->hw_rq_count);
> -		drm_sched_job_begin(sched_job);
> +	if (READ_ONCE(sched->pause_submit))
> +		return;
> +
> +	entity = drm_sched_select_entity(sched, true);
> +	if (!entity)
> +		return;
>  
> -		trace_drm_run_job(sched_job, entity);
> -		fence = sched->ops->run_job(sched_job);
> +	sched_job = drm_sched_entity_pop_job(entity);
> +	if (!sched_job) {
>  		complete_all(&entity->entity_idle);
> -		drm_sched_fence_scheduled(s_fence, fence);
> +		return;	/* No more work */
> +	}
>  
> -		if (!IS_ERR_OR_NULL(fence)) {
> -			/* Drop for original kref_init of the fence */
> -			dma_fence_put(fence);
> +	s_fence = sched_job->s_fence;
>  
> -			r = dma_fence_add_callback(fence, &sched_job->cb,
> -						   drm_sched_job_done_cb);
> -			if (r == -ENOENT)
> -				drm_sched_job_done(sched_job, fence->error);
> -			else if (r)
> -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> -					  r);
> -		} else {
> -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> -					   PTR_ERR(fence) : 0);
> -		}
> +	atomic_inc(&sched->hw_rq_count);
> +	drm_sched_job_begin(sched_job);
>  
> -		wake_up(&sched->job_scheduled);
> +	trace_drm_run_job(sched_job, entity);
> +	fence = sched->ops->run_job(sched_job);
> +	complete_all(&entity->entity_idle);
> +	drm_sched_fence_scheduled(s_fence, fence);
> +
> +	if (!IS_ERR_OR_NULL(fence)) {
> +		/* Drop for original kref_init of the fence */
> +		dma_fence_put(fence);
> +
> +		r = dma_fence_add_callback(fence, &sched_job->cb,
> +					   drm_sched_job_done_cb);
> +		if (r == -ENOENT)
> +			drm_sched_job_done(sched_job, fence->error);
> +		else if (r)
> +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> +	} else {
> +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> +				   PTR_ERR(fence) : 0);
>  	}

Just ran into a race condition when using a non-ordered workqueue
for drm_sched:

thread A							thread B

drm_sched_run_job_work()			
  drm_sched_job_begin()
    // inserts jobA in pending_list

								drm_sched_free_job_work()
								  drm_sched_get_cleanup_job()
								    // check first job in pending list
								    // if s_fence->parent == NULL, consider
								    // for cleanup
								  ->free_job(jobA)
								    drm_sched_job_cleanup()
								      // set sched_job->s_fence = NULL

    ->run_job()
    drm_sched_fence_scheduled()
      // sched_job->s_fence->parent = parent_fence
      // BOOM => NULL pointer deref

For now, I'll just use a dedicated ordered wq, but if we claim
multi-threaded workqueues are supported, this is probably worth fixing.
I know there have been some discussions about when the timeout should be
started, and the job insertion in the pending_list is kinda related.
If we want this insertion to happen before ->run_job() is called, we
need a way to flag when a job is inserted, but not fully submitted yet.
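
For reference, the dedicated ordered wq workaround amounts to something
like the sketch below. This is hypothetical driver-side code, not part of
this patch: the workqueue name is made up, and it assumes the
drm_sched_init() variant from earlier in this series that lets the driver
pass its own submit_wq.

	/*
	 * Sketch: serialize the scheduler's work items with a dedicated
	 * ordered workqueue.  An ordered workqueue has max_active == 1, so
	 * work_run_job and work_free_job can never run concurrently for
	 * this scheduler.
	 */
	struct workqueue_struct *submit_wq;

	submit_wq = alloc_ordered_workqueue("gpu-sched-submit", 0);
	if (!submit_wq)
		return -ENOMEM;

	/* ... hand submit_wq to drm_sched_init() as the scheduler's
	 * submit_wq, and destroy_workqueue() it after drm_sched_fini(). */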

Regards,

Boris
Boris Brezillon Oct. 23, 2023, 12:39 p.m. UTC | #5
On Mon, 23 Oct 2023 14:16:06 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> Hi,
> 
> On Tue, 17 Oct 2023 08:09:56 -0700
> Matthew Brost <matthew.brost@intel.com> wrote:
> 
> > +static void drm_sched_run_job_work(struct work_struct *w)
> > +{
> > +	struct drm_gpu_scheduler *sched =
> > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > +	struct drm_sched_entity *entity;
> > +	struct dma_fence *fence;
> > +	struct drm_sched_fence *s_fence;
> > +	struct drm_sched_job *sched_job;
> > +	int r;
> >  
> > -		atomic_inc(&sched->hw_rq_count);
> > -		drm_sched_job_begin(sched_job);
> > +	if (READ_ONCE(sched->pause_submit))
> > +		return;
> > +
> > +	entity = drm_sched_select_entity(sched, true);
> > +	if (!entity)
> > +		return;
> >  
> > -		trace_drm_run_job(sched_job, entity);
> > -		fence = sched->ops->run_job(sched_job);
> > +	sched_job = drm_sched_entity_pop_job(entity);
> > +	if (!sched_job) {
> >  		complete_all(&entity->entity_idle);
> > -		drm_sched_fence_scheduled(s_fence, fence);
> > +		return;	/* No more work */
> > +	}
> >  
> > -		if (!IS_ERR_OR_NULL(fence)) {
> > -			/* Drop for original kref_init of the fence */
> > -			dma_fence_put(fence);
> > +	s_fence = sched_job->s_fence;
> >  
> > -			r = dma_fence_add_callback(fence, &sched_job->cb,
> > -						   drm_sched_job_done_cb);
> > -			if (r == -ENOENT)
> > -				drm_sched_job_done(sched_job, fence->error);
> > -			else if (r)
> > -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > -					  r);
> > -		} else {
> > -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > -					   PTR_ERR(fence) : 0);
> > -		}
> > +	atomic_inc(&sched->hw_rq_count);
> > +	drm_sched_job_begin(sched_job);
> >  
> > -		wake_up(&sched->job_scheduled);
> > +	trace_drm_run_job(sched_job, entity);
> > +	fence = sched->ops->run_job(sched_job);
> > +	complete_all(&entity->entity_idle);
> > +	drm_sched_fence_scheduled(s_fence, fence);
> > +
> > +	if (!IS_ERR_OR_NULL(fence)) {
> > +		/* Drop for original kref_init of the fence */
> > +		dma_fence_put(fence);
> > +
> > +		r = dma_fence_add_callback(fence, &sched_job->cb,
> > +					   drm_sched_job_done_cb);
> > +		if (r == -ENOENT)
> > +			drm_sched_job_done(sched_job, fence->error);
> > +		else if (r)
> > +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> > +	} else {
> > +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > +				   PTR_ERR(fence) : 0);
> >  	}  
> 
> Just ran into a race condition when using a non-ordered workqueue
> for drm_sched:
> 
> thread A							thread B
> 
> drm_sched_run_job_work()			
>   drm_sched_job_begin()
>     // inserts jobA in pending_list
> 
> 								drm_sched_free_job_work()
> 								  drm_sched_get_cleanup_job()
> 								    // check first job in pending list
> 								    // if s_fence->parent == NULL, consider
> 								    // for cleanup
> 								  ->free_job(jobA)  
> 								    drm_sched_job_cleanup()
> 								      // set sched_job->s_fence = NULL
> 
>     ->run_job()  
>     drm_sched_fence_scheduled()

Correction: the NULL pointer deref happens in drm_sched_job_done()
(when the driver returns an error directly), not in
drm_sched_fence_scheduled(), but the problem remains the same.


>       // sched_job->s_fence->parent = parent_fence
>       // BOOM => NULL pointer deref
> 
> For now, I'll just use a dedicated ordered wq, but if we claim
> multi-threaded workqueues are supported, this is probably worth fixing.
> I know there's been some discussions about when the timeout should be
> started, and the job insertion in the pending_list is kinda related.
> If we want this insertion to happen before ->run_job() is called, we
> need a way to flag when a job is inserted, but not fully submitted yet.
> 
> Regards,
> 
> Boris
Matthew Brost Oct. 23, 2023, 1:54 p.m. UTC | #6
On Mon, Oct 23, 2023 at 02:39:37PM +0200, Boris Brezillon wrote:
> On Mon, 23 Oct 2023 14:16:06 +0200
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
> > Hi,
> > 
> > On Tue, 17 Oct 2023 08:09:56 -0700
> > Matthew Brost <matthew.brost@intel.com> wrote:
> > 
> > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > +{
> > > +	struct drm_gpu_scheduler *sched =
> > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > +	struct drm_sched_entity *entity;
> > > +	struct dma_fence *fence;
> > > +	struct drm_sched_fence *s_fence;
> > > +	struct drm_sched_job *sched_job;
> > > +	int r;
> > >  
> > > -		atomic_inc(&sched->hw_rq_count);
> > > -		drm_sched_job_begin(sched_job);
> > > +	if (READ_ONCE(sched->pause_submit))
> > > +		return;
> > > +
> > > +	entity = drm_sched_select_entity(sched, true);
> > > +	if (!entity)
> > > +		return;
> > >  
> > > -		trace_drm_run_job(sched_job, entity);
> > > -		fence = sched->ops->run_job(sched_job);
> > > +	sched_job = drm_sched_entity_pop_job(entity);
> > > +	if (!sched_job) {
> > >  		complete_all(&entity->entity_idle);
> > > -		drm_sched_fence_scheduled(s_fence, fence);
> > > +		return;	/* No more work */
> > > +	}
> > >  
> > > -		if (!IS_ERR_OR_NULL(fence)) {
> > > -			/* Drop for original kref_init of the fence */
> > > -			dma_fence_put(fence);
> > > +	s_fence = sched_job->s_fence;
> > >  
> > > -			r = dma_fence_add_callback(fence, &sched_job->cb,
> > > -						   drm_sched_job_done_cb);
> > > -			if (r == -ENOENT)
> > > -				drm_sched_job_done(sched_job, fence->error);
> > > -			else if (r)
> > > -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > > -					  r);
> > > -		} else {
> > > -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > -					   PTR_ERR(fence) : 0);
> > > -		}
> > > +	atomic_inc(&sched->hw_rq_count);
> > > +	drm_sched_job_begin(sched_job);
> > >  
> > > -		wake_up(&sched->job_scheduled);
> > > +	trace_drm_run_job(sched_job, entity);
> > > +	fence = sched->ops->run_job(sched_job);
> > > +	complete_all(&entity->entity_idle);
> > > +	drm_sched_fence_scheduled(s_fence, fence);
> > > +
> > > +	if (!IS_ERR_OR_NULL(fence)) {
> > > +		/* Drop for original kref_init of the fence */
> > > +		dma_fence_put(fence);
> > > +
> > > +		r = dma_fence_add_callback(fence, &sched_job->cb,
> > > +					   drm_sched_job_done_cb);
> > > +		if (r == -ENOENT)
> > > +			drm_sched_job_done(sched_job, fence->error);
> > > +		else if (r)
> > > +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> > > +	} else {
> > > +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > +				   PTR_ERR(fence) : 0);
> > >  	}  
> > 
> > Just ran into a race condition when using a non-ordered workqueue
> > for drm_sched:
> > 
> > thread A							thread B
> > 
> > drm_sched_run_job_work()			
> >   drm_sched_job_begin()
> >     // inserts jobA in pending_list
> > 
> > 								drm_sched_free_job_work()
> > 								  drm_sched_get_cleanup_job()
> > 								    // check first job in pending list
> > 								    // if s_fence->parent == NULL, consider
> > 								    // for cleanup
> > 								  ->free_job(jobA)  
> > 								    drm_sched_job_cleanup()
> > 								      // set sched_job->s_fence = NULL
> > 
> >     ->run_job()  
> >     drm_sched_fence_scheduled()
> 
> Correction: the NULL pointer deref happens in drm_sched_job_done()
> (when the driver returns an error directly) not in
> drm_sched_fence_scheduled(), but the problem remains the same.
> 
> 

Trying to understand this. I don't see how drm_sched_get_cleanup_job can
return a job until dma_fence_is_signaled(&job->s_fence->finished) is
true. That fence is not signaled until drm_sched_fence_finished(s_fence,
result) is called in drm_sched_job_done().

What am I missing here?

Matt

> >       // sched_job->s_fence->parent = parent_fence
> >       // BOOM => NULL pointer deref
> > 
> > For now, I'll just use a dedicated ordered wq, but if we claim
> > multi-threaded workqueues are supported, this is probably worth fixing.
> > I know there's been some discussions about when the timeout should be
> > started, and the job insertion in the pending_list is kinda related.
> > If we want this insertion to happen before ->run_job() is called, we
> > need a way to flag when a job is inserted, but not fully submitted yet.
> > 
> > Regards,
> > 
> > Boris
>
Boris Brezillon Oct. 23, 2023, 2:22 p.m. UTC | #7
On Mon, 23 Oct 2023 13:54:13 +0000
Matthew Brost <matthew.brost@intel.com> wrote:

> On Mon, Oct 23, 2023 at 02:39:37PM +0200, Boris Brezillon wrote:
> > On Mon, 23 Oct 2023 14:16:06 +0200
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> >   
> > > Hi,
> > > 
> > > On Tue, 17 Oct 2023 08:09:56 -0700
> > > Matthew Brost <matthew.brost@intel.com> wrote:
> > >   
> > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > +{
> > > > +	struct drm_gpu_scheduler *sched =
> > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > +	struct drm_sched_entity *entity;
> > > > +	struct dma_fence *fence;
> > > > +	struct drm_sched_fence *s_fence;
> > > > +	struct drm_sched_job *sched_job;
> > > > +	int r;
> > > >  
> > > > -		atomic_inc(&sched->hw_rq_count);
> > > > -		drm_sched_job_begin(sched_job);
> > > > +	if (READ_ONCE(sched->pause_submit))
> > > > +		return;
> > > > +
> > > > +	entity = drm_sched_select_entity(sched, true);
> > > > +	if (!entity)
> > > > +		return;
> > > >  
> > > > -		trace_drm_run_job(sched_job, entity);
> > > > -		fence = sched->ops->run_job(sched_job);
> > > > +	sched_job = drm_sched_entity_pop_job(entity);
> > > > +	if (!sched_job) {
> > > >  		complete_all(&entity->entity_idle);
> > > > -		drm_sched_fence_scheduled(s_fence, fence);
> > > > +		return;	/* No more work */
> > > > +	}
> > > >  
> > > > -		if (!IS_ERR_OR_NULL(fence)) {
> > > > -			/* Drop for original kref_init of the fence */
> > > > -			dma_fence_put(fence);
> > > > +	s_fence = sched_job->s_fence;
> > > >  
> > > > -			r = dma_fence_add_callback(fence, &sched_job->cb,
> > > > -						   drm_sched_job_done_cb);
> > > > -			if (r == -ENOENT)
> > > > -				drm_sched_job_done(sched_job, fence->error);
> > > > -			else if (r)
> > > > -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > > > -					  r);
> > > > -		} else {
> > > > -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > > -					   PTR_ERR(fence) : 0);
> > > > -		}
> > > > +	atomic_inc(&sched->hw_rq_count);
> > > > +	drm_sched_job_begin(sched_job);
> > > >  
> > > > -		wake_up(&sched->job_scheduled);
> > > > +	trace_drm_run_job(sched_job, entity);
> > > > +	fence = sched->ops->run_job(sched_job);
> > > > +	complete_all(&entity->entity_idle);
> > > > +	drm_sched_fence_scheduled(s_fence, fence);
> > > > +
> > > > +	if (!IS_ERR_OR_NULL(fence)) {
> > > > +		/* Drop for original kref_init of the fence */
> > > > +		dma_fence_put(fence);
> > > > +
> > > > +		r = dma_fence_add_callback(fence, &sched_job->cb,
> > > > +					   drm_sched_job_done_cb);
> > > > +		if (r == -ENOENT)
> > > > +			drm_sched_job_done(sched_job, fence->error);
> > > > +		else if (r)
> > > > +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> > > > +	} else {
> > > > +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > > +				   PTR_ERR(fence) : 0);
> > > >  	}    
> > > 
> > > Just ran into a race condition when using a non-ordered workqueue
> > > for drm_sched:
> > > 
> > > thread A							thread B
> > > 
> > > drm_sched_run_job_work()			
> > >   drm_sched_job_begin()
> > >     // inserts jobA in pending_list
> > > 
> > > 								drm_sched_free_job_work()
> > > 								  drm_sched_get_cleanup_job()
> > > 								    // check first job in pending list
> > > 								    // if s_fence->parent == NULL, consider
> > > 								    // for cleanup  
> > > 								  ->free_job(jobA)    
> > > 								    drm_sched_job_cleanup()
> > > 								      // set sched_job->s_fence = NULL
> > >   
> > >     ->run_job()    
> > >     drm_sched_fence_scheduled()  
> > 
> > Correction: the NULL pointer deref happens in drm_sched_job_done()
> > (when the driver returns an error directly) not in
> > drm_sched_fence_scheduled(), but the problem remains the same.
> > 
> >   
> 
> Trying to understand this. I don't see how drm_sched_get_cleanup_job can
> return a job until dma_fence_is_signaled(&job->s_fence->finished) is
> true. That fence is no signaled until drm_sched_fence_finished(s_fence,
> result); called in drm_sched_job_done().
> 
> What am I missing here?

Uh, my bad. I was trying to backport panthor (and all its deps) to a 6.1
kernel, and the condition in drm_sched_get_cleanup_job() is different
there:

        if (job && (!job->s_fence->parent ||
                    dma_fence_is_signaled(job->s_fence->parent))) {
		// pick this job for cleanup
	}

Sorry for the noise.
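
Condensed, the difference between the two checks (sketches only, the
first taken from the 6.1 snippet above, the second matching the
finished-fence test this patch uses in drm_sched_free_job_queue_if_ready()):

	/* 6.1: parent is still NULL in the window between
	 * drm_sched_job_begin() and ->run_job() returning, so the job can
	 * be picked for cleanup while the run_job worker is still using
	 * job->s_fence. */
	if (job && (!job->s_fence->parent ||
		    dma_fence_is_signaled(job->s_fence->parent))) {
		/* freed too early -> NULL deref once ->run_job() returns */
	}

	/* current tree: the finished fence only signals from
	 * drm_sched_job_done(), i.e. after ->run_job() has returned, so
	 * the job cannot be freed out from under the run_job worker. */
	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
		/* safe to hand to free_job() */
	}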
Matthew Brost Oct. 23, 2023, 10:03 p.m. UTC | #8
On Mon, Oct 23, 2023 at 04:22:16PM +0200, Boris Brezillon wrote:
> On Mon, 23 Oct 2023 13:54:13 +0000
> Matthew Brost <matthew.brost@intel.com> wrote:
> 
> > On Mon, Oct 23, 2023 at 02:39:37PM +0200, Boris Brezillon wrote:
> > > On Mon, 23 Oct 2023 14:16:06 +0200
> > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > >   
> > > > Hi,
> > > > 
> > > > On Tue, 17 Oct 2023 08:09:56 -0700
> > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > >   
> > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > +{
> > > > > +	struct drm_gpu_scheduler *sched =
> > > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > +	struct drm_sched_entity *entity;
> > > > > +	struct dma_fence *fence;
> > > > > +	struct drm_sched_fence *s_fence;
> > > > > +	struct drm_sched_job *sched_job;
> > > > > +	int r;
> > > > >  
> > > > > -		atomic_inc(&sched->hw_rq_count);
> > > > > -		drm_sched_job_begin(sched_job);
> > > > > +	if (READ_ONCE(sched->pause_submit))
> > > > > +		return;
> > > > > +
> > > > > +	entity = drm_sched_select_entity(sched, true);
> > > > > +	if (!entity)
> > > > > +		return;
> > > > >  
> > > > > -		trace_drm_run_job(sched_job, entity);
> > > > > -		fence = sched->ops->run_job(sched_job);
> > > > > +	sched_job = drm_sched_entity_pop_job(entity);
> > > > > +	if (!sched_job) {
> > > > >  		complete_all(&entity->entity_idle);
> > > > > -		drm_sched_fence_scheduled(s_fence, fence);
> > > > > +		return;	/* No more work */
> > > > > +	}
> > > > >  
> > > > > -		if (!IS_ERR_OR_NULL(fence)) {
> > > > > -			/* Drop for original kref_init of the fence */
> > > > > -			dma_fence_put(fence);
> > > > > +	s_fence = sched_job->s_fence;
> > > > >  
> > > > > -			r = dma_fence_add_callback(fence, &sched_job->cb,
> > > > > -						   drm_sched_job_done_cb);
> > > > > -			if (r == -ENOENT)
> > > > > -				drm_sched_job_done(sched_job, fence->error);
> > > > > -			else if (r)
> > > > > -				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
> > > > > -					  r);
> > > > > -		} else {
> > > > > -			drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > > > -					   PTR_ERR(fence) : 0);
> > > > > -		}
> > > > > +	atomic_inc(&sched->hw_rq_count);
> > > > > +	drm_sched_job_begin(sched_job);
> > > > >  
> > > > > -		wake_up(&sched->job_scheduled);
> > > > > +	trace_drm_run_job(sched_job, entity);
> > > > > +	fence = sched->ops->run_job(sched_job);
> > > > > +	complete_all(&entity->entity_idle);
> > > > > +	drm_sched_fence_scheduled(s_fence, fence);
> > > > > +
> > > > > +	if (!IS_ERR_OR_NULL(fence)) {
> > > > > +		/* Drop for original kref_init of the fence */
> > > > > +		dma_fence_put(fence);
> > > > > +
> > > > > +		r = dma_fence_add_callback(fence, &sched_job->cb,
> > > > > +					   drm_sched_job_done_cb);
> > > > > +		if (r == -ENOENT)
> > > > > +			drm_sched_job_done(sched_job, fence->error);
> > > > > +		else if (r)
> > > > > +			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
> > > > > +	} else {
> > > > > +		drm_sched_job_done(sched_job, IS_ERR(fence) ?
> > > > > +				   PTR_ERR(fence) : 0);
> > > > >  	}    
> > > > 
> > > > Just ran into a race condition when using a non-ordered workqueue
> > > > for drm_sched:
> > > > 
> > > > thread A							thread B
> > > > 
> > > > drm_sched_run_job_work()			
> > > >   drm_sched_job_begin()
> > > >     // inserts jobA in pending_list
> > > > 
> > > > 								drm_sched_free_job_work()
> > > > 								  drm_sched_get_cleanup_job()
> > > > 								    // check first job in pending list
> > > > 								    // if s_fence->parent == NULL, consider
> > > > 								    // for cleanup  
> > > > 								  ->free_job(jobA)    
> > > > 								    drm_sched_job_cleanup()
> > > > 								      // set sched_job->s_fence = NULL
> > > >   
> > > >     ->run_job()    
> > > >     drm_sched_fence_scheduled()  
> > > 
> > > Correction: the NULL pointer deref happens in drm_sched_job_done()
> > > (when the driver returns an error directly) not in
> > > drm_sched_fence_scheduled(), but the problem remains the same.
> > > 
> > >   
> > 
> > Trying to understand this. I don't see how drm_sched_get_cleanup_job can
> > return a job until dma_fence_is_signaled(&job->s_fence->finished) is
> > true. That fence is no signaled until drm_sched_fence_finished(s_fence,
> > result); called in drm_sched_job_done().
> > 
> > What am I missing here?
> 
> Uh, my bad. I was trying to backport panthor (and all its deps) to a 6.1
> kernel, and the condition in drm_sched_get_cleanup_job() is different
> there:
> 
>         if (job && (!job->s_fence->parent ||
>                     dma_fence_is_signaled(job->s_fence->parent))) {
> 		// pick this job for cleanup
> 	}
> 
> Sorry for the noise.

No problem, glad this is resolved.

Matt

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 273e0fbc4eab..b1b8d9f96da5 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -213,11 +213,12 @@  void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
  *
  * @rq: scheduler run queue to check.
+ * @peek: Just find, don't set to current.
  *
  * Try to find a ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool peek)
 {
 	struct drm_sched_entity *entity;
 
@@ -227,8 +228,10 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
 	if (entity) {
 		list_for_each_entry_continue(entity, &rq->entities, list) {
 			if (drm_sched_entity_is_ready(entity)) {
-				rq->current_entity = entity;
-				reinit_completion(&entity->entity_idle);
+				if (!peek) {
+					rq->current_entity = entity;
+					reinit_completion(&entity->entity_idle);
+				}
 				spin_unlock(&rq->lock);
 				return entity;
 			}
@@ -238,8 +241,10 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
 	list_for_each_entry(entity, &rq->entities, list) {
 
 		if (drm_sched_entity_is_ready(entity)) {
-			rq->current_entity = entity;
-			reinit_completion(&entity->entity_idle);
+			if (!peek) {
+				rq->current_entity = entity;
+				reinit_completion(&entity->entity_idle);
+			}
 			spin_unlock(&rq->lock);
 			return entity;
 		}
@@ -257,11 +262,12 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
  *
  * @rq: scheduler run queue to check.
+ * @peek: Just find, don't set to current.
  *
  * Find oldest waiting ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool peek)
 {
 	struct rb_node *rb;
 
@@ -271,8 +277,10 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 
 		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
 		if (drm_sched_entity_is_ready(entity)) {
-			rq->current_entity = entity;
-			reinit_completion(&entity->entity_idle);
+			if (!peek) {
+				rq->current_entity = entity;
+				reinit_completion(&entity->entity_idle);
+			}
 			break;
 		}
 	}
@@ -282,13 +290,98 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 }
 
 /**
- * drm_sched_wqueue_enqueue - enqueue scheduler submission
+ * drm_sched_run_job_queue - enqueue run-job work
+ * @sched: scheduler instance
+ */
+static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
+{
+	if (!READ_ONCE(sched->pause_submit))
+		queue_work(sched->submit_wq, &sched->work_run_job);
+}
+
+/**
+ * drm_sched_can_queue -- Can we queue more to the hardware?
+ * @sched: scheduler instance
+ *
+ * Return true if we can push more jobs to the hw, otherwise false.
+ */
+static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
+{
+	return atomic_read(&sched->hw_rq_count) <
+		sched->hw_submission_limit;
+}
+
+/**
+ * drm_sched_select_entity - Select next entity to process
+ *
+ * @sched: scheduler instance
+ * @peek: Just find, don't set to current.
+ *
+ * Returns the entity to process or NULL if none are found.
+ */
+static struct drm_sched_entity *
+drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool peek)
+{
+	struct drm_sched_entity *entity;
+	int i;
+
+	if (!drm_sched_can_queue(sched))
+		return NULL;
+
+	if (sched->single_entity) {
+		if (!READ_ONCE(sched->single_entity->stopped) &&
+		    drm_sched_entity_is_ready(sched->single_entity))
+			return sched->single_entity;
+
+		return NULL;
+	}
+
+	/* Kernel run queue has higher priority than normal run queue*/
+	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
+		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
+			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i], peek) :
+			drm_sched_rq_select_entity_rr(&sched->sched_rq[i], peek);
+		if (entity)
+			break;
+	}
+
+	return entity;
+}
+
+/**
+ * drm_sched_run_job_queue_if_ready - enqueue run-job work if ready
+ * @sched: scheduler instance
+ */
+static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
+{
+	if (drm_sched_select_entity(sched, false))
+		drm_sched_run_job_queue(sched);
+}
+
+/**
+ * drm_sched_free_job_queue - enqueue free-job work
  * @sched: scheduler instance
  */
-static void drm_sched_wqueue_enqueue(struct drm_gpu_scheduler *sched)
+static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
 {
 	if (!READ_ONCE(sched->pause_submit))
-		queue_work(sched->submit_wq, &sched->work_submit);
+		queue_work(sched->submit_wq, &sched->work_free_job);
+}
+
+/**
+ * drm_sched_free_job_queue_if_ready - enqueue free-job work if ready
+ * @sched: scheduler instance
+ */
+static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *job;
+
+	spin_lock(&sched->job_list_lock);
+	job = list_first_entry_or_null(&sched->pending_list,
+				       struct drm_sched_job, list);
+	if (job && dma_fence_is_signaled(&job->s_fence->finished))
+		drm_sched_free_job_queue(sched);
+	spin_unlock(&sched->job_list_lock);
 }
 
 /**
@@ -310,7 +403,7 @@  static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
 	dma_fence_get(&s_fence->finished);
 	drm_sched_fence_finished(s_fence, result);
 	dma_fence_put(&s_fence->finished);
-	drm_sched_wqueue_enqueue(sched);
+	drm_sched_free_job_queue(sched);
 }
 
 /**
@@ -576,7 +669,7 @@  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 				drm_sched_job_done(s_job, fence->error);
 			else if (r)
 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
-					  r);
+					      r);
 		} else
 			drm_sched_job_done(s_job, -ECANCELED);
 	}
@@ -885,18 +978,6 @@  void drm_sched_job_cleanup(struct drm_sched_job *job)
 }
 EXPORT_SYMBOL(drm_sched_job_cleanup);
 
-/**
- * drm_sched_can_queue -- Can we queue more to the hardware?
- * @sched: scheduler instance
- *
- * Return true if we can push more jobs to the hw, otherwise false.
- */
-static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
-{
-	return atomic_read(&sched->hw_rq_count) <
-		sched->hw_submission_limit;
-}
-
 /**
  * drm_sched_wakeup_if_can_queue - Wake up the scheduler
  * @sched: scheduler instance
@@ -906,43 +987,7 @@  static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
 void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
 {
 	if (drm_sched_can_queue(sched))
-		drm_sched_wqueue_enqueue(sched);
-}
-
-/**
- * drm_sched_select_entity - Select next entity to process
- *
- * @sched: scheduler instance
- *
- * Returns the entity to process or NULL if none are found.
- */
-static struct drm_sched_entity *
-drm_sched_select_entity(struct drm_gpu_scheduler *sched)
-{
-	struct drm_sched_entity *entity;
-	int i;
-
-	if (!drm_sched_can_queue(sched))
-		return NULL;
-
-	if (sched->single_entity) {
-		if (!READ_ONCE(sched->single_entity->stopped) &&
-		    drm_sched_entity_is_ready(sched->single_entity))
-			return sched->single_entity;
-
-		return NULL;
-	}
-
-	/* Kernel run queue has higher priority than normal run queue*/
-	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
-		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
-			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
-			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
-		if (entity)
-			break;
-	}
-
-	return entity;
+		drm_sched_run_job_queue(sched);
 }
 
 /**
@@ -974,8 +1019,10 @@  drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 						typeof(*next), list);
 
 		if (next) {
-			next->s_fence->scheduled.timestamp =
-				dma_fence_timestamp(&job->s_fence->finished);
+			if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
+				     &next->s_fence->scheduled.flags))
+				next->s_fence->scheduled.timestamp =
+					dma_fence_timestamp(&job->s_fence->finished);
 			/* start TO timer for next job */
 			drm_sched_start_timeout(sched);
 		}
@@ -1025,74 +1072,83 @@  drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 EXPORT_SYMBOL(drm_sched_pick_best);
 
 /**
- * drm_sched_main - main scheduler thread
+ * drm_sched_free_job_work - worker to call free_job
  *
- * @param: scheduler instance
+ * @w: free job work
  */
-static void drm_sched_main(struct work_struct *w)
+static void drm_sched_free_job_work(struct work_struct *w)
 {
 	struct drm_gpu_scheduler *sched =
-		container_of(w, struct drm_gpu_scheduler, work_submit);
-	struct drm_sched_entity *entity;
+		container_of(w, struct drm_gpu_scheduler, work_free_job);
 	struct drm_sched_job *cleanup_job;
-	int r;
 
 	if (READ_ONCE(sched->pause_submit))
 		return;
 
 	cleanup_job = drm_sched_get_cleanup_job(sched);
-	entity = drm_sched_select_entity(sched);
-
-	if (!entity && !cleanup_job)
-		return;	/* No more work */
-
-	if (cleanup_job)
+	if (cleanup_job) {
 		sched->ops->free_job(cleanup_job);
 
-	if (entity) {
-		struct dma_fence *fence;
-		struct drm_sched_fence *s_fence;
-		struct drm_sched_job *sched_job;
-
-		sched_job = drm_sched_entity_pop_job(entity);
-		if (!sched_job) {
-			complete_all(&entity->entity_idle);
-			if (!cleanup_job)
-				return;	/* No more work */
-			goto again;
-		}
+		drm_sched_free_job_queue_if_ready(sched);
+		drm_sched_run_job_queue_if_ready(sched);
+	}
+}
 
-		s_fence = sched_job->s_fence;
+/**
+ * drm_sched_run_job_work - worker to call run_job
+ *
+ * @w: run job work
+ */
+static void drm_sched_run_job_work(struct work_struct *w)
+{
+	struct drm_gpu_scheduler *sched =
+		container_of(w, struct drm_gpu_scheduler, work_run_job);
+	struct drm_sched_entity *entity;
+	struct dma_fence *fence;
+	struct drm_sched_fence *s_fence;
+	struct drm_sched_job *sched_job;
+	int r;
 
-		atomic_inc(&sched->hw_rq_count);
-		drm_sched_job_begin(sched_job);
+	if (READ_ONCE(sched->pause_submit))
+		return;
+
+	entity = drm_sched_select_entity(sched, true);
+	if (!entity)
+		return;
 
-		trace_drm_run_job(sched_job, entity);
-		fence = sched->ops->run_job(sched_job);
+	sched_job = drm_sched_entity_pop_job(entity);
+	if (!sched_job) {
 		complete_all(&entity->entity_idle);
-		drm_sched_fence_scheduled(s_fence, fence);
+		return;	/* No more work */
+	}
 
-		if (!IS_ERR_OR_NULL(fence)) {
-			/* Drop for original kref_init of the fence */
-			dma_fence_put(fence);
+	s_fence = sched_job->s_fence;
 
-			r = dma_fence_add_callback(fence, &sched_job->cb,
-						   drm_sched_job_done_cb);
-			if (r == -ENOENT)
-				drm_sched_job_done(sched_job, fence->error);
-			else if (r)
-				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
-					  r);
-		} else {
-			drm_sched_job_done(sched_job, IS_ERR(fence) ?
-					   PTR_ERR(fence) : 0);
-		}
+	atomic_inc(&sched->hw_rq_count);
+	drm_sched_job_begin(sched_job);
 
-		wake_up(&sched->job_scheduled);
+	trace_drm_run_job(sched_job, entity);
+	fence = sched->ops->run_job(sched_job);
+	complete_all(&entity->entity_idle);
+	drm_sched_fence_scheduled(s_fence, fence);
+
+	if (!IS_ERR_OR_NULL(fence)) {
+		/* Drop for original kref_init of the fence */
+		dma_fence_put(fence);
+
+		r = dma_fence_add_callback(fence, &sched_job->cb,
+					   drm_sched_job_done_cb);
+		if (r == -ENOENT)
+			drm_sched_job_done(sched_job, fence->error);
+		else if (r)
+			DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
+	} else {
+		drm_sched_job_done(sched_job, IS_ERR(fence) ?
+				   PTR_ERR(fence) : 0);
 	}
 
-again:
-	drm_sched_wqueue_enqueue(sched);
+	wake_up(&sched->job_scheduled);
+	drm_sched_run_job_queue_if_ready(sched);
 }
 
 /**
@@ -1159,7 +1215,8 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
-	INIT_WORK(&sched->work_submit, drm_sched_main);
+	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
+	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
 	atomic_set(&sched->_score, 0);
 	atomic64_set(&sched->job_id_count, 0);
 	sched->pause_submit = false;
@@ -1282,7 +1339,8 @@  EXPORT_SYMBOL(drm_sched_wqueue_ready);
 void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, true);
-	cancel_work_sync(&sched->work_submit);
+	cancel_work_sync(&sched->work_run_job);
+	cancel_work_sync(&sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_stop);
 
@@ -1294,6 +1352,7 @@  EXPORT_SYMBOL(drm_sched_wqueue_stop);
 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, false);
-	queue_work(sched->submit_wq, &sched->work_submit);
+	queue_work(sched->submit_wq, &sched->work_run_job);
+	queue_work(sched->submit_wq, &sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_start);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index e67b53c3546b..625ffe040de3 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -487,9 +487,10 @@  struct drm_sched_backend_ops {
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
- * @submit_wq: workqueue used to queue @work_submit
+ * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
- * @work_submit: schedules jobs and cleans up entities
+ * @work_run_job: schedules jobs
+ * @work_free_job: cleans up jobs
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
  * @pending_list: the list of jobs which are currently in the job queue.
@@ -519,7 +520,8 @@  struct drm_gpu_scheduler {
 	atomic64_t			job_id_count;
 	struct workqueue_struct		*submit_wq;
 	struct workqueue_struct		*timeout_wq;
-	struct work_struct		work_submit;
+	struct work_struct		work_run_job;
+	struct work_struct		work_free_job;
 	struct delayed_work		work_tdr;
 	struct list_head		pending_list;
 	spinlock_t			job_list_lock;