diff mbox series

[v2,4/9] drm/sched: Split free_job into own work item

Message ID 20230811023137.659037-5-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series DRM scheduler changes for Xe | expand

Commit Message

Matthew Brost Aug. 11, 2023, 2:31 a.m. UTC
Rather than call free_job and run_job in same work item have a dedicated
work item for each. This aligns with the design and intended use of work
queues.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
 include/drm/gpu_scheduler.h            |   8 +-
 2 files changed, 106 insertions(+), 39 deletions(-)

Comments

Christian König Aug. 17, 2023, 1:39 p.m. UTC | #1
Am 11.08.23 um 04:31 schrieb Matthew Brost:
> Rather than call free_job and run_job in same work item have a dedicated
> work item for each. This aligns with the design and intended use of work
> queues.

I would rather say we should get completely rid of the free_job callback.

Essentially the job is just the container which carries the information 
which are necessary before you push it to the hw. The real 
representation of the submission is actually the scheduler fence.

All the lifetime issues we had came from ignoring this fact and I think 
we should push for fixing this design up again.

Regards,
Christian.

>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>   include/drm/gpu_scheduler.h            |   8 +-
>   2 files changed, 106 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index cede47afc800..b67469eac179 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>    *
>    * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>    *
>    * Try to find a ready entity, returns NULL if none found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>   {
>   	struct drm_sched_entity *entity;
>   
> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   	if (entity) {
>   		list_for_each_entry_continue(entity, &rq->entities, list) {
>   			if (drm_sched_entity_is_ready(entity)) {
> -				rq->current_entity = entity;
> -				reinit_completion(&entity->entity_idle);
> +				if (dequeue) {
> +					rq->current_entity = entity;
> +					reinit_completion(&entity->entity_idle);
> +				}
>   				spin_unlock(&rq->lock);
>   				return entity;
>   			}
> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   	list_for_each_entry(entity, &rq->entities, list) {
>   
>   		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>   			spin_unlock(&rq->lock);
>   			return entity;
>   		}
> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>    *
>    * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>    *
>    * Find oldest waiting ready entity, returns NULL if none found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>   {
>   	struct rb_node *rb;
>   
> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>   
>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>   		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>   			break;
>   		}
>   	}
> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>   }
>   
>   /**
> - * drm_sched_submit_queue - scheduler queue submission
> + * drm_sched_run_job_queue - queue job submission
>    * @sched: scheduler instance
>    */
> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>   {
>   	if (!READ_ONCE(sched->pause_submit))
> -		queue_work(sched->submit_wq, &sched->work_submit);
> +		queue_work(sched->submit_wq, &sched->work_run_job);
> +}
> +
> +static struct drm_sched_entity *
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> +
> +/**
> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> + * @sched: scheduler instance
> + */
> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	if (drm_sched_select_entity(sched, false))
> +		drm_sched_run_job_queue(sched);
> +}
> +
> +/**
> + * drm_sched_free_job_queue - queue free job
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> +{
> +	if (!READ_ONCE(sched->pause_submit))
> +		queue_work(sched->submit_wq, &sched->work_free_job);
> +}
> +
> +/**
> + * drm_sched_free_job_queue_if_ready - queue free job if ready
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	struct drm_sched_job *job;
> +
> +	spin_lock(&sched->job_list_lock);
> +	job = list_first_entry_or_null(&sched->pending_list,
> +				       struct drm_sched_job, list);
> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> +		drm_sched_free_job_queue(sched);
> +	spin_unlock(&sched->job_list_lock);
>   }
>   
>   /**
> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>   	dma_fence_get(&s_fence->finished);
>   	drm_sched_fence_finished(s_fence, result);
>   	dma_fence_put(&s_fence->finished);
> -	drm_sched_submit_queue(sched);
> +	drm_sched_free_job_queue(sched);
>   }
>   
>   /**
> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>   {
>   	if (drm_sched_can_queue(sched))
> -		drm_sched_submit_queue(sched);
> +		drm_sched_run_job_queue(sched);
>   }
>   
>   /**
>    * drm_sched_select_entity - Select next entity to process
>    *
>    * @sched: scheduler instance
> + * @dequeue: dequeue selected entity
>    *
>    * Returns the entity to process or NULL if none are found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>   {
>   	struct drm_sched_entity *entity;
>   	int i;
> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>   	/* Kernel run queue has higher priority than normal run queue*/
>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> +							dequeue) :
> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> +						      dequeue);
>   		if (entity)
>   			break;
>   	}
> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>   EXPORT_SYMBOL(drm_sched_pick_best);
>   
>   /**
> - * drm_sched_main - main scheduler thread
> + * drm_sched_free_job_work - worker to call free_job
>    *
> - * @param: scheduler instance
> + * @w: free job work
>    */
> -static void drm_sched_main(struct work_struct *w)
> +static void drm_sched_free_job_work(struct work_struct *w)
>   {
>   	struct drm_gpu_scheduler *sched =
> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> -	struct drm_sched_entity *entity;
> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>   	struct drm_sched_job *cleanup_job;
> -	int r;
>   
>   	if (READ_ONCE(sched->pause_submit))
>   		return;
>   
>   	cleanup_job = drm_sched_get_cleanup_job(sched);
> -	entity = drm_sched_select_entity(sched);
> +	if (cleanup_job) {
> +		sched->ops->free_job(cleanup_job);
> +
> +		drm_sched_free_job_queue_if_ready(sched);
> +		drm_sched_run_job_queue_if_ready(sched);
> +	}
> +}
>   
> -	if (!entity && !cleanup_job)
> -		return;	/* No more work */
> +/**
> + * drm_sched_run_job_work - worker to call run_job
> + *
> + * @w: run job work
> + */
> +static void drm_sched_run_job_work(struct work_struct *w)
> +{
> +	struct drm_gpu_scheduler *sched =
> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> +	struct drm_sched_entity *entity;
> +	int r;
>   
> -	if (cleanup_job)
> -		sched->ops->free_job(cleanup_job);
> +	if (READ_ONCE(sched->pause_submit))
> +		return;
>   
> +	entity = drm_sched_select_entity(sched, true);
>   	if (entity) {
>   		struct dma_fence *fence;
>   		struct drm_sched_fence *s_fence;
> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>   		sched_job = drm_sched_entity_pop_job(entity);
>   		if (!sched_job) {
>   			complete_all(&entity->entity_idle);
> -			if (!cleanup_job)
> -				return;	/* No more work */
> -			goto again;
> +			return;	/* No more work */
>   		}
>   
>   		s_fence = sched_job->s_fence;
> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>   		}
>   
>   		wake_up(&sched->job_scheduled);
> +		drm_sched_run_job_queue_if_ready(sched);
>   	}
> -
> -again:
> -	drm_sched_submit_queue(sched);
>   }
>   
>   /**
> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   	spin_lock_init(&sched->job_list_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> -	INIT_WORK(&sched->work_submit, drm_sched_main);
> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>   	atomic_set(&sched->_score, 0);
>   	atomic64_set(&sched->job_id_count, 0);
>   	sched->pause_submit = false;
> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>   {
>   	WRITE_ONCE(sched->pause_submit, true);
> -	cancel_work_sync(&sched->work_submit);
> +	cancel_work_sync(&sched->work_run_job);
> +	cancel_work_sync(&sched->work_free_job);
>   }
>   EXPORT_SYMBOL(drm_sched_submit_stop);
>   
> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>   {
>   	WRITE_ONCE(sched->pause_submit, false);
> -	queue_work(sched->submit_wq, &sched->work_submit);
> +	queue_work(sched->submit_wq, &sched->work_run_job);
> +	queue_work(sched->submit_wq, &sched->work_free_job);
>   }
>   EXPORT_SYMBOL(drm_sched_submit_start);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 04eec2d7635f..fbc083a92757 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>    *                 finished.
>    * @hw_rq_count: the number of jobs currently in the hardware queue.
>    * @job_id_count: used to assign unique id to the each job.
> - * @submit_wq: workqueue used to queue @work_submit
> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>    * @timeout_wq: workqueue used to queue @work_tdr
> - * @work_submit: schedules jobs and cleans up entities
> + * @work_run_job: schedules jobs
> + * @work_free_job: cleans up jobs
>    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>    *            timeout interval is over.
>    * @pending_list: the list of jobs which are currently in the job queue.
> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>   	atomic64_t			job_id_count;
>   	struct workqueue_struct		*submit_wq;
>   	struct workqueue_struct		*timeout_wq;
> -	struct work_struct		work_submit;
> +	struct work_struct		work_run_job;
> +	struct work_struct		work_free_job;
>   	struct delayed_work		work_tdr;
>   	struct list_head		pending_list;
>   	spinlock_t			job_list_lock;
Matthew Brost Aug. 17, 2023, 5:54 p.m. UTC | #2
On Thu, Aug 17, 2023 at 03:39:40PM +0200, Christian König wrote:
> Am 11.08.23 um 04:31 schrieb Matthew Brost:
> > Rather than call free_job and run_job in same work item have a dedicated
> > work item for each. This aligns with the design and intended use of work
> > queues.
> 
> I would rather say we should get completely rid of the free_job callback.
> 

Would we still have work item? e.g. Would we still want to call
drm_sched_get_cleanup_job which removes the job from the pending list
and adjusts the TDR? Trying to figure out out what this looks like. We
probably can't do all of this from an IRQ context.

> Essentially the job is just the container which carries the information
> which are necessary before you push it to the hw. The real representation of
> the submission is actually the scheduler fence.
>

Most of the free_jobs call plus drm_sched_job_cleanup + a put on job. In
Xe this cannot be called from an IRQ context either.

I'm just confused what exactly you are suggesting here.

Matt

> All the lifetime issues we had came from ignoring this fact and I think we
> should push for fixing this design up again.
> 
> Regards,
> Christian.
> 
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> >   include/drm/gpu_scheduler.h            |   8 +-
> >   2 files changed, 106 insertions(+), 39 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index cede47afc800..b67469eac179 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> >    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> >    *
> >    * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Try to find a ready entity, returns NULL if none found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> >   {
> >   	struct drm_sched_entity *entity;
> > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   	if (entity) {
> >   		list_for_each_entry_continue(entity, &rq->entities, list) {
> >   			if (drm_sched_entity_is_ready(entity)) {
> > -				rq->current_entity = entity;
> > -				reinit_completion(&entity->entity_idle);
> > +				if (dequeue) {
> > +					rq->current_entity = entity;
> > +					reinit_completion(&entity->entity_idle);
> > +				}
> >   				spin_unlock(&rq->lock);
> >   				return entity;
> >   			}
> > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   	list_for_each_entry(entity, &rq->entities, list) {
> >   		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >   			spin_unlock(&rq->lock);
> >   			return entity;
> >   		}
> > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> >    *
> >    * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Find oldest waiting ready entity, returns NULL if none found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> >   {
> >   	struct rb_node *rb;
> > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> >   		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >   			break;
> >   		}
> >   	}
> > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >   }
> >   /**
> > - * drm_sched_submit_queue - scheduler queue submission
> > + * drm_sched_run_job_queue - queue job submission
> >    * @sched: scheduler instance
> >    */
> > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> >   {
> >   	if (!READ_ONCE(sched->pause_submit))
> > -		queue_work(sched->submit_wq, &sched->work_submit);
> > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > +}
> > +
> > +static struct drm_sched_entity *
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > +
> > +/**
> > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > + * @sched: scheduler instance
> > + */
> > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (drm_sched_select_entity(sched, false))
> > +		drm_sched_run_job_queue(sched);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue - queue free job
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (!READ_ONCE(sched->pause_submit))
> > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	struct drm_sched_job *job;
> > +
> > +	spin_lock(&sched->job_list_lock);
> > +	job = list_first_entry_or_null(&sched->pending_list,
> > +				       struct drm_sched_job, list);
> > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > +		drm_sched_free_job_queue(sched);
> > +	spin_unlock(&sched->job_list_lock);
> >   }
> >   /**
> > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> >   	dma_fence_get(&s_fence->finished);
> >   	drm_sched_fence_finished(s_fence, result);
> >   	dma_fence_put(&s_fence->finished);
> > -	drm_sched_submit_queue(sched);
> > +	drm_sched_free_job_queue(sched);
> >   }
> >   /**
> > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> >   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> >   {
> >   	if (drm_sched_can_queue(sched))
> > -		drm_sched_submit_queue(sched);
> > +		drm_sched_run_job_queue(sched);
> >   }
> >   /**
> >    * drm_sched_select_entity - Select next entity to process
> >    *
> >    * @sched: scheduler instance
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Returns the entity to process or NULL if none are found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> >   {
> >   	struct drm_sched_entity *entity;
> >   	int i;
> > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> >   	/* Kernel run queue has higher priority than normal run queue*/
> >   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> >   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > +							dequeue) :
> > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > +						      dequeue);
> >   		if (entity)
> >   			break;
> >   	}
> > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> >   EXPORT_SYMBOL(drm_sched_pick_best);
> >   /**
> > - * drm_sched_main - main scheduler thread
> > + * drm_sched_free_job_work - worker to call free_job
> >    *
> > - * @param: scheduler instance
> > + * @w: free job work
> >    */
> > -static void drm_sched_main(struct work_struct *w)
> > +static void drm_sched_free_job_work(struct work_struct *w)
> >   {
> >   	struct drm_gpu_scheduler *sched =
> > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > -	struct drm_sched_entity *entity;
> > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> >   	struct drm_sched_job *cleanup_job;
> > -	int r;
> >   	if (READ_ONCE(sched->pause_submit))
> >   		return;
> >   	cleanup_job = drm_sched_get_cleanup_job(sched);
> > -	entity = drm_sched_select_entity(sched);
> > +	if (cleanup_job) {
> > +		sched->ops->free_job(cleanup_job);
> > +
> > +		drm_sched_free_job_queue_if_ready(sched);
> > +		drm_sched_run_job_queue_if_ready(sched);
> > +	}
> > +}
> > -	if (!entity && !cleanup_job)
> > -		return;	/* No more work */
> > +/**
> > + * drm_sched_run_job_work - worker to call run_job
> > + *
> > + * @w: run job work
> > + */
> > +static void drm_sched_run_job_work(struct work_struct *w)
> > +{
> > +	struct drm_gpu_scheduler *sched =
> > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > +	struct drm_sched_entity *entity;
> > +	int r;
> > -	if (cleanup_job)
> > -		sched->ops->free_job(cleanup_job);
> > +	if (READ_ONCE(sched->pause_submit))
> > +		return;
> > +	entity = drm_sched_select_entity(sched, true);
> >   	if (entity) {
> >   		struct dma_fence *fence;
> >   		struct drm_sched_fence *s_fence;
> > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> >   		sched_job = drm_sched_entity_pop_job(entity);
> >   		if (!sched_job) {
> >   			complete_all(&entity->entity_idle);
> > -			if (!cleanup_job)
> > -				return;	/* No more work */
> > -			goto again;
> > +			return;	/* No more work */
> >   		}
> >   		s_fence = sched_job->s_fence;
> > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> >   		}
> >   		wake_up(&sched->job_scheduled);
> > +		drm_sched_run_job_queue_if_ready(sched);
> >   	}
> > -
> > -again:
> > -	drm_sched_submit_queue(sched);
> >   }
> >   /**
> > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >   	spin_lock_init(&sched->job_list_lock);
> >   	atomic_set(&sched->hw_rq_count, 0);
> >   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> >   	atomic_set(&sched->_score, 0);
> >   	atomic64_set(&sched->job_id_count, 0);
> >   	sched->pause_submit = false;
> > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> >   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> >   {
> >   	WRITE_ONCE(sched->pause_submit, true);
> > -	cancel_work_sync(&sched->work_submit);
> > +	cancel_work_sync(&sched->work_run_job);
> > +	cancel_work_sync(&sched->work_free_job);
> >   }
> >   EXPORT_SYMBOL(drm_sched_submit_stop);
> > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> >   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> >   {
> >   	WRITE_ONCE(sched->pause_submit, false);
> > -	queue_work(sched->submit_wq, &sched->work_submit);
> > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > +	queue_work(sched->submit_wq, &sched->work_free_job);
> >   }
> >   EXPORT_SYMBOL(drm_sched_submit_start);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 04eec2d7635f..fbc083a92757 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> >    *                 finished.
> >    * @hw_rq_count: the number of jobs currently in the hardware queue.
> >    * @job_id_count: used to assign unique id to the each job.
> > - * @submit_wq: workqueue used to queue @work_submit
> > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> >    * @timeout_wq: workqueue used to queue @work_tdr
> > - * @work_submit: schedules jobs and cleans up entities
> > + * @work_run_job: schedules jobs
> > + * @work_free_job: cleans up jobs
> >    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> >    *            timeout interval is over.
> >    * @pending_list: the list of jobs which are currently in the job queue.
> > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> >   	atomic64_t			job_id_count;
> >   	struct workqueue_struct		*submit_wq;
> >   	struct workqueue_struct		*timeout_wq;
> > -	struct work_struct		work_submit;
> > +	struct work_struct		work_run_job;
> > +	struct work_struct		work_free_job;
> >   	struct delayed_work		work_tdr;
> >   	struct list_head		pending_list;
> >   	spinlock_t			job_list_lock;
>
Christian König Aug. 18, 2023, 5:27 a.m. UTC | #3
Am 17.08.23 um 19:54 schrieb Matthew Brost:
> On Thu, Aug 17, 2023 at 03:39:40PM +0200, Christian König wrote:
>> Am 11.08.23 um 04:31 schrieb Matthew Brost:
>>> Rather than call free_job and run_job in same work item have a dedicated
>>> work item for each. This aligns with the design and intended use of work
>>> queues.
>> I would rather say we should get completely rid of the free_job callback.
>>
> Would we still have work item? e.g. Would we still want to call
> drm_sched_get_cleanup_job which removes the job from the pending list
> and adjusts the TDR? Trying to figure out out what this looks like. We
> probably can't do all of this from an IRQ context.
>
>> Essentially the job is just the container which carries the information
>> which are necessary before you push it to the hw. The real representation of
>> the submission is actually the scheduler fence.
>>
> Most of the free_jobs call plus drm_sched_job_cleanup + a put on job. In
> Xe this cannot be called from an IRQ context either.
>
> I'm just confused what exactly you are suggesting here.

To summarize on one sentence: Instead of the job we keep the scheduler 
and hardware fences around after pushing the job to the hw.

The free_job callback would then be replaced by dropping the reference 
on the scheduler and hw fence.

Would that work for you?

Christian.

>
> Matt
>
>> All the lifetime issues we had came from ignoring this fact and I think we
>> should push for fixing this design up again.
>>
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>    include/drm/gpu_scheduler.h            |   8 +-
>>>    2 files changed, 106 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index cede47afc800..b67469eac179 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>     * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>     *
>>>     * @rq: scheduler run queue to check.
>>> + * @dequeue: dequeue selected entity
>>>     *
>>>     * Try to find a ready entity, returns NULL if none found.
>>>     */
>>>    static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>    {
>>>    	struct drm_sched_entity *entity;
>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>    	if (entity) {
>>>    		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>    			if (drm_sched_entity_is_ready(entity)) {
>>> -				rq->current_entity = entity;
>>> -				reinit_completion(&entity->entity_idle);
>>> +				if (dequeue) {
>>> +					rq->current_entity = entity;
>>> +					reinit_completion(&entity->entity_idle);
>>> +				}
>>>    				spin_unlock(&rq->lock);
>>>    				return entity;
>>>    			}
>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>    	list_for_each_entry(entity, &rq->entities, list) {
>>>    		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (dequeue) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>    			spin_unlock(&rq->lock);
>>>    			return entity;
>>>    		}
>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>     * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>     *
>>>     * @rq: scheduler run queue to check.
>>> + * @dequeue: dequeue selected entity
>>>     *
>>>     * Find oldest waiting ready entity, returns NULL if none found.
>>>     */
>>>    static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>    {
>>>    	struct rb_node *rb;
>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>    		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>    		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (dequeue) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>    			break;
>>>    		}
>>>    	}
>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>    }
>>>    /**
>>> - * drm_sched_submit_queue - scheduler queue submission
>>> + * drm_sched_run_job_queue - queue job submission
>>>     * @sched: scheduler instance
>>>     */
>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>    {
>>>    	if (!READ_ONCE(sched->pause_submit))
>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>> +}
>>> +
>>> +static struct drm_sched_entity *
>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>> +
>>> +/**
>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>> + * @sched: scheduler instance
>>> + */
>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (drm_sched_select_entity(sched, false))
>>> +		drm_sched_run_job_queue(sched);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue - queue free job
>>> + *
>>> + * @sched: scheduler instance to queue free job
>>> + */
>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (!READ_ONCE(sched->pause_submit))
>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>> + *
>>> + * @sched: scheduler instance to queue free job
>>> + */
>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	struct drm_sched_job *job;
>>> +
>>> +	spin_lock(&sched->job_list_lock);
>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>> +				       struct drm_sched_job, list);
>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>> +		drm_sched_free_job_queue(sched);
>>> +	spin_unlock(&sched->job_list_lock);
>>>    }
>>>    /**
>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>    	dma_fence_get(&s_fence->finished);
>>>    	drm_sched_fence_finished(s_fence, result);
>>>    	dma_fence_put(&s_fence->finished);
>>> -	drm_sched_submit_queue(sched);
>>> +	drm_sched_free_job_queue(sched);
>>>    }
>>>    /**
>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>    void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>    {
>>>    	if (drm_sched_can_queue(sched))
>>> -		drm_sched_submit_queue(sched);
>>> +		drm_sched_run_job_queue(sched);
>>>    }
>>>    /**
>>>     * drm_sched_select_entity - Select next entity to process
>>>     *
>>>     * @sched: scheduler instance
>>> + * @dequeue: dequeue selected entity
>>>     *
>>>     * Returns the entity to process or NULL if none are found.
>>>     */
>>>    static struct drm_sched_entity *
>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>    {
>>>    	struct drm_sched_entity *entity;
>>>    	int i;
>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>    	/* Kernel run queue has higher priority than normal run queue*/
>>>    	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>    		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>> +							dequeue) :
>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>> +						      dequeue);
>>>    		if (entity)
>>>    			break;
>>>    	}
>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>    EXPORT_SYMBOL(drm_sched_pick_best);
>>>    /**
>>> - * drm_sched_main - main scheduler thread
>>> + * drm_sched_free_job_work - worker to call free_job
>>>     *
>>> - * @param: scheduler instance
>>> + * @w: free job work
>>>     */
>>> -static void drm_sched_main(struct work_struct *w)
>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>    {
>>>    	struct drm_gpu_scheduler *sched =
>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>> -	struct drm_sched_entity *entity;
>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>    	struct drm_sched_job *cleanup_job;
>>> -	int r;
>>>    	if (READ_ONCE(sched->pause_submit))
>>>    		return;
>>>    	cleanup_job = drm_sched_get_cleanup_job(sched);
>>> -	entity = drm_sched_select_entity(sched);
>>> +	if (cleanup_job) {
>>> +		sched->ops->free_job(cleanup_job);
>>> +
>>> +		drm_sched_free_job_queue_if_ready(sched);
>>> +		drm_sched_run_job_queue_if_ready(sched);
>>> +	}
>>> +}
>>> -	if (!entity && !cleanup_job)
>>> -		return;	/* No more work */
>>> +/**
>>> + * drm_sched_run_job_work - worker to call run_job
>>> + *
>>> + * @w: run job work
>>> + */
>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>> +{
>>> +	struct drm_gpu_scheduler *sched =
>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>> +	struct drm_sched_entity *entity;
>>> +	int r;
>>> -	if (cleanup_job)
>>> -		sched->ops->free_job(cleanup_job);
>>> +	if (READ_ONCE(sched->pause_submit))
>>> +		return;
>>> +	entity = drm_sched_select_entity(sched, true);
>>>    	if (entity) {
>>>    		struct dma_fence *fence;
>>>    		struct drm_sched_fence *s_fence;
>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>    		sched_job = drm_sched_entity_pop_job(entity);
>>>    		if (!sched_job) {
>>>    			complete_all(&entity->entity_idle);
>>> -			if (!cleanup_job)
>>> -				return;	/* No more work */
>>> -			goto again;
>>> +			return;	/* No more work */
>>>    		}
>>>    		s_fence = sched_job->s_fence;
>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>    		}
>>>    		wake_up(&sched->job_scheduled);
>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>    	}
>>> -
>>> -again:
>>> -	drm_sched_submit_queue(sched);
>>>    }
>>>    /**
>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>    	spin_lock_init(&sched->job_list_lock);
>>>    	atomic_set(&sched->hw_rq_count, 0);
>>>    	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>    	atomic_set(&sched->_score, 0);
>>>    	atomic64_set(&sched->job_id_count, 0);
>>>    	sched->pause_submit = false;
>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>    void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>    {
>>>    	WRITE_ONCE(sched->pause_submit, true);
>>> -	cancel_work_sync(&sched->work_submit);
>>> +	cancel_work_sync(&sched->work_run_job);
>>> +	cancel_work_sync(&sched->work_free_job);
>>>    }
>>>    EXPORT_SYMBOL(drm_sched_submit_stop);
>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>    void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>    {
>>>    	WRITE_ONCE(sched->pause_submit, false);
>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>    }
>>>    EXPORT_SYMBOL(drm_sched_submit_start);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 04eec2d7635f..fbc083a92757 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>     *                 finished.
>>>     * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>     * @job_id_count: used to assign unique id to the each job.
>>> - * @submit_wq: workqueue used to queue @work_submit
>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>     * @timeout_wq: workqueue used to queue @work_tdr
>>> - * @work_submit: schedules jobs and cleans up entities
>>> + * @work_run_job: schedules jobs
>>> + * @work_free_job: cleans up jobs
>>>     * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>     *            timeout interval is over.
>>>     * @pending_list: the list of jobs which are currently in the job queue.
>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>    	atomic64_t			job_id_count;
>>>    	struct workqueue_struct		*submit_wq;
>>>    	struct workqueue_struct		*timeout_wq;
>>> -	struct work_struct		work_submit;
>>> +	struct work_struct		work_run_job;
>>> +	struct work_struct		work_free_job;
>>>    	struct delayed_work		work_tdr;
>>>    	struct list_head		pending_list;
>>>    	spinlock_t			job_list_lock;
Matthew Brost Aug. 18, 2023, 1:13 p.m. UTC | #4
On Fri, Aug 18, 2023 at 07:27:33AM +0200, Christian König wrote:
> Am 17.08.23 um 19:54 schrieb Matthew Brost:
> > On Thu, Aug 17, 2023 at 03:39:40PM +0200, Christian König wrote:
> > > Am 11.08.23 um 04:31 schrieb Matthew Brost:
> > > > Rather than call free_job and run_job in same work item have a dedicated
> > > > work item for each. This aligns with the design and intended use of work
> > > > queues.
> > > I would rather say we should get completely rid of the free_job callback.
> > > 
> > Would we still have work item? e.g. Would we still want to call
> > drm_sched_get_cleanup_job which removes the job from the pending list
> > and adjusts the TDR? Trying to figure out out what this looks like. We
> > probably can't do all of this from an IRQ context.
> > 
> > > Essentially the job is just the container which carries the information
> > > which are necessary before you push it to the hw. The real representation of
> > > the submission is actually the scheduler fence.
> > > 
> > Most of the free_jobs call plus drm_sched_job_cleanup + a put on job. In
> > Xe this cannot be called from an IRQ context either.
> > 
> > I'm just confused what exactly you are suggesting here.
> 
> To summarize on one sentence: Instead of the job we keep the scheduler and
> hardware fences around after pushing the job to the hw.
> 
> The free_job callback would then be replaced by dropping the reference on
> the scheduler and hw fence.
> 
> Would that work for you?
> 

I don't think so for a few reasons.

The job and hw fence are different structures (also different allocs too)
for a reason. The job referenced until it is complete (hw fence is
signaled) and the free_job is called. This reference is needed for the
TDR to work properly and also some reset flows too. Also in Xe some of
things done in free_job cannot be from an IRQ context, hence calling
this from the scheduler worker is rather helpful.

The HW fence can live for longer as it can be installed in dma-resv
slots, syncobjs, etc... If the job and hw fence are combined now we
holding on the memory for the longer and perhaps at the mercy of the
user. We also run the risk of the final put being done from an IRQ
context which again wont work in Xe as it is currently coded. Lastly 2
jobs from the same scheduler could do the final put in parallel, so
rather than having free_job serialized by the worker now multiple jobs
are freeing themselves at the same time. This might not be an issue but
adds another level of raceyness that needs to be accounted for. None of
this sounds desirable to me.

FWIW what you suggesting sounds like how the i915 did things
(i915_request and hw fence in 1 memory alloc) and that turned out to be
a huge mess. As rule of thumb I generally do the opposite of whatever
the i915 did.

Matt

> Christian.
> 
> > 
> > Matt
> > 
> > > All the lifetime issues we had came from ignoring this fact and I think we
> > > should push for fixing this design up again.
> > > 
> > > Regards,
> > > Christian.
> > > 
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > >    include/drm/gpu_scheduler.h            |   8 +-
> > > >    2 files changed, 106 insertions(+), 39 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > index cede47afc800..b67469eac179 100644
> > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > >     * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > >     *
> > > >     * @rq: scheduler run queue to check.
> > > > + * @dequeue: dequeue selected entity
> > > >     *
> > > >     * Try to find a ready entity, returns NULL if none found.
> > > >     */
> > > >    static struct drm_sched_entity *
> > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > >    {
> > > >    	struct drm_sched_entity *entity;
> > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >    	if (entity) {
> > > >    		list_for_each_entry_continue(entity, &rq->entities, list) {
> > > >    			if (drm_sched_entity_is_ready(entity)) {
> > > > -				rq->current_entity = entity;
> > > > -				reinit_completion(&entity->entity_idle);
> > > > +				if (dequeue) {
> > > > +					rq->current_entity = entity;
> > > > +					reinit_completion(&entity->entity_idle);
> > > > +				}
> > > >    				spin_unlock(&rq->lock);
> > > >    				return entity;
> > > >    			}
> > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >    	list_for_each_entry(entity, &rq->entities, list) {
> > > >    		if (drm_sched_entity_is_ready(entity)) {
> > > > -			rq->current_entity = entity;
> > > > -			reinit_completion(&entity->entity_idle);
> > > > +			if (dequeue) {
> > > > +				rq->current_entity = entity;
> > > > +				reinit_completion(&entity->entity_idle);
> > > > +			}
> > > >    			spin_unlock(&rq->lock);
> > > >    			return entity;
> > > >    		}
> > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >     * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > >     *
> > > >     * @rq: scheduler run queue to check.
> > > > + * @dequeue: dequeue selected entity
> > > >     *
> > > >     * Find oldest waiting ready entity, returns NULL if none found.
> > > >     */
> > > >    static struct drm_sched_entity *
> > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > >    {
> > > >    	struct rb_node *rb;
> > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > >    		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > >    		if (drm_sched_entity_is_ready(entity)) {
> > > > -			rq->current_entity = entity;
> > > > -			reinit_completion(&entity->entity_idle);
> > > > +			if (dequeue) {
> > > > +				rq->current_entity = entity;
> > > > +				reinit_completion(&entity->entity_idle);
> > > > +			}
> > > >    			break;
> > > >    		}
> > > >    	}
> > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > >    }
> > > >    /**
> > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > + * drm_sched_run_job_queue - queue job submission
> > > >     * @sched: scheduler instance
> > > >     */
> > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > >    {
> > > >    	if (!READ_ONCE(sched->pause_submit))
> > > > -		queue_work(sched->submit_wq, &sched->work_submit);
> > > > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > > > +}
> > > > +
> > > > +static struct drm_sched_entity *
> > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > +
> > > > +/**
> > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > + * @sched: scheduler instance
> > > > + */
> > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	if (drm_sched_select_entity(sched, false))
> > > > +		drm_sched_run_job_queue(sched);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_sched_free_job_queue - queue free job
> > > > + *
> > > > + * @sched: scheduler instance to queue free job
> > > > + */
> > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	if (!READ_ONCE(sched->pause_submit))
> > > > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > + *
> > > > + * @sched: scheduler instance to queue free job
> > > > + */
> > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	struct drm_sched_job *job;
> > > > +
> > > > +	spin_lock(&sched->job_list_lock);
> > > > +	job = list_first_entry_or_null(&sched->pending_list,
> > > > +				       struct drm_sched_job, list);
> > > > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > +		drm_sched_free_job_queue(sched);
> > > > +	spin_unlock(&sched->job_list_lock);
> > > >    }
> > > >    /**
> > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > >    	dma_fence_get(&s_fence->finished);
> > > >    	drm_sched_fence_finished(s_fence, result);
> > > >    	dma_fence_put(&s_fence->finished);
> > > > -	drm_sched_submit_queue(sched);
> > > > +	drm_sched_free_job_queue(sched);
> > > >    }
> > > >    /**
> > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > >    void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > >    {
> > > >    	if (drm_sched_can_queue(sched))
> > > > -		drm_sched_submit_queue(sched);
> > > > +		drm_sched_run_job_queue(sched);
> > > >    }
> > > >    /**
> > > >     * drm_sched_select_entity - Select next entity to process
> > > >     *
> > > >     * @sched: scheduler instance
> > > > + * @dequeue: dequeue selected entity
> > > >     *
> > > >     * Returns the entity to process or NULL if none are found.
> > > >     */
> > > >    static struct drm_sched_entity *
> > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > >    {
> > > >    	struct drm_sched_entity *entity;
> > > >    	int i;
> > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > >    	/* Kernel run queue has higher priority than normal run queue*/
> > > >    	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > >    		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > +							dequeue) :
> > > > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > +						      dequeue);
> > > >    		if (entity)
> > > >    			break;
> > > >    	}
> > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > >    EXPORT_SYMBOL(drm_sched_pick_best);
> > > >    /**
> > > > - * drm_sched_main - main scheduler thread
> > > > + * drm_sched_free_job_work - worker to call free_job
> > > >     *
> > > > - * @param: scheduler instance
> > > > + * @w: free job work
> > > >     */
> > > > -static void drm_sched_main(struct work_struct *w)
> > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > >    {
> > > >    	struct drm_gpu_scheduler *sched =
> > > > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > -	struct drm_sched_entity *entity;
> > > > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > >    	struct drm_sched_job *cleanup_job;
> > > > -	int r;
> > > >    	if (READ_ONCE(sched->pause_submit))
> > > >    		return;
> > > >    	cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > -	entity = drm_sched_select_entity(sched);
> > > > +	if (cleanup_job) {
> > > > +		sched->ops->free_job(cleanup_job);
> > > > +
> > > > +		drm_sched_free_job_queue_if_ready(sched);
> > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > +	}
> > > > +}
> > > > -	if (!entity && !cleanup_job)
> > > > -		return;	/* No more work */
> > > > +/**
> > > > + * drm_sched_run_job_work - worker to call run_job
> > > > + *
> > > > + * @w: run job work
> > > > + */
> > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > +{
> > > > +	struct drm_gpu_scheduler *sched =
> > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > +	struct drm_sched_entity *entity;
> > > > +	int r;
> > > > -	if (cleanup_job)
> > > > -		sched->ops->free_job(cleanup_job);
> > > > +	if (READ_ONCE(sched->pause_submit))
> > > > +		return;
> > > > +	entity = drm_sched_select_entity(sched, true);
> > > >    	if (entity) {
> > > >    		struct dma_fence *fence;
> > > >    		struct drm_sched_fence *s_fence;
> > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > >    		sched_job = drm_sched_entity_pop_job(entity);
> > > >    		if (!sched_job) {
> > > >    			complete_all(&entity->entity_idle);
> > > > -			if (!cleanup_job)
> > > > -				return;	/* No more work */
> > > > -			goto again;
> > > > +			return;	/* No more work */
> > > >    		}
> > > >    		s_fence = sched_job->s_fence;
> > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > >    		}
> > > >    		wake_up(&sched->job_scheduled);
> > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > >    	}
> > > > -
> > > > -again:
> > > > -	drm_sched_submit_queue(sched);
> > > >    }
> > > >    /**
> > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > >    	spin_lock_init(&sched->job_list_lock);
> > > >    	atomic_set(&sched->hw_rq_count, 0);
> > > >    	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > >    	atomic_set(&sched->_score, 0);
> > > >    	atomic64_set(&sched->job_id_count, 0);
> > > >    	sched->pause_submit = false;
> > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > >    void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > >    {
> > > >    	WRITE_ONCE(sched->pause_submit, true);
> > > > -	cancel_work_sync(&sched->work_submit);
> > > > +	cancel_work_sync(&sched->work_run_job);
> > > > +	cancel_work_sync(&sched->work_free_job);
> > > >    }
> > > >    EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > >    void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > >    {
> > > >    	WRITE_ONCE(sched->pause_submit, false);
> > > > -	queue_work(sched->submit_wq, &sched->work_submit);
> > > > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > > > +	queue_work(sched->submit_wq, &sched->work_free_job);
> > > >    }
> > > >    EXPORT_SYMBOL(drm_sched_submit_start);
> > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > index 04eec2d7635f..fbc083a92757 100644
> > > > --- a/include/drm/gpu_scheduler.h
> > > > +++ b/include/drm/gpu_scheduler.h
> > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > >     *                 finished.
> > > >     * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > >     * @job_id_count: used to assign unique id to the each job.
> > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > >     * @timeout_wq: workqueue used to queue @work_tdr
> > > > - * @work_submit: schedules jobs and cleans up entities
> > > > + * @work_run_job: schedules jobs
> > > > + * @work_free_job: cleans up jobs
> > > >     * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > >     *            timeout interval is over.
> > > >     * @pending_list: the list of jobs which are currently in the job queue.
> > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > >    	atomic64_t			job_id_count;
> > > >    	struct workqueue_struct		*submit_wq;
> > > >    	struct workqueue_struct		*timeout_wq;
> > > > -	struct work_struct		work_submit;
> > > > +	struct work_struct		work_run_job;
> > > > +	struct work_struct		work_free_job;
> > > >    	struct delayed_work		work_tdr;
> > > >    	struct list_head		pending_list;
> > > >    	spinlock_t			job_list_lock;
>
Christian König Aug. 21, 2023, 1:17 p.m. UTC | #5
Am 18.08.23 um 15:13 schrieb Matthew Brost:
> On Fri, Aug 18, 2023 at 07:27:33AM +0200, Christian König wrote:
>> Am 17.08.23 um 19:54 schrieb Matthew Brost:
>>> On Thu, Aug 17, 2023 at 03:39:40PM +0200, Christian König wrote:
>>>> Am 11.08.23 um 04:31 schrieb Matthew Brost:
>>>>> Rather than call free_job and run_job in same work item have a dedicated
>>>>> work item for each. This aligns with the design and intended use of work
>>>>> queues.
>>>> I would rather say we should get completely rid of the free_job callback.
>>>>
>>> Would we still have work item? e.g. Would we still want to call
>>> drm_sched_get_cleanup_job which removes the job from the pending list
>>> and adjusts the TDR? Trying to figure out out what this looks like. We
>>> probably can't do all of this from an IRQ context.
>>>
>>>> Essentially the job is just the container which carries the information
>>>> which are necessary before you push it to the hw. The real representation of
>>>> the submission is actually the scheduler fence.
>>>>
>>> Most of the free_jobs call plus drm_sched_job_cleanup + a put on job. In
>>> Xe this cannot be called from an IRQ context either.
>>>
>>> I'm just confused what exactly you are suggesting here.
>> To summarize on one sentence: Instead of the job we keep the scheduler and
>> hardware fences around after pushing the job to the hw.
>>
>> The free_job callback would then be replaced by dropping the reference on
>> the scheduler and hw fence.
>>
>> Would that work for you?
>>
> I don't think so for a few reasons.
>
> The job and hw fence are different structures (also different allocs too)
> for a reason. The job referenced until it is complete (hw fence is
> signaled) and the free_job is called. This reference is needed for the
> TDR to work properly and also some reset flows too.

That is exactly what I want to avoid, tying the TDR to the job is what 
some AMD engineers pushed for because it looked like a simple solution 
and made the whole thing similar to what Windows does.

This turned the previous relatively clean scheduler and TDR design into 
a complete nightmare. The job contains quite a bunch of things which are 
not necessarily available after the application which submitted the job 
is torn down.

So what happens is that you either have stale pointers in the TDR which 
can go boom extremely easily or we somehow find a way to keep the 
necessary structures (which include struct thread_info and struct file 
for this driver connection) alive until all submissions are completed.

Delaying application tear down is also not an option because then you 
run into massive trouble with the OOM killer (or more generally OOM 
handling). See what we do in drm_sched_entity_flush() as well.

Since adding the TDR support we completely exercised this through in the 
last two or three years or so. And to sum it up I would really like to 
get away from this mess again.

Compared to that what i915 does is actually rather clean I think.

>   Also in Xe some of
> things done in free_job cannot be from an IRQ context, hence calling
> this from the scheduler worker is rather helpful.

Well putting things for cleanup into a workitem doesn't sounds like 
something hard.

Question is what do you really need for TDR which is not inside the 
hardware fence?

Regards,
Christian.

>
> The HW fence can live for longer as it can be installed in dma-resv
> slots, syncobjs, etc... If the job and hw fence are combined now we
> holding on the memory for the longer and perhaps at the mercy of the
> user. We also run the risk of the final put being done from an IRQ
> context which again wont work in Xe as it is currently coded. Lastly 2
> jobs from the same scheduler could do the final put in parallel, so
> rather than having free_job serialized by the worker now multiple jobs
> are freeing themselves at the same time. This might not be an issue but
> adds another level of raceyness that needs to be accounted for. None of
> this sounds desirable to me.
>
> FWIW what you suggesting sounds like how the i915 did things
> (i915_request and hw fence in 1 memory alloc) and that turned out to be
> a huge mess. As rule of thumb I generally do the opposite of whatever
> the i915 did.
>
> Matt
>
>> Christian.
>>
>>> Matt
>>>
>>>> All the lifetime issues we had came from ignoring this fact and I think we
>>>> should push for fixing this design up again.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>>>     include/drm/gpu_scheduler.h            |   8 +-
>>>>>     2 files changed, 106 insertions(+), 39 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> index cede47afc800..b67469eac179 100644
>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>>>      * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>>>      *
>>>>>      * @rq: scheduler run queue to check.
>>>>> + * @dequeue: dequeue selected entity
>>>>>      *
>>>>>      * Try to find a ready entity, returns NULL if none found.
>>>>>      */
>>>>>     static struct drm_sched_entity *
>>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>>>     {
>>>>>     	struct drm_sched_entity *entity;
>>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>     	if (entity) {
>>>>>     		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>>>     			if (drm_sched_entity_is_ready(entity)) {
>>>>> -				rq->current_entity = entity;
>>>>> -				reinit_completion(&entity->entity_idle);
>>>>> +				if (dequeue) {
>>>>> +					rq->current_entity = entity;
>>>>> +					reinit_completion(&entity->entity_idle);
>>>>> +				}
>>>>>     				spin_unlock(&rq->lock);
>>>>>     				return entity;
>>>>>     			}
>>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>     	list_for_each_entry(entity, &rq->entities, list) {
>>>>>     		if (drm_sched_entity_is_ready(entity)) {
>>>>> -			rq->current_entity = entity;
>>>>> -			reinit_completion(&entity->entity_idle);
>>>>> +			if (dequeue) {
>>>>> +				rq->current_entity = entity;
>>>>> +				reinit_completion(&entity->entity_idle);
>>>>> +			}
>>>>>     			spin_unlock(&rq->lock);
>>>>>     			return entity;
>>>>>     		}
>>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>      * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>>>      *
>>>>>      * @rq: scheduler run queue to check.
>>>>> + * @dequeue: dequeue selected entity
>>>>>      *
>>>>>      * Find oldest waiting ready entity, returns NULL if none found.
>>>>>      */
>>>>>     static struct drm_sched_entity *
>>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>>>     {
>>>>>     	struct rb_node *rb;
>>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>     		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>>>     		if (drm_sched_entity_is_ready(entity)) {
>>>>> -			rq->current_entity = entity;
>>>>> -			reinit_completion(&entity->entity_idle);
>>>>> +			if (dequeue) {
>>>>> +				rq->current_entity = entity;
>>>>> +				reinit_completion(&entity->entity_idle);
>>>>> +			}
>>>>>     			break;
>>>>>     		}
>>>>>     	}
>>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>     }
>>>>>     /**
>>>>> - * drm_sched_submit_queue - scheduler queue submission
>>>>> + * drm_sched_run_job_queue - queue job submission
>>>>>      * @sched: scheduler instance
>>>>>      */
>>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>>>     {
>>>>>     	if (!READ_ONCE(sched->pause_submit))
>>>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>>>> +}
>>>>> +
>>>>> +static struct drm_sched_entity *
>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>>>> + * @sched: scheduler instance
>>>>> + */
>>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	if (drm_sched_select_entity(sched, false))
>>>>> +		drm_sched_run_job_queue(sched);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_free_job_queue - queue free job
>>>>> + *
>>>>> + * @sched: scheduler instance to queue free job
>>>>> + */
>>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	if (!READ_ONCE(sched->pause_submit))
>>>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>>>> + *
>>>>> + * @sched: scheduler instance to queue free job
>>>>> + */
>>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	struct drm_sched_job *job;
>>>>> +
>>>>> +	spin_lock(&sched->job_list_lock);
>>>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>>>> +				       struct drm_sched_job, list);
>>>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>>>> +		drm_sched_free_job_queue(sched);
>>>>> +	spin_unlock(&sched->job_list_lock);
>>>>>     }
>>>>>     /**
>>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>>>     	dma_fence_get(&s_fence->finished);
>>>>>     	drm_sched_fence_finished(s_fence, result);
>>>>>     	dma_fence_put(&s_fence->finished);
>>>>> -	drm_sched_submit_queue(sched);
>>>>> +	drm_sched_free_job_queue(sched);
>>>>>     }
>>>>>     /**
>>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>>>     void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>>>     {
>>>>>     	if (drm_sched_can_queue(sched))
>>>>> -		drm_sched_submit_queue(sched);
>>>>> +		drm_sched_run_job_queue(sched);
>>>>>     }
>>>>>     /**
>>>>>      * drm_sched_select_entity - Select next entity to process
>>>>>      *
>>>>>      * @sched: scheduler instance
>>>>> + * @dequeue: dequeue selected entity
>>>>>      *
>>>>>      * Returns the entity to process or NULL if none are found.
>>>>>      */
>>>>>     static struct drm_sched_entity *
>>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>>>     {
>>>>>     	struct drm_sched_entity *entity;
>>>>>     	int i;
>>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>     	/* Kernel run queue has higher priority than normal run queue*/
>>>>>     	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>>>     		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>>>> +							dequeue) :
>>>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>>>> +						      dequeue);
>>>>>     		if (entity)
>>>>>     			break;
>>>>>     	}
>>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>     EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>     /**
>>>>> - * drm_sched_main - main scheduler thread
>>>>> + * drm_sched_free_job_work - worker to call free_job
>>>>>      *
>>>>> - * @param: scheduler instance
>>>>> + * @w: free job work
>>>>>      */
>>>>> -static void drm_sched_main(struct work_struct *w)
>>>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>>>     {
>>>>>     	struct drm_gpu_scheduler *sched =
>>>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>>>> -	struct drm_sched_entity *entity;
>>>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>>>     	struct drm_sched_job *cleanup_job;
>>>>> -	int r;
>>>>>     	if (READ_ONCE(sched->pause_submit))
>>>>>     		return;
>>>>>     	cleanup_job = drm_sched_get_cleanup_job(sched);
>>>>> -	entity = drm_sched_select_entity(sched);
>>>>> +	if (cleanup_job) {
>>>>> +		sched->ops->free_job(cleanup_job);
>>>>> +
>>>>> +		drm_sched_free_job_queue_if_ready(sched);
>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>> +	}
>>>>> +}
>>>>> -	if (!entity && !cleanup_job)
>>>>> -		return;	/* No more work */
>>>>> +/**
>>>>> + * drm_sched_run_job_work - worker to call run_job
>>>>> + *
>>>>> + * @w: run job work
>>>>> + */
>>>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>>>> +{
>>>>> +	struct drm_gpu_scheduler *sched =
>>>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>>>> +	struct drm_sched_entity *entity;
>>>>> +	int r;
>>>>> -	if (cleanup_job)
>>>>> -		sched->ops->free_job(cleanup_job);
>>>>> +	if (READ_ONCE(sched->pause_submit))
>>>>> +		return;
>>>>> +	entity = drm_sched_select_entity(sched, true);
>>>>>     	if (entity) {
>>>>>     		struct dma_fence *fence;
>>>>>     		struct drm_sched_fence *s_fence;
>>>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>     		sched_job = drm_sched_entity_pop_job(entity);
>>>>>     		if (!sched_job) {
>>>>>     			complete_all(&entity->entity_idle);
>>>>> -			if (!cleanup_job)
>>>>> -				return;	/* No more work */
>>>>> -			goto again;
>>>>> +			return;	/* No more work */
>>>>>     		}
>>>>>     		s_fence = sched_job->s_fence;
>>>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>>>     		}
>>>>>     		wake_up(&sched->job_scheduled);
>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>>     	}
>>>>> -
>>>>> -again:
>>>>> -	drm_sched_submit_queue(sched);
>>>>>     }
>>>>>     /**
>>>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>     	spin_lock_init(&sched->job_list_lock);
>>>>>     	atomic_set(&sched->hw_rq_count, 0);
>>>>>     	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>>>     	atomic_set(&sched->_score, 0);
>>>>>     	atomic64_set(&sched->job_id_count, 0);
>>>>>     	sched->pause_submit = false;
>>>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>>>     void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>>>     {
>>>>>     	WRITE_ONCE(sched->pause_submit, true);
>>>>> -	cancel_work_sync(&sched->work_submit);
>>>>> +	cancel_work_sync(&sched->work_run_job);
>>>>> +	cancel_work_sync(&sched->work_free_job);
>>>>>     }
>>>>>     EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>     void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>>>     {
>>>>>     	WRITE_ONCE(sched->pause_submit, false);
>>>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>     }
>>>>>     EXPORT_SYMBOL(drm_sched_submit_start);
>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>> index 04eec2d7635f..fbc083a92757 100644
>>>>> --- a/include/drm/gpu_scheduler.h
>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>>>      *                 finished.
>>>>>      * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>>>      * @job_id_count: used to assign unique id to the each job.
>>>>> - * @submit_wq: workqueue used to queue @work_submit
>>>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>>>      * @timeout_wq: workqueue used to queue @work_tdr
>>>>> - * @work_submit: schedules jobs and cleans up entities
>>>>> + * @work_run_job: schedules jobs
>>>>> + * @work_free_job: cleans up jobs
>>>>>      * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>      *            timeout interval is over.
>>>>>      * @pending_list: the list of jobs which are currently in the job queue.
>>>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>>>     	atomic64_t			job_id_count;
>>>>>     	struct workqueue_struct		*submit_wq;
>>>>>     	struct workqueue_struct		*timeout_wq;
>>>>> -	struct work_struct		work_submit;
>>>>> +	struct work_struct		work_run_job;
>>>>> +	struct work_struct		work_free_job;
>>>>>     	struct delayed_work		work_tdr;
>>>>>     	struct list_head		pending_list;
>>>>>     	spinlock_t			job_list_lock;
Matthew Brost Aug. 23, 2023, 3:27 a.m. UTC | #6
On Mon, Aug 21, 2023 at 03:17:29PM +0200, Christian König wrote:
> Am 18.08.23 um 15:13 schrieb Matthew Brost:
> > On Fri, Aug 18, 2023 at 07:27:33AM +0200, Christian König wrote:
> > > Am 17.08.23 um 19:54 schrieb Matthew Brost:
> > > > On Thu, Aug 17, 2023 at 03:39:40PM +0200, Christian König wrote:
> > > > > Am 11.08.23 um 04:31 schrieb Matthew Brost:
> > > > > > Rather than call free_job and run_job in same work item have a dedicated
> > > > > > work item for each. This aligns with the design and intended use of work
> > > > > > queues.
> > > > > I would rather say we should get completely rid of the free_job callback.
> > > > > 
> > > > Would we still have work item? e.g. Would we still want to call
> > > > drm_sched_get_cleanup_job which removes the job from the pending list
> > > > and adjusts the TDR? Trying to figure out out what this looks like. We
> > > > probably can't do all of this from an IRQ context.
> > > > 
> > > > > Essentially the job is just the container which carries the information
> > > > > which are necessary before you push it to the hw. The real representation of
> > > > > the submission is actually the scheduler fence.
> > > > > 
> > > > Most of the free_jobs call plus drm_sched_job_cleanup + a put on job. In
> > > > Xe this cannot be called from an IRQ context either.
> > > > 
> > > > I'm just confused what exactly you are suggesting here.
> > > To summarize on one sentence: Instead of the job we keep the scheduler and
> > > hardware fences around after pushing the job to the hw.
> > > 
> > > The free_job callback would then be replaced by dropping the reference on
> > > the scheduler and hw fence.
> > > 
> > > Would that work for you?
> > > 
> > I don't think so for a few reasons.
> > 
> > The job and hw fence are different structures (also different allocs too)
> > for a reason. The job referenced until it is complete (hw fence is
> > signaled) and the free_job is called. This reference is needed for the
> > TDR to work properly and also some reset flows too.
> 
> That is exactly what I want to avoid, tying the TDR to the job is what some
> AMD engineers pushed for because it looked like a simple solution and made
> the whole thing similar to what Windows does.
> 
> This turned the previous relatively clean scheduler and TDR design into a
> complete nightmare. The job contains quite a bunch of things which are not
> necessarily available after the application which submitted the job is torn
> down.
>

Agree the TDR shouldn't be accessing anything application specific
rather just internal job state required to tear the job down on the
hardware.
 
> So what happens is that you either have stale pointers in the TDR which can
> go boom extremely easily or we somehow find a way to keep the necessary

I have not experenced the TDR going boom in Xe.

> structures (which include struct thread_info and struct file for this driver
> connection) alive until all submissions are completed.
> 

In Xe we keep everything alive until all submissions are completed. By
everything I mean the drm job, entity, scheduler, and VM via a reference
counting scheme. All of these structures are just kernel state which can
safely be accessed even if the application has been killed.

If we need to teardown on demand we just set the TDR to a minimum value and
it kicks the jobs off the hardware, gracefully cleans everything up and
drops all references. This is a benefit of the 1 to 1 relationship, not
sure if this works with how AMDGPU uses the scheduler.

> Delaying application tear down is also not an option because then you run
> into massive trouble with the OOM killer (or more generally OOM handling).
> See what we do in drm_sched_entity_flush() as well.
> 

Not an issue for Xe, we never call drm_sched_entity_flush as our
referencing counting scheme is all jobs are finished before we attempt
to tear down entity / scheduler.

> Since adding the TDR support we completely exercised this through in the
> last two or three years or so. And to sum it up I would really like to get
> away from this mess again.
> 
> Compared to that what i915 does is actually rather clean I think.
> 

Not even close, resets where a nightmare in the i915 (I spend years
trying to get this right and probably still completely work) and in Xe
basically got it right on the attempt.

> >   Also in Xe some of
> > things done in free_job cannot be from an IRQ context, hence calling
> > this from the scheduler worker is rather helpful.
> 
> Well putting things for cleanup into a workitem doesn't sounds like
> something hard.
>

That is exactly what we doing in the scheduler with the free_job
workitem.

> Question is what do you really need for TDR which is not inside the hardware
> fence?
>

A reference to the entity to be able to kick the job off the hardware.
A reference to the entity, job, and VM for error capture.

We also need a reference to the job for recovery after a GPU reset so
run_job can be called again for innocent jobs.

All of this leads to believe we need to stick with the design.

Matt

> Regards,
> Christian.
> 
> > 
> > The HW fence can live for longer as it can be installed in dma-resv
> > slots, syncobjs, etc... If the job and hw fence are combined now we
> > holding on the memory for the longer and perhaps at the mercy of the
> > user. We also run the risk of the final put being done from an IRQ
> > context which again wont work in Xe as it is currently coded. Lastly 2
> > jobs from the same scheduler could do the final put in parallel, so
> > rather than having free_job serialized by the worker now multiple jobs
> > are freeing themselves at the same time. This might not be an issue but
> > adds another level of raceyness that needs to be accounted for. None of
> > this sounds desirable to me.
> > 
> > FWIW what you suggesting sounds like how the i915 did things
> > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > a huge mess. As rule of thumb I generally do the opposite of whatever
> > the i915 did.
> > 
> > Matt
> > 
> > > Christian.
> > > 
> > > > Matt
> > > > 
> > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > should push for fixing this design up again.
> > > > > 
> > > > > Regards,
> > > > > Christian.
> > > > > 
> > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > ---
> > > > > >     drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > >     include/drm/gpu_scheduler.h            |   8 +-
> > > > > >     2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > index cede47afc800..b67469eac179 100644
> > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > >      * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > >      *
> > > > > >      * @rq: scheduler run queue to check.
> > > > > > + * @dequeue: dequeue selected entity
> > > > > >      *
> > > > > >      * Try to find a ready entity, returns NULL if none found.
> > > > > >      */
> > > > > >     static struct drm_sched_entity *
> > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > >     {
> > > > > >     	struct drm_sched_entity *entity;
> > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > >     	if (entity) {
> > > > > >     		list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > >     			if (drm_sched_entity_is_ready(entity)) {
> > > > > > -				rq->current_entity = entity;
> > > > > > -				reinit_completion(&entity->entity_idle);
> > > > > > +				if (dequeue) {
> > > > > > +					rq->current_entity = entity;
> > > > > > +					reinit_completion(&entity->entity_idle);
> > > > > > +				}
> > > > > >     				spin_unlock(&rq->lock);
> > > > > >     				return entity;
> > > > > >     			}
> > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > >     	list_for_each_entry(entity, &rq->entities, list) {
> > > > > >     		if (drm_sched_entity_is_ready(entity)) {
> > > > > > -			rq->current_entity = entity;
> > > > > > -			reinit_completion(&entity->entity_idle);
> > > > > > +			if (dequeue) {
> > > > > > +				rq->current_entity = entity;
> > > > > > +				reinit_completion(&entity->entity_idle);
> > > > > > +			}
> > > > > >     			spin_unlock(&rq->lock);
> > > > > >     			return entity;
> > > > > >     		}
> > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > >      * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > >      *
> > > > > >      * @rq: scheduler run queue to check.
> > > > > > + * @dequeue: dequeue selected entity
> > > > > >      *
> > > > > >      * Find oldest waiting ready entity, returns NULL if none found.
> > > > > >      */
> > > > > >     static struct drm_sched_entity *
> > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > >     {
> > > > > >     	struct rb_node *rb;
> > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > >     		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > >     		if (drm_sched_entity_is_ready(entity)) {
> > > > > > -			rq->current_entity = entity;
> > > > > > -			reinit_completion(&entity->entity_idle);
> > > > > > +			if (dequeue) {
> > > > > > +				rq->current_entity = entity;
> > > > > > +				reinit_completion(&entity->entity_idle);
> > > > > > +			}
> > > > > >     			break;
> > > > > >     		}
> > > > > >     	}
> > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > >     }
> > > > > >     /**
> > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > >      * @sched: scheduler instance
> > > > > >      */
> > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > >     {
> > > > > >     	if (!READ_ONCE(sched->pause_submit))
> > > > > > -		queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > +}
> > > > > > +
> > > > > > +static struct drm_sched_entity *
> > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > + * @sched: scheduler instance
> > > > > > + */
> > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > +{
> > > > > > +	if (drm_sched_select_entity(sched, false))
> > > > > > +		drm_sched_run_job_queue(sched);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > + *
> > > > > > + * @sched: scheduler instance to queue free job
> > > > > > + */
> > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > +{
> > > > > > +	if (!READ_ONCE(sched->pause_submit))
> > > > > > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > + *
> > > > > > + * @sched: scheduler instance to queue free job
> > > > > > + */
> > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > +{
> > > > > > +	struct drm_sched_job *job;
> > > > > > +
> > > > > > +	spin_lock(&sched->job_list_lock);
> > > > > > +	job = list_first_entry_or_null(&sched->pending_list,
> > > > > > +				       struct drm_sched_job, list);
> > > > > > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > +		drm_sched_free_job_queue(sched);
> > > > > > +	spin_unlock(&sched->job_list_lock);
> > > > > >     }
> > > > > >     /**
> > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > >     	dma_fence_get(&s_fence->finished);
> > > > > >     	drm_sched_fence_finished(s_fence, result);
> > > > > >     	dma_fence_put(&s_fence->finished);
> > > > > > -	drm_sched_submit_queue(sched);
> > > > > > +	drm_sched_free_job_queue(sched);
> > > > > >     }
> > > > > >     /**
> > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > >     void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > >     {
> > > > > >     	if (drm_sched_can_queue(sched))
> > > > > > -		drm_sched_submit_queue(sched);
> > > > > > +		drm_sched_run_job_queue(sched);
> > > > > >     }
> > > > > >     /**
> > > > > >      * drm_sched_select_entity - Select next entity to process
> > > > > >      *
> > > > > >      * @sched: scheduler instance
> > > > > > + * @dequeue: dequeue selected entity
> > > > > >      *
> > > > > >      * Returns the entity to process or NULL if none are found.
> > > > > >      */
> > > > > >     static struct drm_sched_entity *
> > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > >     {
> > > > > >     	struct drm_sched_entity *entity;
> > > > > >     	int i;
> > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > >     	/* Kernel run queue has higher priority than normal run queue*/
> > > > > >     	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > >     		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > +							dequeue) :
> > > > > > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > +						      dequeue);
> > > > > >     		if (entity)
> > > > > >     			break;
> > > > > >     	}
> > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > >     EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > >     /**
> > > > > > - * drm_sched_main - main scheduler thread
> > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > >      *
> > > > > > - * @param: scheduler instance
> > > > > > + * @w: free job work
> > > > > >      */
> > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > >     {
> > > > > >     	struct drm_gpu_scheduler *sched =
> > > > > > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > -	struct drm_sched_entity *entity;
> > > > > > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > >     	struct drm_sched_job *cleanup_job;
> > > > > > -	int r;
> > > > > >     	if (READ_ONCE(sched->pause_submit))
> > > > > >     		return;
> > > > > >     	cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > -	entity = drm_sched_select_entity(sched);
> > > > > > +	if (cleanup_job) {
> > > > > > +		sched->ops->free_job(cleanup_job);
> > > > > > +
> > > > > > +		drm_sched_free_job_queue_if_ready(sched);
> > > > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > > > +	}
> > > > > > +}
> > > > > > -	if (!entity && !cleanup_job)
> > > > > > -		return;	/* No more work */
> > > > > > +/**
> > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > + *
> > > > > > + * @w: run job work
> > > > > > + */
> > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > +{
> > > > > > +	struct drm_gpu_scheduler *sched =
> > > > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > +	struct drm_sched_entity *entity;
> > > > > > +	int r;
> > > > > > -	if (cleanup_job)
> > > > > > -		sched->ops->free_job(cleanup_job);
> > > > > > +	if (READ_ONCE(sched->pause_submit))
> > > > > > +		return;
> > > > > > +	entity = drm_sched_select_entity(sched, true);
> > > > > >     	if (entity) {
> > > > > >     		struct dma_fence *fence;
> > > > > >     		struct drm_sched_fence *s_fence;
> > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > >     		sched_job = drm_sched_entity_pop_job(entity);
> > > > > >     		if (!sched_job) {
> > > > > >     			complete_all(&entity->entity_idle);
> > > > > > -			if (!cleanup_job)
> > > > > > -				return;	/* No more work */
> > > > > > -			goto again;
> > > > > > +			return;	/* No more work */
> > > > > >     		}
> > > > > >     		s_fence = sched_job->s_fence;
> > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > >     		}
> > > > > >     		wake_up(&sched->job_scheduled);
> > > > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > > >     	}
> > > > > > -
> > > > > > -again:
> > > > > > -	drm_sched_submit_queue(sched);
> > > > > >     }
> > > > > >     /**
> > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > >     	spin_lock_init(&sched->job_list_lock);
> > > > > >     	atomic_set(&sched->hw_rq_count, 0);
> > > > > >     	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > >     	atomic_set(&sched->_score, 0);
> > > > > >     	atomic64_set(&sched->job_id_count, 0);
> > > > > >     	sched->pause_submit = false;
> > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > >     void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > >     {
> > > > > >     	WRITE_ONCE(sched->pause_submit, true);
> > > > > > -	cancel_work_sync(&sched->work_submit);
> > > > > > +	cancel_work_sync(&sched->work_run_job);
> > > > > > +	cancel_work_sync(&sched->work_free_job);
> > > > > >     }
> > > > > >     EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > >     void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > >     {
> > > > > >     	WRITE_ONCE(sched->pause_submit, false);
> > > > > > -	queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > +	queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > >     }
> > > > > >     EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > >      *                 finished.
> > > > > >      * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > >      * @job_id_count: used to assign unique id to the each job.
> > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > >      * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > + * @work_run_job: schedules jobs
> > > > > > + * @work_free_job: cleans up jobs
> > > > > >      * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > >      *            timeout interval is over.
> > > > > >      * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > >     	atomic64_t			job_id_count;
> > > > > >     	struct workqueue_struct		*submit_wq;
> > > > > >     	struct workqueue_struct		*timeout_wq;
> > > > > > -	struct work_struct		work_submit;
> > > > > > +	struct work_struct		work_run_job;
> > > > > > +	struct work_struct		work_free_job;
> > > > > >     	struct delayed_work		work_tdr;
> > > > > >     	struct list_head		pending_list;
> > > > > >     	spinlock_t			job_list_lock;
>
Christian König Aug. 23, 2023, 7:10 a.m. UTC | #7
Am 23.08.23 um 05:27 schrieb Matthew Brost:
> [SNIP]
>> That is exactly what I want to avoid, tying the TDR to the job is what some
>> AMD engineers pushed for because it looked like a simple solution and made
>> the whole thing similar to what Windows does.
>>
>> This turned the previous relatively clean scheduler and TDR design into a
>> complete nightmare. The job contains quite a bunch of things which are not
>> necessarily available after the application which submitted the job is torn
>> down.
>>
> Agree the TDR shouldn't be accessing anything application specific
> rather just internal job state required to tear the job down on the
> hardware.
>   
>> So what happens is that you either have stale pointers in the TDR which can
>> go boom extremely easily or we somehow find a way to keep the necessary
> I have not experenced the TDR going boom in Xe.
>
>> structures (which include struct thread_info and struct file for this driver
>> connection) alive until all submissions are completed.
>>
> In Xe we keep everything alive until all submissions are completed. By
> everything I mean the drm job, entity, scheduler, and VM via a reference
> counting scheme. All of these structures are just kernel state which can
> safely be accessed even if the application has been killed.

Yeah, but that might just not be such a good idea from memory management 
point of view.

When you (for example) kill a process all resource from that progress 
should at least be queued to be freed more or less immediately.

What Linux is doing for other I/O operations is to keep the relevant 
pages alive until the I/O operation is completed, but for GPUs that 
usually means keeping most of the memory of the process alive and that 
in turn is really not something you can do.

You can of course do this if your driver has a reliable way of killing 
your submissions and freeing resources in a reasonable amount of time. 
This should then be done in the flush callback.

> If we need to teardown on demand we just set the TDR to a minimum value and
> it kicks the jobs off the hardware, gracefully cleans everything up and
> drops all references. This is a benefit of the 1 to 1 relationship, not
> sure if this works with how AMDGPU uses the scheduler.
>
>> Delaying application tear down is also not an option because then you run
>> into massive trouble with the OOM killer (or more generally OOM handling).
>> See what we do in drm_sched_entity_flush() as well.
>>
> Not an issue for Xe, we never call drm_sched_entity_flush as our
> referencing counting scheme is all jobs are finished before we attempt
> to tear down entity / scheduler.

I don't think you can do that upstream. Calling drm_sched_entity_flush() 
is a must have from your flush callback for the file descriptor.

Unless you have some other method for killing your submissions this 
would give a path for a deny of service attack vector when the Xe driver 
is in use.

>> Since adding the TDR support we completely exercised this through in the
>> last two or three years or so. And to sum it up I would really like to get
>> away from this mess again.
>>
>> Compared to that what i915 does is actually rather clean I think.
>>
> Not even close, resets where a nightmare in the i915 (I spend years
> trying to get this right and probably still completely work) and in Xe
> basically got it right on the attempt.
>
>>>    Also in Xe some of
>>> things done in free_job cannot be from an IRQ context, hence calling
>>> this from the scheduler worker is rather helpful.
>> Well putting things for cleanup into a workitem doesn't sounds like
>> something hard.
>>
> That is exactly what we doing in the scheduler with the free_job
> workitem.

Yeah, but I think that we do it in the scheduler and not the driver is 
problematic.

For the scheduler it shouldn't care about the job any more as soon as 
the driver takes over.

>
>> Question is what do you really need for TDR which is not inside the hardware
>> fence?
>>
> A reference to the entity to be able to kick the job off the hardware.
> A reference to the entity, job, and VM for error capture.
>
> We also need a reference to the job for recovery after a GPU reset so
> run_job can be called again for innocent jobs.

Well exactly that's what I'm massively pushing back. Letting the 
scheduler call run_job() for the same job again is *NOT* something you 
can actually do.

This pretty clearly violates some of the dma_fence constrains and has 
cause massively headaches for me already.

What you can do is to do this inside your driver, e.g. take the jobs and 
push them again to the hw ring or just tell the hw to start executing 
again from a previous position.

BTW that re-submitting of jobs seems to be a no-go from userspace 
perspective as well. Take a look at the Vulkan spec for that, at least 
Marek pretty much pointed out that we should absolutely not do this 
inside the kernel.

The generally right approach seems to be to cleanly signal to userspace 
that something bad happened and that userspace then needs to submit 
things again even for innocent jobs.

Regards,
Christian.

>
> All of this leads to believe we need to stick with the design.
>
> Matt
>
>> Regards,
>> Christian.
>>
>>> The HW fence can live for longer as it can be installed in dma-resv
>>> slots, syncobjs, etc... If the job and hw fence are combined now we
>>> holding on the memory for the longer and perhaps at the mercy of the
>>> user. We also run the risk of the final put being done from an IRQ
>>> context which again wont work in Xe as it is currently coded. Lastly 2
>>> jobs from the same scheduler could do the final put in parallel, so
>>> rather than having free_job serialized by the worker now multiple jobs
>>> are freeing themselves at the same time. This might not be an issue but
>>> adds another level of raceyness that needs to be accounted for. None of
>>> this sounds desirable to me.
>>>
>>> FWIW what you suggesting sounds like how the i915 did things
>>> (i915_request and hw fence in 1 memory alloc) and that turned out to be
>>> a huge mess. As rule of thumb I generally do the opposite of whatever
>>> the i915 did.
>>>
>>> Matt
>>>
>>>> Christian.
>>>>
>>>>> Matt
>>>>>
>>>>>> All the lifetime issues we had came from ignoring this fact and I think we
>>>>>> should push for fixing this design up again.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>>>>>      include/drm/gpu_scheduler.h            |   8 +-
>>>>>>>      2 files changed, 106 insertions(+), 39 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> index cede47afc800..b67469eac179 100644
>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>>>>>       * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>>>>>       *
>>>>>>>       * @rq: scheduler run queue to check.
>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>       *
>>>>>>>       * Try to find a ready entity, returns NULL if none found.
>>>>>>>       */
>>>>>>>      static struct drm_sched_entity *
>>>>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>>>>>      {
>>>>>>>      	struct drm_sched_entity *entity;
>>>>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>      	if (entity) {
>>>>>>>      		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>>>>>      			if (drm_sched_entity_is_ready(entity)) {
>>>>>>> -				rq->current_entity = entity;
>>>>>>> -				reinit_completion(&entity->entity_idle);
>>>>>>> +				if (dequeue) {
>>>>>>> +					rq->current_entity = entity;
>>>>>>> +					reinit_completion(&entity->entity_idle);
>>>>>>> +				}
>>>>>>>      				spin_unlock(&rq->lock);
>>>>>>>      				return entity;
>>>>>>>      			}
>>>>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>      	list_for_each_entry(entity, &rq->entities, list) {
>>>>>>>      		if (drm_sched_entity_is_ready(entity)) {
>>>>>>> -			rq->current_entity = entity;
>>>>>>> -			reinit_completion(&entity->entity_idle);
>>>>>>> +			if (dequeue) {
>>>>>>> +				rq->current_entity = entity;
>>>>>>> +				reinit_completion(&entity->entity_idle);
>>>>>>> +			}
>>>>>>>      			spin_unlock(&rq->lock);
>>>>>>>      			return entity;
>>>>>>>      		}
>>>>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>       * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>>>>>       *
>>>>>>>       * @rq: scheduler run queue to check.
>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>       *
>>>>>>>       * Find oldest waiting ready entity, returns NULL if none found.
>>>>>>>       */
>>>>>>>      static struct drm_sched_entity *
>>>>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>>>>>      {
>>>>>>>      	struct rb_node *rb;
>>>>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>>      		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>>>>>      		if (drm_sched_entity_is_ready(entity)) {
>>>>>>> -			rq->current_entity = entity;
>>>>>>> -			reinit_completion(&entity->entity_idle);
>>>>>>> +			if (dequeue) {
>>>>>>> +				rq->current_entity = entity;
>>>>>>> +				reinit_completion(&entity->entity_idle);
>>>>>>> +			}
>>>>>>>      			break;
>>>>>>>      		}
>>>>>>>      	}
>>>>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>>      }
>>>>>>>      /**
>>>>>>> - * drm_sched_submit_queue - scheduler queue submission
>>>>>>> + * drm_sched_run_job_queue - queue job submission
>>>>>>>       * @sched: scheduler instance
>>>>>>>       */
>>>>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>>>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>>>>>      {
>>>>>>>      	if (!READ_ONCE(sched->pause_submit))
>>>>>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>>>>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static struct drm_sched_entity *
>>>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>>>>>> + * @sched: scheduler instance
>>>>>>> + */
>>>>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>>>> +{
>>>>>>> +	if (drm_sched_select_entity(sched, false))
>>>>>>> +		drm_sched_run_job_queue(sched);
>>>>>>> +}
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * drm_sched_free_job_queue - queue free job
>>>>>>> + *
>>>>>>> + * @sched: scheduler instance to queue free job
>>>>>>> + */
>>>>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>>>>>> +{
>>>>>>> +	if (!READ_ONCE(sched->pause_submit))
>>>>>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>>> +}
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>>>>>> + *
>>>>>>> + * @sched: scheduler instance to queue free job
>>>>>>> + */
>>>>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>>>> +{
>>>>>>> +	struct drm_sched_job *job;
>>>>>>> +
>>>>>>> +	spin_lock(&sched->job_list_lock);
>>>>>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>>>>>> +				       struct drm_sched_job, list);
>>>>>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>>>>>> +		drm_sched_free_job_queue(sched);
>>>>>>> +	spin_unlock(&sched->job_list_lock);
>>>>>>>      }
>>>>>>>      /**
>>>>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>>>>>      	dma_fence_get(&s_fence->finished);
>>>>>>>      	drm_sched_fence_finished(s_fence, result);
>>>>>>>      	dma_fence_put(&s_fence->finished);
>>>>>>> -	drm_sched_submit_queue(sched);
>>>>>>> +	drm_sched_free_job_queue(sched);
>>>>>>>      }
>>>>>>>      /**
>>>>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>>>>>      void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>>>>>      {
>>>>>>>      	if (drm_sched_can_queue(sched))
>>>>>>> -		drm_sched_submit_queue(sched);
>>>>>>> +		drm_sched_run_job_queue(sched);
>>>>>>>      }
>>>>>>>      /**
>>>>>>>       * drm_sched_select_entity - Select next entity to process
>>>>>>>       *
>>>>>>>       * @sched: scheduler instance
>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>       *
>>>>>>>       * Returns the entity to process or NULL if none are found.
>>>>>>>       */
>>>>>>>      static struct drm_sched_entity *
>>>>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>>>>>      {
>>>>>>>      	struct drm_sched_entity *entity;
>>>>>>>      	int i;
>>>>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>>>      	/* Kernel run queue has higher priority than normal run queue*/
>>>>>>>      	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>>>>>      		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>>>>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>>>>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>>>>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>>>>>> +							dequeue) :
>>>>>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>>>>>> +						      dequeue);
>>>>>>>      		if (entity)
>>>>>>>      			break;
>>>>>>>      	}
>>>>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>>>      EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>>>      /**
>>>>>>> - * drm_sched_main - main scheduler thread
>>>>>>> + * drm_sched_free_job_work - worker to call free_job
>>>>>>>       *
>>>>>>> - * @param: scheduler instance
>>>>>>> + * @w: free job work
>>>>>>>       */
>>>>>>> -static void drm_sched_main(struct work_struct *w)
>>>>>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>>>>>      {
>>>>>>>      	struct drm_gpu_scheduler *sched =
>>>>>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>>>>>> -	struct drm_sched_entity *entity;
>>>>>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>>>>>      	struct drm_sched_job *cleanup_job;
>>>>>>> -	int r;
>>>>>>>      	if (READ_ONCE(sched->pause_submit))
>>>>>>>      		return;
>>>>>>>      	cleanup_job = drm_sched_get_cleanup_job(sched);
>>>>>>> -	entity = drm_sched_select_entity(sched);
>>>>>>> +	if (cleanup_job) {
>>>>>>> +		sched->ops->free_job(cleanup_job);
>>>>>>> +
>>>>>>> +		drm_sched_free_job_queue_if_ready(sched);
>>>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>>>> +	}
>>>>>>> +}
>>>>>>> -	if (!entity && !cleanup_job)
>>>>>>> -		return;	/* No more work */
>>>>>>> +/**
>>>>>>> + * drm_sched_run_job_work - worker to call run_job
>>>>>>> + *
>>>>>>> + * @w: run job work
>>>>>>> + */
>>>>>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>>>>>> +{
>>>>>>> +	struct drm_gpu_scheduler *sched =
>>>>>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>>>>>> +	struct drm_sched_entity *entity;
>>>>>>> +	int r;
>>>>>>> -	if (cleanup_job)
>>>>>>> -		sched->ops->free_job(cleanup_job);
>>>>>>> +	if (READ_ONCE(sched->pause_submit))
>>>>>>> +		return;
>>>>>>> +	entity = drm_sched_select_entity(sched, true);
>>>>>>>      	if (entity) {
>>>>>>>      		struct dma_fence *fence;
>>>>>>>      		struct drm_sched_fence *s_fence;
>>>>>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>      		sched_job = drm_sched_entity_pop_job(entity);
>>>>>>>      		if (!sched_job) {
>>>>>>>      			complete_all(&entity->entity_idle);
>>>>>>> -			if (!cleanup_job)
>>>>>>> -				return;	/* No more work */
>>>>>>> -			goto again;
>>>>>>> +			return;	/* No more work */
>>>>>>>      		}
>>>>>>>      		s_fence = sched_job->s_fence;
>>>>>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>      		}
>>>>>>>      		wake_up(&sched->job_scheduled);
>>>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>>>>      	}
>>>>>>> -
>>>>>>> -again:
>>>>>>> -	drm_sched_submit_queue(sched);
>>>>>>>      }
>>>>>>>      /**
>>>>>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>>      	spin_lock_init(&sched->job_list_lock);
>>>>>>>      	atomic_set(&sched->hw_rq_count, 0);
>>>>>>>      	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>>>>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>>>>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>>>>>      	atomic_set(&sched->_score, 0);
>>>>>>>      	atomic64_set(&sched->job_id_count, 0);
>>>>>>>      	sched->pause_submit = false;
>>>>>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>>>>>      void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>>>>>      {
>>>>>>>      	WRITE_ONCE(sched->pause_submit, true);
>>>>>>> -	cancel_work_sync(&sched->work_submit);
>>>>>>> +	cancel_work_sync(&sched->work_run_job);
>>>>>>> +	cancel_work_sync(&sched->work_free_job);
>>>>>>>      }
>>>>>>>      EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>>>      void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>>>>>      {
>>>>>>>      	WRITE_ONCE(sched->pause_submit, false);
>>>>>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>>>>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>>>>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>>>      }
>>>>>>>      EXPORT_SYMBOL(drm_sched_submit_start);
>>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>>> index 04eec2d7635f..fbc083a92757 100644
>>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>>>>>       *                 finished.
>>>>>>>       * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>>>>>       * @job_id_count: used to assign unique id to the each job.
>>>>>>> - * @submit_wq: workqueue used to queue @work_submit
>>>>>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>>>>>       * @timeout_wq: workqueue used to queue @work_tdr
>>>>>>> - * @work_submit: schedules jobs and cleans up entities
>>>>>>> + * @work_run_job: schedules jobs
>>>>>>> + * @work_free_job: cleans up jobs
>>>>>>>       * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>>>       *            timeout interval is over.
>>>>>>>       * @pending_list: the list of jobs which are currently in the job queue.
>>>>>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>>>>>      	atomic64_t			job_id_count;
>>>>>>>      	struct workqueue_struct		*submit_wq;
>>>>>>>      	struct workqueue_struct		*timeout_wq;
>>>>>>> -	struct work_struct		work_submit;
>>>>>>> +	struct work_struct		work_run_job;
>>>>>>> +	struct work_struct		work_free_job;
>>>>>>>      	struct delayed_work		work_tdr;
>>>>>>>      	struct list_head		pending_list;
>>>>>>>      	spinlock_t			job_list_lock;
Matthew Brost Aug. 23, 2023, 3:24 p.m. UTC | #8
On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
> Am 23.08.23 um 05:27 schrieb Matthew Brost:
> > [SNIP]
> > > That is exactly what I want to avoid, tying the TDR to the job is what some
> > > AMD engineers pushed for because it looked like a simple solution and made
> > > the whole thing similar to what Windows does.
> > > 
> > > This turned the previous relatively clean scheduler and TDR design into a
> > > complete nightmare. The job contains quite a bunch of things which are not
> > > necessarily available after the application which submitted the job is torn
> > > down.
> > > 
> > Agree the TDR shouldn't be accessing anything application specific
> > rather just internal job state required to tear the job down on the
> > hardware.
> > > So what happens is that you either have stale pointers in the TDR which can
> > > go boom extremely easily or we somehow find a way to keep the necessary
> > I have not experenced the TDR going boom in Xe.
> > 
> > > structures (which include struct thread_info and struct file for this driver
> > > connection) alive until all submissions are completed.
> > > 
> > In Xe we keep everything alive until all submissions are completed. By
> > everything I mean the drm job, entity, scheduler, and VM via a reference
> > counting scheme. All of these structures are just kernel state which can
> > safely be accessed even if the application has been killed.
> 
> Yeah, but that might just not be such a good idea from memory management
> point of view.
> 
> When you (for example) kill a process all resource from that progress should
> at least be queued to be freed more or less immediately.
> 

We do this, the TDR kicks jobs off the hardware as fast as the hw
interface allows and signals all pending hw fences immediately after.
Free jobs then is immediately called and the reference count goes to
zero. I think max time for all of this to occur is a handful of ms.

> What Linux is doing for other I/O operations is to keep the relevant pages
> alive until the I/O operation is completed, but for GPUs that usually means
> keeping most of the memory of the process alive and that in turn is really
> not something you can do.
> 
> You can of course do this if your driver has a reliable way of killing your
> submissions and freeing resources in a reasonable amount of time. This
> should then be done in the flush callback.
> 

'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
and think that function doesn't even work for what I tell. It flushes
the spsc queue but what about jobs on the hardware, how do those get
killed?

As stated we do via the TDR which is rather clean design and fits with
our reference couting scheme.

> > If we need to teardown on demand we just set the TDR to a minimum value and
> > it kicks the jobs off the hardware, gracefully cleans everything up and
> > drops all references. This is a benefit of the 1 to 1 relationship, not
> > sure if this works with how AMDGPU uses the scheduler.
> > 
> > > Delaying application tear down is also not an option because then you run
> > > into massive trouble with the OOM killer (or more generally OOM handling).
> > > See what we do in drm_sched_entity_flush() as well.
> > > 
> > Not an issue for Xe, we never call drm_sched_entity_flush as our
> > referencing counting scheme is all jobs are finished before we attempt
> > to tear down entity / scheduler.
> 
> I don't think you can do that upstream. Calling drm_sched_entity_flush() is
> a must have from your flush callback for the file descriptor.
> 

Again 'flush callback'? What are you refering too.

And why does drm_sched_entity_flush need to be called, doesn't seem to
do anything useful.

> Unless you have some other method for killing your submissions this would
> give a path for a deny of service attack vector when the Xe driver is in
> use.
> 

Yes, once th TDR fires is disallows all new submissions at the exec
IOCTL plus flushes any pending submissions as fast as possible.

> > > Since adding the TDR support we completely exercised this through in the
> > > last two or three years or so. And to sum it up I would really like to get
> > > away from this mess again.
> > > 
> > > Compared to that what i915 does is actually rather clean I think.
> > > 
> > Not even close, resets where a nightmare in the i915 (I spend years
> > trying to get this right and probably still completely work) and in Xe
> > basically got it right on the attempt.
> > 
> > > >    Also in Xe some of
> > > > things done in free_job cannot be from an IRQ context, hence calling
> > > > this from the scheduler worker is rather helpful.
> > > Well putting things for cleanup into a workitem doesn't sounds like
> > > something hard.
> > > 
> > That is exactly what we doing in the scheduler with the free_job
> > workitem.
> 
> Yeah, but I think that we do it in the scheduler and not the driver is
> problematic.
>

Disagree, a common clean callback from a non-irq contexts IMO is a good
design rather than each driver possibly having its own worker for
cleanup.

> For the scheduler it shouldn't care about the job any more as soon as the
> driver takes over.
> 

This a massive rewrite for all users of the DRM scheduler, I'm saying
for Xe what you are suggesting makes little to no sense. 

I'd like other users of the DRM scheduler to chime in on what you
purposing. The scope of this change affects 8ish drivers that would
require buy in each of the stakeholders. I certainly can't change of
these drivers as I don't feel comfortable in all of those code bases nor
do I have hardware to test all of these drivers.

> > 
> > > Question is what do you really need for TDR which is not inside the hardware
> > > fence?
> > > 
> > A reference to the entity to be able to kick the job off the hardware.
> > A reference to the entity, job, and VM for error capture.
> > 
> > We also need a reference to the job for recovery after a GPU reset so
> > run_job can be called again for innocent jobs.
> 
> Well exactly that's what I'm massively pushing back. Letting the scheduler
> call run_job() for the same job again is *NOT* something you can actually
> do.
> 

But lots of drivers do this already and the DRM scheduler documents
this.

> This pretty clearly violates some of the dma_fence constrains and has cause
> massively headaches for me already.
> 

Seems to work fine in Xe.

> What you can do is to do this inside your driver, e.g. take the jobs and
> push them again to the hw ring or just tell the hw to start executing again
> from a previous position.
> 

Again this now is massive rewrite of many drivers.

> BTW that re-submitting of jobs seems to be a no-go from userspace
> perspective as well. Take a look at the Vulkan spec for that, at least Marek
> pretty much pointed out that we should absolutely not do this inside the
> kernel.
> 

Yes if the job causes the hang, we ban the queue. Typcially only per
entity (queue) resets are done in Xe but occasionally device level
resets are done (issues with hardware) and innocent jobs / entities call
run_job again.

> The generally right approach seems to be to cleanly signal to userspace that
> something bad happened and that userspace then needs to submit things again
> even for innocent jobs.
> 

I disagree that innocent jobs should be banned. What you are suggesting
is if a device reset needs to be done we kill / ban every user space queue.
Thats seems like overkill. Not seeing where that is stated in this doc
[1], it seems to imply that only jobs that are stuck results in bans.

Matt

[1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3

> Regards,
> Christian.
> 
> > 
> > All of this leads to believe we need to stick with the design.
> > 
> > Matt
> > 
> > > Regards,
> > > Christian.
> > > 
> > > > The HW fence can live for longer as it can be installed in dma-resv
> > > > slots, syncobjs, etc... If the job and hw fence are combined now we
> > > > holding on the memory for the longer and perhaps at the mercy of the
> > > > user. We also run the risk of the final put being done from an IRQ
> > > > context which again wont work in Xe as it is currently coded. Lastly 2
> > > > jobs from the same scheduler could do the final put in parallel, so
> > > > rather than having free_job serialized by the worker now multiple jobs
> > > > are freeing themselves at the same time. This might not be an issue but
> > > > adds another level of raceyness that needs to be accounted for. None of
> > > > this sounds desirable to me.
> > > > 
> > > > FWIW what you suggesting sounds like how the i915 did things
> > > > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > > > a huge mess. As rule of thumb I generally do the opposite of whatever
> > > > the i915 did.
> > > > 
> > > > Matt
> > > > 
> > > > > Christian.
> > > > > 
> > > > > > Matt
> > > > > > 
> > > > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > > > should push for fixing this design up again.
> > > > > > > 
> > > > > > > Regards,
> > > > > > > Christian.
> > > > > > > 
> > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > ---
> > > > > > > >      drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > > > >      include/drm/gpu_scheduler.h            |   8 +-
> > > > > > > >      2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > index cede47afc800..b67469eac179 100644
> > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > > > >       * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > > > >       *
> > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > >       *
> > > > > > > >       * Try to find a ready entity, returns NULL if none found.
> > > > > > > >       */
> > > > > > > >      static struct drm_sched_entity *
> > > > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > >      {
> > > > > > > >      	struct drm_sched_entity *entity;
> > > > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > >      	if (entity) {
> > > > > > > >      		list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > > > >      			if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > -				rq->current_entity = entity;
> > > > > > > > -				reinit_completion(&entity->entity_idle);
> > > > > > > > +				if (dequeue) {
> > > > > > > > +					rq->current_entity = entity;
> > > > > > > > +					reinit_completion(&entity->entity_idle);
> > > > > > > > +				}
> > > > > > > >      				spin_unlock(&rq->lock);
> > > > > > > >      				return entity;
> > > > > > > >      			}
> > > > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > >      	list_for_each_entry(entity, &rq->entities, list) {
> > > > > > > >      		if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > -			rq->current_entity = entity;
> > > > > > > > -			reinit_completion(&entity->entity_idle);
> > > > > > > > +			if (dequeue) {
> > > > > > > > +				rq->current_entity = entity;
> > > > > > > > +				reinit_completion(&entity->entity_idle);
> > > > > > > > +			}
> > > > > > > >      			spin_unlock(&rq->lock);
> > > > > > > >      			return entity;
> > > > > > > >      		}
> > > > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > >       * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > > > >       *
> > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > >       *
> > > > > > > >       * Find oldest waiting ready entity, returns NULL if none found.
> > > > > > > >       */
> > > > > > > >      static struct drm_sched_entity *
> > > > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > >      {
> > > > > > > >      	struct rb_node *rb;
> > > > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > >      		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > > > >      		if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > -			rq->current_entity = entity;
> > > > > > > > -			reinit_completion(&entity->entity_idle);
> > > > > > > > +			if (dequeue) {
> > > > > > > > +				rq->current_entity = entity;
> > > > > > > > +				reinit_completion(&entity->entity_idle);
> > > > > > > > +			}
> > > > > > > >      			break;
> > > > > > > >      		}
> > > > > > > >      	}
> > > > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > >      }
> > > > > > > >      /**
> > > > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > > > >       * @sched: scheduler instance
> > > > > > > >       */
> > > > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > >      {
> > > > > > > >      	if (!READ_ONCE(sched->pause_submit))
> > > > > > > > -		queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static struct drm_sched_entity *
> > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > > > +
> > > > > > > > +/**
> > > > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > > > + * @sched: scheduler instance
> > > > > > > > + */
> > > > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > +{
> > > > > > > > +	if (drm_sched_select_entity(sched, false))
> > > > > > > > +		drm_sched_run_job_queue(sched);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/**
> > > > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > > > + *
> > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > + */
> > > > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > +{
> > > > > > > > +	if (!READ_ONCE(sched->pause_submit))
> > > > > > > > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/**
> > > > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > > > + *
> > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > + */
> > > > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > +{
> > > > > > > > +	struct drm_sched_job *job;
> > > > > > > > +
> > > > > > > > +	spin_lock(&sched->job_list_lock);
> > > > > > > > +	job = list_first_entry_or_null(&sched->pending_list,
> > > > > > > > +				       struct drm_sched_job, list);
> > > > > > > > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > > > +		drm_sched_free_job_queue(sched);
> > > > > > > > +	spin_unlock(&sched->job_list_lock);
> > > > > > > >      }
> > > > > > > >      /**
> > > > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > > > >      	dma_fence_get(&s_fence->finished);
> > > > > > > >      	drm_sched_fence_finished(s_fence, result);
> > > > > > > >      	dma_fence_put(&s_fence->finished);
> > > > > > > > -	drm_sched_submit_queue(sched);
> > > > > > > > +	drm_sched_free_job_queue(sched);
> > > > > > > >      }
> > > > > > > >      /**
> > > > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > >      void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > >      {
> > > > > > > >      	if (drm_sched_can_queue(sched))
> > > > > > > > -		drm_sched_submit_queue(sched);
> > > > > > > > +		drm_sched_run_job_queue(sched);
> > > > > > > >      }
> > > > > > > >      /**
> > > > > > > >       * drm_sched_select_entity - Select next entity to process
> > > > > > > >       *
> > > > > > > >       * @sched: scheduler instance
> > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > >       *
> > > > > > > >       * Returns the entity to process or NULL if none are found.
> > > > > > > >       */
> > > > > > > >      static struct drm_sched_entity *
> > > > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > > > >      {
> > > > > > > >      	struct drm_sched_entity *entity;
> > > > > > > >      	int i;
> > > > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > >      	/* Kernel run queue has higher priority than normal run queue*/
> > > > > > > >      	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > > > >      		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > > > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > > > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > > > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > > > +							dequeue) :
> > > > > > > > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > > > +						      dequeue);
> > > > > > > >      		if (entity)
> > > > > > > >      			break;
> > > > > > > >      	}
> > > > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > >      EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > >      /**
> > > > > > > > - * drm_sched_main - main scheduler thread
> > > > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > > > >       *
> > > > > > > > - * @param: scheduler instance
> > > > > > > > + * @w: free job work
> > > > > > > >       */
> > > > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > > > >      {
> > > > > > > >      	struct drm_gpu_scheduler *sched =
> > > > > > > > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > > > -	struct drm_sched_entity *entity;
> > > > > > > > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > > > >      	struct drm_sched_job *cleanup_job;
> > > > > > > > -	int r;
> > > > > > > >      	if (READ_ONCE(sched->pause_submit))
> > > > > > > >      		return;
> > > > > > > >      	cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > -	entity = drm_sched_select_entity(sched);
> > > > > > > > +	if (cleanup_job) {
> > > > > > > > +		sched->ops->free_job(cleanup_job);
> > > > > > > > +
> > > > > > > > +		drm_sched_free_job_queue_if_ready(sched);
> > > > > > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > +	}
> > > > > > > > +}
> > > > > > > > -	if (!entity && !cleanup_job)
> > > > > > > > -		return;	/* No more work */
> > > > > > > > +/**
> > > > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > > > + *
> > > > > > > > + * @w: run job work
> > > > > > > > + */
> > > > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > > > +{
> > > > > > > > +	struct drm_gpu_scheduler *sched =
> > > > > > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > > > +	struct drm_sched_entity *entity;
> > > > > > > > +	int r;
> > > > > > > > -	if (cleanup_job)
> > > > > > > > -		sched->ops->free_job(cleanup_job);
> > > > > > > > +	if (READ_ONCE(sched->pause_submit))
> > > > > > > > +		return;
> > > > > > > > +	entity = drm_sched_select_entity(sched, true);
> > > > > > > >      	if (entity) {
> > > > > > > >      		struct dma_fence *fence;
> > > > > > > >      		struct drm_sched_fence *s_fence;
> > > > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > >      		sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > >      		if (!sched_job) {
> > > > > > > >      			complete_all(&entity->entity_idle);
> > > > > > > > -			if (!cleanup_job)
> > > > > > > > -				return;	/* No more work */
> > > > > > > > -			goto again;
> > > > > > > > +			return;	/* No more work */
> > > > > > > >      		}
> > > > > > > >      		s_fence = sched_job->s_fence;
> > > > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > >      		}
> > > > > > > >      		wake_up(&sched->job_scheduled);
> > > > > > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > > > > >      	}
> > > > > > > > -
> > > > > > > > -again:
> > > > > > > > -	drm_sched_submit_queue(sched);
> > > > > > > >      }
> > > > > > > >      /**
> > > > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > >      	spin_lock_init(&sched->job_list_lock);
> > > > > > > >      	atomic_set(&sched->hw_rq_count, 0);
> > > > > > > >      	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > > > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > > > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > > > >      	atomic_set(&sched->_score, 0);
> > > > > > > >      	atomic64_set(&sched->job_id_count, 0);
> > > > > > > >      	sched->pause_submit = false;
> > > > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > > > >      void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > > > >      {
> > > > > > > >      	WRITE_ONCE(sched->pause_submit, true);
> > > > > > > > -	cancel_work_sync(&sched->work_submit);
> > > > > > > > +	cancel_work_sync(&sched->work_run_job);
> > > > > > > > +	cancel_work_sync(&sched->work_free_job);
> > > > > > > >      }
> > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > >      void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > > > >      {
> > > > > > > >      	WRITE_ONCE(sched->pause_submit, false);
> > > > > > > > -	queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > +	queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > >      }
> > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > > > >       *                 finished.
> > > > > > > >       * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > > > >       * @job_id_count: used to assign unique id to the each job.
> > > > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > > > >       * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > > > + * @work_run_job: schedules jobs
> > > > > > > > + * @work_free_job: cleans up jobs
> > > > > > > >       * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > >       *            timeout interval is over.
> > > > > > > >       * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > > > >      	atomic64_t			job_id_count;
> > > > > > > >      	struct workqueue_struct		*submit_wq;
> > > > > > > >      	struct workqueue_struct		*timeout_wq;
> > > > > > > > -	struct work_struct		work_submit;
> > > > > > > > +	struct work_struct		work_run_job;
> > > > > > > > +	struct work_struct		work_free_job;
> > > > > > > >      	struct delayed_work		work_tdr;
> > > > > > > >      	struct list_head		pending_list;
> > > > > > > >      	spinlock_t			job_list_lock;
>
Alex Deucher Aug. 23, 2023, 3:41 p.m. UTC | #9
On Wed, Aug 23, 2023 at 11:26 AM Matthew Brost <matthew.brost@intel.com> wrote:
>
> On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
> > Am 23.08.23 um 05:27 schrieb Matthew Brost:
> > > [SNIP]
> > > > That is exactly what I want to avoid, tying the TDR to the job is what some
> > > > AMD engineers pushed for because it looked like a simple solution and made
> > > > the whole thing similar to what Windows does.
> > > >
> > > > This turned the previous relatively clean scheduler and TDR design into a
> > > > complete nightmare. The job contains quite a bunch of things which are not
> > > > necessarily available after the application which submitted the job is torn
> > > > down.
> > > >
> > > Agree the TDR shouldn't be accessing anything application specific
> > > rather just internal job state required to tear the job down on the
> > > hardware.
> > > > So what happens is that you either have stale pointers in the TDR which can
> > > > go boom extremely easily or we somehow find a way to keep the necessary
> > > I have not experenced the TDR going boom in Xe.
> > >
> > > > structures (which include struct thread_info and struct file for this driver
> > > > connection) alive until all submissions are completed.
> > > >
> > > In Xe we keep everything alive until all submissions are completed. By
> > > everything I mean the drm job, entity, scheduler, and VM via a reference
> > > counting scheme. All of these structures are just kernel state which can
> > > safely be accessed even if the application has been killed.
> >
> > Yeah, but that might just not be such a good idea from memory management
> > point of view.
> >
> > When you (for example) kill a process all resource from that progress should
> > at least be queued to be freed more or less immediately.
> >
>
> We do this, the TDR kicks jobs off the hardware as fast as the hw
> interface allows and signals all pending hw fences immediately after.
> Free jobs then is immediately called and the reference count goes to
> zero. I think max time for all of this to occur is a handful of ms.
>
> > What Linux is doing for other I/O operations is to keep the relevant pages
> > alive until the I/O operation is completed, but for GPUs that usually means
> > keeping most of the memory of the process alive and that in turn is really
> > not something you can do.
> >
> > You can of course do this if your driver has a reliable way of killing your
> > submissions and freeing resources in a reasonable amount of time. This
> > should then be done in the flush callback.
> >
>
> 'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
> and think that function doesn't even work for what I tell. It flushes
> the spsc queue but what about jobs on the hardware, how do those get
> killed?
>
> As stated we do via the TDR which is rather clean design and fits with
> our reference couting scheme.
>
> > > If we need to teardown on demand we just set the TDR to a minimum value and
> > > it kicks the jobs off the hardware, gracefully cleans everything up and
> > > drops all references. This is a benefit of the 1 to 1 relationship, not
> > > sure if this works with how AMDGPU uses the scheduler.
> > >
> > > > Delaying application tear down is also not an option because then you run
> > > > into massive trouble with the OOM killer (or more generally OOM handling).
> > > > See what we do in drm_sched_entity_flush() as well.
> > > >
> > > Not an issue for Xe, we never call drm_sched_entity_flush as our
> > > referencing counting scheme is all jobs are finished before we attempt
> > > to tear down entity / scheduler.
> >
> > I don't think you can do that upstream. Calling drm_sched_entity_flush() is
> > a must have from your flush callback for the file descriptor.
> >
>
> Again 'flush callback'? What are you refering too.
>
> And why does drm_sched_entity_flush need to be called, doesn't seem to
> do anything useful.
>
> > Unless you have some other method for killing your submissions this would
> > give a path for a deny of service attack vector when the Xe driver is in
> > use.
> >
>
> Yes, once th TDR fires is disallows all new submissions at the exec
> IOCTL plus flushes any pending submissions as fast as possible.
>
> > > > Since adding the TDR support we completely exercised this through in the
> > > > last two or three years or so. And to sum it up I would really like to get
> > > > away from this mess again.
> > > >
> > > > Compared to that what i915 does is actually rather clean I think.
> > > >
> > > Not even close, resets where a nightmare in the i915 (I spend years
> > > trying to get this right and probably still completely work) and in Xe
> > > basically got it right on the attempt.
> > >
> > > > >    Also in Xe some of
> > > > > things done in free_job cannot be from an IRQ context, hence calling
> > > > > this from the scheduler worker is rather helpful.
> > > > Well putting things for cleanup into a workitem doesn't sounds like
> > > > something hard.
> > > >
> > > That is exactly what we doing in the scheduler with the free_job
> > > workitem.
> >
> > Yeah, but I think that we do it in the scheduler and not the driver is
> > problematic.
> >
>
> Disagree, a common clean callback from a non-irq contexts IMO is a good
> design rather than each driver possibly having its own worker for
> cleanup.
>
> > For the scheduler it shouldn't care about the job any more as soon as the
> > driver takes over.
> >
>
> This a massive rewrite for all users of the DRM scheduler, I'm saying
> for Xe what you are suggesting makes little to no sense.
>
> I'd like other users of the DRM scheduler to chime in on what you
> purposing. The scope of this change affects 8ish drivers that would
> require buy in each of the stakeholders. I certainly can't change of
> these drivers as I don't feel comfortable in all of those code bases nor
> do I have hardware to test all of these drivers.
>
> > >
> > > > Question is what do you really need for TDR which is not inside the hardware
> > > > fence?
> > > >
> > > A reference to the entity to be able to kick the job off the hardware.
> > > A reference to the entity, job, and VM for error capture.
> > >
> > > We also need a reference to the job for recovery after a GPU reset so
> > > run_job can be called again for innocent jobs.
> >
> > Well exactly that's what I'm massively pushing back. Letting the scheduler
> > call run_job() for the same job again is *NOT* something you can actually
> > do.
> >
>
> But lots of drivers do this already and the DRM scheduler documents
> this.
>
> > This pretty clearly violates some of the dma_fence constrains and has cause
> > massively headaches for me already.
> >
>
> Seems to work fine in Xe.
>
> > What you can do is to do this inside your driver, e.g. take the jobs and
> > push them again to the hw ring or just tell the hw to start executing again
> > from a previous position.
> >
>
> Again this now is massive rewrite of many drivers.
>
> > BTW that re-submitting of jobs seems to be a no-go from userspace
> > perspective as well. Take a look at the Vulkan spec for that, at least Marek
> > pretty much pointed out that we should absolutely not do this inside the
> > kernel.
> >
>
> Yes if the job causes the hang, we ban the queue. Typcially only per
> entity (queue) resets are done in Xe but occasionally device level
> resets are done (issues with hardware) and innocent jobs / entities call
> run_job again.

If the engine is reset and the job was already executing, how can you
determine that it's in a good state to resubmit?  What if some
internal fence or semaphore in memory used by the logic in the command
buffer has been signaled already and then you resubmit the job and it
now starts executing with different input state?

Alex

>
> > The generally right approach seems to be to cleanly signal to userspace that
> > something bad happened and that userspace then needs to submit things again
> > even for innocent jobs.
> >
>
> I disagree that innocent jobs should be banned. What you are suggesting
> is if a device reset needs to be done we kill / ban every user space queue.
> Thats seems like overkill. Not seeing where that is stated in this doc
> [1], it seems to imply that only jobs that are stuck results in bans.
>
> Matt
>
> [1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3
>
> > Regards,
> > Christian.
> >
> > >
> > > All of this leads to believe we need to stick with the design.
> > >
> > > Matt
> > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > > The HW fence can live for longer as it can be installed in dma-resv
> > > > > slots, syncobjs, etc... If the job and hw fence are combined now we
> > > > > holding on the memory for the longer and perhaps at the mercy of the
> > > > > user. We also run the risk of the final put being done from an IRQ
> > > > > context which again wont work in Xe as it is currently coded. Lastly 2
> > > > > jobs from the same scheduler could do the final put in parallel, so
> > > > > rather than having free_job serialized by the worker now multiple jobs
> > > > > are freeing themselves at the same time. This might not be an issue but
> > > > > adds another level of raceyness that needs to be accounted for. None of
> > > > > this sounds desirable to me.
> > > > >
> > > > > FWIW what you suggesting sounds like how the i915 did things
> > > > > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > > > > a huge mess. As rule of thumb I generally do the opposite of whatever
> > > > > the i915 did.
> > > > >
> > > > > Matt
> > > > >
> > > > > > Christian.
> > > > > >
> > > > > > > Matt
> > > > > > >
> > > > > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > > > > should push for fixing this design up again.
> > > > > > > >
> > > > > > > > Regards,
> > > > > > > > Christian.
> > > > > > > >
> > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > ---
> > > > > > > > >      drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > > > > >      include/drm/gpu_scheduler.h            |   8 +-
> > > > > > > > >      2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > index cede47afc800..b67469eac179 100644
> > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > > > > >       * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > > > > >       *
> > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > >       *
> > > > > > > > >       * Try to find a ready entity, returns NULL if none found.
> > > > > > > > >       */
> > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > >      {
> > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > >         if (entity) {
> > > > > > > > >                 list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > > > > >                         if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > -                               rq->current_entity = entity;
> > > > > > > > > -                               reinit_completion(&entity->entity_idle);
> > > > > > > > > +                               if (dequeue) {
> > > > > > > > > +                                       rq->current_entity = entity;
> > > > > > > > > +                                       reinit_completion(&entity->entity_idle);
> > > > > > > > > +                               }
> > > > > > > > >                                 spin_unlock(&rq->lock);
> > > > > > > > >                                 return entity;
> > > > > > > > >                         }
> > > > > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > >         list_for_each_entry(entity, &rq->entities, list) {
> > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > +                       if (dequeue) {
> > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > +                       }
> > > > > > > > >                         spin_unlock(&rq->lock);
> > > > > > > > >                         return entity;
> > > > > > > > >                 }
> > > > > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > >       * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > > > > >       *
> > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > >       *
> > > > > > > > >       * Find oldest waiting ready entity, returns NULL if none found.
> > > > > > > > >       */
> > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > >      {
> > > > > > > > >         struct rb_node *rb;
> > > > > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > >                 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > +                       if (dequeue) {
> > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > +                       }
> > > > > > > > >                         break;
> > > > > > > > >                 }
> > > > > > > > >         }
> > > > > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > >      }
> > > > > > > > >      /**
> > > > > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > > > > >       * @sched: scheduler instance
> > > > > > > > >       */
> > > > > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > >      {
> > > > > > > > >         if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > -               queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct drm_sched_entity *
> > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > > > > +
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > + */
> > > > > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > +       if (drm_sched_select_entity(sched, false))
> > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > > > > + *
> > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > + */
> > > > > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > +       if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > > > > + *
> > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > + */
> > > > > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > +       struct drm_sched_job *job;
> > > > > > > > > +
> > > > > > > > > +       spin_lock(&sched->job_list_lock);
> > > > > > > > > +       job = list_first_entry_or_null(&sched->pending_list,
> > > > > > > > > +                                      struct drm_sched_job, list);
> > > > > > > > > +       if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > > > > +               drm_sched_free_job_queue(sched);
> > > > > > > > > +       spin_unlock(&sched->job_list_lock);
> > > > > > > > >      }
> > > > > > > > >      /**
> > > > > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > > > > >         dma_fence_get(&s_fence->finished);
> > > > > > > > >         drm_sched_fence_finished(s_fence, result);
> > > > > > > > >         dma_fence_put(&s_fence->finished);
> > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > +       drm_sched_free_job_queue(sched);
> > > > > > > > >      }
> > > > > > > > >      /**
> > > > > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > >      void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > >      {
> > > > > > > > >         if (drm_sched_can_queue(sched))
> > > > > > > > > -               drm_sched_submit_queue(sched);
> > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > >      }
> > > > > > > > >      /**
> > > > > > > > >       * drm_sched_select_entity - Select next entity to process
> > > > > > > > >       *
> > > > > > > > >       * @sched: scheduler instance
> > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > >       *
> > > > > > > > >       * Returns the entity to process or NULL if none are found.
> > > > > > > > >       */
> > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > > > > >      {
> > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > >         int i;
> > > > > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > >         /* Kernel run queue has higher priority than normal run queue*/
> > > > > > > > >         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > > > > >                 entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > > > > -                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > > > > -                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > > > > +                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > > > > +                                                       dequeue) :
> > > > > > > > > +                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > > > > +                                                     dequeue);
> > > > > > > > >                 if (entity)
> > > > > > > > >                         break;
> > > > > > > > >         }
> > > > > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > > >      EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > > >      /**
> > > > > > > > > - * drm_sched_main - main scheduler thread
> > > > > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > > > > >       *
> > > > > > > > > - * @param: scheduler instance
> > > > > > > > > + * @w: free job work
> > > > > > > > >       */
> > > > > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > > > > >      {
> > > > > > > > >         struct drm_gpu_scheduler *sched =
> > > > > > > > > -               container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > > > > -       struct drm_sched_entity *entity;
> > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > > > > >         struct drm_sched_job *cleanup_job;
> > > > > > > > > -       int r;
> > > > > > > > >         if (READ_ONCE(sched->pause_submit))
> > > > > > > > >                 return;
> > > > > > > > >         cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > > -       entity = drm_sched_select_entity(sched);
> > > > > > > > > +       if (cleanup_job) {
> > > > > > > > > +               sched->ops->free_job(cleanup_job);
> > > > > > > > > +
> > > > > > > > > +               drm_sched_free_job_queue_if_ready(sched);
> > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > +       }
> > > > > > > > > +}
> > > > > > > > > -       if (!entity && !cleanup_job)
> > > > > > > > > -               return; /* No more work */
> > > > > > > > > +/**
> > > > > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > > > > + *
> > > > > > > > > + * @w: run job work
> > > > > > > > > + */
> > > > > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > > > > +{
> > > > > > > > > +       struct drm_gpu_scheduler *sched =
> > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > > > > +       struct drm_sched_entity *entity;
> > > > > > > > > +       int r;
> > > > > > > > > -       if (cleanup_job)
> > > > > > > > > -               sched->ops->free_job(cleanup_job);
> > > > > > > > > +       if (READ_ONCE(sched->pause_submit))
> > > > > > > > > +               return;
> > > > > > > > > +       entity = drm_sched_select_entity(sched, true);
> > > > > > > > >         if (entity) {
> > > > > > > > >                 struct dma_fence *fence;
> > > > > > > > >                 struct drm_sched_fence *s_fence;
> > > > > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > >                 sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > > >                 if (!sched_job) {
> > > > > > > > >                         complete_all(&entity->entity_idle);
> > > > > > > > > -                       if (!cleanup_job)
> > > > > > > > > -                               return; /* No more work */
> > > > > > > > > -                       goto again;
> > > > > > > > > +                       return; /* No more work */
> > > > > > > > >                 }
> > > > > > > > >                 s_fence = sched_job->s_fence;
> > > > > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > >                 }
> > > > > > > > >                 wake_up(&sched->job_scheduled);
> > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > >         }
> > > > > > > > > -
> > > > > > > > > -again:
> > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > >      }
> > > > > > > > >      /**
> > > > > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > >         spin_lock_init(&sched->job_list_lock);
> > > > > > > > >         atomic_set(&sched->hw_rq_count, 0);
> > > > > > > > >         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > > -       INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > > > > +       INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > > > > +       INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > > > > >         atomic_set(&sched->_score, 0);
> > > > > > > > >         atomic64_set(&sched->job_id_count, 0);
> > > > > > > > >         sched->pause_submit = false;
> > > > > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > > > > >      void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > > > > >      {
> > > > > > > > >         WRITE_ONCE(sched->pause_submit, true);
> > > > > > > > > -       cancel_work_sync(&sched->work_submit);
> > > > > > > > > +       cancel_work_sync(&sched->work_run_job);
> > > > > > > > > +       cancel_work_sync(&sched->work_free_job);
> > > > > > > > >      }
> > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > >      void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > > > > >      {
> > > > > > > > >         WRITE_ONCE(sched->pause_submit, false);
> > > > > > > > > -       queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > >      }
> > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > > > > >       *                 finished.
> > > > > > > > >       * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > > > > >       * @job_id_count: used to assign unique id to the each job.
> > > > > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > > > > >       * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > > > > + * @work_run_job: schedules jobs
> > > > > > > > > + * @work_free_job: cleans up jobs
> > > > > > > > >       * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > > >       *            timeout interval is over.
> > > > > > > > >       * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > > > > >         atomic64_t                      job_id_count;
> > > > > > > > >         struct workqueue_struct         *submit_wq;
> > > > > > > > >         struct workqueue_struct         *timeout_wq;
> > > > > > > > > -       struct work_struct              work_submit;
> > > > > > > > > +       struct work_struct              work_run_job;
> > > > > > > > > +       struct work_struct              work_free_job;
> > > > > > > > >         struct delayed_work             work_tdr;
> > > > > > > > >         struct list_head                pending_list;
> > > > > > > > >         spinlock_t                      job_list_lock;
> >
Rodrigo Vivi Aug. 23, 2023, 5:26 p.m. UTC | #10
On Wed, Aug 23, 2023 at 11:41:19AM -0400, Alex Deucher wrote:
> On Wed, Aug 23, 2023 at 11:26 AM Matthew Brost <matthew.brost@intel.com> wrote:
> >
> > On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
> > > Am 23.08.23 um 05:27 schrieb Matthew Brost:
> > > > [SNIP]
> > > > > That is exactly what I want to avoid, tying the TDR to the job is what some
> > > > > AMD engineers pushed for because it looked like a simple solution and made
> > > > > the whole thing similar to what Windows does.
> > > > >
> > > > > This turned the previous relatively clean scheduler and TDR design into a
> > > > > complete nightmare. The job contains quite a bunch of things which are not
> > > > > necessarily available after the application which submitted the job is torn
> > > > > down.
> > > > >
> > > > Agree the TDR shouldn't be accessing anything application specific
> > > > rather just internal job state required to tear the job down on the
> > > > hardware.
> > > > > So what happens is that you either have stale pointers in the TDR which can
> > > > > go boom extremely easily or we somehow find a way to keep the necessary
> > > > I have not experenced the TDR going boom in Xe.
> > > >
> > > > > structures (which include struct thread_info and struct file for this driver
> > > > > connection) alive until all submissions are completed.
> > > > >
> > > > In Xe we keep everything alive until all submissions are completed. By
> > > > everything I mean the drm job, entity, scheduler, and VM via a reference
> > > > counting scheme. All of these structures are just kernel state which can
> > > > safely be accessed even if the application has been killed.
> > >
> > > Yeah, but that might just not be such a good idea from memory management
> > > point of view.
> > >
> > > When you (for example) kill a process all resource from that progress should
> > > at least be queued to be freed more or less immediately.
> > >
> >
> > We do this, the TDR kicks jobs off the hardware as fast as the hw
> > interface allows and signals all pending hw fences immediately after.
> > Free jobs then is immediately called and the reference count goes to
> > zero. I think max time for all of this to occur is a handful of ms.
> >
> > > What Linux is doing for other I/O operations is to keep the relevant pages
> > > alive until the I/O operation is completed, but for GPUs that usually means
> > > keeping most of the memory of the process alive and that in turn is really
> > > not something you can do.
> > >
> > > You can of course do this if your driver has a reliable way of killing your
> > > submissions and freeing resources in a reasonable amount of time. This
> > > should then be done in the flush callback.
> > >
> >
> > 'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
> > and think that function doesn't even work for what I tell. It flushes
> > the spsc queue but what about jobs on the hardware, how do those get
> > killed?
> >
> > As stated we do via the TDR which is rather clean design and fits with
> > our reference couting scheme.
> >
> > > > If we need to teardown on demand we just set the TDR to a minimum value and
> > > > it kicks the jobs off the hardware, gracefully cleans everything up and
> > > > drops all references. This is a benefit of the 1 to 1 relationship, not
> > > > sure if this works with how AMDGPU uses the scheduler.
> > > >
> > > > > Delaying application tear down is also not an option because then you run
> > > > > into massive trouble with the OOM killer (or more generally OOM handling).
> > > > > See what we do in drm_sched_entity_flush() as well.
> > > > >
> > > > Not an issue for Xe, we never call drm_sched_entity_flush as our
> > > > referencing counting scheme is all jobs are finished before we attempt
> > > > to tear down entity / scheduler.
> > >
> > > I don't think you can do that upstream. Calling drm_sched_entity_flush() is
> > > a must have from your flush callback for the file descriptor.
> > >
> >
> > Again 'flush callback'? What are you refering too.
> >
> > And why does drm_sched_entity_flush need to be called, doesn't seem to
> > do anything useful.
> >
> > > Unless you have some other method for killing your submissions this would
> > > give a path for a deny of service attack vector when the Xe driver is in
> > > use.
> > >
> >
> > Yes, once th TDR fires is disallows all new submissions at the exec
> > IOCTL plus flushes any pending submissions as fast as possible.
> >
> > > > > Since adding the TDR support we completely exercised this through in the
> > > > > last two or three years or so. And to sum it up I would really like to get
> > > > > away from this mess again.
> > > > >
> > > > > Compared to that what i915 does is actually rather clean I think.
> > > > >
> > > > Not even close, resets where a nightmare in the i915 (I spend years
> > > > trying to get this right and probably still completely work) and in Xe
> > > > basically got it right on the attempt.
> > > >
> > > > > >    Also in Xe some of
> > > > > > things done in free_job cannot be from an IRQ context, hence calling
> > > > > > this from the scheduler worker is rather helpful.
> > > > > Well putting things for cleanup into a workitem doesn't sounds like
> > > > > something hard.
> > > > >
> > > > That is exactly what we doing in the scheduler with the free_job
> > > > workitem.
> > >
> > > Yeah, but I think that we do it in the scheduler and not the driver is
> > > problematic.

Christian, I do see your point on simply get rid of free job callbacks here
then use fence with own-driver workqueue and house cleaning. But I wonder if
starting with this patch as a clear separation of that is not a step forward
and that could be cleaned up on a follow up!?

Matt, why exactly do we need the separation in this patch? Commit message tells
what it is doing and that it is aligned with design, but is not clear on why
exactly we need this right now. Specially if in the end what we want is exactly
keeping the submit_wq to ensure the serialization of the operations you mentioned.
I mean, could we simply drop this patch and then work on a follow-up later and
investigate the Christian suggestion when we are in-tree?

> > >
> >
> > Disagree, a common clean callback from a non-irq contexts IMO is a good
> > design rather than each driver possibly having its own worker for
> > cleanup.
> >
> > > For the scheduler it shouldn't care about the job any more as soon as the
> > > driver takes over.
> > >
> >
> > This a massive rewrite for all users of the DRM scheduler, I'm saying
> > for Xe what you are suggesting makes little to no sense.
> >
> > I'd like other users of the DRM scheduler to chime in on what you
> > purposing. The scope of this change affects 8ish drivers that would
> > require buy in each of the stakeholders. I certainly can't change of
> > these drivers as I don't feel comfortable in all of those code bases nor
> > do I have hardware to test all of these drivers.
> >
> > > >
> > > > > Question is what do you really need for TDR which is not inside the hardware
> > > > > fence?
> > > > >
> > > > A reference to the entity to be able to kick the job off the hardware.
> > > > A reference to the entity, job, and VM for error capture.
> > > >
> > > > We also need a reference to the job for recovery after a GPU reset so
> > > > run_job can be called again for innocent jobs.
> > >
> > > Well exactly that's what I'm massively pushing back. Letting the scheduler
> > > call run_job() for the same job again is *NOT* something you can actually
> > > do.
> > >
> >
> > But lots of drivers do this already and the DRM scheduler documents
> > this.
> >
> > > This pretty clearly violates some of the dma_fence constrains and has cause
> > > massively headaches for me already.
> > >
> >
> > Seems to work fine in Xe.
> >
> > > What you can do is to do this inside your driver, e.g. take the jobs and
> > > push them again to the hw ring or just tell the hw to start executing again
> > > from a previous position.
> > >
> >
> > Again this now is massive rewrite of many drivers.
> >
> > > BTW that re-submitting of jobs seems to be a no-go from userspace
> > > perspective as well. Take a look at the Vulkan spec for that, at least Marek
> > > pretty much pointed out that we should absolutely not do this inside the
> > > kernel.
> > >
> >
> > Yes if the job causes the hang, we ban the queue. Typcially only per
> > entity (queue) resets are done in Xe but occasionally device level
> > resets are done (issues with hardware) and innocent jobs / entities call
> > run_job again.
> 
> If the engine is reset and the job was already executing, how can you
> determine that it's in a good state to resubmit?  What if some
> internal fence or semaphore in memory used by the logic in the command
> buffer has been signaled already and then you resubmit the job and it
> now starts executing with different input state?

I believe we could set some more rules in the new robustness documentation:
https://lore.kernel.org/all/20230818200642.276735-1-andrealmeid@igalia.com/

For this robustness implementation i915 pin point the exact context that
was in execution when the gpu hang and only blame that, although the
ressubmission is up to the user space. While on Xe we are blaming every
single context that was in the queue. So I'm actually confused on what
are the innocent jobs and who are calling for resubmission, if all of
them got banned and blamed.

> 
> Alex
> 
> >
> > > The generally right approach seems to be to cleanly signal to userspace that
> > > something bad happened and that userspace then needs to submit things again
> > > even for innocent jobs.
> > >
> >
> > I disagree that innocent jobs should be banned. What you are suggesting
> > is if a device reset needs to be done we kill / ban every user space queue.
> > Thats seems like overkill. Not seeing where that is stated in this doc
> > [1], it seems to imply that only jobs that are stuck results in bans.
> >
> > Matt
> >
> > [1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3
> >
> > > Regards,
> > > Christian.
> > >
> > > >
> > > > All of this leads to believe we need to stick with the design.
> > > >
> > > > Matt
> > > >
> > > > > Regards,
> > > > > Christian.
> > > > >
> > > > > > The HW fence can live for longer as it can be installed in dma-resv
> > > > > > slots, syncobjs, etc... If the job and hw fence are combined now we
> > > > > > holding on the memory for the longer and perhaps at the mercy of the
> > > > > > user. We also run the risk of the final put being done from an IRQ
> > > > > > context which again wont work in Xe as it is currently coded. Lastly 2
> > > > > > jobs from the same scheduler could do the final put in parallel, so
> > > > > > rather than having free_job serialized by the worker now multiple jobs
> > > > > > are freeing themselves at the same time. This might not be an issue but
> > > > > > adds another level of raceyness that needs to be accounted for. None of
> > > > > > this sounds desirable to me.
> > > > > >
> > > > > > FWIW what you suggesting sounds like how the i915 did things
> > > > > > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > > > > > a huge mess. As rule of thumb I generally do the opposite of whatever
> > > > > > the i915 did.
> > > > > >
> > > > > > Matt
> > > > > >
> > > > > > > Christian.
> > > > > > >
> > > > > > > > Matt
> > > > > > > >
> > > > > > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > > > > > should push for fixing this design up again.
> > > > > > > > >
> > > > > > > > > Regards,
> > > > > > > > > Christian.
> > > > > > > > >
> > > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > > ---
> > > > > > > > > >      drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > > > > > >      include/drm/gpu_scheduler.h            |   8 +-
> > > > > > > > > >      2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > index cede47afc800..b67469eac179 100644
> > > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > > > > > >       * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > > > > > >       *
> > > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > >       *
> > > > > > > > > >       * Try to find a ready entity, returns NULL if none found.
> > > > > > > > > >       */
> > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > >      {
> > > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > >         if (entity) {
> > > > > > > > > >                 list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > > > > > >                         if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > -                               rq->current_entity = entity;
> > > > > > > > > > -                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                               if (dequeue) {
> > > > > > > > > > +                                       rq->current_entity = entity;
> > > > > > > > > > +                                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                               }
> > > > > > > > > >                                 spin_unlock(&rq->lock);
> > > > > > > > > >                                 return entity;
> > > > > > > > > >                         }
> > > > > > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > >         list_for_each_entry(entity, &rq->entities, list) {
> > > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                       }
> > > > > > > > > >                         spin_unlock(&rq->lock);
> > > > > > > > > >                         return entity;
> > > > > > > > > >                 }
> > > > > > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > >       * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > > > > > >       *
> > > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > >       *
> > > > > > > > > >       * Find oldest waiting ready entity, returns NULL if none found.
> > > > > > > > > >       */
> > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > >      {
> > > > > > > > > >         struct rb_node *rb;
> > > > > > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > >                 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > +                       }
> > > > > > > > > >                         break;
> > > > > > > > > >                 }
> > > > > > > > > >         }
> > > > > > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > >      }
> > > > > > > > > >      /**
> > > > > > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > > > > > >       * @sched: scheduler instance
> > > > > > > > > >       */
> > > > > > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > >      {
> > > > > > > > > >         if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > -               queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static struct drm_sched_entity *
> > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > > > > > +
> > > > > > > > > > +/**
> > > > > > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > > + */
> > > > > > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > +{
> > > > > > > > > > +       if (drm_sched_select_entity(sched, false))
> > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +/**
> > > > > > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > > > > > + *
> > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > + */
> > > > > > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > +{
> > > > > > > > > > +       if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +/**
> > > > > > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > > > > > + *
> > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > + */
> > > > > > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > +{
> > > > > > > > > > +       struct drm_sched_job *job;
> > > > > > > > > > +
> > > > > > > > > > +       spin_lock(&sched->job_list_lock);
> > > > > > > > > > +       job = list_first_entry_or_null(&sched->pending_list,
> > > > > > > > > > +                                      struct drm_sched_job, list);
> > > > > > > > > > +       if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > > > > > +               drm_sched_free_job_queue(sched);
> > > > > > > > > > +       spin_unlock(&sched->job_list_lock);
> > > > > > > > > >      }
> > > > > > > > > >      /**
> > > > > > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > > > > > >         dma_fence_get(&s_fence->finished);
> > > > > > > > > >         drm_sched_fence_finished(s_fence, result);
> > > > > > > > > >         dma_fence_put(&s_fence->finished);
> > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > > +       drm_sched_free_job_queue(sched);
> > > > > > > > > >      }
> > > > > > > > > >      /**
> > > > > > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > >      void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > >      {
> > > > > > > > > >         if (drm_sched_can_queue(sched))
> > > > > > > > > > -               drm_sched_submit_queue(sched);
> > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > >      }
> > > > > > > > > >      /**
> > > > > > > > > >       * drm_sched_select_entity - Select next entity to process
> > > > > > > > > >       *
> > > > > > > > > >       * @sched: scheduler instance
> > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > >       *
> > > > > > > > > >       * Returns the entity to process or NULL if none are found.
> > > > > > > > > >       */
> > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > > > > > >      {
> > > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > > >         int i;
> > > > > > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > >         /* Kernel run queue has higher priority than normal run queue*/
> > > > > > > > > >         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > > > > > >                 entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > > > > > -                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > > > > > -                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > > > > > +                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > > > > > +                                                       dequeue) :
> > > > > > > > > > +                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > > > > > +                                                     dequeue);
> > > > > > > > > >                 if (entity)
> > > > > > > > > >                         break;
> > > > > > > > > >         }
> > > > > > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > > > >      EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > > > >      /**
> > > > > > > > > > - * drm_sched_main - main scheduler thread
> > > > > > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > > > > > >       *
> > > > > > > > > > - * @param: scheduler instance
> > > > > > > > > > + * @w: free job work
> > > > > > > > > >       */
> > > > > > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > > > > > >      {
> > > > > > > > > >         struct drm_gpu_scheduler *sched =
> > > > > > > > > > -               container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > > > > > -       struct drm_sched_entity *entity;
> > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > > > > > >         struct drm_sched_job *cleanup_job;
> > > > > > > > > > -       int r;
> > > > > > > > > >         if (READ_ONCE(sched->pause_submit))
> > > > > > > > > >                 return;
> > > > > > > > > >         cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > > > -       entity = drm_sched_select_entity(sched);
> > > > > > > > > > +       if (cleanup_job) {
> > > > > > > > > > +               sched->ops->free_job(cleanup_job);
> > > > > > > > > > +
> > > > > > > > > > +               drm_sched_free_job_queue_if_ready(sched);
> > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > > +       }
> > > > > > > > > > +}
> > > > > > > > > > -       if (!entity && !cleanup_job)
> > > > > > > > > > -               return; /* No more work */
> > > > > > > > > > +/**
> > > > > > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > > > > > + *
> > > > > > > > > > + * @w: run job work
> > > > > > > > > > + */
> > > > > > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > > > > > +{
> > > > > > > > > > +       struct drm_gpu_scheduler *sched =
> > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > > > > > +       struct drm_sched_entity *entity;
> > > > > > > > > > +       int r;
> > > > > > > > > > -       if (cleanup_job)
> > > > > > > > > > -               sched->ops->free_job(cleanup_job);
> > > > > > > > > > +       if (READ_ONCE(sched->pause_submit))
> > > > > > > > > > +               return;
> > > > > > > > > > +       entity = drm_sched_select_entity(sched, true);
> > > > > > > > > >         if (entity) {
> > > > > > > > > >                 struct dma_fence *fence;
> > > > > > > > > >                 struct drm_sched_fence *s_fence;
> > > > > > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > >                 sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > > > >                 if (!sched_job) {
> > > > > > > > > >                         complete_all(&entity->entity_idle);
> > > > > > > > > > -                       if (!cleanup_job)
> > > > > > > > > > -                               return; /* No more work */
> > > > > > > > > > -                       goto again;
> > > > > > > > > > +                       return; /* No more work */
> > > > > > > > > >                 }
> > > > > > > > > >                 s_fence = sched_job->s_fence;
> > > > > > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > >                 }
> > > > > > > > > >                 wake_up(&sched->job_scheduled);
> > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > >         }
> > > > > > > > > > -
> > > > > > > > > > -again:
> > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > >      }
> > > > > > > > > >      /**
> > > > > > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > > >         spin_lock_init(&sched->job_list_lock);
> > > > > > > > > >         atomic_set(&sched->hw_rq_count, 0);
> > > > > > > > > >         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > > > -       INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > > > > > +       INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > > > > > +       INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > > > > > >         atomic_set(&sched->_score, 0);
> > > > > > > > > >         atomic64_set(&sched->job_id_count, 0);
> > > > > > > > > >         sched->pause_submit = false;
> > > > > > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > > > > > >      void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > > > > > >      {
> > > > > > > > > >         WRITE_ONCE(sched->pause_submit, true);
> > > > > > > > > > -       cancel_work_sync(&sched->work_submit);
> > > > > > > > > > +       cancel_work_sync(&sched->work_run_job);
> > > > > > > > > > +       cancel_work_sync(&sched->work_free_job);
> > > > > > > > > >      }
> > > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > >      void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > > > > > >      {
> > > > > > > > > >         WRITE_ONCE(sched->pause_submit, false);
> > > > > > > > > > -       queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > >      }
> > > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > > > > > >       *                 finished.
> > > > > > > > > >       * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > > > > > >       * @job_id_count: used to assign unique id to the each job.
> > > > > > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > > > > > >       * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > > > > > + * @work_run_job: schedules jobs
> > > > > > > > > > + * @work_free_job: cleans up jobs
> > > > > > > > > >       * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > > > >       *            timeout interval is over.
> > > > > > > > > >       * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > > > > > >         atomic64_t                      job_id_count;
> > > > > > > > > >         struct workqueue_struct         *submit_wq;
> > > > > > > > > >         struct workqueue_struct         *timeout_wq;
> > > > > > > > > > -       struct work_struct              work_submit;
> > > > > > > > > > +       struct work_struct              work_run_job;
> > > > > > > > > > +       struct work_struct              work_free_job;
> > > > > > > > > >         struct delayed_work             work_tdr;
> > > > > > > > > >         struct list_head                pending_list;
> > > > > > > > > >         spinlock_t                      job_list_lock;
> > >
Matthew Brost Aug. 23, 2023, 11:12 p.m. UTC | #11
On Wed, Aug 23, 2023 at 01:26:09PM -0400, Rodrigo Vivi wrote:
> On Wed, Aug 23, 2023 at 11:41:19AM -0400, Alex Deucher wrote:
> > On Wed, Aug 23, 2023 at 11:26 AM Matthew Brost <matthew.brost@intel.com> wrote:
> > >
> > > On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
> > > > Am 23.08.23 um 05:27 schrieb Matthew Brost:
> > > > > [SNIP]
> > > > > > That is exactly what I want to avoid, tying the TDR to the job is what some
> > > > > > AMD engineers pushed for because it looked like a simple solution and made
> > > > > > the whole thing similar to what Windows does.
> > > > > >
> > > > > > This turned the previous relatively clean scheduler and TDR design into a
> > > > > > complete nightmare. The job contains quite a bunch of things which are not
> > > > > > necessarily available after the application which submitted the job is torn
> > > > > > down.
> > > > > >
> > > > > Agree the TDR shouldn't be accessing anything application specific
> > > > > rather just internal job state required to tear the job down on the
> > > > > hardware.
> > > > > > So what happens is that you either have stale pointers in the TDR which can
> > > > > > go boom extremely easily or we somehow find a way to keep the necessary
> > > > > I have not experenced the TDR going boom in Xe.
> > > > >
> > > > > > structures (which include struct thread_info and struct file for this driver
> > > > > > connection) alive until all submissions are completed.
> > > > > >
> > > > > In Xe we keep everything alive until all submissions are completed. By
> > > > > everything I mean the drm job, entity, scheduler, and VM via a reference
> > > > > counting scheme. All of these structures are just kernel state which can
> > > > > safely be accessed even if the application has been killed.
> > > >
> > > > Yeah, but that might just not be such a good idea from memory management
> > > > point of view.
> > > >
> > > > When you (for example) kill a process all resource from that progress should
> > > > at least be queued to be freed more or less immediately.
> > > >
> > >
> > > We do this, the TDR kicks jobs off the hardware as fast as the hw
> > > interface allows and signals all pending hw fences immediately after.
> > > Free jobs then is immediately called and the reference count goes to
> > > zero. I think max time for all of this to occur is a handful of ms.
> > >
> > > > What Linux is doing for other I/O operations is to keep the relevant pages
> > > > alive until the I/O operation is completed, but for GPUs that usually means
> > > > keeping most of the memory of the process alive and that in turn is really
> > > > not something you can do.
> > > >
> > > > You can of course do this if your driver has a reliable way of killing your
> > > > submissions and freeing resources in a reasonable amount of time. This
> > > > should then be done in the flush callback.
> > > >
> > >
> > > 'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
> > > and think that function doesn't even work for what I tell. It flushes
> > > the spsc queue but what about jobs on the hardware, how do those get
> > > killed?
> > >
> > > As stated we do via the TDR which is rather clean design and fits with
> > > our reference couting scheme.
> > >
> > > > > If we need to teardown on demand we just set the TDR to a minimum value and
> > > > > it kicks the jobs off the hardware, gracefully cleans everything up and
> > > > > drops all references. This is a benefit of the 1 to 1 relationship, not
> > > > > sure if this works with how AMDGPU uses the scheduler.
> > > > >
> > > > > > Delaying application tear down is also not an option because then you run
> > > > > > into massive trouble with the OOM killer (or more generally OOM handling).
> > > > > > See what we do in drm_sched_entity_flush() as well.
> > > > > >
> > > > > Not an issue for Xe, we never call drm_sched_entity_flush as our
> > > > > referencing counting scheme is all jobs are finished before we attempt
> > > > > to tear down entity / scheduler.
> > > >
> > > > I don't think you can do that upstream. Calling drm_sched_entity_flush() is
> > > > a must have from your flush callback for the file descriptor.
> > > >
> > >
> > > Again 'flush callback'? What are you refering too.
> > >
> > > And why does drm_sched_entity_flush need to be called, doesn't seem to
> > > do anything useful.
> > >
> > > > Unless you have some other method for killing your submissions this would
> > > > give a path for a deny of service attack vector when the Xe driver is in
> > > > use.
> > > >
> > >
> > > Yes, once th TDR fires is disallows all new submissions at the exec
> > > IOCTL plus flushes any pending submissions as fast as possible.
> > >
> > > > > > Since adding the TDR support we completely exercised this through in the
> > > > > > last two or three years or so. And to sum it up I would really like to get
> > > > > > away from this mess again.
> > > > > >
> > > > > > Compared to that what i915 does is actually rather clean I think.
> > > > > >
> > > > > Not even close, resets where a nightmare in the i915 (I spend years
> > > > > trying to get this right and probably still completely work) and in Xe
> > > > > basically got it right on the attempt.
> > > > >
> > > > > > >    Also in Xe some of
> > > > > > > things done in free_job cannot be from an IRQ context, hence calling
> > > > > > > this from the scheduler worker is rather helpful.
> > > > > > Well putting things for cleanup into a workitem doesn't sounds like
> > > > > > something hard.
> > > > > >
> > > > > That is exactly what we doing in the scheduler with the free_job
> > > > > workitem.
> > > >
> > > > Yeah, but I think that we do it in the scheduler and not the driver is
> > > > problematic.
> 
> Christian, I do see your point on simply get rid of free job callbacks here
> then use fence with own-driver workqueue and house cleaning. But I wonder if
> starting with this patch as a clear separation of that is not a step forward
> and that could be cleaned up on a follow up!?
> 
> Matt, why exactly do we need the separation in this patch? Commit message tells
> what it is doing and that it is aligned with design, but is not clear on why
> exactly we need this right now. Specially if in the end what we want is exactly
> keeping the submit_wq to ensure the serialization of the operations you mentioned.
> I mean, could we simply drop this patch and then work on a follow-up later and
> investigate the Christian suggestion when we are in-tree?
> 

I believe Christian suggested this change in a previous rev (free_job,
proccess_msg) in there own workitem [1].

Dropping free_job / calling run_job again is really a completely
different topic than this patch.

[1] https://patchwork.freedesktop.org/patch/550722/?series=121745&rev=1

> > > >
> > >
> > > Disagree, a common clean callback from a non-irq contexts IMO is a good
> > > design rather than each driver possibly having its own worker for
> > > cleanup.
> > >
> > > > For the scheduler it shouldn't care about the job any more as soon as the
> > > > driver takes over.
> > > >
> > >
> > > This a massive rewrite for all users of the DRM scheduler, I'm saying
> > > for Xe what you are suggesting makes little to no sense.
> > >
> > > I'd like other users of the DRM scheduler to chime in on what you
> > > purposing. The scope of this change affects 8ish drivers that would
> > > require buy in each of the stakeholders. I certainly can't change of
> > > these drivers as I don't feel comfortable in all of those code bases nor
> > > do I have hardware to test all of these drivers.
> > >
> > > > >
> > > > > > Question is what do you really need for TDR which is not inside the hardware
> > > > > > fence?
> > > > > >
> > > > > A reference to the entity to be able to kick the job off the hardware.
> > > > > A reference to the entity, job, and VM for error capture.
> > > > >
> > > > > We also need a reference to the job for recovery after a GPU reset so
> > > > > run_job can be called again for innocent jobs.
> > > >
> > > > Well exactly that's what I'm massively pushing back. Letting the scheduler
> > > > call run_job() for the same job again is *NOT* something you can actually
> > > > do.
> > > >
> > >
> > > But lots of drivers do this already and the DRM scheduler documents
> > > this.
> > >
> > > > This pretty clearly violates some of the dma_fence constrains and has cause
> > > > massively headaches for me already.
> > > >
> > >
> > > Seems to work fine in Xe.
> > >
> > > > What you can do is to do this inside your driver, e.g. take the jobs and
> > > > push them again to the hw ring or just tell the hw to start executing again
> > > > from a previous position.
> > > >
> > >
> > > Again this now is massive rewrite of many drivers.
> > >
> > > > BTW that re-submitting of jobs seems to be a no-go from userspace
> > > > perspective as well. Take a look at the Vulkan spec for that, at least Marek
> > > > pretty much pointed out that we should absolutely not do this inside the
> > > > kernel.
> > > >
> > >
> > > Yes if the job causes the hang, we ban the queue. Typcially only per
> > > entity (queue) resets are done in Xe but occasionally device level
> > > resets are done (issues with hardware) and innocent jobs / entities call
> > > run_job again.
> > 
> > If the engine is reset and the job was already executing, how can you
> > determine that it's in a good state to resubmit?  What if some

If a job has started but not completed we ban the queue during device
reset. If a queue have jobs submitted but not started we resubmit all
jobs on the queue during device reset.

The started / completed state can be determined by looking at a seqno in
memory.

> > internal fence or semaphore in memory used by the logic in the command
> > buffer has been signaled already and then you resubmit the job and it
> > now starts executing with different input state?
> 
> I believe we could set some more rules in the new robustness documentation:
> https://lore.kernel.org/all/20230818200642.276735-1-andrealmeid@igalia.com/
> 
> For this robustness implementation i915 pin point the exact context that
> was in execution when the gpu hang and only blame that, although the
> ressubmission is up to the user space. While on Xe we are blaming every
> single context that was in the queue. So I'm actually confused on what
> are the innocent jobs and who are calling for resubmission, if all of
> them got banned and blamed.

See above, innocent job == submited job but not started (i.e. a job
stuck in the FW queue not yet been put on the hardware). Because we have
a FW scheduler we could have 1000s of innocent jobs that don't need to
get banned. This is very different from drivers without FW schedulers as
typically when run_job is called the job hits the hardware immediately.

Matt

> 
> > 
> > Alex
> > 
> > >
> > > > The generally right approach seems to be to cleanly signal to userspace that
> > > > something bad happened and that userspace then needs to submit things again
> > > > even for innocent jobs.
> > > >
> > >
> > > I disagree that innocent jobs should be banned. What you are suggesting
> > > is if a device reset needs to be done we kill / ban every user space queue.
> > > Thats seems like overkill. Not seeing where that is stated in this doc
> > > [1], it seems to imply that only jobs that are stuck results in bans.
> > >
> > > Matt
> > >
> > > [1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3
> > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > >
> > > > > All of this leads to believe we need to stick with the design.
> > > > >
> > > > > Matt
> > > > >
> > > > > > Regards,
> > > > > > Christian.
> > > > > >
> > > > > > > The HW fence can live for longer as it can be installed in dma-resv
> > > > > > > slots, syncobjs, etc... If the job and hw fence are combined now we
> > > > > > > holding on the memory for the longer and perhaps at the mercy of the
> > > > > > > user. We also run the risk of the final put being done from an IRQ
> > > > > > > context which again wont work in Xe as it is currently coded. Lastly 2
> > > > > > > jobs from the same scheduler could do the final put in parallel, so
> > > > > > > rather than having free_job serialized by the worker now multiple jobs
> > > > > > > are freeing themselves at the same time. This might not be an issue but
> > > > > > > adds another level of raceyness that needs to be accounted for. None of
> > > > > > > this sounds desirable to me.
> > > > > > >
> > > > > > > FWIW what you suggesting sounds like how the i915 did things
> > > > > > > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > > > > > > a huge mess. As rule of thumb I generally do the opposite of whatever
> > > > > > > the i915 did.
> > > > > > >
> > > > > > > Matt
> > > > > > >
> > > > > > > > Christian.
> > > > > > > >
> > > > > > > > > Matt
> > > > > > > > >
> > > > > > > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > > > > > > should push for fixing this design up again.
> > > > > > > > > >
> > > > > > > > > > Regards,
> > > > > > > > > > Christian.
> > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > > > ---
> > > > > > > > > > >      drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > > > > > > >      include/drm/gpu_scheduler.h            |   8 +-
> > > > > > > > > > >      2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > index cede47afc800..b67469eac179 100644
> > > > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > > > > > > >       * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > > > > > > >       *
> > > > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > >       *
> > > > > > > > > > >       * Try to find a ready entity, returns NULL if none found.
> > > > > > > > > > >       */
> > > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > > >      {
> > > > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > >         if (entity) {
> > > > > > > > > > >                 list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > > > > > > >                         if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > -                               rq->current_entity = entity;
> > > > > > > > > > > -                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                               if (dequeue) {
> > > > > > > > > > > +                                       rq->current_entity = entity;
> > > > > > > > > > > +                                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                               }
> > > > > > > > > > >                                 spin_unlock(&rq->lock);
> > > > > > > > > > >                                 return entity;
> > > > > > > > > > >                         }
> > > > > > > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > >         list_for_each_entry(entity, &rq->entities, list) {
> > > > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                       }
> > > > > > > > > > >                         spin_unlock(&rq->lock);
> > > > > > > > > > >                         return entity;
> > > > > > > > > > >                 }
> > > > > > > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > >       * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > > > > > > >       *
> > > > > > > > > > >       * @rq: scheduler run queue to check.
> > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > >       *
> > > > > > > > > > >       * Find oldest waiting ready entity, returns NULL if none found.
> > > > > > > > > > >       */
> > > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > > >      {
> > > > > > > > > > >         struct rb_node *rb;
> > > > > > > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > >                 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > > > > > > >                 if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > +                       }
> > > > > > > > > > >                         break;
> > > > > > > > > > >                 }
> > > > > > > > > > >         }
> > > > > > > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > >      }
> > > > > > > > > > >      /**
> > > > > > > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > > > > > > >       * @sched: scheduler instance
> > > > > > > > > > >       */
> > > > > > > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >      {
> > > > > > > > > > >         if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > > -               queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static struct drm_sched_entity *
> > > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > > > > > > +
> > > > > > > > > > > +/**
> > > > > > > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > > > + */
> > > > > > > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > +{
> > > > > > > > > > > +       if (drm_sched_select_entity(sched, false))
> > > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +/**
> > > > > > > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > > > > > > + *
> > > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > > + */
> > > > > > > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > +{
> > > > > > > > > > > +       if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +/**
> > > > > > > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > > > > > > + *
> > > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > > + */
> > > > > > > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct drm_sched_job *job;
> > > > > > > > > > > +
> > > > > > > > > > > +       spin_lock(&sched->job_list_lock);
> > > > > > > > > > > +       job = list_first_entry_or_null(&sched->pending_list,
> > > > > > > > > > > +                                      struct drm_sched_job, list);
> > > > > > > > > > > +       if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > > > > > > +               drm_sched_free_job_queue(sched);
> > > > > > > > > > > +       spin_unlock(&sched->job_list_lock);
> > > > > > > > > > >      }
> > > > > > > > > > >      /**
> > > > > > > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > > > > > > >         dma_fence_get(&s_fence->finished);
> > > > > > > > > > >         drm_sched_fence_finished(s_fence, result);
> > > > > > > > > > >         dma_fence_put(&s_fence->finished);
> > > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > > > +       drm_sched_free_job_queue(sched);
> > > > > > > > > > >      }
> > > > > > > > > > >      /**
> > > > > > > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >      void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >      {
> > > > > > > > > > >         if (drm_sched_can_queue(sched))
> > > > > > > > > > > -               drm_sched_submit_queue(sched);
> > > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > > >      }
> > > > > > > > > > >      /**
> > > > > > > > > > >       * drm_sched_select_entity - Select next entity to process
> > > > > > > > > > >       *
> > > > > > > > > > >       * @sched: scheduler instance
> > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > >       *
> > > > > > > > > > >       * Returns the entity to process or NULL if none are found.
> > > > > > > > > > >       */
> > > > > > > > > > >      static struct drm_sched_entity *
> > > > > > > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > > > > > > >      {
> > > > > > > > > > >         struct drm_sched_entity *entity;
> > > > > > > > > > >         int i;
> > > > > > > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >         /* Kernel run queue has higher priority than normal run queue*/
> > > > > > > > > > >         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > > > > > > >                 entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > > > > > > -                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > > > > > > -                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > > > > > > +                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > > > > > > +                                                       dequeue) :
> > > > > > > > > > > +                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > > > > > > +                                                     dequeue);
> > > > > > > > > > >                 if (entity)
> > > > > > > > > > >                         break;
> > > > > > > > > > >         }
> > > > > > > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > > > > >      EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > > > > >      /**
> > > > > > > > > > > - * drm_sched_main - main scheduler thread
> > > > > > > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > > > > > > >       *
> > > > > > > > > > > - * @param: scheduler instance
> > > > > > > > > > > + * @w: free job work
> > > > > > > > > > >       */
> > > > > > > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > > > > > > >      {
> > > > > > > > > > >         struct drm_gpu_scheduler *sched =
> > > > > > > > > > > -               container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > > > > > > -       struct drm_sched_entity *entity;
> > > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > > > > > > >         struct drm_sched_job *cleanup_job;
> > > > > > > > > > > -       int r;
> > > > > > > > > > >         if (READ_ONCE(sched->pause_submit))
> > > > > > > > > > >                 return;
> > > > > > > > > > >         cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > > > > -       entity = drm_sched_select_entity(sched);
> > > > > > > > > > > +       if (cleanup_job) {
> > > > > > > > > > > +               sched->ops->free_job(cleanup_job);
> > > > > > > > > > > +
> > > > > > > > > > > +               drm_sched_free_job_queue_if_ready(sched);
> > > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > > > +       }
> > > > > > > > > > > +}
> > > > > > > > > > > -       if (!entity && !cleanup_job)
> > > > > > > > > > > -               return; /* No more work */
> > > > > > > > > > > +/**
> > > > > > > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > > > > > > + *
> > > > > > > > > > > + * @w: run job work
> > > > > > > > > > > + */
> > > > > > > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct drm_gpu_scheduler *sched =
> > > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > > > > > > +       struct drm_sched_entity *entity;
> > > > > > > > > > > +       int r;
> > > > > > > > > > > -       if (cleanup_job)
> > > > > > > > > > > -               sched->ops->free_job(cleanup_job);
> > > > > > > > > > > +       if (READ_ONCE(sched->pause_submit))
> > > > > > > > > > > +               return;
> > > > > > > > > > > +       entity = drm_sched_select_entity(sched, true);
> > > > > > > > > > >         if (entity) {
> > > > > > > > > > >                 struct dma_fence *fence;
> > > > > > > > > > >                 struct drm_sched_fence *s_fence;
> > > > > > > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > >                 sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > > > > >                 if (!sched_job) {
> > > > > > > > > > >                         complete_all(&entity->entity_idle);
> > > > > > > > > > > -                       if (!cleanup_job)
> > > > > > > > > > > -                               return; /* No more work */
> > > > > > > > > > > -                       goto again;
> > > > > > > > > > > +                       return; /* No more work */
> > > > > > > > > > >                 }
> > > > > > > > > > >                 s_fence = sched_job->s_fence;
> > > > > > > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > >                 }
> > > > > > > > > > >                 wake_up(&sched->job_scheduled);
> > > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > > >         }
> > > > > > > > > > > -
> > > > > > > > > > > -again:
> > > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > > >      }
> > > > > > > > > > >      /**
> > > > > > > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > > > >         spin_lock_init(&sched->job_list_lock);
> > > > > > > > > > >         atomic_set(&sched->hw_rq_count, 0);
> > > > > > > > > > >         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > > > > -       INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > > > > > > +       INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > > > > > > +       INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > > > > > > >         atomic_set(&sched->_score, 0);
> > > > > > > > > > >         atomic64_set(&sched->job_id_count, 0);
> > > > > > > > > > >         sched->pause_submit = false;
> > > > > > > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > > > > > > >      void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >      {
> > > > > > > > > > >         WRITE_ONCE(sched->pause_submit, true);
> > > > > > > > > > > -       cancel_work_sync(&sched->work_submit);
> > > > > > > > > > > +       cancel_work_sync(&sched->work_run_job);
> > > > > > > > > > > +       cancel_work_sync(&sched->work_free_job);
> > > > > > > > > > >      }
> > > > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > > >      void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > > > > > > >      {
> > > > > > > > > > >         WRITE_ONCE(sched->pause_submit, false);
> > > > > > > > > > > -       queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > > >      }
> > > > > > > > > > >      EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > > > > > > >       *                 finished.
> > > > > > > > > > >       * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > > > > > > >       * @job_id_count: used to assign unique id to the each job.
> > > > > > > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > > > > > > >       * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > > > > > > + * @work_run_job: schedules jobs
> > > > > > > > > > > + * @work_free_job: cleans up jobs
> > > > > > > > > > >       * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > > > > >       *            timeout interval is over.
> > > > > > > > > > >       * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > > > > > > >         atomic64_t                      job_id_count;
> > > > > > > > > > >         struct workqueue_struct         *submit_wq;
> > > > > > > > > > >         struct workqueue_struct         *timeout_wq;
> > > > > > > > > > > -       struct work_struct              work_submit;
> > > > > > > > > > > +       struct work_struct              work_run_job;
> > > > > > > > > > > +       struct work_struct              work_free_job;
> > > > > > > > > > >         struct delayed_work             work_tdr;
> > > > > > > > > > >         struct list_head                pending_list;
> > > > > > > > > > >         spinlock_t                      job_list_lock;
> > > >
Christian König Aug. 24, 2023, 11:44 a.m. UTC | #12
Am 24.08.23 um 01:12 schrieb Matthew Brost:
> On Wed, Aug 23, 2023 at 01:26:09PM -0400, Rodrigo Vivi wrote:
>> On Wed, Aug 23, 2023 at 11:41:19AM -0400, Alex Deucher wrote:
>>> On Wed, Aug 23, 2023 at 11:26 AM Matthew Brost <matthew.brost@intel.com> wrote:
>>>> On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
>>>>> Am 23.08.23 um 05:27 schrieb Matthew Brost:
>>>>>> [SNIP]
>>>>>>> That is exactly what I want to avoid, tying the TDR to the job is what some
>>>>>>> AMD engineers pushed for because it looked like a simple solution and made
>>>>>>> the whole thing similar to what Windows does.
>>>>>>>
>>>>>>> This turned the previous relatively clean scheduler and TDR design into a
>>>>>>> complete nightmare. The job contains quite a bunch of things which are not
>>>>>>> necessarily available after the application which submitted the job is torn
>>>>>>> down.
>>>>>>>
>>>>>> Agree the TDR shouldn't be accessing anything application specific
>>>>>> rather just internal job state required to tear the job down on the
>>>>>> hardware.
>>>>>>> So what happens is that you either have stale pointers in the TDR which can
>>>>>>> go boom extremely easily or we somehow find a way to keep the necessary
>>>>>> I have not experenced the TDR going boom in Xe.
>>>>>>
>>>>>>> structures (which include struct thread_info and struct file for this driver
>>>>>>> connection) alive until all submissions are completed.
>>>>>>>
>>>>>> In Xe we keep everything alive until all submissions are completed. By
>>>>>> everything I mean the drm job, entity, scheduler, and VM via a reference
>>>>>> counting scheme. All of these structures are just kernel state which can
>>>>>> safely be accessed even if the application has been killed.
>>>>> Yeah, but that might just not be such a good idea from memory management
>>>>> point of view.
>>>>>
>>>>> When you (for example) kill a process all resource from that progress should
>>>>> at least be queued to be freed more or less immediately.
>>>>>
>>>> We do this, the TDR kicks jobs off the hardware as fast as the hw
>>>> interface allows and signals all pending hw fences immediately after.
>>>> Free jobs then is immediately called and the reference count goes to
>>>> zero. I think max time for all of this to occur is a handful of ms.
>>>>
>>>>> What Linux is doing for other I/O operations is to keep the relevant pages
>>>>> alive until the I/O operation is completed, but for GPUs that usually means
>>>>> keeping most of the memory of the process alive and that in turn is really
>>>>> not something you can do.
>>>>>
>>>>> You can of course do this if your driver has a reliable way of killing your
>>>>> submissions and freeing resources in a reasonable amount of time. This
>>>>> should then be done in the flush callback.
>>>>>
>>>> 'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
>>>> and think that function doesn't even work for what I tell. It flushes
>>>> the spsc queue but what about jobs on the hardware, how do those get
>>>> killed?
>>>>
>>>> As stated we do via the TDR which is rather clean design and fits with
>>>> our reference couting scheme.
>>>>
>>>>>> If we need to teardown on demand we just set the TDR to a minimum value and
>>>>>> it kicks the jobs off the hardware, gracefully cleans everything up and
>>>>>> drops all references. This is a benefit of the 1 to 1 relationship, not
>>>>>> sure if this works with how AMDGPU uses the scheduler.
>>>>>>
>>>>>>> Delaying application tear down is also not an option because then you run
>>>>>>> into massive trouble with the OOM killer (or more generally OOM handling).
>>>>>>> See what we do in drm_sched_entity_flush() as well.
>>>>>>>
>>>>>> Not an issue for Xe, we never call drm_sched_entity_flush as our
>>>>>> referencing counting scheme is all jobs are finished before we attempt
>>>>>> to tear down entity / scheduler.
>>>>> I don't think you can do that upstream. Calling drm_sched_entity_flush() is
>>>>> a must have from your flush callback for the file descriptor.
>>>>>
>>>> Again 'flush callback'? What are you refering too.
>>>>
>>>> And why does drm_sched_entity_flush need to be called, doesn't seem to
>>>> do anything useful.
>>>>
>>>>> Unless you have some other method for killing your submissions this would
>>>>> give a path for a deny of service attack vector when the Xe driver is in
>>>>> use.
>>>>>
>>>> Yes, once th TDR fires is disallows all new submissions at the exec
>>>> IOCTL plus flushes any pending submissions as fast as possible.
>>>>
>>>>>>> Since adding the TDR support we completely exercised this through in the
>>>>>>> last two or three years or so. And to sum it up I would really like to get
>>>>>>> away from this mess again.
>>>>>>>
>>>>>>> Compared to that what i915 does is actually rather clean I think.
>>>>>>>
>>>>>> Not even close, resets where a nightmare in the i915 (I spend years
>>>>>> trying to get this right and probably still completely work) and in Xe
>>>>>> basically got it right on the attempt.
>>>>>>
>>>>>>>>     Also in Xe some of
>>>>>>>> things done in free_job cannot be from an IRQ context, hence calling
>>>>>>>> this from the scheduler worker is rather helpful.
>>>>>>> Well putting things for cleanup into a workitem doesn't sounds like
>>>>>>> something hard.
>>>>>>>
>>>>>> That is exactly what we doing in the scheduler with the free_job
>>>>>> workitem.
>>>>> Yeah, but I think that we do it in the scheduler and not the driver is
>>>>> problematic.
>> Christian, I do see your point on simply get rid of free job callbacks here
>> then use fence with own-driver workqueue and house cleaning. But I wonder if
>> starting with this patch as a clear separation of that is not a step forward
>> and that could be cleaned up on a follow up!?
>>
>> Matt, why exactly do we need the separation in this patch? Commit message tells
>> what it is doing and that it is aligned with design, but is not clear on why
>> exactly we need this right now. Specially if in the end what we want is exactly
>> keeping the submit_wq to ensure the serialization of the operations you mentioned.
>> I mean, could we simply drop this patch and then work on a follow-up later and
>> investigate the Christian suggestion when we are in-tree?
>>
> I believe Christian suggested this change in a previous rev (free_job,
> proccess_msg) in there own workitem [1].
>
> Dropping free_job / calling run_job again is really a completely
> different topic than this patch.

Yeah, agree. I just wanted to bring this up before we put even more 
effort in the free_job based approach.

Rodrigos point is a really good one, no matter if the driver or the 
scheduler frees the job. Doing that in a separate work item sounds like 
the right thing to do.

Regards,
Christian.

>
> [1] https://patchwork.freedesktop.org/patch/550722/?series=121745&rev=1
>
>>>> Disagree, a common clean callback from a non-irq contexts IMO is a good
>>>> design rather than each driver possibly having its own worker for
>>>> cleanup.
>>>>
>>>>> For the scheduler it shouldn't care about the job any more as soon as the
>>>>> driver takes over.
>>>>>
>>>> This a massive rewrite for all users of the DRM scheduler, I'm saying
>>>> for Xe what you are suggesting makes little to no sense.
>>>>
>>>> I'd like other users of the DRM scheduler to chime in on what you
>>>> purposing. The scope of this change affects 8ish drivers that would
>>>> require buy in each of the stakeholders. I certainly can't change of
>>>> these drivers as I don't feel comfortable in all of those code bases nor
>>>> do I have hardware to test all of these drivers.
>>>>
>>>>>>> Question is what do you really need for TDR which is not inside the hardware
>>>>>>> fence?
>>>>>>>
>>>>>> A reference to the entity to be able to kick the job off the hardware.
>>>>>> A reference to the entity, job, and VM for error capture.
>>>>>>
>>>>>> We also need a reference to the job for recovery after a GPU reset so
>>>>>> run_job can be called again for innocent jobs.
>>>>> Well exactly that's what I'm massively pushing back. Letting the scheduler
>>>>> call run_job() for the same job again is *NOT* something you can actually
>>>>> do.
>>>>>
>>>> But lots of drivers do this already and the DRM scheduler documents
>>>> this.
>>>>
>>>>> This pretty clearly violates some of the dma_fence constrains and has cause
>>>>> massively headaches for me already.
>>>>>
>>>> Seems to work fine in Xe.
>>>>
>>>>> What you can do is to do this inside your driver, e.g. take the jobs and
>>>>> push them again to the hw ring or just tell the hw to start executing again
>>>>> from a previous position.
>>>>>
>>>> Again this now is massive rewrite of many drivers.
>>>>
>>>>> BTW that re-submitting of jobs seems to be a no-go from userspace
>>>>> perspective as well. Take a look at the Vulkan spec for that, at least Marek
>>>>> pretty much pointed out that we should absolutely not do this inside the
>>>>> kernel.
>>>>>
>>>> Yes if the job causes the hang, we ban the queue. Typcially only per
>>>> entity (queue) resets are done in Xe but occasionally device level
>>>> resets are done (issues with hardware) and innocent jobs / entities call
>>>> run_job again.
>>> If the engine is reset and the job was already executing, how can you
>>> determine that it's in a good state to resubmit?  What if some
> If a job has started but not completed we ban the queue during device
> reset. If a queue have jobs submitted but not started we resubmit all
> jobs on the queue during device reset.
>
> The started / completed state can be determined by looking at a seqno in
> memory.
>
>>> internal fence or semaphore in memory used by the logic in the command
>>> buffer has been signaled already and then you resubmit the job and it
>>> now starts executing with different input state?
>> I believe we could set some more rules in the new robustness documentation:
>> https://lore.kernel.org/all/20230818200642.276735-1-andrealmeid@igalia.com/
>>
>> For this robustness implementation i915 pin point the exact context that
>> was in execution when the gpu hang and only blame that, although the
>> ressubmission is up to the user space. While on Xe we are blaming every
>> single context that was in the queue. So I'm actually confused on what
>> are the innocent jobs and who are calling for resubmission, if all of
>> them got banned and blamed.
> See above, innocent job == submited job but not started (i.e. a job
> stuck in the FW queue not yet been put on the hardware). Because we have
> a FW scheduler we could have 1000s of innocent jobs that don't need to
> get banned. This is very different from drivers without FW schedulers as
> typically when run_job is called the job hits the hardware immediately.
>
> Matt
>
>>> Alex
>>>
>>>>> The generally right approach seems to be to cleanly signal to userspace that
>>>>> something bad happened and that userspace then needs to submit things again
>>>>> even for innocent jobs.
>>>>>
>>>> I disagree that innocent jobs should be banned. What you are suggesting
>>>> is if a device reset needs to be done we kill / ban every user space queue.
>>>> Thats seems like overkill. Not seeing where that is stated in this doc
>>>> [1], it seems to imply that only jobs that are stuck results in bans.
>>>>
>>>> Matt
>>>>
>>>> [1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> All of this leads to believe we need to stick with the design.
>>>>>>
>>>>>> Matt
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> The HW fence can live for longer as it can be installed in dma-resv
>>>>>>>> slots, syncobjs, etc... If the job and hw fence are combined now we
>>>>>>>> holding on the memory for the longer and perhaps at the mercy of the
>>>>>>>> user. We also run the risk of the final put being done from an IRQ
>>>>>>>> context which again wont work in Xe as it is currently coded. Lastly 2
>>>>>>>> jobs from the same scheduler could do the final put in parallel, so
>>>>>>>> rather than having free_job serialized by the worker now multiple jobs
>>>>>>>> are freeing themselves at the same time. This might not be an issue but
>>>>>>>> adds another level of raceyness that needs to be accounted for. None of
>>>>>>>> this sounds desirable to me.
>>>>>>>>
>>>>>>>> FWIW what you suggesting sounds like how the i915 did things
>>>>>>>> (i915_request and hw fence in 1 memory alloc) and that turned out to be
>>>>>>>> a huge mess. As rule of thumb I generally do the opposite of whatever
>>>>>>>> the i915 did.
>>>>>>>>
>>>>>>>> Matt
>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Matt
>>>>>>>>>>
>>>>>>>>>>> All the lifetime issues we had came from ignoring this fact and I think we
>>>>>>>>>>> should push for fixing this design up again.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>       drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>>>>>>>>>>       include/drm/gpu_scheduler.h            |   8 +-
>>>>>>>>>>>>       2 files changed, 106 insertions(+), 39 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> index cede47afc800..b67469eac179 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>>>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>>>>>>>>>>        * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * @rq: scheduler run queue to check.
>>>>>>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * Try to find a ready entity, returns NULL if none found.
>>>>>>>>>>>>        */
>>>>>>>>>>>>       static struct drm_sched_entity *
>>>>>>>>>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>>>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          struct drm_sched_entity *entity;
>>>>>>>>>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>>>>>>          if (entity) {
>>>>>>>>>>>>                  list_for_each_entry_continue(entity, &rq->entities, list) {
>>>>>>>>>>>>                          if (drm_sched_entity_is_ready(entity)) {
>>>>>>>>>>>> -                               rq->current_entity = entity;
>>>>>>>>>>>> -                               reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                               if (dequeue) {
>>>>>>>>>>>> +                                       rq->current_entity = entity;
>>>>>>>>>>>> +                                       reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                               }
>>>>>>>>>>>>                                  spin_unlock(&rq->lock);
>>>>>>>>>>>>                                  return entity;
>>>>>>>>>>>>                          }
>>>>>>>>>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>>>>>>          list_for_each_entry(entity, &rq->entities, list) {
>>>>>>>>>>>>                  if (drm_sched_entity_is_ready(entity)) {
>>>>>>>>>>>> -                       rq->current_entity = entity;
>>>>>>>>>>>> -                       reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                       if (dequeue) {
>>>>>>>>>>>> +                               rq->current_entity = entity;
>>>>>>>>>>>> +                               reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                       }
>>>>>>>>>>>>                          spin_unlock(&rq->lock);
>>>>>>>>>>>>                          return entity;
>>>>>>>>>>>>                  }
>>>>>>>>>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>>>>>>>>        * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * @rq: scheduler run queue to check.
>>>>>>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * Find oldest waiting ready entity, returns NULL if none found.
>>>>>>>>>>>>        */
>>>>>>>>>>>>       static struct drm_sched_entity *
>>>>>>>>>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>>>>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          struct rb_node *rb;
>>>>>>>>>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>>>>>>>                  entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>>>>>>>>>>                  if (drm_sched_entity_is_ready(entity)) {
>>>>>>>>>>>> -                       rq->current_entity = entity;
>>>>>>>>>>>> -                       reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                       if (dequeue) {
>>>>>>>>>>>> +                               rq->current_entity = entity;
>>>>>>>>>>>> +                               reinit_completion(&entity->entity_idle);
>>>>>>>>>>>> +                       }
>>>>>>>>>>>>                          break;
>>>>>>>>>>>>                  }
>>>>>>>>>>>>          }
>>>>>>>>>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>>>>>>>>       }
>>>>>>>>>>>>       /**
>>>>>>>>>>>> - * drm_sched_submit_queue - scheduler queue submission
>>>>>>>>>>>> + * drm_sched_run_job_queue - queue job submission
>>>>>>>>>>>>        * @sched: scheduler instance
>>>>>>>>>>>>        */
>>>>>>>>>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          if (!READ_ONCE(sched->pause_submit))
>>>>>>>>>>>> -               queue_work(sched->submit_wq, &sched->work_submit);
>>>>>>>>>>>> +               queue_work(sched->submit_wq, &sched->work_run_job);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +static struct drm_sched_entity *
>>>>>>>>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>>>>>>>>>>> +
>>>>>>>>>>>> +/**
>>>>>>>>>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>>>>>>>>>>> + * @sched: scheduler instance
>>>>>>>>>>>> + */
>>>>>>>>>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +       if (drm_sched_select_entity(sched, false))
>>>>>>>>>>>> +               drm_sched_run_job_queue(sched);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +/**
>>>>>>>>>>>> + * drm_sched_free_job_queue - queue free job
>>>>>>>>>>>> + *
>>>>>>>>>>>> + * @sched: scheduler instance to queue free job
>>>>>>>>>>>> + */
>>>>>>>>>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +       if (!READ_ONCE(sched->pause_submit))
>>>>>>>>>>>> +               queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>> +/**
>>>>>>>>>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>>>>>>>>>>> + *
>>>>>>>>>>>> + * @sched: scheduler instance to queue free job
>>>>>>>>>>>> + */
>>>>>>>>>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +       struct drm_sched_job *job;
>>>>>>>>>>>> +
>>>>>>>>>>>> +       spin_lock(&sched->job_list_lock);
>>>>>>>>>>>> +       job = list_first_entry_or_null(&sched->pending_list,
>>>>>>>>>>>> +                                      struct drm_sched_job, list);
>>>>>>>>>>>> +       if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>>>>>>>>>>> +               drm_sched_free_job_queue(sched);
>>>>>>>>>>>> +       spin_unlock(&sched->job_list_lock);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       /**
>>>>>>>>>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>>>>>>>>>>          dma_fence_get(&s_fence->finished);
>>>>>>>>>>>>          drm_sched_fence_finished(s_fence, result);
>>>>>>>>>>>>          dma_fence_put(&s_fence->finished);
>>>>>>>>>>>> -       drm_sched_submit_queue(sched);
>>>>>>>>>>>> +       drm_sched_free_job_queue(sched);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       /**
>>>>>>>>>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>       void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          if (drm_sched_can_queue(sched))
>>>>>>>>>>>> -               drm_sched_submit_queue(sched);
>>>>>>>>>>>> +               drm_sched_run_job_queue(sched);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       /**
>>>>>>>>>>>>        * drm_sched_select_entity - Select next entity to process
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * @sched: scheduler instance
>>>>>>>>>>>> + * @dequeue: dequeue selected entity
>>>>>>>>>>>>        *
>>>>>>>>>>>>        * Returns the entity to process or NULL if none are found.
>>>>>>>>>>>>        */
>>>>>>>>>>>>       static struct drm_sched_entity *
>>>>>>>>>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          struct drm_sched_entity *entity;
>>>>>>>>>>>>          int i;
>>>>>>>>>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>          /* Kernel run queue has higher priority than normal run queue*/
>>>>>>>>>>>>          for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>>>>>>>>>>                  entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>>>>>>>>>>> -                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>>>>>>>>>>> -                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>>>>>>>>>>> +                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>>>>>>>>>>> +                                                       dequeue) :
>>>>>>>>>>>> +                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>>>>>>>>>>> +                                                     dequeue);
>>>>>>>>>>>>                  if (entity)
>>>>>>>>>>>>                          break;
>>>>>>>>>>>>          }
>>>>>>>>>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>>>>>>>>       EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>>>>>>>>       /**
>>>>>>>>>>>> - * drm_sched_main - main scheduler thread
>>>>>>>>>>>> + * drm_sched_free_job_work - worker to call free_job
>>>>>>>>>>>>        *
>>>>>>>>>>>> - * @param: scheduler instance
>>>>>>>>>>>> + * @w: free job work
>>>>>>>>>>>>        */
>>>>>>>>>>>> -static void drm_sched_main(struct work_struct *w)
>>>>>>>>>>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          struct drm_gpu_scheduler *sched =
>>>>>>>>>>>> -               container_of(w, struct drm_gpu_scheduler, work_submit);
>>>>>>>>>>>> -       struct drm_sched_entity *entity;
>>>>>>>>>>>> +               container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>>>>>>>>>>          struct drm_sched_job *cleanup_job;
>>>>>>>>>>>> -       int r;
>>>>>>>>>>>>          if (READ_ONCE(sched->pause_submit))
>>>>>>>>>>>>                  return;
>>>>>>>>>>>>          cleanup_job = drm_sched_get_cleanup_job(sched);
>>>>>>>>>>>> -       entity = drm_sched_select_entity(sched);
>>>>>>>>>>>> +       if (cleanup_job) {
>>>>>>>>>>>> +               sched->ops->free_job(cleanup_job);
>>>>>>>>>>>> +
>>>>>>>>>>>> +               drm_sched_free_job_queue_if_ready(sched);
>>>>>>>>>>>> +               drm_sched_run_job_queue_if_ready(sched);
>>>>>>>>>>>> +       }
>>>>>>>>>>>> +}
>>>>>>>>>>>> -       if (!entity && !cleanup_job)
>>>>>>>>>>>> -               return; /* No more work */
>>>>>>>>>>>> +/**
>>>>>>>>>>>> + * drm_sched_run_job_work - worker to call run_job
>>>>>>>>>>>> + *
>>>>>>>>>>>> + * @w: run job work
>>>>>>>>>>>> + */
>>>>>>>>>>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +       struct drm_gpu_scheduler *sched =
>>>>>>>>>>>> +               container_of(w, struct drm_gpu_scheduler, work_run_job);
>>>>>>>>>>>> +       struct drm_sched_entity *entity;
>>>>>>>>>>>> +       int r;
>>>>>>>>>>>> -       if (cleanup_job)
>>>>>>>>>>>> -               sched->ops->free_job(cleanup_job);
>>>>>>>>>>>> +       if (READ_ONCE(sched->pause_submit))
>>>>>>>>>>>> +               return;
>>>>>>>>>>>> +       entity = drm_sched_select_entity(sched, true);
>>>>>>>>>>>>          if (entity) {
>>>>>>>>>>>>                  struct dma_fence *fence;
>>>>>>>>>>>>                  struct drm_sched_fence *s_fence;
>>>>>>>>>>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>>>>>>                  sched_job = drm_sched_entity_pop_job(entity);
>>>>>>>>>>>>                  if (!sched_job) {
>>>>>>>>>>>>                          complete_all(&entity->entity_idle);
>>>>>>>>>>>> -                       if (!cleanup_job)
>>>>>>>>>>>> -                               return; /* No more work */
>>>>>>>>>>>> -                       goto again;
>>>>>>>>>>>> +                       return; /* No more work */
>>>>>>>>>>>>                  }
>>>>>>>>>>>>                  s_fence = sched_job->s_fence;
>>>>>>>>>>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>>>>>>>>>>                  }
>>>>>>>>>>>>                  wake_up(&sched->job_scheduled);
>>>>>>>>>>>> +               drm_sched_run_job_queue_if_ready(sched);
>>>>>>>>>>>>          }
>>>>>>>>>>>> -
>>>>>>>>>>>> -again:
>>>>>>>>>>>> -       drm_sched_submit_queue(sched);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       /**
>>>>>>>>>>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>>>>>>>          spin_lock_init(&sched->job_list_lock);
>>>>>>>>>>>>          atomic_set(&sched->hw_rq_count, 0);
>>>>>>>>>>>>          INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>>>>>>>>> -       INIT_WORK(&sched->work_submit, drm_sched_main);
>>>>>>>>>>>> +       INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>>>>>>>>>>> +       INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>>>>>>>>>>          atomic_set(&sched->_score, 0);
>>>>>>>>>>>>          atomic64_set(&sched->job_id_count, 0);
>>>>>>>>>>>>          sched->pause_submit = false;
>>>>>>>>>>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>>>>>>>>>>       void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          WRITE_ONCE(sched->pause_submit, true);
>>>>>>>>>>>> -       cancel_work_sync(&sched->work_submit);
>>>>>>>>>>>> +       cancel_work_sync(&sched->work_run_job);
>>>>>>>>>>>> +       cancel_work_sync(&sched->work_free_job);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>>>>>>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>>>>>>>>       void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>>>>>>>>>>       {
>>>>>>>>>>>>          WRITE_ONCE(sched->pause_submit, false);
>>>>>>>>>>>> -       queue_work(sched->submit_wq, &sched->work_submit);
>>>>>>>>>>>> +       queue_work(sched->submit_wq, &sched->work_run_job);
>>>>>>>>>>>> +       queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>>>>>>>>       }
>>>>>>>>>>>>       EXPORT_SYMBOL(drm_sched_submit_start);
>>>>>>>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>>>>>>>> index 04eec2d7635f..fbc083a92757 100644
>>>>>>>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>>>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>>>>>>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>>>>>>>>>>        *                 finished.
>>>>>>>>>>>>        * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>>>>>>>>>>        * @job_id_count: used to assign unique id to the each job.
>>>>>>>>>>>> - * @submit_wq: workqueue used to queue @work_submit
>>>>>>>>>>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>>>>>>>>>>        * @timeout_wq: workqueue used to queue @work_tdr
>>>>>>>>>>>> - * @work_submit: schedules jobs and cleans up entities
>>>>>>>>>>>> + * @work_run_job: schedules jobs
>>>>>>>>>>>> + * @work_free_job: cleans up jobs
>>>>>>>>>>>>        * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>>>>>>>>        *            timeout interval is over.
>>>>>>>>>>>>        * @pending_list: the list of jobs which are currently in the job queue.
>>>>>>>>>>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>>>>>>>>>>          atomic64_t                      job_id_count;
>>>>>>>>>>>>          struct workqueue_struct         *submit_wq;
>>>>>>>>>>>>          struct workqueue_struct         *timeout_wq;
>>>>>>>>>>>> -       struct work_struct              work_submit;
>>>>>>>>>>>> +       struct work_struct              work_run_job;
>>>>>>>>>>>> +       struct work_struct              work_free_job;
>>>>>>>>>>>>          struct delayed_work             work_tdr;
>>>>>>>>>>>>          struct list_head                pending_list;
>>>>>>>>>>>>          spinlock_t                      job_list_lock;
Matthew Brost Aug. 24, 2023, 2:30 p.m. UTC | #13
On Thu, Aug 24, 2023 at 01:44:41PM +0200, Christian König wrote:
> Am 24.08.23 um 01:12 schrieb Matthew Brost:
> > On Wed, Aug 23, 2023 at 01:26:09PM -0400, Rodrigo Vivi wrote:
> > > On Wed, Aug 23, 2023 at 11:41:19AM -0400, Alex Deucher wrote:
> > > > On Wed, Aug 23, 2023 at 11:26 AM Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > On Wed, Aug 23, 2023 at 09:10:51AM +0200, Christian König wrote:
> > > > > > Am 23.08.23 um 05:27 schrieb Matthew Brost:
> > > > > > > [SNIP]
> > > > > > > > That is exactly what I want to avoid, tying the TDR to the job is what some
> > > > > > > > AMD engineers pushed for because it looked like a simple solution and made
> > > > > > > > the whole thing similar to what Windows does.
> > > > > > > > 
> > > > > > > > This turned the previous relatively clean scheduler and TDR design into a
> > > > > > > > complete nightmare. The job contains quite a bunch of things which are not
> > > > > > > > necessarily available after the application which submitted the job is torn
> > > > > > > > down.
> > > > > > > > 
> > > > > > > Agree the TDR shouldn't be accessing anything application specific
> > > > > > > rather just internal job state required to tear the job down on the
> > > > > > > hardware.
> > > > > > > > So what happens is that you either have stale pointers in the TDR which can
> > > > > > > > go boom extremely easily or we somehow find a way to keep the necessary
> > > > > > > I have not experenced the TDR going boom in Xe.
> > > > > > > 
> > > > > > > > structures (which include struct thread_info and struct file for this driver
> > > > > > > > connection) alive until all submissions are completed.
> > > > > > > > 
> > > > > > > In Xe we keep everything alive until all submissions are completed. By
> > > > > > > everything I mean the drm job, entity, scheduler, and VM via a reference
> > > > > > > counting scheme. All of these structures are just kernel state which can
> > > > > > > safely be accessed even if the application has been killed.
> > > > > > Yeah, but that might just not be such a good idea from memory management
> > > > > > point of view.
> > > > > > 
> > > > > > When you (for example) kill a process all resource from that progress should
> > > > > > at least be queued to be freed more or less immediately.
> > > > > > 
> > > > > We do this, the TDR kicks jobs off the hardware as fast as the hw
> > > > > interface allows and signals all pending hw fences immediately after.
> > > > > Free jobs then is immediately called and the reference count goes to
> > > > > zero. I think max time for all of this to occur is a handful of ms.
> > > > > 
> > > > > > What Linux is doing for other I/O operations is to keep the relevant pages
> > > > > > alive until the I/O operation is completed, but for GPUs that usually means
> > > > > > keeping most of the memory of the process alive and that in turn is really
> > > > > > not something you can do.
> > > > > > 
> > > > > > You can of course do this if your driver has a reliable way of killing your
> > > > > > submissions and freeing resources in a reasonable amount of time. This
> > > > > > should then be done in the flush callback.
> > > > > > 
> > > > > 'flush callback' - Do you mean drm_sched_entity_flush? I looked at that
> > > > > and think that function doesn't even work for what I tell. It flushes
> > > > > the spsc queue but what about jobs on the hardware, how do those get
> > > > > killed?
> > > > > 
> > > > > As stated we do via the TDR which is rather clean design and fits with
> > > > > our reference couting scheme.
> > > > > 
> > > > > > > If we need to teardown on demand we just set the TDR to a minimum value and
> > > > > > > it kicks the jobs off the hardware, gracefully cleans everything up and
> > > > > > > drops all references. This is a benefit of the 1 to 1 relationship, not
> > > > > > > sure if this works with how AMDGPU uses the scheduler.
> > > > > > > 
> > > > > > > > Delaying application tear down is also not an option because then you run
> > > > > > > > into massive trouble with the OOM killer (or more generally OOM handling).
> > > > > > > > See what we do in drm_sched_entity_flush() as well.
> > > > > > > > 
> > > > > > > Not an issue for Xe, we never call drm_sched_entity_flush as our
> > > > > > > referencing counting scheme is all jobs are finished before we attempt
> > > > > > > to tear down entity / scheduler.
> > > > > > I don't think you can do that upstream. Calling drm_sched_entity_flush() is
> > > > > > a must have from your flush callback for the file descriptor.
> > > > > > 
> > > > > Again 'flush callback'? What are you refering too.
> > > > > 
> > > > > And why does drm_sched_entity_flush need to be called, doesn't seem to
> > > > > do anything useful.
> > > > > 
> > > > > > Unless you have some other method for killing your submissions this would
> > > > > > give a path for a deny of service attack vector when the Xe driver is in
> > > > > > use.
> > > > > > 
> > > > > Yes, once th TDR fires is disallows all new submissions at the exec
> > > > > IOCTL plus flushes any pending submissions as fast as possible.
> > > > > 
> > > > > > > > Since adding the TDR support we completely exercised this through in the
> > > > > > > > last two or three years or so. And to sum it up I would really like to get
> > > > > > > > away from this mess again.
> > > > > > > > 
> > > > > > > > Compared to that what i915 does is actually rather clean I think.
> > > > > > > > 
> > > > > > > Not even close, resets where a nightmare in the i915 (I spend years
> > > > > > > trying to get this right and probably still completely work) and in Xe
> > > > > > > basically got it right on the attempt.
> > > > > > > 
> > > > > > > > >     Also in Xe some of
> > > > > > > > > things done in free_job cannot be from an IRQ context, hence calling
> > > > > > > > > this from the scheduler worker is rather helpful.
> > > > > > > > Well putting things for cleanup into a workitem doesn't sounds like
> > > > > > > > something hard.
> > > > > > > > 
> > > > > > > That is exactly what we doing in the scheduler with the free_job
> > > > > > > workitem.
> > > > > > Yeah, but I think that we do it in the scheduler and not the driver is
> > > > > > problematic.
> > > Christian, I do see your point on simply get rid of free job callbacks here
> > > then use fence with own-driver workqueue and house cleaning. But I wonder if
> > > starting with this patch as a clear separation of that is not a step forward
> > > and that could be cleaned up on a follow up!?
> > > 
> > > Matt, why exactly do we need the separation in this patch? Commit message tells
> > > what it is doing and that it is aligned with design, but is not clear on why
> > > exactly we need this right now. Specially if in the end what we want is exactly
> > > keeping the submit_wq to ensure the serialization of the operations you mentioned.
> > > I mean, could we simply drop this patch and then work on a follow-up later and
> > > investigate the Christian suggestion when we are in-tree?
> > > 
> > I believe Christian suggested this change in a previous rev (free_job,
> > proccess_msg) in there own workitem [1].
> > 
> > Dropping free_job / calling run_job again is really a completely
> > different topic than this patch.
> 
> Yeah, agree. I just wanted to bring this up before we put even more effort
> in the free_job based approach.
> 
> Rodrigos point is a really good one, no matter if the driver or the
> scheduler frees the job. Doing that in a separate work item sounds like the
> right thing to do.
> 

Ok, so this patch for now is ok but as a follow up we should explore
dropping free_job / scheduler refs to jobs with a wider audience as this
change affects all drivers.

Matt

> Regards,
> Christian.
> 
> > 
> > [1] https://patchwork.freedesktop.org/patch/550722/?series=121745&rev=1
> > 
> > > > > Disagree, a common clean callback from a non-irq contexts IMO is a good
> > > > > design rather than each driver possibly having its own worker for
> > > > > cleanup.
> > > > > 
> > > > > > For the scheduler it shouldn't care about the job any more as soon as the
> > > > > > driver takes over.
> > > > > > 
> > > > > This a massive rewrite for all users of the DRM scheduler, I'm saying
> > > > > for Xe what you are suggesting makes little to no sense.
> > > > > 
> > > > > I'd like other users of the DRM scheduler to chime in on what you
> > > > > purposing. The scope of this change affects 8ish drivers that would
> > > > > require buy in each of the stakeholders. I certainly can't change of
> > > > > these drivers as I don't feel comfortable in all of those code bases nor
> > > > > do I have hardware to test all of these drivers.
> > > > > 
> > > > > > > > Question is what do you really need for TDR which is not inside the hardware
> > > > > > > > fence?
> > > > > > > > 
> > > > > > > A reference to the entity to be able to kick the job off the hardware.
> > > > > > > A reference to the entity, job, and VM for error capture.
> > > > > > > 
> > > > > > > We also need a reference to the job for recovery after a GPU reset so
> > > > > > > run_job can be called again for innocent jobs.
> > > > > > Well exactly that's what I'm massively pushing back. Letting the scheduler
> > > > > > call run_job() for the same job again is *NOT* something you can actually
> > > > > > do.
> > > > > > 
> > > > > But lots of drivers do this already and the DRM scheduler documents
> > > > > this.
> > > > > 
> > > > > > This pretty clearly violates some of the dma_fence constrains and has cause
> > > > > > massively headaches for me already.
> > > > > > 
> > > > > Seems to work fine in Xe.
> > > > > 
> > > > > > What you can do is to do this inside your driver, e.g. take the jobs and
> > > > > > push them again to the hw ring or just tell the hw to start executing again
> > > > > > from a previous position.
> > > > > > 
> > > > > Again this now is massive rewrite of many drivers.
> > > > > 
> > > > > > BTW that re-submitting of jobs seems to be a no-go from userspace
> > > > > > perspective as well. Take a look at the Vulkan spec for that, at least Marek
> > > > > > pretty much pointed out that we should absolutely not do this inside the
> > > > > > kernel.
> > > > > > 
> > > > > Yes if the job causes the hang, we ban the queue. Typcially only per
> > > > > entity (queue) resets are done in Xe but occasionally device level
> > > > > resets are done (issues with hardware) and innocent jobs / entities call
> > > > > run_job again.
> > > > If the engine is reset and the job was already executing, how can you
> > > > determine that it's in a good state to resubmit?  What if some
> > If a job has started but not completed we ban the queue during device
> > reset. If a queue have jobs submitted but not started we resubmit all
> > jobs on the queue during device reset.
> > 
> > The started / completed state can be determined by looking at a seqno in
> > memory.
> > 
> > > > internal fence or semaphore in memory used by the logic in the command
> > > > buffer has been signaled already and then you resubmit the job and it
> > > > now starts executing with different input state?
> > > I believe we could set some more rules in the new robustness documentation:
> > > https://lore.kernel.org/all/20230818200642.276735-1-andrealmeid@igalia.com/
> > > 
> > > For this robustness implementation i915 pin point the exact context that
> > > was in execution when the gpu hang and only blame that, although the
> > > ressubmission is up to the user space. While on Xe we are blaming every
> > > single context that was in the queue. So I'm actually confused on what
> > > are the innocent jobs and who are calling for resubmission, if all of
> > > them got banned and blamed.
> > See above, innocent job == submited job but not started (i.e. a job
> > stuck in the FW queue not yet been put on the hardware). Because we have
> > a FW scheduler we could have 1000s of innocent jobs that don't need to
> > get banned. This is very different from drivers without FW schedulers as
> > typically when run_job is called the job hits the hardware immediately.
> > 
> > Matt
> > 
> > > > Alex
> > > > 
> > > > > > The generally right approach seems to be to cleanly signal to userspace that
> > > > > > something bad happened and that userspace then needs to submit things again
> > > > > > even for innocent jobs.
> > > > > > 
> > > > > I disagree that innocent jobs should be banned. What you are suggesting
> > > > > is if a device reset needs to be done we kill / ban every user space queue.
> > > > > Thats seems like overkill. Not seeing where that is stated in this doc
> > > > > [1], it seems to imply that only jobs that are stuck results in bans.
> > > > > 
> > > > > Matt
> > > > > 
> > > > > [1] https://patchwork.freedesktop.org/patch/553465/?series=119883&rev=3
> > > > > 
> > > > > > Regards,
> > > > > > Christian.
> > > > > > 
> > > > > > > All of this leads to believe we need to stick with the design.
> > > > > > > 
> > > > > > > Matt
> > > > > > > 
> > > > > > > > Regards,
> > > > > > > > Christian.
> > > > > > > > 
> > > > > > > > > The HW fence can live for longer as it can be installed in dma-resv
> > > > > > > > > slots, syncobjs, etc... If the job and hw fence are combined now we
> > > > > > > > > holding on the memory for the longer and perhaps at the mercy of the
> > > > > > > > > user. We also run the risk of the final put being done from an IRQ
> > > > > > > > > context which again wont work in Xe as it is currently coded. Lastly 2
> > > > > > > > > jobs from the same scheduler could do the final put in parallel, so
> > > > > > > > > rather than having free_job serialized by the worker now multiple jobs
> > > > > > > > > are freeing themselves at the same time. This might not be an issue but
> > > > > > > > > adds another level of raceyness that needs to be accounted for. None of
> > > > > > > > > this sounds desirable to me.
> > > > > > > > > 
> > > > > > > > > FWIW what you suggesting sounds like how the i915 did things
> > > > > > > > > (i915_request and hw fence in 1 memory alloc) and that turned out to be
> > > > > > > > > a huge mess. As rule of thumb I generally do the opposite of whatever
> > > > > > > > > the i915 did.
> > > > > > > > > 
> > > > > > > > > Matt
> > > > > > > > > 
> > > > > > > > > > Christian.
> > > > > > > > > > 
> > > > > > > > > > > Matt
> > > > > > > > > > > 
> > > > > > > > > > > > All the lifetime issues we had came from ignoring this fact and I think we
> > > > > > > > > > > > should push for fixing this design up again.
> > > > > > > > > > > > 
> > > > > > > > > > > > Regards,
> > > > > > > > > > > > Christian.
> > > > > > > > > > > > 
> > > > > > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >       drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > > > > > > > > > > >       include/drm/gpu_scheduler.h            |   8 +-
> > > > > > > > > > > > >       2 files changed, 106 insertions(+), 39 deletions(-)
> > > > > > > > > > > > > 
> > > > > > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > > index cede47afc800..b67469eac179 100644
> > > > > > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > > > > > > > > > > >        * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * @rq: scheduler run queue to check.
> > > > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * Try to find a ready entity, returns NULL if none found.
> > > > > > > > > > > > >        */
> > > > > > > > > > > > >       static struct drm_sched_entity *
> > > > > > > > > > > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          struct drm_sched_entity *entity;
> > > > > > > > > > > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > > > >          if (entity) {
> > > > > > > > > > > > >                  list_for_each_entry_continue(entity, &rq->entities, list) {
> > > > > > > > > > > > >                          if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > > > -                               rq->current_entity = entity;
> > > > > > > > > > > > > -                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                               if (dequeue) {
> > > > > > > > > > > > > +                                       rq->current_entity = entity;
> > > > > > > > > > > > > +                                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                               }
> > > > > > > > > > > > >                                  spin_unlock(&rq->lock);
> > > > > > > > > > > > >                                  return entity;
> > > > > > > > > > > > >                          }
> > > > > > > > > > > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > > > >          list_for_each_entry(entity, &rq->entities, list) {
> > > > > > > > > > > > >                  if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                       }
> > > > > > > > > > > > >                          spin_unlock(&rq->lock);
> > > > > > > > > > > > >                          return entity;
> > > > > > > > > > > > >                  }
> > > > > > > > > > > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > > > > > > > > > >        * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * @rq: scheduler run queue to check.
> > > > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * Find oldest waiting ready entity, returns NULL if none found.
> > > > > > > > > > > > >        */
> > > > > > > > > > > > >       static struct drm_sched_entity *
> > > > > > > > > > > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          struct rb_node *rb;
> > > > > > > > > > > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > > > >                  entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > > > > > > > > > > >                  if (drm_sched_entity_is_ready(entity)) {
> > > > > > > > > > > > > -                       rq->current_entity = entity;
> > > > > > > > > > > > > -                       reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                       if (dequeue) {
> > > > > > > > > > > > > +                               rq->current_entity = entity;
> > > > > > > > > > > > > +                               reinit_completion(&entity->entity_idle);
> > > > > > > > > > > > > +                       }
> > > > > > > > > > > > >                          break;
> > > > > > > > > > > > >                  }
> > > > > > > > > > > > >          }
> > > > > > > > > > > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > > > > > > > > > > + * drm_sched_run_job_queue - queue job submission
> > > > > > > > > > > > >        * @sched: scheduler instance
> > > > > > > > > > > > >        */
> > > > > > > > > > > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > > > > -               queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static struct drm_sched_entity *
> > > > > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +/**
> > > > > > > > > > > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > > > > > > > > > > + * @sched: scheduler instance
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       if (drm_sched_select_entity(sched, false))
> > > > > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +/**
> > > > > > > > > > > > > + * drm_sched_free_job_queue - queue free job
> > > > > > > > > > > > > + *
> > > > > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       if (!READ_ONCE(sched->pause_submit))
> > > > > > > > > > > > > +               queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +/**
> > > > > > > > > > > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > > > > > > > > > > + *
> > > > > > > > > > > > > + * @sched: scheduler instance to queue free job
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct drm_sched_job *job;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       spin_lock(&sched->job_list_lock);
> > > > > > > > > > > > > +       job = list_first_entry_or_null(&sched->pending_list,
> > > > > > > > > > > > > +                                      struct drm_sched_job, list);
> > > > > > > > > > > > > +       if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > > > > > > > > > > +               drm_sched_free_job_queue(sched);
> > > > > > > > > > > > > +       spin_unlock(&sched->job_list_lock);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > > > > > > > > > > >          dma_fence_get(&s_fence->finished);
> > > > > > > > > > > > >          drm_sched_fence_finished(s_fence, result);
> > > > > > > > > > > > >          dma_fence_put(&s_fence->finished);
> > > > > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > > > > > +       drm_sched_free_job_queue(sched);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >       void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          if (drm_sched_can_queue(sched))
> > > > > > > > > > > > > -               drm_sched_submit_queue(sched);
> > > > > > > > > > > > > +               drm_sched_run_job_queue(sched);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > >        * drm_sched_select_entity - Select next entity to process
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * @sched: scheduler instance
> > > > > > > > > > > > > + * @dequeue: dequeue selected entity
> > > > > > > > > > > > >        *
> > > > > > > > > > > > >        * Returns the entity to process or NULL if none are found.
> > > > > > > > > > > > >        */
> > > > > > > > > > > > >       static struct drm_sched_entity *
> > > > > > > > > > > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          struct drm_sched_entity *entity;
> > > > > > > > > > > > >          int i;
> > > > > > > > > > > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >          /* Kernel run queue has higher priority than normal run queue*/
> > > > > > > > > > > > >          for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > > > > > > > > > > >                  entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > > > > > > > > > > -                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > > > > > > > > > > -                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > > > > > > > > > > +                       drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > > > > > > > > > > +                                                       dequeue) :
> > > > > > > > > > > > > +                       drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > > > > > > > > > > +                                                     dequeue);
> > > > > > > > > > > > >                  if (entity)
> > > > > > > > > > > > >                          break;
> > > > > > > > > > > > >          }
> > > > > > > > > > > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > > > > > > > > > > >       EXPORT_SYMBOL(drm_sched_pick_best);
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > > - * drm_sched_main - main scheduler thread
> > > > > > > > > > > > > + * drm_sched_free_job_work - worker to call free_job
> > > > > > > > > > > > >        *
> > > > > > > > > > > > > - * @param: scheduler instance
> > > > > > > > > > > > > + * @w: free job work
> > > > > > > > > > > > >        */
> > > > > > > > > > > > > -static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          struct drm_gpu_scheduler *sched =
> > > > > > > > > > > > > -               container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > > > > > > > > > > -       struct drm_sched_entity *entity;
> > > > > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > > > > > > > > > > >          struct drm_sched_job *cleanup_job;
> > > > > > > > > > > > > -       int r;
> > > > > > > > > > > > >          if (READ_ONCE(sched->pause_submit))
> > > > > > > > > > > > >                  return;
> > > > > > > > > > > > >          cleanup_job = drm_sched_get_cleanup_job(sched);
> > > > > > > > > > > > > -       entity = drm_sched_select_entity(sched);
> > > > > > > > > > > > > +       if (cleanup_job) {
> > > > > > > > > > > > > +               sched->ops->free_job(cleanup_job);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               drm_sched_free_job_queue_if_ready(sched);
> > > > > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > -       if (!entity && !cleanup_job)
> > > > > > > > > > > > > -               return; /* No more work */
> > > > > > > > > > > > > +/**
> > > > > > > > > > > > > + * drm_sched_run_job_work - worker to call run_job
> > > > > > > > > > > > > + *
> > > > > > > > > > > > > + * @w: run job work
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct drm_gpu_scheduler *sched =
> > > > > > > > > > > > > +               container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > > > > > > > > > > +       struct drm_sched_entity *entity;
> > > > > > > > > > > > > +       int r;
> > > > > > > > > > > > > -       if (cleanup_job)
> > > > > > > > > > > > > -               sched->ops->free_job(cleanup_job);
> > > > > > > > > > > > > +       if (READ_ONCE(sched->pause_submit))
> > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > +       entity = drm_sched_select_entity(sched, true);
> > > > > > > > > > > > >          if (entity) {
> > > > > > > > > > > > >                  struct dma_fence *fence;
> > > > > > > > > > > > >                  struct drm_sched_fence *s_fence;
> > > > > > > > > > > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > > > >                  sched_job = drm_sched_entity_pop_job(entity);
> > > > > > > > > > > > >                  if (!sched_job) {
> > > > > > > > > > > > >                          complete_all(&entity->entity_idle);
> > > > > > > > > > > > > -                       if (!cleanup_job)
> > > > > > > > > > > > > -                               return; /* No more work */
> > > > > > > > > > > > > -                       goto again;
> > > > > > > > > > > > > +                       return; /* No more work */
> > > > > > > > > > > > >                  }
> > > > > > > > > > > > >                  s_fence = sched_job->s_fence;
> > > > > > > > > > > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > > > > > > > > > > >                  }
> > > > > > > > > > > > >                  wake_up(&sched->job_scheduled);
> > > > > > > > > > > > > +               drm_sched_run_job_queue_if_ready(sched);
> > > > > > > > > > > > >          }
> > > > > > > > > > > > > -
> > > > > > > > > > > > > -again:
> > > > > > > > > > > > > -       drm_sched_submit_queue(sched);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       /**
> > > > > > > > > > > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > > > > > >          spin_lock_init(&sched->job_list_lock);
> > > > > > > > > > > > >          atomic_set(&sched->hw_rq_count, 0);
> > > > > > > > > > > > >          INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > > > > > > > > > > -       INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > > > > > > > > > > +       INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > > > > > > > > > > +       INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > > > > > > > > > > >          atomic_set(&sched->_score, 0);
> > > > > > > > > > > > >          atomic64_set(&sched->job_id_count, 0);
> > > > > > > > > > > > >          sched->pause_submit = false;
> > > > > > > > > > > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > > > > > > > > > > >       void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          WRITE_ONCE(sched->pause_submit, true);
> > > > > > > > > > > > > -       cancel_work_sync(&sched->work_submit);
> > > > > > > > > > > > > +       cancel_work_sync(&sched->work_run_job);
> > > > > > > > > > > > > +       cancel_work_sync(&sched->work_free_job);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > > > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > > > > > > > > > >       void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > > > > > > > > > > >       {
> > > > > > > > > > > > >          WRITE_ONCE(sched->pause_submit, false);
> > > > > > > > > > > > > -       queue_work(sched->submit_wq, &sched->work_submit);
> > > > > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_run_job);
> > > > > > > > > > > > > +       queue_work(sched->submit_wq, &sched->work_free_job);
> > > > > > > > > > > > >       }
> > > > > > > > > > > > >       EXPORT_SYMBOL(drm_sched_submit_start);
> > > > > > > > > > > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > > > > > > > > > > index 04eec2d7635f..fbc083a92757 100644
> > > > > > > > > > > > > --- a/include/drm/gpu_scheduler.h
> > > > > > > > > > > > > +++ b/include/drm/gpu_scheduler.h
> > > > > > > > > > > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > > > > > > > > > > >        *                 finished.
> > > > > > > > > > > > >        * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > > > > > > > > > > >        * @job_id_count: used to assign unique id to the each job.
> > > > > > > > > > > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > > > > > > > > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > > > > > > > > > > >        * @timeout_wq: workqueue used to queue @work_tdr
> > > > > > > > > > > > > - * @work_submit: schedules jobs and cleans up entities
> > > > > > > > > > > > > + * @work_run_job: schedules jobs
> > > > > > > > > > > > > + * @work_free_job: cleans up jobs
> > > > > > > > > > > > >        * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > > > > > > > > > > >        *            timeout interval is over.
> > > > > > > > > > > > >        * @pending_list: the list of jobs which are currently in the job queue.
> > > > > > > > > > > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > > > > > > > > > > >          atomic64_t                      job_id_count;
> > > > > > > > > > > > >          struct workqueue_struct         *submit_wq;
> > > > > > > > > > > > >          struct workqueue_struct         *timeout_wq;
> > > > > > > > > > > > > -       struct work_struct              work_submit;
> > > > > > > > > > > > > +       struct work_struct              work_run_job;
> > > > > > > > > > > > > +       struct work_struct              work_free_job;
> > > > > > > > > > > > >          struct delayed_work             work_tdr;
> > > > > > > > > > > > >          struct list_head                pending_list;
> > > > > > > > > > > > >          spinlock_t                      job_list_lock;
>
Danilo Krummrich Aug. 24, 2023, 11:04 p.m. UTC | #14
On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:
> Rather than call free_job and run_job in same work item have a dedicated
> work item for each. This aligns with the design and intended use of work
> queues.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>  include/drm/gpu_scheduler.h            |   8 +-
>  2 files changed, 106 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index cede47afc800..b67469eac179 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>   *
>   * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>   *
>   * Try to find a ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct drm_sched_entity *entity;
>  
> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>  	if (entity) {
>  		list_for_each_entry_continue(entity, &rq->entities, list) {
>  			if (drm_sched_entity_is_ready(entity)) {
> -				rq->current_entity = entity;
> -				reinit_completion(&entity->entity_idle);
> +				if (dequeue) {
> +					rq->current_entity = entity;
> +					reinit_completion(&entity->entity_idle);
> +				}
>  				spin_unlock(&rq->lock);
>  				return entity;
>  			}
> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>  	list_for_each_entry(entity, &rq->entities, list) {
>  
>  		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>  			spin_unlock(&rq->lock);
>  			return entity;
>  		}
> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>   *
>   * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>   *
>   * Find oldest waiting ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct rb_node *rb;
>  
> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>  
>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>  		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>  			break;
>  		}
>  	}
> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>  }
>  
>  /**
> - * drm_sched_submit_queue - scheduler queue submission
> + * drm_sched_run_job_queue - queue job submission
>   * @sched: scheduler instance
>   */
> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>  {
>  	if (!READ_ONCE(sched->pause_submit))
> -		queue_work(sched->submit_wq, &sched->work_submit);
> +		queue_work(sched->submit_wq, &sched->work_run_job);
> +}
> +
> +static struct drm_sched_entity *
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> +
> +/**
> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> + * @sched: scheduler instance
> + */
> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	if (drm_sched_select_entity(sched, false))
> +		drm_sched_run_job_queue(sched);
> +}
> +
> +/**
> + * drm_sched_free_job_queue - queue free job
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> +{
> +	if (!READ_ONCE(sched->pause_submit))
> +		queue_work(sched->submit_wq, &sched->work_free_job);
> +}
> +
> +/**
> + * drm_sched_free_job_queue_if_ready - queue free job if ready
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	struct drm_sched_job *job;
> +
> +	spin_lock(&sched->job_list_lock);
> +	job = list_first_entry_or_null(&sched->pending_list,
> +				       struct drm_sched_job, list);
> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> +		drm_sched_free_job_queue(sched);
> +	spin_unlock(&sched->job_list_lock);
>  }
>  
>  /**
> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>  	dma_fence_get(&s_fence->finished);
>  	drm_sched_fence_finished(s_fence, result);
>  	dma_fence_put(&s_fence->finished);
> -	drm_sched_submit_queue(sched);
> +	drm_sched_free_job_queue(sched);
>  }
>  
>  /**
> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>  void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>  {
>  	if (drm_sched_can_queue(sched))
> -		drm_sched_submit_queue(sched);
> +		drm_sched_run_job_queue(sched);
>  }
>  
>  /**
>   * drm_sched_select_entity - Select next entity to process
>   *
>   * @sched: scheduler instance
> + * @dequeue: dequeue selected entity
>   *
>   * Returns the entity to process or NULL if none are found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>  {
>  	struct drm_sched_entity *entity;
>  	int i;
> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>  	/* Kernel run queue has higher priority than normal run queue*/
>  	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>  		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> +							dequeue) :
> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> +						      dequeue);
>  		if (entity)
>  			break;
>  	}
> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>  EXPORT_SYMBOL(drm_sched_pick_best);
>  
>  /**
> - * drm_sched_main - main scheduler thread
> + * drm_sched_free_job_work - worker to call free_job
>   *
> - * @param: scheduler instance
> + * @w: free job work
>   */
> -static void drm_sched_main(struct work_struct *w)
> +static void drm_sched_free_job_work(struct work_struct *w)
>  {
>  	struct drm_gpu_scheduler *sched =
> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> -	struct drm_sched_entity *entity;
> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>  	struct drm_sched_job *cleanup_job;
> -	int r;
>  
>  	if (READ_ONCE(sched->pause_submit))
>  		return;
>  
>  	cleanup_job = drm_sched_get_cleanup_job(sched);

I tried this patch with Nouveau and found a race condition:

In drm_sched_run_job_work() the job is added to the pending_list via
drm_sched_job_begin(), then the run_job() callback is called and the scheduled
fence is signaled.

However, in parallel drm_sched_get_cleanup_job() might be called from
drm_sched_free_job_work(), which picks the first job from the pending_list and
for the next job on the pending_list sets the scheduled fence' timestamp field.

The job can be on the pending_list, but the scheduled fence might not yet be
signaled. The call to actually signal the fence will subsequently fault because
it will try to dereference the timestamp.

I'm not sure what's the best way to fix this, maybe it's enough to re-order
signalling the scheduled fence and adding the job to the pending_list. Not sure
if this has other implications though.

- Danilo

> -	entity = drm_sched_select_entity(sched);
> +	if (cleanup_job) {
> +		sched->ops->free_job(cleanup_job);
> +
> +		drm_sched_free_job_queue_if_ready(sched);
> +		drm_sched_run_job_queue_if_ready(sched);
> +	}
> +}
>  
> -	if (!entity && !cleanup_job)
> -		return;	/* No more work */
> +/**
> + * drm_sched_run_job_work - worker to call run_job
> + *
> + * @w: run job work
> + */
> +static void drm_sched_run_job_work(struct work_struct *w)
> +{
> +	struct drm_gpu_scheduler *sched =
> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> +	struct drm_sched_entity *entity;
> +	int r;
>  
> -	if (cleanup_job)
> -		sched->ops->free_job(cleanup_job);
> +	if (READ_ONCE(sched->pause_submit))
> +		return;
>  
> +	entity = drm_sched_select_entity(sched, true);
>  	if (entity) {
>  		struct dma_fence *fence;
>  		struct drm_sched_fence *s_fence;
> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>  		sched_job = drm_sched_entity_pop_job(entity);
>  		if (!sched_job) {
>  			complete_all(&entity->entity_idle);
> -			if (!cleanup_job)
> -				return;	/* No more work */
> -			goto again;
> +			return;	/* No more work */
>  		}
>  
>  		s_fence = sched_job->s_fence;
> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>  		}
>  
>  		wake_up(&sched->job_scheduled);
> +		drm_sched_run_job_queue_if_ready(sched);
>  	}
> -
> -again:
> -	drm_sched_submit_queue(sched);
>  }
>  
>  /**
> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  	spin_lock_init(&sched->job_list_lock);
>  	atomic_set(&sched->hw_rq_count, 0);
>  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> -	INIT_WORK(&sched->work_submit, drm_sched_main);
> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>  	atomic_set(&sched->_score, 0);
>  	atomic64_set(&sched->job_id_count, 0);
>  	sched->pause_submit = false;
> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>  void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>  {
>  	WRITE_ONCE(sched->pause_submit, true);
> -	cancel_work_sync(&sched->work_submit);
> +	cancel_work_sync(&sched->work_run_job);
> +	cancel_work_sync(&sched->work_free_job);
>  }
>  EXPORT_SYMBOL(drm_sched_submit_stop);
>  
> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>  void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>  {
>  	WRITE_ONCE(sched->pause_submit, false);
> -	queue_work(sched->submit_wq, &sched->work_submit);
> +	queue_work(sched->submit_wq, &sched->work_run_job);
> +	queue_work(sched->submit_wq, &sched->work_free_job);
>  }
>  EXPORT_SYMBOL(drm_sched_submit_start);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 04eec2d7635f..fbc083a92757 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> - * @submit_wq: workqueue used to queue @work_submit
> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>   * @timeout_wq: workqueue used to queue @work_tdr
> - * @work_submit: schedules jobs and cleans up entities
> + * @work_run_job: schedules jobs
> + * @work_free_job: cleans up jobs
>   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>   *            timeout interval is over.
>   * @pending_list: the list of jobs which are currently in the job queue.
> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>  	atomic64_t			job_id_count;
>  	struct workqueue_struct		*submit_wq;
>  	struct workqueue_struct		*timeout_wq;
> -	struct work_struct		work_submit;
> +	struct work_struct		work_run_job;
> +	struct work_struct		work_free_job;
>  	struct delayed_work		work_tdr;
>  	struct list_head		pending_list;
>  	spinlock_t			job_list_lock;
> -- 
> 2.34.1
>
Matthew Brost Aug. 25, 2023, 2:58 a.m. UTC | #15
On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:
> On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:
> > Rather than call free_job and run_job in same work item have a dedicated
> > work item for each. This aligns with the design and intended use of work
> > queues.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> >  include/drm/gpu_scheduler.h            |   8 +-
> >  2 files changed, 106 insertions(+), 39 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index cede47afc800..b67469eac179 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> >   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> >   *
> >   * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >   *
> >   * Try to find a ready entity, returns NULL if none found.
> >   */
> >  static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> >  {
> >  	struct drm_sched_entity *entity;
> >  
> > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >  	if (entity) {
> >  		list_for_each_entry_continue(entity, &rq->entities, list) {
> >  			if (drm_sched_entity_is_ready(entity)) {
> > -				rq->current_entity = entity;
> > -				reinit_completion(&entity->entity_idle);
> > +				if (dequeue) {
> > +					rq->current_entity = entity;
> > +					reinit_completion(&entity->entity_idle);
> > +				}
> >  				spin_unlock(&rq->lock);
> >  				return entity;
> >  			}
> > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >  	list_for_each_entry(entity, &rq->entities, list) {
> >  
> >  		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >  			spin_unlock(&rq->lock);
> >  			return entity;
> >  		}
> > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> >   *
> >   * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >   *
> >   * Find oldest waiting ready entity, returns NULL if none found.
> >   */
> >  static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> >  {
> >  	struct rb_node *rb;
> >  
> > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >  
> >  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> >  		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >  			break;
> >  		}
> >  	}
> > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >  }
> >  
> >  /**
> > - * drm_sched_submit_queue - scheduler queue submission
> > + * drm_sched_run_job_queue - queue job submission
> >   * @sched: scheduler instance
> >   */
> > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> >  {
> >  	if (!READ_ONCE(sched->pause_submit))
> > -		queue_work(sched->submit_wq, &sched->work_submit);
> > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > +}
> > +
> > +static struct drm_sched_entity *
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > +
> > +/**
> > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > + * @sched: scheduler instance
> > + */
> > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (drm_sched_select_entity(sched, false))
> > +		drm_sched_run_job_queue(sched);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue - queue free job
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (!READ_ONCE(sched->pause_submit))
> > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	struct drm_sched_job *job;
> > +
> > +	spin_lock(&sched->job_list_lock);
> > +	job = list_first_entry_or_null(&sched->pending_list,
> > +				       struct drm_sched_job, list);
> > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > +		drm_sched_free_job_queue(sched);
> > +	spin_unlock(&sched->job_list_lock);
> >  }
> >  
> >  /**
> > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> >  	dma_fence_get(&s_fence->finished);
> >  	drm_sched_fence_finished(s_fence, result);
> >  	dma_fence_put(&s_fence->finished);
> > -	drm_sched_submit_queue(sched);
> > +	drm_sched_free_job_queue(sched);
> >  }
> >  
> >  /**
> > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> >  void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> >  {
> >  	if (drm_sched_can_queue(sched))
> > -		drm_sched_submit_queue(sched);
> > +		drm_sched_run_job_queue(sched);
> >  }
> >  
> >  /**
> >   * drm_sched_select_entity - Select next entity to process
> >   *
> >   * @sched: scheduler instance
> > + * @dequeue: dequeue selected entity
> >   *
> >   * Returns the entity to process or NULL if none are found.
> >   */
> >  static struct drm_sched_entity *
> > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> >  {
> >  	struct drm_sched_entity *entity;
> >  	int i;
> > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> >  	/* Kernel run queue has higher priority than normal run queue*/
> >  	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> >  		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > +							dequeue) :
> > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > +						      dequeue);
> >  		if (entity)
> >  			break;
> >  	}
> > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> >  EXPORT_SYMBOL(drm_sched_pick_best);
> >  
> >  /**
> > - * drm_sched_main - main scheduler thread
> > + * drm_sched_free_job_work - worker to call free_job
> >   *
> > - * @param: scheduler instance
> > + * @w: free job work
> >   */
> > -static void drm_sched_main(struct work_struct *w)
> > +static void drm_sched_free_job_work(struct work_struct *w)
> >  {
> >  	struct drm_gpu_scheduler *sched =
> > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > -	struct drm_sched_entity *entity;
> > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> >  	struct drm_sched_job *cleanup_job;
> > -	int r;
> >  
> >  	if (READ_ONCE(sched->pause_submit))
> >  		return;
> >  
> >  	cleanup_job = drm_sched_get_cleanup_job(sched);
> 
> I tried this patch with Nouveau and found a race condition:
> 
> In drm_sched_run_job_work() the job is added to the pending_list via
> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> fence is signaled.
> 
> However, in parallel drm_sched_get_cleanup_job() might be called from
> drm_sched_free_job_work(), which picks the first job from the pending_list and
> for the next job on the pending_list sets the scheduled fence' timestamp field.
> 
> The job can be on the pending_list, but the scheduled fence might not yet be
> signaled. The call to actually signal the fence will subsequently fault because
> it will try to dereference the timestamp.
> 
> I'm not sure what's the best way to fix this, maybe it's enough to re-order
> signalling the scheduled fence and adding the job to the pending_list. Not sure
> if this has other implications though.
> 

We really want the job on the pending list before calling run_job.

I'm thinking we just delete the updating of the timestamp, not sure why
this is useful.

Or we could do something like this where we try to update the timestamp,
if we can't update the timestamp run_job worker will do it anyways.

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 67e0fb6e7d18..54bd3e88f139 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1074,8 +1074,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
                                                typeof(*next), list);

                if (next) {
-                       next->s_fence->scheduled.timestamp =
-                               job->s_fence->finished.timestamp;
+                       if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
+                                    &next->s_fence->scheduled.flags))
+                               next->s_fence->scheduled.timestamp =
+                                       job->s_fence->finished.timestamp;
                        /* start TO timer for next job */
                        drm_sched_start_timeout(sched);
                }

I guess I'm leaning towards the latter option.

Matt

> - Danilo
> 
> > -	entity = drm_sched_select_entity(sched);
> > +	if (cleanup_job) {
> > +		sched->ops->free_job(cleanup_job);
> > +
> > +		drm_sched_free_job_queue_if_ready(sched);
> > +		drm_sched_run_job_queue_if_ready(sched);
> > +	}
> > +}
> >  
> > -	if (!entity && !cleanup_job)
> > -		return;	/* No more work */
> > +/**
> > + * drm_sched_run_job_work - worker to call run_job
> > + *
> > + * @w: run job work
> > + */
> > +static void drm_sched_run_job_work(struct work_struct *w)
> > +{
> > +	struct drm_gpu_scheduler *sched =
> > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > +	struct drm_sched_entity *entity;
> > +	int r;
> >  
> > -	if (cleanup_job)
> > -		sched->ops->free_job(cleanup_job);
> > +	if (READ_ONCE(sched->pause_submit))
> > +		return;
> >  
> > +	entity = drm_sched_select_entity(sched, true);
> >  	if (entity) {
> >  		struct dma_fence *fence;
> >  		struct drm_sched_fence *s_fence;
> > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> >  		sched_job = drm_sched_entity_pop_job(entity);
> >  		if (!sched_job) {
> >  			complete_all(&entity->entity_idle);
> > -			if (!cleanup_job)
> > -				return;	/* No more work */
> > -			goto again;
> > +			return;	/* No more work */
> >  		}
> >  
> >  		s_fence = sched_job->s_fence;
> > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> >  		}
> >  
> >  		wake_up(&sched->job_scheduled);
> > +		drm_sched_run_job_queue_if_ready(sched);
> >  	}
> > -
> > -again:
> > -	drm_sched_submit_queue(sched);
> >  }
> >  
> >  /**
> > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >  	spin_lock_init(&sched->job_list_lock);
> >  	atomic_set(&sched->hw_rq_count, 0);
> >  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> >  	atomic_set(&sched->_score, 0);
> >  	atomic64_set(&sched->job_id_count, 0);
> >  	sched->pause_submit = false;
> > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> >  void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> >  {
> >  	WRITE_ONCE(sched->pause_submit, true);
> > -	cancel_work_sync(&sched->work_submit);
> > +	cancel_work_sync(&sched->work_run_job);
> > +	cancel_work_sync(&sched->work_free_job);
> >  }
> >  EXPORT_SYMBOL(drm_sched_submit_stop);
> >  
> > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> >  void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> >  {
> >  	WRITE_ONCE(sched->pause_submit, false);
> > -	queue_work(sched->submit_wq, &sched->work_submit);
> > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > +	queue_work(sched->submit_wq, &sched->work_free_job);
> >  }
> >  EXPORT_SYMBOL(drm_sched_submit_start);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 04eec2d7635f..fbc083a92757 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> >   *                 finished.
> >   * @hw_rq_count: the number of jobs currently in the hardware queue.
> >   * @job_id_count: used to assign unique id to the each job.
> > - * @submit_wq: workqueue used to queue @work_submit
> > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> >   * @timeout_wq: workqueue used to queue @work_tdr
> > - * @work_submit: schedules jobs and cleans up entities
> > + * @work_run_job: schedules jobs
> > + * @work_free_job: cleans up jobs
> >   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> >   *            timeout interval is over.
> >   * @pending_list: the list of jobs which are currently in the job queue.
> > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> >  	atomic64_t			job_id_count;
> >  	struct workqueue_struct		*submit_wq;
> >  	struct workqueue_struct		*timeout_wq;
> > -	struct work_struct		work_submit;
> > +	struct work_struct		work_run_job;
> > +	struct work_struct		work_free_job;
> >  	struct delayed_work		work_tdr;
> >  	struct list_head		pending_list;
> >  	spinlock_t			job_list_lock;
> > -- 
> > 2.34.1
> > 
>
Christian König Aug. 25, 2023, 8:02 a.m. UTC | #16
Am 25.08.23 um 04:58 schrieb Matthew Brost:
> On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:
>> On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:
>>> Rather than call free_job and run_job in same work item have a dedicated
>>> work item for each. This aligns with the design and intended use of work
>>> queues.
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>   include/drm/gpu_scheduler.h            |   8 +-
>>>   2 files changed, 106 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index cede47afc800..b67469eac179 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>    *
>>>    * @rq: scheduler run queue to check.
>>> + * @dequeue: dequeue selected entity
>>>    *
>>>    * Try to find a ready entity, returns NULL if none found.
>>>    */
>>>   static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>   {
>>>   	struct drm_sched_entity *entity;
>>>   
>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>   	if (entity) {
>>>   		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>   			if (drm_sched_entity_is_ready(entity)) {
>>> -				rq->current_entity = entity;
>>> -				reinit_completion(&entity->entity_idle);
>>> +				if (dequeue) {
>>> +					rq->current_entity = entity;
>>> +					reinit_completion(&entity->entity_idle);
>>> +				}
>>>   				spin_unlock(&rq->lock);
>>>   				return entity;
>>>   			}
>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>   	list_for_each_entry(entity, &rq->entities, list) {
>>>   
>>>   		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (dequeue) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>   			spin_unlock(&rq->lock);
>>>   			return entity;
>>>   		}
>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>    *
>>>    * @rq: scheduler run queue to check.
>>> + * @dequeue: dequeue selected entity
>>>    *
>>>    * Find oldest waiting ready entity, returns NULL if none found.
>>>    */
>>>   static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>   {
>>>   	struct rb_node *rb;
>>>   
>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>   
>>>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>   		if (drm_sched_entity_is_ready(entity)) {
>>> -			rq->current_entity = entity;
>>> -			reinit_completion(&entity->entity_idle);
>>> +			if (dequeue) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +			}
>>>   			break;
>>>   		}
>>>   	}
>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>   }
>>>   
>>>   /**
>>> - * drm_sched_submit_queue - scheduler queue submission
>>> + * drm_sched_run_job_queue - queue job submission
>>>    * @sched: scheduler instance
>>>    */
>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>   {
>>>   	if (!READ_ONCE(sched->pause_submit))
>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>> +}
>>> +
>>> +static struct drm_sched_entity *
>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>> +
>>> +/**
>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>> + * @sched: scheduler instance
>>> + */
>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (drm_sched_select_entity(sched, false))
>>> +		drm_sched_run_job_queue(sched);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue - queue free job
>>> + *
>>> + * @sched: scheduler instance to queue free job
>>> + */
>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	if (!READ_ONCE(sched->pause_submit))
>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>> +}
>>> +
>>> +/**
>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>> + *
>>> + * @sched: scheduler instance to queue free job
>>> + */
>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>> +{
>>> +	struct drm_sched_job *job;
>>> +
>>> +	spin_lock(&sched->job_list_lock);
>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>> +				       struct drm_sched_job, list);
>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>> +		drm_sched_free_job_queue(sched);
>>> +	spin_unlock(&sched->job_list_lock);
>>>   }
>>>   
>>>   /**
>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>   	dma_fence_get(&s_fence->finished);
>>>   	drm_sched_fence_finished(s_fence, result);
>>>   	dma_fence_put(&s_fence->finished);
>>> -	drm_sched_submit_queue(sched);
>>> +	drm_sched_free_job_queue(sched);
>>>   }
>>>   
>>>   /**
>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>   {
>>>   	if (drm_sched_can_queue(sched))
>>> -		drm_sched_submit_queue(sched);
>>> +		drm_sched_run_job_queue(sched);
>>>   }
>>>   
>>>   /**
>>>    * drm_sched_select_entity - Select next entity to process
>>>    *
>>>    * @sched: scheduler instance
>>> + * @dequeue: dequeue selected entity
>>>    *
>>>    * Returns the entity to process or NULL if none are found.
>>>    */
>>>   static struct drm_sched_entity *
>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>   {
>>>   	struct drm_sched_entity *entity;
>>>   	int i;
>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>   	/* Kernel run queue has higher priority than normal run queue*/
>>>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>> +							dequeue) :
>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>> +						      dequeue);
>>>   		if (entity)
>>>   			break;
>>>   	}
>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>   EXPORT_SYMBOL(drm_sched_pick_best);
>>>   
>>>   /**
>>> - * drm_sched_main - main scheduler thread
>>> + * drm_sched_free_job_work - worker to call free_job
>>>    *
>>> - * @param: scheduler instance
>>> + * @w: free job work
>>>    */
>>> -static void drm_sched_main(struct work_struct *w)
>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>   {
>>>   	struct drm_gpu_scheduler *sched =
>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>> -	struct drm_sched_entity *entity;
>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>   	struct drm_sched_job *cleanup_job;
>>> -	int r;
>>>   
>>>   	if (READ_ONCE(sched->pause_submit))
>>>   		return;
>>>   
>>>   	cleanup_job = drm_sched_get_cleanup_job(sched);
>> I tried this patch with Nouveau and found a race condition:
>>
>> In drm_sched_run_job_work() the job is added to the pending_list via
>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
>> fence is signaled.
>>
>> However, in parallel drm_sched_get_cleanup_job() might be called from
>> drm_sched_free_job_work(), which picks the first job from the pending_list and
>> for the next job on the pending_list sets the scheduled fence' timestamp field.

Well why can this happen in parallel? Either the work items are 
scheduled to a single threaded work queue or you have protected the 
pending list with some locks.

Just moving the free_job into a separate work item without such 
precautions won't work because of quite a bunch of other reasons as well.

>>
>> The job can be on the pending_list, but the scheduled fence might not yet be
>> signaled. The call to actually signal the fence will subsequently fault because
>> it will try to dereference the timestamp.
>>
>> I'm not sure what's the best way to fix this, maybe it's enough to re-order
>> signalling the scheduled fence and adding the job to the pending_list. Not sure
>> if this has other implications though.
>>
> We really want the job on the pending list before calling run_job.
>
> I'm thinking we just delete the updating of the timestamp, not sure why
> this is useful.

This is used for calculating how long each job has spend on the hw, so 
big NAK to deleting this.

Regards,
Christian.

>
> Or we could do something like this where we try to update the timestamp,
> if we can't update the timestamp run_job worker will do it anyways.
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 67e0fb6e7d18..54bd3e88f139 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1074,8 +1074,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>                                                  typeof(*next), list);
>
>                  if (next) {
> -                       next->s_fence->scheduled.timestamp =
> -                               job->s_fence->finished.timestamp;
> +                       if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
> +                                    &next->s_fence->scheduled.flags))
> +                               next->s_fence->scheduled.timestamp =
> +                                       job->s_fence->finished.timestamp;
>                          /* start TO timer for next job */
>                          drm_sched_start_timeout(sched);
>                  }
>
> I guess I'm leaning towards the latter option.
>
> Matt
>
>> - Danilo
>>
>>> -	entity = drm_sched_select_entity(sched);
>>> +	if (cleanup_job) {
>>> +		sched->ops->free_job(cleanup_job);
>>> +
>>> +		drm_sched_free_job_queue_if_ready(sched);
>>> +		drm_sched_run_job_queue_if_ready(sched);
>>> +	}
>>> +}
>>>   
>>> -	if (!entity && !cleanup_job)
>>> -		return;	/* No more work */
>>> +/**
>>> + * drm_sched_run_job_work - worker to call run_job
>>> + *
>>> + * @w: run job work
>>> + */
>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>> +{
>>> +	struct drm_gpu_scheduler *sched =
>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>> +	struct drm_sched_entity *entity;
>>> +	int r;
>>>   
>>> -	if (cleanup_job)
>>> -		sched->ops->free_job(cleanup_job);
>>> +	if (READ_ONCE(sched->pause_submit))
>>> +		return;
>>>   
>>> +	entity = drm_sched_select_entity(sched, true);
>>>   	if (entity) {
>>>   		struct dma_fence *fence;
>>>   		struct drm_sched_fence *s_fence;
>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>   		sched_job = drm_sched_entity_pop_job(entity);
>>>   		if (!sched_job) {
>>>   			complete_all(&entity->entity_idle);
>>> -			if (!cleanup_job)
>>> -				return;	/* No more work */
>>> -			goto again;
>>> +			return;	/* No more work */
>>>   		}
>>>   
>>>   		s_fence = sched_job->s_fence;
>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>   		}
>>>   
>>>   		wake_up(&sched->job_scheduled);
>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>   	}
>>> -
>>> -again:
>>> -	drm_sched_submit_queue(sched);
>>>   }
>>>   
>>>   /**
>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>   	spin_lock_init(&sched->job_list_lock);
>>>   	atomic_set(&sched->hw_rq_count, 0);
>>>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>   	atomic_set(&sched->_score, 0);
>>>   	atomic64_set(&sched->job_id_count, 0);
>>>   	sched->pause_submit = false;
>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>   {
>>>   	WRITE_ONCE(sched->pause_submit, true);
>>> -	cancel_work_sync(&sched->work_submit);
>>> +	cancel_work_sync(&sched->work_run_job);
>>> +	cancel_work_sync(&sched->work_free_job);
>>>   }
>>>   EXPORT_SYMBOL(drm_sched_submit_stop);
>>>   
>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>   {
>>>   	WRITE_ONCE(sched->pause_submit, false);
>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>   }
>>>   EXPORT_SYMBOL(drm_sched_submit_start);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 04eec2d7635f..fbc083a92757 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>    *                 finished.
>>>    * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>    * @job_id_count: used to assign unique id to the each job.
>>> - * @submit_wq: workqueue used to queue @work_submit
>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>    * @timeout_wq: workqueue used to queue @work_tdr
>>> - * @work_submit: schedules jobs and cleans up entities
>>> + * @work_run_job: schedules jobs
>>> + * @work_free_job: cleans up jobs
>>>    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>    *            timeout interval is over.
>>>    * @pending_list: the list of jobs which are currently in the job queue.
>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>   	atomic64_t			job_id_count;
>>>   	struct workqueue_struct		*submit_wq;
>>>   	struct workqueue_struct		*timeout_wq;
>>> -	struct work_struct		work_submit;
>>> +	struct work_struct		work_run_job;
>>> +	struct work_struct		work_free_job;
>>>   	struct delayed_work		work_tdr;
>>>   	struct list_head		pending_list;
>>>   	spinlock_t			job_list_lock;
>>> -- 
>>> 2.34.1
>>>
Matthew Brost Aug. 25, 2023, 1:36 p.m. UTC | #17
On Fri, Aug 25, 2023 at 10:02:32AM +0200, Christian König wrote:
> Am 25.08.23 um 04:58 schrieb Matthew Brost:
> > On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:
> > > On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:
> > > > Rather than call free_job and run_job in same work item have a dedicated
> > > > work item for each. This aligns with the design and intended use of work
> > > > queues.
> > > > 
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > > >   include/drm/gpu_scheduler.h            |   8 +-
> > > >   2 files changed, 106 insertions(+), 39 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > > > index cede47afc800..b67469eac179 100644
> > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > > >    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > > >    *
> > > >    * @rq: scheduler run queue to check.
> > > > + * @dequeue: dequeue selected entity
> > > >    *
> > > >    * Try to find a ready entity, returns NULL if none found.
> > > >    */
> > > >   static struct drm_sched_entity *
> > > > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > > >   {
> > > >   	struct drm_sched_entity *entity;
> > > > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >   	if (entity) {
> > > >   		list_for_each_entry_continue(entity, &rq->entities, list) {
> > > >   			if (drm_sched_entity_is_ready(entity)) {
> > > > -				rq->current_entity = entity;
> > > > -				reinit_completion(&entity->entity_idle);
> > > > +				if (dequeue) {
> > > > +					rq->current_entity = entity;
> > > > +					reinit_completion(&entity->entity_idle);
> > > > +				}
> > > >   				spin_unlock(&rq->lock);
> > > >   				return entity;
> > > >   			}
> > > > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >   	list_for_each_entry(entity, &rq->entities, list) {
> > > >   		if (drm_sched_entity_is_ready(entity)) {
> > > > -			rq->current_entity = entity;
> > > > -			reinit_completion(&entity->entity_idle);
> > > > +			if (dequeue) {
> > > > +				rq->current_entity = entity;
> > > > +				reinit_completion(&entity->entity_idle);
> > > > +			}
> > > >   			spin_unlock(&rq->lock);
> > > >   			return entity;
> > > >   		}
> > > > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > > >    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > > >    *
> > > >    * @rq: scheduler run queue to check.
> > > > + * @dequeue: dequeue selected entity
> > > >    *
> > > >    * Find oldest waiting ready entity, returns NULL if none found.
> > > >    */
> > > >   static struct drm_sched_entity *
> > > > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > > >   {
> > > >   	struct rb_node *rb;
> > > > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > >   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > > >   		if (drm_sched_entity_is_ready(entity)) {
> > > > -			rq->current_entity = entity;
> > > > -			reinit_completion(&entity->entity_idle);
> > > > +			if (dequeue) {
> > > > +				rq->current_entity = entity;
> > > > +				reinit_completion(&entity->entity_idle);
> > > > +			}
> > > >   			break;
> > > >   		}
> > > >   	}
> > > > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > > >   }
> > > >   /**
> > > > - * drm_sched_submit_queue - scheduler queue submission
> > > > + * drm_sched_run_job_queue - queue job submission
> > > >    * @sched: scheduler instance
> > > >    */
> > > > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > > > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > > >   {
> > > >   	if (!READ_ONCE(sched->pause_submit))
> > > > -		queue_work(sched->submit_wq, &sched->work_submit);
> > > > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > > > +}
> > > > +
> > > > +static struct drm_sched_entity *
> > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > > > +
> > > > +/**
> > > > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > > > + * @sched: scheduler instance
> > > > + */
> > > > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	if (drm_sched_select_entity(sched, false))
> > > > +		drm_sched_run_job_queue(sched);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_sched_free_job_queue - queue free job
> > > > + *
> > > > + * @sched: scheduler instance to queue free job
> > > > + */
> > > > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	if (!READ_ONCE(sched->pause_submit))
> > > > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > > > + *
> > > > + * @sched: scheduler instance to queue free job
> > > > + */
> > > > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > > > +{
> > > > +	struct drm_sched_job *job;
> > > > +
> > > > +	spin_lock(&sched->job_list_lock);
> > > > +	job = list_first_entry_or_null(&sched->pending_list,
> > > > +				       struct drm_sched_job, list);
> > > > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > > > +		drm_sched_free_job_queue(sched);
> > > > +	spin_unlock(&sched->job_list_lock);
> > > >   }
> > > >   /**
> > > > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > > >   	dma_fence_get(&s_fence->finished);
> > > >   	drm_sched_fence_finished(s_fence, result);
> > > >   	dma_fence_put(&s_fence->finished);
> > > > -	drm_sched_submit_queue(sched);
> > > > +	drm_sched_free_job_queue(sched);
> > > >   }
> > > >   /**
> > > > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > > >   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > > >   {
> > > >   	if (drm_sched_can_queue(sched))
> > > > -		drm_sched_submit_queue(sched);
> > > > +		drm_sched_run_job_queue(sched);
> > > >   }
> > > >   /**
> > > >    * drm_sched_select_entity - Select next entity to process
> > > >    *
> > > >    * @sched: scheduler instance
> > > > + * @dequeue: dequeue selected entity
> > > >    *
> > > >    * Returns the entity to process or NULL if none are found.
> > > >    */
> > > >   static struct drm_sched_entity *
> > > > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > > >   {
> > > >   	struct drm_sched_entity *entity;
> > > >   	int i;
> > > > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > > >   	/* Kernel run queue has higher priority than normal run queue*/
> > > >   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > > >   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > > > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > > > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > > > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > > > +							dequeue) :
> > > > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > > > +						      dequeue);
> > > >   		if (entity)
> > > >   			break;
> > > >   	}
> > > > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > > >   EXPORT_SYMBOL(drm_sched_pick_best);
> > > >   /**
> > > > - * drm_sched_main - main scheduler thread
> > > > + * drm_sched_free_job_work - worker to call free_job
> > > >    *
> > > > - * @param: scheduler instance
> > > > + * @w: free job work
> > > >    */
> > > > -static void drm_sched_main(struct work_struct *w)
> > > > +static void drm_sched_free_job_work(struct work_struct *w)
> > > >   {
> > > >   	struct drm_gpu_scheduler *sched =
> > > > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > > > -	struct drm_sched_entity *entity;
> > > > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> > > >   	struct drm_sched_job *cleanup_job;
> > > > -	int r;
> > > >   	if (READ_ONCE(sched->pause_submit))
> > > >   		return;
> > > >   	cleanup_job = drm_sched_get_cleanup_job(sched);
> > > I tried this patch with Nouveau and found a race condition:
> > > 
> > > In drm_sched_run_job_work() the job is added to the pending_list via
> > > drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> > > fence is signaled.
> > > 
> > > However, in parallel drm_sched_get_cleanup_job() might be called from
> > > drm_sched_free_job_work(), which picks the first job from the pending_list and
> > > for the next job on the pending_list sets the scheduled fence' timestamp field.
> 
> Well why can this happen in parallel? Either the work items are scheduled to
> a single threaded work queue or you have protected the pending list with
> some locks.
> 

Xe uses a single-threaded work queue, Nouveau does not (desired
behavior).

The list of pending jobs is protected by a lock (safe), the race is:

add job to pending list
run_job
signal scheduled fence

dequeue from pending list
free_job
update timestamp

Once a job is on the pending list its timestamp can be accessed which
can blow up if scheduled fence isn't signaled or more specifically unless
DMA_FENCE_FLAG_TIMESTAMP_BIT is set. Logically it makes sense for the
job to be in the pending list before run_job and signal the scheduled
fence after run_job so I think we need to live with this race.

> Just moving the free_job into a separate work item without such precautions
> won't work because of quite a bunch of other reasons as well.
>

Yes, free_job might not be safe to run in parallel with run_job
depending on the driver vfuncs. Mention this in the cover letter.

Certainly this should be safe in the scheduler code though and I think
it will be after fixing this.

Matt

> > > 
> > > The job can be on the pending_list, but the scheduled fence might not yet be
> > > signaled. The call to actually signal the fence will subsequently fault because
> > > it will try to dereference the timestamp.
> > > 
> > > I'm not sure what's the best way to fix this, maybe it's enough to re-order
> > > signalling the scheduled fence and adding the job to the pending_list. Not sure
> > > if this has other implications though.
> > > 
> > We really want the job on the pending list before calling run_job.
> > 
> > I'm thinking we just delete the updating of the timestamp, not sure why
> > this is useful.
> 
> This is used for calculating how long each job has spend on the hw, so big
> NAK to deleting this.
>

Ah, I see that AMDGPU uses this. Previously just checked the scheduler
code.

The below patch should work just fine then.

Matt

> Regards,
> Christian.
> 
> > 
> > Or we could do something like this where we try to update the timestamp,
> > if we can't update the timestamp run_job worker will do it anyways.
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index 67e0fb6e7d18..54bd3e88f139 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -1074,8 +1074,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
> >                                                  typeof(*next), list);
> > 
> >                  if (next) {
> > -                       next->s_fence->scheduled.timestamp =
> > -                               job->s_fence->finished.timestamp;
> > +                       if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
> > +                                    &next->s_fence->scheduled.flags))
> > +                               next->s_fence->scheduled.timestamp =
> > +                                       job->s_fence->finished.timestamp;
> >                          /* start TO timer for next job */
> >                          drm_sched_start_timeout(sched);
> >                  }
> > 
> > I guess I'm leaning towards the latter option.
> > 
> > Matt
> > 
> > > - Danilo
> > > 
> > > > -	entity = drm_sched_select_entity(sched);
> > > > +	if (cleanup_job) {
> > > > +		sched->ops->free_job(cleanup_job);
> > > > +
> > > > +		drm_sched_free_job_queue_if_ready(sched);
> > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > > +	}
> > > > +}
> > > > -	if (!entity && !cleanup_job)
> > > > -		return;	/* No more work */
> > > > +/**
> > > > + * drm_sched_run_job_work - worker to call run_job
> > > > + *
> > > > + * @w: run job work
> > > > + */
> > > > +static void drm_sched_run_job_work(struct work_struct *w)
> > > > +{
> > > > +	struct drm_gpu_scheduler *sched =
> > > > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > > > +	struct drm_sched_entity *entity;
> > > > +	int r;
> > > > -	if (cleanup_job)
> > > > -		sched->ops->free_job(cleanup_job);
> > > > +	if (READ_ONCE(sched->pause_submit))
> > > > +		return;
> > > > +	entity = drm_sched_select_entity(sched, true);
> > > >   	if (entity) {
> > > >   		struct dma_fence *fence;
> > > >   		struct drm_sched_fence *s_fence;
> > > > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> > > >   		sched_job = drm_sched_entity_pop_job(entity);
> > > >   		if (!sched_job) {
> > > >   			complete_all(&entity->entity_idle);
> > > > -			if (!cleanup_job)
> > > > -				return;	/* No more work */
> > > > -			goto again;
> > > > +			return;	/* No more work */
> > > >   		}
> > > >   		s_fence = sched_job->s_fence;
> > > > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> > > >   		}
> > > >   		wake_up(&sched->job_scheduled);
> > > > +		drm_sched_run_job_queue_if_ready(sched);
> > > >   	}
> > > > -
> > > > -again:
> > > > -	drm_sched_submit_queue(sched);
> > > >   }
> > > >   /**
> > > > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> > > >   	spin_lock_init(&sched->job_list_lock);
> > > >   	atomic_set(&sched->hw_rq_count, 0);
> > > >   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > > > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > > > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > > > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> > > >   	atomic_set(&sched->_score, 0);
> > > >   	atomic64_set(&sched->job_id_count, 0);
> > > >   	sched->pause_submit = false;
> > > > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> > > >   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> > > >   {
> > > >   	WRITE_ONCE(sched->pause_submit, true);
> > > > -	cancel_work_sync(&sched->work_submit);
> > > > +	cancel_work_sync(&sched->work_run_job);
> > > > +	cancel_work_sync(&sched->work_free_job);
> > > >   }
> > > >   EXPORT_SYMBOL(drm_sched_submit_stop);
> > > > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> > > >   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> > > >   {
> > > >   	WRITE_ONCE(sched->pause_submit, false);
> > > > -	queue_work(sched->submit_wq, &sched->work_submit);
> > > > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > > > +	queue_work(sched->submit_wq, &sched->work_free_job);
> > > >   }
> > > >   EXPORT_SYMBOL(drm_sched_submit_start);
> > > > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > > > index 04eec2d7635f..fbc083a92757 100644
> > > > --- a/include/drm/gpu_scheduler.h
> > > > +++ b/include/drm/gpu_scheduler.h
> > > > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> > > >    *                 finished.
> > > >    * @hw_rq_count: the number of jobs currently in the hardware queue.
> > > >    * @job_id_count: used to assign unique id to the each job.
> > > > - * @submit_wq: workqueue used to queue @work_submit
> > > > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> > > >    * @timeout_wq: workqueue used to queue @work_tdr
> > > > - * @work_submit: schedules jobs and cleans up entities
> > > > + * @work_run_job: schedules jobs
> > > > + * @work_free_job: cleans up jobs
> > > >    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> > > >    *            timeout interval is over.
> > > >    * @pending_list: the list of jobs which are currently in the job queue.
> > > > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> > > >   	atomic64_t			job_id_count;
> > > >   	struct workqueue_struct		*submit_wq;
> > > >   	struct workqueue_struct		*timeout_wq;
> > > > -	struct work_struct		work_submit;
> > > > +	struct work_struct		work_run_job;
> > > > +	struct work_struct		work_free_job;
> > > >   	struct delayed_work		work_tdr;
> > > >   	struct list_head		pending_list;
> > > >   	spinlock_t			job_list_lock;
> > > > -- 
> > > > 2.34.1
> > > > 
>
Christian König Aug. 25, 2023, 1:45 p.m. UTC | #18
Am 25.08.23 um 15:36 schrieb Matthew Brost:
> On Fri, Aug 25, 2023 at 10:02:32AM +0200, Christian König wrote:
>> Am 25.08.23 um 04:58 schrieb Matthew Brost:
>>> On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:
>>>> On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:
>>>>> Rather than call free_job and run_job in same work item have a dedicated
>>>>> work item for each. This aligns with the design and intended use of work
>>>>> queues.
>>>>>
>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>>>    include/drm/gpu_scheduler.h            |   8 +-
>>>>>    2 files changed, 106 insertions(+), 39 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> index cede47afc800..b67469eac179 100644
>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>>>     * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>>>     *
>>>>>     * @rq: scheduler run queue to check.
>>>>> + * @dequeue: dequeue selected entity
>>>>>     *
>>>>>     * Try to find a ready entity, returns NULL if none found.
>>>>>     */
>>>>>    static struct drm_sched_entity *
>>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>>>>    {
>>>>>    	struct drm_sched_entity *entity;
>>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>    	if (entity) {
>>>>>    		list_for_each_entry_continue(entity, &rq->entities, list) {
>>>>>    			if (drm_sched_entity_is_ready(entity)) {
>>>>> -				rq->current_entity = entity;
>>>>> -				reinit_completion(&entity->entity_idle);
>>>>> +				if (dequeue) {
>>>>> +					rq->current_entity = entity;
>>>>> +					reinit_completion(&entity->entity_idle);
>>>>> +				}
>>>>>    				spin_unlock(&rq->lock);
>>>>>    				return entity;
>>>>>    			}
>>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>    	list_for_each_entry(entity, &rq->entities, list) {
>>>>>    		if (drm_sched_entity_is_ready(entity)) {
>>>>> -			rq->current_entity = entity;
>>>>> -			reinit_completion(&entity->entity_idle);
>>>>> +			if (dequeue) {
>>>>> +				rq->current_entity = entity;
>>>>> +				reinit_completion(&entity->entity_idle);
>>>>> +			}
>>>>>    			spin_unlock(&rq->lock);
>>>>>    			return entity;
>>>>>    		}
>>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>>>     * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>>>>     *
>>>>>     * @rq: scheduler run queue to check.
>>>>> + * @dequeue: dequeue selected entity
>>>>>     *
>>>>>     * Find oldest waiting ready entity, returns NULL if none found.
>>>>>     */
>>>>>    static struct drm_sched_entity *
>>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>>>>    {
>>>>>    	struct rb_node *rb;
>>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>    		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>>>>    		if (drm_sched_entity_is_ready(entity)) {
>>>>> -			rq->current_entity = entity;
>>>>> -			reinit_completion(&entity->entity_idle);
>>>>> +			if (dequeue) {
>>>>> +				rq->current_entity = entity;
>>>>> +				reinit_completion(&entity->entity_idle);
>>>>> +			}
>>>>>    			break;
>>>>>    		}
>>>>>    	}
>>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>>>>    }
>>>>>    /**
>>>>> - * drm_sched_submit_queue - scheduler queue submission
>>>>> + * drm_sched_run_job_queue - queue job submission
>>>>>     * @sched: scheduler instance
>>>>>     */
>>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
>>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>>>>    {
>>>>>    	if (!READ_ONCE(sched->pause_submit))
>>>>> -		queue_work(sched->submit_wq, &sched->work_submit);
>>>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
>>>>> +}
>>>>> +
>>>>> +static struct drm_sched_entity *
>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
>>>>> + * @sched: scheduler instance
>>>>> + */
>>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	if (drm_sched_select_entity(sched, false))
>>>>> +		drm_sched_run_job_queue(sched);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_free_job_queue - queue free job
>>>>> + *
>>>>> + * @sched: scheduler instance to queue free job
>>>>> + */
>>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	if (!READ_ONCE(sched->pause_submit))
>>>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
>>>>> + *
>>>>> + * @sched: scheduler instance to queue free job
>>>>> + */
>>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
>>>>> +{
>>>>> +	struct drm_sched_job *job;
>>>>> +
>>>>> +	spin_lock(&sched->job_list_lock);
>>>>> +	job = list_first_entry_or_null(&sched->pending_list,
>>>>> +				       struct drm_sched_job, list);
>>>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>>>> +		drm_sched_free_job_queue(sched);
>>>>> +	spin_unlock(&sched->job_list_lock);
>>>>>    }
>>>>>    /**
>>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>>>>    	dma_fence_get(&s_fence->finished);
>>>>>    	drm_sched_fence_finished(s_fence, result);
>>>>>    	dma_fence_put(&s_fence->finished);
>>>>> -	drm_sched_submit_queue(sched);
>>>>> +	drm_sched_free_job_queue(sched);
>>>>>    }
>>>>>    /**
>>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>>>>>    void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>>>>>    {
>>>>>    	if (drm_sched_can_queue(sched))
>>>>> -		drm_sched_submit_queue(sched);
>>>>> +		drm_sched_run_job_queue(sched);
>>>>>    }
>>>>>    /**
>>>>>     * drm_sched_select_entity - Select next entity to process
>>>>>     *
>>>>>     * @sched: scheduler instance
>>>>> + * @dequeue: dequeue selected entity
>>>>>     *
>>>>>     * Returns the entity to process or NULL if none are found.
>>>>>     */
>>>>>    static struct drm_sched_entity *
>>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>>>>    {
>>>>>    	struct drm_sched_entity *entity;
>>>>>    	int i;
>>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>>>    	/* Kernel run queue has higher priority than normal run queue*/
>>>>>    	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>>>>    		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>>>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
>>>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
>>>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>>>>> +							dequeue) :
>>>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>>>>> +						      dequeue);
>>>>>    		if (entity)
>>>>>    			break;
>>>>>    	}
>>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>>>    EXPORT_SYMBOL(drm_sched_pick_best);
>>>>>    /**
>>>>> - * drm_sched_main - main scheduler thread
>>>>> + * drm_sched_free_job_work - worker to call free_job
>>>>>     *
>>>>> - * @param: scheduler instance
>>>>> + * @w: free job work
>>>>>     */
>>>>> -static void drm_sched_main(struct work_struct *w)
>>>>> +static void drm_sched_free_job_work(struct work_struct *w)
>>>>>    {
>>>>>    	struct drm_gpu_scheduler *sched =
>>>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
>>>>> -	struct drm_sched_entity *entity;
>>>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>>>>>    	struct drm_sched_job *cleanup_job;
>>>>> -	int r;
>>>>>    	if (READ_ONCE(sched->pause_submit))
>>>>>    		return;
>>>>>    	cleanup_job = drm_sched_get_cleanup_job(sched);
>>>> I tried this patch with Nouveau and found a race condition:
>>>>
>>>> In drm_sched_run_job_work() the job is added to the pending_list via
>>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
>>>> fence is signaled.
>>>>
>>>> However, in parallel drm_sched_get_cleanup_job() might be called from
>>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
>>>> for the next job on the pending_list sets the scheduled fence' timestamp field.
>> Well why can this happen in parallel? Either the work items are scheduled to
>> a single threaded work queue or you have protected the pending list with
>> some locks.
>>
> Xe uses a single-threaded work queue, Nouveau does not (desired
> behavior).
>
> The list of pending jobs is protected by a lock (safe), the race is:
>
> add job to pending list
> run_job
> signal scheduled fence
>
> dequeue from pending list
> free_job
> update timestamp
>
> Once a job is on the pending list its timestamp can be accessed which
> can blow up if scheduled fence isn't signaled or more specifically unless
> DMA_FENCE_FLAG_TIMESTAMP_BIT is set.

Ah, that problem again. No that is actually quite harmless.

You just need to double check if the DMA_FENCE_FLAG_TIMESTAMP_BIT is 
already set and if it's not set don't do anything.

Regards,
Christian.


>   Logically it makes sense for the
> job to be in the pending list before run_job and signal the scheduled
> fence after run_job so I think we need to live with this race.
>
>> Just moving the free_job into a separate work item without such precautions
>> won't work because of quite a bunch of other reasons as well.
>>
> Yes, free_job might not be safe to run in parallel with run_job
> depending on the driver vfuncs. Mention this in the cover letter.
>
> Certainly this should be safe in the scheduler code though and I think
> it will be after fixing this.
>
> Matt
>
>>>> The job can be on the pending_list, but the scheduled fence might not yet be
>>>> signaled. The call to actually signal the fence will subsequently fault because
>>>> it will try to dereference the timestamp.
>>>>
>>>> I'm not sure what's the best way to fix this, maybe it's enough to re-order
>>>> signalling the scheduled fence and adding the job to the pending_list. Not sure
>>>> if this has other implications though.
>>>>
>>> We really want the job on the pending list before calling run_job.
>>>
>>> I'm thinking we just delete the updating of the timestamp, not sure why
>>> this is useful.
>> This is used for calculating how long each job has spend on the hw, so big
>> NAK to deleting this.
>>
> Ah, I see that AMDGPU uses this. Previously just checked the scheduler
> code.
>
> The below patch should work just fine then.
>
> Matt
>
>> Regards,
>> Christian.
>>
>>> Or we could do something like this where we try to update the timestamp,
>>> if we can't update the timestamp run_job worker will do it anyways.
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 67e0fb6e7d18..54bd3e88f139 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -1074,8 +1074,10 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>>                                                   typeof(*next), list);
>>>
>>>                   if (next) {
>>> -                       next->s_fence->scheduled.timestamp =
>>> -                               job->s_fence->finished.timestamp;
>>> +                       if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
>>> +                                    &next->s_fence->scheduled.flags))
>>> +                               next->s_fence->scheduled.timestamp =
>>> +                                       job->s_fence->finished.timestamp;
>>>                           /* start TO timer for next job */
>>>                           drm_sched_start_timeout(sched);
>>>                   }
>>>
>>> I guess I'm leaning towards the latter option.
>>>
>>> Matt
>>>
>>>> - Danilo
>>>>
>>>>> -	entity = drm_sched_select_entity(sched);
>>>>> +	if (cleanup_job) {
>>>>> +		sched->ops->free_job(cleanup_job);
>>>>> +
>>>>> +		drm_sched_free_job_queue_if_ready(sched);
>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>> +	}
>>>>> +}
>>>>> -	if (!entity && !cleanup_job)
>>>>> -		return;	/* No more work */
>>>>> +/**
>>>>> + * drm_sched_run_job_work - worker to call run_job
>>>>> + *
>>>>> + * @w: run job work
>>>>> + */
>>>>> +static void drm_sched_run_job_work(struct work_struct *w)
>>>>> +{
>>>>> +	struct drm_gpu_scheduler *sched =
>>>>> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
>>>>> +	struct drm_sched_entity *entity;
>>>>> +	int r;
>>>>> -	if (cleanup_job)
>>>>> -		sched->ops->free_job(cleanup_job);
>>>>> +	if (READ_ONCE(sched->pause_submit))
>>>>> +		return;
>>>>> +	entity = drm_sched_select_entity(sched, true);
>>>>>    	if (entity) {
>>>>>    		struct dma_fence *fence;
>>>>>    		struct drm_sched_fence *s_fence;
>>>>> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>>>>>    		sched_job = drm_sched_entity_pop_job(entity);
>>>>>    		if (!sched_job) {
>>>>>    			complete_all(&entity->entity_idle);
>>>>> -			if (!cleanup_job)
>>>>> -				return;	/* No more work */
>>>>> -			goto again;
>>>>> +			return;	/* No more work */
>>>>>    		}
>>>>>    		s_fence = sched_job->s_fence;
>>>>> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>>>>>    		}
>>>>>    		wake_up(&sched->job_scheduled);
>>>>> +		drm_sched_run_job_queue_if_ready(sched);
>>>>>    	}
>>>>> -
>>>>> -again:
>>>>> -	drm_sched_submit_queue(sched);
>>>>>    }
>>>>>    /**
>>>>> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>>    	spin_lock_init(&sched->job_list_lock);
>>>>>    	atomic_set(&sched->hw_rq_count, 0);
>>>>>    	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>> -	INIT_WORK(&sched->work_submit, drm_sched_main);
>>>>> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>>>> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>>>>>    	atomic_set(&sched->_score, 0);
>>>>>    	atomic64_set(&sched->job_id_count, 0);
>>>>>    	sched->pause_submit = false;
>>>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>>>    void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>>>>    {
>>>>>    	WRITE_ONCE(sched->pause_submit, true);
>>>>> -	cancel_work_sync(&sched->work_submit);
>>>>> +	cancel_work_sync(&sched->work_run_job);
>>>>> +	cancel_work_sync(&sched->work_free_job);
>>>>>    }
>>>>>    EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>>>>>    void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>>>>>    {
>>>>>    	WRITE_ONCE(sched->pause_submit, false);
>>>>> -	queue_work(sched->submit_wq, &sched->work_submit);
>>>>> +	queue_work(sched->submit_wq, &sched->work_run_job);
>>>>> +	queue_work(sched->submit_wq, &sched->work_free_job);
>>>>>    }
>>>>>    EXPORT_SYMBOL(drm_sched_submit_start);
>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>> index 04eec2d7635f..fbc083a92757 100644
>>>>> --- a/include/drm/gpu_scheduler.h
>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>>>>>     *                 finished.
>>>>>     * @hw_rq_count: the number of jobs currently in the hardware queue.
>>>>>     * @job_id_count: used to assign unique id to the each job.
>>>>> - * @submit_wq: workqueue used to queue @work_submit
>>>>> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>>>>     * @timeout_wq: workqueue used to queue @work_tdr
>>>>> - * @work_submit: schedules jobs and cleans up entities
>>>>> + * @work_run_job: schedules jobs
>>>>> + * @work_free_job: cleans up jobs
>>>>>     * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>>>>>     *            timeout interval is over.
>>>>>     * @pending_list: the list of jobs which are currently in the job queue.
>>>>> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>>>>>    	atomic64_t			job_id_count;
>>>>>    	struct workqueue_struct		*submit_wq;
>>>>>    	struct workqueue_struct		*timeout_wq;
>>>>> -	struct work_struct		work_submit;
>>>>> +	struct work_struct		work_run_job;
>>>>> +	struct work_struct		work_free_job;
>>>>>    	struct delayed_work		work_tdr;
>>>>>    	struct list_head		pending_list;
>>>>>    	spinlock_t			job_list_lock;
>>>>> -- 
>>>>> 2.34.1
>>>>>
Danilo Krummrich Aug. 28, 2023, 6:04 p.m. UTC | #19
On 8/11/23 04:31, Matthew Brost wrote:
> Rather than call free_job and run_job in same work item have a dedicated
> work item for each. This aligns with the design and intended use of work
> queues.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>   include/drm/gpu_scheduler.h            |   8 +-
>   2 files changed, 106 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index cede47afc800..b67469eac179 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>    *
>    * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>    *
>    * Try to find a ready entity, returns NULL if none found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>   {
>   	struct drm_sched_entity *entity;
>   
> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   	if (entity) {
>   		list_for_each_entry_continue(entity, &rq->entities, list) {
>   			if (drm_sched_entity_is_ready(entity)) {
> -				rq->current_entity = entity;
> -				reinit_completion(&entity->entity_idle);
> +				if (dequeue) {
> +					rq->current_entity = entity;
> +					reinit_completion(&entity->entity_idle);
> +				}
>   				spin_unlock(&rq->lock);
>   				return entity;
>   			}
> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>   	list_for_each_entry(entity, &rq->entities, list) {
>   
>   		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>   			spin_unlock(&rq->lock);
>   			return entity;
>   		}
> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>    *
>    * @rq: scheduler run queue to check.
> + * @dequeue: dequeue selected entity
>    *
>    * Find oldest waiting ready entity, returns NULL if none found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>   {
>   	struct rb_node *rb;
>   
> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>   
>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>   		if (drm_sched_entity_is_ready(entity)) {
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> +			if (dequeue) {
> +				rq->current_entity = entity;
> +				reinit_completion(&entity->entity_idle);
> +			}
>   			break;
>   		}
>   	}
> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>   }
>   
>   /**
> - * drm_sched_submit_queue - scheduler queue submission
> + * drm_sched_run_job_queue - queue job submission
>    * @sched: scheduler instance
>    */
> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>   {
>   	if (!READ_ONCE(sched->pause_submit))
> -		queue_work(sched->submit_wq, &sched->work_submit);
> +		queue_work(sched->submit_wq, &sched->work_run_job);
> +}
> +
> +static struct drm_sched_entity *
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> +
> +/**
> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> + * @sched: scheduler instance
> + */
> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	if (drm_sched_select_entity(sched, false))
> +		drm_sched_run_job_queue(sched);
> +}
> +
> +/**
> + * drm_sched_free_job_queue - queue free job
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> +{
> +	if (!READ_ONCE(sched->pause_submit))
> +		queue_work(sched->submit_wq, &sched->work_free_job);
> +}
> +
> +/**
> + * drm_sched_free_job_queue_if_ready - queue free job if ready
> + *
> + * @sched: scheduler instance to queue free job
> + */
> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> +{
> +	struct drm_sched_job *job;
> +
> +	spin_lock(&sched->job_list_lock);
> +	job = list_first_entry_or_null(&sched->pending_list,
> +				       struct drm_sched_job, list);
> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> +		drm_sched_free_job_queue(sched);
> +	spin_unlock(&sched->job_list_lock);
>   }
>   
>   /**
> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>   	dma_fence_get(&s_fence->finished);
>   	drm_sched_fence_finished(s_fence, result);
>   	dma_fence_put(&s_fence->finished);
> -	drm_sched_submit_queue(sched);
> +	drm_sched_free_job_queue(sched);
>   }
>   
>   /**
> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>   {
>   	if (drm_sched_can_queue(sched))
> -		drm_sched_submit_queue(sched);
> +		drm_sched_run_job_queue(sched);
>   }
>   
>   /**
>    * drm_sched_select_entity - Select next entity to process
>    *
>    * @sched: scheduler instance
> + * @dequeue: dequeue selected entity
>    *
>    * Returns the entity to process or NULL if none are found.
>    */
>   static struct drm_sched_entity *
> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>   {
>   	struct drm_sched_entity *entity;
>   	int i;
> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>   	/* Kernel run queue has higher priority than normal run queue*/
>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> +							dequeue) :
> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> +						      dequeue);
>   		if (entity)
>   			break;
>   	}
> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>   EXPORT_SYMBOL(drm_sched_pick_best);
>   
>   /**
> - * drm_sched_main - main scheduler thread
> + * drm_sched_free_job_work - worker to call free_job
>    *
> - * @param: scheduler instance
> + * @w: free job work
>    */
> -static void drm_sched_main(struct work_struct *w)
> +static void drm_sched_free_job_work(struct work_struct *w)
>   {
>   	struct drm_gpu_scheduler *sched =
> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> -	struct drm_sched_entity *entity;
> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
>   	struct drm_sched_job *cleanup_job;
> -	int r;
>   
>   	if (READ_ONCE(sched->pause_submit))
>   		return;
>   
>   	cleanup_job = drm_sched_get_cleanup_job(sched);
> -	entity = drm_sched_select_entity(sched);
> +	if (cleanup_job) {
> +		sched->ops->free_job(cleanup_job);
> +
> +		drm_sched_free_job_queue_if_ready(sched);
> +		drm_sched_run_job_queue_if_ready(sched);
> +	}
> +}
>   
> -	if (!entity && !cleanup_job)
> -		return;	/* No more work */
> +/**
> + * drm_sched_run_job_work - worker to call run_job
> + *
> + * @w: run job work
> + */
> +static void drm_sched_run_job_work(struct work_struct *w)
> +{
> +	struct drm_gpu_scheduler *sched =
> +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> +	struct drm_sched_entity *entity;
> +	int r;
>   
> -	if (cleanup_job)
> -		sched->ops->free_job(cleanup_job);
> +	if (READ_ONCE(sched->pause_submit))
> +		return;
>   
> +	entity = drm_sched_select_entity(sched, true);
>   	if (entity) {
>   		struct dma_fence *fence;
>   		struct drm_sched_fence *s_fence;
> @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
>   		sched_job = drm_sched_entity_pop_job(entity);
>   		if (!sched_job) {
>   			complete_all(&entity->entity_idle);
> -			if (!cleanup_job)
> -				return;	/* No more work */
> -			goto again;
> +			return;	/* No more work */
>   		}
>   
>   		s_fence = sched_job->s_fence;
> @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
>   		}
>   
>   		wake_up(&sched->job_scheduled);
> +		drm_sched_run_job_queue_if_ready(sched);
>   	}
> -
> -again:
> -	drm_sched_submit_queue(sched);
>   }
>   
>   /**
> @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   	spin_lock_init(&sched->job_list_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> -	INIT_WORK(&sched->work_submit, drm_sched_main);
> +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>   	atomic_set(&sched->_score, 0);
>   	atomic64_set(&sched->job_id_count, 0);
>   	sched->pause_submit = false;
> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)

I was wondering what the scheduler teardown sequence looks like for
DRM_SCHED_POLICY_SINGLE_ENTITY and how XE does that.

In Nouveau, userspace can ask the kernel to create a channel (or multiple),
where each channel represents a ring feeding the firmware scheduler. Userspace
can forcefully close channels via either a dedicated IOCTL or by just closing
the FD which subsequently closes all channels opened through this FD.

When this happens the scheduler needs to be teared down. Without keeping track of
things in a driver specific way, the only thing I could really come up with is the
following.

/* Make sure no more jobs are fetched from the entity. */
drm_sched_submit_stop();

/* Wait for the channel to be idle, namely jobs in flight to complete. */
nouveau_channel_idle();

/* Stop the scheduler to free jobs from the pending_list. Ring must be idle at this
  * point, otherwise me might leak jobs. Feels more like a workaround to free
  * finished jobs.
  */
drm_sched_stop();

/* Free jobs from the entity queue. */
drm_sched_entity_fini();

/* Probably not even needed in this case. */
drm_sched_fini();

This doesn't look very straightforward though. I wonder if other drivers feeding
firmware schedulers have similar cases. Maybe something like drm_sched_teardown(),
which would stop job submission, wait for pending jobs to finish and subsequently
free them up would makes sense?

- Danilo

>   {
>   	WRITE_ONCE(sched->pause_submit, true);
> -	cancel_work_sync(&sched->work_submit);
> +	cancel_work_sync(&sched->work_run_job);
> +	cancel_work_sync(&sched->work_free_job);
>   }
>   EXPORT_SYMBOL(drm_sched_submit_stop);
>   
> @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
>   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
>   {
>   	WRITE_ONCE(sched->pause_submit, false);
> -	queue_work(sched->submit_wq, &sched->work_submit);
> +	queue_work(sched->submit_wq, &sched->work_run_job);
> +	queue_work(sched->submit_wq, &sched->work_free_job);
>   }
>   EXPORT_SYMBOL(drm_sched_submit_start);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 04eec2d7635f..fbc083a92757 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
>    *                 finished.
>    * @hw_rq_count: the number of jobs currently in the hardware queue.
>    * @job_id_count: used to assign unique id to the each job.
> - * @submit_wq: workqueue used to queue @work_submit
> + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>    * @timeout_wq: workqueue used to queue @work_tdr
> - * @work_submit: schedules jobs and cleans up entities
> + * @work_run_job: schedules jobs
> + * @work_free_job: cleans up jobs
>    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>    *            timeout interval is over.
>    * @pending_list: the list of jobs which are currently in the job queue.
> @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
>   	atomic64_t			job_id_count;
>   	struct workqueue_struct		*submit_wq;
>   	struct workqueue_struct		*timeout_wq;
> -	struct work_struct		work_submit;
> +	struct work_struct		work_run_job;
> +	struct work_struct		work_free_job;
>   	struct delayed_work		work_tdr;
>   	struct list_head		pending_list;
>   	spinlock_t			job_list_lock;
Matthew Brost Aug. 28, 2023, 6:41 p.m. UTC | #20
On Mon, Aug 28, 2023 at 08:04:31PM +0200, Danilo Krummrich wrote:
> On 8/11/23 04:31, Matthew Brost wrote:
> > Rather than call free_job and run_job in same work item have a dedicated
> > work item for each. This aligns with the design and intended use of work
> > queues.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> >   include/drm/gpu_scheduler.h            |   8 +-
> >   2 files changed, 106 insertions(+), 39 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index cede47afc800..b67469eac179 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> >    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> >    *
> >    * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Try to find a ready entity, returns NULL if none found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> >   {
> >   	struct drm_sched_entity *entity;
> > @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   	if (entity) {
> >   		list_for_each_entry_continue(entity, &rq->entities, list) {
> >   			if (drm_sched_entity_is_ready(entity)) {
> > -				rq->current_entity = entity;
> > -				reinit_completion(&entity->entity_idle);
> > +				if (dequeue) {
> > +					rq->current_entity = entity;
> > +					reinit_completion(&entity->entity_idle);
> > +				}
> >   				spin_unlock(&rq->lock);
> >   				return entity;
> >   			}
> > @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >   	list_for_each_entry(entity, &rq->entities, list) {
> >   		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >   			spin_unlock(&rq->lock);
> >   			return entity;
> >   		}
> > @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> >    *
> >    * @rq: scheduler run queue to check.
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Find oldest waiting ready entity, returns NULL if none found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> >   {
> >   	struct rb_node *rb;
> > @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> >   		if (drm_sched_entity_is_ready(entity)) {
> > -			rq->current_entity = entity;
> > -			reinit_completion(&entity->entity_idle);
> > +			if (dequeue) {
> > +				rq->current_entity = entity;
> > +				reinit_completion(&entity->entity_idle);
> > +			}
> >   			break;
> >   		}
> >   	}
> > @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >   }
> >   /**
> > - * drm_sched_submit_queue - scheduler queue submission
> > + * drm_sched_run_job_queue - queue job submission
> >    * @sched: scheduler instance
> >    */
> > -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> >   {
> >   	if (!READ_ONCE(sched->pause_submit))
> > -		queue_work(sched->submit_wq, &sched->work_submit);
> > +		queue_work(sched->submit_wq, &sched->work_run_job);
> > +}
> > +
> > +static struct drm_sched_entity *
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > +
> > +/**
> > + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > + * @sched: scheduler instance
> > + */
> > +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (drm_sched_select_entity(sched, false))
> > +		drm_sched_run_job_queue(sched);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue - queue free job
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > +{
> > +	if (!READ_ONCE(sched->pause_submit))
> > +		queue_work(sched->submit_wq, &sched->work_free_job);
> > +}
> > +
> > +/**
> > + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > + *
> > + * @sched: scheduler instance to queue free job
> > + */
> > +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > +{
> > +	struct drm_sched_job *job;
> > +
> > +	spin_lock(&sched->job_list_lock);
> > +	job = list_first_entry_or_null(&sched->pending_list,
> > +				       struct drm_sched_job, list);
> > +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > +		drm_sched_free_job_queue(sched);
> > +	spin_unlock(&sched->job_list_lock);
> >   }
> >   /**
> > @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> >   	dma_fence_get(&s_fence->finished);
> >   	drm_sched_fence_finished(s_fence, result);
> >   	dma_fence_put(&s_fence->finished);
> > -	drm_sched_submit_queue(sched);
> > +	drm_sched_free_job_queue(sched);
> >   }
> >   /**
> > @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> >   void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> >   {
> >   	if (drm_sched_can_queue(sched))
> > -		drm_sched_submit_queue(sched);
> > +		drm_sched_run_job_queue(sched);
> >   }
> >   /**
> >    * drm_sched_select_entity - Select next entity to process
> >    *
> >    * @sched: scheduler instance
> > + * @dequeue: dequeue selected entity
> >    *
> >    * Returns the entity to process or NULL if none are found.
> >    */
> >   static struct drm_sched_entity *
> > -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> >   {
> >   	struct drm_sched_entity *entity;
> >   	int i;
> > @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> >   	/* Kernel run queue has higher priority than normal run queue*/
> >   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> >   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > +							dequeue) :
> > +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > +						      dequeue);
> >   		if (entity)
> >   			break;
> >   	}
> > @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> >   EXPORT_SYMBOL(drm_sched_pick_best);
> >   /**
> > - * drm_sched_main - main scheduler thread
> > + * drm_sched_free_job_work - worker to call free_job
> >    *
> > - * @param: scheduler instance
> > + * @w: free job work
> >    */
> > -static void drm_sched_main(struct work_struct *w)
> > +static void drm_sched_free_job_work(struct work_struct *w)
> >   {
> >   	struct drm_gpu_scheduler *sched =
> > -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > -	struct drm_sched_entity *entity;
> > +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> >   	struct drm_sched_job *cleanup_job;
> > -	int r;
> >   	if (READ_ONCE(sched->pause_submit))
> >   		return;
> >   	cleanup_job = drm_sched_get_cleanup_job(sched);
> > -	entity = drm_sched_select_entity(sched);
> > +	if (cleanup_job) {
> > +		sched->ops->free_job(cleanup_job);
> > +
> > +		drm_sched_free_job_queue_if_ready(sched);
> > +		drm_sched_run_job_queue_if_ready(sched);
> > +	}
> > +}
> > -	if (!entity && !cleanup_job)
> > -		return;	/* No more work */
> > +/**
> > + * drm_sched_run_job_work - worker to call run_job
> > + *
> > + * @w: run job work
> > + */
> > +static void drm_sched_run_job_work(struct work_struct *w)
> > +{
> > +	struct drm_gpu_scheduler *sched =
> > +		container_of(w, struct drm_gpu_scheduler, work_run_job);
> > +	struct drm_sched_entity *entity;
> > +	int r;
> > -	if (cleanup_job)
> > -		sched->ops->free_job(cleanup_job);
> > +	if (READ_ONCE(sched->pause_submit))
> > +		return;
> > +	entity = drm_sched_select_entity(sched, true);
> >   	if (entity) {
> >   		struct dma_fence *fence;
> >   		struct drm_sched_fence *s_fence;
> > @@ -1056,9 +1122,7 @@ static void drm_sched_main(struct work_struct *w)
> >   		sched_job = drm_sched_entity_pop_job(entity);
> >   		if (!sched_job) {
> >   			complete_all(&entity->entity_idle);
> > -			if (!cleanup_job)
> > -				return;	/* No more work */
> > -			goto again;
> > +			return;	/* No more work */
> >   		}
> >   		s_fence = sched_job->s_fence;
> > @@ -1088,10 +1152,8 @@ static void drm_sched_main(struct work_struct *w)
> >   		}
> >   		wake_up(&sched->job_scheduled);
> > +		drm_sched_run_job_queue_if_ready(sched);
> >   	}
> > -
> > -again:
> > -	drm_sched_submit_queue(sched);
> >   }
> >   /**
> > @@ -1150,7 +1212,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >   	spin_lock_init(&sched->job_list_lock);
> >   	atomic_set(&sched->hw_rq_count, 0);
> >   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> > -	INIT_WORK(&sched->work_submit, drm_sched_main);
> > +	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> > +	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> >   	atomic_set(&sched->_score, 0);
> >   	atomic64_set(&sched->job_id_count, 0);
> >   	sched->pause_submit = false;
> > @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
> >   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
> 
> I was wondering what the scheduler teardown sequence looks like for
> DRM_SCHED_POLICY_SINGLE_ENTITY and how XE does that.
> 
> In Nouveau, userspace can ask the kernel to create a channel (or multiple),
> where each channel represents a ring feeding the firmware scheduler. Userspace
> can forcefully close channels via either a dedicated IOCTL or by just closing
> the FD which subsequently closes all channels opened through this FD.
> 
> When this happens the scheduler needs to be teared down. Without keeping track of
> things in a driver specific way, the only thing I could really come up with is the
> following.
> 
> /* Make sure no more jobs are fetched from the entity. */
> drm_sched_submit_stop();
> 
> /* Wait for the channel to be idle, namely jobs in flight to complete. */
> nouveau_channel_idle();
> 
> /* Stop the scheduler to free jobs from the pending_list. Ring must be idle at this
>  * point, otherwise me might leak jobs. Feels more like a workaround to free
>  * finished jobs.
>  */
> drm_sched_stop();
> 
> /* Free jobs from the entity queue. */
> drm_sched_entity_fini();
> 
> /* Probably not even needed in this case. */
> drm_sched_fini();
> 
> This doesn't look very straightforward though. I wonder if other drivers feeding
> firmware schedulers have similar cases. Maybe something like drm_sched_teardown(),
> which would stop job submission, wait for pending jobs to finish and subsequently
> free them up would makes sense?
> 

exec queue == gpu scheduler + entity in Xe

We kinda invented our own flow with reference counting + use the TDR for
cleanup.

We have a creation ref for the exec queue plus each job takes a ref to
the exec queue. On exec queue close [1][2] (whether that be IOCTL or FD
close) we drop the creation reference and call a vfunc for killing thr
exec queue. The firmware implementation is here [3].

If you read through it just sets the TDR to the minimum value [4], the
TDR will kick any running jobs the off the hardware, signals the jobs
fences, any jobs waiting on dependencies eventually flush out via
run_job + TDR for cleanup without going on the hardware, the exec queue
reference count goes to zero once all jobs are flushed out, we trigger
the exec queue clean up flow and finally free all memory for the exec
queue.

Using the TDR in this way is how we teardown an exec queue for other
reasons too (user page fault, user job times out, user job hang detected
by firmware, device reset, etc...).

This all works rather nicely and is a single code path for all of these
cases. I'm no sure if this can be made any more generic nor do I really
see the need too (at least I don't see Xe needing a generic solution).

Hope this helps,
Matt

[1] https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_exec_queue.c#L911
[2] https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_device.c#L77
[3] https://gitlab.freedesktop.org/drm/xe/kernel/-/tree/drm-xe-next/drivers/gpu/drm/xe#L1184
[4] https://gitlab.freedesktop.org/drm/xe/kernel/-/tree/drm-xe-next/drivers/gpu/drm/xe#L789

> - Danilo
> 
> >   {
> >   	WRITE_ONCE(sched->pause_submit, true);
> > -	cancel_work_sync(&sched->work_submit);
> > +	cancel_work_sync(&sched->work_run_job);
> > +	cancel_work_sync(&sched->work_free_job);
> >   }
> >   EXPORT_SYMBOL(drm_sched_submit_stop);
> > @@ -1287,6 +1351,7 @@ EXPORT_SYMBOL(drm_sched_submit_stop);
> >   void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
> >   {
> >   	WRITE_ONCE(sched->pause_submit, false);
> > -	queue_work(sched->submit_wq, &sched->work_submit);
> > +	queue_work(sched->submit_wq, &sched->work_run_job);
> > +	queue_work(sched->submit_wq, &sched->work_free_job);
> >   }
> >   EXPORT_SYMBOL(drm_sched_submit_start);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 04eec2d7635f..fbc083a92757 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -487,9 +487,10 @@ struct drm_sched_backend_ops {
> >    *                 finished.
> >    * @hw_rq_count: the number of jobs currently in the hardware queue.
> >    * @job_id_count: used to assign unique id to the each job.
> > - * @submit_wq: workqueue used to queue @work_submit
> > + * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
> >    * @timeout_wq: workqueue used to queue @work_tdr
> > - * @work_submit: schedules jobs and cleans up entities
> > + * @work_run_job: schedules jobs
> > + * @work_free_job: cleans up jobs
> >    * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> >    *            timeout interval is over.
> >    * @pending_list: the list of jobs which are currently in the job queue.
> > @@ -518,7 +519,8 @@ struct drm_gpu_scheduler {
> >   	atomic64_t			job_id_count;
> >   	struct workqueue_struct		*submit_wq;
> >   	struct workqueue_struct		*timeout_wq;
> > -	struct work_struct		work_submit;
> > +	struct work_struct		work_run_job;
> > +	struct work_struct		work_free_job;
> >   	struct delayed_work		work_tdr;
> >   	struct list_head		pending_list;
> >   	spinlock_t			job_list_lock;
>
Danilo Krummrich Aug. 29, 2023, 1:20 a.m. UTC | #21
On 8/28/23 20:41, Matthew Brost wrote:
> On Mon, Aug 28, 2023 at 08:04:31PM +0200, Danilo Krummrich wrote:
>> On 8/11/23 04:31, Matthew Brost wrote:
>>> Rather than call free_job and run_job in same work item have a dedicated
>>> work item for each. This aligns with the design and intended use of work
>>> queues.
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
>>>    include/drm/gpu_scheduler.h            |   8 +-
>>>    2 files changed, 106 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index cede47afc800..b67469eac179 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -1275,7 +1338,8 @@ EXPORT_SYMBOL(drm_sched_submit_ready);
>>>    void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
>>
>> I was wondering what the scheduler teardown sequence looks like for
>> DRM_SCHED_POLICY_SINGLE_ENTITY and how XE does that.
>>
>> In Nouveau, userspace can ask the kernel to create a channel (or multiple),
>> where each channel represents a ring feeding the firmware scheduler. Userspace
>> can forcefully close channels via either a dedicated IOCTL or by just closing
>> the FD which subsequently closes all channels opened through this FD.
>>
>> When this happens the scheduler needs to be teared down. Without keeping track of
>> things in a driver specific way, the only thing I could really come up with is the
>> following.
>>
>> /* Make sure no more jobs are fetched from the entity. */
>> drm_sched_submit_stop();
>>
>> /* Wait for the channel to be idle, namely jobs in flight to complete. */
>> nouveau_channel_idle();
>>
>> /* Stop the scheduler to free jobs from the pending_list. Ring must be idle at this
>>   * point, otherwise me might leak jobs. Feels more like a workaround to free
>>   * finished jobs.
>>   */
>> drm_sched_stop();
>>
>> /* Free jobs from the entity queue. */
>> drm_sched_entity_fini();
>>
>> /* Probably not even needed in this case. */
>> drm_sched_fini();
>>
>> This doesn't look very straightforward though. I wonder if other drivers feeding
>> firmware schedulers have similar cases. Maybe something like drm_sched_teardown(),
>> which would stop job submission, wait for pending jobs to finish and subsequently
>> free them up would makes sense?
>>
> 
> exec queue == gpu scheduler + entity in Xe
> 
> We kinda invented our own flow with reference counting + use the TDR for
> cleanup.

Thanks for the detailed explanation. In case of making it driver specific
I thought about something similar, pretty much the same reference counting,
but instead of the TDR, let jobs from the entity just return -ECANCELED from
job_run() and also signal pending jobs with the same error code.

On the other hand, I don't really want scheduler and job structures to
potentially outlive the channel. Which is where I think I'd be nice to avoid
consuming all the queued up jobs from the entity in the first place, stop the
schdeduler with drm_sched_submit_stop(), signal all pending_jobs with
-ECANCELED and call the free_job() callbacks right away.

The latter I could probably do in Nouveau as well, however, it kinda feels
wrong to do all that within the driver.

Also, I was wondering how existing drivers using the GPU scheduler handle
that. It seems like they just rely on the pending_list of the scheduler being
empty once drm_sched_fini() is called. Admittedly, that's pretty likely (never
to happen) since it's typically called on driver remove, but I don't see how
that's actually ensured. Am I missing something?
> 
> We have a creation ref for the exec queue plus each job takes a ref to
> the exec queue. On exec queue close [1][2] (whether that be IOCTL or FD
> close) we drop the creation reference and call a vfunc for killing thr
> exec queue. The firmware implementation is here [3].
> 
> If you read through it just sets the TDR to the minimum value [4], the
> TDR will kick any running jobs the off the hardware, signals the jobs
> fences, any jobs waiting on dependencies eventually flush out via
> run_job + TDR for cleanup without going on the hardware, the exec queue
> reference count goes to zero once all jobs are flushed out, we trigger
> the exec queue clean up flow and finally free all memory for the exec
> queue.
> 
> Using the TDR in this way is how we teardown an exec queue for other
> reasons too (user page fault, user job times out, user job hang detected
> by firmware, device reset, etc...).
> 
> This all works rather nicely and is a single code path for all of these
> cases. I'm no sure if this can be made any more generic nor do I really
> see the need too (at least I don't see Xe needing a generic solution).
> 
> Hope this helps,
> Matt
> 
> [1] https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_exec_queue.c#L911
> [2] https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_device.c#L77
> [3] https://gitlab.freedesktop.org/drm/xe/kernel/-/tree/drm-xe-next/drivers/gpu/drm/xe#L1184
> [4] https://gitlab.freedesktop.org/drm/xe/kernel/-/tree/drm-xe-next/drivers/gpu/drm/xe#L789
> 
>> - Danilo
>>
Boris Brezillon Sept. 12, 2023, 10:13 a.m. UTC | #22
On Fri, 25 Aug 2023 15:45:49 +0200
Christian König <christian.koenig@amd.com> wrote:

> Am 25.08.23 um 15:36 schrieb Matthew Brost:
> > On Fri, Aug 25, 2023 at 10:02:32AM +0200, Christian König wrote:  
> >> Am 25.08.23 um 04:58 schrieb Matthew Brost:  
> >>> On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:  
> >>>> On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:  
> >>>>> Rather than call free_job and run_job in same work item have a dedicated
> >>>>> work item for each. This aligns with the design and intended use of work
> >>>>> queues.
> >>>>>
> >>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >>>>> ---
> >>>>>    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> >>>>>    include/drm/gpu_scheduler.h            |   8 +-
> >>>>>    2 files changed, 106 insertions(+), 39 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> >>>>> index cede47afc800..b67469eac179 100644
> >>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
> >>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> >>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> >>>>>     * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> >>>>>     *
> >>>>>     * @rq: scheduler run queue to check.
> >>>>> + * @dequeue: dequeue selected entity
> >>>>>     *
> >>>>>     * Try to find a ready entity, returns NULL if none found.
> >>>>>     */
> >>>>>    static struct drm_sched_entity *
> >>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> >>>>>    {
> >>>>>    	struct drm_sched_entity *entity;
> >>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >>>>>    	if (entity) {
> >>>>>    		list_for_each_entry_continue(entity, &rq->entities, list) {
> >>>>>    			if (drm_sched_entity_is_ready(entity)) {
> >>>>> -				rq->current_entity = entity;
> >>>>> -				reinit_completion(&entity->entity_idle);
> >>>>> +				if (dequeue) {
> >>>>> +					rq->current_entity = entity;
> >>>>> +					reinit_completion(&entity->entity_idle);
> >>>>> +				}
> >>>>>    				spin_unlock(&rq->lock);
> >>>>>    				return entity;
> >>>>>    			}
> >>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >>>>>    	list_for_each_entry(entity, &rq->entities, list) {
> >>>>>    		if (drm_sched_entity_is_ready(entity)) {
> >>>>> -			rq->current_entity = entity;
> >>>>> -			reinit_completion(&entity->entity_idle);
> >>>>> +			if (dequeue) {
> >>>>> +				rq->current_entity = entity;
> >>>>> +				reinit_completion(&entity->entity_idle);
> >>>>> +			}
> >>>>>    			spin_unlock(&rq->lock);
> >>>>>    			return entity;
> >>>>>    		}
> >>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> >>>>>     * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> >>>>>     *
> >>>>>     * @rq: scheduler run queue to check.
> >>>>> + * @dequeue: dequeue selected entity
> >>>>>     *
> >>>>>     * Find oldest waiting ready entity, returns NULL if none found.
> >>>>>     */
> >>>>>    static struct drm_sched_entity *
> >>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> >>>>>    {
> >>>>>    	struct rb_node *rb;
> >>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >>>>>    		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> >>>>>    		if (drm_sched_entity_is_ready(entity)) {
> >>>>> -			rq->current_entity = entity;
> >>>>> -			reinit_completion(&entity->entity_idle);
> >>>>> +			if (dequeue) {
> >>>>> +				rq->current_entity = entity;
> >>>>> +				reinit_completion(&entity->entity_idle);
> >>>>> +			}
> >>>>>    			break;
> >>>>>    		}
> >>>>>    	}
> >>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> >>>>>    }
> >>>>>    /**
> >>>>> - * drm_sched_submit_queue - scheduler queue submission
> >>>>> + * drm_sched_run_job_queue - queue job submission
> >>>>>     * @sched: scheduler instance
> >>>>>     */
> >>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> >>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> >>>>>    {
> >>>>>    	if (!READ_ONCE(sched->pause_submit))
> >>>>> -		queue_work(sched->submit_wq, &sched->work_submit);
> >>>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
> >>>>> +}
> >>>>> +
> >>>>> +static struct drm_sched_entity *
> >>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> >>>>> +
> >>>>> +/**
> >>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> >>>>> + * @sched: scheduler instance
> >>>>> + */
> >>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> >>>>> +{
> >>>>> +	if (drm_sched_select_entity(sched, false))
> >>>>> +		drm_sched_run_job_queue(sched);
> >>>>> +}
> >>>>> +
> >>>>> +/**
> >>>>> + * drm_sched_free_job_queue - queue free job
> >>>>> + *
> >>>>> + * @sched: scheduler instance to queue free job
> >>>>> + */
> >>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> >>>>> +{
> >>>>> +	if (!READ_ONCE(sched->pause_submit))
> >>>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
> >>>>> +}
> >>>>> +
> >>>>> +/**
> >>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
> >>>>> + *
> >>>>> + * @sched: scheduler instance to queue free job
> >>>>> + */
> >>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> >>>>> +{
> >>>>> +	struct drm_sched_job *job;
> >>>>> +
> >>>>> +	spin_lock(&sched->job_list_lock);
> >>>>> +	job = list_first_entry_or_null(&sched->pending_list,
> >>>>> +				       struct drm_sched_job, list);
> >>>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> >>>>> +		drm_sched_free_job_queue(sched);
> >>>>> +	spin_unlock(&sched->job_list_lock);
> >>>>>    }
> >>>>>    /**
> >>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> >>>>>    	dma_fence_get(&s_fence->finished);
> >>>>>    	drm_sched_fence_finished(s_fence, result);
> >>>>>    	dma_fence_put(&s_fence->finished);
> >>>>> -	drm_sched_submit_queue(sched);
> >>>>> +	drm_sched_free_job_queue(sched);
> >>>>>    }
> >>>>>    /**
> >>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> >>>>>    void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> >>>>>    {
> >>>>>    	if (drm_sched_can_queue(sched))
> >>>>> -		drm_sched_submit_queue(sched);
> >>>>> +		drm_sched_run_job_queue(sched);
> >>>>>    }
> >>>>>    /**
> >>>>>     * drm_sched_select_entity - Select next entity to process
> >>>>>     *
> >>>>>     * @sched: scheduler instance
> >>>>> + * @dequeue: dequeue selected entity
> >>>>>     *
> >>>>>     * Returns the entity to process or NULL if none are found.
> >>>>>     */
> >>>>>    static struct drm_sched_entity *
> >>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> >>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> >>>>>    {
> >>>>>    	struct drm_sched_entity *entity;
> >>>>>    	int i;
> >>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> >>>>>    	/* Kernel run queue has higher priority than normal run queue*/
> >>>>>    	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> >>>>>    		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> >>>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> >>>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> >>>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> >>>>> +							dequeue) :
> >>>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> >>>>> +						      dequeue);
> >>>>>    		if (entity)
> >>>>>    			break;
> >>>>>    	}
> >>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> >>>>>    EXPORT_SYMBOL(drm_sched_pick_best);
> >>>>>    /**
> >>>>> - * drm_sched_main - main scheduler thread
> >>>>> + * drm_sched_free_job_work - worker to call free_job
> >>>>>     *
> >>>>> - * @param: scheduler instance
> >>>>> + * @w: free job work
> >>>>>     */
> >>>>> -static void drm_sched_main(struct work_struct *w)
> >>>>> +static void drm_sched_free_job_work(struct work_struct *w)
> >>>>>    {
> >>>>>    	struct drm_gpu_scheduler *sched =
> >>>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> >>>>> -	struct drm_sched_entity *entity;
> >>>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> >>>>>    	struct drm_sched_job *cleanup_job;
> >>>>> -	int r;
> >>>>>    	if (READ_ONCE(sched->pause_submit))
> >>>>>    		return;
> >>>>>    	cleanup_job = drm_sched_get_cleanup_job(sched);  
> >>>> I tried this patch with Nouveau and found a race condition:
> >>>>
> >>>> In drm_sched_run_job_work() the job is added to the pending_list via
> >>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> >>>> fence is signaled.
> >>>>
> >>>> However, in parallel drm_sched_get_cleanup_job() might be called from
> >>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
> >>>> for the next job on the pending_list sets the scheduled fence' timestamp field.  
> >> Well why can this happen in parallel? Either the work items are scheduled to
> >> a single threaded work queue or you have protected the pending list with
> >> some locks.
> >>  
> > Xe uses a single-threaded work queue, Nouveau does not (desired
> > behavior).

I'm a bit worried that leaving this single vs multi-threaded wq
decision to drivers is going to cause unnecessary pain, because what
was previously a granted in term of run/cleanup execution order (thanks
to the kthread+static-drm_sched_main-workflow approach) is now subject
to the wq ordering guarantees, which depend on the wq type picked by
the driver.

> >
> > The list of pending jobs is protected by a lock (safe), the race is:
> >
> > add job to pending list
> > run_job
> > signal scheduled fence
> >
> > dequeue from pending list
> > free_job
> > update timestamp
> >
> > Once a job is on the pending list its timestamp can be accessed which
> > can blow up if scheduled fence isn't signaled or more specifically unless
> > DMA_FENCE_FLAG_TIMESTAMP_BIT is set. 

Ah, so that's the reason for the TIMESTAMP test added in v3. Sorry for
the noise in my v3 review, but I still think it'd be beneficial to have
that change moved to its own commit.
Danilo Krummrich Sept. 12, 2023, 10:46 a.m. UTC | #23
On Tue, Sep 12, 2023 at 12:13:57PM +0200, Boris Brezillon wrote:
> On Fri, 25 Aug 2023 15:45:49 +0200
> Christian König <christian.koenig@amd.com> wrote:
> 
> > Am 25.08.23 um 15:36 schrieb Matthew Brost:
> > > On Fri, Aug 25, 2023 at 10:02:32AM +0200, Christian König wrote:  
> > >> Am 25.08.23 um 04:58 schrieb Matthew Brost:  
> > >>> On Fri, Aug 25, 2023 at 01:04:10AM +0200, Danilo Krummrich wrote:  
> > >>>> On Thu, Aug 10, 2023 at 07:31:32PM -0700, Matthew Brost wrote:  
> > >>>>> Rather than call free_job and run_job in same work item have a dedicated
> > >>>>> work item for each. This aligns with the design and intended use of work
> > >>>>> queues.
> > >>>>>
> > >>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > >>>>> ---
> > >>>>>    drivers/gpu/drm/scheduler/sched_main.c | 137 ++++++++++++++++++-------
> > >>>>>    include/drm/gpu_scheduler.h            |   8 +-
> > >>>>>    2 files changed, 106 insertions(+), 39 deletions(-)
> > >>>>>
> > >>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > >>>>> index cede47afc800..b67469eac179 100644
> > >>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
> > >>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > >>>>> @@ -213,11 +213,12 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> > >>>>>     * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> > >>>>>     *
> > >>>>>     * @rq: scheduler run queue to check.
> > >>>>> + * @dequeue: dequeue selected entity
> > >>>>>     *
> > >>>>>     * Try to find a ready entity, returns NULL if none found.
> > >>>>>     */
> > >>>>>    static struct drm_sched_entity *
> > >>>>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > >>>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> > >>>>>    {
> > >>>>>    	struct drm_sched_entity *entity;
> > >>>>> @@ -227,8 +228,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > >>>>>    	if (entity) {
> > >>>>>    		list_for_each_entry_continue(entity, &rq->entities, list) {
> > >>>>>    			if (drm_sched_entity_is_ready(entity)) {
> > >>>>> -				rq->current_entity = entity;
> > >>>>> -				reinit_completion(&entity->entity_idle);
> > >>>>> +				if (dequeue) {
> > >>>>> +					rq->current_entity = entity;
> > >>>>> +					reinit_completion(&entity->entity_idle);
> > >>>>> +				}
> > >>>>>    				spin_unlock(&rq->lock);
> > >>>>>    				return entity;
> > >>>>>    			}
> > >>>>> @@ -238,8 +241,10 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > >>>>>    	list_for_each_entry(entity, &rq->entities, list) {
> > >>>>>    		if (drm_sched_entity_is_ready(entity)) {
> > >>>>> -			rq->current_entity = entity;
> > >>>>> -			reinit_completion(&entity->entity_idle);
> > >>>>> +			if (dequeue) {
> > >>>>> +				rq->current_entity = entity;
> > >>>>> +				reinit_completion(&entity->entity_idle);
> > >>>>> +			}
> > >>>>>    			spin_unlock(&rq->lock);
> > >>>>>    			return entity;
> > >>>>>    		}
> > >>>>> @@ -257,11 +262,12 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
> > >>>>>     * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> > >>>>>     *
> > >>>>>     * @rq: scheduler run queue to check.
> > >>>>> + * @dequeue: dequeue selected entity
> > >>>>>     *
> > >>>>>     * Find oldest waiting ready entity, returns NULL if none found.
> > >>>>>     */
> > >>>>>    static struct drm_sched_entity *
> > >>>>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > >>>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> > >>>>>    {
> > >>>>>    	struct rb_node *rb;
> > >>>>> @@ -271,8 +277,10 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > >>>>>    		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> > >>>>>    		if (drm_sched_entity_is_ready(entity)) {
> > >>>>> -			rq->current_entity = entity;
> > >>>>> -			reinit_completion(&entity->entity_idle);
> > >>>>> +			if (dequeue) {
> > >>>>> +				rq->current_entity = entity;
> > >>>>> +				reinit_completion(&entity->entity_idle);
> > >>>>> +			}
> > >>>>>    			break;
> > >>>>>    		}
> > >>>>>    	}
> > >>>>> @@ -282,13 +290,54 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
> > >>>>>    }
> > >>>>>    /**
> > >>>>> - * drm_sched_submit_queue - scheduler queue submission
> > >>>>> + * drm_sched_run_job_queue - queue job submission
> > >>>>>     * @sched: scheduler instance
> > >>>>>     */
> > >>>>> -static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
> > >>>>> +static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> > >>>>>    {
> > >>>>>    	if (!READ_ONCE(sched->pause_submit))
> > >>>>> -		queue_work(sched->submit_wq, &sched->work_submit);
> > >>>>> +		queue_work(sched->submit_wq, &sched->work_run_job);
> > >>>>> +}
> > >>>>> +
> > >>>>> +static struct drm_sched_entity *
> > >>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
> > >>>>> +
> > >>>>> +/**
> > >>>>> + * drm_sched_run_job_queue_if_ready - queue job submission if ready
> > >>>>> + * @sched: scheduler instance
> > >>>>> + */
> > >>>>> +static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > >>>>> +{
> > >>>>> +	if (drm_sched_select_entity(sched, false))
> > >>>>> +		drm_sched_run_job_queue(sched);
> > >>>>> +}
> > >>>>> +
> > >>>>> +/**
> > >>>>> + * drm_sched_free_job_queue - queue free job
> > >>>>> + *
> > >>>>> + * @sched: scheduler instance to queue free job
> > >>>>> + */
> > >>>>> +static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
> > >>>>> +{
> > >>>>> +	if (!READ_ONCE(sched->pause_submit))
> > >>>>> +		queue_work(sched->submit_wq, &sched->work_free_job);
> > >>>>> +}
> > >>>>> +
> > >>>>> +/**
> > >>>>> + * drm_sched_free_job_queue_if_ready - queue free job if ready
> > >>>>> + *
> > >>>>> + * @sched: scheduler instance to queue free job
> > >>>>> + */
> > >>>>> +static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
> > >>>>> +{
> > >>>>> +	struct drm_sched_job *job;
> > >>>>> +
> > >>>>> +	spin_lock(&sched->job_list_lock);
> > >>>>> +	job = list_first_entry_or_null(&sched->pending_list,
> > >>>>> +				       struct drm_sched_job, list);
> > >>>>> +	if (job && dma_fence_is_signaled(&job->s_fence->finished))
> > >>>>> +		drm_sched_free_job_queue(sched);
> > >>>>> +	spin_unlock(&sched->job_list_lock);
> > >>>>>    }
> > >>>>>    /**
> > >>>>> @@ -310,7 +359,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
> > >>>>>    	dma_fence_get(&s_fence->finished);
> > >>>>>    	drm_sched_fence_finished(s_fence, result);
> > >>>>>    	dma_fence_put(&s_fence->finished);
> > >>>>> -	drm_sched_submit_queue(sched);
> > >>>>> +	drm_sched_free_job_queue(sched);
> > >>>>>    }
> > >>>>>    /**
> > >>>>> @@ -906,18 +955,19 @@ static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> > >>>>>    void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> > >>>>>    {
> > >>>>>    	if (drm_sched_can_queue(sched))
> > >>>>> -		drm_sched_submit_queue(sched);
> > >>>>> +		drm_sched_run_job_queue(sched);
> > >>>>>    }
> > >>>>>    /**
> > >>>>>     * drm_sched_select_entity - Select next entity to process
> > >>>>>     *
> > >>>>>     * @sched: scheduler instance
> > >>>>> + * @dequeue: dequeue selected entity
> > >>>>>     *
> > >>>>>     * Returns the entity to process or NULL if none are found.
> > >>>>>     */
> > >>>>>    static struct drm_sched_entity *
> > >>>>> -drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > >>>>> +drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
> > >>>>>    {
> > >>>>>    	struct drm_sched_entity *entity;
> > >>>>>    	int i;
> > >>>>> @@ -935,8 +985,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
> > >>>>>    	/* Kernel run queue has higher priority than normal run queue*/
> > >>>>>    	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> > >>>>>    		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> > >>>>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
> > >>>>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
> > >>>>> +			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> > >>>>> +							dequeue) :
> > >>>>> +			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> > >>>>> +						      dequeue);
> > >>>>>    		if (entity)
> > >>>>>    			break;
> > >>>>>    	}
> > >>>>> @@ -1024,30 +1076,44 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> > >>>>>    EXPORT_SYMBOL(drm_sched_pick_best);
> > >>>>>    /**
> > >>>>> - * drm_sched_main - main scheduler thread
> > >>>>> + * drm_sched_free_job_work - worker to call free_job
> > >>>>>     *
> > >>>>> - * @param: scheduler instance
> > >>>>> + * @w: free job work
> > >>>>>     */
> > >>>>> -static void drm_sched_main(struct work_struct *w)
> > >>>>> +static void drm_sched_free_job_work(struct work_struct *w)
> > >>>>>    {
> > >>>>>    	struct drm_gpu_scheduler *sched =
> > >>>>> -		container_of(w, struct drm_gpu_scheduler, work_submit);
> > >>>>> -	struct drm_sched_entity *entity;
> > >>>>> +		container_of(w, struct drm_gpu_scheduler, work_free_job);
> > >>>>>    	struct drm_sched_job *cleanup_job;
> > >>>>> -	int r;
> > >>>>>    	if (READ_ONCE(sched->pause_submit))
> > >>>>>    		return;
> > >>>>>    	cleanup_job = drm_sched_get_cleanup_job(sched);  
> > >>>> I tried this patch with Nouveau and found a race condition:
> > >>>>
> > >>>> In drm_sched_run_job_work() the job is added to the pending_list via
> > >>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> > >>>> fence is signaled.
> > >>>>
> > >>>> However, in parallel drm_sched_get_cleanup_job() might be called from
> > >>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
> > >>>> for the next job on the pending_list sets the scheduled fence' timestamp field.  
> > >> Well why can this happen in parallel? Either the work items are scheduled to
> > >> a single threaded work queue or you have protected the pending list with
> > >> some locks.
> > >>  
> > > Xe uses a single-threaded work queue, Nouveau does not (desired
> > > behavior).
> 
> I'm a bit worried that leaving this single vs multi-threaded wq
> decision to drivers is going to cause unnecessary pain, because what
> was previously a granted in term of run/cleanup execution order (thanks
> to the kthread+static-drm_sched_main-workflow approach) is now subject
> to the wq ordering guarantees, which depend on the wq type picked by
> the driver.

Not sure if this ends up to be much different. The only thing I could think of
is that IIRC with the kthread implementation cleanup was always preferred over
run. With a single threaded wq this should be a bit more balanced.

With a multi-threaded wq it's still the same, but run and cleanup can run
concurrently, which has the nice side effect that free_job() gets out of the
fence signaling path. At least as long as the workqueue has max_active > 1.
Which is one reason why I'm using a multi-threaded wq in Nouveau.

That latter seems a bit subtile, we probably need to document this aspect of
under which conditions free_job() is or is not within the fence signaling path.

- Danilo

> 
> > >
> > > The list of pending jobs is protected by a lock (safe), the race is:
> > >
> > > add job to pending list
> > > run_job
> > > signal scheduled fence
> > >
> > > dequeue from pending list
> > > free_job
> > > update timestamp
> > >
> > > Once a job is on the pending list its timestamp can be accessed which
> > > can blow up if scheduled fence isn't signaled or more specifically unless
> > > DMA_FENCE_FLAG_TIMESTAMP_BIT is set. 
> 
> Ah, so that's the reason for the TIMESTAMP test added in v3. Sorry for
> the noise in my v3 review, but I still think it'd be beneficial to have
> that change moved to its own commit.
>
Boris Brezillon Sept. 12, 2023, 12:18 p.m. UTC | #24
On Tue, 12 Sep 2023 12:46:26 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> > I'm a bit worried that leaving this single vs multi-threaded wq
> > decision to drivers is going to cause unnecessary pain, because what
> > was previously a granted in term of run/cleanup execution order (thanks
> > to the kthread+static-drm_sched_main-workflow approach) is now subject
> > to the wq ordering guarantees, which depend on the wq type picked by
> > the driver.  
> 
> Not sure if this ends up to be much different. The only thing I could think of
> is that IIRC with the kthread implementation cleanup was always preferred over
> run.

Given the sequence in drm_sched_main(), I'd say that cleanup and run
operations are naturally interleaved when both are available, but I
might be wrong.

> With a single threaded wq this should be a bit more balanced.

With a single threaded wq, it's less clear, because each work
reschedules itself for further processing, but it's likely to be more
or less interleaved. Anyway, I'm not too worried about cleanup taking
precedence on run or the other way around, because the limited amount
of HW slots (size of the ring-buffer) will regulate that.

> 
> With a multi-threaded wq it's still the same, but run and cleanup can run
> concurrently,

What I'm worried about is that ^. I'm not saying it's fundamentally
unsafe, but I'm saying drm_sched hasn't been designed with this
concurrency in mind, and I fear we'll face subtle bugs if we go from
kthread to multi-threaded-wq+run-and-cleanup-split-in-2-work-items.

> which has the nice side effect that free_job() gets out of the
> fence signaling path. At least as long as the workqueue has max_active > 1.

Oh, yeah, I don't deny using a multi-threaded workqueue has some
benefits, just saying it might be trickier than it sounds.

> Which is one reason why I'm using a multi-threaded wq in Nouveau.

Note that I'm using a multi-threaded workqueue internally at the moment
to deal with all sort of interactions with the FW (Mali HW only has a
limited amount of scheduling slots, and we need to rotate entities
having jobs to execute so every one gets a chance to run on the GPU),
but this has been designed this way from the ground up, unlike
drm_sched_main() operations, which were mostly thought as a fixed
sequential set of operations. That's not to say it's impossible to get
right, but I fear we'll face weird/unexpected behavior if we go from
completely-serialized to multi-threaded-with-pseudo-random-processing
order.

> 
> That latter seems a bit subtile, we probably need to document this aspect of
> under which conditions free_job() is or is not within the fence signaling path.

Well, I'm not even sure it can be clearly defined when the driver is
using the submit_wq for its own work items (which can be done since we
pass an optional submit_wq when calling drm_sched_init()). Sure, having
max_active >= 2 should be enough to guarantee that the free_job work
won't block the run_job one when these are the 2 only works being
queued, but what if you have many other work items being queued by the
driver to this wq, and some of those try to acquire resv locks? Could
this prevent execution of the run_job() callback, thus preventing
signaling of fences? I'm genuinely asking, don't know enough about the
cmwq implementation to tell what's happening when work items are
blocked (might be that the worker pool is extended to unblock the
situation).

Anyway, documenting when free_job() is in the dma signalling path should
be doable (single-threaded wq), but at this point, are we not better
off considering anything called from the submit_wq as being part of the
dma signalling path, so we can accommodate with both cases. And if
there is cleanup processing that require taking dma_resv locks, I'd be
tempted to queue that to a driver-specific wq (which is what I'm doing
right now), just to be safe.
Danilo Krummrich Sept. 12, 2023, 12:56 p.m. UTC | #25
On Tue, Sep 12, 2023 at 02:18:18PM +0200, Boris Brezillon wrote:
> On Tue, 12 Sep 2023 12:46:26 +0200
> Danilo Krummrich <dakr@redhat.com> wrote:
> 
> > > I'm a bit worried that leaving this single vs multi-threaded wq
> > > decision to drivers is going to cause unnecessary pain, because what
> > > was previously a granted in term of run/cleanup execution order (thanks
> > > to the kthread+static-drm_sched_main-workflow approach) is now subject
> > > to the wq ordering guarantees, which depend on the wq type picked by
> > > the driver.  
> > 
> > Not sure if this ends up to be much different. The only thing I could think of
> > is that IIRC with the kthread implementation cleanup was always preferred over
> > run.
> 
> Given the sequence in drm_sched_main(), I'd say that cleanup and run
> operations are naturally interleaved when both are available, but I
> might be wrong.

From drm_sched_main():

	wait_event_interruptible(sched->wake_up_worker,
				 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
				 (!drm_sched_blocked(sched) &&
				  (entity = drm_sched_select_entity(sched))) ||
				 kthread_should_stop());

	if (cleanup_job)
		sched->ops->free_job(cleanup_job);

	if (!entity)
		continue;

If cleanup_job is not NULL the rest shouldn't be evaluated I guess. Hence entity
would be NULL and we'd loop until there are no more cleanup_jobs if I don't miss
anything here.

> 
> > With a single threaded wq this should be a bit more balanced.
> 
> With a single threaded wq, it's less clear, because each work
> reschedules itself for further processing, but it's likely to be more
> or less interleaved. Anyway, I'm not too worried about cleanup taking
> precedence on run or the other way around, because the limited amount
> of HW slots (size of the ring-buffer) will regulate that.

Yeah, that's what I meant, with to work items rescheduling themselves it starts
to be interleaved. Which I'm not worried about as well.

> 
> > 
> > With a multi-threaded wq it's still the same, but run and cleanup can run
> > concurrently,
> 
> What I'm worried about is that ^. I'm not saying it's fundamentally
> unsafe, but I'm saying drm_sched hasn't been designed with this
> concurrency in mind, and I fear we'll face subtle bugs if we go from
> kthread to multi-threaded-wq+run-and-cleanup-split-in-2-work-items.
> 

Yeah, so what we get with that is that job_run() of job A and job_free() of job
B can run in parallel. Unless drivers do weird things there, I'm not seeing an
issue with that as well at a first glance.

> > which has the nice side effect that free_job() gets out of the
> > fence signaling path. At least as long as the workqueue has max_active > 1.
> 
> Oh, yeah, I don't deny using a multi-threaded workqueue has some
> benefits, just saying it might be trickier than it sounds.
> 
> > Which is one reason why I'm using a multi-threaded wq in Nouveau.
> 
> Note that I'm using a multi-threaded workqueue internally at the moment
> to deal with all sort of interactions with the FW (Mali HW only has a
> limited amount of scheduling slots, and we need to rotate entities
> having jobs to execute so every one gets a chance to run on the GPU),
> but this has been designed this way from the ground up, unlike
> drm_sched_main() operations, which were mostly thought as a fixed
> sequential set of operations. That's not to say it's impossible to get
> right, but I fear we'll face weird/unexpected behavior if we go from
> completely-serialized to multi-threaded-with-pseudo-random-processing
> order.

From a per job perspective it's still all sequential and besides fence
dependencies, which are still resolved, I don't see where jobs could have cross
dependencies that make this racy. But agree that it's probably worth to think
through it a bit more.

> 
> > 
> > That latter seems a bit subtile, we probably need to document this aspect of
> > under which conditions free_job() is or is not within the fence signaling path.
> 
> Well, I'm not even sure it can be clearly defined when the driver is
> using the submit_wq for its own work items (which can be done since we
> pass an optional submit_wq when calling drm_sched_init()). Sure, having
> max_active >= 2 should be enough to guarantee that the free_job work
> won't block the run_job one when these are the 2 only works being
> queued, but what if you have many other work items being queued by the
> driver to this wq, and some of those try to acquire resv locks? Could
> this prevent execution of the run_job() callback, thus preventing
> signaling of fences? I'm genuinely asking, don't know enough about the
> cmwq implementation to tell what's happening when work items are
> blocked (might be that the worker pool is extended to unblock the
> situation).

Yes, I think so. If max_active would be 2 and you have two jobs running on this
workqueue already waiting on allocations then the 3rd job signaling the fence
the allocation is blocked by would be stuck and we'd have a deadlock I guess.

But that's where I start to see the driver being responsible not to pass a
workqueue to the driver where it queues up other work, either at all, or that
interferes with fence signaling paths.

So, I guess the message here would be something like: free_job() must be
considered to be in the fence signaling path, unless the submit_wq is a
multi-threaded workqueue with max_active > 1 *dedicated* to the DRM scheduler.
Otherwise it's the drivers full resposibility to make sure it doesn't violate
the rules.

> 
> Anyway, documenting when free_job() is in the dma signalling path should
> be doable (single-threaded wq), but at this point, are we not better
> off considering anything called from the submit_wq as being part of the
> dma signalling path, so we can accommodate with both cases. And if
> there is cleanup processing that require taking dma_resv locks, I'd be
> tempted to queue that to a driver-specific wq (which is what I'm doing
> right now), just to be safe.
> 

It's not only the dma-resv lock, it's any lock under which allocations may be
performed.
Boris Brezillon Sept. 12, 2023, 1:27 p.m. UTC | #26
On Fri, 25 Aug 2023 15:45:49 +0200
Christian König <christian.koenig@amd.com> wrote:

> >>>> I tried this patch with Nouveau and found a race condition:
> >>>>
> >>>> In drm_sched_run_job_work() the job is added to the pending_list via
> >>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> >>>> fence is signaled.
> >>>>
> >>>> However, in parallel drm_sched_get_cleanup_job() might be called from
> >>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
> >>>> for the next job on the pending_list sets the scheduled fence' timestamp field.  
> >> Well why can this happen in parallel? Either the work items are scheduled to
> >> a single threaded work queue or you have protected the pending list with
> >> some locks.
> >>  
> > Xe uses a single-threaded work queue, Nouveau does not (desired
> > behavior).
> >
> > The list of pending jobs is protected by a lock (safe), the race is:
> >
> > add job to pending list
> > run_job
> > signal scheduled fence
> >
> > dequeue from pending list
> > free_job
> > update timestamp
> >
> > Once a job is on the pending list its timestamp can be accessed which
> > can blow up if scheduled fence isn't signaled or more specifically unless
> > DMA_FENCE_FLAG_TIMESTAMP_BIT is set.  

I'm a bit lost. How can this lead to a NULL deref? Timestamp is a
ktime_t embedded in dma_fence, and finished/scheduled are both
dma_fence objects embedded in drm_sched_fence. So, unless
{job,next_job}->s_fence is NULL, or {job,next_job} itself is NULL, I
don't really see where the NULL deref is. If s_fence is NULL, that means
drm_sched_job_init() wasn't called (unlikely to be detected that late),
or ->free_job()/drm_sched_job_cleanup() was called while the job was
still in the pending list. I don't really see a situation where job
could NULL to be honest.

While I agree that updating the timestamp before the fence has been
flagged as signaled/timestamped is broken (timestamp will be
overwritten when dma_fence_signal(scheduled) is called) I don't see a
situation where it would cause a NULL/invalid pointer deref. So I
suspect there's another race causing jobs to be cleaned up while
they're still in the pending_list.

> 
> Ah, that problem again. No that is actually quite harmless.
> 
> You just need to double check if the DMA_FENCE_FLAG_TIMESTAMP_BIT is 
> already set and if it's not set don't do anything.
Danilo Krummrich Sept. 12, 2023, 1:34 p.m. UTC | #27
On Tue, Sep 12, 2023 at 03:27:05PM +0200, Boris Brezillon wrote:
> On Fri, 25 Aug 2023 15:45:49 +0200
> Christian König <christian.koenig@amd.com> wrote:
> 
> > >>>> I tried this patch with Nouveau and found a race condition:
> > >>>>
> > >>>> In drm_sched_run_job_work() the job is added to the pending_list via
> > >>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> > >>>> fence is signaled.
> > >>>>
> > >>>> However, in parallel drm_sched_get_cleanup_job() might be called from
> > >>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
> > >>>> for the next job on the pending_list sets the scheduled fence' timestamp field.  
> > >> Well why can this happen in parallel? Either the work items are scheduled to
> > >> a single threaded work queue or you have protected the pending list with
> > >> some locks.
> > >>  
> > > Xe uses a single-threaded work queue, Nouveau does not (desired
> > > behavior).
> > >
> > > The list of pending jobs is protected by a lock (safe), the race is:
> > >
> > > add job to pending list
> > > run_job
> > > signal scheduled fence
> > >
> > > dequeue from pending list
> > > free_job
> > > update timestamp
> > >
> > > Once a job is on the pending list its timestamp can be accessed which
> > > can blow up if scheduled fence isn't signaled or more specifically unless
> > > DMA_FENCE_FLAG_TIMESTAMP_BIT is set.  
> 
> I'm a bit lost. How can this lead to a NULL deref? Timestamp is a
> ktime_t embedded in dma_fence, and finished/scheduled are both
> dma_fence objects embedded in drm_sched_fence. So, unless
> {job,next_job}->s_fence is NULL, or {job,next_job} itself is NULL, I
> don't really see where the NULL deref is. If s_fence is NULL, that means
> drm_sched_job_init() wasn't called (unlikely to be detected that late),
> or ->free_job()/drm_sched_job_cleanup() was called while the job was
> still in the pending list. I don't really see a situation where job
> could NULL to be honest.

I think the problem here was that a dma_fence' timestamp field is within a union
together with it's cb_list list_head [1]. If a timestamp is set before the fence
is actually signalled, dma_fence_signal_timestamp_locked() will access the
cb_list to run the particular callbacks registered to this dma_fence. However,
writing the timestap will overwrite this list_head since it's a union, hence
we'd try to dereference the timestamp while iterating the list.

[1] https://elixir.bootlin.com/linux/latest/source/include/linux/dma-fence.h#L87

> 
> While I agree that updating the timestamp before the fence has been
> flagged as signaled/timestamped is broken (timestamp will be
> overwritten when dma_fence_signal(scheduled) is called) I don't see a
> situation where it would cause a NULL/invalid pointer deref. So I
> suspect there's another race causing jobs to be cleaned up while
> they're still in the pending_list.
> 
> > 
> > Ah, that problem again. No that is actually quite harmless.
> > 
> > You just need to double check if the DMA_FENCE_FLAG_TIMESTAMP_BIT is 
> > already set and if it's not set don't do anything.
>
Boris Brezillon Sept. 12, 2023, 1:52 p.m. UTC | #28
On Tue, 12 Sep 2023 14:56:06 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> On Tue, Sep 12, 2023 at 02:18:18PM +0200, Boris Brezillon wrote:
> > On Tue, 12 Sep 2023 12:46:26 +0200
> > Danilo Krummrich <dakr@redhat.com> wrote:
> >   
> > > > I'm a bit worried that leaving this single vs multi-threaded wq
> > > > decision to drivers is going to cause unnecessary pain, because what
> > > > was previously a granted in term of run/cleanup execution order (thanks
> > > > to the kthread+static-drm_sched_main-workflow approach) is now subject
> > > > to the wq ordering guarantees, which depend on the wq type picked by
> > > > the driver.    
> > > 
> > > Not sure if this ends up to be much different. The only thing I could think of
> > > is that IIRC with the kthread implementation cleanup was always preferred over
> > > run.  
> > 
> > Given the sequence in drm_sched_main(), I'd say that cleanup and run
> > operations are naturally interleaved when both are available, but I
> > might be wrong.  
> 
> From drm_sched_main():
> 
> 	wait_event_interruptible(sched->wake_up_worker,
> 				 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
> 				 (!drm_sched_blocked(sched) &&
> 				  (entity = drm_sched_select_entity(sched))) ||
> 				 kthread_should_stop());
> 
> 	if (cleanup_job)
> 		sched->ops->free_job(cleanup_job);
> 
> 	if (!entity)
> 		continue;
> 
> If cleanup_job is not NULL the rest shouldn't be evaluated I guess. Hence entity
> would be NULL and we'd loop until there are no more cleanup_jobs if I don't miss
> anything here.

Indeed, I got tricked by the wait_event() expression.

> 
> >   
> > > With a single threaded wq this should be a bit more balanced.  
> > 
> > With a single threaded wq, it's less clear, because each work
> > reschedules itself for further processing, but it's likely to be more
> > or less interleaved. Anyway, I'm not too worried about cleanup taking
> > precedence on run or the other way around, because the limited amount
> > of HW slots (size of the ring-buffer) will regulate that.  
> 
> Yeah, that's what I meant, with to work items rescheduling themselves it starts
> to be interleaved. Which I'm not worried about as well.
> 
> >   
> > > 
> > > With a multi-threaded wq it's still the same, but run and cleanup can run
> > > concurrently,  
> > 
> > What I'm worried about is that ^. I'm not saying it's fundamentally
> > unsafe, but I'm saying drm_sched hasn't been designed with this
> > concurrency in mind, and I fear we'll face subtle bugs if we go from
> > kthread to multi-threaded-wq+run-and-cleanup-split-in-2-work-items.
> >   
> 
> Yeah, so what we get with that is that job_run() of job A and job_free() of job
> B can run in parallel. Unless drivers do weird things there, I'm not seeing an
> issue with that as well at a first glance.

I might be wrong of course, but I'm pretty sure the timestamp race you
reported is indirectly coming from this ST -> MT transition. Again, I'm
not saying we should never use an MT wq, but it feels a bit premature,
and I think I'd prefer if we do it in 2 steps to minimize the amount of
things that could go wrong, and avoid a late revert.

> 
> > > which has the nice side effect that free_job() gets out of the
> > > fence signaling path. At least as long as the workqueue has max_active > 1.  
> > 
> > Oh, yeah, I don't deny using a multi-threaded workqueue has some
> > benefits, just saying it might be trickier than it sounds.
> >   
> > > Which is one reason why I'm using a multi-threaded wq in Nouveau.  
> > 
> > Note that I'm using a multi-threaded workqueue internally at the moment
> > to deal with all sort of interactions with the FW (Mali HW only has a
> > limited amount of scheduling slots, and we need to rotate entities
> > having jobs to execute so every one gets a chance to run on the GPU),
> > but this has been designed this way from the ground up, unlike
> > drm_sched_main() operations, which were mostly thought as a fixed
> > sequential set of operations. That's not to say it's impossible to get
> > right, but I fear we'll face weird/unexpected behavior if we go from
> > completely-serialized to multi-threaded-with-pseudo-random-processing
> > order.  
> 
> From a per job perspective it's still all sequential and besides fence
> dependencies,

Sure, per job ops are still sequential (run, then cleanup once parent
fence is signalled).

> which are still resolved, I don't see where jobs could have cross
> dependencies that make this racy. But agree that it's probably worth to think
> through it a bit more.
> 
> >   
> > > 
> > > That latter seems a bit subtile, we probably need to document this aspect of
> > > under which conditions free_job() is or is not within the fence signaling path.  
> > 
> > Well, I'm not even sure it can be clearly defined when the driver is
> > using the submit_wq for its own work items (which can be done since we
> > pass an optional submit_wq when calling drm_sched_init()). Sure, having
> > max_active >= 2 should be enough to guarantee that the free_job work
> > won't block the run_job one when these are the 2 only works being
> > queued, but what if you have many other work items being queued by the
> > driver to this wq, and some of those try to acquire resv locks? Could
> > this prevent execution of the run_job() callback, thus preventing
> > signaling of fences? I'm genuinely asking, don't know enough about the
> > cmwq implementation to tell what's happening when work items are
> > blocked (might be that the worker pool is extended to unblock the
> > situation).  
> 
> Yes, I think so. If max_active would be 2 and you have two jobs running on this
> workqueue already waiting on allocations then the 3rd job signaling the fence
> the allocation is blocked by would be stuck and we'd have a deadlock I guess.
> 
> But that's where I start to see the driver being responsible not to pass a
> workqueue to the driver where it queues up other work, either at all, or that
> interferes with fence signaling paths.
> 
> So, I guess the message here would be something like: free_job() must be
> considered to be in the fence signaling path, unless the submit_wq is a
> multi-threaded workqueue with max_active > 1 *dedicated* to the DRM scheduler.

If it's meant to be dedicated to the drm scheduler, is there any point
passing a custom submit_wq? I mean, we could start with a dedicated
ordered-wq created by the core to replace the kthread, and then, once
enough testing has been done to make sure things work correctly in a MT
env, switch everyone to a mutithreaded-wq. The fact that we let the
caller pass its own workqueue, to then restrict its usage to things
directly related to drm_sched is somewhat confusing.

> Otherwise it's the drivers full resposibility to make sure it doesn't violate
> the rules.

Yeah, that's what I'm worried about tbh. There's so many subtle ways we
let DRM drivers shoot themselves in the foot already, using the
excuse we want drivers to be in control (for optimization/perf
concerns). I'm just not comfortable adding one more way of doing that,
especially given drm_sched has been one thread calling multiple hooks
sequentially until now, which is essentially what an ordered wq would
provide.

> 
> > 
> > Anyway, documenting when free_job() is in the dma signalling path should
> > be doable (single-threaded wq), but at this point, are we not better
> > off considering anything called from the submit_wq as being part of the
> > dma signalling path, so we can accommodate with both cases. And if
> > there is cleanup processing that require taking dma_resv locks, I'd be
> > tempted to queue that to a driver-specific wq (which is what I'm doing
> > right now), just to be safe.
> >   
> 
> It's not only the dma-resv lock, it's any lock under which allocations may be
> performed.

Sure, I was taking the resv lock in example, because that's easy to
reason about, but that's indeed any lock being taken while doing
allocations that don't have the GFP_{NOWAIT,ATOMIC} flags set.
Boris Brezillon Sept. 12, 2023, 1:53 p.m. UTC | #29
On Tue, 12 Sep 2023 15:34:41 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> On Tue, Sep 12, 2023 at 03:27:05PM +0200, Boris Brezillon wrote:
> > On Fri, 25 Aug 2023 15:45:49 +0200
> > Christian König <christian.koenig@amd.com> wrote:
> >   
> > > >>>> I tried this patch with Nouveau and found a race condition:
> > > >>>>
> > > >>>> In drm_sched_run_job_work() the job is added to the pending_list via
> > > >>>> drm_sched_job_begin(), then the run_job() callback is called and the scheduled
> > > >>>> fence is signaled.
> > > >>>>
> > > >>>> However, in parallel drm_sched_get_cleanup_job() might be called from
> > > >>>> drm_sched_free_job_work(), which picks the first job from the pending_list and
> > > >>>> for the next job on the pending_list sets the scheduled fence' timestamp field.    
> > > >> Well why can this happen in parallel? Either the work items are scheduled to
> > > >> a single threaded work queue or you have protected the pending list with
> > > >> some locks.
> > > >>    
> > > > Xe uses a single-threaded work queue, Nouveau does not (desired
> > > > behavior).
> > > >
> > > > The list of pending jobs is protected by a lock (safe), the race is:
> > > >
> > > > add job to pending list
> > > > run_job
> > > > signal scheduled fence
> > > >
> > > > dequeue from pending list
> > > > free_job
> > > > update timestamp
> > > >
> > > > Once a job is on the pending list its timestamp can be accessed which
> > > > can blow up if scheduled fence isn't signaled or more specifically unless
> > > > DMA_FENCE_FLAG_TIMESTAMP_BIT is set.    
> > 
> > I'm a bit lost. How can this lead to a NULL deref? Timestamp is a
> > ktime_t embedded in dma_fence, and finished/scheduled are both
> > dma_fence objects embedded in drm_sched_fence. So, unless
> > {job,next_job}->s_fence is NULL, or {job,next_job} itself is NULL, I
> > don't really see where the NULL deref is. If s_fence is NULL, that means
> > drm_sched_job_init() wasn't called (unlikely to be detected that late),
> > or ->free_job()/drm_sched_job_cleanup() was called while the job was
> > still in the pending list. I don't really see a situation where job
> > could NULL to be honest.  
> 
> I think the problem here was that a dma_fence' timestamp field is within a union
> together with it's cb_list list_head [1]. If a timestamp is set before the fence
> is actually signalled, dma_fence_signal_timestamp_locked() will access the
> cb_list to run the particular callbacks registered to this dma_fence. However,
> writing the timestap will overwrite this list_head since it's a union, hence
> we'd try to dereference the timestamp while iterating the list.

Ah, right. I didn't notice it was a union, thought it was a struct...

> 
> [1] https://elixir.bootlin.com/linux/latest/source/include/linux/dma-fence.h#L87
> 
> > 
> > While I agree that updating the timestamp before the fence has been
> > flagged as signaled/timestamped is broken (timestamp will be
> > overwritten when dma_fence_signal(scheduled) is called) I don't see a
> > situation where it would cause a NULL/invalid pointer deref. So I
> > suspect there's another race causing jobs to be cleaned up while
> > they're still in the pending_list.
> >   
> > > 
> > > Ah, that problem again. No that is actually quite harmless.
> > > 
> > > You just need to double check if the DMA_FENCE_FLAG_TIMESTAMP_BIT is 
> > > already set and if it's not set don't do anything.  
> >   
>
Danilo Krummrich Sept. 12, 2023, 2:10 p.m. UTC | #30
On Tue, Sep 12, 2023 at 03:52:28PM +0200, Boris Brezillon wrote:
> On Tue, 12 Sep 2023 14:56:06 +0200
> Danilo Krummrich <dakr@redhat.com> wrote:
> 
> > On Tue, Sep 12, 2023 at 02:18:18PM +0200, Boris Brezillon wrote:
> > > On Tue, 12 Sep 2023 12:46:26 +0200
> > > Danilo Krummrich <dakr@redhat.com> wrote:
> > >   
> > > > > I'm a bit worried that leaving this single vs multi-threaded wq
> > > > > decision to drivers is going to cause unnecessary pain, because what
> > > > > was previously a granted in term of run/cleanup execution order (thanks
> > > > > to the kthread+static-drm_sched_main-workflow approach) is now subject
> > > > > to the wq ordering guarantees, which depend on the wq type picked by
> > > > > the driver.    
> > > > 
> > > > Not sure if this ends up to be much different. The only thing I could think of
> > > > is that IIRC with the kthread implementation cleanup was always preferred over
> > > > run.  
> > > 
> > > Given the sequence in drm_sched_main(), I'd say that cleanup and run
> > > operations are naturally interleaved when both are available, but I
> > > might be wrong.  
> > 
> > From drm_sched_main():
> > 
> > 	wait_event_interruptible(sched->wake_up_worker,
> > 				 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
> > 				 (!drm_sched_blocked(sched) &&
> > 				  (entity = drm_sched_select_entity(sched))) ||
> > 				 kthread_should_stop());
> > 
> > 	if (cleanup_job)
> > 		sched->ops->free_job(cleanup_job);
> > 
> > 	if (!entity)
> > 		continue;
> > 
> > If cleanup_job is not NULL the rest shouldn't be evaluated I guess. Hence entity
> > would be NULL and we'd loop until there are no more cleanup_jobs if I don't miss
> > anything here.
> 
> Indeed, I got tricked by the wait_event() expression.
> 
> > 
> > >   
> > > > With a single threaded wq this should be a bit more balanced.  
> > > 
> > > With a single threaded wq, it's less clear, because each work
> > > reschedules itself for further processing, but it's likely to be more
> > > or less interleaved. Anyway, I'm not too worried about cleanup taking
> > > precedence on run or the other way around, because the limited amount
> > > of HW slots (size of the ring-buffer) will regulate that.  
> > 
> > Yeah, that's what I meant, with to work items rescheduling themselves it starts
> > to be interleaved. Which I'm not worried about as well.
> > 
> > >   
> > > > 
> > > > With a multi-threaded wq it's still the same, but run and cleanup can run
> > > > concurrently,  
> > > 
> > > What I'm worried about is that ^. I'm not saying it's fundamentally
> > > unsafe, but I'm saying drm_sched hasn't been designed with this
> > > concurrency in mind, and I fear we'll face subtle bugs if we go from
> > > kthread to multi-threaded-wq+run-and-cleanup-split-in-2-work-items.
> > >   
> > 
> > Yeah, so what we get with that is that job_run() of job A and job_free() of job
> > B can run in parallel. Unless drivers do weird things there, I'm not seeing an
> > issue with that as well at a first glance.
> 
> I might be wrong of course, but I'm pretty sure the timestamp race you
> reported is indirectly coming from this ST -> MT transition. Again, I'm
> not saying we should never use an MT wq, but it feels a bit premature,
> and I think I'd prefer if we do it in 2 steps to minimize the amount of
> things that could go wrong, and avoid a late revert.

Indirectly, yes. I would agree with using an internal single threaded workqueue
by default although I'm a bit more optimistic about that. Howver, I'd still like
the driver to choose. Otherwise, in Nouveau I'd need to keep queueing work in
free_job() to another workqueue, which isn't very nice.

> 
> > 
> > > > which has the nice side effect that free_job() gets out of the
> > > > fence signaling path. At least as long as the workqueue has max_active > 1.  
> > > 
> > > Oh, yeah, I don't deny using a multi-threaded workqueue has some
> > > benefits, just saying it might be trickier than it sounds.
> > >   
> > > > Which is one reason why I'm using a multi-threaded wq in Nouveau.  
> > > 
> > > Note that I'm using a multi-threaded workqueue internally at the moment
> > > to deal with all sort of interactions with the FW (Mali HW only has a
> > > limited amount of scheduling slots, and we need to rotate entities
> > > having jobs to execute so every one gets a chance to run on the GPU),
> > > but this has been designed this way from the ground up, unlike
> > > drm_sched_main() operations, which were mostly thought as a fixed
> > > sequential set of operations. That's not to say it's impossible to get
> > > right, but I fear we'll face weird/unexpected behavior if we go from
> > > completely-serialized to multi-threaded-with-pseudo-random-processing
> > > order.  
> > 
> > From a per job perspective it's still all sequential and besides fence
> > dependencies,
> 
> Sure, per job ops are still sequential (run, then cleanup once parent
> fence is signalled).
> 
> > which are still resolved, I don't see where jobs could have cross
> > dependencies that make this racy. But agree that it's probably worth to think
> > through it a bit more.
> > 
> > >   
> > > > 
> > > > That latter seems a bit subtile, we probably need to document this aspect of
> > > > under which conditions free_job() is or is not within the fence signaling path.  
> > > 
> > > Well, I'm not even sure it can be clearly defined when the driver is
> > > using the submit_wq for its own work items (which can be done since we
> > > pass an optional submit_wq when calling drm_sched_init()). Sure, having
> > > max_active >= 2 should be enough to guarantee that the free_job work
> > > won't block the run_job one when these are the 2 only works being
> > > queued, but what if you have many other work items being queued by the
> > > driver to this wq, and some of those try to acquire resv locks? Could
> > > this prevent execution of the run_job() callback, thus preventing
> > > signaling of fences? I'm genuinely asking, don't know enough about the
> > > cmwq implementation to tell what's happening when work items are
> > > blocked (might be that the worker pool is extended to unblock the
> > > situation).  
> > 
> > Yes, I think so. If max_active would be 2 and you have two jobs running on this
> > workqueue already waiting on allocations then the 3rd job signaling the fence
> > the allocation is blocked by would be stuck and we'd have a deadlock I guess.
> > 
> > But that's where I start to see the driver being responsible not to pass a
> > workqueue to the driver where it queues up other work, either at all, or that
> > interferes with fence signaling paths.
> > 
> > So, I guess the message here would be something like: free_job() must be
> > considered to be in the fence signaling path, unless the submit_wq is a
> > multi-threaded workqueue with max_active > 1 *dedicated* to the DRM scheduler.
> 
> If it's meant to be dedicated to the drm scheduler, is there any point
> passing a custom submit_wq? I mean, we could start with a dedicated
> ordered-wq created by the core to replace the kthread, and then, once
> enough testing has been done to make sure things work correctly in a MT
> env, switch everyone to a mutithreaded-wq. The fact that we let the
> caller pass its own workqueue, to then restrict its usage to things
> directly related to drm_sched is somewhat confusing.
> 

Well, "dedicated to the scheduler" and the other conditions are only related to
giving a guarantee about free_job() not being in the fence signaling path.

The driver could still just not care about free_job() being in the fence
signalng path or not.

Also, drivers can also still queue work to the given workqueue as long as it
does comply with fence signaling critical sections.

I'd be absolutey fine leaving this to the driver, as long as we properly
document it.

It depends on the design goal. If we want to say free_job() is always safe,
then we need to entirely restrict it.

> > Otherwise it's the drivers full resposibility to make sure it doesn't violate
> > the rules.
> 
> Yeah, that's what I'm worried about tbh. There's so many subtle ways we
> let DRM drivers shoot themselves in the foot already, using the
> excuse we want drivers to be in control (for optimization/perf
> concerns). I'm just not comfortable adding one more way of doing that,
> especially given drm_sched has been one thread calling multiple hooks
> sequentially until now, which is essentially what an ordered wq would
> provide.
> 
> > 
> > > 
> > > Anyway, documenting when free_job() is in the dma signalling path should
> > > be doable (single-threaded wq), but at this point, are we not better
> > > off considering anything called from the submit_wq as being part of the
> > > dma signalling path, so we can accommodate with both cases. And if
> > > there is cleanup processing that require taking dma_resv locks, I'd be
> > > tempted to queue that to a driver-specific wq (which is what I'm doing
> > > right now), just to be safe.
> > >   
> > 
> > It's not only the dma-resv lock, it's any lock under which allocations may be
> > performed.
> 
> Sure, I was taking the resv lock in example, because that's easy to
> reason about, but that's indeed any lock being taken while doing
> allocations that don't have the GFP_{NOWAIT,ATOMIC} flags set.
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index cede47afc800..b67469eac179 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -213,11 +213,12 @@  void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
  *
  * @rq: scheduler run queue to check.
+ * @dequeue: dequeue selected entity
  *
  * Try to find a ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
 {
 	struct drm_sched_entity *entity;
 
@@ -227,8 +228,10 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
 	if (entity) {
 		list_for_each_entry_continue(entity, &rq->entities, list) {
 			if (drm_sched_entity_is_ready(entity)) {
-				rq->current_entity = entity;
-				reinit_completion(&entity->entity_idle);
+				if (dequeue) {
+					rq->current_entity = entity;
+					reinit_completion(&entity->entity_idle);
+				}
 				spin_unlock(&rq->lock);
 				return entity;
 			}
@@ -238,8 +241,10 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
 	list_for_each_entry(entity, &rq->entities, list) {
 
 		if (drm_sched_entity_is_ready(entity)) {
-			rq->current_entity = entity;
-			reinit_completion(&entity->entity_idle);
+			if (dequeue) {
+				rq->current_entity = entity;
+				reinit_completion(&entity->entity_idle);
+			}
 			spin_unlock(&rq->lock);
 			return entity;
 		}
@@ -257,11 +262,12 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
  *
  * @rq: scheduler run queue to check.
+ * @dequeue: dequeue selected entity
  *
  * Find oldest waiting ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
 {
 	struct rb_node *rb;
 
@@ -271,8 +277,10 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 
 		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
 		if (drm_sched_entity_is_ready(entity)) {
-			rq->current_entity = entity;
-			reinit_completion(&entity->entity_idle);
+			if (dequeue) {
+				rq->current_entity = entity;
+				reinit_completion(&entity->entity_idle);
+			}
 			break;
 		}
 	}
@@ -282,13 +290,54 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 }
 
 /**
- * drm_sched_submit_queue - scheduler queue submission
+ * drm_sched_run_job_queue - queue job submission
  * @sched: scheduler instance
  */
-static void drm_sched_submit_queue(struct drm_gpu_scheduler *sched)
+static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
 {
 	if (!READ_ONCE(sched->pause_submit))
-		queue_work(sched->submit_wq, &sched->work_submit);
+		queue_work(sched->submit_wq, &sched->work_run_job);
+}
+
+static struct drm_sched_entity *
+drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue);
+
+/**
+ * drm_sched_run_job_queue_if_ready - queue job submission if ready
+ * @sched: scheduler instance
+ */
+static void drm_sched_run_job_queue_if_ready(struct drm_gpu_scheduler *sched)
+{
+	if (drm_sched_select_entity(sched, false))
+		drm_sched_run_job_queue(sched);
+}
+
+/**
+ * drm_sched_free_job_queue - queue free job
+ *
+ * @sched: scheduler instance to queue free job
+ */
+static void drm_sched_free_job_queue(struct drm_gpu_scheduler *sched)
+{
+	if (!READ_ONCE(sched->pause_submit))
+		queue_work(sched->submit_wq, &sched->work_free_job);
+}
+
+/**
+ * drm_sched_free_job_queue_if_ready - queue free job if ready
+ *
+ * @sched: scheduler instance to queue free job
+ */
+static void drm_sched_free_job_queue_if_ready(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *job;
+
+	spin_lock(&sched->job_list_lock);
+	job = list_first_entry_or_null(&sched->pending_list,
+				       struct drm_sched_job, list);
+	if (job && dma_fence_is_signaled(&job->s_fence->finished))
+		drm_sched_free_job_queue(sched);
+	spin_unlock(&sched->job_list_lock);
 }
 
 /**
@@ -310,7 +359,7 @@  static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
 	dma_fence_get(&s_fence->finished);
 	drm_sched_fence_finished(s_fence, result);
 	dma_fence_put(&s_fence->finished);
-	drm_sched_submit_queue(sched);
+	drm_sched_free_job_queue(sched);
 }
 
 /**
@@ -906,18 +955,19 @@  static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
 void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
 {
 	if (drm_sched_can_queue(sched))
-		drm_sched_submit_queue(sched);
+		drm_sched_run_job_queue(sched);
 }
 
 /**
  * drm_sched_select_entity - Select next entity to process
  *
  * @sched: scheduler instance
+ * @dequeue: dequeue selected entity
  *
  * Returns the entity to process or NULL if none are found.
  */
 static struct drm_sched_entity *
-drm_sched_select_entity(struct drm_gpu_scheduler *sched)
+drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
 {
 	struct drm_sched_entity *entity;
 	int i;
@@ -935,8 +985,10 @@  drm_sched_select_entity(struct drm_gpu_scheduler *sched)
 	/* Kernel run queue has higher priority than normal run queue*/
 	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
 		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
-			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]) :
-			drm_sched_rq_select_entity_rr(&sched->sched_rq[i]);
+			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
+							dequeue) :
+			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
+						      dequeue);
 		if (entity)
 			break;
 	}
@@ -1024,30 +1076,44 @@  drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 EXPORT_SYMBOL(drm_sched_pick_best);
 
 /**
- * drm_sched_main - main scheduler thread
+ * drm_sched_free_job_work - worker to call free_job
  *
- * @param: scheduler instance
+ * @w: free job work
  */
-static void drm_sched_main(struct work_struct *w)
+static void drm_sched_free_job_work(struct work_struct *w)
 {
 	struct drm_gpu_scheduler *sched =
-		container_of(w, struct drm_gpu_scheduler, work_submit);
-	struct drm_sched_entity *entity;
+		container_of(w, struct drm_gpu_scheduler, work_free_job);
 	struct drm_sched_job *cleanup_job;
-	int r;
 
 	if (READ_ONCE(sched->pause_submit))
 		return;
 
 	cleanup_job = drm_sched_get_cleanup_job(sched);
-	entity = drm_sched_select_entity(sched);
+	if (cleanup_job) {
+		sched->ops->free_job(cleanup_job);
+
+		drm_sched_free_job_queue_if_ready(sched);
+		drm_sched_run_job_queue_if_ready(sched);
+	}
+}
 
-	if (!entity && !cleanup_job)
-		return;	/* No more work */
+/**
+ * drm_sched_run_job_work - worker to call run_job
+ *
+ * @w: run job work
+ */
+static void drm_sched_run_job_work(struct work_struct *w)
+{
+	struct drm_gpu_scheduler *sched =
+		container_of(w, struct drm_gpu_scheduler, work_run_job);
+	struct drm_sched_entity *entity;
+	int r;
 
-	if (cleanup_job)
-		sched->ops->free_job(cleanup_job);
+	if (READ_ONCE(sched->pause_submit))
+		return;
 
+	entity = drm_sched_select_entity(sched, true);
 	if (entity) {
 		struct dma_fence *fence;
 		struct drm_sched_fence *s_fence;
@@ -1056,9 +1122,7 @@  static void drm_sched_main(struct work_struct *w)
 		sched_job = drm_sched_entity_pop_job(entity);
 		if (!sched_job) {
 			complete_all(&entity->entity_idle);
-			if (!cleanup_job)
-				return;	/* No more work */
-			goto again;
+			return;	/* No more work */
 		}
 
 		s_fence = sched_job->s_fence;
@@ -1088,10 +1152,8 @@  static void drm_sched_main(struct work_struct *w)
 		}
 
 		wake_up(&sched->job_scheduled);
+		drm_sched_run_job_queue_if_ready(sched);
 	}
-
-again:
-	drm_sched_submit_queue(sched);
 }
 
 /**
@@ -1150,7 +1212,8 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
-	INIT_WORK(&sched->work_submit, drm_sched_main);
+	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
+	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
 	atomic_set(&sched->_score, 0);
 	atomic64_set(&sched->job_id_count, 0);
 	sched->pause_submit = false;
@@ -1275,7 +1338,8 @@  EXPORT_SYMBOL(drm_sched_submit_ready);
 void drm_sched_submit_stop(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, true);
-	cancel_work_sync(&sched->work_submit);
+	cancel_work_sync(&sched->work_run_job);
+	cancel_work_sync(&sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_submit_stop);
 
@@ -1287,6 +1351,7 @@  EXPORT_SYMBOL(drm_sched_submit_stop);
 void drm_sched_submit_start(struct drm_gpu_scheduler *sched)
 {
 	WRITE_ONCE(sched->pause_submit, false);
-	queue_work(sched->submit_wq, &sched->work_submit);
+	queue_work(sched->submit_wq, &sched->work_run_job);
+	queue_work(sched->submit_wq, &sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_submit_start);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 04eec2d7635f..fbc083a92757 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -487,9 +487,10 @@  struct drm_sched_backend_ops {
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
- * @submit_wq: workqueue used to queue @work_submit
+ * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
- * @work_submit: schedules jobs and cleans up entities
+ * @work_run_job: schedules jobs
+ * @work_free_job: cleans up jobs
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
  * @pending_list: the list of jobs which are currently in the job queue.
@@ -518,7 +519,8 @@  struct drm_gpu_scheduler {
 	atomic64_t			job_id_count;
 	struct workqueue_struct		*submit_wq;
 	struct workqueue_struct		*timeout_wq;
-	struct work_struct		work_submit;
+	struct work_struct		work_run_job;
+	struct work_struct		work_free_job;
 	struct delayed_work		work_tdr;
 	struct list_head		pending_list;
 	spinlock_t			job_list_lock;