diff mbox series

[drm-misc-next,1/3] drm/sched: implement dynamic job flow control

Message ID 20230924224555.15595-1-dakr@redhat.com (mailing list archive)
State New, archived
Headers show
Series [drm-misc-next,1/3] drm/sched: implement dynamic job flow control | expand

Commit Message

Danilo Krummrich Sept. 24, 2023, 10:43 p.m. UTC
Currently, job flow control is implemented simply by limiting the number
of jobs in flight. Therefore, a scheduler is initialized with a
submission limit that corresponds to a certain number of jobs.

This implies that for each job drivers need to account for the maximum
job size possible in order to not overflow the ring buffer.

However, there are drivers, such as Nouveau, where the job size has a
rather large range. For such drivers it can easily happen that job
submissions not even filling the ring by 1% can block subsequent
submissions, which, in the worst case, can lead to the ring running dry.

In order to overcome this issue, allow for tracking the actual job size
instead of the number of jobs. Therefore, add a field to track a job's
submission units, which represents the number of units a job contributes
to the scheduler's submission limit.

Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
This patch is based on Matt's scheduler work [1].

[1] https://lore.kernel.org/dri-devel/20230919050155.2647172-1-matthew.brost@intel.com/
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  2 +-
 drivers/gpu/drm/lima/lima_sched.c             |  2 +-
 drivers/gpu/drm/msm/msm_gem_submit.c          |  2 +-
 drivers/gpu/drm/nouveau/nouveau_sched.c       |  2 +-
 drivers/gpu/drm/panfrost/panfrost_drv.c       |  2 +-
 .../gpu/drm/scheduler/gpu_scheduler_trace.h   |  2 +-
 drivers/gpu/drm/scheduler/sched_entity.c      |  5 +-
 drivers/gpu/drm/scheduler/sched_main.c        | 81 +++++++++++++------
 drivers/gpu/drm/v3d/v3d_gem.c                 |  2 +-
 include/drm/gpu_scheduler.h                   | 18 +++--
 11 files changed, 78 insertions(+), 42 deletions(-)

Comments

Boris Brezillon Sept. 25, 2023, 12:55 p.m. UTC | #1
+The imagination team, who's probably interested too.

On Mon, 25 Sep 2023 00:43:06 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> Currently, job flow control is implemented simply by limiting the amount
> of jobs in flight. Therefore, a scheduler is initialized with a
> submission limit that corresponds to a certain amount of jobs.
> 
> This implies that for each job drivers need to account for the maximum
> job size possible in order to not overflow the ring buffer.
> 
> However, there are drivers, such as Nouveau, where the job size has a
> rather large range. For such drivers it can easily happen that job
> submissions not even filling the ring by 1% can block subsequent
> submissions, which, in the worst case, can lead to the ring run dry.
> 
> In order to overcome this issue, allow for tracking the actual job size
> instead of the amount job jobs. Therefore, add a field to track a job's
> submission units, which represents the amount of units a job contributes
> to the scheduler's submission limit.

As mentioned earlier, this might allow some simplifications in the
PowerVR driver where we do flow-control using a dma_fence returned
through ->prepare_job(). The only thing that'd be missing is a way to
dynamically query the size of a job (a new hook?), instead of having the
size fixed at creation time, because PVR jobs embed native fence waits,
and the number of native fences will decrease if some of these fences
are signalled before ->run_job() is called, thus reducing the job size.

> 
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
> This patch is based on Matt's scheduler work [1].
> 
> [1] https://lore.kernel.org/dri-devel/20230919050155.2647172-1-matthew.brost@intel.com/
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  2 +-
>  drivers/gpu/drm/lima/lima_sched.c             |  2 +-
>  drivers/gpu/drm/msm/msm_gem_submit.c          |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_sched.c       |  2 +-
>  drivers/gpu/drm/panfrost/panfrost_drv.c       |  2 +-
>  .../gpu/drm/scheduler/gpu_scheduler_trace.h   |  2 +-
>  drivers/gpu/drm/scheduler/sched_entity.c      |  5 +-
>  drivers/gpu/drm/scheduler/sched_main.c        | 81 +++++++++++++------
>  drivers/gpu/drm/v3d/v3d_gem.c                 |  2 +-
>  include/drm/gpu_scheduler.h                   | 18 +++--
>  11 files changed, 78 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 78476bc75b4e..d54daaf64bf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -115,7 +115,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (!entity)
>  		return 0;
>  
> -	return drm_sched_job_init(&(*job)->base, entity, owner);
> +	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
>  }
>  
>  int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> index 45403ea38906..74a446711207 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> @@ -538,7 +538,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
>  
>  	ret = drm_sched_job_init(&submit->sched_job,
>  				 &ctx->sched_entity[args->pipe],
> -				 submit->ctx);
> +				 1, submit->ctx);
>  	if (ret)
>  		goto err_submit_put;
>  
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index 50c2075228aa..5dc6678e1eb9 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
>  	for (i = 0; i < num_bos; i++)
>  		drm_gem_object_get(&bos[i]->base.base);
>  
> -	err = drm_sched_job_init(&task->base, &context->base, vm);
> +	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
>  	if (err) {
>  		kfree(task->bos);
>  		return err;
> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
> index 3f1aa4de3b87..6d230c38e4f5 100644
> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
> @@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
>  		return ERR_PTR(ret);
>  	}
>  
> -	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
> +	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
>  	if (ret) {
>  		kfree(submit->hw_fence);
>  		kfree(submit);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index f26a814a9920..e991426d86e4 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -89,7 +89,7 @@ nouveau_job_init(struct nouveau_job *job,
>  
>  	}
>  
> -	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
> +	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
>  	if (ret)
>  		goto err_free_chains;
>  
> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
> index a2ab99698ca8..d5e777deee5c 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
> @@ -272,7 +272,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
>  
>  	ret = drm_sched_job_init(&job->base,
>  				 &file_priv->sched_entity[slot],
> -				 NULL);
> +				 1, NULL);
>  	if (ret)
>  		goto out_put_job;
>  
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> index 3143ecaaff86..2e4ffdecc5dc 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
>  			   __assign_str(name, sched_job->sched->name);
>  			   __entry->job_count = spsc_queue_count(&entity->job_queue);
>  			   __entry->hw_job_count = atomic_read(
> -				   &sched_job->sched->hw_rq_count);
> +				   &sched_job->sched->submission_count);
>  			   ),
>  	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
>  		      __entry->entity, __entry->id,
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 437c50867c99..6395090d5784 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -401,7 +401,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
>  		container_of(cb, struct drm_sched_entity, cb);
>  
>  	drm_sched_entity_clear_dep(f, cb);
> -	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity));
> +	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity),
> +				      entity);
>  }
>  
>  /**
> @@ -645,7 +646,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  		if (fifo)
>  			drm_sched_rq_update_fifo(entity, submit_ts);
>  
> -		drm_sched_wakeup_if_can_queue(sched);
> +		drm_sched_wakeup_if_can_queue(sched, entity);
>  	}
>  }
>  EXPORT_SYMBOL(drm_sched_entity_push_job);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 88ef8be2d3c7..857622dd842e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -93,6 +93,9 @@ int drm_sched_policy_default = DRM_SCHED_POLICY_FIFO;
>  MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
>  module_param_named(sched_policy, drm_sched_policy_default, int, 0444);
>  
> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_entity *entity);
> +
>  static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
>  							    const struct rb_node *b)
>  {
> @@ -212,13 +215,15 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>  /**
>   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>   *
> + * @sched: the gpu scheduler
>   * @rq: scheduler run queue to check.
>   * @dequeue: dequeue selected entity
>   *
>   * Try to find a ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> +drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
> +			      struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct drm_sched_entity *entity;
>  
> @@ -227,7 +232,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  	entity = rq->current_entity;
>  	if (entity) {
>  		list_for_each_entry_continue(entity, &rq->entities, list) {
> -			if (drm_sched_entity_is_ready(entity)) {
> +			if (drm_sched_entity_is_ready(entity) &&
> +			    drm_sched_can_queue(sched, entity)) {
>  				if (dequeue) {
>  					rq->current_entity = entity;
>  					reinit_completion(&entity->entity_idle);
> @@ -240,7 +246,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  
>  	list_for_each_entry(entity, &rq->entities, list) {
>  
> -		if (drm_sched_entity_is_ready(entity)) {
> +		if (drm_sched_entity_is_ready(entity) &&
> +		    drm_sched_can_queue(sched, entity)) {
>  			if (dequeue) {
>  				rq->current_entity = entity;
>  				reinit_completion(&entity->entity_idle);
> @@ -261,13 +268,15 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  /**
>   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>   *
> + * @sched: the gpu scheduler
>   * @rq: scheduler run queue to check.
>   * @dequeue: dequeue selected entity
>   *
>   * Find oldest waiting ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> +drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct rb_node *rb;
>  
> @@ -276,7 +285,8 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>  		struct drm_sched_entity *entity;
>  
>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> -		if (drm_sched_entity_is_ready(entity)) {
> +		if (drm_sched_entity_is_ready(entity) &&
> +		    drm_sched_can_queue(sched, entity)) {
>  			if (dequeue) {
>  				rq->current_entity = entity;
>  				reinit_completion(&entity->entity_idle);
> @@ -300,15 +310,27 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>  }
>  
>  /**
> - * drm_sched_can_queue -- Can we queue more to the hardware?
> + * drm_sched_can_queue - can we queue more jobs?
>   * @sched: scheduler instance
> + * @entity: the scheduler entity
>   *
> - * Return true if we can push more jobs to the hw, otherwise false.
> + * Return true if we can push at least one more job from @entity, false
> + * otherwise.
>   */
> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_entity *entity)
>  {
> -	return atomic_read(&sched->hw_rq_count) <
> -		sched->hw_submission_limit;
> +	struct drm_sched_job *s_job;
> +
> +	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
> +	if (!s_job)
> +		return false;
> +
> +	WARN_ON(s_job->submission_units > sched->submission_limit);
> +
> +	return (sched->submission_limit -
> +		atomic_read(&sched->submission_count)) >=
> +		s_job->submission_units;
>  }
>  
>  /**
> @@ -325,12 +347,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>  	struct drm_sched_entity *entity;
>  	int i;
>  
> -	if (!drm_sched_can_queue(sched))
> -		return NULL;
> -
>  	if (sched->single_entity) {
>  		if (!READ_ONCE(sched->single_entity->stopped) &&
> -		    drm_sched_entity_is_ready(sched->single_entity))
> +		    drm_sched_entity_is_ready(sched->single_entity) &&
> +		    drm_sched_can_queue(sched, sched->single_entity))
>  			return sched->single_entity;
>  
>  		return NULL;
> @@ -339,9 +359,11 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>  	/* Kernel run queue has higher priority than normal run queue*/
>  	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>  		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> +			drm_sched_rq_select_entity_fifo(sched,
> +							&sched->sched_rq[i],
>  							dequeue) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> +			drm_sched_rq_select_entity_rr(sched,
> +						      &sched->sched_rq[i],
>  						      dequeue);
>  		if (entity)
>  			break;
> @@ -399,7 +421,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>  	struct drm_sched_fence *s_fence = s_job->s_fence;
>  	struct drm_gpu_scheduler *sched = s_fence->sched;
>  
> -	atomic_dec(&sched->hw_rq_count);
> +	atomic_sub(s_job->submission_units, &sched->submission_count);
>  	atomic_dec(sched->score);
>  
>  	trace_drm_sched_process_job(s_fence);
> @@ -622,7 +644,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>  					      &s_job->cb)) {
>  			dma_fence_put(s_job->s_fence->parent);
>  			s_job->s_fence->parent = NULL;
> -			atomic_dec(&sched->hw_rq_count);
> +			atomic_sub(s_job->submission_units,
> +				   &sched->submission_count);
>  		} else {
>  			/*
>  			 * remove job from pending_list.
> @@ -683,7 +706,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>  	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>  		struct dma_fence *fence = s_job->s_fence->parent;
>  
> -		atomic_inc(&sched->hw_rq_count);
> +		atomic_add(s_job->submission_units, &sched->submission_count);
>  
>  		if (!full_recovery)
>  			continue;
> @@ -764,6 +787,8 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>   * drm_sched_job_init - init a scheduler job
>   * @job: scheduler job to init
>   * @entity: scheduler entity to use
> + * @submission_units: the amount of units this job contributes to the schdulers
> + * submission limit
>   * @owner: job owner for debugging
>   *
>   * Refer to drm_sched_entity_push_job() documentation
> @@ -781,6 +806,7 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>   */
>  int drm_sched_job_init(struct drm_sched_job *job,
>  		       struct drm_sched_entity *entity,
> +		       u32 submission_units,
>  		       void *owner)
>  {
>  	if (!entity->rq && !entity->single_sched)
> @@ -792,6 +818,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  		return -ENOMEM;
>  
>  	INIT_LIST_HEAD(&job->list);
> +	job->submission_units = submission_units ? submission_units : 1;
>  
>  	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
>  
> @@ -1004,12 +1031,14 @@ EXPORT_SYMBOL(drm_sched_job_cleanup);
>  /**
>   * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>   * @sched: scheduler instance
> + * @entity: the scheduler entity
>   *
>   * Wake up the scheduler if we can queue jobs.
>   */
> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
> +				   struct drm_sched_entity *entity)
>  {
> -	if (drm_sched_can_queue(sched))
> +	if (drm_sched_can_queue(sched, entity))
>  		drm_sched_run_job_queue(sched);
>  }
>  
> @@ -1147,7 +1176,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>  
>  	s_fence = sched_job->s_fence;
>  
> -	atomic_inc(&sched->hw_rq_count);
> +	atomic_add(sched_job->submission_units, &sched->submission_count);
>  	drm_sched_job_begin(sched_job);
>  
>  	trace_drm_run_job(sched_job, entity);
> @@ -1183,7 +1212,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>   * @ops: backend operations for this scheduler
>   * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
>   *	       allocated and used
> - * @hw_submission: number of hw submissions that can be in flight
> + * @max_submission_units: number of submission units that can be in flight
>   * @hang_limit: number of times to allow a job to hang before dropping it
>   * @timeout: timeout value in jiffies for the scheduler
>   * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
> @@ -1198,7 +1227,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>  int drm_sched_init(struct drm_gpu_scheduler *sched,
>  		   const struct drm_sched_backend_ops *ops,
>  		   struct workqueue_struct *submit_wq,
> -		   unsigned hw_submission, unsigned hang_limit,
> +		   unsigned max_submission_units, unsigned hang_limit,
>  		   long timeout, struct workqueue_struct *timeout_wq,
>  		   atomic_t *score, const char *name,
>  		   enum drm_sched_policy sched_policy,
> @@ -1211,7 +1240,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  
>  	sched->ops = ops;
>  	sched->single_entity = NULL;
> -	sched->hw_submission_limit = hw_submission;
> +	sched->submission_limit = max_submission_units;
>  	sched->name = name;
>  	if (!submit_wq) {
>  		sched->submit_wq = alloc_ordered_workqueue(name, 0);
> @@ -1238,7 +1267,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  	init_waitqueue_head(&sched->job_scheduled);
>  	INIT_LIST_HEAD(&sched->pending_list);
>  	spin_lock_init(&sched->job_list_lock);
> -	atomic_set(&sched->hw_rq_count, 0);
> +	atomic_set(&sched->submission_count, 0);
>  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>  	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>  	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
> index 2e94ce788c71..8479e5302f7b 100644
> --- a/drivers/gpu/drm/v3d/v3d_gem.c
> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
> @@ -417,7 +417,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
>  	job->free = free;
>  
>  	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
> -				 v3d_priv);
> +				 1, v3d_priv);
>  	if (ret)
>  		goto fail;
>  
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 27f5778bbd6d..89b0aecd02e3 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -329,6 +329,8 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   * @sched: the scheduler instance on which this job is scheduled.
>   * @s_fence: contains the fences for the scheduling of job.
>   * @finish_cb: the callback for the finished fence.
> + * @submission_units: the amount of submission units this job contributes to
> + *                    the scheduler
>   * @work: Helper to reschdeule job kill to different context.
>   * @id: a unique id assigned to each job scheduled on the scheduler.
>   * @karma: increment on every hang caused by this job. If this exceeds the hang
> @@ -348,6 +350,8 @@ struct drm_sched_job {
>  	struct drm_gpu_scheduler	*sched;
>  	struct drm_sched_fence		*s_fence;
>  
> +	u32				submission_units;
> +
>  	/*
>  	 * work is used only after finish_cb has been used and will not be
>  	 * accessed anymore.
> @@ -478,14 +482,14 @@ struct drm_sched_backend_ops {
>   *
>   * @ops: backend operations provided by the driver.
>   * @single_entity: Single entity for the scheduler
> - * @hw_submission_limit: the max size of the hardware queue.
> + * @submission_limit: the maximim amount of submission units
> + * @submission_count: the number current amount of submission units in flight
>   * @timeout: the time after which a job is removed from the scheduler.
>   * @name: name of the ring for which this scheduler is being used.
>   * @sched_rq: priority wise array of run queues.
>   * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>   *                 waits on this wait queue until all the scheduled jobs are
>   *                 finished.
> - * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
>   * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>   * @timeout_wq: workqueue used to queue @work_tdr
> @@ -511,12 +515,12 @@ struct drm_sched_backend_ops {
>  struct drm_gpu_scheduler {
>  	const struct drm_sched_backend_ops	*ops;
>  	struct drm_sched_entity		*single_entity;
> -	uint32_t			hw_submission_limit;
> +	u32				submission_limit;
> +	atomic_t			submission_count;
>  	long				timeout;
>  	const char			*name;
>  	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>  	wait_queue_head_t		job_scheduled;
> -	atomic_t			hw_rq_count;
>  	atomic64_t			job_id_count;
>  	struct workqueue_struct		*submit_wq;
>  	struct workqueue_struct		*timeout_wq;
> @@ -539,7 +543,7 @@ struct drm_gpu_scheduler {
>  int drm_sched_init(struct drm_gpu_scheduler *sched,
>  		   const struct drm_sched_backend_ops *ops,
>  		   struct workqueue_struct *submit_wq,
> -		   uint32_t hw_submission, unsigned hang_limit,
> +		   uint32_t max_submission_units, unsigned hang_limit,
>  		   long timeout, struct workqueue_struct *timeout_wq,
>  		   atomic_t *score, const char *name,
>  		   enum drm_sched_policy sched_policy,
> @@ -548,6 +552,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  void drm_sched_fini(struct drm_gpu_scheduler *sched);
>  int drm_sched_job_init(struct drm_sched_job *job,
>  		       struct drm_sched_entity *entity,
> +		       u32 submission_units,
>  		       void *owner);
>  void drm_sched_job_arm(struct drm_sched_job *job);
>  int drm_sched_job_add_dependency(struct drm_sched_job *job,
> @@ -570,7 +575,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>  
>  void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched);
>  void drm_sched_job_cleanup(struct drm_sched_job *job);
> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched);
> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
> +				   struct drm_sched_entity *entity);
>  bool drm_sched_submit_ready(struct drm_gpu_scheduler *sched);
>  void drm_sched_submit_stop(struct drm_gpu_scheduler *sched);
>  void drm_sched_submit_start(struct drm_gpu_scheduler *sched);
Christian König Sept. 25, 2023, 5:55 p.m. UTC | #2
Am 25.09.23 um 14:55 schrieb Boris Brezillon:
> +The imagination team, who's probably interested too.
>
> On Mon, 25 Sep 2023 00:43:06 +0200
> Danilo Krummrich <dakr@redhat.com> wrote:
>
>> Currently, job flow control is implemented simply by limiting the amount
>> of jobs in flight. Therefore, a scheduler is initialized with a
>> submission limit that corresponds to a certain amount of jobs.
>>
>> This implies that for each job drivers need to account for the maximum
>> job size possible in order to not overflow the ring buffer.
>>
>> However, there are drivers, such as Nouveau, where the job size has a
>> rather large range. For such drivers it can easily happen that job
>> submissions not even filling the ring by 1% can block subsequent
>> submissions, which, in the worst case, can lead to the ring run dry.
>>
>> In order to overcome this issue, allow for tracking the actual job size
>> instead of the amount job jobs. Therefore, add a field to track a job's
>> submission units, which represents the amount of units a job contributes
>> to the scheduler's submission limit.
> As mentioned earlier, this might allow some simplifications in the
> PowerVR driver where we do flow-control using a dma_fence returned
> through ->prepare_job(). The only thing that'd be missing is a way to
> dynamically query the size of a job (a new hook?), instead of having the
> size fixed at creation time, because PVR jobs embed native fence waits,
> and the number of native fences will decrease if some of these fences
> are signalled before ->run_job() is called, thus reducing the job size.

Exactly that is a little bit questionable since it allows the device
to postpone jobs indefinitely.

It would be good if the scheduler were able to validate, at the time a
job is pushed into the entity, whether it will ever be able to run it.

Otherwise you can end up with really hard to debug problems.

Regards,
Christian.

>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>> This patch is based on Matt's scheduler work [1].
>>
>> [1] https://lore.kernel.org/dri-devel/20230919050155.2647172-1-matthew.brost@intel.com/
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
>>   drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  2 +-
>>   drivers/gpu/drm/lima/lima_sched.c             |  2 +-
>>   drivers/gpu/drm/msm/msm_gem_submit.c          |  2 +-
>>   drivers/gpu/drm/nouveau/nouveau_sched.c       |  2 +-
>>   drivers/gpu/drm/panfrost/panfrost_drv.c       |  2 +-
>>   .../gpu/drm/scheduler/gpu_scheduler_trace.h   |  2 +-
>>   drivers/gpu/drm/scheduler/sched_entity.c      |  5 +-
>>   drivers/gpu/drm/scheduler/sched_main.c        | 81 +++++++++++++------
>>   drivers/gpu/drm/v3d/v3d_gem.c                 |  2 +-
>>   include/drm/gpu_scheduler.h                   | 18 +++--
>>   11 files changed, 78 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 78476bc75b4e..d54daaf64bf1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -115,7 +115,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   	if (!entity)
>>   		return 0;
>>   
>> -	return drm_sched_job_init(&(*job)->base, entity, owner);
>> +	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
>>   }
>>   
>>   int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> index 45403ea38906..74a446711207 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> @@ -538,7 +538,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   
>>   	ret = drm_sched_job_init(&submit->sched_job,
>>   				 &ctx->sched_entity[args->pipe],
>> -				 submit->ctx);
>> +				 1, submit->ctx);
>>   	if (ret)
>>   		goto err_submit_put;
>>   
>> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
>> index 50c2075228aa..5dc6678e1eb9 100644
>> --- a/drivers/gpu/drm/lima/lima_sched.c
>> +++ b/drivers/gpu/drm/lima/lima_sched.c
>> @@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
>>   	for (i = 0; i < num_bos; i++)
>>   		drm_gem_object_get(&bos[i]->base.base);
>>   
>> -	err = drm_sched_job_init(&task->base, &context->base, vm);
>> +	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
>>   	if (err) {
>>   		kfree(task->bos);
>>   		return err;
>> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
>> index 3f1aa4de3b87..6d230c38e4f5 100644
>> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
>> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
>> @@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
>>   		return ERR_PTR(ret);
>>   	}
>>   
>> -	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
>> +	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
>>   	if (ret) {
>>   		kfree(submit->hw_fence);
>>   		kfree(submit);
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
>> index f26a814a9920..e991426d86e4 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
>> @@ -89,7 +89,7 @@ nouveau_job_init(struct nouveau_job *job,
>>   
>>   	}
>>   
>> -	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
>> +	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
>>   	if (ret)
>>   		goto err_free_chains;
>>   
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
>> index a2ab99698ca8..d5e777deee5c 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
>> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
>> @@ -272,7 +272,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
>>   
>>   	ret = drm_sched_job_init(&job->base,
>>   				 &file_priv->sched_entity[slot],
>> -				 NULL);
>> +				 1, NULL);
>>   	if (ret)
>>   		goto out_put_job;
>>   
>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> index 3143ecaaff86..2e4ffdecc5dc 100644
>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
>>   			   __assign_str(name, sched_job->sched->name);
>>   			   __entry->job_count = spsc_queue_count(&entity->job_queue);
>>   			   __entry->hw_job_count = atomic_read(
>> -				   &sched_job->sched->hw_rq_count);
>> +				   &sched_job->sched->submission_count);
>>   			   ),
>>   	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
>>   		      __entry->entity, __entry->id,
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 437c50867c99..6395090d5784 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -401,7 +401,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
>>   		container_of(cb, struct drm_sched_entity, cb);
>>   
>>   	drm_sched_entity_clear_dep(f, cb);
>> -	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity));
>> +	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity),
>> +				      entity);
>>   }
>>   
>>   /**
>> @@ -645,7 +646,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>   		if (fifo)
>>   			drm_sched_rq_update_fifo(entity, submit_ts);
>>   
>> -		drm_sched_wakeup_if_can_queue(sched);
>> +		drm_sched_wakeup_if_can_queue(sched, entity);
>>   	}
>>   }
>>   EXPORT_SYMBOL(drm_sched_entity_push_job);
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 88ef8be2d3c7..857622dd842e 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -93,6 +93,9 @@ int drm_sched_policy_default = DRM_SCHED_POLICY_FIFO;
>>   MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
>>   module_param_named(sched_policy, drm_sched_policy_default, int, 0444);
>>   
>> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_entity *entity);
>> +
>>   static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
>>   							    const struct rb_node *b)
>>   {
>> @@ -212,13 +215,15 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>   /**
>>    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>    *
>> + * @sched: the gpu scheduler
>>    * @rq: scheduler run queue to check.
>>    * @dequeue: dequeue selected entity
>>    *
>>    * Try to find a ready entity, returns NULL if none found.
>>    */
>>   static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>> +drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
>> +			      struct drm_sched_rq *rq, bool dequeue)
>>   {
>>   	struct drm_sched_entity *entity;
>>   
>> @@ -227,7 +232,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   	entity = rq->current_entity;
>>   	if (entity) {
>>   		list_for_each_entry_continue(entity, &rq->entities, list) {
>> -			if (drm_sched_entity_is_ready(entity)) {
>> +			if (drm_sched_entity_is_ready(entity) &&
>> +			    drm_sched_can_queue(sched, entity)) {
>>   				if (dequeue) {
>>   					rq->current_entity = entity;
>>   					reinit_completion(&entity->entity_idle);
>> @@ -240,7 +246,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   
>>   	list_for_each_entry(entity, &rq->entities, list) {
>>   
>> -		if (drm_sched_entity_is_ready(entity)) {
>> +		if (drm_sched_entity_is_ready(entity) &&
>> +		    drm_sched_can_queue(sched, entity)) {
>>   			if (dequeue) {
>>   				rq->current_entity = entity;
>>   				reinit_completion(&entity->entity_idle);
>> @@ -261,13 +268,15 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   /**
>>    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>    *
>> + * @sched: the gpu scheduler
>>    * @rq: scheduler run queue to check.
>>    * @dequeue: dequeue selected entity
>>    *
>>    * Find oldest waiting ready entity, returns NULL if none found.
>>    */
>>   static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>> +drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_rq *rq, bool dequeue)
>>   {
>>   	struct rb_node *rb;
>>   
>> @@ -276,7 +285,8 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>   		struct drm_sched_entity *entity;
>>   
>>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>> -		if (drm_sched_entity_is_ready(entity)) {
>> +		if (drm_sched_entity_is_ready(entity) &&
>> +		    drm_sched_can_queue(sched, entity)) {
>>   			if (dequeue) {
>>   				rq->current_entity = entity;
>>   				reinit_completion(&entity->entity_idle);
>> @@ -300,15 +310,27 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>   }
>>   
>>   /**
>> - * drm_sched_can_queue -- Can we queue more to the hardware?
>> + * drm_sched_can_queue - can we queue more jobs?
>>    * @sched: scheduler instance
>> + * @entity: the scheduler entity
>>    *
>> - * Return true if we can push more jobs to the hw, otherwise false.
>> + * Return true if we can push at least one more job from @entity, false
>> + * otherwise.
>>    */
>> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_entity *entity)
>>   {
>> -	return atomic_read(&sched->hw_rq_count) <
>> -		sched->hw_submission_limit;
>> +	struct drm_sched_job *s_job;
>> +
>> +	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>> +	if (!s_job)
>> +		return false;
>> +
>> +	WARN_ON(s_job->submission_units > sched->submission_limit);
>> +
>> +	return (sched->submission_limit -
>> +		atomic_read(&sched->submission_count)) >=
>> +		s_job->submission_units;
>>   }
>>   
>>   /**
>> @@ -325,12 +347,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>   	struct drm_sched_entity *entity;
>>   	int i;
>>   
>> -	if (!drm_sched_can_queue(sched))
>> -		return NULL;
>> -
>>   	if (sched->single_entity) {
>>   		if (!READ_ONCE(sched->single_entity->stopped) &&
>> -		    drm_sched_entity_is_ready(sched->single_entity))
>> +		    drm_sched_entity_is_ready(sched->single_entity) &&
>> +		    drm_sched_can_queue(sched, sched->single_entity))
>>   			return sched->single_entity;
>>   
>>   		return NULL;
>> @@ -339,9 +359,11 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>   	/* Kernel run queue has higher priority than normal run queue*/
>>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>> +			drm_sched_rq_select_entity_fifo(sched,
>> +							&sched->sched_rq[i],
>>   							dequeue) :
>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>> +			drm_sched_rq_select_entity_rr(sched,
>> +						      &sched->sched_rq[i],
>>   						      dequeue);
>>   		if (entity)
>>   			break;
>> @@ -399,7 +421,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>   	struct drm_sched_fence *s_fence = s_job->s_fence;
>>   	struct drm_gpu_scheduler *sched = s_fence->sched;
>>   
>> -	atomic_dec(&sched->hw_rq_count);
>> +	atomic_sub(s_job->submission_units, &sched->submission_count);
>>   	atomic_dec(sched->score);
>>   
>>   	trace_drm_sched_process_job(s_fence);
>> @@ -622,7 +644,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>   					      &s_job->cb)) {
>>   			dma_fence_put(s_job->s_fence->parent);
>>   			s_job->s_fence->parent = NULL;
>> -			atomic_dec(&sched->hw_rq_count);
>> +			atomic_sub(s_job->submission_units,
>> +				   &sched->submission_count);
>>   		} else {
>>   			/*
>>   			 * remove job from pending_list.
>> @@ -683,7 +706,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>   	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>>   		struct dma_fence *fence = s_job->s_fence->parent;
>>   
>> -		atomic_inc(&sched->hw_rq_count);
>> +		atomic_add(s_job->submission_units, &sched->submission_count);
>>   
>>   		if (!full_recovery)
>>   			continue;
>> @@ -764,6 +787,8 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>    * drm_sched_job_init - init a scheduler job
>>    * @job: scheduler job to init
>>    * @entity: scheduler entity to use
>> + * @submission_units: the amount of units this job contributes to the schdulers
>> + * submission limit
>>    * @owner: job owner for debugging
>>    *
>>    * Refer to drm_sched_entity_push_job() documentation
>> @@ -781,6 +806,7 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>    */
>>   int drm_sched_job_init(struct drm_sched_job *job,
>>   		       struct drm_sched_entity *entity,
>> +		       u32 submission_units,
>>   		       void *owner)
>>   {
>>   	if (!entity->rq && !entity->single_sched)
>> @@ -792,6 +818,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>>   		return -ENOMEM;
>>   
>>   	INIT_LIST_HEAD(&job->list);
>> +	job->submission_units = submission_units ? submission_units : 1;
>>   
>>   	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
>>   
>> @@ -1004,12 +1031,14 @@ EXPORT_SYMBOL(drm_sched_job_cleanup);
>>   /**
>>    * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>>    * @sched: scheduler instance
>> + * @entity: the scheduler entity
>>    *
>>    * Wake up the scheduler if we can queue jobs.
>>    */
>> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
>> +				   struct drm_sched_entity *entity)
>>   {
>> -	if (drm_sched_can_queue(sched))
>> +	if (drm_sched_can_queue(sched, entity))
>>   		drm_sched_run_job_queue(sched);
>>   }
>>   
>> @@ -1147,7 +1176,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>   
>>   	s_fence = sched_job->s_fence;
>>   
>> -	atomic_inc(&sched->hw_rq_count);
>> +	atomic_add(sched_job->submission_units, &sched->submission_count);
>>   	drm_sched_job_begin(sched_job);
>>   
>>   	trace_drm_run_job(sched_job, entity);
>> @@ -1183,7 +1212,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>    * @ops: backend operations for this scheduler
>>    * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
>>    *	       allocated and used
>> - * @hw_submission: number of hw submissions that can be in flight
>> + * @max_submission_units: number of submission units that can be in flight
>>    * @hang_limit: number of times to allow a job to hang before dropping it
>>    * @timeout: timeout value in jiffies for the scheduler
>>    * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
>> @@ -1198,7 +1227,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   		   const struct drm_sched_backend_ops *ops,
>>   		   struct workqueue_struct *submit_wq,
>> -		   unsigned hw_submission, unsigned hang_limit,
>> +		   unsigned max_submission_units, unsigned hang_limit,
>>   		   long timeout, struct workqueue_struct *timeout_wq,
>>   		   atomic_t *score, const char *name,
>>   		   enum drm_sched_policy sched_policy,
>> @@ -1211,7 +1240,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   
>>   	sched->ops = ops;
>>   	sched->single_entity = NULL;
>> -	sched->hw_submission_limit = hw_submission;
>> +	sched->submission_limit = max_submission_units;
>>   	sched->name = name;
>>   	if (!submit_wq) {
>>   		sched->submit_wq = alloc_ordered_workqueue(name, 0);
>> @@ -1238,7 +1267,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   	init_waitqueue_head(&sched->job_scheduled);
>>   	INIT_LIST_HEAD(&sched->pending_list);
>>   	spin_lock_init(&sched->job_list_lock);
>> -	atomic_set(&sched->hw_rq_count, 0);
>> +	atomic_set(&sched->submission_count, 0);
>>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>   	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>   	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
>> index 2e94ce788c71..8479e5302f7b 100644
>> --- a/drivers/gpu/drm/v3d/v3d_gem.c
>> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
>> @@ -417,7 +417,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
>>   	job->free = free;
>>   
>>   	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
>> -				 v3d_priv);
>> +				 1, v3d_priv);
>>   	if (ret)
>>   		goto fail;
>>   
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 27f5778bbd6d..89b0aecd02e3 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -329,6 +329,8 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>    * @sched: the scheduler instance on which this job is scheduled.
>>    * @s_fence: contains the fences for the scheduling of job.
>>    * @finish_cb: the callback for the finished fence.
>> + * @submission_units: the amount of submission units this job contributes to
>> + *                    the scheduler
>>    * @work: Helper to reschdeule job kill to different context.
>>    * @id: a unique id assigned to each job scheduled on the scheduler.
>>    * @karma: increment on every hang caused by this job. If this exceeds the hang
>> @@ -348,6 +350,8 @@ struct drm_sched_job {
>>   	struct drm_gpu_scheduler	*sched;
>>   	struct drm_sched_fence		*s_fence;
>>   
>> +	u32				submission_units;
>> +
>>   	/*
>>   	 * work is used only after finish_cb has been used and will not be
>>   	 * accessed anymore.
>> @@ -478,14 +482,14 @@ struct drm_sched_backend_ops {
>>    *
>>    * @ops: backend operations provided by the driver.
>>    * @single_entity: Single entity for the scheduler
>> - * @hw_submission_limit: the max size of the hardware queue.
>> + * @submission_limit: the maximim amount of submission units
>> + * @submission_count: the number current amount of submission units in flight
>>    * @timeout: the time after which a job is removed from the scheduler.
>>    * @name: name of the ring for which this scheduler is being used.
>>    * @sched_rq: priority wise array of run queues.
>>    * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>    *                 waits on this wait queue until all the scheduled jobs are
>>    *                 finished.
>> - * @hw_rq_count: the number of jobs currently in the hardware queue.
>>    * @job_id_count: used to assign unique id to the each job.
>>    * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>    * @timeout_wq: workqueue used to queue @work_tdr
>> @@ -511,12 +515,12 @@ struct drm_sched_backend_ops {
>>   struct drm_gpu_scheduler {
>>   	const struct drm_sched_backend_ops	*ops;
>>   	struct drm_sched_entity		*single_entity;
>> -	uint32_t			hw_submission_limit;
>> +	u32				submission_limit;
>> +	atomic_t			submission_count;
>>   	long				timeout;
>>   	const char			*name;
>>   	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>   	wait_queue_head_t		job_scheduled;
>> -	atomic_t			hw_rq_count;
>>   	atomic64_t			job_id_count;
>>   	struct workqueue_struct		*submit_wq;
>>   	struct workqueue_struct		*timeout_wq;
>> @@ -539,7 +543,7 @@ struct drm_gpu_scheduler {
>>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   		   const struct drm_sched_backend_ops *ops,
>>   		   struct workqueue_struct *submit_wq,
>> -		   uint32_t hw_submission, unsigned hang_limit,
>> +		   uint32_t max_submission_units, unsigned hang_limit,
>>   		   long timeout, struct workqueue_struct *timeout_wq,
>>   		   atomic_t *score, const char *name,
>>   		   enum drm_sched_policy sched_policy,
>> @@ -548,6 +552,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   void drm_sched_fini(struct drm_gpu_scheduler *sched);
>>   int drm_sched_job_init(struct drm_sched_job *job,
>>   		       struct drm_sched_entity *entity,
>> +		       u32 submission_units,
>>   		       void *owner);
>>   void drm_sched_job_arm(struct drm_sched_job *job);
>>   int drm_sched_job_add_dependency(struct drm_sched_job *job,
>> @@ -570,7 +575,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>>   
>>   void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched);
>>   void drm_sched_job_cleanup(struct drm_sched_job *job);
>> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched);
>> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
>> +				   struct drm_sched_entity *entity);
>>   bool drm_sched_submit_ready(struct drm_gpu_scheduler *sched);
>>   void drm_sched_submit_stop(struct drm_gpu_scheduler *sched);
>>   void drm_sched_submit_start(struct drm_gpu_scheduler *sched);
Boris Brezillon Sept. 26, 2023, 7:11 a.m. UTC | #3
On Mon, 25 Sep 2023 19:55:21 +0200
Christian König <christian.koenig@amd.com> wrote:

> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
> > +The imagination team, who's probably interested too.
> >
> > On Mon, 25 Sep 2023 00:43:06 +0200
> > Danilo Krummrich <dakr@redhat.com> wrote:
> >  
> >> Currently, job flow control is implemented simply by limiting the amount
> >> of jobs in flight. Therefore, a scheduler is initialized with a
> >> submission limit that corresponds to a certain amount of jobs.
> >>
> >> This implies that for each job drivers need to account for the maximum
> >> job size possible in order to not overflow the ring buffer.
> >>
> >> However, there are drivers, such as Nouveau, where the job size has a
> >> rather large range. For such drivers it can easily happen that job
> >> submissions not even filling the ring by 1% can block subsequent
> >> submissions, which, in the worst case, can lead to the ring run dry.
> >>
> >> In order to overcome this issue, allow for tracking the actual job size
> >> instead of the amount job jobs. Therefore, add a field to track a job's
> >> submission units, which represents the amount of units a job contributes
> >> to the scheduler's submission limit.  
> > As mentioned earlier, this might allow some simplifications in the
> > PowerVR driver where we do flow-control using a dma_fence returned
> > through ->prepare_job(). The only thing that'd be missing is a way to
> > dynamically query the size of a job (a new hook?), instead of having the
> > size fixed at creation time, because PVR jobs embed native fence waits,
> > and the number of native fences will decrease if some of these fences
> > are signalled before ->run_job() is called, thus reducing the job size.  
> 
> Exactly that is a little bit questionable since it allows for the device 
> to postpone jobs infinitely.
> 
> It would be good if the scheduler is able to validate if it's ever able 
> to run the job when it is pushed into the entity.

Yes, we do that already. We check that the immutable part of the job
(everything that's not a native fence wait) fits in the ringbuf.
Luben Tuikov Sept. 26, 2023, 8:43 p.m. UTC | #4
Hi,

On 2023-09-24 18:43, Danilo Krummrich wrote:
> Currently, job flow control is implemented simply by limiting the amount
> of jobs in flight. Therefore, a scheduler is initialized with a
> submission limit that corresponds to a certain amount of jobs.

"certain"? How about this instead:
" ... that corresponds to the number of jobs which can be sent
  to the hardware."?

> 
> This implies that for each job drivers need to account for the maximum
                                ^,
Please add a comma after "job".

> job size possible in order to not overflow the ring buffer.

Well, different hardware designs would implement this differently.
Ideally, you only want pointers into the ring buffer, and then
the hardware consumes as much as it can. But this is a moot point
and it's always a good idea to have a "job size" hint from the client.
So this is a good patch.

Ideally, you want to say that the hardware needs to be able to
accommodate the number of jobs which can fit in the hardware
queue times the largest job. This is a waste of resources,
however, and it is better for the client to give a hint as to the size
of a job. If the hardware can peek and understand dependencies,
on top of knowing the "size of the job", it can be an extremely
efficient scheduler.

> 
> However, there are drivers, such as Nouveau, where the job size has a
> rather large range. For such drivers it can easily happen that job
> submissions not even filling the ring by 1% can block subsequent
> submissions, which, in the worst case, can lead to the ring run dry.
> 
> In order to overcome this issue, allow for tracking the actual job size
> instead of the amount job jobs. Therefore, add a field to track a job's

"the amount job jobs." --> "the number of jobs."

> submission units, which represents the amount of units a job contributes
> to the scheduler's submission limit.

Yeah, that's a good thing.

I suppose that drivers which don't support this would just use "1" to achieve
the same behaviour as before.

> 
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
> This patch is based on Matt's scheduler work [1].
> 
> [1] https://lore.kernel.org/dri-devel/20230919050155.2647172-1-matthew.brost@intel.com/
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  2 +-
>  drivers/gpu/drm/lima/lima_sched.c             |  2 +-
>  drivers/gpu/drm/msm/msm_gem_submit.c          |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_sched.c       |  2 +-
>  drivers/gpu/drm/panfrost/panfrost_drv.c       |  2 +-
>  .../gpu/drm/scheduler/gpu_scheduler_trace.h   |  2 +-
>  drivers/gpu/drm/scheduler/sched_entity.c      |  5 +-
>  drivers/gpu/drm/scheduler/sched_main.c        | 81 +++++++++++++------
>  drivers/gpu/drm/v3d/v3d_gem.c                 |  2 +-
>  include/drm/gpu_scheduler.h                   | 18 +++--
>  11 files changed, 78 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 78476bc75b4e..d54daaf64bf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -115,7 +115,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (!entity)
>  		return 0;
>  
> -	return drm_sched_job_init(&(*job)->base, entity, owner);
> +	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
>  }
>  
>  int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> index 45403ea38906..74a446711207 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
> @@ -538,7 +538,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
>  
>  	ret = drm_sched_job_init(&submit->sched_job,
>  				 &ctx->sched_entity[args->pipe],
> -				 submit->ctx);
> +				 1, submit->ctx);
>  	if (ret)
>  		goto err_submit_put;
>  
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index 50c2075228aa..5dc6678e1eb9 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
>  	for (i = 0; i < num_bos; i++)
>  		drm_gem_object_get(&bos[i]->base.base);
>  
> -	err = drm_sched_job_init(&task->base, &context->base, vm);
> +	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
>  	if (err) {
>  		kfree(task->bos);
>  		return err;
> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
> index 3f1aa4de3b87..6d230c38e4f5 100644
> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
> @@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
>  		return ERR_PTR(ret);
>  	}
>  
> -	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
> +	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
>  	if (ret) {
>  		kfree(submit->hw_fence);
>  		kfree(submit);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index f26a814a9920..e991426d86e4 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -89,7 +89,7 @@ nouveau_job_init(struct nouveau_job *job,
>  
>  	}
>  
> -	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
> +	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
>  	if (ret)
>  		goto err_free_chains;
>  
> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
> index a2ab99698ca8..d5e777deee5c 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
> @@ -272,7 +272,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
>  
>  	ret = drm_sched_job_init(&job->base,
>  				 &file_priv->sched_entity[slot],
> -				 NULL);
> +				 1, NULL);
>  	if (ret)
>  		goto out_put_job;
>  
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> index 3143ecaaff86..2e4ffdecc5dc 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
> @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
>  			   __assign_str(name, sched_job->sched->name);
>  			   __entry->job_count = spsc_queue_count(&entity->job_queue);
>  			   __entry->hw_job_count = atomic_read(
> -				   &sched_job->sched->hw_rq_count);
> +				   &sched_job->sched->submission_count);
>  			   ),
>  	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
>  		      __entry->entity, __entry->id,
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 437c50867c99..6395090d5784 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -401,7 +401,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
>  		container_of(cb, struct drm_sched_entity, cb);
>  
>  	drm_sched_entity_clear_dep(f, cb);
> -	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity));
> +	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity),
> +				      entity);
>  }
>  
>  /**
> @@ -645,7 +646,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  		if (fifo)
>  			drm_sched_rq_update_fifo(entity, submit_ts);
>  
> -		drm_sched_wakeup_if_can_queue(sched);
> +		drm_sched_wakeup_if_can_queue(sched, entity);
>  	}
>  }
>  EXPORT_SYMBOL(drm_sched_entity_push_job);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 88ef8be2d3c7..857622dd842e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -93,6 +93,9 @@ int drm_sched_policy_default = DRM_SCHED_POLICY_FIFO;
>  MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
>  module_param_named(sched_policy, drm_sched_policy_default, int, 0444);
>  
> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_entity *entity);
> +
>  static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
>  							    const struct rb_node *b)
>  {
> @@ -212,13 +215,15 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>  /**
>   * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>   *
> + * @sched: the gpu scheduler
>   * @rq: scheduler run queue to check.
>   * @dequeue: dequeue selected entity
>   *
>   * Try to find a ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
> +drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
> +			      struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct drm_sched_entity *entity;
>  
> @@ -227,7 +232,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  	entity = rq->current_entity;
>  	if (entity) {
>  		list_for_each_entry_continue(entity, &rq->entities, list) {
> -			if (drm_sched_entity_is_ready(entity)) {
> +			if (drm_sched_entity_is_ready(entity) &&
> +			    drm_sched_can_queue(sched, entity)) {
>  				if (dequeue) {
>  					rq->current_entity = entity;
>  					reinit_completion(&entity->entity_idle);
> @@ -240,7 +246,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  
>  	list_for_each_entry(entity, &rq->entities, list) {
>  
> -		if (drm_sched_entity_is_ready(entity)) {
> +		if (drm_sched_entity_is_ready(entity) &&
> +		    drm_sched_can_queue(sched, entity)) {
>  			if (dequeue) {
>  				rq->current_entity = entity;
>  				reinit_completion(&entity->entity_idle);
> @@ -261,13 +268,15 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>  /**
>   * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>   *
> + * @sched: the gpu scheduler
>   * @rq: scheduler run queue to check.
>   * @dequeue: dequeue selected entity
>   *
>   * Find oldest waiting ready entity, returns NULL if none found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
> +drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_rq *rq, bool dequeue)
>  {
>  	struct rb_node *rb;
>  
> @@ -276,7 +285,8 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>  		struct drm_sched_entity *entity;
>  
>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
> -		if (drm_sched_entity_is_ready(entity)) {
> +		if (drm_sched_entity_is_ready(entity) &&
> +		    drm_sched_can_queue(sched, entity)) {
>  			if (dequeue) {
>  				rq->current_entity = entity;
>  				reinit_completion(&entity->entity_idle);
> @@ -300,15 +310,27 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>  }
>  
>  /**
> - * drm_sched_can_queue -- Can we queue more to the hardware?
> + * drm_sched_can_queue - can we queue more jobs?

This change doesn't seem necessary. "can we queue more jobs?" is,
while very puritan, very vague and unclear. Where are the jobs
queued to? What are they being queued for? Please leave this one
alone, or if you're determined to change this, say something like:

	Can we queue more jobs for execution?

Also do NOT remove capitalization--leave it a proper sentence.

(Yes, they may be going to a firmware scheduler running in the hardware,
but from our point of view, they cross over the PCIe horizon to a different
domain, and we call that which lies beyond the PCIe horizon, "hardware".)


>   * @sched: scheduler instance
> + * @entity: the scheduler entity
>   *
> - * Return true if we can push more jobs to the hw, otherwise false.
> + * Return true if we can push at least one more job from @entity, false
> + * otherwise.
>   */
> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_entity *entity)
>  {
> -	return atomic_read(&sched->hw_rq_count) <
> -		sched->hw_submission_limit;
> +	struct drm_sched_job *s_job;
> +
> +	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
> +	if (!s_job)
> +		return false;
> +
> +	WARN_ON(s_job->submission_units > sched->submission_limit);
> +
> +	return (sched->submission_limit -
> +		atomic_read(&sched->submission_count)) >=
> +		s_job->submission_units;
>  }
>  
>  /**
> @@ -325,12 +347,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>  	struct drm_sched_entity *entity;
>  	int i;
>  
> -	if (!drm_sched_can_queue(sched))
> -		return NULL;
> -
>  	if (sched->single_entity) {
>  		if (!READ_ONCE(sched->single_entity->stopped) &&
> -		    drm_sched_entity_is_ready(sched->single_entity))
> +		    drm_sched_entity_is_ready(sched->single_entity) &&
> +		    drm_sched_can_queue(sched, sched->single_entity))
>  			return sched->single_entity;

I believe the fact that we're running on a work queue protects the scheduler
count, right? That's good.

>  
>  		return NULL;
> @@ -339,9 +359,11 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>  	/* Kernel run queue has higher priority than normal run queue*/
>  	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>  		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
> +			drm_sched_rq_select_entity_fifo(sched,
> +							&sched->sched_rq[i],
>  							dequeue) :
> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
> +			drm_sched_rq_select_entity_rr(sched,
> +						      &sched->sched_rq[i],
>  						      dequeue);
>  		if (entity)
>  			break;
> @@ -399,7 +421,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>  	struct drm_sched_fence *s_fence = s_job->s_fence;
>  	struct drm_gpu_scheduler *sched = s_fence->sched;
>  
> -	atomic_dec(&sched->hw_rq_count);
> +	atomic_sub(s_job->submission_units, &sched->submission_count);
>  	atomic_dec(sched->score);
>  
>  	trace_drm_sched_process_job(s_fence);
> @@ -622,7 +644,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>  					      &s_job->cb)) {
>  			dma_fence_put(s_job->s_fence->parent);
>  			s_job->s_fence->parent = NULL;
> -			atomic_dec(&sched->hw_rq_count);
> +			atomic_sub(s_job->submission_units,
> +				   &sched->submission_count);
>  		} else {
>  			/*
>  			 * remove job from pending_list.
> @@ -683,7 +706,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>  	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>  		struct dma_fence *fence = s_job->s_fence->parent;
>  
> -		atomic_inc(&sched->hw_rq_count);
> +		atomic_add(s_job->submission_units, &sched->submission_count);
>  
>  		if (!full_recovery)
>  			continue;
> @@ -764,6 +787,8 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>   * drm_sched_job_init - init a scheduler job
>   * @job: scheduler job to init
>   * @entity: scheduler entity to use
> + * @submission_units: the amount of units this job contributes to the schdulers
> + * submission limit


"amount" is for uncountable entities, like milk, or water. Please use "number",
also fix "schedulers", e.g.,

    * @submission_units: the number of units this job contributes to the scheduler's
    *       submission limit

>   * @owner: job owner for debugging
>   *
>   * Refer to drm_sched_entity_push_job() documentation
> @@ -781,6 +806,7 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>   */
>  int drm_sched_job_init(struct drm_sched_job *job,
>  		       struct drm_sched_entity *entity,
> +		       u32 submission_units,
>  		       void *owner)
>  {
>  	if (!entity->rq && !entity->single_sched)
> @@ -792,6 +818,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  		return -ENOMEM;
>  
>  	INIT_LIST_HEAD(&job->list);
> +	job->submission_units = submission_units ? submission_units : 1;
>  
>  	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
>  
> @@ -1004,12 +1031,14 @@ EXPORT_SYMBOL(drm_sched_job_cleanup);
>  /**
>   * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>   * @sched: scheduler instance
> + * @entity: the scheduler entity
>   *
>   * Wake up the scheduler if we can queue jobs.
>   */
> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
> +				   struct drm_sched_entity *entity)
>  {
> -	if (drm_sched_can_queue(sched))
> +	if (drm_sched_can_queue(sched, entity))
>  		drm_sched_run_job_queue(sched);
>  }
>  
> @@ -1147,7 +1176,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>  
>  	s_fence = sched_job->s_fence;
>  
> -	atomic_inc(&sched->hw_rq_count);
> +	atomic_add(sched_job->submission_units, &sched->submission_count);
>  	drm_sched_job_begin(sched_job);
>  
>  	trace_drm_run_job(sched_job, entity);
> @@ -1183,7 +1212,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>   * @ops: backend operations for this scheduler
>   * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
>   *	       allocated and used
> - * @hw_submission: number of hw submissions that can be in flight
> + * @max_submission_units: number of submission units that can be in flight
>   * @hang_limit: number of times to allow a job to hang before dropping it
>   * @timeout: timeout value in jiffies for the scheduler
>   * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
> @@ -1198,7 +1227,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>  int drm_sched_init(struct drm_gpu_scheduler *sched,
>  		   const struct drm_sched_backend_ops *ops,
>  		   struct workqueue_struct *submit_wq,
> -		   unsigned hw_submission, unsigned hang_limit,
> +		   unsigned max_submission_units, unsigned hang_limit,
>  		   long timeout, struct workqueue_struct *timeout_wq,
>  		   atomic_t *score, const char *name,
>  		   enum drm_sched_policy sched_policy,
> @@ -1211,7 +1240,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  
>  	sched->ops = ops;
>  	sched->single_entity = NULL;
> -	sched->hw_submission_limit = hw_submission;
> +	sched->submission_limit = max_submission_units;
>  	sched->name = name;
>  	if (!submit_wq) {
>  		sched->submit_wq = alloc_ordered_workqueue(name, 0);
> @@ -1238,7 +1267,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  	init_waitqueue_head(&sched->job_scheduled);
>  	INIT_LIST_HEAD(&sched->pending_list);
>  	spin_lock_init(&sched->job_list_lock);
> -	atomic_set(&sched->hw_rq_count, 0);
> +	atomic_set(&sched->submission_count, 0);
>  	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>  	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>  	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
> index 2e94ce788c71..8479e5302f7b 100644
> --- a/drivers/gpu/drm/v3d/v3d_gem.c
> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
> @@ -417,7 +417,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
>  	job->free = free;
>  
>  	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
> -				 v3d_priv);
> +				 1, v3d_priv);
>  	if (ret)
>  		goto fail;
>  
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 27f5778bbd6d..89b0aecd02e3 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -329,6 +329,8 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   * @sched: the scheduler instance on which this job is scheduled.
>   * @s_fence: contains the fences for the scheduling of job.
>   * @finish_cb: the callback for the finished fence.
> + * @submission_units: the amount of submission units this job contributes to
> + *                    the scheduler

"The _number_ of submission units ..."

"The amount of milk and sugar necessary to make a 3 lbs cake is ..."

>   * @work: Helper to reschdeule job kill to different context.
>   * @id: a unique id assigned to each job scheduled on the scheduler.
>   * @karma: increment on every hang caused by this job. If this exceeds the hang
> @@ -348,6 +350,8 @@ struct drm_sched_job {
>  	struct drm_gpu_scheduler	*sched;
>  	struct drm_sched_fence		*s_fence;
>  
> +	u32				submission_units;
> +
>  	/*
>  	 * work is used only after finish_cb has been used and will not be
>  	 * accessed anymore.
> @@ -478,14 +482,14 @@ struct drm_sched_backend_ops {
>   *
>   * @ops: backend operations provided by the driver.
>   * @single_entity: Single entity for the scheduler
> - * @hw_submission_limit: the max size of the hardware queue.
> + * @submission_limit: the maximim amount of submission units

"the maximum number of submission units"

> + * @submission_count: the number current amount of submission units in flight

"the number current amount of" --> "the number of submission units in flight",
you don't need "current" as it is implied by the English language--it's redundant.


>   * @timeout: the time after which a job is removed from the scheduler.
>   * @name: name of the ring for which this scheduler is being used.
>   * @sched_rq: priority wise array of run queues.
>   * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>   *                 waits on this wait queue until all the scheduled jobs are
>   *                 finished.
> - * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
>   * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>   * @timeout_wq: workqueue used to queue @work_tdr
> @@ -511,12 +515,12 @@ struct drm_sched_backend_ops {
>  struct drm_gpu_scheduler {
>  	const struct drm_sched_backend_ops	*ops;
>  	struct drm_sched_entity		*single_entity;
> -	uint32_t			hw_submission_limit;
> +	u32				submission_limit;
> +	atomic_t			submission_count;

That's good.

So, this is a good patch. Please fix the issues annotated above, and repost it
for an R-B. Thanks! :-)
Luben Tuikov Sept. 26, 2023, 11:48 p.m. UTC | #5
Hi,

Please also CC me on the whole set, as opposed to just one patch of the set,
and please do so in the future as well.

Thanks!
Danilo Krummrich Sept. 27, 2023, 12:13 a.m. UTC | #6
On 9/26/23 22:43, Luben Tuikov wrote:
> Hi,
> 
> On 2023-09-24 18:43, Danilo Krummrich wrote:
>> Currently, job flow control is implemented simply by limiting the amount
>> of jobs in flight. Therefore, a scheduler is initialized with a
>> submission limit that corresponds to a certain amount of jobs.
> 
> "certain"? How about this instead:
> " ... that corresponds to the number of jobs which can be sent
>    to the hardware."?
> 
>>
>> This implies that for each job drivers need to account for the maximum
>                                  ^,
> Please add a comma after "job".
> 
>> job size possible in order to not overflow the ring buffer.
> 
> Well, different hardware designs would implement this differently.
> Ideally, you only want pointers into the ring buffer, and then
> the hardware consumes as much as it can. But this is a moot point
> and it's always a good idea to have a "job size" hint from the client.
> So this is a good patch.
> 
> Ideally, you want to say that the hardware needs to be able to
> accommodate the number of jobs which can fit in the hardware
> queue times the largest job. This is a waste of resources
> however, and it is better to give a hint as to the size of a job,
> by the client. If the hardware can peek and understand dependencies,
> on top of knowing the "size of the job", it can be an extremely
> efficient scheduler.
> 
>>
>> However, there are drivers, such as Nouveau, where the job size has a
>> rather large range. For such drivers it can easily happen that job
>> submissions not even filling the ring by 1% can block subsequent
>> submissions, which, in the worst case, can lead to the ring run dry.
>>
>> In order to overcome this issue, allow for tracking the actual job size
>> instead of the amount job jobs. Therefore, add a field to track a job's
> 
> "the amount job jobs." --> "the number of jobs."

Yeah, I somehow manage to always get this wrong, which I guess you noticed
below already.

Those are all good points below - I'm going to address them.

Did you see Boris' response regarding a separate callback in order to fetch
the job's submission units dynamically? Since this is needed by PowerVR, I'd
like to include this in V2. What's your take on that?

My only concern with that would be that, if I got what Boris was saying
correctly, calling

WARN_ON(s_job->submission_units > sched->submission_limit);

from drm_sched_can_queue() wouldn't work anymore, since this could indeed happen
temporarily. I think this was also Christian's concern.

- Danilo

> 
>> submission units, which represents the amount of units a job contributes
>> to the scheduler's submission limit.
> 
> Yeah, that's a good thing.
> 
> I suppose that drivers which don't support this, would just use "1" to achieve
> the same behaviour as before.
> 
>>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>> This patch is based on Matt's scheduler work [1].
>>
>> [1] https://lore.kernel.org/dri-devel/20230919050155.2647172-1-matthew.brost@intel.com/
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  2 +-
>>   drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  2 +-
>>   drivers/gpu/drm/lima/lima_sched.c             |  2 +-
>>   drivers/gpu/drm/msm/msm_gem_submit.c          |  2 +-
>>   drivers/gpu/drm/nouveau/nouveau_sched.c       |  2 +-
>>   drivers/gpu/drm/panfrost/panfrost_drv.c       |  2 +-
>>   .../gpu/drm/scheduler/gpu_scheduler_trace.h   |  2 +-
>>   drivers/gpu/drm/scheduler/sched_entity.c      |  5 +-
>>   drivers/gpu/drm/scheduler/sched_main.c        | 81 +++++++++++++------
>>   drivers/gpu/drm/v3d/v3d_gem.c                 |  2 +-
>>   include/drm/gpu_scheduler.h                   | 18 +++--
>>   11 files changed, 78 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 78476bc75b4e..d54daaf64bf1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -115,7 +115,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   	if (!entity)
>>   		return 0;
>>   
>> -	return drm_sched_job_init(&(*job)->base, entity, owner);
>> +	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
>>   }
>>   
>>   int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> index 45403ea38906..74a446711207 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
>> @@ -538,7 +538,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   
>>   	ret = drm_sched_job_init(&submit->sched_job,
>>   				 &ctx->sched_entity[args->pipe],
>> -				 submit->ctx);
>> +				 1, submit->ctx);
>>   	if (ret)
>>   		goto err_submit_put;
>>   
>> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
>> index 50c2075228aa..5dc6678e1eb9 100644
>> --- a/drivers/gpu/drm/lima/lima_sched.c
>> +++ b/drivers/gpu/drm/lima/lima_sched.c
>> @@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
>>   	for (i = 0; i < num_bos; i++)
>>   		drm_gem_object_get(&bos[i]->base.base);
>>   
>> -	err = drm_sched_job_init(&task->base, &context->base, vm);
>> +	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
>>   	if (err) {
>>   		kfree(task->bos);
>>   		return err;
>> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
>> index 3f1aa4de3b87..6d230c38e4f5 100644
>> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
>> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
>> @@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
>>   		return ERR_PTR(ret);
>>   	}
>>   
>> -	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
>> +	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
>>   	if (ret) {
>>   		kfree(submit->hw_fence);
>>   		kfree(submit);
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
>> index f26a814a9920..e991426d86e4 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
>> @@ -89,7 +89,7 @@ nouveau_job_init(struct nouveau_job *job,
>>   
>>   	}
>>   
>> -	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
>> +	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
>>   	if (ret)
>>   		goto err_free_chains;
>>   
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
>> index a2ab99698ca8..d5e777deee5c 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
>> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
>> @@ -272,7 +272,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
>>   
>>   	ret = drm_sched_job_init(&job->base,
>>   				 &file_priv->sched_entity[slot],
>> -				 NULL);
>> +				 1, NULL);
>>   	if (ret)
>>   		goto out_put_job;
>>   
>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> index 3143ecaaff86..2e4ffdecc5dc 100644
>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
>> @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
>>   			   __assign_str(name, sched_job->sched->name);
>>   			   __entry->job_count = spsc_queue_count(&entity->job_queue);
>>   			   __entry->hw_job_count = atomic_read(
>> -				   &sched_job->sched->hw_rq_count);
>> +				   &sched_job->sched->submission_count);
>>   			   ),
>>   	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
>>   		      __entry->entity, __entry->id,
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 437c50867c99..6395090d5784 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -401,7 +401,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
>>   		container_of(cb, struct drm_sched_entity, cb);
>>   
>>   	drm_sched_entity_clear_dep(f, cb);
>> -	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity));
>> +	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity),
>> +				      entity);
>>   }
>>   
>>   /**
>> @@ -645,7 +646,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>   		if (fifo)
>>   			drm_sched_rq_update_fifo(entity, submit_ts);
>>   
>> -		drm_sched_wakeup_if_can_queue(sched);
>> +		drm_sched_wakeup_if_can_queue(sched, entity);
>>   	}
>>   }
>>   EXPORT_SYMBOL(drm_sched_entity_push_job);
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 88ef8be2d3c7..857622dd842e 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -93,6 +93,9 @@ int drm_sched_policy_default = DRM_SCHED_POLICY_FIFO;
>>   MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
>>   module_param_named(sched_policy, drm_sched_policy_default, int, 0444);
>>   
>> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_entity *entity);
>> +
>>   static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
>>   							    const struct rb_node *b)
>>   {
>> @@ -212,13 +215,15 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>   /**
>>    * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>    *
>> + * @sched: the gpu scheduler
>>    * @rq: scheduler run queue to check.
>>    * @dequeue: dequeue selected entity
>>    *
>>    * Try to find a ready entity, returns NULL if none found.
>>    */
>>   static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>> +drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
>> +			      struct drm_sched_rq *rq, bool dequeue)
>>   {
>>   	struct drm_sched_entity *entity;
>>   
>> @@ -227,7 +232,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   	entity = rq->current_entity;
>>   	if (entity) {
>>   		list_for_each_entry_continue(entity, &rq->entities, list) {
>> -			if (drm_sched_entity_is_ready(entity)) {
>> +			if (drm_sched_entity_is_ready(entity) &&
>> +			    drm_sched_can_queue(sched, entity)) {
>>   				if (dequeue) {
>>   					rq->current_entity = entity;
>>   					reinit_completion(&entity->entity_idle);
>> @@ -240,7 +246,8 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   
>>   	list_for_each_entry(entity, &rq->entities, list) {
>>   
>> -		if (drm_sched_entity_is_ready(entity)) {
>> +		if (drm_sched_entity_is_ready(entity) &&
>> +		    drm_sched_can_queue(sched, entity)) {
>>   			if (dequeue) {
>>   				rq->current_entity = entity;
>>   				reinit_completion(&entity->entity_idle);
>> @@ -261,13 +268,15 @@ drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
>>   /**
>>    * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>    *
>> + * @sched: the gpu scheduler
>>    * @rq: scheduler run queue to check.
>>    * @dequeue: dequeue selected entity
>>    *
>>    * Find oldest waiting ready entity, returns NULL if none found.
>>    */
>>   static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>> +drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_rq *rq, bool dequeue)
>>   {
>>   	struct rb_node *rb;
>>   
>> @@ -276,7 +285,8 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
>>   		struct drm_sched_entity *entity;
>>   
>>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>> -		if (drm_sched_entity_is_ready(entity)) {
>> +		if (drm_sched_entity_is_ready(entity) &&
>> +		    drm_sched_can_queue(sched, entity)) {
>>   			if (dequeue) {
>>   				rq->current_entity = entity;
>>   				reinit_completion(&entity->entity_idle);
>> @@ -300,15 +310,27 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
>>   }
>>   
>>   /**
>> - * drm_sched_can_queue -- Can we queue more to the hardware?
>> + * drm_sched_can_queue - can we queue more jobs?
> 
> This change doesn't seem necessary. "can we queue more jobs?" is,
> while very puritan, very vague and unclear. Where are the jobs
> queue to? What are they being queued for? Please leave this one
> alone, or if you're determined to change this, say something like:
> 
> 	Can we queue more jobs for execution?
> 
> Also do NOT remove capitalization--leave it a proper sentence.
> 
> (Yes, they may be going to a firmware scheduler running in the hardware,
> but from our point of view, they cross over the PCIe horizon to a different
> domain, and we call that which lies beyond the PCIe horizon, "hardware".)
> 
> 
>>    * @sched: scheduler instance
>> + * @entity: the scheduler entity
>>    *
>> - * Return true if we can push more jobs to the hw, otherwise false.
>> + * Return true if we can push at least one more job from @entity, false
>> + * otherwise.
>>    */
>> -static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
>> +static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_entity *entity)
>>   {
>> -	return atomic_read(&sched->hw_rq_count) <
>> -		sched->hw_submission_limit;
>> +	struct drm_sched_job *s_job;
>> +
>> +	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>> +	if (!s_job)
>> +		return false;
>> +
>> +	WARN_ON(s_job->submission_units > sched->submission_limit);
>> +
>> +	return (sched->submission_limit -
>> +		atomic_read(&sched->submission_count)) >=
>> +		s_job->submission_units;
>>   }
>>   
>>   /**
>> @@ -325,12 +347,10 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>   	struct drm_sched_entity *entity;
>>   	int i;
>>   
>> -	if (!drm_sched_can_queue(sched))
>> -		return NULL;
>> -
>>   	if (sched->single_entity) {
>>   		if (!READ_ONCE(sched->single_entity->stopped) &&
>> -		    drm_sched_entity_is_ready(sched->single_entity))
>> +		    drm_sched_entity_is_ready(sched->single_entity) &&
>> +		    drm_sched_can_queue(sched, sched->single_entity))
>>   			return sched->single_entity;
> 
> I believe the fact that we're running on a work queue protects the scheduler
> count, right? That's good.
> 
>>   
>>   		return NULL;
>> @@ -339,9 +359,11 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
>>   	/* Kernel run queue has higher priority than normal run queue*/
>>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>   		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
>> -			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
>> +			drm_sched_rq_select_entity_fifo(sched,
>> +							&sched->sched_rq[i],
>>   							dequeue) :
>> -			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
>> +			drm_sched_rq_select_entity_rr(sched,
>> +						      &sched->sched_rq[i],
>>   						      dequeue);
>>   		if (entity)
>>   			break;
>> @@ -399,7 +421,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
>>   	struct drm_sched_fence *s_fence = s_job->s_fence;
>>   	struct drm_gpu_scheduler *sched = s_fence->sched;
>>   
>> -	atomic_dec(&sched->hw_rq_count);
>> +	atomic_sub(s_job->submission_units, &sched->submission_count);
>>   	atomic_dec(sched->score);
>>   
>>   	trace_drm_sched_process_job(s_fence);
>> @@ -622,7 +644,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>   					      &s_job->cb)) {
>>   			dma_fence_put(s_job->s_fence->parent);
>>   			s_job->s_fence->parent = NULL;
>> -			atomic_dec(&sched->hw_rq_count);
>> +			atomic_sub(s_job->submission_units,
>> +				   &sched->submission_count);
>>   		} else {
>>   			/*
>>   			 * remove job from pending_list.
>> @@ -683,7 +706,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>   	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>>   		struct dma_fence *fence = s_job->s_fence->parent;
>>   
>> -		atomic_inc(&sched->hw_rq_count);
>> +		atomic_add(s_job->submission_units, &sched->submission_count);
>>   
>>   		if (!full_recovery)
>>   			continue;
>> @@ -764,6 +787,8 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>    * drm_sched_job_init - init a scheduler job
>>    * @job: scheduler job to init
>>    * @entity: scheduler entity to use
>> + * @submission_units: the amount of units this job contributes to the schdulers
>> + * submission limit
> 
> 
> "amount" is for uncountable entities, like milk, or water. Please use "number",
> also fix "schedulers", e.g.,
> 
>      * @submission_units: the number of units this job contributes to the schedulers'
>      *       submission limit
> 
>>    * @owner: job owner for debugging
>>    *
>>    * Refer to drm_sched_entity_push_job() documentation
>> @@ -781,6 +806,7 @@ EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>>    */
>>   int drm_sched_job_init(struct drm_sched_job *job,
>>   		       struct drm_sched_entity *entity,
>> +		       u32 submission_units,
>>   		       void *owner)
>>   {
>>   	if (!entity->rq && !entity->single_sched)
>> @@ -792,6 +818,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>>   		return -ENOMEM;
>>   
>>   	INIT_LIST_HEAD(&job->list);
>> +	job->submission_units = submission_units ? submission_units : 1;
>>   
>>   	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
>>   
>> @@ -1004,12 +1031,14 @@ EXPORT_SYMBOL(drm_sched_job_cleanup);
>>   /**
>>    * drm_sched_wakeup_if_can_queue - Wake up the scheduler
>>    * @sched: scheduler instance
>> + * @entity: the scheduler entity
>>    *
>>    * Wake up the scheduler if we can queue jobs.
>>    */
>> -void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
>> +void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
>> +				   struct drm_sched_entity *entity)
>>   {
>> -	if (drm_sched_can_queue(sched))
>> +	if (drm_sched_can_queue(sched, entity))
>>   		drm_sched_run_job_queue(sched);
>>   }
>>   
>> @@ -1147,7 +1176,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>   
>>   	s_fence = sched_job->s_fence;
>>   
>> -	atomic_inc(&sched->hw_rq_count);
>> +	atomic_add(sched_job->submission_units, &sched->submission_count);
>>   	drm_sched_job_begin(sched_job);
>>   
>>   	trace_drm_run_job(sched_job, entity);
>> @@ -1183,7 +1212,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>    * @ops: backend operations for this scheduler
>>    * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
>>    *	       allocated and used
>> - * @hw_submission: number of hw submissions that can be in flight
>> + * @max_submission_units: number of submission units that can be in flight
>>    * @hang_limit: number of times to allow a job to hang before dropping it
>>    * @timeout: timeout value in jiffies for the scheduler
>>    * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
>> @@ -1198,7 +1227,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
>>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   		   const struct drm_sched_backend_ops *ops,
>>   		   struct workqueue_struct *submit_wq,
>> -		   unsigned hw_submission, unsigned hang_limit,
>> +		   unsigned max_submission_units, unsigned hang_limit,
>>   		   long timeout, struct workqueue_struct *timeout_wq,
>>   		   atomic_t *score, const char *name,
>>   		   enum drm_sched_policy sched_policy,
>> @@ -1211,7 +1240,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   
>>   	sched->ops = ops;
>>   	sched->single_entity = NULL;
>> -	sched->hw_submission_limit = hw_submission;
>> +	sched->submission_limit = max_submission_units;
>>   	sched->name = name;
>>   	if (!submit_wq) {
>>   		sched->submit_wq = alloc_ordered_workqueue(name, 0);
>> @@ -1238,7 +1267,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   	init_waitqueue_head(&sched->job_scheduled);
>>   	INIT_LIST_HEAD(&sched->pending_list);
>>   	spin_lock_init(&sched->job_list_lock);
>> -	atomic_set(&sched->hw_rq_count, 0);
>> +	atomic_set(&sched->submission_count, 0);
>>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>   	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
>>   	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
>> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
>> index 2e94ce788c71..8479e5302f7b 100644
>> --- a/drivers/gpu/drm/v3d/v3d_gem.c
>> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
>> @@ -417,7 +417,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
>>   	job->free = free;
>>   
>>   	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
>> -				 v3d_priv);
>> +				 1, v3d_priv);
>>   	if (ret)
>>   		goto fail;
>>   
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 27f5778bbd6d..89b0aecd02e3 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -329,6 +329,8 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>    * @sched: the scheduler instance on which this job is scheduled.
>>    * @s_fence: contains the fences for the scheduling of job.
>>    * @finish_cb: the callback for the finished fence.
>> + * @submission_units: the amount of submission units this job contributes to
>> + *                    the scheduler
> 
> "The _number_ of submission units ..."
> 
> "The amount of milk and sugar necessary to make a 3 lbs cake is ..."
> 
>>    * @work: Helper to reschdeule job kill to different context.
>>    * @id: a unique id assigned to each job scheduled on the scheduler.
>>    * @karma: increment on every hang caused by this job. If this exceeds the hang
>> @@ -348,6 +350,8 @@ struct drm_sched_job {
>>   	struct drm_gpu_scheduler	*sched;
>>   	struct drm_sched_fence		*s_fence;
>>   
>> +	u32				submission_units;
>> +
>>   	/*
>>   	 * work is used only after finish_cb has been used and will not be
>>   	 * accessed anymore.
>> @@ -478,14 +482,14 @@ struct drm_sched_backend_ops {
>>    *
>>    * @ops: backend operations provided by the driver.
>>    * @single_entity: Single entity for the scheduler
>> - * @hw_submission_limit: the max size of the hardware queue.
>> + * @submission_limit: the maximim amount of submission units
> 
> "the maximum number of submission units"
> 
>> + * @submission_count: the number current amount of submission units in flight
> 
> "the number current amount of" --> "the number of submission units in flight",
> you don't need "current" as it is implied by the English language--it's redundant.
> 
> 
>>    * @timeout: the time after which a job is removed from the scheduler.
>>    * @name: name of the ring for which this scheduler is being used.
>>    * @sched_rq: priority wise array of run queues.
>>    * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>>    *                 waits on this wait queue until all the scheduled jobs are
>>    *                 finished.
>> - * @hw_rq_count: the number of jobs currently in the hardware queue.
>>    * @job_id_count: used to assign unique id to the each job.
>>    * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
>>    * @timeout_wq: workqueue used to queue @work_tdr
>> @@ -511,12 +515,12 @@ struct drm_sched_backend_ops {
>>   struct drm_gpu_scheduler {
>>   	const struct drm_sched_backend_ops	*ops;
>>   	struct drm_sched_entity		*single_entity;
>> -	uint32_t			hw_submission_limit;
>> +	u32				submission_limit;
>> +	atomic_t			submission_count;
> 
> That's good.
> 
> So, this is a good patch. Please fix the annotated above, and repost it
> for an R-B. Thanks! :-)
Danilo Krummrich Sept. 27, 2023, 12:18 a.m. UTC | #7
Hi,

On 9/27/23 01:48, Luben Tuikov wrote:
> Hi,
> 
> Please also CC me to the whole set, as opposed to just one patch of the set.
> And so in the future.

There is no series. I created a series in the first place, but finally decided to
send this one and a few driver patches separately. However, I just accidentally
sent out the wrong one still containing the "1/3" addition.

- Danilo

> 
> Thanks!
Luben Tuikov Sept. 27, 2023, 1:55 a.m. UTC | #8
Hi,

On 2023-09-26 20:13, Danilo Krummrich wrote:
> On 9/26/23 22:43, Luben Tuikov wrote:
>> Hi,
>>
>> On 2023-09-24 18:43, Danilo Krummrich wrote:
>>> Currently, job flow control is implemented simply by limiting the amount
>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>> submission limit that corresponds to a certain amount of jobs.
>>
>> "certain"? How about this instead:
>> " ... that corresponds to the number of jobs which can be sent
>>    to the hardware."?
>>
>>>
>>> This implies that for each job drivers need to account for the maximum
>>                                  ^,
>> Please add a comma after "job".
>>
>>> job size possible in order to not overflow the ring buffer.
>>
>> Well, different hardware designs would implement this differently.
>> Ideally, you only want pointers into the ring buffer, and then
>> the hardware consumes as much as it can. But this is a moot point
>> and it's always a good idea to have a "job size" hint from the client.
>> So this is a good patch.
>>
>> Ideally, you want to say that the hardware needs to be able to
>> accommodate the number of jobs which can fit in the hardware
>> queue times the largest job. This is a waste of resources
>> however, and it is better to give a hint as to the size of a job,
>> by the client. If the hardware can peek and understand dependencies,
>> on top of knowing the "size of the job", it can be an extremely
>> efficient scheduler.
>>
>>>
>>> However, there are drivers, such as Nouveau, where the job size has a
>>> rather large range. For such drivers it can easily happen that job
>>> submissions not even filling the ring by 1% can block subsequent
>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>
>>> In order to overcome this issue, allow for tracking the actual job size
>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>
>> "the amount job jobs." --> "the number of jobs."
> 
> Yeah, I somehow manage to always get this wrong, which I guess you noticed
> below already.
> 
> That's all good points below - gonna address them.

Forgot to mention, title tweak,
 "implement dynamic job flow control" --> "implement dynamic flow job control"
would perhaps be better? Unless that was meant to be "job-flow control"?

> Did you see Boris' response regarding a separate callback in order to fetch
> the job's submission units dynamically? Since this is needed by PowerVR, I'd
> like to include this in V2. What's your take on that?

Both of them have good, valid points. The whole point is to guarantee forward
progress, and to be able to easily debug a stuck driver. With a fence
in prepare-job, it would be easy to see that the fence never triggered, but
if we replace this with dynamic job credits, then debugging would be hard,
as Christian pointed out.

> My only concern with that would be that if I got what Boris was saying
> correctly calling
> 
> WARN_ON(s_job->submission_units > sched->submission_limit);
> 
> from drm_sched_can_queue() wouldn't work anymore, since this could indeed happen
> temporarily. I think this was also Christian's concern.

Indeed. We don't want hardware/drivers to game the scheduler, but want to
guarantee forward progress, e.g. a job with N number of credits completes,
and those credits are added to the available credits, and then a new,
smaller or bigger job is accepted for execution (prepare-job, run-job, etc.).

Feel free to rename "units" to "credits" as this is what is used in
hardware and link protocols, and naturally this is what I'm used to
calling such mechanisms.

I say, implement it, post the patch, and we'll take a look. It's a good thing
and we should definitely develop it.

Thanks!
Boris Brezillon Sept. 27, 2023, 7:25 a.m. UTC | #9
On Wed, 27 Sep 2023 02:13:59 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> On 9/26/23 22:43, Luben Tuikov wrote:
> > Hi,
> > 
> > On 2023-09-24 18:43, Danilo Krummrich wrote:  
> >> Currently, job flow control is implemented simply by limiting the amount
> >> of jobs in flight. Therefore, a scheduler is initialized with a
> >> submission limit that corresponds to a certain amount of jobs.  
> > 
> > "certain"? How about this instead:
> > " ... that corresponds to the number of jobs which can be sent
> >    to the hardware."?
> >   
> >>
> >> This implies that for each job drivers need to account for the maximum  
> >                                  ^,
> > Please add a comma after "job".
> >   
> >> job size possible in order to not overflow the ring buffer.  
> > 
> > Well, different hardware designs would implement this differently.
> > Ideally, you only want pointers into the ring buffer, and then
> > the hardware consumes as much as it can. But this is a moot point
> > and it's always a good idea to have a "job size" hint from the client.
> > So this is a good patch.
> > 
> > Ideally, you want to say that the hardware needs to be able to
> > accommodate the number of jobs which can fit in the hardware
> > queue times the largest job. This is a waste of resources
> > however, and it is better to give a hint as to the size of a job,
> > by the client. If the hardware can peek and understand dependencies,
> > on top of knowing the "size of the job", it can be an extremely
> > efficient scheduler.
> >   
> >>
> >> However, there are drivers, such as Nouveau, where the job size has a
> >> rather large range. For such drivers it can easily happen that job
> >> submissions not even filling the ring by 1% can block subsequent
> >> submissions, which, in the worst case, can lead to the ring run dry.
> >>
> >> In order to overcome this issue, allow for tracking the actual job size
> >> instead of the amount job jobs. Therefore, add a field to track a job's  
> > 
> > "the amount job jobs." --> "the number of jobs."  
> 
> Yeah, I somehow manage to always get this wrong, which I guess you noticed
> below already.
> 
> That's all good points below - gonna address them.
> 
> Did you see Boris' response regarding a separate callback in order to fetch
> the job's submission units dynamically? Since this is needed by PowerVR, I'd
> like to include this in V2. What's your take on that?
> 
> My only concern with that would be that if I got what Boris was saying
> correctly calling
> 
> WARN_ON(s_job->submission_units > sched->submission_limit);
> 
> from drm_sched_can_queue() wouldn't work anymore, since this could indeed happen
> temporarily. I think this was also Christian's concern.

Actually, I think it's fine to account for the max job size in the
first check; we're unlikely to have so many native fence waits that our
job can't fit in an empty ring buffer.
Danilo Krummrich Sept. 27, 2023, 11:45 a.m. UTC | #10
On 9/27/23 09:25, Boris Brezillon wrote:
> On Wed, 27 Sep 2023 02:13:59 +0200
> Danilo Krummrich <dakr@redhat.com> wrote:
> 
>> On 9/26/23 22:43, Luben Tuikov wrote:
>>> Hi,
>>>
>>> On 2023-09-24 18:43, Danilo Krummrich wrote:
>>>> Currently, job flow control is implemented simply by limiting the amount
>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>> submission limit that corresponds to a certain amount of jobs.
>>>
>>> "certain"? How about this instead:
>>> " ... that corresponds to the number of jobs which can be sent
>>>     to the hardware."?
>>>    
>>>>
>>>> This implies that for each job drivers need to account for the maximum
>>>                                   ^,
>>> Please add a comma after "job".
>>>    
>>>> job size possible in order to not overflow the ring buffer.
>>>
>>> Well, different hardware designs would implement this differently.
>>> Ideally, you only want pointers into the ring buffer, and then
>>> the hardware consumes as much as it can. But this is a moot point
>>> and it's always a good idea to have a "job size" hint from the client.
>>> So this is a good patch.
>>>
>>> Ideally, you want to say that the hardware needs to be able to
>>> accommodate the number of jobs which can fit in the hardware
>>> queue times the largest job. This is a waste of resources
>>> however, and it is better to give a hint as to the size of a job,
>>> by the client. If the hardware can peek and understand dependencies,
>>> on top of knowing the "size of the job", it can be an extremely
>>> efficient scheduler.
>>>    
>>>>
>>>> However, there are drivers, such as Nouveau, where the job size has a
>>>> rather large range. For such drivers it can easily happen that job
>>>> submissions not even filling the ring by 1% can block subsequent
>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>
>>>> In order to overcome this issue, allow for tracking the actual job size
>>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>>
>>> "the amount job jobs." --> "the number of jobs."
>>
>> Yeah, I somehow manage to always get this wrong, which I guess you noticed
>> below already.
>>
>> That's all good points below - gonna address them.
>>
>> Did you see Boris' response regarding a separate callback in order to fetch
>> the job's submission units dynamically? Since this is needed by PowerVR, I'd
>> like to include this in V2. What's your take on that?
>>
>> My only concern with that would be that if I got what Boris was saying
>> correctly calling
>>
>> WARN_ON(s_job->submission_units > sched->submission_limit);
>>
>> from drm_sched_can_queue() wouldn't work anymore, since this could indeed happen
>> temporarily. I think this was also Christian's concern.
> 
> Actually, I think that's fine to account for the max job size in the
> first check, we're unlikely to have so many native fence waits that our
> job can't fit in an empty ring buffer.
> 

But it can happen, right? Hence, we can't have this check, can we?
Christian König Sept. 27, 2023, 11:54 a.m. UTC | #11
Am 26.09.23 um 09:11 schrieb Boris Brezillon:
> On Mon, 25 Sep 2023 19:55:21 +0200
> Christian König <christian.koenig@amd.com> wrote:
>
>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
>>> +The imagination team, who's probably interested too.
>>>
>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>   
>>>> Currently, job flow control is implemented simply by limiting the amount
>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>> submission limit that corresponds to a certain amount of jobs.
>>>>
>>>> This implies that for each job drivers need to account for the maximum
>>>> job size possible in order to not overflow the ring buffer.
>>>>
>>>> However, there are drivers, such as Nouveau, where the job size has a
>>>> rather large range. For such drivers it can easily happen that job
>>>> submissions not even filling the ring by 1% can block subsequent
>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>
>>>> In order to overcome this issue, allow for tracking the actual job size
>>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>>> submission units, which represents the amount of units a job contributes
>>>> to the scheduler's submission limit.
>>> As mentioned earlier, this might allow some simplifications in the
>>> PowerVR driver where we do flow-control using a dma_fence returned
>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>> dynamically query the size of a job (a new hook?), instead of having the
>>> size fixed at creation time, because PVR jobs embed native fence waits,
>>> and the number of native fences will decrease if some of these fences
>>> are signalled before ->run_job() is called, thus reducing the job size.
>> Exactly that is a little bit questionable since it allows for the device
>> to postpone jobs infinitely.
>>
>> It would be good if the scheduler is able to validate if it's ever able
>> to run the job when it is pushed into the entity.
> Yes, we do that already. We check that the immutable part of the job
> (everything that's not a native fence wait) fits in the ringbuf.

Yeah, but thinking more about it there might be really bad side effects. 
We shouldn't use a callback nor job credits because it might badly 
influence fairness between entities.

In other words when one entity submits always large jobs and another one 
always small ones then the scheduler would prefer the one which submits 
the smaller ones because they are easier to fit into the ring buffer.

What we can do is the following:
1. The scheduler has some initial credits it can use to push jobs.
2. Each scheduler fence (and *not* the job) has a credits field of how 
much it will use.
3. After letting a job run, the credits of its fence are subtracted 
from the available credits of the scheduler.
4. The scheduler can keep running jobs as long as it has a positive 
credit count.
5. When the credit count becomes negative it goes to sleep until a 
scheduler fence signals and the count becomes positive again.

This way jobs are handled equally; you can still push jobs up to at 
least half your ring buffer size and you should be able to handle your 
PowerVR case by calculating the credits you actually used in your 
run_job() callback.

As far as I can see that approach should work, shouldn't it?

Regards,
Christian.
Boris Brezillon Sept. 27, 2023, 11:54 a.m. UTC | #12
On Wed, 27 Sep 2023 13:45:37 +0200
Danilo Krummrich <dakr@redhat.com> wrote:

> On 9/27/23 09:25, Boris Brezillon wrote:
> > On Wed, 27 Sep 2023 02:13:59 +0200
> > Danilo Krummrich <dakr@redhat.com> wrote:
> >   
> >> On 9/26/23 22:43, Luben Tuikov wrote:  
> >>> Hi,
> >>>
> >>> On 2023-09-24 18:43, Danilo Krummrich wrote:  
> >>>> Currently, job flow control is implemented simply by limiting the amount
> >>>> of jobs in flight. Therefore, a scheduler is initialized with a
> >>>> submission limit that corresponds to a certain amount of jobs.  
> >>>
> >>> "certain"? How about this instead:
> >>> " ... that corresponds to the number of jobs which can be sent
> >>>     to the hardware."?
> >>>      
> >>>>
> >>>> This implies that for each job drivers need to account for the maximum  
> >>>                                   ^,
> >>> Please add a comma after "job".
> >>>      
> >>>> job size possible in order to not overflow the ring buffer.  
> >>>
> >>> Well, different hardware designs would implement this differently.
> >>> Ideally, you only want pointers into the ring buffer, and then
> >>> the hardware consumes as much as it can. But this is a moot point
> >>> and it's always a good idea to have a "job size" hint from the client.
> >>> So this is a good patch.
> >>>
> >>> Ideally, you want to say that the hardware needs to be able to
> >>> accommodate the number of jobs which can fit in the hardware
> >>> queue times the largest job. This is a waste of resources
> >>> however, and it is better to give a hint as to the size of a job,
> >>> by the client. If the hardware can peek and understand dependencies,
> >>> on top of knowing the "size of the job", it can be an extremely
> >>> efficient scheduler.
> >>>      
> >>>>
> >>>> However, there are drivers, such as Nouveau, where the job size has a
> >>>> rather large range. For such drivers it can easily happen that job
> >>>> submissions not even filling the ring by 1% can block subsequent
> >>>> submissions, which, in the worst case, can lead to the ring run dry.
> >>>>
> >>>> In order to overcome this issue, allow for tracking the actual job size
> >>>> instead of the amount job jobs. Therefore, add a field to track a job's  
> >>>
> >>> "the amount job jobs." --> "the number of jobs."  
> >>
> >> Yeah, I somehow manage to always get this wrong, which I guess you noticed
> >> below already.
> >>
> >> That's all good points below - gonna address them.
> >>
> >> Did you see Boris' response regarding a separate callback in order to fetch
> >> the job's submission units dynamically? Since this is needed by PowerVR, I'd
> >> like to include this in V2. What's your take on that?
> >>
> >> My only concern with that would be that if I got what Boris was saying
> >> correctly calling
> >>
> >> WARN_ON(s_job->submission_units > sched->submission_limit);
> >>
> >> from drm_sched_can_queue() wouldn't work anymore, since this could indeed happen
> >> temporarily. I think this was also Christian's concern.  
> > 
> > Actually, I think that's fine to account for the max job size in the
> > first check, we're unlikely to have so many native fence waits that our
> > job can't fit in an empty ring buffer.
> >   
> 
> But it can happen, right? Hence, we can't have this check, do we?
> 

In theory, yes; in practice, given the size of the ring buffers, and the
size of a fence wait command, I guess we can refuse to queue a job (at
the driver level) if the maximum job size (static + maximum dynamic
part of the job) doesn't fit in the ring buffer. If that ever becomes a
problem, we can revisit it at that point.
Danilo Krummrich Sept. 27, 2023, 12:11 p.m. UTC | #13
On 9/27/23 13:54, Christian König wrote:
> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
>> On Mon, 25 Sep 2023 19:55:21 +0200
>> Christian König <christian.koenig@amd.com> wrote:
>>
>>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
>>>> +The imagination team, who's probably interested too.
>>>>
>>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>>> Currently, job flow control is implemented simply by limiting the amount
>>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>>> submission limit that corresponds to a certain amount of jobs.
>>>>>
>>>>> This implies that for each job drivers need to account for the maximum
>>>>> job size possible in order to not overflow the ring buffer.
>>>>>
>>>>> However, there are drivers, such as Nouveau, where the job size has a
>>>>> rather large range. For such drivers it can easily happen that job
>>>>> submissions not even filling the ring by 1% can block subsequent
>>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>>
>>>>> In order to overcome this issue, allow for tracking the actual job size
>>>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>>>> submission units, which represents the amount of units a job contributes
>>>>> to the scheduler's submission limit.
>>>> As mentioned earlier, this might allow some simplifications in the
>>>> PowerVR driver where we do flow-control using a dma_fence returned
>>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>>> dynamically query the size of a job (a new hook?), instead of having the
>>>> size fixed at creation time, because PVR jobs embed native fence waits,
>>>> and the number of native fences will decrease if some of these fences
>>>> are signalled before ->run_job() is called, thus reducing the job size.
>>> Exactly that is a little bit questionable since it allows for the device
>>> to postpone jobs infinitely.
>>>
>>> It would be good if the scheduler is able to validate if it's ever able
>>> to run the job when it is pushed into the entity.
>> Yes, we do that already. We check that the immutable part of the job
>> (everything that's not a native fence wait) fits in the ringbuf.
> 
> Yeah, but thinking more about it there might be really bad side effects. We shouldn't use a callback nor job credits because it might badly influence fairness between entities.
> 
> In other words when one entity submits always large jobs and another one always small ones then the scheduler would prefer the one which submits the smaller ones because they are easier to fit into the ring buffer.

That's true. Admittedly, I mostly had DRM_SCHED_POLICY_SINGLE_ENTITY in mind,
where obviously we just have a single entity.

> 
> What we can do is the follow:
> 1. The scheduler has some initial credits it can use to push jobs.
> 2. Each scheduler fence (and *not* the job) has a credits field of how much it will use.
> 3. After letting a a job run the credits of it's fence are subtracted from the available credits of the scheduler.
> 4. The scheduler can keep running jobs as long as it has a positive credit count.
> 5. When the credit count becomes negative it goes to sleep until a scheduler fence signals and the count becomes positive again.

Wouldn't it be possible that we overflow the ring with that or at least end up in
a busy wait in run_job()? What if the remaining credit count is greater than 0 but
smaller than the number of credits the next picked job requires?

> 
> This way jobs are handled equally, you can still push jobs up to at least halve your ring buffer size and you should be able to handle your PowerVR case by calculating the credits you actually used in your run_job() callback.
> 
> As far as I can see that approach should work, shouldn't it?
> 
> Regards,
> Christian.
>
Christian König Sept. 27, 2023, 12:15 p.m. UTC | #14
Am 27.09.23 um 14:11 schrieb Danilo Krummrich:
> On 9/27/23 13:54, Christian König wrote:
>> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
>>> On Mon, 25 Sep 2023 19:55:21 +0200
>>> Christian König <christian.koenig@amd.com> wrote:
>>>
>>>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
>>>>> +The imagination team, who's probably interested too.
>>>>>
>>>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>>>> Currently, job flow control is implemented simply by limiting the 
>>>>>> amount
>>>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>>>> submission limit that corresponds to a certain amount of jobs.
>>>>>>
>>>>>> This implies that for each job drivers need to account for the 
>>>>>> maximum
>>>>>> job size possible in order to not overflow the ring buffer.
>>>>>>
>>>>>> However, there are drivers, such as Nouveau, where the job size 
>>>>>> has a
>>>>>> rather large range. For such drivers it can easily happen that job
>>>>>> submissions not even filling the ring by 1% can block subsequent
>>>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>>>
>>>>>> In order to overcome this issue, allow for tracking the actual 
>>>>>> job size
>>>>>> instead of the amount job jobs. Therefore, add a field to track a 
>>>>>> job's
>>>>>> submission units, which represents the amount of units a job 
>>>>>> contributes
>>>>>> to the scheduler's submission limit.
>>>>> As mentioned earlier, this might allow some simplifications in the
>>>>> PowerVR driver where we do flow-control using a dma_fence returned
>>>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>>>> dynamically query the size of a job (a new hook?), instead of 
>>>>> having the
>>>>> size fixed at creation time, because PVR jobs embed native fence 
>>>>> waits,
>>>>> and the number of native fences will decrease if some of these fences
>>>>> are signalled before ->run_job() is called, thus reducing the job 
>>>>> size.
>>>> Exactly that is a little bit questionable since it allows for the 
>>>> device
>>>> to postpone jobs infinitely.
>>>>
>>>> It would be good if the scheduler is able to validate if it's ever 
>>>> able
>>>> to run the job when it is pushed into the entity.
>>> Yes, we do that already. We check that the immutable part of the job
>>> (everything that's not a native fence wait) fits in the ringbuf.
>>
>> Yeah, but thinking more about it there might be really bad side 
>> effects. We shouldn't use a callback nor job credits because it might 
>> badly influence fairness between entities.
>>
>> In other words when one entity submits always large jobs and another 
>> one always small ones then the scheduler would prefer the one which 
>> submits the smaller ones because they are easier to fit into the ring 
>> buffer.
>
> That's true. Admittedly I mostly had DRM_SCHED_POLICY_SINGLE_ENTITY​ 
> in mind
> where obviously we just have a single entity.

I also only stumbled over it after thinking Boris' suggestions through. 
That problem really wasn't obvious.

>
>>
>> What we can do is the follow:
>> 1. The scheduler has some initial credits it can use to push jobs.
>> 2. Each scheduler fence (and *not* the job) has a credits field of 
>> how much it will use.
>> 3. After letting a a job run the credits of it's fence are subtracted 
>> from the available credits of the scheduler.
>> 4. The scheduler can keep running jobs as long as it has a positive 
>> credit count.
>> 5. When the credit count becomes negative it goes to sleep until a 
>> scheduler fence signals and the count becomes positive again.
>
> Wouldn't it be possible that we overflow the ring with that or at 
> least end up in
> a busy wait in run_job()? What if the remaining credit count is 
> greater than 0 but
> smaller than the number of credits the next picked job requires?

The initial credits the scheduler gets should be only half the ring size.

So as long as that is positive you should have enough space even for the 
biggest jobs.

We should still have a warning if userspace tries to push something 
bigger into an entity.

Regards,
Christian.

>
>>
>> This way jobs are handled equally, you can still push jobs up to at 
>> least halve your ring buffer size and you should be able to handle 
>> your PowerVR case by calculating the credits you actually used in 
>> your run_job() callback.
>>
>> As far as I can see that approach should work, shouldn't it?
>>
>> Regards,
>> Christian.
>>
>
Luben Tuikov Sept. 27, 2023, 12:41 p.m. UTC | #15
On 2023-09-27 08:15, Christian König wrote:
> Am 27.09.23 um 14:11 schrieb Danilo Krummrich:
>> On 9/27/23 13:54, Christian König wrote:
>>> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
>>>> On Mon, 25 Sep 2023 19:55:21 +0200
>>>> Christian König <christian.koenig@amd.com> wrote:
>>>>
>>>>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
>>>>>> +The imagination team, who's probably interested too.
>>>>>>
>>>>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>>>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>>>>> Currently, job flow control is implemented simply by limiting the 
>>>>>>> amount
>>>>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>>>>> submission limit that corresponds to a certain amount of jobs.
>>>>>>>
>>>>>>> This implies that for each job drivers need to account for the 
>>>>>>> maximum
>>>>>>> job size possible in order to not overflow the ring buffer.
>>>>>>>
>>>>>>> However, there are drivers, such as Nouveau, where the job size 
>>>>>>> has a
>>>>>>> rather large range. For such drivers it can easily happen that job
>>>>>>> submissions not even filling the ring by 1% can block subsequent
>>>>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>>>>
>>>>>>> In order to overcome this issue, allow for tracking the actual 
>>>>>>> job size
>>>>>>> instead of the amount job jobs. Therefore, add a field to track a 
>>>>>>> job's
>>>>>>> submission units, which represents the amount of units a job 
>>>>>>> contributes
>>>>>>> to the scheduler's submission limit.
>>>>>> As mentioned earlier, this might allow some simplifications in the
>>>>>> PowerVR driver where we do flow-control using a dma_fence returned
>>>>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>>>>> dynamically query the size of a job (a new hook?), instead of 
>>>>>> having the
>>>>>> size fixed at creation time, because PVR jobs embed native fence 
>>>>>> waits,
>>>>>> and the number of native fences will decrease if some of these fences
>>>>>> are signalled before ->run_job() is called, thus reducing the job 
>>>>>> size.
>>>>> Exactly that is a little bit questionable since it allows for the 
>>>>> device
>>>>> to postpone jobs infinitely.
>>>>>
>>>>> It would be good if the scheduler is able to validate if it's ever 
>>>>> able
>>>>> to run the job when it is pushed into the entity.
>>>> Yes, we do that already. We check that the immutable part of the job
>>>> (everything that's not a native fence wait) fits in the ringbuf.
>>>
>>> Yeah, but thinking more about it there might be really bad side 
>>> effects. We shouldn't use a callback nor job credits because it might 
>>> badly influence fairness between entities.

I feel the entity and job selection algorithm should not be influenced by
how many credits are available, but by the respective scheduling policy,
RR, FIFO, single-entity (job-FIFO).

Credits only tell us, "Can the hardware take on the next selected
job _at this time_?"

As long as,
1. the next selected job doesn't exceed the maximum job credit the hardware
   is capable of processing, and,
2. pending jobs are being completed by the hardware and returned back to
   the kernel, and their credits are added to the available credits,
we can guarantee forward progress, i.e. that the next selected job
would, in a finite time, execute and complete.

>>> In other words when one entity submits always large jobs and another 
>>> one always small ones then the scheduler would prefer the one which 
>>> submits the smaller ones because they are easier to fit into the ring 
>>> buffer.

I think we should keep credit availability separate from scheduling. DRM
scheduling is based on factors which are "above the hardware", e.g. how long
an entity has been held off, how long a job therein has been waiting, etc.,
while "job credits" merely tell us if the next selected job, whichever one
it is, can be taken by the hardware _at this time_.

So, credits tell us something about the hardware. Scheduling tells
us something about entities, jobs, wait times, etc. I feel the DRM
scheduler should stay away from mixing credits and scheduling, since
that wouldn't be portable across different hardware/drivers we support,
and would make the code unnecessarily complicated and prone to bugs,
timeouts, possibly deadlocks, etc.

>>
>> That's true. Admittedly I mostly had DRM_SCHED_POLICY_SINGLE_ENTITY​ 
>> in mind
>> where obviously we just have a single entity.
> 
> I also only stumbled over it after thinking Boris suggestions through. 
> That problem really wasn't obvious.
> 
>>
>>>
>>> What we can do is the follow:
>>> 1. The scheduler has some initial credits it can use to push jobs.
>>> 2. Each scheduler fence (and *not* the job) has a credits field of 
>>> how much it will use.
>>> 3. After letting a a job run the credits of it's fence are subtracted 
>>> from the available credits of the scheduler.
>>> 4. The scheduler can keep running jobs as long as it has a positive 
>>> credit count.
>>> 5. When the credit count becomes negative it goes to sleep until a 
>>> scheduler fence signals and the count becomes positive again.
>>
>> Wouldn't it be possible that we overflow the ring with that or at 
>> least end up in
>> a busy wait in run_job()? What if the remaining credit count is 
>> greater than 0 but
>> smaller than the number of credits the next picked job requires?

That's fine and this is _exactly_ the purpose of job credits--you want
to know if you can push the next job _now_ or if you should wait until
some pending jobs in hardware finish executing in order to free resources
in the hardware, so that you _can_ push the next picked job down to
the hardware.

> 
> The initial credits the scheduler gets should be only halve the ring size.
> 
> So as long as that is positive you should have enough space even for the 
> biggest jobs.
> 
> We should still have a warning if userspace tries to push something 
> bigger into an entity.

Right. Generally, if userspace tries to push a job which exceeds the capability
of the hardware, as measured by job credits which are set by the hardware,
then emit a warning.
Danilo Krummrich Sept. 27, 2023, 2:12 p.m. UTC | #16
On 9/27/23 14:15, Christian König wrote:
> Am 27.09.23 um 14:11 schrieb Danilo Krummrich:
>> On 9/27/23 13:54, Christian König wrote:
>>> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
>>>> On Mon, 25 Sep 2023 19:55:21 +0200
>>>> Christian König <christian.koenig@amd.com> wrote:
>>>>
>>>>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:
>>>>>> +The imagination team, who's probably interested too.
>>>>>>
>>>>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>>>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>>>>> Currently, job flow control is implemented simply by limiting the amount
>>>>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>>>>> submission limit that corresponds to a certain amount of jobs.
>>>>>>>
>>>>>>> This implies that for each job drivers need to account for the maximum
>>>>>>> job size possible in order to not overflow the ring buffer.
>>>>>>>
>>>>>>> However, there are drivers, such as Nouveau, where the job size has a
>>>>>>> rather large range. For such drivers it can easily happen that job
>>>>>>> submissions not even filling the ring by 1% can block subsequent
>>>>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>>>>
>>>>>>> In order to overcome this issue, allow for tracking the actual job size
>>>>>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>>>>>> submission units, which represents the amount of units a job contributes
>>>>>>> to the scheduler's submission limit.
>>>>>> As mentioned earlier, this might allow some simplifications in the
>>>>>> PowerVR driver where we do flow-control using a dma_fence returned
>>>>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>>>>> dynamically query the size of a job (a new hook?), instead of having the
>>>>>> size fixed at creation time, because PVR jobs embed native fence waits,
>>>>>> and the number of native fences will decrease if some of these fences
>>>>>> are signalled before ->run_job() is called, thus reducing the job size.
>>>>> Exactly that is a little bit questionable since it allows for the device
>>>>> to postpone jobs infinitely.
>>>>>
>>>>> It would be good if the scheduler is able to validate if it's ever able
>>>>> to run the job when it is pushed into the entity.
>>>> Yes, we do that already. We check that the immutable part of the job
>>>> (everything that's not a native fence wait) fits in the ringbuf.
>>>
>>> Yeah, but thinking more about it there might be really bad side effects. We shouldn't use a callback nor job credits because it might badly influence fairness between entities.
>>>
>>> In other words when one entity submits always large jobs and another one always small ones then the scheduler would prefer the one which submits the smaller ones because they are easier to fit into the ring buffer.
>>
>> That's true. Admittedly I mostly had DRM_SCHED_POLICY_SINGLE_ENTITY in mind
>> where obviously we just have a single entity.
> 
> I also only stumbled over it after thinking Boris suggestions through. That problem really wasn't obvious.
> 
>>
>>>
>>> What we can do is the follow:
>>> 1. The scheduler has some initial credits it can use to push jobs.
>>> 2. Each scheduler fence (and *not* the job) has a credits field of how much it will use.
>>> 3. After letting a a job run the credits of it's fence are subtracted from the available credits of the scheduler.
>>> 4. The scheduler can keep running jobs as long as it has a positive credit count.
>>> 5. When the credit count becomes negative it goes to sleep until a scheduler fence signals and the count becomes positive again.
>>
>> Wouldn't it be possible that we overflow the ring with that or at least end up in
>> a busy wait in run_job()? What if the remaining credit count is greater than 0 but
>> smaller than the number of credits the next picked job requires?
> 
> The initial credits the scheduler gets should be only halve the ring size.

Ok, with that premise it works. I'd be fine with that, although this means that as soon
as we hit RING_SIZE / 2 + 1 credits we don't push more stuff to the ring even if it would
actually fit.

> So as long as that is positive you should have enough space even for the biggest jobs.
> 
> We should still have a warning if userspace tries to push something bigger into an entity.

Well, if the driver thinks it's fine and it won't exceed the capacity once it hits run_job()
it's probably fine thinking of PowerVR. However, we can have a warning when run_job() returns
with more credits than we can handle.

> 
> Regards,
> Christian.
> 
>>
>>>
>>> This way jobs are handled equally, you can still push jobs up to at least halve your ring buffer size and you should be able to handle your PowerVR case by calculating the credits you actually used in your run_job() callback.
>>>
>>> As far as I can see that approach should work, shouldn't it?
>>>
>>> Regards,
>>> Christian.
>>>
>>
>
Boris Brezillon Sept. 28, 2023, 8:02 a.m. UTC | #17
On Wed, 27 Sep 2023 13:54:38 +0200
Christian König <christian.koenig@amd.com> wrote:

> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
> > On Mon, 25 Sep 2023 19:55:21 +0200
> > Christian König <christian.koenig@amd.com> wrote:
> >  
> >> Am 25.09.23 um 14:55 schrieb Boris Brezillon:  
> >>> +The imagination team, who's probably interested too.
> >>>
> >>> On Mon, 25 Sep 2023 00:43:06 +0200
> >>> Danilo Krummrich <dakr@redhat.com> wrote:
> >>>     
> >>>> Currently, job flow control is implemented simply by limiting the amount
> >>>> of jobs in flight. Therefore, a scheduler is initialized with a
> >>>> submission limit that corresponds to a certain amount of jobs.
> >>>>
> >>>> This implies that for each job drivers need to account for the maximum
> >>>> job size possible in order to not overflow the ring buffer.
> >>>>
> >>>> However, there are drivers, such as Nouveau, where the job size has a
> >>>> rather large range. For such drivers it can easily happen that job
> >>>> submissions not even filling the ring by 1% can block subsequent
> >>>> submissions, which, in the worst case, can lead to the ring run dry.
> >>>>
> >>>> In order to overcome this issue, allow for tracking the actual job size
> >>>> instead of the amount job jobs. Therefore, add a field to track a job's
> >>>> submission units, which represents the amount of units a job contributes
> >>>> to the scheduler's submission limit.  
> >>> As mentioned earlier, this might allow some simplifications in the
> >>> PowerVR driver where we do flow-control using a dma_fence returned
> >>> through ->prepare_job(). The only thing that'd be missing is a way to
> >>> dynamically query the size of a job (a new hook?), instead of having the
> >>> size fixed at creation time, because PVR jobs embed native fence waits,
> >>> and the number of native fences will decrease if some of these fences
> >>> are signalled before ->run_job() is called, thus reducing the job size.  
> >> Exactly that is a little bit questionable since it allows for the device
> >> to postpone jobs infinitely.
> >>
> >> It would be good if the scheduler is able to validate if it's ever able
> >> to run the job when it is pushed into the entity.  
> > Yes, we do that already. We check that the immutable part of the job
> > (everything that's not a native fence wait) fits in the ringbuf.  
> 
> Yeah, but thinking more about it there might be really bad side effects. 
> We shouldn't use a callback nor job credits because it might badly 
> influence fairness between entities.
> 
> In other words when one entity submits always large jobs and another one 
> always small ones then the scheduler would prefer the one which submits 
> the smaller ones because they are easier to fit into the ring buffer.

Yeah, I was assuming SINGLE_ENTITY sched policy here. As soon as we
have a ring buffer that's shared by several entities it becomes tricky
to be fair if the job sizes are dynamic. In the multi-entity case, the
->prepare_job()+dma_fence approach addresses the problem, because the
first job to call ->prepare_job() and add its fence to the list of jobs
waiting for ringbuf space will also be the first one to be checked when
some space is freed, and if there's still not enough space, we won't
test other jobs coming after in the list.

> 
> What we can do is the follow:
> 1. The scheduler has some initial credits it can use to push jobs.
> 2. Each scheduler fence (and *not* the job) has a credits field of how 
> much it will use.

When are the credits assigned to the scheduler fence? As said earlier,
on PowerVR, we might start with N credits when the job is queued, and
(N - M) when it gets submitted, so we need a hook to force a
recalculation every time the scheduler is considering the job for
submission.

> 3. After letting a a job run the credits of it's fence are subtracted 
> from the available credits of the scheduler.

Uh, what happens if the job you picked makes the scheduler's
available credits drop below zero? I guess that's relying on the
fact you only expose half of the ring buffer capacity, thus enforcing
that a job is never bigger than half the ring buffer. The latter is
acceptable, but the fact your utilization is then half the maximum
capacity is not great IMHO.

> 4. The scheduler can keep running jobs as long as it has a positive 
> credit count.

Why not just check that 'next_job_credits < available_credits', and
force the scheduler to go to sleep if that's not the case. When it's
woken up because the parent fence of some previous job is signaled, we
re-evaluate the condition, and go back to sleep if we still don't have
enough credits. In the PowerVR case, I'd need a way to recalculate the
number of credits every time the condition is re-evaluated, but that's
just a matter of adding an optional hook to force the re-calculation.

> 5. When the credit count becomes negative it goes to sleep until a 
> scheduler fence signals and the count becomes positive again.
> 
> This way jobs are handled equally, you can still push jobs up to at 
> least halve your ring buffer size

I think that's the aspect I'm not fond of. I don't see why we'd want to
keep half of the ring buffer unused. I mean, there might be good
reasons to do so, if, for instance, the same ring buffer is used for
some high-priority commands sent by the kernel or something like that.
But it looks like a driver-specific decision to not fully use the ring
buffer.

> and you should be able to handle your 
> PowerVR case by calculating the credits you actually used in your 
> run_job() callback.

Hm, ideally the credits adjustment should happen every time the
scheduler is considering a job for submission (every time it got
unblocked because available credits got increased), otherwise you might
wait longer than strictly needed if some native fences got signaled in
the meantime.
Luben Tuikov Sept. 28, 2023, 2:44 p.m. UTC | #18
On 2023-09-28 04:02, Boris Brezillon wrote:
> On Wed, 27 Sep 2023 13:54:38 +0200
> Christian König <christian.koenig@amd.com> wrote:
> 
>> Am 26.09.23 um 09:11 schrieb Boris Brezillon:
>>> On Mon, 25 Sep 2023 19:55:21 +0200
>>> Christian König <christian.koenig@amd.com> wrote:
>>>  
>>>> Am 25.09.23 um 14:55 schrieb Boris Brezillon:  
>>>>> +The imagination team, who's probably interested too.
>>>>>
>>>>> On Mon, 25 Sep 2023 00:43:06 +0200
>>>>> Danilo Krummrich <dakr@redhat.com> wrote:
>>>>>     
>>>>>> Currently, job flow control is implemented simply by limiting the amount
>>>>>> of jobs in flight. Therefore, a scheduler is initialized with a
>>>>>> submission limit that corresponds to a certain amount of jobs.
>>>>>>
>>>>>> This implies that for each job drivers need to account for the maximum
>>>>>> job size possible in order to not overflow the ring buffer.
>>>>>>
>>>>>> However, there are drivers, such as Nouveau, where the job size has a
>>>>>> rather large range. For such drivers it can easily happen that job
>>>>>> submissions not even filling the ring by 1% can block subsequent
>>>>>> submissions, which, in the worst case, can lead to the ring run dry.
>>>>>>
>>>>>> In order to overcome this issue, allow for tracking the actual job size
>>>>>> instead of the amount job jobs. Therefore, add a field to track a job's
>>>>>> submission units, which represents the amount of units a job contributes
>>>>>> to the scheduler's submission limit.  
>>>>> As mentioned earlier, this might allow some simplifications in the
>>>>> PowerVR driver where we do flow-control using a dma_fence returned
>>>>> through ->prepare_job(). The only thing that'd be missing is a way to
>>>>> dynamically query the size of a job (a new hook?), instead of having the
>>>>> size fixed at creation time, because PVR jobs embed native fence waits,
>>>>> and the number of native fences will decrease if some of these fences
>>>>> are signalled before ->run_job() is called, thus reducing the job size.  
>>>> Exactly that is a little bit questionable since it allows for the device
>>>> to postpone jobs infinitely.
>>>>
>>>> It would be good if the scheduler is able to validate if it's ever able
>>>> to run the job when it is pushed into the entity.  
>>> Yes, we do that already. We check that the immutable part of the job
>>> (everything that's not a native fence wait) fits in the ringbuf.  
>>
>> Yeah, but thinking more about it there might be really bad side effects. 
>> We shouldn't use a callback nor job credits because it might badly 
>> influence fairness between entities.
>>
>> In other words when one entity submits always large jobs and another one 
>> always small ones then the scheduler would prefer the one which submits 
>> the smaller ones because they are easier to fit into the ring buffer.
> 
> Yeah, I was assuming SINGLE_ENTITY sched policy here. As soon as we

Right--it's a job-FIFO.

> have a ring buffer that's shared by several entities it becomes tricky
> to be fair if the job sizes are dynamic. In the multi-entity case, the

Right--for the job credit scheme to work, you need to use a job-FIFO
at the DRM scheduler level. (Once the job is received into the hardware,
the firmware/hardware may choose to reorder/parallelize execution of
several pending jobs, but that's beyond the scope of this thread.)

> ->prepare_job()+dma_fence approach addresses the problem, because the
> first job to call ->prepare_job() and add its fence to the list of jobs
> waiting for ringbuf space will also be the first one to be checked when
> some space is freed, and if there's still not enough space, we won't
> test other jobs coming after in the list.

Right--you shouldn't.

> 
>>
>> What we can do is the follow:
>> 1. The scheduler has some initial credits it can use to push jobs.
>> 2. Each scheduler fence (and *not* the job) has a credits field of how 
>> much it will use.
> 
> When are the credits assigned to the scheduler fence? As said earlier,
> on PowerVR, we might start with N credits when the job is queued, and
> (N - M) when it gets submitted, so we need a hook to force a
> recalculation every time the scheduler is considering the job for
> submission.

"Credits" is something the firmware/hardware engineers tell you. It's a
known fixed quantity at ASIC boot. It changes only as you submit jobs
into the hardware/firmware.

No hook, but rather a peek. You'd peek at the hardware to figure out
how many credits you have available to submit new jobs, or you'd keep
a running count of this quantity--depending on how the ASIC works.

When a job completes, you add its credits to the available credit
count (or you may ask the hardware how many are available now),
and add/reset that amount to the available count kept in the scheduler
struct (for instance). Then, if the next job to be pushed--which has been
known from the outset as we use a job-FIFO--is using less than or equal
number of credits than the available ones, then you push the job, and
subtract from the availability count (or, again, peek at the hardware
for that count).

> 
>> 3. After letting a a job run the credits of it's fence are subtracted 
>> from the available credits of the scheduler.
> 
> Uh, what happens if the job you picked make the scheduler
> available credits pass under zero? I guess that's relying on the
> fact you only expose half of the ring buffer capacity, thus enforcing
> that a job is never bigger than half the ring buffer. The latter is
> acceptable, but the fact your utilization is then half the maximum
> capacity is not great IMHO.

The credit count you keep should never go negative from the action of pushing
jobs to the hardware. If it did, it tells you the software design is not
consistent.

Hardware/firmware engineers will not appreciate the fact that only 1/2 credits
are being exposed due to poor software design principles, nor would
the sales team.

(See also message-id: 61c0d884-b8d4-4109-be75-23927b61cb52@amd.com.)

> 
>> 4. The scheduler can keep running jobs as long as it has a positive 
>> credit count.
> 
> Why not just check that 'next_job_credits < available_credits', and

Yes, see message-id: 61c0d884-b8d4-4109-be75-23927b61cb52@amd.com.

> force the scheduler to go to sleep if that's not the case. When it's
> woken up because the parent fence of some previous job is signaled, we

"pending job"

> re-evaluate the condition, and go back to sleep if we still don't have
> enough credits. In the PowerVR case, I'd need a way to recalculate the
> number of credits every time the condition is re-evaluated, but that's
> just a matter of adding an optional hook to force the re-calculation.

Right.

> 
>> 5. When the credit count becomes negative it goes to sleep until a 
>> scheduler fence signals and the count becomes positive again.
>>
>> This way jobs are handled equally, you can still push jobs up to at 
>> least halve your ring buffer size
> 
> I think that's the aspect I'm not fond of. I don't see why we'd want to
> keep half of the ring buffer unused. I mean, there might be good

We don't. We absolutely don't. Hardware engineers would absolutely
not appreciate this, and you shouldn't write the code to do that.

> reasons to do so, if, for instance, the same ring buffer is used for
> some high-priority commands sent by the kernel or something like that.

Ideally, you'd want a separate ring with its own credits for high-priority
jobs, since a high-priority job can be as large as the credit capacity,
which would force the code to insert it at the head of the FIFO. Anyway,
I digress. 

> But it looks like a driver-specific decision to not fully use the ring
> buffer.

The full potential of the hardware should be utilized at any point in time.

> 
>> and you should be able to handle your 
>> PowerVR case by calculating the credits you actually used in your 
>> run_job() callback.
> 
> Hm, ideally the credits adjustment should happen every time the
> scheduler is considering a job for submission (every time it got
> unblocked because available credits got increased), otherwise you might
> wait longer than strictly needed if some native fences got signaled in
> the meantime.

Ideally, at the time you're considering whether you can push a job to the hardware,
you should have the credit capacity ready--i.e. you should just read it
off a variable/register/etc., possibly atomically. "Calculating" anything
might induce delays, and future temptation to add more code to do more things
there, thus degrading design.

You'd calculate credit capacity when a pending job completes, i.e. returns
back to the scheduler from the hardware.
Boris Brezillon Sept. 28, 2023, 4:26 p.m. UTC | #19
On Thu, 28 Sep 2023 10:44:47 -0400
Luben Tuikov <luben.tuikov@amd.com> wrote:

> >>
> >> What we can do is the follow:
> >> 1. The scheduler has some initial credits it can use to push jobs.
> >> 2. Each scheduler fence (and *not* the job) has a credits field of how 
> >> much it will use.  
> > 
> > When are the credits assigned to the scheduler fence? As said earlier,
> > on PowerVR, we might start with N credits when the job is queued, and
> > (N - M) when it gets submitted, so we need a hook to force a
> > recalculation every time the scheduler is considering the job for
> > submission.  
> 
> "Credits" is something the firmware/hardware engineers tell you. It's a
> known fixed quantity at ASIC boot. It changes only as you submit jobs
> into the hardware/firmware.
> 
> No hook, but rather a peek. You'd peek at the hardware to figure out
> how many credits you have available to submit new jobs, or you'd keep
> a running count of this quantity--depending on how the ASIC works.

Knowing how many 'credits' are available is not really a problem, and I
don't need a hook for that. I need a hook to know how many 'credits' a
job will consume, because that's the part being dynamic in PowerVR
(signaled fences get evicted from the list of commands to issue when
the job is pushed to the ring buffer).

> 
> When a job completes, you add it's credits to the available credit
> count (or you may ask the hardware how many are available now),
> and add/reset that amount to the available count kept in the scheduler
> struct (for instance).

Again, I have no problem with the available_credits tracking. It's the
'how many credits do I need for this job?' that's dynamic.

> Then, if the next job to be pushed--which has been
> known from the outset as we use a job-FIFO--is using less than or equal
> number of credits than the available ones, then you push the job, and
> subtract from the availability count (or, again, peek at the hardware
> for that count).
> 
> >   
> >> 3. After letting a a job run the credits of it's fence are subtracted 
> >> from the available credits of the scheduler.  
> > 
> > Uh, what happens if the job you picked make the scheduler
> > available credits pass under zero? I guess that's relying on the
> > fact you only expose half of the ring buffer capacity, thus enforcing
> > that a job is never bigger than half the ring buffer. The latter is
> > acceptable, but the fact your utilization is then half the maximum
> > capacity is not great IMHO.  
> 
> The credit count you keep should never go negative from the action of pushing
> jobs to the hardware. If it did, it tells you the software design is not
> consistent.

AFAICT, that's what Christian suggested: exposing half the ring buffer
capacity so that any job can be queued even if it means making the
sched->available_credits value negative. In order to submit new jobs, we
need to wait for sched->available_credits to become positive again.

> 
> Hardware/firmware engineers will not appreciate the fact that only 1/2 credits
> are being exposed due to poor software design principles, nor would
> the sales team.

Exactly, and again, that's not something I suggested doing, I was
replying to Christian's suggestion. Maybe I just misunderstood what he
was suggesting.

> 
> (See also message-id: 61c0d884-b8d4-4109-be75-23927b61cb52@amd.com.)
> 
> >   
> >> 4. The scheduler can keep running jobs as long as it has a positive 
> >> credit count.  
> > 
> > Why not just check that 'next_job_credits < available_credits', and  
> 
> Yes, see message-id: 61c0d884-b8d4-4109-be75-23927b61cb52@amd.com.
> 
> > force the scheduler to go to sleep if that's not the case. When it's
> > woken up because the parent fence of some previous job is signaled, we  
> 
> "pending job"
> 
> > re-evaluate the condition, and go back to sleep if we still don't have
> > enough credits. In the PowerVR case, I'd need a way to recalculate the
> > number of credits every time the condition is re-evaluated, but that's
> > just a matter of adding an optional hook to force the re-calculation.  
> 
> Right.
> 
> >   
> >> 5. When the credit count becomes negative it goes to sleep until a 
> >> scheduler fence signals and the count becomes positive again.
> >>
> >> This way jobs are handled equally, you can still push jobs up to at 
> >> least halve your ring buffer size  
> > 
> > I think that's the aspect I'm not fond of. I don't see why we'd want to
> > keep half of the ring buffer unused. I mean, there might be good  
> 
> We don't. We absolutely don't. Hardware engineers would absolutely
> not appreciate this, and you shouldn't write the code to do that.

Again, that's my understanding of Christian's suggestion.

> 
> > reasons to do so, if, for instance, the same ring buffer is used for
> > some high-priority commands sent by the kernel or something like that.  
> 
> Ideally, you'd want a separate ring with its own credits for high-priority
> jobs, since a high-priority job can be as large as the credit capacity,
> which would force the code to insert it at the head of the FIFO. Anyway,
> I digress. 

Exactly.

> 
> > But it looks like a driver-specific decision to not fully use the ring
> > buffer.  
> 
> The full potential of the hardware should be utilized at any point in time.
> 
> >   
> >> and you should be able to handle your 
> >> PowerVR case by calculating the credits you actually used in your 
> >> run_job() callback.  
> > 
> > Hm, ideally the credits adjustment should happen every time the
> > scheduler is considering a job for submission (every time it got
> > unblocked because available credits got increased), otherwise you might
> > wait longer than strictly needed if some native fences got signaled in
> > the meantime.  
> 
> Ideally, at the time you're considering whether you can push a job to the hardware,
> you should have the credit capacity ready

I'm not after recalculating sched->available_credits here, but rather
job->required_credits.

I could recalculate job->required_credits when ->prepare_job() is
called (AKA, your job is almost ready for submission), but this value
might be slightly bigger than what's needed when the job finally gets
submitted (see below).


> --i.e. you should just read it
> off a variable/register/etc., possibly atomically. "Calculating" anything
> might induce delays, and future temptation to add more code to do more things
> there, thus degrading design.

Evicting signaled fences is not something we can do without a trigger,
so reading a variable won't help. I can recalculate the number of
credits needed for a job in the ->prepare_job() hook, but that means
credits will only be recalculated the first time the job is considered
for submission, not when the scheduler is woken up to re-evaluate
jobs waiting for ringbuf space. Given we're adding new infra to support
dynamic job sizes, I thought it'd be okay to add an optional hook to query
the job required_credits value (the only overhead for other drivers is
the 'if (hook_present)' test).
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..d54daaf64bf1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -115,7 +115,7 @@  int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (!entity)
 		return 0;
 
-	return drm_sched_job_init(&(*job)->base, entity, owner);
+	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
 }
 
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 45403ea38906..74a446711207 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -538,7 +538,7 @@  int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 
 	ret = drm_sched_job_init(&submit->sched_job,
 				 &ctx->sched_entity[args->pipe],
-				 submit->ctx);
+				 1, submit->ctx);
 	if (ret)
 		goto err_submit_put;
 
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index 50c2075228aa..5dc6678e1eb9 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -123,7 +123,7 @@  int lima_sched_task_init(struct lima_sched_task *task,
 	for (i = 0; i < num_bos; i++)
 		drm_gem_object_get(&bos[i]->base.base);
 
-	err = drm_sched_job_init(&task->base, &context->base, vm);
+	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
 	if (err) {
 		kfree(task->bos);
 		return err;
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 3f1aa4de3b87..6d230c38e4f5 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -48,7 +48,7 @@  static struct msm_gem_submit *submit_create(struct drm_device *dev,
 		return ERR_PTR(ret);
 	}
 
-	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
+	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
 	if (ret) {
 		kfree(submit->hw_fence);
 		kfree(submit);
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
index f26a814a9920..e991426d86e4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -89,7 +89,7 @@  nouveau_job_init(struct nouveau_job *job,
 
 	}
 
-	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
+	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
 	if (ret)
 		goto err_free_chains;
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index a2ab99698ca8..d5e777deee5c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -272,7 +272,7 @@  static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
 
 	ret = drm_sched_job_init(&job->base,
 				 &file_priv->sched_entity[slot],
-				 NULL);
+				 1, NULL);
 	if (ret)
 		goto out_put_job;
 
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
index 3143ecaaff86..2e4ffdecc5dc 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
@@ -51,7 +51,7 @@  DECLARE_EVENT_CLASS(drm_sched_job,
 			   __assign_str(name, sched_job->sched->name);
 			   __entry->job_count = spsc_queue_count(&entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
-				   &sched_job->sched->hw_rq_count);
+				   &sched_job->sched->submission_count);
 			   ),
 	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
 		      __entry->entity, __entry->id,
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 437c50867c99..6395090d5784 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -401,7 +401,8 @@  static void drm_sched_entity_wakeup(struct dma_fence *f,
 		container_of(cb, struct drm_sched_entity, cb);
 
 	drm_sched_entity_clear_dep(f, cb);
-	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity));
+	drm_sched_wakeup_if_can_queue(drm_sched_entity_to_scheduler(entity),
+				      entity);
 }
 
 /**
@@ -645,7 +646,7 @@  void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 		if (fifo)
 			drm_sched_rq_update_fifo(entity, submit_ts);
 
-		drm_sched_wakeup_if_can_queue(sched);
+		drm_sched_wakeup_if_can_queue(sched, entity);
 	}
 }
 EXPORT_SYMBOL(drm_sched_entity_push_job);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 88ef8be2d3c7..857622dd842e 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -93,6 +93,9 @@  int drm_sched_policy_default = DRM_SCHED_POLICY_FIFO;
 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
 module_param_named(sched_policy, drm_sched_policy_default, int, 0444);
 
+static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
+				struct drm_sched_entity *entity);
+
 static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
 							    const struct rb_node *b)
 {
@@ -212,13 +215,15 @@  void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 /**
  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
  *
+ * @sched: the gpu scheduler
  * @rq: scheduler run queue to check.
  * @dequeue: dequeue selected entity
  *
  * Try to find a ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
+drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
+			      struct drm_sched_rq *rq, bool dequeue)
 {
 	struct drm_sched_entity *entity;
 
@@ -227,7 +232,8 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
 	entity = rq->current_entity;
 	if (entity) {
 		list_for_each_entry_continue(entity, &rq->entities, list) {
-			if (drm_sched_entity_is_ready(entity)) {
+			if (drm_sched_entity_is_ready(entity) &&
+			    drm_sched_can_queue(sched, entity)) {
 				if (dequeue) {
 					rq->current_entity = entity;
 					reinit_completion(&entity->entity_idle);
@@ -240,7 +246,8 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
 
 	list_for_each_entry(entity, &rq->entities, list) {
 
-		if (drm_sched_entity_is_ready(entity)) {
+		if (drm_sched_entity_is_ready(entity) &&
+		    drm_sched_can_queue(sched, entity)) {
 			if (dequeue) {
 				rq->current_entity = entity;
 				reinit_completion(&entity->entity_idle);
@@ -261,13 +268,15 @@  drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq, bool dequeue)
 /**
  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
  *
+ * @sched: the gpu scheduler
  * @rq: scheduler run queue to check.
  * @dequeue: dequeue selected entity
  *
  * Find oldest waiting ready entity, returns NULL if none found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
+drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
+				struct drm_sched_rq *rq, bool dequeue)
 {
 	struct rb_node *rb;
 
@@ -276,7 +285,8 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq, bool dequeue)
 		struct drm_sched_entity *entity;
 
 		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
-		if (drm_sched_entity_is_ready(entity)) {
+		if (drm_sched_entity_is_ready(entity) &&
+		    drm_sched_can_queue(sched, entity)) {
 			if (dequeue) {
 				rq->current_entity = entity;
 				reinit_completion(&entity->entity_idle);
@@ -300,15 +310,27 @@  static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
 }
 
 /**
- * drm_sched_can_queue -- Can we queue more to the hardware?
+ * drm_sched_can_queue - can we queue more jobs?
  * @sched: scheduler instance
+ * @entity: the scheduler entity
  *
- * Return true if we can push more jobs to the hw, otherwise false.
+ * Return true if we can push at least one more job from @entity, false
+ * otherwise.
  */
-static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
+static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
+				struct drm_sched_entity *entity)
 {
-	return atomic_read(&sched->hw_rq_count) <
-		sched->hw_submission_limit;
+	struct drm_sched_job *s_job;
+
+	s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
+	if (!s_job)
+		return false;
+
+	WARN_ON(s_job->submission_units > sched->submission_limit);
+
+	return (sched->submission_limit -
+		atomic_read(&sched->submission_count)) >=
+		s_job->submission_units;
 }
 
 /**
@@ -325,12 +347,10 @@  drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
 	struct drm_sched_entity *entity;
 	int i;
 
-	if (!drm_sched_can_queue(sched))
-		return NULL;
-
 	if (sched->single_entity) {
 		if (!READ_ONCE(sched->single_entity->stopped) &&
-		    drm_sched_entity_is_ready(sched->single_entity))
+		    drm_sched_entity_is_ready(sched->single_entity) &&
+		    drm_sched_can_queue(sched, sched->single_entity))
 			return sched->single_entity;
 
 		return NULL;
@@ -339,9 +359,11 @@  drm_sched_select_entity(struct drm_gpu_scheduler *sched, bool dequeue)
 	/* Kernel run queue has higher priority than normal run queue*/
 	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
 		entity = sched->sched_policy == DRM_SCHED_POLICY_FIFO ?
-			drm_sched_rq_select_entity_fifo(&sched->sched_rq[i],
+			drm_sched_rq_select_entity_fifo(sched,
+							&sched->sched_rq[i],
 							dequeue) :
-			drm_sched_rq_select_entity_rr(&sched->sched_rq[i],
+			drm_sched_rq_select_entity_rr(sched,
+						      &sched->sched_rq[i],
 						      dequeue);
 		if (entity)
 			break;
@@ -399,7 +421,7 @@  static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
 	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
 
-	atomic_dec(&sched->hw_rq_count);
+	atomic_sub(s_job->submission_units, &sched->submission_count);
 	atomic_dec(sched->score);
 
 	trace_drm_sched_process_job(s_fence);
@@ -622,7 +644,8 @@  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 					      &s_job->cb)) {
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
-			atomic_dec(&sched->hw_rq_count);
+			atomic_sub(s_job->submission_units,
+				   &sched->submission_count);
 		} else {
 			/*
 			 * remove job from pending_list.
@@ -683,7 +706,7 @@  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 		struct dma_fence *fence = s_job->s_fence->parent;
 
-		atomic_inc(&sched->hw_rq_count);
+		atomic_add(s_job->submission_units, &sched->submission_count);
 
 		if (!full_recovery)
 			continue;
@@ -764,6 +787,8 @@  EXPORT_SYMBOL(drm_sched_resubmit_jobs);
  * drm_sched_job_init - init a scheduler job
  * @job: scheduler job to init
  * @entity: scheduler entity to use
+ * @submission_units: the amount of units this job contributes to the
+ * scheduler's submission limit
  * @owner: job owner for debugging
  *
  * Refer to drm_sched_entity_push_job() documentation
@@ -781,6 +806,7 @@  EXPORT_SYMBOL(drm_sched_resubmit_jobs);
  */
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
+		       u32 submission_units,
 		       void *owner)
 {
 	if (!entity->rq && !entity->single_sched)
@@ -792,6 +818,7 @@  int drm_sched_job_init(struct drm_sched_job *job,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&job->list);
+	job->submission_units = submission_units ? submission_units : 1;
 
 	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
 
@@ -1004,12 +1031,14 @@  EXPORT_SYMBOL(drm_sched_job_cleanup);
 /**
  * drm_sched_wakeup_if_can_queue - Wake up the scheduler
  * @sched: scheduler instance
+ * @entity: the scheduler entity
  *
  * Wake up the scheduler if we can queue jobs.
  */
-void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched)
+void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
+				   struct drm_sched_entity *entity)
 {
-	if (drm_sched_can_queue(sched))
+	if (drm_sched_can_queue(sched, entity))
 		drm_sched_run_job_queue(sched);
 }
 
@@ -1147,7 +1176,7 @@  static void drm_sched_run_job_work(struct work_struct *w)
 
 	s_fence = sched_job->s_fence;
 
-	atomic_inc(&sched->hw_rq_count);
+	atomic_add(sched_job->submission_units, &sched->submission_count);
 	drm_sched_job_begin(sched_job);
 
 	trace_drm_run_job(sched_job, entity);
@@ -1183,7 +1212,7 @@  static void drm_sched_run_job_work(struct work_struct *w)
  * @ops: backend operations for this scheduler
  * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
  *	       allocated and used
- * @hw_submission: number of hw submissions that can be in flight
+ * @max_submission_units: number of submission units that can be in flight
  * @hang_limit: number of times to allow a job to hang before dropping it
  * @timeout: timeout value in jiffies for the scheduler
  * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
@@ -1198,7 +1227,7 @@  static void drm_sched_run_job_work(struct work_struct *w)
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   struct workqueue_struct *submit_wq,
-		   unsigned hw_submission, unsigned hang_limit,
+		   unsigned max_submission_units, unsigned hang_limit,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name,
 		   enum drm_sched_policy sched_policy,
@@ -1211,7 +1240,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 
 	sched->ops = ops;
 	sched->single_entity = NULL;
-	sched->hw_submission_limit = hw_submission;
+	sched->submission_limit = max_submission_units;
 	sched->name = name;
 	if (!submit_wq) {
 		sched->submit_wq = alloc_ordered_workqueue(name, 0);
@@ -1238,7 +1267,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	init_waitqueue_head(&sched->job_scheduled);
 	INIT_LIST_HEAD(&sched->pending_list);
 	spin_lock_init(&sched->job_list_lock);
-	atomic_set(&sched->hw_rq_count, 0);
+	atomic_set(&sched->submission_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
 	INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 2e94ce788c71..8479e5302f7b 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -417,7 +417,7 @@  v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
 	job->free = free;
 
 	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
-				 v3d_priv);
+				 1, v3d_priv);
 	if (ret)
 		goto fail;
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 27f5778bbd6d..89b0aecd02e3 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -329,6 +329,8 @@  struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
+ * @submission_units: the amount of submission units this job contributes to
+ *                    the scheduler
  * @work: Helper to reschdeule job kill to different context.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
@@ -348,6 +350,8 @@  struct drm_sched_job {
 	struct drm_gpu_scheduler	*sched;
 	struct drm_sched_fence		*s_fence;
 
+	u32				submission_units;
+
 	/*
 	 * work is used only after finish_cb has been used and will not be
 	 * accessed anymore.
@@ -478,14 +482,14 @@  struct drm_sched_backend_ops {
  *
  * @ops: backend operations provided by the driver.
  * @single_entity: Single entity for the scheduler
- * @hw_submission_limit: the max size of the hardware queue.
+ * @submission_limit: the maximum amount of submission units
+ * @submission_count: the current amount of submission units in flight
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
  * @sched_rq: priority wise array of run queues.
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
  *                 waits on this wait queue until all the scheduled jobs are
  *                 finished.
- * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
  * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
@@ -511,12 +515,12 @@  struct drm_sched_backend_ops {
 struct drm_gpu_scheduler {
 	const struct drm_sched_backend_ops	*ops;
 	struct drm_sched_entity		*single_entity;
-	uint32_t			hw_submission_limit;
+	u32				submission_limit;
+	atomic_t			submission_count;
 	long				timeout;
 	const char			*name;
 	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
 	wait_queue_head_t		job_scheduled;
-	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
 	struct workqueue_struct		*submit_wq;
 	struct workqueue_struct		*timeout_wq;
@@ -539,7 +543,7 @@  struct drm_gpu_scheduler {
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   struct workqueue_struct *submit_wq,
-		   uint32_t hw_submission, unsigned hang_limit,
+		   uint32_t max_submission_units, unsigned hang_limit,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name,
 		   enum drm_sched_policy sched_policy,
@@ -548,6 +552,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
+		       u32 submission_units,
 		       void *owner);
 void drm_sched_job_arm(struct drm_sched_job *job);
 int drm_sched_job_add_dependency(struct drm_sched_job *job,
@@ -570,7 +575,8 @@  void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
 
 void drm_sched_tdr_queue_imm(struct drm_gpu_scheduler *sched);
 void drm_sched_job_cleanup(struct drm_sched_job *job);
-void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched);
+void drm_sched_wakeup_if_can_queue(struct drm_gpu_scheduler *sched,
+				   struct drm_sched_entity *entity);
 bool drm_sched_submit_ready(struct drm_gpu_scheduler *sched);
 void drm_sched_submit_stop(struct drm_gpu_scheduler *sched);
 void drm_sched_submit_start(struct drm_gpu_scheduler *sched);