
[v5,02/16] drm/sched: Allow using a dedicated workqueue for the timeout/fault tdr

Message ID 20210629073510.2764391-3-boris.brezillon@collabora.com (mailing list archive)
State: New, archived
Series: drm/panfrost: Misc improvements

Commit Message

Boris Brezillon June 29, 2021, 7:34 a.m. UTC
Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
reset. This leads to extra complexity when we need to synchronize timeout
works with the reset work. One solution to address that is to have an
ordered workqueue at the driver level that will be used by the different
schedulers to queue their timeout work. Thanks to the serialization
provided by the ordered workqueue we are guaranteed that timeout
handlers are executed sequentially, and can thus easily reset the GPU
from the timeout handler without extra synchronization.
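
For illustration, here is a minimal sketch of the intended usage. The
foo_* names, the queue count, the timeout value and foo_sched_ops are
hypothetical; only alloc_ordered_workqueue() and the new
drm_sched_init() signature are the real APIs this patch builds on:

struct foo_device {
	struct workqueue_struct *reset_wq;
	struct drm_gpu_scheduler sched[3];
};

static int foo_scheds_init(struct foo_device *fdev)
{
	unsigned int i;
	int ret;

	/* Ordered workqueue: at most one work item executes at a time. */
	fdev->reset_wq = alloc_ordered_workqueue("foo-reset", 0);
	if (!fdev->reset_wq)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(fdev->sched); i++) {
		/* All schedulers share the same timeout_wq, so their
		 * timeout handlers are serialized against each other.
		 */
		ret = drm_sched_init(&fdev->sched[i], &foo_sched_ops, 1, 0,
				     msecs_to_jiffies(500), fdev->reset_wq,
				     NULL, "foo_js");
		if (ret)
			return ret;
	}

	return 0;
}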

v5:
* Add a new paragraph to the timedout_job() method

v4:
* Actually use the timeout_wq to queue the timeout work

v3:
* New patch

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
Cc: Qiang Yu <yuq825@gmail.com>
Cc: Emma Anholt <emma@anholt.net>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
 drivers/gpu/drm/lima/lima_sched.c         |  3 ++-
 drivers/gpu/drm/panfrost/panfrost_job.c   |  3 ++-
 drivers/gpu/drm/scheduler/sched_main.c    | 14 +++++++++-----
 drivers/gpu/drm/v3d/v3d_sched.c           | 10 +++++-----
 include/drm/gpu_scheduler.h               | 23 ++++++++++++++++++++++-
 7 files changed, 43 insertions(+), 15 deletions(-)

Comments

Daniel Vetter June 29, 2021, 8:50 a.m. UTC | #1
On Tue, Jun 29, 2021 at 09:34:56AM +0200, Boris Brezillon wrote:
> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> reset. This leads to extra complexity when we need to synchronize timeout
> works with the reset work. One solution to address that is to have an
> ordered workqueue at the driver level that will be used by the different
> schedulers to queue their timeout work. Thanks to the serialization
> provided by the ordered workqueue we are guaranteed that timeout
> handlers are executed sequentially, and can thus easily reset the GPU
> from the timeout handler without extra synchronization.
> 
> v5:
> * Add a new paragraph to the timedout_job() method
> 
> v4:
> * Actually use the timeout_wq to queue the timeout work
> 
> v3:
> * New patch
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> Reviewed-by: Steven Price <steven.price@arm.com>
> Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
> Cc: Qiang Yu <yuq825@gmail.com>
> Cc: Emma Anholt <emma@anholt.net>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: "Christian König" <christian.koenig@amd.com>

Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>

Also, since I'm occasionally blinded by my own pride, add Suggested-by: me?
I did spend quite a bit of time pondering how to untangle your various
lockdep splats in the TDR handler :-)

Cheers, Daniel

Boris Brezillon June 29, 2021, 8:58 a.m. UTC | #2
On Tue, 29 Jun 2021 10:50:36 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Jun 29, 2021 at 09:34:56AM +0200, Boris Brezillon wrote:
> > Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> > reset. This leads to extra complexity when we need to synchronize timeout
> > works with the reset work. One solution to address that is to have an
> > ordered workqueue at the driver level that will be used by the different
> > schedulers to queue their timeout work. Thanks to the serialization
> > provided by the ordered workqueue we are guaranteed that timeout
> > handlers are executed sequentially, and can thus easily reset the GPU
> > from the timeout handler without extra synchronization.
> > 
> > v5:
> > * Add a new paragraph to the timedout_job() method
> > 
> > v4:
> > * Actually use the timeout_wq to queue the timeout work
> > 
> > v3:
> > * New patch
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > Reviewed-by: Steven Price <steven.price@arm.com>
> > Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
> > Cc: Qiang Yu <yuq825@gmail.com>
> > Cc: Emma Anholt <emma@anholt.net>
> > Cc: Alex Deucher <alexander.deucher@amd.com>
> > Cc: "Christian König" <christian.koenig@amd.com>  
> 
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> Also, since I'm occasionally blinded by my own pride, add Suggested-by: me?

Duh, it's an oversight (I thought I had that 'Suggested-by: Daniel
Vetter ...' already).

> I did spend quite a bit of time pondering how to untangle your various
> lockdep splats in the TDR handler :-)

And I'm grateful for your help ;-).
Christian König June 29, 2021, 11:03 a.m. UTC | #3
Am 29.06.21 um 09:34 schrieb Boris Brezillon:
> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> reset. This leads to extra complexity when we need to synchronize timeout
> works with the reset work. One solution to address that is to have an
> ordered workqueue at the driver level that will be used by the different
> schedulers to queue their timeout work. Thanks to the serialization
> provided by the ordered workqueue we are guaranteed that timeout
> handlers are executed sequentially, and can thus easily reset the GPU
> from the timeout handler without extra synchronization.

Well, we had already tried this and it didn't work the way it was expected.

The major problem is that you not only want to serialize the queue, but 
rather have a single reset for all queues.

Otherwise you schedule multiple resets for each hardware queue. E.g. for 
your 3 hardware queues you would reset the GPU 3 times if all of them 
time out at the same time (which is rather likely).

Using a single delayed work item doesn't work either because you then 
only have one timeout.

What could be done is to cancel all delayed work items from all stopped 
schedulers.

Regards,
Christian.

Boris Brezillon June 29, 2021, 11:18 a.m. UTC | #4
Hi Christian,

On Tue, 29 Jun 2021 13:03:58 +0200
Christian König <christian.koenig@amd.com> wrote:

> Am 29.06.21 um 09:34 schrieb Boris Brezillon:
> > Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> > reset. This leads to extra complexity when we need to synchronize timeout
> > works with the reset work. One solution to address that is to have an
> > ordered workqueue at the driver level that will be used by the different
> > schedulers to queue their timeout work. Thanks to the serialization
> > provided by the ordered workqueue we are guaranteed that timeout
> > handlers are executed sequentially, and can thus easily reset the GPU
> > from the timeout handler without extra synchronization.  
> 
> Well, we had already tried this and it didn't work the way it was expected.
> 
> The major problem is that you not only want to serialize the queue, but 
> rather have a single reset for all queues.
> 
> Otherwise you schedule multiple resets for each hardware queue. E.g. for 
> your 3 hardware queues you would reset the GPU 3 times if all of them 
> time out at the same time (which is rather likely).
> 
> Using a single delayed work item doesn't work either because you then 
> only have one timeout.
> 
> What could be done is to cancel all delayed work items from all stopped 
> schedulers.

drm_sched_stop() does that already, and since we call drm_sched_stop()
on all queues in the timeout handler, we end up with only one global
reset happening even if several queues report a timeout at the same
time.

Regards,

Boris
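
For readers following along, such a global-reset timeout handler,
following the five-step workflow documented by this patch, looks
roughly like the sketch below. The foo_* names are hypothetical; the
drm_sched_*() calls and the return value match the scheduler API as of
this series:

static enum drm_gpu_sched_stat
foo_timedout_job(struct drm_sched_job *bad)
{
	struct foo_device *fdev = foo_job_to_device(bad);
	unsigned int i;

	/* 1. Stop all schedulers impacted by the global reset, passing
	 * the faulty job only to its own scheduler.
	 */
	for (i = 0; i < ARRAY_SIZE(fdev->sched); i++)
		drm_sched_stop(&fdev->sched[i],
			       bad->sched == &fdev->sched[i] ? bad : NULL);

	/* 2./3. Optionally drain non-faulty queues, then reset the GPU
	 * (driver-specific).
	 */
	foo_gpu_reset(fdev);

	/* 4. Re-submit jobs on all stopped schedulers. */
	for (i = 0; i < ARRAY_SIZE(fdev->sched); i++)
		drm_sched_resubmit_jobs(&fdev->sched[i]);

	/* 5. Restart the schedulers stopped in step 1. */
	for (i = 0; i < ARRAY_SIZE(fdev->sched); i++)
		drm_sched_start(&fdev->sched[i], true);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}

Because this handler runs on the ordered timeout_wq, a second queue
timing out at the same time has its handler queued behind this one, and
the drm_sched_stop() calls above cancel that pending timeout work, which
is how the thread ends up with a single global reset.
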
Christian König June 29, 2021, 11:24 a.m. UTC | #5
Am 29.06.21 um 13:18 schrieb Boris Brezillon:
> Hi Christian,
>
> On Tue, 29 Jun 2021 13:03:58 +0200
> Christian König <christian.koenig@amd.com> wrote:
>
>> Am 29.06.21 um 09:34 schrieb Boris Brezillon:
>>> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
>>> reset. This leads to extra complexity when we need to synchronize timeout
>>> works with the reset work. One solution to address that is to have an
>>> ordered workqueue at the driver level that will be used by the different
>>> schedulers to queue their timeout work. Thanks to the serialization
>>> provided by the ordered workqueue we are guaranteed that timeout
>>> handlers are executed sequentially, and can thus easily reset the GPU
>>> from the timeout handler without extra synchronization.
>> Well, we had already tried this and it didn't work the way it was expected.
>>
>> The major problem is that you not only want to serialize the queue, but
>> rather have a single reset for all queues.
>>
>> Otherwise you schedule multiple resets for each hardware queue. E.g. for
>> your 3 hardware queues you would reset the GPU 3 times if all of them
>> time out at the same time (which is rather likely).
>>
>> Using a single delayed work item doesn't work either because you then
>> only have one timeout.
>>
>> What could be done is to cancel all delayed work items from all stopped
>> schedulers.
> drm_sched_stop() does that already, and since we call drm_sched_stop()
> on all queues in the timeout handler, we end up with only one global
> reset happening even if several queues report a timeout at the same
> time.

Ah, nice. Yeah, in this case it should indeed work as expected.

Feel free to add an Acked-by: Christian König <christian.koenig@amd.com> 
to it.

Regards,
Christian.

>
> Regards,
>
> Boris
Daniel Vetter June 29, 2021, 2:05 p.m. UTC | #6
On Tue, Jun 29, 2021 at 01:24:53PM +0200, Christian König wrote:
> Am 29.06.21 um 13:18 schrieb Boris Brezillon:
> > Hi Christian,
> > 
> > On Tue, 29 Jun 2021 13:03:58 +0200
> > Christian König <christian.koenig@amd.com> wrote:
> > 
> > > Am 29.06.21 um 09:34 schrieb Boris Brezillon:
> > > > Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> > > > reset. This leads to extra complexity when we need to synchronize timeout
> > > > works with the reset work. One solution to address that is to have an
> > > > ordered workqueue at the driver level that will be used by the different
> > > > schedulers to queue their timeout work. Thanks to the serialization
> > > > provided by the ordered workqueue we are guaranteed that timeout
> > > > handlers are executed sequentially, and can thus easily reset the GPU
> > > > from the timeout handler without extra synchronization.
> > > Well, we had already tried this and it didn't work the way it was expected.
> > > 
> > > The major problem is that you not only want to serialize the queue, but
> > > rather have a single reset for all queues.
> > > 
> > > Otherwise you schedule multiple resets for each hardware queue. E.g. for
> > > your 3 hardware queues you would reset the GPU 3 times if all of them
> > > time out at the same time (which is rather likely).
> > > 
> > > Using a single delayed work item doesn't work either because you then
> > > only have one timeout.
> > > 
> > > What could be done is to cancel all delayed work items from all stopped
> > > schedulers.
> > drm_sched_stop() does that already, and since we call drm_sched_stop()
> > on all queues in the timeout handler, we end up with only one global
> > reset happening even if several queues report a timeout at the same
> > time.
> 
> Ah, nice. Yeah, in this case it should indeed work as expected.
> 
> Feel free to add an Acked-by: Christian König <christian.koenig@amd.com> to
> it.

Yeah, this is also what we want for the case where you have a cascading
reset, i.e. you have a per-engine reset, but if that fails, you have to
reset the entire chip. That's similar, but not quite identical, to the
soft-recover-then-chip-reset amdgpu does, since for the engine reset you
do have to stop the scheduler. For full context, the i915 reset cascade:

- try to evict the offending work with preemption (similar in its impact
  to amdgpu's soft recovery I think), other requests on the same engine
  are unaffected here.

- reset the affected engine

- reset the entire chip

So my plan at least is that we'd only call drm_sched_stop() as we escalate.
Also, on earlier chips the first and second levels might not be available.

Luckily for us, chip reset only resets the render side of things (display
survives on gen4+), so we don't have the nasty locking issue amdgpu has.
-Daniel
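
Sketched as code, the escalation Daniel describes could look like the
following in a timedout_job handler. All foo_* helpers are hypothetical,
and which levels exist depends on the hardware generation:

static enum drm_gpu_sched_stat
foo_timedout_job(struct drm_sched_job *bad)
{
	struct foo_device *fdev = foo_job_to_device(bad);

	/* Level 1: evict just the offending job via preemption; other
	 * requests on the same engine keep running, nothing is stopped.
	 */
	if (!foo_preempt_job(fdev, bad))
		return DRM_GPU_SCHED_STAT_NOMINAL;

	/* Level 2: reset only the affected engine. The scheduler has to
	 * be stopped before the engine goes down.
	 */
	drm_sched_stop(bad->sched, bad);
	if (!foo_engine_reset(fdev, bad->sched)) {
		drm_sched_resubmit_jobs(bad->sched);
		drm_sched_start(bad->sched, true);
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	/* Level 3: full chip reset; stop, reset, resubmit and restart
	 * every scheduler on the device, as in the global-reset case.
	 */
	foo_full_chip_reset(fdev, bad);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}
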
Andrey Grodzovsky Sept. 7, 2021, 6:53 p.m. UTC | #7
On 2021-06-29 7:24 a.m., Christian König wrote:

> Am 29.06.21 um 13:18 schrieb Boris Brezillon:
>> Hi Christian,
>>
>> On Tue, 29 Jun 2021 13:03:58 +0200
>> Christian König <christian.koenig@amd.com> wrote:
>>
>>> Am 29.06.21 um 09:34 schrieb Boris Brezillon:
>>>> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
>>>> reset. This leads to extra complexity when we need to synchronize 
>>>> timeout
>>>> works with the reset work. One solution to address that is to have an
>>>> ordered workqueue at the driver level that will be used by the 
>>>> different
>>>> schedulers to queue their timeout work. Thanks to the serialization
>>>> provided by the ordered workqueue we are guaranteed that timeout
>>>> handlers are executed sequentially, and can thus easily reset the GPU
>>>> from the timeout handler without extra synchronization.
>>> Well, we had already tried this and it didn't work the way it was
>>> expected.
>>>
>>> The major problem is that you not only want to serialize the queue, but
>>> rather have a single reset for all queues.
>>>
>>> Otherwise you schedule multiple resets for each hardware queue. E.g. 
>>> for
>>> your 3 hardware queues you would reset the GPU 3 times if all of them
>>> time out at the same time (which is rather likely).
>>>
>>> Using a single delayed work item doesn't work either because you then
>>> only have one timeout.
>>>
>>> What could be done is to cancel all delayed work items from all stopped
>>> schedulers.
>> drm_sched_stop() does that already, and since we call drm_sched_stop()
>> on all queues in the timeout handler, we end up with only one global
>> reset happening even if several queues report a timeout at the same
>> time.
>
> Ah, nice. Yeah, in this case it should indeed work as expected.
>
> Feel free to add an Acked-by: Christian König 
> <christian.koenig@amd.com> to it.
>
> Regards,
> Christian.


Seems to me that for this to work we need to change cancel_delayed_work
to cancel_delayed_work_sync,
so not only pending TO handlers are cancelled but also any in progress
are waited for, and to prevent rearming.
Also move it right after kthread_park(), before we start touching the
pending list.

Andrey


>
>>
>> Regards,
>>
>> Boris
>
Boris Brezillon Sept. 8, 2021, 6:50 a.m. UTC | #8
On Tue, 7 Sep 2021 14:53:58 -0400
Andrey Grodzovsky <andrey.grodzovsky@amd.com> wrote:

> On 2021-06-29 7:24 a.m., Christian König wrote:
> 
> > Am 29.06.21 um 13:18 schrieb Boris Brezillon:  
> >> Hi Christian,
> >>
> >> On Tue, 29 Jun 2021 13:03:58 +0200
> >> Christian König <christian.koenig@amd.com> wrote:
> >>  
> >>> Am 29.06.21 um 09:34 schrieb Boris Brezillon:  
> >>>> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
> >>>> reset. This leads to extra complexity when we need to synchronize 
> >>>> timeout
> >>>> works with the reset work. One solution to address that is to have an
> >>>> ordered workqueue at the driver level that will be used by the 
> >>>> different
> >>>> schedulers to queue their timeout work. Thanks to the serialization
> >>>> provided by the ordered workqueue we are guaranteed that timeout
> >>>> handlers are executed sequentially, and can thus easily reset the GPU
> >>>> from the timeout handler without extra synchronization.  
> >>> Well, we had already tried this and it didn't work the way it was
> >>> expected.
> >>>
> >>> The major problem is that you not only want to serialize the queue, but
> >>> rather have a single reset for all queues.
> >>>
> >>> Otherwise you schedule multiple resets for each hardware queue. E.g. 
> >>> for
> >>> your 3 hardware queues you would reset the GPU 3 times if all of them
> >>> time out at the same time (which is rather likely).
> >>>
> >>> Using a single delayed work item doesn't work either because you then
> >>> only have one timeout.
> >>>
> >>> What could be done is to cancel all delayed work items from all stopped
> >>> schedulers.  
> >> drm_sched_stop() does that already, and since we call drm_sched_stop()
> >> on all queues in the timeout handler, we end up with only one global
> >> reset happening even if several queues report a timeout at the same
> >> time.  
> >
> > Ah, nice. Yeah, in this case it should indeed work as expected.
> >
> > Feel free to add an Acked-by: Christian König 
> > <christian.koenig@amd.com> to it.
> >
> > Regards,
> > Christian.  
> 
> 
> Seems to me that for this to work we need to change cancel_delayed_work
> to cancel_delayed_work_sync,
> so not only pending TO handlers are cancelled but also any in progress
> are waited for, and to prevent rearming.
> Also move it right after kthread_park(), before we start touching the
> pending list.

I'm probably missing something, but I don't really see why this
specific change would require replacing cancel_delayed_work() calls
with the sync variant. I mean, if there's a situation where we need to wait
for an in-flight timeout handler to return, it was already the case before
that patch. Note that we need to be careful to not call the sync
variant in helpers that are called from the interrupt handler itself
to avoid deadlocks (i.e. drm_sched_stop()).
Andrey Grodzovsky Sept. 8, 2021, 2:53 p.m. UTC | #9
On 2021-09-08 2:50 a.m., Boris Brezillon wrote:
> On Tue, 7 Sep 2021 14:53:58 -0400
> Andrey Grodzovsky <andrey.grodzovsky@amd.com> wrote:
>
>> On 2021-06-29 7:24 a.m., Christian König wrote:
>>
>>> Am 29.06.21 um 13:18 schrieb Boris Brezillon:
>>>> Hi Christian,
>>>>
>>>> On Tue, 29 Jun 2021 13:03:58 +0200
>>>> Christian König <christian.koenig@amd.com> wrote:
>>>>   
>>>>> Am 29.06.21 um 09:34 schrieb Boris Brezillon:
>>>>>> Mali Midgard/Bifrost GPUs have 3 hardware queues but only a global GPU
>>>>>> reset. This leads to extra complexity when we need to synchronize
>>>>>> timeout
>>>>>> works with the reset work. One solution to address that is to have an
>>>>>> ordered workqueue at the driver level that will be used by the
>>>>>> different
>>>>>> schedulers to queue their timeout work. Thanks to the serialization
>>>>>> provided by the ordered workqueue we are guaranteed that timeout
>>>>>> handlers are executed sequentially, and can thus easily reset the GPU
>>>>>> from the timeout handler without extra synchronization.
>>>>> Well, we had already tried this and it didn't work the way it was
>>>>> expected.
>>>>>
>>>>> The major problem is that you not only want to serialize the queue, but
>>>>> rather have a single reset for all queues.
>>>>>
>>>>> Otherwise you schedule multiple resets for each hardware queue. E.g.
>>>>> for
>>>>> your 3 hardware queues you would reset the GPU 3 times if all of them
>>>>> time out at the same time (which is rather likely).
>>>>>
>>>>> Using a single delayed work item doesn't work either because you then
>>>>> only have one timeout.
>>>>>
>>>>> What could be done is to cancel all delayed work items from all stopped
>>>>> schedulers.
>>>> drm_sched_stop() does that already, and since we call drm_sched_stop()
>>>> on all queues in the timeout handler, we end up with only one global
>>>> reset happening even if several queues report a timeout at the same
>>>> time.
>>> Ah, nice. Yeah, in this case it should indeed work as expected.
>>>
>>> Feel free to add an Acked-by: Christian König
>>> <christian.koenig@amd.com> to it.
>>>
>>> Regards,
>>> Christian.
>>
>> Seems to me that for this to work we need to change cancel_delayed_work
>> to cancel_delayed_work_sync,
>> so not only pending TO handlers are cancelled but also any in progress
>> are waited for, and to prevent rearming.
>> Also move it right after kthread_park(), before we start touching the
>> pending list.
> I'm probably missing something, but I don't really see why this
> specific change would require replacing cancel_delayed_work() calls
> with the sync variant.


I see, I missed the point that since we now have single-threaded
processing and only one TDR handler runs at a given time, there is no
need to wait for other parallel in-flight TDR handlers.


> I mean, if there's a situation where we need to wait
> for an in-flight timeout handler to return, it was already the case before
> that patch.


In the amdgpu case we avoided that by doing a trylock on a common lock
and returning early in case it was already taken by another TDR handler.


> Note that we need to be careful to not call the sync
> variant in helpers that are called from the interrupt handler itself
> to avoid deadlocks (i.e. drm_sched_stop()).


I am not clear here: which interrupt handler is drm_sched_stop()
called from? It's called from the TDR work as far as I can see in the code.

Andrey
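
For context, the trylock pattern Andrey refers to looks roughly like
this (hypothetical foo_* names; amdgpu's actual recovery code is more
involved):

static enum drm_gpu_sched_stat
foo_timedout_job(struct drm_sched_job *bad)
{
	struct foo_device *fdev = foo_job_to_device(bad);

	/* Another timeout handler already owns the reset: return early
	 * and let it recover the whole device.
	 */
	if (!mutex_trylock(&fdev->reset_lock))
		return DRM_GPU_SCHED_STAT_NOMINAL;

	foo_global_reset(fdev);
	mutex_unlock(&fdev->reset_lock);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}
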
Boris Brezillon Sept. 8, 2021, 2:55 p.m. UTC | #10
On Wed, 8 Sep 2021 10:53:21 -0400
Andrey Grodzovsky <andrey.grodzovsky@amd.com> wrote:

> > Note that we need to be careful to not call the sync
> > variant in helpers that are called from the interrupt handler itself
> > to avoid deadlocks (i.e. drm_sched_stop()).  
> 
> 
> I am not clear here: which interrupt handler is drm_sched_stop()
> called from? It's called from the TDR work as far as I can see in the code.

My bad, I meant the timeout handler, not the interrupt handler.
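
To spell out the deadlock discussed above: drm_sched_job_timedout() runs
on sched->timeout_wq, so a sync cancel of the same work item from inside
that path would wait on itself. A minimal sketch (foo_helper() is
hypothetical):

static void foo_helper(struct drm_gpu_scheduler *sched)
{
	/* Fine from any context: removes the pending work item and
	 * returns immediately, even if the handler is running.
	 */
	cancel_delayed_work(&sched->work_tdr);

	/* Would deadlock if foo_helper() is reached from the timeout
	 * handler: cancel_delayed_work_sync() waits for a running
	 * work_tdr handler to finish, i.e. for ourselves.
	 *
	 * cancel_delayed_work_sync(&sched->work_tdr);
	 */
}
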

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 47ea46859618..532636ea20bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -488,7 +488,7 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 
 	r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 			   num_hw_submission, amdgpu_job_hang_limit,
-			   timeout, sched_score, ring->name);
+			   timeout, NULL, sched_score, ring->name);
 	if (r) {
 		DRM_ERROR("Failed to create scheduler on ring %s.\n",
 			  ring->name);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 19826e504efc..feb6da1b6ceb 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -190,7 +190,8 @@  int etnaviv_sched_init(struct etnaviv_gpu *gpu)
 
 	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
 			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
-			     msecs_to_jiffies(500), NULL, dev_name(gpu->dev));
+			     msecs_to_jiffies(500), NULL, NULL,
+			     dev_name(gpu->dev));
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index ecf3267334ff..dba8329937a3 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -508,7 +508,8 @@  int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
 	INIT_WORK(&pipe->recover_work, lima_sched_recover_work);
 
 	return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
-			      lima_job_hang_limit, msecs_to_jiffies(timeout),
+			      lima_job_hang_limit,
+			      msecs_to_jiffies(timeout), NULL,
 			      NULL, name);
 }
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 682f2161b999..8ff79fd49577 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -626,7 +626,8 @@  int panfrost_job_init(struct panfrost_device *pfdev)
 
 		ret = drm_sched_init(&js->queue[j].sched,
 				     &panfrost_sched_ops,
-				     1, 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
+				     1, 0,
+				     msecs_to_jiffies(JOB_TIMEOUT_MS), NULL,
 				     NULL, "pan_js");
 		if (ret) {
 			dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c0a2f8f8d472..3e180f0d4305 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -232,7 +232,7 @@  static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 {
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 	    !list_empty(&sched->pending_list))
-		schedule_delayed_work(&sched->work_tdr, sched->timeout);
+		queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
 }
 
 /**
@@ -244,7 +244,7 @@  static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
  */
 void drm_sched_fault(struct drm_gpu_scheduler *sched)
 {
-	mod_delayed_work(system_wq, &sched->work_tdr, 0);
+	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
 }
 EXPORT_SYMBOL(drm_sched_fault);
 
@@ -270,7 +270,7 @@  unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 	 * Modify the timeout to an arbitrarily large value. This also prevents
 	 * the timeout to be restarted when new submissions arrive
 	 */
-	if (mod_delayed_work(system_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
+	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
 			&& time_after(sched_timeout, now))
 		return sched_timeout - now;
 	else
@@ -294,7 +294,7 @@  void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 	if (list_empty(&sched->pending_list))
 		cancel_delayed_work(&sched->work_tdr);
 	else
-		mod_delayed_work(system_wq, &sched->work_tdr, remaining);
+		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
 
 	spin_unlock(&sched->job_list_lock);
 }
@@ -837,6 +837,8 @@  static int drm_sched_main(void *param)
  * @hw_submission: number of hw submissions that can be in flight
  * @hang_limit: number of times to allow a job to hang before dropping it
  * @timeout: timeout value in jiffies for the scheduler
+ * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
+ *		used
  * @score: optional score atomic shared with other schedulers
  * @name: name used for debugging
  *
@@ -844,7 +846,8 @@  static int drm_sched_main(void *param)
  */
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
-		   unsigned hw_submission, unsigned hang_limit, long timeout,
+		   unsigned hw_submission, unsigned hang_limit,
+		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name)
 {
 	int i, ret;
@@ -852,6 +855,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	sched->hw_submission_limit = hw_submission;
 	sched->name = name;
 	sched->timeout = timeout;
+	sched->timeout_wq = timeout_wq ? : system_wq;
 	sched->hang_limit = hang_limit;
 	sched->score = score ? score : &sched->_score;
 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 8992480c88fa..a39bdd5cfc4f 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -402,7 +402,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
 			     &v3d_bin_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_bin");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
@@ -412,7 +412,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
 			     &v3d_render_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_render");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
@@ -424,7 +424,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
 			     &v3d_tfu_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     msecs_to_jiffies(hang_limit_ms), NULL,
 			     NULL, "v3d_tfu");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
@@ -437,7 +437,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
 				     &v3d_csd_sched_ops,
 				     hw_jobs_limit, job_hang_limit,
-				     msecs_to_jiffies(hang_limit_ms),
+				     msecs_to_jiffies(hang_limit_ms), NULL,
 				     NULL, "v3d_csd");
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
@@ -449,7 +449,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
 				     &v3d_cache_clean_sched_ops,
 				     hw_jobs_limit, job_hang_limit,
-				     msecs_to_jiffies(hang_limit_ms),
+				     msecs_to_jiffies(hang_limit_ms), NULL,
 				     NULL, "v3d_cache_clean");
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 65700511e074..9c41904ffd83 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -253,6 +253,24 @@  struct drm_sched_backend_ops {
 	 * 5. Restart the scheduler using drm_sched_start(). At that point, new
 	 *    jobs can be queued, and the scheduler thread is unblocked
 	 *
+	 * Note that some GPUs have distinct hardware queues but need to reset
+	 * the GPU globally, which requires extra synchronization between the
+	 * timeout handler of the different &drm_gpu_scheduler. One way to
+	 * achieve this synchronization is to create an ordered workqueue
+	 * (using alloc_ordered_workqueue()) at the driver level, and pass this
+	 * queue to drm_sched_init(), to guarantee that timeout handlers are
+	 * executed sequentially. The above workflow needs to be slightly
+	 * adjusted in that case:
+	 *
+	 * 1. Stop all schedulers impacted by the reset using drm_sched_stop()
+	 * 2. Try to gracefully stop non-faulty jobs on all queues impacted by
+	 *    the reset (optional)
+	 * 3. Issue a GPU reset on all faulty queues (driver-specific)
+	 * 4. Re-submit jobs on all schedulers impacted by the reset using
+	 *    drm_sched_resubmit_jobs()
+	 * 5. Restart all schedulers that were stopped in step #1 using
+	 *    drm_sched_start()
+	 *
 	 * Return DRM_GPU_SCHED_STAT_NOMINAL, when all is normal,
 	 * and the underlying driver has started or completed recovery.
 	 *
@@ -283,6 +301,7 @@  struct drm_sched_backend_ops {
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @timeout_wq: workqueue used to queue @work_tdr
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
  * @thread: the kthread on which the scheduler which run.
@@ -307,6 +326,7 @@  struct drm_gpu_scheduler {
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct workqueue_struct		*timeout_wq;
 	struct delayed_work		work_tdr;
 	struct task_struct		*thread;
 	struct list_head		pending_list;
@@ -320,7 +340,8 @@  struct drm_gpu_scheduler {
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
-		   uint32_t hw_submission, unsigned hang_limit, long timeout,
+		   uint32_t hw_submission, unsigned hang_limit,
+		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name);
 
 void drm_sched_fini(struct drm_gpu_scheduler *sched);