
[v3,07/12] drm/sched: Prevent any job recoveries after device is unplugged.

Message ID 1605936082-3099-8-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Series RFC Support hot device unplug in amdgpu

Commit Message

Andrey Grodzovsky Nov. 21, 2020, 5:21 a.m. UTC
No point in trying recovery if the device is gone; it's meaningless.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
 drivers/gpu/drm/lima/lima_sched.c         |  3 ++-
 drivers/gpu/drm/panfrost/panfrost_job.c   |  2 +-
 drivers/gpu/drm/scheduler/sched_main.c    | 15 ++++++++++++++-
 drivers/gpu/drm/v3d/v3d_sched.c           | 15 ++++++++++-----
 include/drm/gpu_scheduler.h               |  6 +++++-
 7 files changed, 35 insertions(+), 11 deletions(-)

Comments

Christian König Nov. 22, 2020, 11:57 a.m. UTC | #1
On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
> No point in trying recovery if the device is gone; it's meaningless.

I think that this should go into the device-specific recovery function
and not in the scheduler.
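
For instance, the check could live in the driver's timedout_job callback
instead of in drm_sched_job_timedout() (a rough sketch against amdgpu;
the actual recovery path is abbreviated):

	/* Rough sketch only: device-specific placement of the unplug check. */
	static void amdgpu_job_timedout(struct drm_sched_job *s_job)
	{
		struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
		int idx;

		if (!drm_dev_enter(&ring->adev->ddev, &idx))
			return;	/* device is gone, nothing to recover */

		/* ... existing hang handling / amdgpu_device_gpu_recover() ... */

		drm_dev_exit(idx);
	}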

Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  2 +-
>   drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
>   drivers/gpu/drm/lima/lima_sched.c         |  3 ++-
>   drivers/gpu/drm/panfrost/panfrost_job.c   |  2 +-
>   drivers/gpu/drm/scheduler/sched_main.c    | 15 ++++++++++++++-
>   drivers/gpu/drm/v3d/v3d_sched.c           | 15 ++++++++++-----
>   include/drm/gpu_scheduler.h               |  6 +++++-
>   7 files changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index d56f402..d0b0021 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -487,7 +487,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   
>   		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>   				   num_hw_submission, amdgpu_job_hang_limit,
> -				   timeout, ring->name);
> +				   timeout, ring->name, &adev->ddev);
>   		if (r) {
>   			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>   				  ring->name);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index cd46c88..7678287 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -185,7 +185,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>   
>   	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>   			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> -			     msecs_to_jiffies(500), dev_name(gpu->dev));
> +			     msecs_to_jiffies(500), dev_name(gpu->dev),
> +			     gpu->drm);
>   	if (ret)
>   		return ret;
>   
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index dc6df9e..8a7e5d7ca 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -505,7 +505,8 @@ int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
>   
>   	return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
>   			      lima_job_hang_limit, msecs_to_jiffies(timeout),
> -			      name);
> +			      name,
> +			      pipe->ldev->ddev);
>   }
>   
>   void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index 30e7b71..37b03b01 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -520,7 +520,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
>   		ret = drm_sched_init(&js->queue[j].sched,
>   				     &panfrost_sched_ops,
>   				     1, 0, msecs_to_jiffies(500),
> -				     "pan_js");
> +				     "pan_js", pfdev->ddev);
>   		if (ret) {
>   			dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
>   			goto err_sched;
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index c3f0bd0..95db8c6 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -53,6 +53,7 @@
>   #include <drm/drm_print.h>
>   #include <drm/gpu_scheduler.h>
>   #include <drm/spsc_queue.h>
> +#include <drm/drm_drv.h>
>   
>   #define CREATE_TRACE_POINTS
>   #include "gpu_scheduler_trace.h"
> @@ -283,8 +284,16 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   	struct drm_gpu_scheduler *sched;
>   	struct drm_sched_job *job;
>   
> +	int idx;
> +
>   	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>   
> +	if (!drm_dev_enter(sched->ddev, &idx)) {
> +		DRM_INFO("%s - device unplugged, skipping recovery on scheduler:%s\n",
> +			 __func__, sched->name);
> +		return;
> +	}
> +
>   	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
>   	spin_lock(&sched->job_list_lock);
>   	job = list_first_entry_or_null(&sched->ring_mirror_list,
> @@ -316,6 +325,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   	spin_lock(&sched->job_list_lock);
>   	drm_sched_start_timeout(sched);
>   	spin_unlock(&sched->job_list_lock);
> +
> +	drm_dev_exit(idx);
>   }
>   
>    /**
> @@ -845,7 +856,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   unsigned hw_submission,
>   		   unsigned hang_limit,
>   		   long timeout,
> -		   const char *name)
> +		   const char *name,
> +		   struct drm_device *ddev)
>   {
>   	int i, ret;
>   	sched->ops = ops;
> @@ -853,6 +865,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   	sched->name = name;
>   	sched->timeout = timeout;
>   	sched->hang_limit = hang_limit;
> +	sched->ddev = ddev;
>   	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
>   		drm_sched_rq_init(sched, &sched->sched_rq[i]);
>   
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 0747614..f5076e5 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -401,7 +401,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   			     &v3d_bin_sched_ops,
>   			     hw_jobs_limit, job_hang_limit,
>   			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_bin");
> +			     "v3d_bin",
> +			     &v3d->drm);
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
>   		return ret;
> @@ -411,7 +412,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   			     &v3d_render_sched_ops,
>   			     hw_jobs_limit, job_hang_limit,
>   			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_render");
> +			     "v3d_render",
> +			     &v3d->drm);
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
>   			ret);
> @@ -423,7 +425,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   			     &v3d_tfu_sched_ops,
>   			     hw_jobs_limit, job_hang_limit,
>   			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_tfu");
> +			     "v3d_tfu",
> +			     &v3d->drm);
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
>   			ret);
> @@ -436,7 +439,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   				     &v3d_csd_sched_ops,
>   				     hw_jobs_limit, job_hang_limit,
>   				     msecs_to_jiffies(hang_limit_ms),
> -				     "v3d_csd");
> +				     "v3d_csd",
> +				     &v3d->drm);
>   		if (ret) {
>   			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
>   				ret);
> @@ -448,7 +452,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   				     &v3d_cache_clean_sched_ops,
>   				     hw_jobs_limit, job_hang_limit,
>   				     msecs_to_jiffies(hang_limit_ms),
> -				     "v3d_cache_clean");
> +				     "v3d_cache_clean",
> +				     &v3d->drm);
>   		if (ret) {
>   			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
>   				ret);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 9243655..a980709 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -32,6 +32,7 @@
>   
>   struct drm_gpu_scheduler;
>   struct drm_sched_rq;
> +struct drm_device;
>   
>   /* These are often used as an (initial) index
>    * to an array, and as such should start at 0.
> @@ -267,6 +268,7 @@ struct drm_sched_backend_ops {
>    * @score: score to help the load balancer pick an idle sched
>    * @ready: marks if the underlying HW is ready to work
>    * @free_guilty: A hint to the timeout handler to free the guilty job.
> + * @ddev: Pointer to drm device of this scheduler.
>    *
>    * One scheduler is implemented for each hardware ring.
>    */
> @@ -288,12 +290,14 @@ struct drm_gpu_scheduler {
>   	atomic_t                        score;
>   	bool				ready;
>   	bool				free_guilty;
> +	struct drm_device		*ddev;
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   const struct drm_sched_backend_ops *ops,
>   		   uint32_t hw_submission, unsigned hang_limit, long timeout,
> -		   const char *name);
> +		   const char *name,
> +		   struct drm_device *ddev);
>   
>   void drm_sched_fini(struct drm_gpu_scheduler *sched);
>   int drm_sched_job_init(struct drm_sched_job *job,
Andrey Grodzovsky Nov. 23, 2020, 5:37 a.m. UTC | #2
On 11/22/20 6:57 AM, Christian König wrote:
> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>> No point in trying recovery if the device is gone; it's meaningless.
>
> I think that this should go into the device-specific recovery function
> and not in the scheduler.


The timeout timer is rearmed here, so this prevents any new recovery work
from restarting from here after drm_dev_unplug has been executed from
amdgpu_pci_remove. It will not cover other places like job cleanup or
starting a new job, but those should stop once the scheduler thread is
stopped later.
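
For reference, a minimal sketch of the removal side this relies on
(teardown details elided; only the drm_dev_unplug() call matters here):

	static void amdgpu_pci_remove(struct pci_dev *pdev)
	{
		struct drm_device *dev = pci_get_drvdata(pdev);

		/*
		 * Once drm_dev_unplug() returns, drm_dev_enter() fails for
		 * this device, so drm_sched_job_timedout() bails out before
		 * rearming the timeout timer.
		 */
		drm_dev_unplug(dev);

		/* ... driver teardown; scheduler threads are stopped later ... */
	}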

Andrey


Christian König Nov. 23, 2020, 8:06 a.m. UTC | #3
On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>
> On 11/22/20 6:57 AM, Christian König wrote:
>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>> No point in trying recovery if the device is gone; it's meaningless.
>>
>> I think that this should go into the device-specific recovery
>> function and not in the scheduler.
>
>
> The timeout timer is rearmed here, so this prevents any new recovery
> work from restarting from here after drm_dev_unplug has been executed
> from amdgpu_pci_remove. It will not cover other places like job cleanup
> or starting a new job, but those should stop once the scheduler thread
> is stopped later.

Yeah, but this is rather unclean. We should probably return an error
code instead to indicate whether the timer should be rearmed or not.
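
A hypothetical sketch of how that could look (the enum, the return type
of timedout_job, and the names are illustrative only, not existing API):

	/* Illustrative only: let the timeout handler report device loss so
	 * the scheduler core can decide whether to rearm the timer. */
	enum drm_sched_stat {
		DRM_SCHED_STAT_NOMINAL,	/* handled normally, rearm the timer */
		DRM_SCHED_STAT_ENODEV,	/* device is gone, do not rearm */
	};

	/* In drm_sched_job_timedout(), instead of the drm_dev_enter() check: */
	if (job && sched->ops->timedout_job(job) == DRM_SCHED_STAT_ENODEV)
		return;	/* skip drm_sched_start_timeout() */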

Christian.

Luben Tuikov Nov. 24, 2020, 1:12 a.m. UTC | #4
On 2020-11-23 3:06 a.m., Christian König wrote:
> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>
>> On 11/22/20 6:57 AM, Christian König wrote:
>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>> No point in trying recovery if the device is gone; it's meaningless.
>>>
>>> I think that this should go into the device-specific recovery
>>> function and not in the scheduler.
>>
>>
>> The timeout timer is rearmed here, so this prevents any new recovery
>> work from restarting from here after drm_dev_unplug has been executed
>> from amdgpu_pci_remove. It will not cover other places like job cleanup
>> or starting a new job, but those should stop once the scheduler thread
>> is stopped later.
> 
> Yeah, but this is rather unclean. We should probably return an error
> code instead to indicate whether the timer should be rearmed or not.

Christian, this is exactly the work I told you about last week on
Wednesday in our weekly meeting, and which I wrote to you about in an
email around this time last year.

So what do we do now?

I can submit those changes without the last part,
which builds on this change.

I'm still testing the last part and was hoping
to submit it all in one sequence of patches,
after my testing.

Regards,
Luben

Christian König Nov. 24, 2020, 7:50 a.m. UTC | #5
On 24.11.20 at 02:12, Luben Tuikov wrote:
> On 2020-11-23 3:06 a.m., Christian König wrote:
>> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>>> No point in trying recovery if the device is gone; it's meaningless.
>>>> I think that this should go into the device-specific recovery
>>>> function and not in the scheduler.
>>>
>>> The timeout timer is rearmed here, so this prevents any new recovery
>>> work from restarting from here after drm_dev_unplug has been executed
>>> from amdgpu_pci_remove. It will not cover other places like job cleanup
>>> or starting a new job, but those should stop once the scheduler thread
>>> is stopped later.
>> Yeah, but this is rather unclean. We should probably return an error
>> code instead to indicate whether the timer should be rearmed or not.
> Christian, this is exactly the work I told you about last week on
> Wednesday in our weekly meeting, and which I wrote to you about in an
> email around this time last year.

Yeah, that's why I'm suggesting it here as well.

> So what do we do now?

Split your patches into smaller parts and submit them chunk by chunk.

E.g. renames first, then functional changes grouped by the area they change.
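
One way to do that split (illustrative commands; the upstream branch
name is a placeholder):

	# Split the big commit during an interactive rebase.
	git rebase -i drm-misc/drm-misc-next   # mark the commit "edit"
	git reset HEAD^                        # unstage its changes
	git add -p                             # stage only the rename hunks
	git commit -s -m "drm/sched: rename ..."
	git add -A                             # stage the remaining changes
	git commit -s -m "drm/sched: <functional change>"
	git rebase --continue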

Regards,
Christian.

Luben Tuikov Nov. 24, 2020, 5:11 p.m. UTC | #6
On 2020-11-24 2:50 a.m., Christian König wrote:
> On 24.11.20 at 02:12, Luben Tuikov wrote:
>> On 2020-11-23 3:06 a.m., Christian König wrote:
>>> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>>>> No point in trying recovery if the device is gone; it's meaningless.
>>>>> I think that this should go into the device-specific recovery
>>>>> function and not in the scheduler.
>>>>
>>>> The timeout timer is rearmed here, so this prevents any new recovery
>>>> work from restarting from here after drm_dev_unplug has been executed
>>>> from amdgpu_pci_remove. It will not cover other places like job cleanup
>>>> or starting a new job, but those should stop once the scheduler thread
>>>> is stopped later.
>>> Yeah, but this is rather unclean. We should probably return an error
>>> code instead to indicate whether the timer should be rearmed or not.
>> Christian, this is exactly the work I told you about last week on
>> Wednesday in our weekly meeting, and which I wrote to you about in an
>> email around this time last year.
> 
> Yeah, that's why I'm suggesting it here as well.

It seems you're suggesting that Andrey do it, while you know all too
well that I've been working on this for some time now.

I wrote to you about this in an email around this time last year, and I
discussed it in the Wednesday meeting.

You could have mentioned that here the first time.

> 
>> So what do we do now?
> 
> Split your patches into smaller parts and submit them chunk by chunk.
> 
> E.g. renames first, then functional changes grouped by the area they change.

I have, but my final patch, a tiny one that implements the core reason
for the change, seems buggy, and I'm looking for a way to debug it.

Regards,
Luben


> 
> Regards,
> Christian.
> 
>>
>> I can submit those changes without the last part,
>> which builds on this change.
>>
>> I'm still testing the last part and was hoping
>> to submit it all in one sequence of patches,
>> after my testing.
>>
>> Regards,
>> Luben
>>
>>> Christian.
>>>
>>>> Andrey
>>>>
>>>>
>>>>> Christian.
Andrey Grodzovsky Nov. 24, 2020, 5:17 p.m. UTC | #7
On 11/24/20 12:11 PM, Luben Tuikov wrote:
> On 2020-11-24 2:50 a.m., Christian König wrote:
>> On 24.11.20 at 02:12, Luben Tuikov wrote:
>>> On 2020-11-23 3:06 a.m., Christian König wrote:
>>>> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>>>>> No point to try recovery if device is gone, it's meaningless.
>>>>>> I think that this should go into the device specific recovery
>>>>>> function and not in the scheduler.
>>>>> The timeout timer is rearmed here, so this prevents any new recovery
>>>>> work from restarting from here
>>>>> after drm_dev_unplug was executed from amdgpu_pci_remove. It will not
>>>>> cover other places like
>>>>> job cleanup or starting new jobs, but those should stop once the
>>>>> scheduler thread is stopped later.
>>>> Yeah, but this is rather unclean. We should probably return an error
>>>> code instead, indicating whether the timer should be rearmed or not.
>>> Christian, this is exactly my work, which I told you about
>>> last week on Wednesday in our weekly meeting, and
>>> which I wrote to you about in an email around this
>>> time last year.
>> Yeah, that's why I'm suggesting it here as well.
> It seems you're suggesting that Andrey do it, while
> all too well you know I've been working on this
> for some time now.
>
> I wrote to you about this in an email around
> this same time last year. And I discussed it
> at the Wednesday meeting.
>
> You could've mentioned that here the first time.


Luben, I actually strongly prefer that you do it and share your patch with
me, since I don't want to do unneeded refactoring which will conflict with
your work. Also, please use drm-misc for this since it's not amdgpu-specific
work and will be easier for me.

Andrey


>
>>> So what do we do now?
>> Split your patches into smaller parts and submit them chunk by chunk.
>>
>> E.g. renames first and then functional changes grouped by area they change.
> I have, but my final patch, a tiny one that implements
> the core reason for the change, seems buggy, and I'm looking
> for a way to debug it.
>
> Regards,
> Luben
>
>
>> Regards,
>> Christian.
>>
>>> I can submit those changes without the last part,
>>> which builds on this change.
>>>
>>> I'm still testing the last part and was hoping
>>> to submit it all in one sequence of patches,
>>> after my testing.
>>>
>>> Regards,
>>> Luben
>>>
>>>> Christian.
>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> Christian.
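
As a side note on the ordering Andrey describes above: once drm_dev_unplug()
has run, the drm_dev_enter() guard in the timeout handler fails, and the
remaining paths (job cleanup, starting new jobs) end when the scheduler
threads are stopped during teardown. A condensed sketch of the removal path
follows; the function names are real, the bodies heavily simplified.

static void amdgpu_pci_remove(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/* After this point drm_dev_enter() fails, so a pending or
	 * rearmed timeout handler skips recovery entirely.
	 */
	drm_dev_unplug(dev);

	/* The rest of teardown eventually stops each ring's scheduler
	 * thread via drm_sched_fini(&ring->sched), which ends job
	 * cleanup and the starting of new jobs.
	 */
	amdgpu_driver_unload_kms(dev);
}
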
Christian König Nov. 24, 2020, 5:40 p.m. UTC | #8
On 24.11.20 at 18:11, Luben Tuikov wrote:
> On 2020-11-24 2:50 a.m., Christian König wrote:
>> On 24.11.20 at 02:12, Luben Tuikov wrote:
>>> On 2020-11-23 3:06 a.m., Christian König wrote:
>>>> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>>>>> No point to try recovery if device is gone, it's meaningless.
>>>>>> I think that this should go into the device specific recovery
>>>>>> function and not in the scheduler.
>>>>> The timeout timer is rearmed here, so this prevents any new recovery
>>>>> work from restarting from here
>>>>> after drm_dev_unplug was executed from amdgpu_pci_remove. It will not
>>>>> cover other places like
>>>>> job cleanup or starting new jobs, but those should stop once the
>>>>> scheduler thread is stopped later.
>>>> Yeah, but this is rather unclean. We should probably return an error
>>>> code instead, indicating whether the timer should be rearmed or not.
>>> Christian, this is exactly my work, which I told you about
>>> last week on Wednesday in our weekly meeting, and
>>> which I wrote to you about in an email around this
>>> time last year.
>> Yeah, that's why I'm suggesting it here as well.
> It seems you're suggesting that Andrey do it, while
> all too well you know I've been working on this
> for some time now.

Changing the return value is just a minimal change and I didn't want to 
block Andrey in any way.

>
> I wrote to you about this in an email around
> this same time last year. And I discussed it
> at the Wednesday meeting.
>
> You could've mentioned that here the first time.
>
>>> So what do we do now?
>> Split your patches into smaller parts and submit them chunk by chunk.
>>
>> E.g. renames first and then functional changes grouped by area they change.
> I have, but my final patch, a tiny one that implements
> the core reason for the change, seems buggy, and I'm looking
> for a way to debug it.

Just send it out in chunks, e.g. non-functional changes like renames
shouldn't cause any problems, and having them in the branch early
minimizes conflicts with work from others.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>> Regards,
>> Christian.
>>
>>> I can submit those changes without the last part,
>>> which builds on this change.
>>>
>>> I'm still testing the last part and was hoping
>>> to submit it all in one sequence of patches,
>>> after my testing.
>>>
>>> Regards,
>>> Luben
>>>
>>>> Christian.
>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> Christian.
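
To make the return-value idea concrete, a rough sketch of what the changed
contract could look like: timedout_job() reports back whether the device is
still there, and the scheduler rearms the timeout timer only in that case.
The enum and the modified callback signature are hypothetical; no such API
existed in the tree at the time of this thread.

/* Hypothetical sketch: timedout_job() returns a status instead of void,
 * and the scheduler rearms the timer only on a nominal result.
 */
enum drm_sched_status {
	DRM_SCHED_STAT_NOMINAL,	/* recovery ran, rearm the timer */
	DRM_SCHED_STAT_ENODEV,	/* device is gone, do not rearm */
};

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	enum drm_sched_status status = DRM_SCHED_STAT_NOMINAL;
	struct drm_sched_job *job;

	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	spin_unlock(&sched->job_list_lock);

	if (job)
		status = sched->ops->timedout_job(job);

	/* Rearm only when the driver reports the device is alive. */
	if (status == DRM_SCHED_STAT_NOMINAL) {
		spin_lock(&sched->job_list_lock);
		drm_sched_start_timeout(sched);
		spin_unlock(&sched->job_list_lock);
	}
}
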
Luben Tuikov Nov. 24, 2020, 5:41 p.m. UTC | #9
On 2020-11-24 12:17 p.m., Andrey Grodzovsky wrote:
> 
> On 11/24/20 12:11 PM, Luben Tuikov wrote:
>> On 2020-11-24 2:50 a.m., Christian König wrote:
>>> On 24.11.20 at 02:12, Luben Tuikov wrote:
>>>> On 2020-11-23 3:06 a.m., Christian König wrote:
>>>>> On 23.11.20 at 06:37, Andrey Grodzovsky wrote:
>>>>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>>>>> On 21.11.20 at 06:21, Andrey Grodzovsky wrote:
>>>>>>>> No point to try recovery if device is gone, it's meaningless.
>>>>>>> I think that this should go into the device specific recovery
>>>>>>> function and not in the scheduler.
>>>>>> The timeout timer is rearmed here, so this prevents any new recovery
>>>>>> work from restarting from here
>>>>>> after drm_dev_unplug was executed from amdgpu_pci_remove. It will not
>>>>>> cover other places like
>>>>>> job cleanup or starting new jobs, but those should stop once the
>>>>>> scheduler thread is stopped later.
>>>>> Yeah, but this is rather unclean. We should probably return an error
>>>>> code instead, indicating whether the timer should be rearmed or not.
>>>> Christian, this is exactly my work, which I told you about
>>>> last week on Wednesday in our weekly meeting, and
>>>> which I wrote to you about in an email around this
>>>> time last year.
>>> Yeah, that's why I'm suggesting it here as well.
>> It seems you're suggesting that Andrey do it, while
>> all too well you know I've been working on this
>> for some time now.
>>
>> I wrote to you about this in an email around
>> this same time last year. And I discussed it
>> at the Wednesday meeting.
>>
>> You could've mentioned that here the first time.
> 
> 
> Luben, I actually strongly prefer that you do it and share your patch with
> me, since I don't want to do unneeded refactoring which will conflict with
> your work. Also, please use drm-misc for this since it's not amdgpu-specific
> work and will be easier for me.
> 
> Andrey

No problem, Andrey--will do.

Regards,
Luben

> 
> 
>>
>>>> So what do we do now?
>>> Split your patches into smaller parts and submit them chunk by chunk.
>>>
>>> E.g. renames first and then functional changes grouped by area they change.
>> I have, but my final patch, a tiny one that implements
>> the core reason for the change, seems buggy, and I'm looking
>> for a way to debug it.
>>
>> Regards,
>> Luben
>>
>>
>>> Regards,
>>> Christian.
>>>
>>>> I can submit those changes without the last part,
>>>> which builds on this change.
>>>>
>>>> I'm still testing the last part and was hoping
>>>> to submit it all in one sequence of patches,
>>>> after my testing.
>>>>
>>>> Regards,
>>>> Luben
>>>>
>>>>> Christian.
>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Christian.
Luben Tuikov Nov. 24, 2020, 5:44 p.m. UTC | #10
On 2020-11-24 12:40 p.m., Christian König wrote:
> Am 24.11.20 um 18:11 schrieb Luben Tuikov:
>> On 2020-11-24 2:50 a.m., Christian König wrote:
>>> Am 24.11.20 um 02:12 schrieb Luben Tuikov:
>>>> On 2020-11-23 3:06 a.m., Christian König wrote:
>>>>> Am 23.11.20 um 06:37 schrieb Andrey Grodzovsky:
>>>>>> On 11/22/20 6:57 AM, Christian König wrote:
>>>>>>> Am 21.11.20 um 06:21 schrieb Andrey Grodzovsky:
>>>>>>>> No point to try recovery if device is gone, it's meaningless.
>>>>>>> I think that this should go into the device-specific recovery
>>>>>>> function and not into the scheduler.
>>>>>> The timeout timer is rearmed here, so this prevents any new recovery
>>>>>> work from restarting from this point
>>>>>> after drm_dev_unplug was executed from amdgpu_pci_remove. It will not
>>>>>> cover other places like
>>>>>> job cleanup or starting a new job, but those should stop once the
>>>>>> scheduler thread is stopped later.
>>>>> Yeah, but this is rather unclean. We should probably return an error
>>>>> code instead, to indicate whether the timer should be rearmed or not.
>>>> Christian, this is exactly the work I told you about
>>>> last Wednesday in our weekly meeting, and
>>>> which I wrote to you about in an email around
>>>> this time last year.
>>> Yeah, that's why I'm suggesting it here as well.
>> It seems you're suggesting that Andrey do it, while
>> you know all too well that I've been working on this
>> for some time now.
> 
> Changing the return value is just a minimal change and I didn't want to 
> block Andrey in any way.
> 

But it is the suggestion I had at this time last year.
It is the whole root of my changes--it's a game-changer.

>>
>> I wrote to you about this in an email around
>> the same time last year, and I discussed it at
>> the Wednesday meeting.
>>
>> You could've mentioned that here the first time.
>>
>>>> So what do we do now?
>>> Split your patches into smaller parts and submit them chunk by chunk.
>>>
>>> E.g. renames first, and then functional changes grouped by the area they change.
>> I have, but my final patch, a tiny one which implements
>> the core reason for the change, seems buggy, and I'm looking
>> for a way to debug it.
> 
> Just send it out in chunks; e.g., non-functional changes like renames
> shouldn't cause any problems, and having them in the branch early
> minimizes conflicts with work from others.

Yeah, I agree, that's a good idea.

My final tiny patch is causing me grief and I'd rather
have had it working. :'-(

Regards,
Luben

> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>> Regards,
>>> Christian.
>>>
>>>> I can submit those changes without the last part,
>>>> which builds on this change.
>>>>
>>>> I'm still testing the last part and was hoping
>>>> to submit it all in one sequence of patches,
>>>> after my testing.
>>>>
>>>> Regards,
>>>> Luben
>>>>
>>>>> Christian.
>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>> [full patch quoted in the reply and mailing list footers trimmed; the complete diff appears in the Patch section below]
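Christian's first suggestion above is that the unplug check belongs in the driver's own timeout handler rather than in the scheduler core. Below is a minimal sketch of that placement, using amdgpu's timedout_job callback as the example; the drm_dev_enter()/drm_dev_exit() guard is the same one this patch adds to drm_sched_job_timedout(), but the surrounding function body here is illustrative rather than the actual amdgpu code:

static void amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	int idx;

	/* Bail out before touching the hardware: drm_dev_enter() fails
	 * once drm_dev_unplug() has been called on the device.
	 */
	if (!drm_dev_enter(&ring->adev->ddev, &idx)) {
		DRM_INFO("%s - device unplugged, skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);
		return;
	}

	/* Device-specific hang analysis and GPU reset happen here. */
	amdgpu_device_gpu_recover(ring->adev, job);

	drm_dev_exit(idx);
}

The upside of this placement is that the scheduler core stays device-agnostic; the downside, which Andrey points out, is that the scheduler would still rearm the timeout timer unless it is told otherwise--which is what the return-code idea sketched after the patch below addresses.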
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index d56f402..d0b0021 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -487,7 +487,7 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 
 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 				   num_hw_submission, amdgpu_job_hang_limit,
-				   timeout, ring->name);
+				   timeout, ring->name, &adev->ddev);
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index cd46c88..7678287 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -185,7 +185,8 @@  int etnaviv_sched_init(struct etnaviv_gpu *gpu)
 
 	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
 			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
-			     msecs_to_jiffies(500), dev_name(gpu->dev));
+			     msecs_to_jiffies(500), dev_name(gpu->dev),
+			     gpu->drm);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index dc6df9e..8a7e5d7ca 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -505,7 +505,8 @@  int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
 
 	return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
 			      lima_job_hang_limit, msecs_to_jiffies(timeout),
-			      name);
+			      name,
+			      pipe->ldev->ddev);
 }
 
 void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 30e7b71..37b03b01 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -520,7 +520,7 @@  int panfrost_job_init(struct panfrost_device *pfdev)
 		ret = drm_sched_init(&js->queue[j].sched,
 				     &panfrost_sched_ops,
 				     1, 0, msecs_to_jiffies(500),
-				     "pan_js");
+				     "pan_js", pfdev->ddev);
 		if (ret) {
 			dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
 			goto err_sched;
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c3f0bd0..95db8c6 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -53,6 +53,7 @@ 
 #include <drm/drm_print.h>
 #include <drm/gpu_scheduler.h>
 #include <drm/spsc_queue.h>
+#include <drm/drm_drv.h>
 
 #define CREATE_TRACE_POINTS
 #include "gpu_scheduler_trace.h"
@@ -283,8 +284,16 @@  static void drm_sched_job_timedout(struct work_struct *work)
 	struct drm_gpu_scheduler *sched;
 	struct drm_sched_job *job;
 
+	int idx;
+
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 
+	if (!drm_dev_enter(sched->ddev, &idx)) {
+		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
+			 __func__, sched->name);
+		return;
+	}
+
 	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
 	spin_lock(&sched->job_list_lock);
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
@@ -316,6 +325,8 @@  static void drm_sched_job_timedout(struct work_struct *work)
 	spin_lock(&sched->job_list_lock);
 	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
+
+	drm_dev_exit(idx);
 }
 
  /**
@@ -845,7 +856,8 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   unsigned hw_submission,
 		   unsigned hang_limit,
 		   long timeout,
-		   const char *name)
+		   const char *name,
+		   struct drm_device *ddev)
 {
 	int i, ret;
 	sched->ops = ops;
@@ -853,6 +865,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	sched->name = name;
 	sched->timeout = timeout;
 	sched->hang_limit = hang_limit;
+	sched->ddev = ddev;
 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
 		drm_sched_rq_init(sched, &sched->sched_rq[i]);
 
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 0747614..f5076e5 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -401,7 +401,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 			     &v3d_bin_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms),
-			     "v3d_bin");
+			     "v3d_bin",
+			     &v3d->drm);
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
 		return ret;
@@ -411,7 +412,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 			     &v3d_render_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms),
-			     "v3d_render");
+			     "v3d_render",
+			     &v3d->drm);
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
 			ret);
@@ -423,7 +425,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 			     &v3d_tfu_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms),
-			     "v3d_tfu");
+			     "v3d_tfu",
+			     &v3d->drm);
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
 			ret);
@@ -436,7 +439,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 				     &v3d_csd_sched_ops,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms),
-				     "v3d_csd");
+				     "v3d_csd",
+				     &v3d->drm);
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
 				ret);
@@ -448,7 +452,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 				     &v3d_cache_clean_sched_ops,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms),
-				     "v3d_cache_clean");
+				     "v3d_cache_clean",
+				     &v3d->drm);
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
 				ret);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 9243655..a980709 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -32,6 +32,7 @@ 
 
 struct drm_gpu_scheduler;
 struct drm_sched_rq;
+struct drm_device;
 
 /* These are often used as an (initial) index
  * to an array, and as such should start at 0.
@@ -267,6 +268,7 @@  struct drm_sched_backend_ops {
  * @score: score to help loadbalancer pick a idle sched
  * @ready: marks if the underlying HW is ready to work
  * @free_guilty: A hit to time out handler to free the guilty job.
+ * @ddev: Pointer to drm device of this scheduler.
  *
  * One scheduler is implemented for each hardware ring.
  */
@@ -288,12 +290,14 @@  struct drm_gpu_scheduler {
 	atomic_t                        score;
 	bool				ready;
 	bool				free_guilty;
+	struct drm_device		*ddev;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   uint32_t hw_submission, unsigned hang_limit, long timeout,
-		   const char *name);
+		   const char *name,
+		   struct drm_device *ddev);
 
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
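
On Christian's second suggestion--returning an error code so the scheduler knows whether to rearm the timer--here is a rough sketch of how drm_sched_job_timedout() could consume such a status. The enum and the changed return type of timedout_job() are hypothetical (at the time of this series the callback returns void), and the handler body is simplified down to the rearm decision:

/* Hypothetical status for the timeout handler; not part of this patch. */
enum drm_sched_stat {
	DRM_SCHED_STAT_NOMINAL,	/* device recovered; rearm the timer */
	DRM_SCHED_STAT_ENODEV,	/* device is gone; do not rearm */
};

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;
	enum drm_sched_stat status = DRM_SCHED_STAT_ENODEV;

	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->ring_mirror_list,
				       struct drm_sched_job, node);
	spin_unlock(&sched->job_list_lock);

	if (job)
		status = job->sched->ops->timedout_job(job);

	/* Rearm the timeout timer only while the device is still present. */
	if (status == DRM_SCHED_STAT_NOMINAL) {
		spin_lock(&sched->job_list_lock);
		drm_sched_start_timeout(sched);
		spin_unlock(&sched->job_list_lock);
	}
}

With this shape the scheduler core would need no drm_device pointer at all: the driver's callback makes the unplug decision (for example via the drm_dev_enter() check sketched earlier) and simply reports the outcome.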