diff mbox series

[1/3] drm/sched: Add callback to mark if sched is ready to work.

Message ID 1539888261-331-1-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/sched: Add callback to mark if sched is ready to work. | expand

Commit Message

Andrey Grodzovsky Oct. 18, 2018, 6:44 p.m. UTC
Problem:
A particular scheduler may become unsuable (underlying HW) after
some event (e.g. GPU reset). If it's later chosen by
the get free sched. policy a command will fail to be
submitted.

Fix:
Add a driver specific callback to report the sced. status so
rq with bad sched. can be avoided in favor of working one or
none in which case job init will fail.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  9 ++++++++-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
 drivers/gpu/drm/scheduler/sched_entity.c  | 11 ++++++++++-
 drivers/gpu/drm/scheduler/sched_main.c    |  8 +++++++-
 drivers/gpu/drm/v3d/v3d_sched.c           |  3 ++-
 include/drm/gpu_scheduler.h               |  5 ++++-
 6 files changed, 33 insertions(+), 6 deletions(-)

Comments

Christian König Oct. 19, 2018, 7:03 a.m. UTC | #1
Am 18.10.18 um 20:44 schrieb Andrey Grodzovsky:
> Problem:
> A particular scheduler may become unsuable (underlying HW) after
> some event (e.g. GPU reset). If it's later chosen by
> the get free sched. policy a command will fail to be
> submitted.
>
> Fix:
> Add a driver specific callback to report the sced. status so
> rq with bad sched. can be avoided in favor of working one or
> none in which case job init will fail.

I would rather go with the approach of moving the ready flag completely 
into the scheduler instead.

Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  9 ++++++++-
>   drivers/gpu/drm/etnaviv/etnaviv_sched.c   |  3 ++-
>   drivers/gpu/drm/scheduler/sched_entity.c  | 11 ++++++++++-
>   drivers/gpu/drm/scheduler/sched_main.c    |  8 +++++++-
>   drivers/gpu/drm/v3d/v3d_sched.c           |  3 ++-
>   include/drm/gpu_scheduler.h               |  5 ++++-
>   6 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 5448cf2..bbfe7f501 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -404,6 +404,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   	return 0;
>   }
>   
> +static bool amdgpu_ring_ready(struct drm_gpu_scheduler *sched)
> +{
> +	struct amdgpu_ring *ring = to_amdgpu_ring(sched);
> +
> +	return ring->ready;
> +}
> +
>   /**
>    * amdgpu_fence_driver_init_ring - init the fence driver
>    * for the requested ring.
> @@ -450,7 +457,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   
>   		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>   				   num_hw_submission, amdgpu_job_hang_limit,
> -				   timeout, ring->name);
> +				   timeout, ring->name, amdgpu_ring_ready);
>   		if (r) {
>   			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>   				  ring->name);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index f8c5f1e..5094013 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -178,7 +178,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>   
>   	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>   			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> -			     msecs_to_jiffies(500), dev_name(gpu->dev));
> +			     msecs_to_jiffies(500), dev_name(gpu->dev),
> +				 NULL);
>   	if (ret)
>   		return ret;
>   
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 3e22a54..320c77a 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -130,7 +130,16 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
>   	int i;
>   
>   	for (i = 0; i < entity->num_rq_list; ++i) {
> -		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
> +		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
> +
> +		if (entity->rq_list[i]->sched->ready &&
> +				!entity->rq_list[i]->sched->ready(sched)) {
> +			DRM_WARN("sched%s is not ready, skipping", sched->name);
> +
> +			continue;
> +		}
> +
> +		num_jobs = atomic_read(&sched->num_jobs);
>   		if (num_jobs < min_jobs) {
>   			min_jobs = num_jobs;
>   			rq = entity->rq_list[i];
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 63b997d..6b151f2 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -420,6 +420,9 @@ int drm_sched_job_init(struct drm_sched_job *job,
>   	struct drm_gpu_scheduler *sched;
>   
>   	drm_sched_entity_select_rq(entity);
> +	if (!entity->rq)
> +		return -ENOENT;
> +
>   	sched = entity->rq->sched;
>   
>   	job->sched = sched;
> @@ -606,7 +609,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   unsigned hw_submission,
>   		   unsigned hang_limit,
>   		   long timeout,
> -		   const char *name)
> +		   const char *name,
> +		   bool (*ready)(struct drm_gpu_scheduler *sched))
>   {
>   	int i;
>   	sched->ops = ops;
> @@ -633,6 +637,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		return PTR_ERR(sched->thread);
>   	}
>   
> +	sched->ready = ready;
> +
>   	return 0;
>   }
>   EXPORT_SYMBOL(drm_sched_init);
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 80b641f..e06cc0d 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -212,7 +212,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>   			     &v3d_sched_ops,
>   			     hw_jobs_limit, job_hang_limit,
>   			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_bin");
> +			     "v3d_bin",
> +				 NULL);
>   	if (ret) {
>   		dev_err(v3d->dev, "Failed to create bin scheduler: %d.", ret);
>   		return ret;
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 0684dcd..a6e18fe 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -283,12 +283,15 @@ struct drm_gpu_scheduler {
>   	spinlock_t			job_list_lock;
>   	int				hang_limit;
>   	atomic_t                        num_jobs;
> +
> +	bool 			(*ready)(struct drm_gpu_scheduler *sched);
>   };
>   
>   int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		   const struct drm_sched_backend_ops *ops,
>   		   uint32_t hw_submission, unsigned hang_limit, long timeout,
> -		   const char *name);
> +		   const char *name,
> +		   bool (*ready)(struct drm_gpu_scheduler *sched));
>   void drm_sched_fini(struct drm_gpu_scheduler *sched);
>   int drm_sched_job_init(struct drm_sched_job *job,
>   		       struct drm_sched_entity *entity,
Michel Dänzer Oct. 19, 2018, 7:13 a.m. UTC | #2
On 2018-10-18 8:44 p.m., Andrey Grodzovsky wrote:
> Problem:
> A particular scheduler may become unsuable (underlying HW) after
> some event (e.g. GPU reset). If it's later chosen by
> the get free sched. policy a command will fail to be
> submitted.
> 
> Fix:
> Add a driver specific callback to report the sced. status so
> rq with bad sched. can be avoided in favor of working one or
> none in which case job init will fail.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> 
> [...]
>  
> @@ -450,7 +457,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>  
>  		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>  				   num_hw_submission, amdgpu_job_hang_limit,
> -				   timeout, ring->name);
> +				   timeout, ring->name, amdgpu_ring_ready);
>  		if (r) {
>  			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>  				  ring->name);
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index f8c5f1e..5094013 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -178,7 +178,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>  
>  	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>  			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
> -			     msecs_to_jiffies(500), dev_name(gpu->dev));
> +			     msecs_to_jiffies(500), dev_name(gpu->dev),
> +				 NULL);

Indentation looks wrong for the NULL here...


> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 3e22a54..320c77a 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -130,7 +130,16 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
>  	int i;
>  
>  	for (i = 0; i < entity->num_rq_list; ++i) {
> -		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
> +		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
> +
> +		if (entity->rq_list[i]->sched->ready &&
> +				!entity->rq_list[i]->sched->ready(sched)) {

... the second line of the if here...


> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 80b641f..e06cc0d 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -212,7 +212,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>  			     &v3d_sched_ops,
>  			     hw_jobs_limit, job_hang_limit,
>  			     msecs_to_jiffies(hang_limit_ms),
> -			     "v3d_bin");
> +			     "v3d_bin",
> +				 NULL);

... the NULL here...


> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 0684dcd..a6e18fe 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -283,12 +283,15 @@ struct drm_gpu_scheduler {
>  	spinlock_t			job_list_lock;
>  	int				hang_limit;
>  	atomic_t                        num_jobs;
> +
> +	bool 			(*ready)(struct drm_gpu_scheduler *sched);
>  };

... and the new line here.

Which editor are you using?
Andrey Grodzovsky Oct. 19, 2018, 2:45 p.m. UTC | #3
Eclipse

Andrey


On 10/19/2018 03:13 AM, Michel Dänzer wrote:
> ... and the new line here.
>
> Which editor are you using?
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5448cf2..bbfe7f501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -404,6 +404,13 @@  int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 	return 0;
 }
 
+static bool amdgpu_ring_ready(struct drm_gpu_scheduler *sched)
+{
+	struct amdgpu_ring *ring = to_amdgpu_ring(sched);
+
+	return ring->ready;
+}
+
 /**
  * amdgpu_fence_driver_init_ring - init the fence driver
  * for the requested ring.
@@ -450,7 +457,7 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 
 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 				   num_hw_submission, amdgpu_job_hang_limit,
-				   timeout, ring->name);
+				   timeout, ring->name, amdgpu_ring_ready);
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index f8c5f1e..5094013 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -178,7 +178,8 @@  int etnaviv_sched_init(struct etnaviv_gpu *gpu)
 
 	ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
 			     etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
-			     msecs_to_jiffies(500), dev_name(gpu->dev));
+			     msecs_to_jiffies(500), dev_name(gpu->dev),
+				 NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 3e22a54..320c77a 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -130,7 +130,16 @@  drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
 	int i;
 
 	for (i = 0; i < entity->num_rq_list; ++i) {
-		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
+		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
+
+		if (entity->rq_list[i]->sched->ready &&
+				!entity->rq_list[i]->sched->ready(sched)) {
+			DRM_WARN("sched%s is not ready, skipping", sched->name);
+
+			continue;
+		}
+
+		num_jobs = atomic_read(&sched->num_jobs);
 		if (num_jobs < min_jobs) {
 			min_jobs = num_jobs;
 			rq = entity->rq_list[i];
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 63b997d..6b151f2 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -420,6 +420,9 @@  int drm_sched_job_init(struct drm_sched_job *job,
 	struct drm_gpu_scheduler *sched;
 
 	drm_sched_entity_select_rq(entity);
+	if (!entity->rq)
+		return -ENOENT;
+
 	sched = entity->rq->sched;
 
 	job->sched = sched;
@@ -606,7 +609,8 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   unsigned hw_submission,
 		   unsigned hang_limit,
 		   long timeout,
-		   const char *name)
+		   const char *name,
+		   bool (*ready)(struct drm_gpu_scheduler *sched))
 {
 	int i;
 	sched->ops = ops;
@@ -633,6 +637,8 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 		return PTR_ERR(sched->thread);
 	}
 
+	sched->ready = ready;
+
 	return 0;
 }
 EXPORT_SYMBOL(drm_sched_init);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 80b641f..e06cc0d 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -212,7 +212,8 @@  v3d_sched_init(struct v3d_dev *v3d)
 			     &v3d_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms),
-			     "v3d_bin");
+			     "v3d_bin",
+				 NULL);
 	if (ret) {
 		dev_err(v3d->dev, "Failed to create bin scheduler: %d.", ret);
 		return ret;
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0684dcd..a6e18fe 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -283,12 +283,15 @@  struct drm_gpu_scheduler {
 	spinlock_t			job_list_lock;
 	int				hang_limit;
 	atomic_t                        num_jobs;
+
+	bool 			(*ready)(struct drm_gpu_scheduler *sched);
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   uint32_t hw_submission, unsigned hang_limit, long timeout,
-		   const char *name);
+		   const char *name,
+		   bool (*ready)(struct drm_gpu_scheduler *sched));
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,