diff mbox series

[4/4] drm/panthor: Call panthor_sched_post_reset() even if the reset failed

Message ID 20240502183813.1612017-5-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Headers show
Series drm/panthor: More reset fixes | expand

Commit Message

Boris Brezillon May 2, 2024, 6:38 p.m. UTC
We need to undo what was done in panthor_sched_pre_reset() even if the
reset failed. When that happens, we just flag all previously running
groups as terminated to unblock things.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_device.c |  7 +------
 drivers/gpu/drm/panthor/panthor_sched.c  | 19 ++++++++++++++-----
 drivers/gpu/drm/panthor/panthor_sched.h  |  2 +-
 3 files changed, 16 insertions(+), 12 deletions(-)

Comments

Steven Price May 3, 2024, 9:22 a.m. UTC | #1
On 02/05/2024 19:38, Boris Brezillon wrote:
> We need to undo what was done in panthor_sched_pre_reset() even if the
> reset failed. We just flag all previously running groups as terminated
> when that happens to unblock things.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Seems reasonable, although I hope this case doesn't happen in practice ;)

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_device.c |  7 +------
>  drivers/gpu/drm/panthor/panthor_sched.c  | 19 ++++++++++++++-----
>  drivers/gpu/drm/panthor/panthor_sched.h  |  2 +-
>  3 files changed, 16 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index 4c5b54e7abb7..4082c8f2951d 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -129,13 +129,8 @@ static void panthor_device_reset_work(struct work_struct *work)
>  	panthor_gpu_l2_power_on(ptdev);
>  	panthor_mmu_post_reset(ptdev);
>  	ret = panthor_fw_post_reset(ptdev);
> -	if (ret)
> -		goto out_dev_exit;
> -
>  	atomic_set(&ptdev->reset.pending, 0);
> -	panthor_sched_post_reset(ptdev);
> -
> -out_dev_exit:
> +	panthor_sched_post_reset(ptdev, ret != 0);
>  	drm_dev_exit(cookie);
>  
>  	if (ret) {
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 6ea094b00cf9..fc43ff62c77d 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2728,15 +2728,22 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
>  	mutex_unlock(&sched->reset.lock);
>  }
>  
> -void panthor_sched_post_reset(struct panthor_device *ptdev)
> +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
>  {
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_group *group, *group_tmp;
>  
>  	mutex_lock(&sched->reset.lock);
>  
> -	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node)
> +	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) {
> +		/* Consider all previously running group as terminated if the
> +		 * reset failed.
> +		 */
> +		if (reset_failed)
> +			group->state = PANTHOR_CS_GROUP_TERMINATED;
> +
>  		panthor_group_start(group);
> +	}
>  
>  	/* We're done resetting the GPU, clear the reset.in_progress bit so we can
>  	 * kick the scheduler.
> @@ -2744,9 +2751,11 @@ void panthor_sched_post_reset(struct panthor_device *ptdev)
>  	atomic_set(&sched->reset.in_progress, false);
>  	mutex_unlock(&sched->reset.lock);
>  
> -	sched_queue_delayed_work(sched, tick, 0);
> -
> -	sched_queue_work(sched, sync_upd);
> +	/* No need to queue a tick and update syncs if the reset failed. */
> +	if (!reset_failed) {
> +		sched_queue_delayed_work(sched, tick, 0);
> +		sched_queue_work(sched, sync_upd);
> +	}
>  }
>  
>  static void group_sync_upd_work(struct work_struct *work)
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> index 66438b1f331f..3a30d2328b30 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.h
> +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> @@ -40,7 +40,7 @@ void panthor_group_pool_destroy(struct panthor_file *pfile);
>  int panthor_sched_init(struct panthor_device *ptdev);
>  void panthor_sched_unplug(struct panthor_device *ptdev);
>  void panthor_sched_pre_reset(struct panthor_device *ptdev);
> -void panthor_sched_post_reset(struct panthor_device *ptdev);
> +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed);
>  void panthor_sched_suspend(struct panthor_device *ptdev);
>  void panthor_sched_resume(struct panthor_device *ptdev);
>
Liviu Dudau May 3, 2024, 11:49 a.m. UTC | #2
On Thu, May 02, 2024 at 08:38:12PM +0200, Boris Brezillon wrote:
> We need to undo what was done in panthor_sched_pre_reset() even if the
> reset failed. We just flag all previously running groups as terminated
> when that happens to unblock things.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_device.c |  7 +------
>  drivers/gpu/drm/panthor/panthor_sched.c  | 19 ++++++++++++++-----
>  drivers/gpu/drm/panthor/panthor_sched.h  |  2 +-
>  3 files changed, 16 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index 4c5b54e7abb7..4082c8f2951d 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -129,13 +129,8 @@ static void panthor_device_reset_work(struct work_struct *work)
>  	panthor_gpu_l2_power_on(ptdev);
>  	panthor_mmu_post_reset(ptdev);
>  	ret = panthor_fw_post_reset(ptdev);
> -	if (ret)
> -		goto out_dev_exit;
> -
>  	atomic_set(&ptdev->reset.pending, 0);
> -	panthor_sched_post_reset(ptdev);
> -
> -out_dev_exit:
> +	panthor_sched_post_reset(ptdev, ret != 0);
>  	drm_dev_exit(cookie);
>  
>  	if (ret) {
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 6ea094b00cf9..fc43ff62c77d 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2728,15 +2728,22 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
>  	mutex_unlock(&sched->reset.lock);
>  }
>  
> -void panthor_sched_post_reset(struct panthor_device *ptdev)
> +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
>  {
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_group *group, *group_tmp;
>  
>  	mutex_lock(&sched->reset.lock);
>  
> -	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node)
> +	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) {
> +		/* Consider all previously running group as terminated if the
> +		 * reset failed.
> +		 */
> +		if (reset_failed)
> +			group->state = PANTHOR_CS_GROUP_TERMINATED;
> +
>  		panthor_group_start(group);
> +	}
>  
>  	/* We're done resetting the GPU, clear the reset.in_progress bit so we can
>  	 * kick the scheduler.
> @@ -2744,9 +2751,11 @@ void panthor_sched_post_reset(struct panthor_device *ptdev)
>  	atomic_set(&sched->reset.in_progress, false);
>  	mutex_unlock(&sched->reset.lock);
>  
> -	sched_queue_delayed_work(sched, tick, 0);
> -
> -	sched_queue_work(sched, sync_upd);
> +	/* No need to queue a tick and update syncs if the reset failed. */
> +	if (!reset_failed) {
> +		sched_queue_delayed_work(sched, tick, 0);
> +		sched_queue_work(sched, sync_upd);
> +	}
>  }
>  
>  static void group_sync_upd_work(struct work_struct *work)
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> index 66438b1f331f..3a30d2328b30 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.h
> +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> @@ -40,7 +40,7 @@ void panthor_group_pool_destroy(struct panthor_file *pfile);
>  int panthor_sched_init(struct panthor_device *ptdev);
>  void panthor_sched_unplug(struct panthor_device *ptdev);
>  void panthor_sched_pre_reset(struct panthor_device *ptdev);
> -void panthor_sched_post_reset(struct panthor_device *ptdev);
> +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed);
>  void panthor_sched_suspend(struct panthor_device *ptdev);
>  void panthor_sched_resume(struct panthor_device *ptdev);
>  
> -- 
> 2.44.0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index 4c5b54e7abb7..4082c8f2951d 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -129,13 +129,8 @@  static void panthor_device_reset_work(struct work_struct *work)
 	panthor_gpu_l2_power_on(ptdev);
 	panthor_mmu_post_reset(ptdev);
 	ret = panthor_fw_post_reset(ptdev);
-	if (ret)
-		goto out_dev_exit;
-
 	atomic_set(&ptdev->reset.pending, 0);
-	panthor_sched_post_reset(ptdev);
-
-out_dev_exit:
+	panthor_sched_post_reset(ptdev, ret != 0);
 	drm_dev_exit(cookie);
 
 	if (ret) {
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 6ea094b00cf9..fc43ff62c77d 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2728,15 +2728,22 @@  void panthor_sched_pre_reset(struct panthor_device *ptdev)
 	mutex_unlock(&sched->reset.lock);
 }
 
-void panthor_sched_post_reset(struct panthor_device *ptdev)
+void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
 {
 	struct panthor_scheduler *sched = ptdev->scheduler;
 	struct panthor_group *group, *group_tmp;
 
 	mutex_lock(&sched->reset.lock);
 
-	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node)
+	list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) {
+		/* Consider all previously running groups as terminated if the
+		 * reset failed.
+		 */
+		if (reset_failed)
+			group->state = PANTHOR_CS_GROUP_TERMINATED;
+
 		panthor_group_start(group);
+	}
 
 	/* We're done resetting the GPU, clear the reset.in_progress bit so we can
 	 * kick the scheduler.
@@ -2744,9 +2751,11 @@  void panthor_sched_post_reset(struct panthor_device *ptdev)
 	atomic_set(&sched->reset.in_progress, false);
 	mutex_unlock(&sched->reset.lock);
 
-	sched_queue_delayed_work(sched, tick, 0);
-
-	sched_queue_work(sched, sync_upd);
+	/* No need to queue a tick and update syncs if the reset failed. */
+	if (!reset_failed) {
+		sched_queue_delayed_work(sched, tick, 0);
+		sched_queue_work(sched, sync_upd);
+	}
 }
 
 static void group_sync_upd_work(struct work_struct *work)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 66438b1f331f..3a30d2328b30 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -40,7 +40,7 @@  void panthor_group_pool_destroy(struct panthor_file *pfile);
 int panthor_sched_init(struct panthor_device *ptdev);
 void panthor_sched_unplug(struct panthor_device *ptdev);
 void panthor_sched_pre_reset(struct panthor_device *ptdev);
-void panthor_sched_post_reset(struct panthor_device *ptdev);
+void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed);
 void panthor_sched_suspend(struct panthor_device *ptdev);
 void panthor_sched_resume(struct panthor_device *ptdev);