diff mbox series

[v3,5/5] drm/panthor: Fix the fast-reset logic

Message ID 20241211075419.2333731-6-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Headers show
Series drm/panthor: Be robust against failures in the resume path | expand

Commit Message

Boris Brezillon Dec. 11, 2024, 7:54 a.m. UTC
If we do a GPU soft-reset, that's no longer a fast reset. This also means
the slow reset fallback doesn't work because the MCU state is only reset
after a GPU soft-reset.

Let's move the retry logic to panthor_device_resume() to issue a
soft-reset between the fast and slow attempts, and patch
panthor_gpu_suspend() to only power-off the L2 when a fast reset is
requested.

v3:
- No changes

v2:
- Add R-b

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_device.c | 32 ++++++++++----
 drivers/gpu/drm/panthor/panthor_device.h | 11 +++++
 drivers/gpu/drm/panthor/panthor_fw.c     | 54 ++++++------------------
 drivers/gpu/drm/panthor/panthor_gpu.c    | 11 ++---
 4 files changed, 53 insertions(+), 55 deletions(-)

Comments

Liviu Dudau Dec. 11, 2024, 9:57 a.m. UTC | #1
On Wed, Dec 11, 2024 at 08:54:19AM +0100, Boris Brezillon wrote:
> If we do a GPU soft-reset, that's no longer fast reset. This also means
> the slow reset fallback doesn't work because the MCU state is only reset
> after a GPU soft-reset.
> 
> Let's move the retry logic to panthor_device_resume() to issue a
> soft-reset between the fast and slow attempts, and patch
> panthor_gpu_suspend() to only power-off the L2 when a fast reset is
> requested.
> 
> v3:
> - No changes
> 
> v2:
> - Add R-b
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> Reviewed-by: Steven Price <steven.price@arm.com>

For reasons that are not yet clear to me, my v2 R-b emails seem not to have
reached you or the dri-devel mailing list.


> ---
>  drivers/gpu/drm/panthor/panthor_device.c | 32 ++++++++++----
>  drivers/gpu/drm/panthor/panthor_device.h | 11 +++++
>  drivers/gpu/drm/panthor/panthor_fw.c     | 54 ++++++------------------
>  drivers/gpu/drm/panthor/panthor_gpu.c    | 11 ++---
>  4 files changed, 53 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index 0362101ea896..2c817e65e6be 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -431,6 +431,22 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *
>  	return 0;
>  }
>  
> +static int panthor_device_resume_hw_components(struct panthor_device *ptdev)
> +{
> +	int ret;
> +
> +	panthor_gpu_resume(ptdev);
> +	panthor_mmu_resume(ptdev);
> +
> +	ret = panthor_fw_resume(ptdev);
> +	if (!ret)
> +		return 0;
> +
> +	panthor_mmu_suspend(ptdev);
> +	panthor_gpu_suspend(ptdev);
> +	return ret;

My only comment was a nit here where I prefer the construct:

	if (ret) {
		panthor_mmu_suspend(ptdev);
		panthor_gpu_suspend(ptdev);
	}

	return ret;

but feel free to ignore it.

For the whole series: Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

Best regards,
Liviu

> +}
> +
>  int panthor_device_resume(struct device *dev)
>  {
>  	struct panthor_device *ptdev = dev_get_drvdata(dev);
> @@ -457,16 +473,16 @@ int panthor_device_resume(struct device *dev)
>  
>  	if (panthor_device_is_initialized(ptdev) &&
>  	    drm_dev_enter(&ptdev->base, &cookie)) {
> -		panthor_gpu_resume(ptdev);
> -		panthor_mmu_resume(ptdev);
> -		ret = panthor_fw_resume(ptdev);
> -		if (!drm_WARN_ON(&ptdev->base, ret)) {
> -			panthor_sched_resume(ptdev);
> -		} else {
> -			panthor_mmu_suspend(ptdev);
> -			panthor_gpu_suspend(ptdev);
> +		ret = panthor_device_resume_hw_components(ptdev);
> +		if (ret && ptdev->reset.fast) {
> +			drm_err(&ptdev->base, "Fast reset failed, trying a slow reset");
> +			ptdev->reset.fast = false;
> +			ret = panthor_device_resume_hw_components(ptdev);
>  		}
>  
> +		if (!ret)
> +			panthor_sched_resume(ptdev);
> +
>  		drm_dev_exit(cookie);
>  
>  		if (ret)
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index b6c4f25a5d6e..da6574021664 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -157,6 +157,17 @@ struct panthor_device {
>  
>  		/** @pending: Set to true if a reset is pending. */
>  		atomic_t pending;
> +
> +		/**
> +		 * @fast: True if the post_reset logic can proceed with a fast reset.
> +		 *
> +		 * A fast reset is just a reset where the driver doesn't reload the FW sections.
> +		 *
> +		 * Any time the firmware is properly suspended, a fast reset can take place.
> +		 * On the other hand, if the halt operation failed, the driver will reload
> +		 * all FW sections to make sure we start from a fresh state.
> +		 */
> +		bool fast;
>  	} reset;
>  
>  	/** @pm: Power management related data. */
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
> index 02789558788d..5b68dc02b5ce 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.c
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c
> @@ -263,17 +263,6 @@ struct panthor_fw {
>  	/** @booted: True is the FW is booted */
>  	bool booted;
>  
> -	/**
> -	 * @fast_reset: True if the post_reset logic can proceed with a fast reset.
> -	 *
> -	 * A fast reset is just a reset where the driver doesn't reload the FW sections.
> -	 *
> -	 * Any time the firmware is properly suspended, a fast reset can take place.
> -	 * On the other hand, if the halt operation failed, the driver will reload
> -	 * all sections to make sure we start from a fresh state.
> -	 */
> -	bool fast_reset;
> -
>  	/** @irq: Job irq data. */
>  	struct panthor_irq irq;
>  };
> @@ -1090,7 +1079,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang)
>  	/* Make sure we won't be woken up by a ping. */
>  	cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work);
>  
> -	ptdev->fw->fast_reset = false;
> +	ptdev->reset.fast = false;
>  
>  	if (!on_hang) {
>  		struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
> @@ -1100,7 +1089,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang)
>  		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
>  		if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status,
>  					status == MCU_STATUS_HALT, 10, 100000)) {
> -			ptdev->fw->fast_reset = true;
> +			ptdev->reset.fast = true;
>  		} else {
>  			drm_warn(&ptdev->base, "Failed to cleanly suspend MCU");
>  		}
> @@ -1125,49 +1114,30 @@ int panthor_fw_post_reset(struct panthor_device *ptdev)
>  	if (ret)
>  		return ret;
>  
> -	/* If this is a fast reset, try to start the MCU without reloading
> -	 * the FW sections. If it fails, go for a full reset.
> -	 */
> -	if (ptdev->fw->fast_reset) {
> +	if (!ptdev->reset.fast) {
> +		/* On a slow reset, reload all sections, including RO ones.
> +		 * We're not supposed to end up here anyway, let's just assume
> +		 * the overhead of reloading everything is acceptable.
> +		 */
> +		panthor_reload_fw_sections(ptdev, true);
> +	} else {
>  		/* The FW detects 0 -> 1 transitions. Make sure we reset
>  		 * the HALT bit before the FW is rebooted.
>  		 * This is not needed on a slow reset because FW sections are
>  		 * re-initialized.
>  		 */
>  		struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
> +
>  		panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT);
> -
> -		ret = panthor_fw_start(ptdev);
> -		if (!ret)
> -			goto out;
> -
> -		/* Forcibly reset the MCU and force a slow reset, so we get a
> -		 * fresh boot on the next panthor_fw_start() call.
> -		 */
> -		panthor_fw_stop(ptdev);
> -		ptdev->fw->fast_reset = false;
> -		drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset");
> -
> -		ret = panthor_vm_flush_all(ptdev->fw->vm);
> -		if (ret) {
> -			drm_err(&ptdev->base, "FW slow reset failed (couldn't flush FW's AS l2cache)");
> -			return ret;
> -		}
>  	}
>  
> -	/* Reload all sections, including RO ones. We're not supposed
> -	 * to end up here anyway, let's just assume the overhead of
> -	 * reloading everything is acceptable.
> -	 */
> -	panthor_reload_fw_sections(ptdev, true);
> -
>  	ret = panthor_fw_start(ptdev);
>  	if (ret) {
> -		drm_err(&ptdev->base, "FW slow reset failed (couldn't start the FW )");
> +		drm_err(&ptdev->base, "FW %s reset failed",
> +			ptdev->reset.fast ?  "fast" : "slow");
>  		return ret;
>  	}
>  
> -out:
>  	/* We must re-initialize the global interface even on fast-reset. */
>  	panthor_fw_init_global_iface(ptdev);
>  	return 0;
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
> index ee85a371bc38..671049020afa 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> @@ -470,11 +470,12 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
>   */
>  void panthor_gpu_suspend(struct panthor_device *ptdev)
>  {
> -	/*
> -	 * It may be preferable to simply power down the L2, but for now just
> -	 * soft-reset which will leave the L2 powered down.
> -	 */
> -	panthor_gpu_soft_reset(ptdev);
> +	/* On a fast reset, simply power down the L2. */
> +	if (!ptdev->reset.fast)
> +		panthor_gpu_soft_reset(ptdev);
> +	else
> +		panthor_gpu_power_off(ptdev, L2, 1, 20000);
> +
>  	panthor_gpu_irq_suspend(&ptdev->gpu->irq);
>  }
>  
> -- 
> 2.47.0
>
Boris Brezillon Dec. 11, 2024, 10:09 a.m. UTC | #2
On Wed, 11 Dec 2024 09:57:07 +0000
Liviu Dudau <liviu.dudau@arm.com> wrote:

> On Wed, Dec 11, 2024 at 08:54:19AM +0100, Boris Brezillon wrote:
> > If we do a GPU soft-reset, that's no longer fast reset. This also means
> > the slow reset fallback doesn't work because the MCU state is only reset
> > after a GPU soft-reset.
> > 
> > Let's move the retry logic to panthor_device_resume() to issue a
> > soft-reset between the fast and slow attempts, and patch
> > panthor_gpu_suspend() to only power-off the L2 when a fast reset is
> > requested.
> > 
> > v3:
> > - No changes
> > 
> > v2:
> > - Add R-b
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > Reviewed-by: Steven Price <steven.price@arm.com>  
> 
> For reasons that are not clear yet to me my v2 R-bs emails seem to have not
> reached you or the dri-devel mailing lists.
> 
> 
> > ---
> >  drivers/gpu/drm/panthor/panthor_device.c | 32 ++++++++++----
> >  drivers/gpu/drm/panthor/panthor_device.h | 11 +++++
> >  drivers/gpu/drm/panthor/panthor_fw.c     | 54 ++++++------------------
> >  drivers/gpu/drm/panthor/panthor_gpu.c    | 11 ++---
> >  4 files changed, 53 insertions(+), 55 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> > index 0362101ea896..2c817e65e6be 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.c
> > +++ b/drivers/gpu/drm/panthor/panthor_device.c
> > @@ -431,6 +431,22 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *
> >  	return 0;
> >  }
> >  
> > +static int panthor_device_resume_hw_components(struct panthor_device *ptdev)
> > +{
> > +	int ret;
> > +
> > +	panthor_gpu_resume(ptdev);
> > +	panthor_mmu_resume(ptdev);
> > +
> > +	ret = panthor_fw_resume(ptdev);
> > +	if (!ret)
> > +		return 0;
> > +
> > +	panthor_mmu_suspend(ptdev);
> > +	panthor_gpu_suspend(ptdev);
> > +	return ret;  
> 
> My only comment was a nit here where I prefer the construct:
> 
> 	if (ret) {
> 		panthor_mmu_suspend(ptdev);
> 		panthor_gpu_suspend(ptdev);
> 	}
> 
> 	return ret;
> 
> but feel free to ignore it.
> 
> For the whole series: Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

I applied the series before seeing your replies. Sorry about that :-/.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index 0362101ea896..2c817e65e6be 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -431,6 +431,22 @@  int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *
 	return 0;
 }
 
+static int panthor_device_resume_hw_components(struct panthor_device *ptdev)
+{
+	int ret;
+
+	panthor_gpu_resume(ptdev);
+	panthor_mmu_resume(ptdev);
+
+	ret = panthor_fw_resume(ptdev);
+	if (!ret)
+		return 0;
+
+	panthor_mmu_suspend(ptdev);
+	panthor_gpu_suspend(ptdev);
+	return ret;
+}
+
 int panthor_device_resume(struct device *dev)
 {
 	struct panthor_device *ptdev = dev_get_drvdata(dev);
@@ -457,16 +473,16 @@  int panthor_device_resume(struct device *dev)
 
 	if (panthor_device_is_initialized(ptdev) &&
 	    drm_dev_enter(&ptdev->base, &cookie)) {
-		panthor_gpu_resume(ptdev);
-		panthor_mmu_resume(ptdev);
-		ret = panthor_fw_resume(ptdev);
-		if (!drm_WARN_ON(&ptdev->base, ret)) {
-			panthor_sched_resume(ptdev);
-		} else {
-			panthor_mmu_suspend(ptdev);
-			panthor_gpu_suspend(ptdev);
+		ret = panthor_device_resume_hw_components(ptdev);
+		if (ret && ptdev->reset.fast) {
+			drm_err(&ptdev->base, "Fast reset failed, trying a slow reset");
+			ptdev->reset.fast = false;
+			ret = panthor_device_resume_hw_components(ptdev);
 		}
 
+		if (!ret)
+			panthor_sched_resume(ptdev);
+
 		drm_dev_exit(cookie);
 
 		if (ret)
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index b6c4f25a5d6e..da6574021664 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -157,6 +157,17 @@  struct panthor_device {
 
 		/** @pending: Set to true if a reset is pending. */
 		atomic_t pending;
+
+		/**
+		 * @fast: True if the post_reset logic can proceed with a fast reset.
+		 *
+		 * A fast reset is just a reset where the driver doesn't reload the FW sections.
+		 *
+		 * Any time the firmware is properly suspended, a fast reset can take place.
+		 * On the other hand, if the halt operation failed, the driver will reload
+		 * all FW sections to make sure we start from a fresh state.
+		 */
+		bool fast;
 	} reset;
 
 	/** @pm: Power management related data. */
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 02789558788d..5b68dc02b5ce 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -263,17 +263,6 @@  struct panthor_fw {
 	/** @booted: True is the FW is booted */
 	bool booted;
 
-	/**
-	 * @fast_reset: True if the post_reset logic can proceed with a fast reset.
-	 *
-	 * A fast reset is just a reset where the driver doesn't reload the FW sections.
-	 *
-	 * Any time the firmware is properly suspended, a fast reset can take place.
-	 * On the other hand, if the halt operation failed, the driver will reload
-	 * all sections to make sure we start from a fresh state.
-	 */
-	bool fast_reset;
-
 	/** @irq: Job irq data. */
 	struct panthor_irq irq;
 };
@@ -1090,7 +1079,7 @@  void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang)
 	/* Make sure we won't be woken up by a ping. */
 	cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work);
 
-	ptdev->fw->fast_reset = false;
+	ptdev->reset.fast = false;
 
 	if (!on_hang) {
 		struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
@@ -1100,7 +1089,7 @@  void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang)
 		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
 		if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status,
 					status == MCU_STATUS_HALT, 10, 100000)) {
-			ptdev->fw->fast_reset = true;
+			ptdev->reset.fast = true;
 		} else {
 			drm_warn(&ptdev->base, "Failed to cleanly suspend MCU");
 		}
@@ -1125,49 +1114,30 @@  int panthor_fw_post_reset(struct panthor_device *ptdev)
 	if (ret)
 		return ret;
 
-	/* If this is a fast reset, try to start the MCU without reloading
-	 * the FW sections. If it fails, go for a full reset.
-	 */
-	if (ptdev->fw->fast_reset) {
+	if (!ptdev->reset.fast) {
+		/* On a slow reset, reload all sections, including RO ones.
+		 * We're not supposed to end up here anyway, let's just assume
+		 * the overhead of reloading everything is acceptable.
+		 */
+		panthor_reload_fw_sections(ptdev, true);
+	} else {
 		/* The FW detects 0 -> 1 transitions. Make sure we reset
 		 * the HALT bit before the FW is rebooted.
 		 * This is not needed on a slow reset because FW sections are
 		 * re-initialized.
 		 */
 		struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
 		panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT);
-
-		ret = panthor_fw_start(ptdev);
-		if (!ret)
-			goto out;
-
-		/* Forcibly reset the MCU and force a slow reset, so we get a
-		 * fresh boot on the next panthor_fw_start() call.
-		 */
-		panthor_fw_stop(ptdev);
-		ptdev->fw->fast_reset = false;
-		drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset");
-
-		ret = panthor_vm_flush_all(ptdev->fw->vm);
-		if (ret) {
-			drm_err(&ptdev->base, "FW slow reset failed (couldn't flush FW's AS l2cache)");
-			return ret;
-		}
 	}
 
-	/* Reload all sections, including RO ones. We're not supposed
-	 * to end up here anyway, let's just assume the overhead of
-	 * reloading everything is acceptable.
-	 */
-	panthor_reload_fw_sections(ptdev, true);
-
 	ret = panthor_fw_start(ptdev);
 	if (ret) {
-		drm_err(&ptdev->base, "FW slow reset failed (couldn't start the FW )");
+		drm_err(&ptdev->base, "FW %s reset failed",
+			ptdev->reset.fast ?  "fast" : "slow");
 		return ret;
 	}
 
-out:
 	/* We must re-initialize the global interface even on fast-reset. */
 	panthor_fw_init_global_iface(ptdev);
 	return 0;
diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
index ee85a371bc38..671049020afa 100644
--- a/drivers/gpu/drm/panthor/panthor_gpu.c
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -470,11 +470,12 @@  int panthor_gpu_soft_reset(struct panthor_device *ptdev)
  */
 void panthor_gpu_suspend(struct panthor_device *ptdev)
 {
-	/*
-	 * It may be preferable to simply power down the L2, but for now just
-	 * soft-reset which will leave the L2 powered down.
-	 */
-	panthor_gpu_soft_reset(ptdev);
+	/* On a fast reset, simply power down the L2. */
+	if (!ptdev->reset.fast)
+		panthor_gpu_soft_reset(ptdev);
+	else
+		panthor_gpu_power_off(ptdev, L2, 1, 20000);
+
 	panthor_gpu_irq_suspend(&ptdev->gpu->irq);
 }