diff mbox series

[v2,3/3] drm/i915/gt: Timeout when waiting for idle in suspending

Message ID 20230815011210.1188379-4-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series Resolve suspend-resume racing with GuC destroy-context-worker | expand

Commit Message

Teres Alexis, Alan Previn Aug. 15, 2023, 1:12 a.m. UTC
When suspending, add a timeout when calling
intel_gt_pm_wait_for_idle else if we have a lost
G2H event that holds a wakeref (which would be
indicative of a bug elsewhere in the driver),
driver will at least complete the suspend-resume
cycle, (albeit not hitting all the targets for
low power hw counters), instead of hanging in the kernel.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt_pm.c     |  7 ++++++-
 drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  7 ++++++-
 drivers/gpu/drm/i915/intel_wakeref.c      | 14 ++++++++++----
 drivers/gpu/drm/i915/intel_wakeref.h      |  5 +++--
 5 files changed, 26 insertions(+), 9 deletions(-)

Comments

Rodrigo Vivi Aug. 15, 2023, 1:51 p.m. UTC | #1
On Mon, Aug 14, 2023 at 06:12:10PM -0700, Alan Previn wrote:
> When suspending, add a timeout when calling
> intel_gt_pm_wait_for_idle else if we have a lost
> G2H event that holds a wakeref (which would be
> indicative of a bug elsewhere in the driver),
> driver will at least complete the suspend-resume
> cycle, (albeit not hitting all the targets for
> low power hw counters), instead of hanging in the kernel.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
>  drivers/gpu/drm/i915/gt/intel_gt_pm.c     |  7 ++++++-
>  drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  7 ++++++-
>  drivers/gpu/drm/i915/intel_wakeref.c      | 14 ++++++++++----
>  drivers/gpu/drm/i915/intel_wakeref.h      |  5 +++--
>  5 files changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index ee15486fed0d..090438eb8682 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -688,7 +688,7 @@ void intel_engines_release(struct intel_gt *gt)
>  		if (!engine->release)
>  			continue;
>  
> -		intel_wakeref_wait_for_idle(&engine->wakeref);
> +		intel_wakeref_wait_for_idle(&engine->wakeref, 0);
>  		GEM_BUG_ON(intel_engine_pm_is_awake(engine));
>  
>  		engine->release(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 5a942af0a14e..e8b006c3ef29 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -289,6 +289,8 @@ int intel_gt_resume(struct intel_gt *gt)
>  
>  static void wait_for_suspend(struct intel_gt *gt)
>  {
> +	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;
> +
>  	if (!intel_gt_pm_is_awake(gt))
>  		return;
>  
> @@ -301,7 +303,10 @@ static void wait_for_suspend(struct intel_gt *gt)
>  		intel_gt_retire_requests(gt);
>  	}
>  
> -	intel_gt_pm_wait_for_idle(gt);
> +	/* we are suspending, so we shouldn't be waiting forever */
> +	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIME)

you forgot to change the error code here..........................^

but maybe we don't even need this here and a simple
if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms)) should be enough
since the error from the killable one is unlikely and the only place
I error I could check on that path would be a catastrophic -ERESTARTSYS.

but up to you.

> +		gt_warn(gt, "bailing from %s after %d milisec timeout\n",
> +			__func__, timeout_ms);
>  }
>  
>  void intel_gt_suspend_prepare(struct intel_gt *gt)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> index 6c9a46452364..5358acc2b5b1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> @@ -68,7 +68,12 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
>  
>  static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>  {
> -	return intel_wakeref_wait_for_idle(&gt->wakeref);
> +	return intel_wakeref_wait_for_idle(&gt->wakeref, 0);
> +}
> +
> +static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms)
> +{
> +	return intel_wakeref_wait_for_idle(&gt->wakeref, timeout_ms);
>  }

I was going to ask why you created a single use function here, but then I
noticed the above one. So it makes sense.
Then I was going to ask why in here you didn't use the same change of
timeout = 0 in the existent function like you used below, but then I
noticed that the above function is called in multiple places and the
patch with this change is much cleaner and the function is static inline
so your approach was good here.

>  
>  void intel_gt_pm_init_early(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
> index 718f2f1b6174..383a37521415 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.c
> +++ b/drivers/gpu/drm/i915/intel_wakeref.c
> @@ -111,14 +111,20 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
>  			 "wakeref.work", &key->work, 0);
>  }
>  

Please add a documentation for this function making sure you have the following
mentions:

/**
[snip]
* @timeout_ms: Timeout in ums, 0 means never timeout.
*
* Returns 0 on success, -ETIMEDOUT upon a timeout, or the unlikely
* error propagation from wait_var_event_killable if timeout_ms is 0.
*/

with the return error fixed above and the documentation in place:

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
> +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
>  {
> -	int err;
> +	int err = 0;
>  
>  	might_sleep();
>  
> -	err = wait_var_event_killable(&wf->wakeref,
> -				      !intel_wakeref_is_active(wf));
> +	if (!timeout_ms)
> +		err = wait_var_event_killable(&wf->wakeref,
> +					      !intel_wakeref_is_active(wf));
> +	else if (wait_var_event_timeout(&wf->wakeref,
> +					!intel_wakeref_is_active(wf),
> +					msecs_to_jiffies(timeout_ms)) < 1)
> +		err = -ETIMEDOUT;
> +
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
> index ec881b097368..6fbb7a2fb6ea 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.h
> +++ b/drivers/gpu/drm/i915/intel_wakeref.h
> @@ -251,15 +251,16 @@ __intel_wakeref_defer_park(struct intel_wakeref *wf)
>  /**
>   * intel_wakeref_wait_for_idle: Wait until the wakeref is idle
>   * @wf: the wakeref
> + * @timeout_ms: timeout to wait in milisecs, zero means forever
>   *
>   * Wait for the earlier asynchronous release of the wakeref. Note
>   * this will wait for any third party as well, so make sure you only wait
>   * when you have control over the wakeref and trust no one else is acquiring
>   * it.
>   *
> - * Return: 0 on success, error code if killed.
> + * Return: 0 on success, error code if killed, -ETIME if timed-out.
>   */
> -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf);
> +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms);
>  
>  struct intel_wakeref_auto {
>  	struct drm_i915_private *i915;
> -- 
> 2.39.0
>
Teres Alexis, Alan Previn Aug. 15, 2023, 7 p.m. UTC | #2
Thanks Rodrigo - agreed on everything below - will re-rev.

On Tue, 2023-08-15 at 09:51 -0400, Vivi, Rodrigo wrote:
> On Mon, Aug 14, 2023 at 06:12:10PM -0700, Alan Previn wrote:
> > When suspending, add a timeout when calling
> > intel_gt_pm_wait_for_idle else if we have a lost
> > G2H event that holds a wakeref (which would be
> > 
[snip]

> > @@ -301,7 +303,10 @@ static void wait_for_suspend(struct intel_gt *gt)
> >  		intel_gt_retire_requests(gt);
> >  	}
> >  
> > -	intel_gt_pm_wait_for_idle(gt);
> > +	/* we are suspending, so we shouldn't be waiting forever */
> > +	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIME)
> 
> you forgot to change the error code here..........................^
> 
> but maybe we don't even need this here and a simple
> if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms)) should be enough
> since the error from the killable one is unlikely and the only place
> I error I could check on that path would be a catastrophic -ERESTARTSYS.
> 
> but up to you.
alan: my bad - I'll fix it - but i agree with not needing to check the failure type.
and I'll keep the error the same ("bailing from...")
[snip]

> > +static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms)
> > +{
> > +	return intel_wakeref_wait_for_idle(&gt->wakeref, timeout_ms);
> >  }
> 
> I was going to ask why you created a single use function here, but then I
> noticed the above one. So it makes sense.
> Then I was going to ask why in here you didn't use the same change of
> timeout = 0 in the existent function like you used below, but then I
> noticed that the above function is called in multiple places and the
> patch with this change is much cleaner and the function is static inline
> so your approach was good here.
alan: yes that was my exact reason - thus no impact across other callers.
[snip]


> Please add a documentation for this function making sure you have the following
> mentions:
alan: good catch -will do.

> 
> /**
> [snip]
> * @timeout_ms: Timeout in ums, 0 means never timeout.
> *
> * Returns 0 on success, -ETIMEDOUT upon a timeout, or the unlikely
> * error propagation from wait_var_event_killable if timeout_ms is 0.
> */
> 
> with the return error fixed above and the documentation in place:
> 
> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> 
> > -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
> > +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
> >  {
> > -	int err;
> > 
[snip]
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index ee15486fed0d..090438eb8682 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -688,7 +688,7 @@  void intel_engines_release(struct intel_gt *gt)
 		if (!engine->release)
 			continue;
 
-		intel_wakeref_wait_for_idle(&engine->wakeref);
+		intel_wakeref_wait_for_idle(&engine->wakeref, 0);
 		GEM_BUG_ON(intel_engine_pm_is_awake(engine));
 
 		engine->release(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 5a942af0a14e..e8b006c3ef29 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -289,6 +289,8 @@  int intel_gt_resume(struct intel_gt *gt)
 
 static void wait_for_suspend(struct intel_gt *gt)
 {
+	int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 10000;
+
 	if (!intel_gt_pm_is_awake(gt))
 		return;
 
@@ -301,7 +303,10 @@  static void wait_for_suspend(struct intel_gt *gt)
 		intel_gt_retire_requests(gt);
 	}
 
-	intel_gt_pm_wait_for_idle(gt);
+	/* we are suspending, so we shouldn't be waiting forever */
+	if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIME)
+		gt_warn(gt, "bailing from %s after %d milisec timeout\n",
+			__func__, timeout_ms);
 }
 
 void intel_gt_suspend_prepare(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index 6c9a46452364..5358acc2b5b1 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -68,7 +68,12 @@  static inline void intel_gt_pm_might_put(struct intel_gt *gt)
 
 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
 {
-	return intel_wakeref_wait_for_idle(&gt->wakeref);
+	return intel_wakeref_wait_for_idle(&gt->wakeref, 0);
+}
+
+static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms)
+{
+	return intel_wakeref_wait_for_idle(&gt->wakeref, timeout_ms);
 }
 
 void intel_gt_pm_init_early(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index 718f2f1b6174..383a37521415 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -111,14 +111,20 @@  void __intel_wakeref_init(struct intel_wakeref *wf,
 			 "wakeref.work", &key->work, 0);
 }
 
-int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
+int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
 {
-	int err;
+	int err = 0;
 
 	might_sleep();
 
-	err = wait_var_event_killable(&wf->wakeref,
-				      !intel_wakeref_is_active(wf));
+	if (!timeout_ms)
+		err = wait_var_event_killable(&wf->wakeref,
+					      !intel_wakeref_is_active(wf));
+	else if (wait_var_event_timeout(&wf->wakeref,
+					!intel_wakeref_is_active(wf),
+					msecs_to_jiffies(timeout_ms)) < 1)
+		err = -ETIMEDOUT;
+
 	if (err)
 		return err;
 
diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
index ec881b097368..6fbb7a2fb6ea 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.h
+++ b/drivers/gpu/drm/i915/intel_wakeref.h
@@ -251,15 +251,16 @@  __intel_wakeref_defer_park(struct intel_wakeref *wf)
 /**
  * intel_wakeref_wait_for_idle: Wait until the wakeref is idle
  * @wf: the wakeref
+ * @timeout_ms: timeout to wait in milisecs, zero means forever
  *
  * Wait for the earlier asynchronous release of the wakeref. Note
  * this will wait for any third party as well, so make sure you only wait
  * when you have control over the wakeref and trust no one else is acquiring
  * it.
  *
- * Return: 0 on success, error code if killed.
+ * Return: 0 on success, error code if killed, -ETIME if timed-out.
  */
-int intel_wakeref_wait_for_idle(struct intel_wakeref *wf);
+int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms);
 
 struct intel_wakeref_auto {
 	struct drm_i915_private *i915;