drm/i915/gt: Always reset the engine, even if inactive, on execlists failure
diff mbox series

Message ID 20200711091349.28865-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • drm/i915/gt: Always reset the engine, even if inactive, on execlists failure
Related show

Commit Message

Chris Wilson July 11, 2020, 9:13 a.m. UTC
If something has gone awry with the CSB processing, we need to pause,
unwind and restart the request submission and event processing. However,
currently we skip the engine reset if we raise an error but discover no
active context, in the mistaken belief that it was merely a glitch in
the matrix. The glitches are real enough, and we do need to unwind even
if the engine appears idle (as it has gone permanently idle!) The
simplest way to unwind and recover is simply do the engine reset, which
should be very fast and _safe_ as nothing is active.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

Comments

Mika Kuoppala July 13, 2020, 9:34 a.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If something has gone awry with the CSB processing, we need to pause,
> unwind and restart the request submission and event processing. However,
> currently we skip the engine reset if we raise an error but discover no
> active context, in the mistaken belief that it was merely a glitch in
> the matrix. The glitches are real enough, and we do need to unwind even
> if the engine appears idle (as it has gone permanently idle!) The
> simplest way to unwind and recover is simply do the engine reset, which
> should be very fast and _safe_ as nothing is active.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/gt/intel_lrc.c | 15 ++++++---------
>  1 file changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index cd4262cc96e2..3ea05a86dc95 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -3029,12 +3029,12 @@ static u32 active_ccid(struct intel_engine_cs *engine)
>  	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
>  }
>  
> -static bool execlists_capture(struct intel_engine_cs *engine)
> +static void execlists_capture(struct intel_engine_cs *engine)
>  {
>  	struct execlists_capture *cap;
>  
>  	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
> -		return true;
> +		return;
>  
>  	/*
>  	 * We need to _quickly_ capture the engine state before we reset.
> @@ -3043,7 +3043,7 @@ static bool execlists_capture(struct intel_engine_cs *engine)
>  	 */
>  	cap = capture_regs(engine);
>  	if (!cap)
> -		return true;
> +		return;
>  
>  	spin_lock_irq(&engine->active.lock);
>  	cap->rq = active_context(engine, active_ccid(engine));
> @@ -3080,14 +3080,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
>  
>  	INIT_WORK(&cap->work, execlists_capture_work);
>  	schedule_work(&cap->work);
> -	return true;
> +	return;
>  
>  err_rq:
>  	i915_request_put(cap->rq);
>  err_free:
>  	i915_gpu_coredump_put(cap->error);
>  	kfree(cap);
> -	return false;
>  }
>  
>  static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
> @@ -3107,10 +3106,8 @@ static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
>  	tasklet_disable_nosync(&engine->execlists.tasklet);
>  
>  	ring_set_paused(engine, 1); /* Freeze the current request in place */
> -	if (execlists_capture(engine))
> -		intel_engine_reset(engine, msg);
> -	else
> -		ring_set_paused(engine, 0);
> +	execlists_capture(engine);
> +	intel_engine_reset(engine, msg);
>  
>  	tasklet_enable(&engine->execlists.tasklet);
>  	clear_and_wake_up_bit(bit, lock);
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index cd4262cc96e2..3ea05a86dc95 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -3029,12 +3029,12 @@  static u32 active_ccid(struct intel_engine_cs *engine)
 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
 }
 
-static bool execlists_capture(struct intel_engine_cs *engine)
+static void execlists_capture(struct intel_engine_cs *engine)
 {
 	struct execlists_capture *cap;
 
 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
-		return true;
+		return;
 
 	/*
 	 * We need to _quickly_ capture the engine state before we reset.
@@ -3043,7 +3043,7 @@  static bool execlists_capture(struct intel_engine_cs *engine)
 	 */
 	cap = capture_regs(engine);
 	if (!cap)
-		return true;
+		return;
 
 	spin_lock_irq(&engine->active.lock);
 	cap->rq = active_context(engine, active_ccid(engine));
@@ -3080,14 +3080,13 @@  static bool execlists_capture(struct intel_engine_cs *engine)
 
 	INIT_WORK(&cap->work, execlists_capture_work);
 	schedule_work(&cap->work);
-	return true;
+	return;
 
 err_rq:
 	i915_request_put(cap->rq);
 err_free:
 	i915_gpu_coredump_put(cap->error);
 	kfree(cap);
-	return false;
 }
 
 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
@@ -3107,10 +3106,8 @@  static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
 	tasklet_disable_nosync(&engine->execlists.tasklet);
 
 	ring_set_paused(engine, 1); /* Freeze the current request in place */
-	if (execlists_capture(engine))
-		intel_engine_reset(engine, msg);
-	else
-		ring_set_paused(engine, 0);
+	execlists_capture(engine);
+	intel_engine_reset(engine, msg);
 
 	tasklet_enable(&engine->execlists.tasklet);
 	clear_and_wake_up_bit(bit, lock);