diff mbox

[v4] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

Message ID 20180428111532.15819-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 28, 2018, 11:15 a.m. UTC
Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

v2: Fix up off-by-one from including the PPHWSP in with the register
state.
v3: Use a ring local to compact a few lines.
v4: Beware setting the ring local before checking for a NULL request.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Michel Thierry <michel.thierry@intel.com> #v2
---
 drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

Comments

Michel Thierry April 30, 2018, 3:49 p.m. UTC | #1
On 04/28/2018 04:15 AM, Chris Wilson wrote:
> Previously, we just reset the ring register in the context image such
> that we could skip over the broken batch and emit the closing
> breadcrumb. However, on resume the context image and GPU state would be
> reloaded, which may have been left in an inconsistent state by the
> reset. The presumption was that at worst it would just cause another
> reset and skip again until it recovered, however it seems just as likely
> to cause an unrecoverable hang. Instead of risking loading an incomplete
> context image, restore it back to the default state.
> 
> v2: Fix up off-by-one from including the PPHWSP in with the register
> state.
> v3: Use a ring local to compact a few lines.
> v4: Beware setting the ring local before checking for a NULL request.

Didn't you want to set the ring local after this check?
	if (!request || request->fence.error != -EIO)

This is identical to v2.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Reviewed-by: Michel Thierry <michel.thierry@intel.com> #v2
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
>   1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ce23d5116482..513aee6b3634 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   			      struct i915_request *request)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct intel_context *ce;
>   	unsigned long flags;
> +	u32 *regs;
>   
>   	GEM_TRACE("%s request global=%x, current=%d\n",
>   		  engine->name, request ? request->global_seqno : 0,
> @@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   	 * future request will be after userspace has had the opportunity
>   	 * to recreate its own state.
>   	 */
> -	ce = &request->ctx->engine[engine->id];
> -	execlists_init_reg_state(ce->lrc_reg_state,
> -				 request->ctx, engine, ce->ring);
> +	regs = request->ctx->engine[engine->id].lrc_reg_state;
> +	if (engine->default_state) {
> +		void *defaults;
> +
> +		defaults = i915_gem_object_pin_map(engine->default_state,
> +						   I915_MAP_WB);
> +		if (!IS_ERR(defaults)) {
> +			memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +			       defaults + LRC_STATE_PN * PAGE_SIZE,
> +			       engine->context_size - PAGE_SIZE);
> +			i915_gem_object_unpin_map(engine->default_state);
> +		}
> +	}
> +	execlists_init_reg_state(regs, request->ctx, engine, request->ring);
>   
>   	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> -		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
> +	regs[CTX_RING_HEAD + 1] = request->postfix;
>   
>   	request->ring->head = request->postfix;
>   	intel_ring_update_space(request->ring);
>
Chris Wilson April 30, 2018, 3:53 p.m. UTC | #2
Quoting Michel Thierry (2018-04-30 16:49:53)
> On 04/28/2018 04:15 AM, Chris Wilson wrote:
> > Previously, we just reset the ring register in the context image such
> > that we could skip over the broken batch and emit the closing
> > breadcrumb. However, on resume the context image and GPU state would be
> > reloaded, which may have been left in an inconsistent state by the
> > reset. The presumption was that at worst it would just cause another
> > reset and skip again until it recovered, however it seems just as likely
> > to cause an unrecoverable hang. Instead of risking loading an incomplete
> > context image, restore it back to the default state.
> > 
> > v2: Fix up off-by-one from including the PPHWSP in with the register
> > state.
> > v3: Use a ring local to compact a few lines.
> > v4: Beware setting the ring local before checking for a NULL request.
> 
> Didn't you want to set the ring local after this check?
>         if (!request || request->fence.error != -EIO)

I just removed adding the ring local. Fewer changes...
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ce23d5116482..513aee6b3634 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1804,8 +1804,8 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 			      struct i915_request *request)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct intel_context *ce;
 	unsigned long flags;
+	u32 *regs;
 
 	GEM_TRACE("%s request global=%x, current=%d\n",
 		  engine->name, request ? request->global_seqno : 0,
@@ -1855,14 +1855,24 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	ce = &request->ctx->engine[engine->id];
-	execlists_init_reg_state(ce->lrc_reg_state,
-				 request->ctx, engine, ce->ring);
+	regs = request->ctx->engine[engine->id].lrc_reg_state;
+	if (engine->default_state) {
+		void *defaults;
+
+		defaults = i915_gem_object_pin_map(engine->default_state,
+						   I915_MAP_WB);
+		if (!IS_ERR(defaults)) {
+			memcpy(regs, /* skip restoring the vanilla PPHWSP */
+			       defaults + LRC_STATE_PN * PAGE_SIZE,
+			       engine->context_size - PAGE_SIZE);
+			i915_gem_object_unpin_map(engine->default_state);
+		}
+	}
+	execlists_init_reg_state(regs, request->ctx, engine, request->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-		i915_ggtt_offset(ce->ring->vma);
-	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+	regs[CTX_RING_HEAD + 1] = request->postfix;
 
 	request->ring->head = request->postfix;
 	intel_ring_update_space(request->ring);