diff mbox

drm/i915/lrc: Scrub the GPU state of the guilty hanging request

Message ID 20180427193224.11628-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 27, 2018, 7:32 p.m. UTC
Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

Comments

Michel Thierry April 27, 2018, 8:12 p.m. UTC | #1
On 4/27/2018 12:32 PM, Chris Wilson wrote:
> Previously, we just reset the ring register in the context image such
> that we could skip over the broken batch and emit the closing
> breadcrumb. However, on resume the context image and GPU state would be
> reloaded, which may have been left in an inconsistent state by the
> reset. The presumption was that at worst it would just cause another
> reset and skip again until it recovered, however it seems just as likely
> to cause an unrecoverable hang. Instead of risking loading an incomplete
> context image, restore it back to the default state.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
>   1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ce23d5116482..422b05290ed6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   			      struct i915_request *request)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct intel_context *ce;
>   	unsigned long flags;
> +	u32 *regs;
>   
>   	GEM_TRACE("%s request global=%x, current=%d\n",
>   		  engine->name, request ? request->global_seqno : 0,
> @@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   	 * future request will be after userspace has had the opportunity
>   	 * to recreate its own state.
>   	 */
> -	ce = &request->ctx->engine[engine->id];
> -	execlists_init_reg_state(ce->lrc_reg_state,
> -				 request->ctx, engine, ce->ring);
> +	regs = request->ctx->engine[engine->id].lrc_reg_state;
> +	if (engine->default_state) {
> +		void *defaults;
> +
> +		defaults = i915_gem_object_pin_map(engine->default_state,
> +						   I915_MAP_WB);
> +		if (!IS_ERR(defaults)) {
> +			memcpy(regs,
> +			       defaults + LRC_HEADER_PAGES * PAGE_SIZE,
> +			       engine->context_size);
Hi,

The context_size is taking into count the PP_HWSP page, do we also need 
to rewrite the PP_HSWP? (or just the logical state).

Also regs is already pointing to the start of the logical state
(vaddr + LRC_STATE_PN * PAGE_SIZE).

So if we want to overwrite from the PP_HWSP, then regs is not the right 
offset, or if we only want to change the logical state then it should be 
from 'defaults +  LRC_STATE_PN * PAGE_SIZE'.

-Michel

> +			i915_gem_object_unpin_map(engine->default_state);
> +		}
> +	}
> +	execlists_init_reg_state(regs, request->ctx, engine, request->ring);
>   
>   	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
> -		i915_ggtt_offset(ce->ring->vma);
> -	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
> +	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
> +	regs[CTX_RING_HEAD + 1] = request->postfix;
>   
>   	request->ring->head = request->postfix;
>   	intel_ring_update_space(request->ring);
>
Chris Wilson April 27, 2018, 8:20 p.m. UTC | #2
Quoting Michel Thierry (2018-04-27 21:12:38)
> On 4/27/2018 12:32 PM, Chris Wilson wrote:
> > Previously, we just reset the ring register in the context image such
> > that we could skip over the broken batch and emit the closing
> > breadcrumb. However, on resume the context image and GPU state would be
> > reloaded, which may have been left in an inconsistent state by the
> > reset. The presumption was that at worst it would just cause another
> > reset and skip again until it recovered, however it seems just as likely
> > to cause an unrecoverable hang. Instead of risking loading an incomplete
> > context image, restore it back to the default state.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_lrc.c | 24 +++++++++++++++++-------
> >   1 file changed, 17 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index ce23d5116482..422b05290ed6 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> >                             struct i915_request *request)
> >   {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> > -     struct intel_context *ce;
> >       unsigned long flags;
> > +     u32 *regs;
> >   
> >       GEM_TRACE("%s request global=%x, current=%d\n",
> >                 engine->name, request ? request->global_seqno : 0,
> > @@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> >        * future request will be after userspace has had the opportunity
> >        * to recreate its own state.
> >        */
> > -     ce = &request->ctx->engine[engine->id];
> > -     execlists_init_reg_state(ce->lrc_reg_state,
> > -                              request->ctx, engine, ce->ring);
> > +     regs = request->ctx->engine[engine->id].lrc_reg_state;
> > +     if (engine->default_state) {
> > +             void *defaults;
> > +
> > +             defaults = i915_gem_object_pin_map(engine->default_state,
> > +                                                I915_MAP_WB);
> > +             if (!IS_ERR(defaults)) {
> > +                     memcpy(regs,
> > +                            defaults + LRC_HEADER_PAGES * PAGE_SIZE,
> > +                            engine->context_size);
> Hi,
> 
> The context_size is taking into count the PP_HWSP page, do we also need 
> to rewrite the PP_HSWP? (or just the logical state).
> 
> Also regs is already pointing to the start of the logical state
> (vaddr + LRC_STATE_PN * PAGE_SIZE).

Yeah, I was aiming for just the register state, and had a nice little
off by one in comparing the macros.
 
> So if we want to overwrite from the PP_HWSP, then regs is not the right 
> offset, or if we only want to change the logical state then it should be 
> from 'defaults +  LRC_STATE_PN * PAGE_SIZE'.

Right, I don't think we need to scrub the HWSP, just the register state.
The context is lost at this point, and what I want to protect is the
read of the image following the reset. Afaik, we don't issue any reads
from PPHWSP.
-Chris
Chris Wilson April 30, 2018, 1:06 p.m. UTC | #3
Quoting Patchwork (2018-04-30 13:38:55)
> == Series Details ==
> 
> Series: drm/i915/lrc: Scrub the GPU state of the guilty hanging request (rev4)
> URL   : https://patchwork.freedesktop.org/series/42425/
> State : success
> 
> == Summary ==
> 
> = CI Bug Log - changes from CI_DRM_4112_full -> Patchwork_8835_full =
> 
> == Summary - WARNING ==
> 
>   Minor unknown changes coming with Patchwork_8835_full need to be verified
>   manually.
>   
>   If you think the reported changes have nothing to do with the changes
>   introduced in Patchwork_8835_full, please notify your bug team to allow them
>   to document this new failure mode, which will reduce false positives in CI.
> 
>   External URL: https://patchwork.freedesktop.org/api/1.0/series/42425/revisions/4/mbox/
> 
> == Possible new issues ==
> 
>   Here are the unknown changes that may have been introduced in Patchwork_8835_full:
> 
>   === IGT changes ===
> 
>     ==== Warnings ====
> 
>     igt@gem_exec_schedule@deep-bsd1:
>       shard-kbl:          PASS -> SKIP
> 
>     igt@gem_mocs_settings@mocs-rc6-blt:
>       shard-kbl:          SKIP -> PASS

Sold! Thanks for the review, hopefully this will nip some CI bugs, but
at an error rate of less than 1% it will be some time before we can be
sure.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ce23d5116482..422b05290ed6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1804,8 +1804,8 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 			      struct i915_request *request)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct intel_context *ce;
 	unsigned long flags;
+	u32 *regs;
 
 	GEM_TRACE("%s request global=%x, current=%d\n",
 		  engine->name, request ? request->global_seqno : 0,
@@ -1855,14 +1855,24 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	ce = &request->ctx->engine[engine->id];
-	execlists_init_reg_state(ce->lrc_reg_state,
-				 request->ctx, engine, ce->ring);
+	regs = request->ctx->engine[engine->id].lrc_reg_state;
+	if (engine->default_state) {
+		void *defaults;
+
+		defaults = i915_gem_object_pin_map(engine->default_state,
+						   I915_MAP_WB);
+		if (!IS_ERR(defaults)) {
+			memcpy(regs,
+			       defaults + LRC_HEADER_PAGES * PAGE_SIZE,
+			       engine->context_size);
+			i915_gem_object_unpin_map(engine->default_state);
+		}
+	}
+	execlists_init_reg_state(regs, request->ctx, engine, request->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-		i915_ggtt_offset(ce->ring->vma);
-	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+	regs[CTX_RING_HEAD + 1] = request->postfix;
 
 	request->ring->head = request->postfix;
 	intel_ring_update_space(request->ring);