diff mbox

[3/3] drm/i915: Declare the driver wedged if hangcheck makes no progress

Message ID 20180604073441.6737-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson June 4, 2018, 7:34 a.m. UTC
Hangcheck is our back up in case the GPU or the driver gets stuck. It
detects when the GPU is not making any progress and issues a GPU reset.
However, if the driver is failing to make any progress, we can get
ourselves into a situation where we continually try resetting the GPU to
no avail. Employ a second timeout such that if we continue to see the
same seqno (the stalled engine has made no progress at all) over the
course of several hangchecks, declare the driver wedged and attempt to
start afresh.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  5 +++--
 drivers/gpu/drm/i915/i915_drv.h         |  2 ++
 drivers/gpu/drm/i915/intel_hangcheck.c  | 17 ++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++-
 4 files changed, 23 insertions(+), 4 deletions(-)

Comments

Mika Kuoppala June 5, 2018, 9:26 a.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Hangcheck is our back up in case the GPU or the driver gets stuck. It
> detects when the GPU is not making any progress and issues a GPU reset.
> However, if the driver is failing to make any progress, we can get
> ourselves into a situation where we continually try resetting the GPU to
> no avail. Employ a second timeout such that if we continue to see the
> same seqno (the stalled engine has made no progress at all) over the
> course of several hangchecks, declare the driver wedged and attempt to
> start afresh.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     |  5 +++--
>  drivers/gpu/drm/i915/i915_drv.h         |  2 ++
>  drivers/gpu/drm/i915/intel_hangcheck.c  | 17 ++++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++-
>  4 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 15e86d34a81c..2eae3abb0ec1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1359,11 +1359,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
>  			   engine->hangcheck.seqno, seqno[id],
>  			   intel_engine_last_submit(engine));
> -		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
> +		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
>  			   yesno(intel_engine_has_waiter(engine)),
>  			   yesno(test_bit(engine->id,
>  					  &dev_priv->gpu_error.missed_irq_rings)),
> -			   yesno(engine->hangcheck.stalled));
> +			   yesno(engine->hangcheck.stalled),
> +			   yesno(engine->hangcheck.wedged));
>  
>  		spin_lock_irq(&b->rb_lock);
>  		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 38157df6ff5c..a4ed0baeb0ed 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -994,6 +994,8 @@ struct i915_gem_mm {
>  #define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
>  #define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
>  
> +#define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
> +
>  enum modeset_restore {
>  	MODESET_ON_LID_OPEN,
>  	MODESET_DONE,
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index d47e346bd49e..2fc7a0dd0df9 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -294,6 +294,7 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
>  	engine->hangcheck.seqno = hc->seqno;
>  	engine->hangcheck.action = hc->action;
>  	engine->hangcheck.stalled = hc->stalled;
> +	engine->hangcheck.wedged = hc->wedged;
>  }
>  
>  static enum intel_engine_hangcheck_action
> @@ -368,6 +369,9 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
>  
>  	hc->stalled = time_after(jiffies,
>  				 engine->hangcheck.action_timestamp + timeout);
> +	hc->wedged = time_after(jiffies,
> +				 engine->hangcheck.action_timestamp +
> +				 I915_ENGINE_WEDGED_TIMEOUT);

As the init_hangcheck() will clear the wedged state,
our reset failure paths needs to steer clear of this.

Would it be better if we warn on the wedged being set
in init paths and explicitly clear it on unset_wedged?

-Mika


>  }
>  
>  static void hangcheck_declare_hang(struct drm_i915_private *i915,
> @@ -409,7 +413,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			     gpu_error.hangcheck_work.work);
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
> -	unsigned int hung = 0, stuck = 0;
> +	unsigned int hung = 0, stuck = 0, wedged = 0;
>  
>  	if (!i915_modparams.enable_hangcheck)
>  		return;
> @@ -440,6 +444,17 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			if (hc.action != ENGINE_DEAD)
>  				stuck |= intel_engine_flag(engine);
>  		}
> +
> +		if (engine->hangcheck.wedged)
> +			wedged |= intel_engine_flag(engine);
> +	}
> +
> +	if (wedged) {
> +		dev_err(dev_priv->drm.dev,
> +			"GPU recovery timed out,"
> +			" cancelling all in-flight rendering.\n");
> +		GEM_TRACE_DUMP();
> +		i915_gem_set_wedged(dev_priv);
>  	}
>  
>  	if (hung)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index acef385c4c80..d9e8d79c56e4 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -122,7 +122,8 @@ struct intel_engine_hangcheck {
>  	int deadlock;
>  	struct intel_instdone instdone;
>  	struct i915_request *active_request;
> -	bool stalled;
> +	bool stalled:1;
> +	bool wedged:1;
>  };
>  
>  struct intel_ring {
> -- 
> 2.17.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson June 5, 2018, 9:33 a.m. UTC | #2
Quoting Mika Kuoppala (2018-06-05 10:26:37)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Hangcheck is our back up in case the GPU or the driver gets stuck. It
> > detects when the GPU is not making any progress and issues a GPU reset.
> > However, if the driver is failing to make any progress, we can get
> > ourselves into a situation where we continually try resetting the GPU to
> > no avail. Employ a second timeout such that if we continue to see the
> > same seqno (the stalled engine has made no progress at all) over the
> > course of several hangchecks, declare the driver wedged and attempt to
> > start afresh.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c     |  5 +++--
> >  drivers/gpu/drm/i915/i915_drv.h         |  2 ++
> >  drivers/gpu/drm/i915/intel_hangcheck.c  | 17 ++++++++++++++++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++-
> >  4 files changed, 23 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 15e86d34a81c..2eae3abb0ec1 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -1359,11 +1359,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> >               seq_printf(m, "    seqno = %x [current %x, last %x]\n",
> >                          engine->hangcheck.seqno, seqno[id],
> >                          intel_engine_last_submit(engine));
> > -             seq_printf(m, "    waiters? %s, fake irq active? %s, stalled? %s\n",
> > +             seq_printf(m, "    waiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
> >                          yesno(intel_engine_has_waiter(engine)),
> >                          yesno(test_bit(engine->id,
> >                                         &dev_priv->gpu_error.missed_irq_rings)),
> > -                        yesno(engine->hangcheck.stalled));
> > +                        yesno(engine->hangcheck.stalled),
> > +                        yesno(engine->hangcheck.wedged));
> >  
> >               spin_lock_irq(&b->rb_lock);
> >               for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 38157df6ff5c..a4ed0baeb0ed 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -994,6 +994,8 @@ struct i915_gem_mm {
> >  #define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
> >  #define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
> >  
> > +#define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
> > +
> >  enum modeset_restore {
> >       MODESET_ON_LID_OPEN,
> >       MODESET_DONE,
> > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> > index d47e346bd49e..2fc7a0dd0df9 100644
> > --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> > @@ -294,6 +294,7 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
> >       engine->hangcheck.seqno = hc->seqno;
> >       engine->hangcheck.action = hc->action;
> >       engine->hangcheck.stalled = hc->stalled;
> > +     engine->hangcheck.wedged = hc->wedged;
> >  }
> >  
> >  static enum intel_engine_hangcheck_action
> > @@ -368,6 +369,9 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
> >  
> >       hc->stalled = time_after(jiffies,
> >                                engine->hangcheck.action_timestamp + timeout);
> > +     hc->wedged = time_after(jiffies,
> > +                              engine->hangcheck.action_timestamp +
> > +                              I915_ENGINE_WEDGED_TIMEOUT);
> 
> As the init_hangcheck() will clear the wedged state,
> our reset failure paths needs to steer clear of this.

They do by definition (since the reset failed :)
 
> Would it be better if we warn on the wedged being set
> in init paths and explicitly clear it on unset_wedged?

I don't think so, if we reset, we may as well regard the reset as
completed. We detect repeated attempts to do the same reset; it's just
we need a back in case we don't do any reset.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 15e86d34a81c..2eae3abb0ec1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1359,11 +1359,12 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
 					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled));
+			   yesno(engine->hangcheck.stalled),
+			   yesno(engine->hangcheck.wedged));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 38157df6ff5c..a4ed0baeb0ed 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -994,6 +994,8 @@  struct i915_gem_mm {
 #define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
 #define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
 
+#define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
+
 enum modeset_restore {
 	MODESET_ON_LID_OPEN,
 	MODESET_DONE,
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index d47e346bd49e..2fc7a0dd0df9 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -294,6 +294,7 @@  static void hangcheck_store_sample(struct intel_engine_cs *engine,
 	engine->hangcheck.seqno = hc->seqno;
 	engine->hangcheck.action = hc->action;
 	engine->hangcheck.stalled = hc->stalled;
+	engine->hangcheck.wedged = hc->wedged;
 }
 
 static enum intel_engine_hangcheck_action
@@ -368,6 +369,9 @@  static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 
 	hc->stalled = time_after(jiffies,
 				 engine->hangcheck.action_timestamp + timeout);
+	hc->wedged = time_after(jiffies,
+				 engine->hangcheck.action_timestamp +
+				 I915_ENGINE_WEDGED_TIMEOUT);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -409,7 +413,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	unsigned int hung = 0, stuck = 0;
+	unsigned int hung = 0, stuck = 0, wedged = 0;
 
 	if (!i915_modparams.enable_hangcheck)
 		return;
@@ -440,6 +444,17 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			if (hc.action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
+
+		if (engine->hangcheck.wedged)
+			wedged |= intel_engine_flag(engine);
+	}
+
+	if (wedged) {
+		dev_err(dev_priv->drm.dev,
+			"GPU recovery timed out,"
+			" cancelling all in-flight rendering.\n");
+		GEM_TRACE_DUMP();
+		i915_gem_set_wedged(dev_priv);
 	}
 
 	if (hung)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index acef385c4c80..d9e8d79c56e4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -122,7 +122,8 @@  struct intel_engine_hangcheck {
 	int deadlock;
 	struct intel_instdone instdone;
 	struct i915_request *active_request;
-	bool stalled;
+	bool stalled:1;
+	bool wedged:1;
 };
 
 struct intel_ring {