diff mbox

drm/i915: Inspect subunit states on hangcheck

Message ID 1448972245-13236-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Dec. 1, 2015, 12:17 p.m. UTC
If head seems stuck and engine in question is rcs,
inspect subunit state transitions before deciding that
this really is a hang instead of limited progress.

References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 2 files changed, 45 insertions(+), 5 deletions(-)

Comments

Chris Wilson Dec. 1, 2015, 12:31 p.m. UTC | #1
On Tue, Dec 01, 2015 at 02:17:25PM +0200, Mika Kuoppala wrote:
> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
arun.siluvery@linux.intel.com Dec. 1, 2015, 12:56 p.m. UTC | #2
On 01/12/2015 12:17, Mika Kuoppala wrote:
> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>   2 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e88d692..e6ae54f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2913,13 +2913,31 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
>   		ring->hangcheck.deadlock = 0;
>   }
>
> -static enum intel_ring_hangcheck_action
> -ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +static bool subunits_stuck(struct intel_engine_cs *ring)
>   {
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 tmp;
> +	int i;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
> +	bool stuck;
> +
> +	if (ring->id != RCS)
> +		return true;
> +
> +	i915_get_extra_instdone(ring->dev, instdone);
>
> +	stuck = true;
> +	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
> +		if (instdone[i] != ring->hangcheck.instdone[i])
> +			stuck = false;

This may not be completely reliable. Tomas Elf in his TDR tests observed 
that instdone kept changing even when CS is hung and in a stable state.

regards
Arun

> +
> +		ring->hangcheck.instdone[i] = instdone[i];
> +	}
> +
> +	return stuck;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +head_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
>   	if (acthd != ring->hangcheck.acthd) {
>   		if (acthd > ring->hangcheck.max_acthd) {
>   			ring->hangcheck.max_acthd = acthd;
> @@ -2929,6 +2947,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>   		return HANGCHECK_ACTIVE_LOOP;
>   	}
>
> +	if (!subunits_stuck(ring))
> +		return HANGCHECK_ACTIVE_LOOP;
> +
> +	return HANGCHECK_HUNG;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
> +	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	enum intel_ring_hangcheck_action ha;
> +	u32 tmp;
> +
> +	ha = head_stuck(ring, acthd);
> +	if (ha != HANGCHECK_HUNG)
> +		return ha;
> +
>   	if (IS_GEN2(dev))
>   		return HANGCHECK_HUNG;
>
> @@ -3064,6 +3100,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   				ring->hangcheck.score--;
>
>   			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
> +
> +			memset(ring->hangcheck.instdone, 0,
> +			       sizeof(ring->hangcheck.instdone));
>   		}
>
>   		ring->hangcheck.seqno = seqno;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb20..b8fe92e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
>   	int score;
>   	enum intel_ring_hangcheck_action action;
>   	int deadlock;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
>   };
>
>   struct intel_ringbuffer {
>
Mika Kuoppala Dec. 1, 2015, 12:58 p.m. UTC | #3
Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> If head seems stuck and engine in question is rcs,
> inspect subunit state transitions before deciding that
> this really is a hang instead of limited progress.
>

One thing to add into this description is that
now as we always have one extra hangcheck step even
in true stuck cases, this makes the hang declaration
one tick longer, 7.5s.

If this becomes a problem, we can sample the instdone
state on first check converge back to old behaviour.

One real, but hard to achive, improvement in determinism
would be to normalize the amount of 'work' each gen does in
a given tick. This way the shader progressions would be
more or less equal and same amount of shader work would
cause same hangcheck behaviour regardless of actual
speed of computation.

-Mika

> References: https://bugs.freedesktop.org/show_bug.cgi?id=93029
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         | 49 +++++++++++++++++++++++++++++----
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>  2 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e88d692..e6ae54f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2913,13 +2913,31 @@ static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
>  		ring->hangcheck.deadlock = 0;
>  }
>  
> -static enum intel_ring_hangcheck_action
> -ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +static bool subunits_stuck(struct intel_engine_cs *ring)
>  {
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 tmp;
> +	int i;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
> +	bool stuck;
> +
> +	if (ring->id != RCS)
> +		return true;
> +
> +	i915_get_extra_instdone(ring->dev, instdone);
>  
> +	stuck = true;
> +	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
> +		if (instdone[i] != ring->hangcheck.instdone[i])
> +			stuck = false;
> +
> +		ring->hangcheck.instdone[i] = instdone[i];
> +	}
> +
> +	return stuck;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +head_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
>  	if (acthd != ring->hangcheck.acthd) {
>  		if (acthd > ring->hangcheck.max_acthd) {
>  			ring->hangcheck.max_acthd = acthd;
> @@ -2929,6 +2947,24 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
>  		return HANGCHECK_ACTIVE_LOOP;
>  	}
>  
> +	if (!subunits_stuck(ring))
> +		return HANGCHECK_ACTIVE_LOOP;
> +
> +	return HANGCHECK_HUNG;
> +}
> +
> +static enum intel_ring_hangcheck_action
> +ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> +{
> +	struct drm_device *dev = ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	enum intel_ring_hangcheck_action ha;
> +	u32 tmp;
> +
> +	ha = head_stuck(ring, acthd);
> +	if (ha != HANGCHECK_HUNG)
> +		return ha;
> +
>  	if (IS_GEN2(dev))
>  		return HANGCHECK_HUNG;
>  
> @@ -3064,6 +3100,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  				ring->hangcheck.score--;
>  
>  			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
> +
> +			memset(ring->hangcheck.instdone, 0,
> +			       sizeof(ring->hangcheck.instdone));
>  		}
>  
>  		ring->hangcheck.seqno = seqno;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb20..b8fe92e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -93,6 +93,7 @@ struct intel_ring_hangcheck {
>  	int score;
>  	enum intel_ring_hangcheck_action action;
>  	int deadlock;
> +	u32 instdone[I915_NUM_INSTDONE_REG];
>  };
>  
>  struct intel_ringbuffer {
> -- 
> 2.5.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e88d692..e6ae54f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2913,13 +2913,31 @@  static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
 		ring->hangcheck.deadlock = 0;
 }
 
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+static bool subunits_stuck(struct intel_engine_cs *ring)
 {
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp;
+	int i;
+	u32 instdone[I915_NUM_INSTDONE_REG];
+	bool stuck;
+
+	if (ring->id != RCS)
+		return true;
+
+	i915_get_extra_instdone(ring->dev, instdone);
 
+	stuck = true;
+	for (i = 0; i < I915_NUM_INSTDONE_REG; i++) {
+		if (instdone[i] != ring->hangcheck.instdone[i])
+			stuck = false;
+
+		ring->hangcheck.instdone[i] = instdone[i];
+	}
+
+	return stuck;
+}
+
+static enum intel_ring_hangcheck_action
+head_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
 	if (acthd != ring->hangcheck.acthd) {
 		if (acthd > ring->hangcheck.max_acthd) {
 			ring->hangcheck.max_acthd = acthd;
@@ -2929,6 +2947,24 @@  ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		return HANGCHECK_ACTIVE_LOOP;
 	}
 
+	if (!subunits_stuck(ring))
+		return HANGCHECK_ACTIVE_LOOP;
+
+	return HANGCHECK_HUNG;
+}
+
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_engine_cs *ring, u64 acthd)
+{
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	enum intel_ring_hangcheck_action ha;
+	u32 tmp;
+
+	ha = head_stuck(ring, acthd);
+	if (ha != HANGCHECK_HUNG)
+		return ha;
+
 	if (IS_GEN2(dev))
 		return HANGCHECK_HUNG;
 
@@ -3064,6 +3100,9 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 				ring->hangcheck.score--;
 
 			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+
+			memset(ring->hangcheck.instdone, 0,
+			       sizeof(ring->hangcheck.instdone));
 		}
 
 		ring->hangcheck.seqno = seqno;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d1eb20..b8fe92e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -93,6 +93,7 @@  struct intel_ring_hangcheck {
 	int score;
 	enum intel_ring_hangcheck_action action;
 	int deadlock;
+	u32 instdone[I915_NUM_INSTDONE_REG];
 };
 
 struct intel_ringbuffer {