diff mbox

[1/6] drm/i915: Split up hangcheck phases

Message ID 1479220596-2784-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Nov. 15, 2016, 2:36 p.m. UTC
In order to simplify hangcheck state keeping, split hangcheck
per engine loop in three phases: state load, action, state save.

Add few more hangcheck actions to separate between seqno, head
and subunit movements. This helps to gather all the hangcheck
actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c   |   8 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 241 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +-
 3 files changed, 146 insertions(+), 107 deletions(-)

Comments

Chris Wilson Nov. 15, 2016, 4:39 p.m. UTC | #1
On Tue, Nov 15, 2016 at 04:36:31PM +0200, Mika Kuoppala wrote:
> In order to simplify hangcheck state keeping, split hangcheck
> per engine loop in three phases: state load, action, state save.
> 
> Add few more hangcheck actions to separate between seqno, head
> and subunit movements. This helps to gather all the hangcheck
> actions under a single switch umbrella.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Chris Wilson Nov. 15, 2016, 4:47 p.m. UTC | #2
On Tue, Nov 15, 2016 at 03:45:41PM -0000, Patchwork wrote:
> == Series Details ==
> 
> Series: series starting with [1/6] drm/i915: Split up hangcheck phases
> URL   : https://patchwork.freedesktop.org/series/15351/
> State : success
> 
> == Summary ==
> 
> Series 15351v1 Series without cover letter
> https://patchwork.freedesktop.org/api/1.0/series/15351/revisions/1/mbox/
> 
> 
> fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
> fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
> fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
> fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
> fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
> fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
> fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 

Worth asking: what's the impact on the hangcheck tests runtime within BAT?

Presumably still the same timeouts...
-Chris
Mika Kuoppala Nov. 15, 2016, 5:27 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Nov 15, 2016 at 03:45:41PM -0000, Patchwork wrote:
>> == Series Details ==
>> 
>> Series: series starting with [1/6] drm/i915: Split up hangcheck phases
>> URL   : https://patchwork.freedesktop.org/series/15351/
>> State : success
>> 
>> == Summary ==
>> 
>> Series 15351v1 Series without cover letter
>> https://patchwork.freedesktop.org/api/1.0/series/15351/revisions/1/mbox/
>> 
>> 
>> fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
>> fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
>> fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
>> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
>> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
>> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
>> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
>> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
>> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
>> fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
>> fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
>> fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
>> fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
>> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
>> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
>
> Worth asking: what's the impact on the hangcheck tests runtime within BAT?
>

Will measure.

I suspect that we will be a tad faster due to brutal
nature of igt hangs being now deterministcally detected in hangcheck
resolution accuracy...

> Presumably still the same timeouts...

and that the simplest engine fastest detection was 4.5secs
on old code. Now it is 4secs.

I just wanted second resolution the defines.

-Mika

> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@  static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
 		return "idle";
 	case HANGCHECK_WAIT:
 		return "wait";
-	case HANGCHECK_ACTIVE:
-		return "active";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@  head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_SUBUNITS;
 
 	return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@  engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+				  struct intel_engine_hangcheck *hc)
+{
+	/* We don't strictly need an irq-barrier here, as we are not
+	 * serving an interrupt request, be paranoid in case the
+	 * barrier has side-effects (such as preventing a broken
+	 * cacheline snoop) and so be sure that we can see the seqno
+	 * advance. If the seqno should stick, due to a stale
+	 * cacheline, we would erroneously declare the GPU hung.
+	 */
+	if (engine->irq_seqno_barrier)
+		engine->irq_seqno_barrier(engine);
+
+	hc->acthd = intel_engine_get_active_head(engine);
+	hc->seqno = intel_engine_get_seqno(engine);
+	hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+				   const struct intel_engine_hangcheck *hc)
+{
+	engine->hangcheck.acthd = hc->acthd;
+	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.score = hc->score;
+	engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+		     const struct intel_engine_hangcheck *hc)
+{
+	if (engine->hangcheck.seqno != hc->seqno)
+		return HANGCHECK_ACTIVE_SEQNO;
+
+	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+		return HANGCHECK_IDLE;
+
+	return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+					struct intel_engine_hangcheck *hc)
+{
+	hc->action = hangcheck_get_action(engine, hc);
+
+	switch (hc->action) {
+	case HANGCHECK_IDLE:
+	case HANGCHECK_WAIT:
+		break;
+
+	case HANGCHECK_ACTIVE_HEAD:
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		/* We always increment the hangcheck score
+		 * if the engine is busy and still processing
+		 * the same request, so that no single request
+		 * can run indefinitely (such as a chain of
+		 * batches). The only time we do not increment
+		 * the hangcheck score on this ring, if this
+		 * engine is in a legitimate wait for another
+		 * engine. In that case the waiting engine is a
+		 * victim and we want to be sure we catch the
+		 * right culprit. Then every time we do kick
+		 * the ring, add a small increment to the
+		 * score so that we can catch a batch that is
+		 * being repeatedly kicked and so responsible
+		 * for stalling the machine.
+		 */
+		hc->score += 1;
+		break;
+
+	case HANGCHECK_KICK:
+		hc->score += 5;
+		break;
+
+	case HANGCHECK_HUNG:
+		hc->score += 20;
+		break;
+
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Gradually reduce the count so that we catch DoS
+		 * attempts across multiple batches.
+		 */
+		if (hc->score > 0)
+			hc->score -= 15;
+		if (hc->score < 0)
+			hc->score = 0;
+
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+		break;
+
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+				   unsigned int hung,
+				   unsigned int stuck)
+{
+	struct intel_engine_cs *engine;
+	char msg[80];
+	unsigned int tmp;
+	int len;
+
+	/* If some rings hung but others were still busy, only
+	 * blame the hanging rings in the synopsis.
+	 */
+	if (stuck != hung)
+		hung &= ~stuck;
+	len = scnprintf(msg, sizeof(msg),
+			"%s on ", stuck == hung ? "No progress" : "Hang");
+	for_each_engine_masked(engine, i915, hung, tmp)
+		len += scnprintf(msg + len, sizeof(msg) - len,
+				 "%s, ", engine->name);
+	msg[len-2] = '\0';
+
+	return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	enum intel_engine_id id;
 	unsigned int hung = 0, stuck = 0;
 	int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -326,112 +445,26 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		bool busy = intel_engine_has_waiter(engine);
-		u64 acthd;
-		u32 seqno;
-		u32 submit;
+		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+		const bool busy = intel_engine_has_waiter(engine);
 
 		semaphore_clear_deadlocks(dev_priv);
 
-		/* We don't strictly need an irq-barrier here, as we are not
-		 * serving an interrupt request, be paranoid in case the
-		 * barrier has side-effects (such as preventing a broken
-		 * cacheline snoop) and so be sure that we can see the seqno
-		 * advance. If the seqno should stick, due to a stale
-		 * cacheline, we would erroneously declare the GPU hung.
-		 */
-		if (engine->irq_seqno_barrier)
-			engine->irq_seqno_barrier(engine);
-
-		acthd = intel_engine_get_active_head(engine);
-		seqno = intel_engine_get_seqno(engine);
-		submit = intel_engine_last_submit(engine);
-
-		if (engine->hangcheck.seqno == seqno) {
-			if (i915_seqno_passed(seqno, submit)) {
-				engine->hangcheck.action = HANGCHECK_IDLE;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the engine is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * engine is in a legitimate wait for another
-				 * engine. In that case the waiting engine is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				engine->hangcheck.action =
-					engine_stuck(engine, acthd);
-
-				switch (engine->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					engine->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					engine->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					engine->hangcheck.score += HUNG;
-					break;
-				}
-			}
-
-			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-				hung |= intel_engine_flag(engine);
-				if (engine->hangcheck.action != HANGCHECK_HUNG)
-					stuck |= intel_engine_flag(engine);
-			}
-		} else {
-			engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (engine->hangcheck.score > 0)
-				engine->hangcheck.score -= ACTIVE_DECAY;
-			if (engine->hangcheck.score < 0)
-				engine->hangcheck.score = 0;
-
-			/* Clear head and subunit states on seqno movement */
-			acthd = 0;
-
-			memset(&engine->hangcheck.instdone, 0,
-			       sizeof(engine->hangcheck.instdone));
+		hangcheck_load_sample(engine, hc);
+		hangcheck_accumulate_sample(engine, hc);
+		hangcheck_store_sample(engine, hc);
+
+		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+			hung |= intel_engine_flag(engine);
+			if (hc->action != HANGCHECK_HUNG)
+				stuck |= intel_engine_flag(engine);
 		}
 
-		engine->hangcheck.seqno = seqno;
-		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
 	}
 
-	if (hung) {
-		char msg[80];
-		unsigned int tmp;
-		int len;
-
-		/* If some rings hung but others were still busy, only
-		 * blame the hanging rings in the synopsis.
-		 */
-		if (stuck != hung)
-			hung &= ~stuck;
-		len = scnprintf(msg, sizeof(msg),
-				"%s on ", stuck == hung ? "No progress" : "Hang");
-		for_each_engine_masked(engine, dev_priv, hung, tmp)
-			len += scnprintf(msg + len, sizeof(msg) - len,
-					 "%s, ", engine->name);
-		msg[len-2] = '\0';
-
-		return i915_handle_error(dev_priv, hung, msg);
-	}
+	if (hung)
+		hangcheck_declare_hang(dev_priv, hung, stuck);
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@  struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE,
+	HANGCHECK_ACTIVE_SEQNO,
+	HANGCHECK_ACTIVE_HEAD,
+	HANGCHECK_ACTIVE_SUBUNITS,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };