diff mbox

drm/i915/hangcheck: Prevent long walks across full-ppgtt

Message ID 1456930109-21532-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala March 2, 2016, 2:48 p.m. UTC
With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB
address space, causing a loop to be detected. Under the current scheme,
if ACTHD walks off the end of a batch buffer and into an empty
address space, we "never" detect the hang. If we always increment the
score as the ACTHD is progressing then we will eventually time out (after
~46.5s (31 * 1.5s) without advancing onto a new batch). To counteract
this, increase the amount by which we reduce the score for good batches,
so that only a series of almost-bad batches triggers a full reset. DoS
detection suffers slightly but a series of long-running shader tests will
benefit.

Based on a patch from Chris Wilson.

Testcase: igt/drv_hangman/hangcheck-unterminated
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  2 --
 drivers/gpu/drm/i915/i915_gpu_error.c   |  2 --
 drivers/gpu/drm/i915/i915_irq.c         | 17 +++++++----------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 --
 4 files changed, 7 insertions(+), 16 deletions(-)

Comments

Chris Wilson March 2, 2016, 3:39 p.m. UTC | #1
On Wed, Mar 02, 2016 at 04:48:29PM +0200, Mika Kuoppala wrote:
> With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB
> address space, causing a loop to be detected. Under the current scheme,
> if ACTHD walks off the end of a batch buffer and into an empty
> address space, we "never" detect the hang. If we always increment the
> score as the ACTHD is progressing then we will eventually timeout (after
> ~46.5s (31 * 1.5s) without advancing onto a new batch). To counter act
> this, increase the amount we reduce the score for good batches, so that
> only a series of almost-bad batches trigger a full reset. DoS detection
> suffers slightly but series of long running shader tests will benefit.
> 
> Based on a patch from Chris Wilson.
> 
> Testcase: igt/drv_hangman/hangcheck-unterminated
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Mika Kuoppala March 3, 2016, 11:05 a.m. UTC | #2
Patchwork <patchwork@emeril.freedesktop.org> writes:

> == Series Details ==
>
> Series: drm/i915/hangcheck: Prevent long walks across full-ppgtt
> URL   : https://patchwork.freedesktop.org/series/4023/
> State : warning
>
> == Summary ==
>
> Series 4023v1 drm/i915/hangcheck: Prevent long walks across full-ppgtt
> http://patchwork.freedesktop.org/api/1.0/series/4023/revisions/1/mbox/
>
> Test drv_module_reload_basic:
>                 pass       -> DMESG-WARN (ilk-hp8440p)

https://bugs.freedesktop.org/show_bug.cgi?id=94385

> Test kms_flip:
>         Subgroup basic-flip-vs-dpms:
>                 pass       -> DMESG-WARN (ilk-hp8440p) UNSTABLE
>         Subgroup basic-flip-vs-modeset:
>                 pass       -> INCOMPLETE (ilk-hp8440p) UNSTABLE
> Test kms_force_connector_basic:
>         Subgroup force-load-detect:
>                 skip       -> PASS       (ivb-t430s)
> Test kms_pipe_crc_basic:
>         Subgroup nonblocking-crc-pipe-b-frame-sequence:
>                 pass       -> DMESG-WARN (snb-x220t)

https://bugs.freedesktop.org/show_bug.cgi?id=94349

>                 dmesg-warn -> PASS       (hsw-brixbox)
>         Subgroup suspend-read-crc-pipe-a:
>                 incomplete -> PASS       (hsw-gt2)
>         Subgroup suspend-read-crc-pipe-c:
>                 dmesg-warn -> PASS       (bsw-nuc-2)
> Test pm_rpm:
>         Subgroup basic-rte:
>                 pass       -> DMESG-WARN (snb-dellxps)

https://bugs.freedesktop.org/show_bug.cgi?id=94349

>
> bdw-nuci7        total:169  pass:158  dwarn:0   dfail:0   fail:0   skip:11 
> bdw-ultra        total:169  pass:155  dwarn:0   dfail:0   fail:0   skip:14 
> bsw-nuc-2        total:169  pass:138  dwarn:0   dfail:0   fail:1   skip:30 
> byt-nuc          total:169  pass:144  dwarn:0   dfail:0   fail:0   skip:25 
> hsw-brixbox      total:169  pass:154  dwarn:0   dfail:0   fail:0   skip:15 
> hsw-gt2          total:169  pass:158  dwarn:1   dfail:0   fail:0   skip:10 
> ilk-hp8440p      total:156  pass:106  dwarn:2   dfail:0   fail:0   skip:47 
> ivb-t430s        total:169  pass:154  dwarn:0   dfail:0   fail:0   skip:15 
> skl-i5k-2        total:169  pass:153  dwarn:0   dfail:0   fail:0   skip:16 
> skl-i7k-2        total:169  pass:153  dwarn:0   dfail:0   fail:0   skip:16 
> snb-dellxps      total:169  pass:144  dwarn:2   dfail:0   fail:0   skip:23 
> snb-x220t        total:169  pass:144  dwarn:2   dfail:0   fail:1   skip:22 
>
> Results at /archive/results/CI_IGT_test/Patchwork_1517/
>
> db506392f6706faffdc965c53c4cdea58cc16a02 drm-intel-nightly: 2016y-03m-02d-13h-47m-11s UTC integration manifest
> 73a64b9e04a74b5bed5333823b5eebe930396689 drm/i915/hangcheck: Prevent long walks across full-ppgtt
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a0f1bd711b53..15aacd0ee66f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1367,8 +1367,6 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)ring->hangcheck.acthd,
 			   (long long)acthd[i]);
-		seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
-			   (long long)ring->hangcheck.max_acthd);
 		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
 		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3b6bfbf35482..13b5f3aed01c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -230,8 +230,6 @@  static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 		return "wait";
 	case HANGCHECK_ACTIVE:
 		return "active";
-	case HANGCHECK_ACTIVE_LOOP:
-		return "active (loop)";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d1a46ef5ab3f..53e5104964b3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -3001,12 +3001,7 @@  head_stuck(struct intel_engine_cs *ring, u64 acthd)
 		memset(ring->hangcheck.instdone, 0,
 		       sizeof(ring->hangcheck.instdone));
 
-		if (acthd > ring->hangcheck.max_acthd) {
-			ring->hangcheck.max_acthd = acthd;
-			return HANGCHECK_ACTIVE;
-		}
-
-		return HANGCHECK_ACTIVE_LOOP;
+		return HANGCHECK_ACTIVE;
 	}
 
 	if (!subunits_stuck(ring))
@@ -3083,6 +3078,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
+#define ACTIVE_DECAY 15
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -3151,9 +3147,8 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 				switch (ring->hangcheck.action) {
 				case HANGCHECK_IDLE:
 				case HANGCHECK_WAIT:
-				case HANGCHECK_ACTIVE:
 					break;
-				case HANGCHECK_ACTIVE_LOOP:
+				case HANGCHECK_ACTIVE:
 					ring->hangcheck.score += BUSY;
 					break;
 				case HANGCHECK_KICK:
@@ -3172,10 +3167,12 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			 * attempts across multiple batches.
 			 */
 			if (ring->hangcheck.score > 0)
-				ring->hangcheck.score--;
+				ring->hangcheck.score -= ACTIVE_DECAY;
+			if (ring->hangcheck.score < 0)
+				ring->hangcheck.score = 0;
 
 			/* Clear head and subunit states on seqno movement */
-			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+			ring->hangcheck.acthd = 0;
 
 			memset(ring->hangcheck.instdone, 0,
 			       sizeof(ring->hangcheck.instdone));
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index dd910d30a380..4b1439deb7fe 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -79,7 +79,6 @@  enum intel_ring_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
 	HANGCHECK_ACTIVE,
-	HANGCHECK_ACTIVE_LOOP,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };
@@ -88,7 +87,6 @@  enum intel_ring_hangcheck_action {
 
 struct intel_ring_hangcheck {
 	u64 acthd;
-	u64 max_acthd;
 	u32 seqno;
 	int score;
 	enum intel_ring_hangcheck_action action;