diff mbox

[06/13] drm/i915: detect hang using per ring hangcheck_score

Message ID 1361876716-8625-7-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Feb. 26, 2013, 11:05 a.m. UTC
Add a per-ring score for the possible culprit of a GPU hang. If a
ring is busy and not waiting, it will end up with the highest score
across calls to i915_hangcheck_elapsed. This way we are most
likely to find the ring that caused the hang among the waiting
ones.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   65 +++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
 2 files changed, 36 insertions(+), 30 deletions(-)

Comments

Chris Wilson Feb. 26, 2013, 2:16 p.m. UTC | #1
On Tue, Feb 26, 2013 at 01:05:09PM +0200, Mika Kuoppala wrote:
> Add per ring score of possible culprit for gpu hang. If
> ring is busy and not waiting, it will get the highest score
> across calls to i915_hangcheck_elapsed. This way we are
> most likely to find the ring that caused the hang among
> the waiting ones.

I think you want to incorporate
https://patchwork.kernel.org/patch/1958381/
into this series as it addresses one case of many waiters, none guilty.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b828807..4da8691 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -356,7 +356,6 @@  static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
 			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
@@ -1818,52 +1817,58 @@  void i915_hangcheck_elapsed(unsigned long data)
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
+		u32 seqno;
+		bool idle, err = false;
+
+		seqno = ring->get_seqno(ring, false);
+		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
+		if (idle) {
+			if (err)
+				ring->hangcheck_score++;
+			else
+				ring->hangcheck_score = 0;
+		} else {
+			busy_count++;
 
-			goto repeat;
+			if (ring->hangcheck_seqno == seqno) {
+				ring->hangcheck_score++;
+
+				/* If the ring is not waiting, raise
+				   the score further */
+				if (i915_hangcheck_ring_hung(dev, ring))
+					ring->hangcheck_score++;
+			} else {
+				ring->hangcheck_score = 0;
+			}
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck_seqno = seqno;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck_seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck_seqno = seqno[i];
+		if (ring->hangcheck_score > 2) {
+			rings_hung++;
+			DRM_ERROR("%s seems hung\n", ring->name);
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9599c56..97b8f37 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -138,6 +138,7 @@  struct  intel_ring_buffer {
 	struct drm_i915_gem_object *last_context_obj;
 
 	u32 hangcheck_seqno;
+	int hangcheck_score;
 
 	void *private;
 };