
[04/13] drm/i915: track ring progression using seqnos

Message ID 1361876716-8625-5-git-send-email-mika.kuoppala@intel.com

Commit Message

Mika Kuoppala Feb. 26, 2013, 11:05 a.m. UTC
Instead of relying on acthd, track ring seqno progression
to detect if the ring has hung.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    2 --
 drivers/gpu/drm/i915/i915_irq.c         |   30 +++++++++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
 3 files changed, 15 insertions(+), 19 deletions(-)

Comments

Chris Wilson Feb. 26, 2013, 2:12 p.m. UTC | #1
On Tue, Feb 26, 2013 at 01:05:07PM +0200, Mika Kuoppala wrote:
> Instead of relying on acthd, track ring seqno progression
> to detect if the ring has hung.

This needs a comment that it has a user-visible side-effect of limiting
batches to a maximum of 1.5s runtime. Before, that limit was softer in
that we had a chance to spot that the GPU was busy - even if we could be
fooled by an infinite loop.

Did you write an i-g-t for detecting a ring of chained batchbuffers?
-Chris
Mika Kuoppala Feb. 26, 2013, 3:09 p.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Tue, Feb 26, 2013 at 01:05:07PM +0200, Mika Kuoppala wrote:
>> Instead of relying on acthd, track ring seqno progression
>> to detect if the ring has hung.
>
> This needs a comment that it has a user-visible side-effect of limiting
> batches to a maximum of 1.5s runtime. Before, that limit was softer in
> that we had a chance to spot that the GPU was busy - even if we could be
> fooled by an infinite loop.

As the current code has:

> if (dev_priv->gpu_error.hangcheck_count++ > 1)

to trigger the hang, it will trigger on the 3rd timer call (4.5 seconds).

With my patch it will be 4.5 seconds if the rings are waiting
and 3 seconds if a non-waiting ring is involved.

As I can't explain what the added benefit of declaring a hang earlier is
when there is a non-waiting ring, do you want me to simplify this to just
declare a hang if there is no progress within 4.5 seconds in both cases,
to match the old trigger timing?
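
To spell out the arithmetic (just a sketch; DRM_I915_HANGCHECK_PERIOD and the
count test are the existing ones, the helper below is made up for illustration):

#include <stdbool.h>

#define DRM_I915_HANGCHECK_PERIOD 1500	/* ms, as in i915_drv.h */

/* Illustration only: with "hangcheck_count++ > 1" the hang fires on the
 * third consecutive timer call that sees no progress, i.e. after
 * 3 * 1500 ms = 4500 ms of no seqno movement. */
static bool hang_declared(int periods_without_progress)
{
	int count = 0;

	while (periods_without_progress--)
		if (count++ > 1)
			return true;
	return false;
}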

> Did you write an i-g-t for detecting a ring of chained batchbuffers?
> -Chris

Yes, you can find it here:

https://github.com/mkuoppal/intel-gpu-tools/commit/1df7d49ff9ecedf9c55933a9e36b1eb41f07abc6

If you want to compile/run it, you also need:
https://github.com/mkuoppal/linux/commit/f24c1d64f89b070c74afa38ab5ac148f56c84aaf

The interface will change, but currently the test is based on the old ioctl.

Thanks,
-Mika
Chris Wilson Feb. 26, 2013, 3:31 p.m. UTC | #3
On Tue, Feb 26, 2013 at 05:09:03PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On Tue, Feb 26, 2013 at 01:05:07PM +0200, Mika Kuoppala wrote:
> >> Instead of relying on acthd, track ring seqno progression
> >> to detect if the ring has hung.
> >
> > This needs a comment that it has a user-visible side-effect of limiting
> > batches to a maximum of 1.5s runtime. Before, that limit was softer in
> > that we had a chance to spot that the GPU was busy - even if we could be
> > fooled by an infinite loop.
> 
> As the current code has:
> 
> > if (dev_priv->gpu_error.hangcheck_count++ > 1)
> 
> to trigger the hang, it will trigger on the 3rd timer call (4.5 seconds).
> 
> With my patch it will be 4.5 seconds if the rings are waiting
> and 3 seconds if a non-waiting ring is involved.
> 
> As I can't explain what the added benefit of declaring a hang earlier is
> when there is a non-waiting ring, do you want me to simplify this to just
> declare a hang if there is no progress within 4.5 seconds in both cases,
> to match the old trigger timing?

Yes, it would be good if you could simplify the logic so that it does
not react differently for no apparent reason. I would prefer the 3s
timeout.
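
A minimal sketch of what I mean, as a fragment for the body of
i915_hangcheck_elapsed(), reusing the per-ring hangcheck_seqno this patch
adds (untested; the exact number of periods before declaring the hang is up
to you):

	/* One rule for every ring: declare a hang only after the same
	 * number of consecutive hangcheck periods in which no ring's
	 * seqno has advanced, whether or not a ring happens to be
	 * waiting. */
	bool progress = false;

	for_each_ring(ring, dev_priv, i) {
		u32 seqno = ring->get_seqno(ring, false);

		if (seqno != ring->hangcheck_seqno) {
			progress = true;
			ring->hangcheck_seqno = seqno;
		}
	}

	if (progress)
		dev_priv->gpu_error.hangcheck_count = 0;
	else if (i915_hangcheck_hung(dev))
		return;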

> > Did you write an i-g-t for detecting a ring of chained batchbuffers?
> > -Chris
> 
> Yes, you can find it in here:
> 
> https://github.com/mkuoppal/intel-gpu-tools/commit/1df7d49ff9ecedf9c55933a9e36b1eb41f07abc6

I'd definitely add an exercise for an infinite loop that was not caught by
the current hangcheck, and add a fixes: note to this changelog.
(I couldn't spot that explicit case in the reset-status test, and it can
be added independently of this series as a demonstration of a known
breakage.)
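
For concreteness, the kind of batch I mean is one whose only payload is an
MI_BATCH_BUFFER_START that jumps back to its own start: ACTHD keeps moving,
but the ring's seqno never advances. A rough sketch of the batch contents
only (the emit helper is illustrative, not the actual i-g-t code, and the
start-command flags are gen-dependent and omitted here):

#include <stdint.h>

#define MI_BATCH_BUFFER_START	(0x31 << 23)	/* base opcode, flags omitted */
#define MI_NOOP			0

/* Fill a batch that branches to itself forever. */
static void emit_self_looping_batch(uint32_t *batch, uint32_t batch_gtt_offset)
{
	batch[0] = MI_BATCH_BUFFER_START;	/* gen-specific flags left out */
	batch[1] = batch_gtt_offset;		/* jump target: this very batch */
	batch[2] = MI_NOOP;			/* pad to an even dword count */
	batch[3] = MI_NOOP;
}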
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9f1a75d..fb51b4f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -797,8 +797,6 @@  struct i915_gpu_error {
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
 	int hangcheck_count;
-	uint32_t last_acthd[I915_NUM_RINGS];
-	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 4f60c87..de5af12 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1810,22 +1810,19 @@  void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd[I915_NUM_RINGS], instdone[I915_NUM_INSTDONE_REG];
 	struct intel_ring_buffer *ring;
 	bool err = false, idle;
 	int i;
+	u32 seqno[I915_NUM_RINGS];
+	bool work_done;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	memset(acthd, 0, sizeof(acthd));
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		u32 seqno;
-
-		seqno = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno, &err);
-	    acthd[i] = intel_ring_get_active_head(ring);
+		seqno[i] = ring->get_seqno(ring, false);
+		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
 	}
 
 	/* If all work is done then ACTHD clearly hasn't advanced. */
@@ -1841,20 +1838,19 @@  void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	i915_get_extra_instdone(dev, instdone);
-	if (memcmp(dev_priv->gpu_error.last_acthd, acthd,
-		   sizeof(acthd)) == 0 &&
-	    memcmp(dev_priv->gpu_error.prev_instdone, instdone,
-		   sizeof(instdone)) == 0) {
+	work_done = false;
+	for_each_ring(ring, dev_priv, i) {
+		if (ring->hangcheck_seqno != seqno[i]) {
+			work_done = true;
+			ring->hangcheck_seqno = seqno[i];
+		}
+	}
+
+	if (!work_done) {
 		if (i915_hangcheck_hung(dev))
 			return;
 	} else {
 		dev_priv->gpu_error.hangcheck_count = 0;
-
-		memcpy(dev_priv->gpu_error.last_acthd, acthd,
-		       sizeof(acthd));
-		memcpy(dev_priv->gpu_error.prev_instdone, instdone,
-		       sizeof(instdone));
 	}
 
 repeat:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..9599c56 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -137,6 +137,8 @@  struct  intel_ring_buffer {
 	struct i915_hw_context *default_context;
 	struct drm_i915_gem_object *last_context_obj;
 
+	u32 hangcheck_seqno;
+
 	void *private;
 };