[v2,4/6] drm/i915: Harden detection of missed interrupts
diff mbox

Message ID 1452937580-3625-4-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson Jan. 16, 2016, 9:46 a.m. UTC
Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 11 +++++++----
 drivers/gpu/drm/i915/i915_irq.c         |  7 ++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 3 files changed, 15 insertions(+), 5 deletions(-)

Comments

Mika Kuoppala Jan. 18, 2016, 1:07 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Only declare a missed interrupt if we find that the GPU is idle with
> waiters and a hangcheck interval has passed in which no new user
> interrupts have been raised.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     | 11 +++++++----
>  drivers/gpu/drm/i915/i915_irq.c         |  7 ++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
>  3 files changed, 15 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index b421b53ca128..966fc022418c 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -730,10 +730,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
>  static void i915_ring_seqno_info(struct seq_file *m,
>  				 struct intel_engine_cs *ring)
>  {
> -	if (ring->get_seqno) {
> -		seq_printf(m, "Current sequence (%s): %x\n",
> -			   ring->name, ring->get_seqno(ring));
> -	}
> +	seq_printf(m, "Current sequence (%s): %x\n",
> +		   ring->name, ring->get_seqno(ring));
> +	seq_printf(m, "Current user interrupts (%s): %x\n",
> +		   ring->name, READ_ONCE(ring->user_interrupts));
>  }
>  
>  static int i915_gem_seqno_info(struct seq_file *m, void *data)
> @@ -1361,6 +1361,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  		seq_printf(m, "%s:\n", ring->name);
>  		seq_printf(m, "\tseqno = %x [current %x]\n",
>  			   ring->hangcheck.seqno, seqno[i]);
> +		seq_printf(m, "\tuser interrupts = %x [current %x]\n",
> +			   ring->hangcheck.user_interrupts,
> +			   ring->user_interrupts);
>  		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
>  			   (long long)ring->hangcheck.acthd,
>  			   (long long)acthd[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 07bc2cdd6252..978eebcf4594 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *ring)
>  		return;
>  
>  	trace_i915_gem_request_notify(ring);
> +	ring->user_interrupts++;
>  

For 100% accuracy we would need (w)mb() here?

Now you only do READ_ONCE on reader side which is not
enough.

I admit that the chances to hit this are slim to none,
but is this all to avoid mb on irq path?

-Mika



>  	wake_up_all(&ring->irq_queue);
>  }
> @@ -3097,6 +3098,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  	for_each_ring(ring, dev_priv, i) {
>  		u64 acthd;
>  		u32 seqno;
> +		unsigned user_interrupts;
>  		bool busy = true;
>  
>  		semaphore_clear_deadlocks(dev_priv);
> @@ -3113,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  
>  		acthd = intel_ring_get_active_head(ring);
>  		seqno = ring->get_seqno(ring);
> +		user_interrupts = READ_ONCE(ring->user_interrupts);
>  
>  		if (ring->hangcheck.seqno == seqno) {
>  			if (ring_idle(ring, seqno)) {
> @@ -3120,7 +3123,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  
>  				if (waitqueue_active(&ring->irq_queue)) {
>  					/* Issue a wake-up to catch stuck h/w. */
> -					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> +					if (ring->hangcheck.user_interrupts == user_interrupts &&
> +					    !test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
>  						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
>  							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
>  								  ring->name);
> @@ -3187,6 +3191,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  
>  		ring->hangcheck.seqno = seqno;
>  		ring->hangcheck.acthd = acthd;
> +		ring->hangcheck.user_interrupts = user_interrupts;
>  		busy_count += busy;
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index e1797d42054c..b30ad99a54bf 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -90,6 +90,7 @@ struct intel_ring_hangcheck {
>  	u64 acthd;
>  	u64 max_acthd;
>  	u32 seqno;
> +	unsigned user_interrupts;
>  	int score;
>  	enum intel_ring_hangcheck_action action;
>  	int deadlock;
> @@ -301,6 +302,7 @@ struct  intel_engine_cs {
>  	 * inspecting request list.
>  	 */
>  	u32 last_submitted_seqno;
> +	unsigned user_interrupts;
>  
>  	bool gpu_caches_dirty;
>  
> -- 
> 2.7.0.rc3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 18, 2016, 3:35 p.m. UTC | #2
On Mon, Jan 18, 2016 at 03:07:16PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Only declare a missed interrupt if we find that the GPU is idle with
> > waiters and a hangcheck interval has passed in which no new user
> > interrupts have been raised.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c     | 11 +++++++----
> >  drivers/gpu/drm/i915/i915_irq.c         |  7 ++++++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
> >  3 files changed, 15 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index b421b53ca128..966fc022418c 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -730,10 +730,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
> >  static void i915_ring_seqno_info(struct seq_file *m,
> >  				 struct intel_engine_cs *ring)
> >  {
> > -	if (ring->get_seqno) {
> > -		seq_printf(m, "Current sequence (%s): %x\n",
> > -			   ring->name, ring->get_seqno(ring));
> > -	}
> > +	seq_printf(m, "Current sequence (%s): %x\n",
> > +		   ring->name, ring->get_seqno(ring));
> > +	seq_printf(m, "Current user interrupts (%s): %x\n",
> > +		   ring->name, READ_ONCE(ring->user_interrupts));
> >  }
> >  
> >  static int i915_gem_seqno_info(struct seq_file *m, void *data)
> > @@ -1361,6 +1361,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> >  		seq_printf(m, "%s:\n", ring->name);
> >  		seq_printf(m, "\tseqno = %x [current %x]\n",
> >  			   ring->hangcheck.seqno, seqno[i]);
> > +		seq_printf(m, "\tuser interrupts = %x [current %x]\n",
> > +			   ring->hangcheck.user_interrupts,
> > +			   ring->user_interrupts);
> >  		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
> >  			   (long long)ring->hangcheck.acthd,
> >  			   (long long)acthd[i]);
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index 07bc2cdd6252..978eebcf4594 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *ring)
> >  		return;
> >  
> >  	trace_i915_gem_request_notify(ring);
> > +	ring->user_interrupts++;
> >  
> 
> > For 100% accuracy we would need (w)mb() here?

No. Or rather, there already is one; we don't need another.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index b421b53ca128..966fc022418c 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -730,10 +730,10 @@  static int i915_gem_request_info(struct seq_file *m, void *data)
 static void i915_ring_seqno_info(struct seq_file *m,
 				 struct intel_engine_cs *ring)
 {
-	if (ring->get_seqno) {
-		seq_printf(m, "Current sequence (%s): %x\n",
-			   ring->name, ring->get_seqno(ring));
-	}
+	seq_printf(m, "Current sequence (%s): %x\n",
+		   ring->name, ring->get_seqno(ring));
+	seq_printf(m, "Current user interrupts (%s): %x\n",
+		   ring->name, READ_ONCE(ring->user_interrupts));
 }
 
 static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1361,6 +1361,9 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "%s:\n", ring->name);
 		seq_printf(m, "\tseqno = %x [current %x]\n",
 			   ring->hangcheck.seqno, seqno[i]);
+		seq_printf(m, "\tuser interrupts = %x [current %x]\n",
+			   ring->hangcheck.user_interrupts,
+			   ring->user_interrupts);
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)ring->hangcheck.acthd,
 			   (long long)acthd[i]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 07bc2cdd6252..978eebcf4594 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1000,6 +1000,7 @@  static void notify_ring(struct intel_engine_cs *ring)
 		return;
 
 	trace_i915_gem_request_notify(ring);
+	ring->user_interrupts++;
 
 	wake_up_all(&ring->irq_queue);
 }
@@ -3097,6 +3098,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	for_each_ring(ring, dev_priv, i) {
 		u64 acthd;
 		u32 seqno;
+		unsigned user_interrupts;
 		bool busy = true;
 
 		semaphore_clear_deadlocks(dev_priv);
@@ -3113,6 +3115,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		acthd = intel_ring_get_active_head(ring);
 		seqno = ring->get_seqno(ring);
+		user_interrupts = READ_ONCE(ring->user_interrupts);
 
 		if (ring->hangcheck.seqno == seqno) {
 			if (ring_idle(ring, seqno)) {
@@ -3120,7 +3123,8 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 
 				if (waitqueue_active(&ring->irq_queue)) {
 					/* Issue a wake-up to catch stuck h/w. */
-					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
+					if (ring->hangcheck.user_interrupts == user_interrupts &&
+					    !test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
 						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
 							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
 								  ring->name);
@@ -3187,6 +3191,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		ring->hangcheck.seqno = seqno;
 		ring->hangcheck.acthd = acthd;
+		ring->hangcheck.user_interrupts = user_interrupts;
 		busy_count += busy;
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e1797d42054c..b30ad99a54bf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -90,6 +90,7 @@  struct intel_ring_hangcheck {
 	u64 acthd;
 	u64 max_acthd;
 	u32 seqno;
+	unsigned user_interrupts;
 	int score;
 	enum intel_ring_hangcheck_action action;
 	int deadlock;
@@ -301,6 +302,7 @@  struct  intel_engine_cs {
 	 * inspecting request list.
 	 */
 	u32 last_submitted_seqno;
+	unsigned user_interrupts;
 
 	bool gpu_caches_dirty;