[2/3] drm/i915: Seek only one guilty batch per hanged ring

Message ID	1389968431-24123-2-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Mika Kuoppala <mika.kuoppala@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Fri, 17 Jan 2014 16:20:30 +0200 Message-Id: <1389968431-24123-2-git-send-email-mika.kuoppala@intel.com> In-Reply-To: <1389968431-24123-1-git-send-email-mika.kuoppala@intel.com> References: <1389968431-24123-1-git-send-email-mika.kuoppala@intel.com> Subject: [Intel-gfx] [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: intel-gfx-bounces@lists.freedesktop.org Errors-To: intel-gfx-bounces@lists.freedesktop.org

Message ID

1389968431-24123-2-git-send-email-mika.kuoppala@intel.com (mailing list archive)

State

New, archived

Headers

From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Date: Fri, 17 Jan 2014 16:20:30 +0200
Message-Id: <1389968431-24123-2-git-send-email-mika.kuoppala@intel.com>
In-Reply-To: <1389968431-24123-1-git-send-email-mika.kuoppala@intel.com>
References: <1389968431-24123-1-git-send-email-mika.kuoppala@intel.com>
Subject: [Intel-gfx] [PATCH 2/3] drm/i915: Seek only one guilty batch per
	hanged ring
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Sender: intel-gfx-bounces@lists.freedesktop.org
Errors-To: intel-gfx-bounces@lists.freedesktop.org

Commit Message

Mika Kuoppala Jan. 17, 2014, 2:20 p.m. UTC

Instead of going through all the requests to find a batch that
hanged the machine, use hangcheck score and the fact that
first noncompleted request on hanged ring is, with great
probability, the guilty one. This also ensure that we get one
guilty batch per hang instead of possibly more (for each ring)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
 drivers/gpu/drm/i915/i915_irq.c         |    3 +--
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
 3 files changed, 13 insertions(+), 11 deletions(-)

Comments

Mika Kuoppala Jan. 17, 2014, 2:50 p.m. UTC | #1

Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> Instead of going through all the requests to find a batch that
> hanged the machine, use hangcheck score and the fact that
> first noncompleted request on hanged ring is, with great
> probability, the guilty one. This also ensure that we get one
> guilty batch per hang instead of possibly more (for each ring)
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>

missing in here.
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>  3 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d270351..27a97c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd)
> +				  u32 acthd, const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside, guilty;
> +	bool inside;
>  	unsigned long offset = 0;
>  
> -	/* Innocent until proven guilty */
> -	guilty = false;
> -
>  	if (request->batch_obj)
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> +	if (guilty &&
>  	    i915_request_guilty(request, acthd, &inside)) {
>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  offset,
>  			  request->ctx ? request->ctx->id : 0,
>  			  acthd);
> -
> -		guilty = true;
>  	}
>  
>  	/* If contexts are disabled or this is the default context, use
> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	u32 completed_seqno = ring->get_seqno(ring, false);
>  	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
> +	bool guilty = false;
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>  			continue;
>  
> -		i915_set_reset_status(ring, request, acthd);
> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
> +			guilty = true;
> +			i915_set_reset_status(ring, request, acthd, true);
> +		} else {
> +			i915_set_reset_status(ring, request, acthd, false);
> +		}
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d11e25..e24f9ef 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  #define BUSY 1
>  #define KICK 5
>  #define HUNG 20
> -#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > FIRE) {
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 71a73f4..6018793 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>  	HANGCHECK_HUNG,
>  };
>  
> +#define HANGCHECK_SCORE_GUILTY 31
> +
>  struct intel_ring_hangcheck {
>  	bool deadlock;
>  	u32 seqno;
> -- 
> 1.7.9.5

Ben Widawsky Jan. 26, 2014, 6:49 p.m. UTC | #2

On Fri, Jan 17, 2014 at 04:20:30PM +0200, Mika Kuoppala wrote:
> Instead of going through all the requests to find a batch that
> hanged the machine, use hangcheck score and the fact that
hung, hanged???
> first noncompleted request on hanged ring is, with great
> probability, the guilty one. This also ensure that we get one
> guilty batch per hang instead of possibly more (for each ring)
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>  3 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d270351..27a97c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd)
> +				  u32 acthd, const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside, guilty;
> +	bool inside;
>  	unsigned long offset = 0;
>  
> -	/* Innocent until proven guilty */
> -	guilty = false;
> -
>  	if (request->batch_obj)
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> +	if (guilty &&
>  	    i915_request_guilty(request, acthd, &inside)) {
>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  offset,
>  			  request->ctx ? request->ctx->id : 0,
>  			  acthd);
> -
> -		guilty = true;
>  	}
>  
>  	/* If contexts are disabled or this is the default context, use
> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	u32 completed_seqno = ring->get_seqno(ring, false);
>  	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
> +	bool guilty = false;
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>  			continue;
>  
> -		i915_set_reset_status(ring, request, acthd);
> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {

checkpatch complains about the above.

> +			guilty = true;
> +			i915_set_reset_status(ring, request, acthd, true);
> +		} else {
> +			i915_set_reset_status(ring, request, acthd, false);
> +		}

I don't think the logic is correct. This will find the first request
(sequentially) that was hung, and not the first ring that hung.
Shouldn't we scan everything and take the first incomplete request with
the highest hangcheck.score?

Maybe I am crazy, but suppose we emit seqno 1 to ring X, and seqno 2 to
ring Y (the latter occurring after the first hang check)

event		RING X       RING Y
seqno 1 emit
seqno 2 emit	BUSY
		HUNG	      HUNG
		HUNG	      HUNG

The case here is somewhat academic since they both hung, but one might
argue that the first hang on ring Y is what caused the hang on ring X.

BTW, this change has convinced me that we really need to define
BUSY/KICK/HUNG as relative and not absolute values...

>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d11e25..e24f9ef 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  #define BUSY 1
>  #define KICK 5
>  #define HUNG 20
> -#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > FIRE) {
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 71a73f4..6018793 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>  	HANGCHECK_HUNG,
>  };
>  
> +#define HANGCHECK_SCORE_GUILTY 31
> +
>  struct intel_ring_hangcheck {
>  	bool deadlock;
>  	u32 seqno;

Mika Kuoppala Jan. 29, 2014, 3:20 p.m. UTC | #3

Ben Widawsky <ben@bwidawsk.net> writes:

> On Fri, Jan 17, 2014 at 04:20:30PM +0200, Mika Kuoppala wrote:
>> Instead of going through all the requests to find a batch that
>> hanged the machine, use hangcheck score and the fact that
> hung, hanged???
>> first noncompleted request on hanged ring is, with great
>> probability, the guilty one. This also ensure that we get one
>> guilty batch per hang instead of possibly more (for each ring)
>> 
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>>  3 files changed, 13 insertions(+), 11 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index d270351..27a97c3 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>>  
>>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>  				  struct drm_i915_gem_request *request,
>> -				  u32 acthd)
>> +				  u32 acthd, const bool guilty)
>>  {
>>  	struct i915_ctx_hang_stats *hs = NULL;
>> -	bool inside, guilty;
>> +	bool inside;
>>  	unsigned long offset = 0;
>>  
>> -	/* Innocent until proven guilty */
>> -	guilty = false;
>> -
>>  	if (request->batch_obj)
>>  		offset = i915_gem_obj_offset(request->batch_obj,
>>  					     request_to_vm(request));
>>  
>> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
>> +	if (guilty &&
>>  	    i915_request_guilty(request, acthd, &inside)) {
>>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>>  			  ring->name,
>> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>  			  offset,
>>  			  request->ctx ? request->ctx->id : 0,
>>  			  acthd);
>> -
>> -		guilty = true;
>>  	}
>>  
>>  	/* If contexts are disabled or this is the default context, use
>> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>>  	u32 completed_seqno = ring->get_seqno(ring, false);
>>  	u32 acthd = intel_ring_get_active_head(ring);
>>  	struct drm_i915_gem_request *request;
>> +	bool guilty = false;
>>  
>>  	list_for_each_entry(request, &ring->request_list, list) {
>>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>>  			continue;
>>  
>> -		i915_set_reset_status(ring, request, acthd);
>> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>
> checkpatch complains about the above.
>> +			guilty = true;
>> +			i915_set_reset_status(ring, request, acthd, true);
>> +		} else {
>> +			i915_set_reset_status(ring, request, acthd, false);
>> +		}
>
> I don't think the logic is correct. This will find the first request
> (sequentially) that was hung, and not the first ring that hung.
> Shouldn't we scan everything and take the first incomplete request with
> the highest hangcheck.score?
>
> Maybe I am crazy, but suppose we emit seqno 1 to ring X, and seqno 2 to
> ring Y (the latter occurring after the first hang check)
>
> event		RING X       RING Y
> seqno 1 emit
> seqno 2 emit	BUSY
> 		HUNG	      HUNG
> 		HUNG	      HUNG
>
> The case here is somewhat academic since they both hung, but one might
> argue that the first hang on ring Y is what caused the hang on ring X.
> 

Logic was wrong. Based on above and the feedback I got from Chris
and Daniel from my test RFC thread this is the reworked version:

1391007939-5741-4-git-send-email-mika.kuoppala@intel.com

We can get, depending ofcourse of hangcheck triggering timing, multiple
guilty batches for one hang if there were multiple rings involved.

> BTW, this change has convinced me that we really need to define
> BUSY/KICK/HUNG as relative and not absolute values...
>

I hope the reworked version of the patch voids your concern.
If not, could you elaborate this a bit more.

-Mika

>>  	}
>>  }
>>  
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 6d11e25..e24f9ef 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>>  #define BUSY 1
>>  #define KICK 5
>>  #define HUNG 20
>> -#define FIRE 30
>>  
>>  	if (!i915_enable_hangcheck)
>>  		return;
>> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>>  	}
>>  
>>  	for_each_ring(ring, dev_priv, i) {
>> -		if (ring->hangcheck.score > FIRE) {
>> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>>  			DRM_INFO("%s on %s\n",
>>  				 stuck[i] ? "stuck" : "no progress",
>>  				 ring->name);
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 71a73f4..6018793 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>>  	HANGCHECK_HUNG,
>>  };
>>  
>> +#define HANGCHECK_SCORE_GUILTY 31
>> +
>>  struct intel_ring_hangcheck {
>>  	bool deadlock;
>>  	u32 seqno;
>
> -- 
> Ben Widawsky, Intel Open Source Technology Center

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d270351..27a97c3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2322,20 +2322,17 @@  static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 
 static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
-				  u32 acthd)
+				  u32 acthd, const bool guilty)
 {
 	struct i915_ctx_hang_stats *hs = NULL;
-	bool inside, guilty;
+	bool inside;
 	unsigned long offset = 0;
 
-	/* Innocent until proven guilty */
-	guilty = false;
-
 	if (request->batch_obj)
 		offset = i915_gem_obj_offset(request->batch_obj,
 					     request_to_vm(request));
 
-	if (ring->hangcheck.action != HANGCHECK_WAIT &&
+	if (guilty &&
 	    i915_request_guilty(request, acthd, &inside)) {
 		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
 			  ring->name,
@@ -2343,8 +2340,6 @@  static void i915_set_reset_status(struct intel_ring_buffer *ring,
 			  offset,
 			  request->ctx ? request->ctx->id : 0,
 			  acthd);
-
-		guilty = true;
 	}
 
 	/* If contexts are disabled or this is the default context, use
@@ -2383,12 +2378,18 @@  static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 	u32 completed_seqno = ring->get_seqno(ring, false);
 	u32 acthd = intel_ring_get_active_head(ring);
 	struct drm_i915_gem_request *request;
+	bool guilty = false;
 
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(completed_seqno, request->seqno))
 			continue;
 
-		i915_set_reset_status(ring, request, acthd);
+		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
+			guilty = true;
+			i915_set_reset_status(ring, request, acthd, true);
+		} else {
+			i915_set_reset_status(ring, request, acthd, false);
+		}
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6d11e25..e24f9ef 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2473,7 +2473,6 @@  static void i915_hangcheck_elapsed(unsigned long data)
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
-#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
@@ -2557,7 +2556,7 @@  static void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > FIRE) {
+		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 71a73f4..6018793 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -41,6 +41,8 @@  enum intel_ring_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
+#define HANGCHECK_SCORE_GUILTY 31
+
 struct intel_ring_hangcheck {
 	bool deadlock;
 	u32 seqno;

[2/3] drm/i915: Seek only one guilty batch per hanged ring

Commit Message

Comments

Patch