[RFC,7/7] drm/i915: find guilty batch buffer on ring resets

Message ID	1359986683-29788-8-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org> From: Mika Kuoppala <mika.kuoppala@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Mon, 4 Feb 2013 16:04:43 +0200 Message-Id: <1359986683-29788-8-git-send-email-mika.kuoppala@intel.com> In-Reply-To: <1359986683-29788-1-git-send-email-mika.kuoppala@intel.com> References: <1359986683-29788-1-git-send-email-mika.kuoppala@intel.com> Subject: [Intel-gfx] [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on ring resets Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org Errors-To: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org

Message ID

1359986683-29788-8-git-send-email-mika.kuoppala@intel.com (mailing list archive)

State

New, archived

Headers

From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Date: Mon,  4 Feb 2013 16:04:43 +0200
Message-Id: <1359986683-29788-8-git-send-email-mika.kuoppala@intel.com>
In-Reply-To: <1359986683-29788-1-git-send-email-mika.kuoppala@intel.com>
References: <1359986683-29788-1-git-send-email-mika.kuoppala@intel.com>
Subject: [Intel-gfx] [RFC] [PATCH 7/7] drm/i915: find guilty batch buffer on
	ring resets
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Sender: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org
Errors-To: intel-gfx-bounces+patchwork-intel-gfx=patchwork.kernel.org@lists.freedesktop.org

Commit Message

Mika Kuoppala Feb. 4, 2013, 2:04 p.m. UTC

After hang check timer has declared gpu to be hang,
rings are reset. In ring reset, when clearing
request list, do post mortem analysis to find out
the guilty batch buffer.

Select requests for further analysis by inspecting
the completed sequence number which has been updated
into the HWS page. If request was completed, it can't
be related to the hang.

For completed requests mark the batch as guilty
if the ring was not waiting and the ring head was
stuck inside the buffer object or in the flush region
right after the batch. For everything else, mark
them as innocents.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

Comments

Ville Syrjälä Feb. 7, 2013, 2:11 p.m. UTC | #1

On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
> After hang check timer has declared gpu to be hang,
> rings are reset. In ring reset, when clearing
> request list, do post mortem analysis to find out
> the guilty batch buffer.
> 
> Select requests for further analysis by inspecting
> the completed sequence number which has been updated
> into the HWS page. If request was completed, it can't
> be related to the hang.
> 
> For completed requests mark the batch as guilty
      ^^^^^^^^^

That's a typo, right?

> if the ring was not waiting and the ring head was
> stuck inside the buffer object or in the flush region
> right after the batch. For everything else, mark
> them as innocents.
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 91 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b304b06..db0f3e3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>  
> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
> +{
> +	if (acthd >= obj->gtt_offset &&
> +	    acthd < obj->gtt_offset + obj->base.size)
> +		return true;
> +
> +	return false;
> +}
> +
> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
> +{
> +	if (rs < re) {
> +		if (acthd >= rs && acthd < re)
> +			return true;
> +	} else if (rs > re) {
> +		if (acthd >= rs || acthd < re)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
> +				const u32 acthd, bool *inside)
> +{
> +	if (request->batch_obj) {
> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
> +			*inside = true;
> +			return true;
> +		}
> +	}
> +
> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> +		*inside = false;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
> +				  struct drm_i915_gem_request *request,
> +				  u32 acthd)
> +{
> +	bool inside;
> +	struct i915_reset_stats *rs = NULL;
> +	bool guilty;
> +
> +	/* Innocent until proven guilty */
> +	guilty = false;
> +
> +	if (!ring->hangcheck_waiting &&
> +	    i915_request_guilty(request, acthd, &inside)) {
> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
> +			  ring->name,
> +			  inside ? "inside" : "flushing",
> +			  request->batch_obj ?
> +			  request->batch_obj->gtt_offset : 0,
> +			  request->ctx ? request->ctx->id : 0,
> +			  acthd);
> +
> +		guilty = true;
> +	}
> +
> +	/* If contexts are disabled or this is the default context, use
> +	 * file_priv->reset_stats
> +	 */
> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> +		rs = &request->ctx->reset_stats;
> +	else if (request->file_priv)
> +		rs = &request->file_priv->reset_stats;
> +
> +	if (rs) {
> +		rs->total++;
> +
> +		if (guilty)
> +			rs->guilty++;
> +		else
> +			rs->innocent++;
> +	}
> +}
> +
>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>  				      struct intel_ring_buffer *ring)
>  {
> +	u32 completed_seqno;
> +	u32 acthd;
> +
> +	acthd = intel_ring_get_active_head(ring);
> +	completed_seqno = ring->get_seqno(ring, false);
> +
>  	while (!list_empty(&ring->request_list)) {
>  		struct drm_i915_gem_request *request;
>  
> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>  					   struct drm_i915_gem_request,
>  					   list);
>  
> +		if (request->seqno > completed_seqno)

i915_seqno_passed()?

> +			i915_set_reset_status(ring, request, acthd);
> +
>  		list_del(&request->list);
>  		i915_gem_request_remove_from_client(request);
>  
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Mika Kuoppala Feb. 15, 2013, 2:12 p.m. UTC | #2

Ville Syrjälä <ville.syrjala@linux.intel.com> writes:

> On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
>> After hang check timer has declared gpu to be hang,
>> rings are reset. In ring reset, when clearing
>> request list, do post mortem analysis to find out
>> the guilty batch buffer.
>> 
>> Select requests for further analysis by inspecting
>> the completed sequence number which has been updated
>> into the HWS page. If request was completed, it can't
>> be related to the hang.
>> 
>> For completed requests mark the batch as guilty
>       ^^^^^^^^^
>
> That's a typo, right?

It sure is. Will fix.

>> if the ring was not waiting and the ring head was
>> stuck inside the buffer object or in the flush region
>> right after the batch. For everything else, mark
>> them as innocents.
>> 
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 91 insertions(+)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index b304b06..db0f3e3 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>>  	spin_unlock(&file_priv->mm.lock);
>>  }
>>  
>> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
>> +{
>> +	if (acthd >= obj->gtt_offset &&
>> +	    acthd < obj->gtt_offset + obj->base.size)
>> +		return true;
>> +
>> +	return false;
>> +}
>> +
>> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
>> +{
>> +	if (rs < re) {
>> +		if (acthd >= rs && acthd < re)
>> +			return true;
>> +	} else if (rs > re) {
>> +		if (acthd >= rs || acthd < re)
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
>> +				const u32 acthd, bool *inside)
>> +{
>> +	if (request->batch_obj) {
>> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
>> +			*inside = true;
>> +			return true;
>> +		}
>> +	}
>> +
>> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
>> +		*inside = false;
>> +		return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
>> +				  struct drm_i915_gem_request *request,
>> +				  u32 acthd)
>> +{
>> +	bool inside;
>> +	struct i915_reset_stats *rs = NULL;
>> +	bool guilty;
>> +
>> +	/* Innocent until proven guilty */
>> +	guilty = false;
>> +
>> +	if (!ring->hangcheck_waiting &&
>> +	    i915_request_guilty(request, acthd, &inside)) {
>> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
>> +			  ring->name,
>> +			  inside ? "inside" : "flushing",
>> +			  request->batch_obj ?
>> +			  request->batch_obj->gtt_offset : 0,
>> +			  request->ctx ? request->ctx->id : 0,
>> +			  acthd);
>> +
>> +		guilty = true;
>> +	}
>> +
>> +	/* If contexts are disabled or this is the default context, use
>> +	 * file_priv->reset_stats
>> +	 */
>> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
>> +		rs = &request->ctx->reset_stats;
>> +	else if (request->file_priv)
>> +		rs = &request->file_priv->reset_stats;
>> +
>> +	if (rs) {
>> +		rs->total++;
>> +
>> +		if (guilty)
>> +			rs->guilty++;
>> +		else
>> +			rs->innocent++;
>> +	}
>> +}
>> +
>>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>>  				      struct intel_ring_buffer *ring)
>>  {
>> +	u32 completed_seqno;
>> +	u32 acthd;
>> +
>> +	acthd = intel_ring_get_active_head(ring);
>> +	completed_seqno = ring->get_seqno(ring, false);
>> +
>>  	while (!list_empty(&ring->request_list)) {
>>  		struct drm_i915_gem_request *request;
>>  
>> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
>>  					   struct drm_i915_gem_request,
>>  					   list);
>>  
>> +		if (request->seqno > completed_seqno)
>
> i915_seqno_passed()?

For readability or for correctness?

When seqno wraps, the request queue will be cleaned up so
we can't have cross wrap boundary stuff in here.

Or did you have something else in mind that i have missed.

Thanks for review,
-Mika

>
>> +			i915_set_reset_status(ring, request, acthd);
>> +
>>  		list_del(&request->list);
>>  		i915_gem_request_remove_from_client(request);
>>  
>> -- 
>> 1.7.9.5
>> 
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> -- 
> Ville Syrjälä
> Intel OTC

Ville Syrjälä Feb. 15, 2013, 2:55 p.m. UTC | #3

On Fri, Feb 15, 2013 at 04:12:16PM +0200, Mika Kuoppala wrote:
> Ville Syrjälä <ville.syrjala@linux.intel.com> writes:
> 
> > On Mon, Feb 04, 2013 at 04:04:43PM +0200, Mika Kuoppala wrote:
> >> After hang check timer has declared gpu to be hang,
> >> rings are reset. In ring reset, when clearing
> >> request list, do post mortem analysis to find out
> >> the guilty batch buffer.
> >> 
> >> Select requests for further analysis by inspecting
> >> the completed sequence number which has been updated
> >> into the HWS page. If request was completed, it can't
> >> be related to the hang.
> >> 
> >> For completed requests mark the batch as guilty
> >       ^^^^^^^^^
> >
> > That's a typo, right?
> 
> It sure is. Will fix.
> 
> >> if the ring was not waiting and the ring head was
> >> stuck inside the buffer object or in the flush region
> >> right after the batch. For everything else, mark
> >> them as innocents.
> >> 
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_gem.c |   91 +++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 91 insertions(+)
> >> 
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >> index b304b06..db0f3e3 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -2092,9 +2092,97 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
> >>  	spin_unlock(&file_priv->mm.lock);
> >>  }
> >>  
> >> +static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
> >> +{
> >> +	if (acthd >= obj->gtt_offset &&
> >> +	    acthd < obj->gtt_offset + obj->base.size)
> >> +		return true;
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
> >> +{
> >> +	if (rs < re) {
> >> +		if (acthd >= rs && acthd < re)
> >> +			return true;
> >> +	} else if (rs > re) {
> >> +		if (acthd >= rs || acthd < re)
> >> +			return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static bool i915_request_guilty(struct drm_i915_gem_request *request,
> >> +				const u32 acthd, bool *inside)
> >> +{
> >> +	if (request->batch_obj) {
> >> +		if (i915_head_inside_object(acthd, request->batch_obj)) {
> >> +			*inside = true;
> >> +			return true;
> >> +		}
> >> +	}
> >> +
> >> +	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> >> +		*inside = false;
> >> +		return true;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +static void i915_set_reset_status(struct intel_ring_buffer *ring,
> >> +				  struct drm_i915_gem_request *request,
> >> +				  u32 acthd)
> >> +{
> >> +	bool inside;
> >> +	struct i915_reset_stats *rs = NULL;
> >> +	bool guilty;
> >> +
> >> +	/* Innocent until proven guilty */
> >> +	guilty = false;
> >> +
> >> +	if (!ring->hangcheck_waiting &&
> >> +	    i915_request_guilty(request, acthd, &inside)) {
> >> +		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
> >> +			  ring->name,
> >> +			  inside ? "inside" : "flushing",
> >> +			  request->batch_obj ?
> >> +			  request->batch_obj->gtt_offset : 0,
> >> +			  request->ctx ? request->ctx->id : 0,
> >> +			  acthd);
> >> +
> >> +		guilty = true;
> >> +	}
> >> +
> >> +	/* If contexts are disabled or this is the default context, use
> >> +	 * file_priv->reset_stats
> >> +	 */
> >> +	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> >> +		rs = &request->ctx->reset_stats;
> >> +	else if (request->file_priv)
> >> +		rs = &request->file_priv->reset_stats;
> >> +
> >> +	if (rs) {
> >> +		rs->total++;
> >> +
> >> +		if (guilty)
> >> +			rs->guilty++;
> >> +		else
> >> +			rs->innocent++;
> >> +	}
> >> +}
> >> +
> >>  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
> >>  				      struct intel_ring_buffer *ring)
> >>  {
> >> +	u32 completed_seqno;
> >> +	u32 acthd;
> >> +
> >> +	acthd = intel_ring_get_active_head(ring);
> >> +	completed_seqno = ring->get_seqno(ring, false);
> >> +
> >>  	while (!list_empty(&ring->request_list)) {
> >>  		struct drm_i915_gem_request *request;
> >>  
> >> @@ -2102,6 +2190,9 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
> >>  					   struct drm_i915_gem_request,
> >>  					   list);
> >>  
> >> +		if (request->seqno > completed_seqno)
> >
> > i915_seqno_passed()?
> 
> For readability or for correctness?
> 
> When seqno wraps, the request queue will be cleaned up so
> we can't have cross wrap boundary stuff in here.
> 
> Or did you have something else in mind that i have missed.

Nah. It just seems suspicious to have a direct comparison with any
comment why i915_seqno_passed() isn't used.

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b304b06..db0f3e3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2092,9 +2092,97 @@  i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj)
+{
+	if (acthd >= obj->gtt_offset &&
+	    acthd < obj->gtt_offset + obj->base.size)
+		return true;
+
+	return false;
+}
+
+static bool i915_head_inside_request(u32 acthd, u32 rs, u32 re)
+{
+	if (rs < re) {
+		if (acthd >= rs && acthd < re)
+			return true;
+	} else if (rs > re) {
+		if (acthd >= rs || acthd < re)
+			return true;
+	}
+
+	return false;
+}
+
+static bool i915_request_guilty(struct drm_i915_gem_request *request,
+				const u32 acthd, bool *inside)
+{
+	if (request->batch_obj) {
+		if (i915_head_inside_object(acthd, request->batch_obj)) {
+			*inside = true;
+			return true;
+		}
+	}
+
+	if (i915_head_inside_request(acthd, request->head, request->tail)) {
+		*inside = false;
+		return true;
+	}
+
+	return false;
+}
+
+static void i915_set_reset_status(struct intel_ring_buffer *ring,
+				  struct drm_i915_gem_request *request,
+				  u32 acthd)
+{
+	bool inside;
+	struct i915_reset_stats *rs = NULL;
+	bool guilty;
+
+	/* Innocent until proven guilty */
+	guilty = false;
+
+	if (!ring->hangcheck_waiting &&
+	    i915_request_guilty(request, acthd, &inside)) {
+		DRM_ERROR("%s hung %s bo (0x%x ctx %d) at 0x%x\n",
+			  ring->name,
+			  inside ? "inside" : "flushing",
+			  request->batch_obj ?
+			  request->batch_obj->gtt_offset : 0,
+			  request->ctx ? request->ctx->id : 0,
+			  acthd);
+
+		guilty = true;
+	}
+
+	/* If contexts are disabled or this is the default context, use
+	 * file_priv->reset_stats
+	 */
+	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
+		rs = &request->ctx->reset_stats;
+	else if (request->file_priv)
+		rs = &request->file_priv->reset_stats;
+
+	if (rs) {
+		rs->total++;
+
+		if (guilty)
+			rs->guilty++;
+		else
+			rs->innocent++;
+	}
+}
+
 static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 				      struct intel_ring_buffer *ring)
 {
+	u32 completed_seqno;
+	u32 acthd;
+
+	acthd = intel_ring_get_active_head(ring);
+	completed_seqno = ring->get_seqno(ring, false);
+
 	while (!list_empty(&ring->request_list)) {
 		struct drm_i915_gem_request *request;
 
@@ -2102,6 +2190,9 @@  static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 					   struct drm_i915_gem_request,
 					   list);
 
+		if (request->seqno > completed_seqno)
+			i915_set_reset_status(ring, request, acthd);
+
 		list_del(&request->list);
 		i915_gem_request_remove_from_client(request);

[RFC,7/7] drm/i915: find guilty batch buffer on ring resets

Commit Message

Comments

Patch