diff mbox

[1/3] drm/i915: Rely on accurate request tracking for finding hung batches

Message ID 1392042651-3278-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Feb. 10, 2014, 2:30 p.m. UTC
From: Chris Wilson <chris@chris-wilson.co.uk>

In the past, it was possible to have multiple batches per request due to
a stray signal or ENOMEM. As a result we had to scan each active object
(filtered by those having the COMMAND domain) for the one that contained
the ACTHD pointer. This was then made more complicated by the
introduction of ppgtt, whereby ACTHD then pointed into the address space
of the context and so also needed to be taken into account.

This is a fairly robust approach (though the implementation is a little
fragile and depends upon the per-generation setup, registers and
parameters). However, due to the requirements for hangstats, we needed a
robust method for associating batches with a particular request and,
having that, we can rely upon it for finding the associated batch object
for error capture.

If the batch buffer tracking is not robust enough, that should become
apparent quite quickly through an erroneous error capture. That should
also help to make sure that the runtime reporting to userspace is
robust. It also means that we then report the oldest incomplete batch on
each ring, which can be useful for determining the state of userspace at
the time of a hang.

v2: Use i915_gem_find_active_request (Mika)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |    3 ++
 drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
 drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
 3 files changed, 20 insertions(+), 62 deletions(-)

Comments

Ben Widawsky Feb. 12, 2014, 1:57 a.m. UTC | #1
On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
> 
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
> 
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
> 
> v2: Use i915_gem_find_active_request (Mika)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widawsky@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    3 ++
>  drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
>  3 files changed, 20 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
>  	}
>  }
>  
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
>  	kfree(request);
>  }
>  
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
>  {
>  	struct drm_i915_gem_request *request;
> -	const u32 completed_seqno = ring->get_seqno(ring, false);
> +	u32 completed_seqno;
> +
> +	if (WARN_ON(!ring->get_seqno))
> +		return NULL;

ring->get_seqno(ring, false) ?

This looks like it's currently wrong below as well.

> +
> +	completed_seqno = ring->get_seqno(ring, false);
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	struct drm_i915_gem_request *request;
>  	bool ring_hung;
>  
> -	request = i915_gem_find_first_non_complete(ring);
> +	request = i915_gem_find_active_request(ring);
>  
>  	if (request == NULL)
>  		return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
>  	}
>  }
>  
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> -			 struct intel_ring_buffer *ring)
> -{
> -	struct drm_device *dev = vm->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct i915_hw_ppgtt *ppgtt;
> -
> -	if (INTEL_INFO(dev)->gen < 7)
> -		return i915_is_ggtt(vm);
> -
> -	/* FIXME: This ignores that the global gtt vm is also on this list. */
> -	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> -		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> -		return pdp0 == ppgtt->pd_dma_addr[0];
> -	} else {
> -		u32 pp_db;
> -		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> -		return (pp_db >> 10) == ppgtt->pd_offset;
> -	}
> -}
> -
>  static struct drm_i915_error_object *
>  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			     struct intel_ring_buffer *ring)
>  {
> -	struct i915_address_space *vm;
> -	struct i915_vma *vma;
> -	struct drm_i915_gem_object *obj;
> -	bool found_active = false;
> -	u32 seqno;
> -
> -	if (!ring->get_seqno)
> -		return NULL;
> +	struct drm_i915_gem_request *request;
>  
>  	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> +		struct drm_i915_gem_object *obj;
>  		u32 acthd = I915_READ(ACTHD);
>  
>  		if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
>  
> -	seqno = ring->get_seqno(ring, false);
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -		if (!is_active_vm(vm, ring))
> -			continue;
> -
> -		found_active = true;
> -
> -		list_for_each_entry(vma, &vm->active_list, mm_list) {
> -			obj = vma->obj;
> -			if (obj->ring != ring)
> -				continue;
> -
> -			if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -				continue;
> -
> -			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -				continue;
> -
> -			/* We need to copy these to an anonymous buffer as the simplest
> -			 * method to avoid being overwritten by userspace.
> -			 */
> -			return i915_error_object_create(dev_priv, obj, vm);
> -		}
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

Wrap this one. It's too long even for my undiscerning eye.

>  	}
>  
> -	WARN_ON(!found_active);
>  	return NULL;
>  }
>  


I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.

However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.

With the above fixed:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Chris Wilson Feb. 12, 2014, 8:25 a.m. UTC | #2
On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > -static struct drm_i915_gem_request *
> > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > +struct drm_i915_gem_request *
> > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> >  {
> >  	struct drm_i915_gem_request *request;
> > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > +	u32 completed_seqno;
> > +
> > +	if (WARN_ON(!ring->get_seqno))
> > +		return NULL;
> 
> ring->get_seqno(ring, false) ?

?

This was originally used in the error capture code to detect
uninitialised rings.

> This looks like it's currently wrong below as well.

?
 
> I still would have preferred to keep both methods for a while until we
> felt really comfortable with this... the commit message's statement that
> getting bad error state is inevitably a good thing is total bunk, I
> think.

I strongly disagree. One of the critical factors you first scan for when
reading error states is whether it is self-consistent. If the error
state is inconsistent, we also know that some of our critical
infrastructure is wrong - which we can't know if the error capture code
was different. And in the cases where the error state is inconsistent,
the scanning method would also be snafu (because ACTHD is no longer
within the expected bounds).
-Chris
Chris Wilson Feb. 12, 2014, 5:47 p.m. UTC | #3
On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

This needs to protect against ctx == NULL, e.g.
  request->ctx ? request->ctx->vm : &dev_priv->gtt.base)
-Chris
Ben Widawsky Feb. 12, 2014, 7:02 p.m. UTC | #4
On Wed, Feb 12, 2014 at 08:25:58AM +0000, Chris Wilson wrote:
> On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> > On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > > -static struct drm_i915_gem_request *
> > > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > > +struct drm_i915_gem_request *
> > > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> > >  {
> > >  	struct drm_i915_gem_request *request;
> > > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > > +	u32 completed_seqno;
> > > +
> > > +	if (WARN_ON(!ring->get_seqno))
> > > +		return NULL;
> > 
> > ring->get_seqno(ring, false) ?
> 
> ?
> 
> This was originally used in the error capture code to detect
> uninitialised rings.

We have a new flag for that now, don't we? Also, I don't think we can
actually have an uninitialized ring at this point.

> 
> > This looks like it's currently wrong below as well.
> 
> ?
>  
> > I still would have preferred to keep both methods for a while until we
> > felt really comfortable with this... the commit message's statement that
> > getting bad error state is inevitably a good thing is total bunk, I
> > think.
> 
> I strongly disagree. One of the critical factors you first scan for when
> reading error states is whether it is self-consistent. If the error
> state is inconsistent, we also know that some of our critical
> infrastructure is wrong - which we can't know if the error capture code
> was different. And in the cases where the error state is inconsistent,
> the scanning method would also be snafu (because ACTHD is no longer
> within the expected bounds).
> -Chris
> 

The ACTHD case is one of the reasons I can't argue for keeping the old
method. (Though I think capturing all buffers with COMMAND domain gets
you to the same end). Since you read way more error states than I, I'm
happy to favor your opinion. As I said [but you cut] I am just a
chicken.

You already got my r-b, stop arguing.

> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson Feb. 12, 2014, 7:15 p.m. UTC | #5
On Wed, Feb 12, 2014 at 11:02:34AM -0800, Ben Widawsky wrote:
> On Wed, Feb 12, 2014 at 08:25:58AM +0000, Chris Wilson wrote:
> > On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> > > On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > > > -static struct drm_i915_gem_request *
> > > > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > > > +struct drm_i915_gem_request *
> > > > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> > > >  {
> > > >  	struct drm_i915_gem_request *request;
> > > > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > > > +	u32 completed_seqno;
> > > > +
> > > > +	if (WARN_ON(!ring->get_seqno))
> > > > +		return NULL;
> > > 
> > > ring->get_seqno(ring, false) ?
> > 
> > ?
> > 
> > This was originally used in the error capture code to detect
> > uninitialised rings.
> 
> We have a new flag for that now, don't we? Also, I don't think we can
> actually have an uninitialized ring at this point.

Considering that I have a vebox being reported on my SNB atm, something
is a little fishy with the detection of valid rings for error capture.
Yes, by the point we call i915_gem_find_active_request(), we should have
ring->get_seqno and so could just drop the WARN_ON().
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b1e91c3..eb260bc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2147,6 +2147,9 @@  i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
 	}
 }
 
+struct drm_i915_gem_request *
+i915_gem_find_active_request(struct intel_ring_buffer *ring);
+
 bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b0a244a..20e55ef 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2298,11 +2298,16 @@  static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	kfree(request);
 }
 
-static struct drm_i915_gem_request *
-i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
+struct drm_i915_gem_request *
+i915_gem_find_active_request(struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *request;
-	const u32 completed_seqno = ring->get_seqno(ring, false);
+	u32 completed_seqno;
+
+	if (WARN_ON(!ring->get_seqno))
+		return NULL;
+
+	completed_seqno = ring->get_seqno(ring, false);
 
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(completed_seqno, request->seqno))
@@ -2320,7 +2325,7 @@  static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 	struct drm_i915_gem_request *request;
 	bool ring_hung;
 
-	request = i915_gem_find_first_non_complete(ring);
+	request = i915_gem_find_active_request(ring);
 
 	if (request == NULL)
 		return;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index dc47bb9..5bd075a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -713,46 +713,14 @@  static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-/* This assumes all batchbuffers are executed from the PPGTT. It might have to
- * change in the future. */
-static bool is_active_vm(struct i915_address_space *vm,
-			 struct intel_ring_buffer *ring)
-{
-	struct drm_device *dev = vm->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct i915_hw_ppgtt *ppgtt;
-
-	if (INTEL_INFO(dev)->gen < 7)
-		return i915_is_ggtt(vm);
-
-	/* FIXME: This ignores that the global gtt vm is also on this list. */
-	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
-
-	if (INTEL_INFO(dev)->gen >= 8) {
-		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
-		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
-		return pdp0 == ppgtt->pd_dma_addr[0];
-	} else {
-		u32 pp_db;
-		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
-		return (pp_db >> 10) == ppgtt->pd_offset;
-	}
-}
-
 static struct drm_i915_error_object *
 i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 			     struct intel_ring_buffer *ring)
 {
-	struct i915_address_space *vm;
-	struct i915_vma *vma;
-	struct drm_i915_gem_object *obj;
-	bool found_active = false;
-	u32 seqno;
-
-	if (!ring->get_seqno)
-		return NULL;
+	struct drm_i915_gem_request *request;
 
 	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
+		struct drm_i915_gem_object *obj;
 		u32 acthd = I915_READ(ACTHD);
 
 		if (WARN_ON(ring->id != RCS))
@@ -765,32 +733,14 @@  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 			return i915_error_ggtt_object_create(dev_priv, obj);
 	}
 
-	seqno = ring->get_seqno(ring, false);
-	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
-		if (!is_active_vm(vm, ring))
-			continue;
-
-		found_active = true;
-
-		list_for_each_entry(vma, &vm->active_list, mm_list) {
-			obj = vma->obj;
-			if (obj->ring != ring)
-				continue;
-
-			if (i915_seqno_passed(seqno, obj->last_read_seqno))
-				continue;
-
-			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
-				continue;
-
-			/* We need to copy these to an anonymous buffer as the simplest
-			 * method to avoid being overwritten by userspace.
-			 */
-			return i915_error_object_create(dev_priv, obj, vm);
-		}
+	request = i915_gem_find_active_request(ring);
+	if (request) {
+		/* We need to copy these to an anonymous buffer as the simplest
+		 * method to avoid being overwritten by userspace.
+		 */
+		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
 	}
 
-	WARN_ON(!found_active);
 	return NULL;
 }