diff mbox

[v2,1/3] drm/i915: Record the ringbuffer associated with the request

Message ID 1449874764-18735-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Dec. 11, 2015, 10:59 p.m. UTC
The request tells us where to read the ringbuf from, so use that
information to simplify the error capture. If no request was active at
the time of the hang, the ring is idle and there is no information
inside the ring pertaining to the hang.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

Comments

Dave Gordon Dec. 14, 2015, 11:14 a.m. UTC | #1
On 11/12/15 22:59, Chris Wilson wrote:
> The request tells us where to read the ringbuf from, so use that
> information to simplify the error capture. If no request was active at
> the time of the hang, the ring is idle and there is no information
> inside the ring pertaining to the hang.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
>   1 file changed, 10 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 3e137fc701cf..6eefe9c36931 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>
>   	for (i = 0; i < I915_NUM_RINGS; i++) {
>   		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_ringbuffer *rbuf;
> +		struct intel_ringbuffer *rbuf = NULL;
>
>   		error->ring[i].pid = -1;
>
> @@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>   				}
>   				rcu_read_unlock();
>   			}
> +
> +			rbuf = request->ringbuf;
>   		}
>
> -		if (i915.enable_execlists) {
> -			/* TODO: This is only a small fix to keep basic error
> -			 * capture working, but we need to add more information
> -			 * for it to be useful (e.g. dump the context being
> -			 * executed).
> -			 */
> -			if (request)
> -				rbuf = request->ctx->engine[ring->id].ringbuf;
> -			else
> -				rbuf = ring->default_context->engine[ring->id].ringbuf;
> -		} else
> -			rbuf = ring->buffer;
> -
> -		error->ring[i].cpu_ring_head = rbuf->head;
> -		error->ring[i].cpu_ring_tail = rbuf->tail;
> -
> -		error->ring[i].ringbuffer =
> -			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
> +		if (rbuf) {
> +			error->ring[i].cpu_ring_head = rbuf->head;
> +			error->ring[i].cpu_ring_tail = rbuf->tail;
> +			error->ring[i].ringbuffer =
> +				i915_error_ggtt_object_create(dev_priv,
> +							      rbuf->obj);
> +		}
>
>   		error->ring[i].hws_page =
>   			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);

I think the code you deleted is intended to capture the *default* 
ringbuffer if there is no request active -- sometimes we will switch an 
engine to the default context (and therefore ringbuffer) when there's no 
work to be done.

Another option that's sometimes useful is to capture the ringbuffer 
pointed to by the START register. This helps in finding situations where 
the driver and the GPU disagree about what should be in progress.

I've got a few patches that update some of the error capture that's 
always been missing in execlist mode (like, actually capturing the 
active context), and add some more decoding of what we do capture.
John Harrison posted them as part of the "Preemption support for GPU 
scheduler" patchset last week, although they're not really anything to 
do with preemption per se.

One of them "drm/i915/error: improve CSB reporting" also updates this 
area of the code, so maybe I should incorporate your change into the 
next revision of that patch?

.Dave.
Chris Wilson Dec. 14, 2015, 11:28 a.m. UTC | #2
On Mon, Dec 14, 2015 at 11:14:31AM +0000, Dave Gordon wrote:
> On 11/12/15 22:59, Chris Wilson wrote:
> >The request tells us where to read the ringbuf from, so use that
> >information to simplify the error capture. If no request was active at
> >the time of the hang, the ring is idle and there is no information
> >inside the ring pertaining to the hang.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
> >  1 file changed, 10 insertions(+), 19 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> >index 3e137fc701cf..6eefe9c36931 100644
> >--- a/drivers/gpu/drm/i915/i915_gpu_error.c
> >+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> >@@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
> >
> >  	for (i = 0; i < I915_NUM_RINGS; i++) {
> >  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> >-		struct intel_ringbuffer *rbuf;
> >+		struct intel_ringbuffer *rbuf = NULL;
> >
> >  		error->ring[i].pid = -1;
> >
> >@@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
> >  				}
> >  				rcu_read_unlock();
> >  			}
> >+
> >+			rbuf = request->ringbuf;
> >  		}
> >
> >-		if (i915.enable_execlists) {
> >-			/* TODO: This is only a small fix to keep basic error
> >-			 * capture working, but we need to add more information
> >-			 * for it to be useful (e.g. dump the context being
> >-			 * executed).
> >-			 */
> >-			if (request)
> >-				rbuf = request->ctx->engine[ring->id].ringbuf;
> >-			else
> >-				rbuf = ring->default_context->engine[ring->id].ringbuf;
> >-		} else
> >-			rbuf = ring->buffer;
> >-
> >-		error->ring[i].cpu_ring_head = rbuf->head;
> >-		error->ring[i].cpu_ring_tail = rbuf->tail;
> >-
> >-		error->ring[i].ringbuffer =
> >-			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
> >+		if (rbuf) {
> >+			error->ring[i].cpu_ring_head = rbuf->head;
> >+			error->ring[i].cpu_ring_tail = rbuf->tail;
> >+			error->ring[i].ringbuffer =
> >+				i915_error_ggtt_object_create(dev_priv,
> >+							      rbuf->obj);
> >+		}
> >
> >  		error->ring[i].hws_page =
> >  			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
> 
> I think the code you deleted is intended to capture the *default*
> ringbuffer if there is no request active -- sometimes we will switch
> an engine to the default context (and therefore ringbuffer) when
> there's no work to be done.

So answer the question, why? I don't have a use for it. This code in
particular is nothing more than a hack for execlists and in no way
reflects my intentions for the postmortem debugging tool.

> Another option that's sometimes useful is to capture the ringbuffer
> pointed to by the START register. This helps in finding situations
> where the driver and the GPU disagree about what should be in
> progress.

That is a possibitly, except is no more interesting than inspecting the
START vs expected (and requires the stop_machine rework to walk the
lists without crashing).
 
> I've got a few patches that update some of the error capture that's
> always been missing in execlist mode (like, actually capturing the
> active context), and add some more decoding of what we do capture.

No decoding. That is easier done in userspace. And I sent patches to do
more capturing many, many months ago, along with fixing up most of the
invalid ppgtt state.
-Chris
Dave Gordon Dec. 15, 2015, 4:53 p.m. UTC | #3
On 14/12/15 11:28, Chris Wilson wrote:
> On Mon, Dec 14, 2015 at 11:14:31AM +0000, Dave Gordon wrote:
>> On 11/12/15 22:59, Chris Wilson wrote:
>>> The request tells us where to read the ringbuf from, so use that
>>> information to simplify the error capture. If no request was active at
>>> the time of the hang, the ring is idle and there is no information
>>> inside the ring pertaining to the hang.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
>>>   1 file changed, 10 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> index 3e137fc701cf..6eefe9c36931 100644
>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> @@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>>>
>>>   	for (i = 0; i < I915_NUM_RINGS; i++) {
>>>   		struct intel_engine_cs *ring = &dev_priv->ring[i];
>>> -		struct intel_ringbuffer *rbuf;
>>> +		struct intel_ringbuffer *rbuf = NULL;
>>>
>>>   		error->ring[i].pid = -1;
>>>
>>> @@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>>>   				}
>>>   				rcu_read_unlock();
>>>   			}
>>> +
>>> +			rbuf = request->ringbuf;
>>>   		}
>>>
>>> -		if (i915.enable_execlists) {
>>> -			/* TODO: This is only a small fix to keep basic error
>>> -			 * capture working, but we need to add more information
>>> -			 * for it to be useful (e.g. dump the context being
>>> -			 * executed).
>>> -			 */
>>> -			if (request)
>>> -				rbuf = request->ctx->engine[ring->id].ringbuf;
>>> -			else
>>> -				rbuf = ring->default_context->engine[ring->id].ringbuf;
>>> -		} else
>>> -			rbuf = ring->buffer;
>>> -
>>> -		error->ring[i].cpu_ring_head = rbuf->head;
>>> -		error->ring[i].cpu_ring_tail = rbuf->tail;
>>> -
>>> -		error->ring[i].ringbuffer =
>>> -			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
>>> +		if (rbuf) {
>>> +			error->ring[i].cpu_ring_head = rbuf->head;
>>> +			error->ring[i].cpu_ring_tail = rbuf->tail;
>>> +			error->ring[i].ringbuffer =
>>> +				i915_error_ggtt_object_create(dev_priv,
>>> +							      rbuf->obj);
>>> +		}
>>>
>>>   		error->ring[i].hws_page =
>>>   			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
>>
>> I think the code you deleted is intended to capture the *default*
>> ringbuffer if there is no request active -- sometimes we will switch
>> an engine to the default context (and therefore ringbuffer) when
>> there's no work to be done.
>
> So answer the question, why? I don't have a use for it. This code in
> particular is nothing more than a hack for execlists and in no way
> reflects my intentions for the postmortem debugging tool.
>
>> Another option that's sometimes useful is to capture the ringbuffer
>> pointed to by the START register. This helps in finding situations
>> where the driver and the GPU disagree about what should be in
>> progress.
>
> That is a possibitly, except is no more interesting than inspecting the
> START vs expected (and requires the stop_machine rework to walk the
> lists without crashing).
>
>> I've got a few patches that update some of the error capture that's
>> always been missing in execlist mode (like, actually capturing the
>> active context), and add some more decoding of what we do capture.
>
> No decoding. That is easier done in userspace. And I sent patches to do
> more capturing many, many months ago, along with fixing up most of the
> invalid ppgtt state.
> -Chris

Anyway, the removal of the unnecessary execlist/non-execlist is 
worthwhile, so

Reviewed-by: Dave Gordon <david.s.gordon@intel.com>

and then maybe I'll rework the default and/or START capture on top of 
this later.

.Dave.
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3e137fc701cf..6eefe9c36931 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -995,7 +995,7 @@  static void i915_gem_record_rings(struct drm_device *dev,
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_ringbuffer *rbuf;
+		struct intel_ringbuffer *rbuf = NULL;
 
 		error->ring[i].pid = -1;
 
@@ -1039,26 +1039,17 @@  static void i915_gem_record_rings(struct drm_device *dev,
 				}
 				rcu_read_unlock();
 			}
+
+			rbuf = request->ringbuf;
 		}
 
-		if (i915.enable_execlists) {
-			/* TODO: This is only a small fix to keep basic error
-			 * capture working, but we need to add more information
-			 * for it to be useful (e.g. dump the context being
-			 * executed).
-			 */
-			if (request)
-				rbuf = request->ctx->engine[ring->id].ringbuf;
-			else
-				rbuf = ring->default_context->engine[ring->id].ringbuf;
-		} else
-			rbuf = ring->buffer;
-
-		error->ring[i].cpu_ring_head = rbuf->head;
-		error->ring[i].cpu_ring_tail = rbuf->tail;
-
-		error->ring[i].ringbuffer =
-			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
+		if (rbuf) {
+			error->ring[i].cpu_ring_head = rbuf->head;
+			error->ring[i].cpu_ring_tail = rbuf->tail;
+			error->ring[i].ringbuffer =
+				i915_error_ggtt_object_create(dev_priv,
+							      rbuf->obj);
+		}
 
 		error->ring[i].hws_page =
 			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);