diff mbox

[13/20] drm/i915: Augment i915 error state to include the dump of GuC log buffer

Message ID 1470983123-22127-14-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

akash.goel@intel.com Aug. 12, 2016, 6:25 a.m. UTC
From: Akash Goel <akash.goel@intel.com>

Added the dump of GuC log buffer to i915 error state, as the contents of
GuC log buffer would also be useful to determine that why the GPU reset
was triggered.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

Comments

Tvrtko Ursulin Aug. 12, 2016, 3:20 p.m. UTC | #1
On 12/08/16 07:25, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
>
> Added the dump of GuC log buffer to i915 error state, as the contents of
> GuC log buffer would also be useful to determine that why the GPU reset
> was triggered.
>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h       |  1 +
>   drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++
>   2 files changed, 28 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 28ffac5..4bd3790 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -509,6 +509,7 @@ struct drm_i915_error_state {
>   	struct intel_overlay_error_state *overlay;
>   	struct intel_display_error_state *display;
>   	struct drm_i915_error_object *semaphore_obj;
> +	struct drm_i915_error_object *guc_log_obj;
>
>   	struct drm_i915_error_engine {
>   		int engine_id;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index eecb870..561b523 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>   		}
>   	}
>
> +	if ((obj = error->guc_log_obj)) {
> +		err_printf(m, "GuC log buffer = 0x%08x\n",
> +			   lower_32_bits(obj->gtt_offset));
> +		for (i = 0; i < obj->page_count; i++) {
> +			for (elt = 0; elt < PAGE_SIZE/4; elt += 4) {

Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like it is 
counting in u32 * 4 chunks so it might be. Or I might be confused..

> +				err_printf(m, "[%08x] %08x %08x %08x %08x\n",
> +					   (u32)(i*PAGE_SIZE) + elt*4,
> +					   obj->pages[i][elt],
> +					   obj->pages[i][elt+1],
> +					   obj->pages[i][elt+2],
> +					   obj->pages[i][elt+3]);
> +			}
> +		}
> +	}
> +
>   	if (error->overlay)
>   		intel_overlay_print_error_state(m, error->overlay);
>
> @@ -625,6 +640,7 @@ static void i915_error_state_free(struct kref *error_ref)
>   	}
>
>   	i915_error_object_free(error->semaphore_obj);
> +	i915_error_object_free(error->guc_log_obj);
>
>   	for (i = 0; i < error->vm_count; i++)
>   		kfree(error->active_bo[i]);
> @@ -1210,6 +1226,16 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>   	}
>   }
>
> +static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv,
> +				     struct drm_i915_error_state *error)

Alignment.

> +{
> +	if (!dev_priv->guc.log.obj)
> +		return;
> +
> +	error->guc_log_obj = i915_error_ggtt_object_create(dev_priv,
> +						dev_priv->guc.log.obj);
> +}
> +
>   /* FIXME: Since pin count/bound list is global, we duplicate what we capture per
>    * VM.
>    */
> @@ -1439,6 +1465,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
>   	i915_gem_capture_buffers(dev_priv, error);
>   	i915_gem_record_fences(dev_priv, error);
>   	i915_gem_record_rings(dev_priv, error);
> +	i915_gem_capture_guc_log_buffer(dev_priv, error);
>
>   	do_gettimeofday(&error->time);
>
>

Regards,

Tvrtko
Chris Wilson Aug. 12, 2016, 3:32 p.m. UTC | #2
On Fri, Aug 12, 2016 at 04:20:03PM +0100, Tvrtko Ursulin wrote:
> 
> On 12/08/16 07:25, akash.goel@intel.com wrote:
> >From: Akash Goel <akash.goel@intel.com>
> >
> >Added the dump of GuC log buffer to i915 error state, as the contents of
> >GuC log buffer would also be useful to determine that why the GPU reset
> >was triggered.
> >
> >Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> >Signed-off-by: Akash Goel <akash.goel@intel.com>
> >---
> >  drivers/gpu/drm/i915/i915_drv.h       |  1 +
> >  drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++
> >  2 files changed, 28 insertions(+)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 28ffac5..4bd3790 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -509,6 +509,7 @@ struct drm_i915_error_state {
> >  	struct intel_overlay_error_state *overlay;
> >  	struct intel_display_error_state *display;
> >  	struct drm_i915_error_object *semaphore_obj;
> >+	struct drm_i915_error_object *guc_log_obj;
> >
> >  	struct drm_i915_error_engine {
> >  		int engine_id;
> >diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> >index eecb870..561b523 100644
> >--- a/drivers/gpu/drm/i915/i915_gpu_error.c
> >+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> >@@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> >  		}
> >  	}
> >
> >+	if ((obj = error->guc_log_obj)) {
> >+		err_printf(m, "GuC log buffer = 0x%08x\n",
> >+			   lower_32_bits(obj->gtt_offset));
> >+		for (i = 0; i < obj->page_count; i++) {
> >+			for (elt = 0; elt < PAGE_SIZE/4; elt += 4) {
> 
> Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like
> it is counting in u32 * 4 chunks so it might be. Or I might be
> confused..

There's (or will be) a function to dump the error object in a uniform
manner. This patch is obsolete.
-Chris
akash.goel@intel.com Aug. 12, 2016, 3:46 p.m. UTC | #3
On 8/12/2016 9:02 PM, Chris Wilson wrote:
> On Fri, Aug 12, 2016 at 04:20:03PM +0100, Tvrtko Ursulin wrote:
>>
>> On 12/08/16 07:25, akash.goel@intel.com wrote:
>>> From: Akash Goel <akash.goel@intel.com>
>>>
>>> Added the dump of GuC log buffer to i915 error state, as the contents of
>>> GuC log buffer would also be useful to determine that why the GPU reset
>>> was triggered.
>>>
>>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>>>  drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++
>>>  2 files changed, 28 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 28ffac5..4bd3790 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -509,6 +509,7 @@ struct drm_i915_error_state {
>>>  	struct intel_overlay_error_state *overlay;
>>>  	struct intel_display_error_state *display;
>>>  	struct drm_i915_error_object *semaphore_obj;
>>> +	struct drm_i915_error_object *guc_log_obj;
>>>
>>>  	struct drm_i915_error_engine {
>>>  		int engine_id;
>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> index eecb870..561b523 100644
>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> @@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>>>  		}
>>>  	}
>>>
>>> +	if ((obj = error->guc_log_obj)) {
>>> +		err_printf(m, "GuC log buffer = 0x%08x\n",
>>> +			   lower_32_bits(obj->gtt_offset));
>>> +		for (i = 0; i < obj->page_count; i++) {
>>> +			for (elt = 0; elt < PAGE_SIZE/4; elt += 4) {
>>
>> Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like
>> it is counting in u32 * 4 chunks so it might be. Or I might be
>> confused..
It will be PAGE_SIZE / 4 only. It took me some iterations to get it right.
PAGE_SIZE/4 is number of dwords and
elt+=4      is covering 4 dwords in every iteration


>
> There's (or will be) a function to dump the error object in a uniform
> manner. This patch is obsolete.

There is a print_error_obj() function, but that prints one dword per line.
For GuC log buffer its better (for ease of interpretation) to print 4 
dwords per line as each sample if of 4 dwords, also headers are of 8 dwords.
Other benefit is that it reduces the line count of the error state file 
(Compared to other captured buffers like ring buffer, batch buffers, 
status page, size of Log buffer is more, 76 KB).

Best regards
Akash



> -Chris
>
Chris Wilson Aug. 12, 2016, 3:52 p.m. UTC | #4
On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote:
> On 8/12/2016 9:02 PM, Chris Wilson wrote:
> >There's (or will be) a function to dump the error object in a uniform
> >manner. This patch is obsolete.
> 
> There is a print_error_obj() function, but that prints one dword per line.

It used to. It will shortly be a compressed stream. Pretty printing is
left to userspace.
-Chris
akash.goel@intel.com Aug. 12, 2016, 4:04 p.m. UTC | #5
On 8/12/2016 9:22 PM, Chris Wilson wrote:
> On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote:
>> On 8/12/2016 9:02 PM, Chris Wilson wrote:
>>> There's (or will be) a function to dump the error object in a uniform
>>> manner. This patch is obsolete.
>>
>> There is a print_error_obj() function, but that prints one dword per line.
>
> It used to. It will shortly be a compressed stream.

> Pretty printing is left to userspace.
But invariably, we only will be interpreting the error state or Guc log 
buffer dump, and it will be really convenient if we can have 4 dwords 
per line matching the log sample size.


Best regards
Akash
> -Chris
>
Chris Wilson Aug. 12, 2016, 4:09 p.m. UTC | #6
On Fri, Aug 12, 2016 at 09:34:23PM +0530, Goel, Akash wrote:
> 
> 
> On 8/12/2016 9:22 PM, Chris Wilson wrote:
> >On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote:
> >>On 8/12/2016 9:02 PM, Chris Wilson wrote:
> >>>There's (or will be) a function to dump the error object in a uniform
> >>>manner. This patch is obsolete.
> >>
> >>There is a print_error_obj() function, but that prints one dword per line.
> >
> >It used to. It will shortly be a compressed stream.
> 
> >Pretty printing is left to userspace.
> But invariably, we only will be interpreting the error state or Guc
> log buffer dump, and it will be really convenient if we can have 4
> dwords per line matching the log sample size.

That's fine. Do it in userspace.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 28ffac5..4bd3790 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -509,6 +509,7 @@  struct drm_i915_error_state {
 	struct intel_overlay_error_state *overlay;
 	struct intel_display_error_state *display;
 	struct drm_i915_error_object *semaphore_obj;
+	struct drm_i915_error_object *guc_log_obj;
 
 	struct drm_i915_error_engine {
 		int engine_id;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index eecb870..561b523 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -546,6 +546,21 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		}
 	}
 
+	if ((obj = error->guc_log_obj)) {
+		err_printf(m, "GuC log buffer = 0x%08x\n",
+			   lower_32_bits(obj->gtt_offset));
+		for (i = 0; i < obj->page_count; i++) {
+			for (elt = 0; elt < PAGE_SIZE/4; elt += 4) {
+				err_printf(m, "[%08x] %08x %08x %08x %08x\n",
+					   (u32)(i*PAGE_SIZE) + elt*4,
+					   obj->pages[i][elt],
+					   obj->pages[i][elt+1],
+					   obj->pages[i][elt+2],
+					   obj->pages[i][elt+3]);
+			}
+		}
+	}
+
 	if (error->overlay)
 		intel_overlay_print_error_state(m, error->overlay);
 
@@ -625,6 +640,7 @@  static void i915_error_state_free(struct kref *error_ref)
 	}
 
 	i915_error_object_free(error->semaphore_obj);
+	i915_error_object_free(error->guc_log_obj);
 
 	for (i = 0; i < error->vm_count; i++)
 		kfree(error->active_bo[i]);
@@ -1210,6 +1226,16 @@  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 	}
 }
 
+static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv,
+				     struct drm_i915_error_state *error)
+{
+	if (!dev_priv->guc.log.obj)
+		return;
+
+	error->guc_log_obj = i915_error_ggtt_object_create(dev_priv,
+						dev_priv->guc.log.obj);
+}
+
 /* FIXME: Since pin count/bound list is global, we duplicate what we capture per
  * VM.
  */
@@ -1439,6 +1465,7 @@  void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	i915_gem_capture_buffers(dev_priv, error);
 	i915_gem_record_fences(dev_priv, error);
 	i915_gem_record_rings(dev_priv, error);
+	i915_gem_capture_guc_log_buffer(dev_priv, error);
 
 	do_gettimeofday(&error->time);