Message ID | 1470983123-22127-14-git-send-email-akash.goel@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 12/08/16 07:25, akash.goel@intel.com wrote: > From: Akash Goel <akash.goel@intel.com> > > Added the dump of GuC log buffer to i915 error state, as the contents of > GuC log buffer would also be useful to determine that why the GPU reset > was triggered. > > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> > Signed-off-by: Akash Goel <akash.goel@intel.com> > --- > drivers/gpu/drm/i915/i915_drv.h | 1 + > drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++ > 2 files changed, 28 insertions(+) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 28ffac5..4bd3790 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -509,6 +509,7 @@ struct drm_i915_error_state { > struct intel_overlay_error_state *overlay; > struct intel_display_error_state *display; > struct drm_i915_error_object *semaphore_obj; > + struct drm_i915_error_object *guc_log_obj; > > struct drm_i915_error_engine { > int engine_id; > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index eecb870..561b523 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > } > } > > + if ((obj = error->guc_log_obj)) { > + err_printf(m, "GuC log buffer = 0x%08x\n", > + lower_32_bits(obj->gtt_offset)); > + for (i = 0; i < obj->page_count; i++) { > + for (elt = 0; elt < PAGE_SIZE/4; elt += 4) { Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like it is counting in u32 * 4 chunks so it might be. Or I might be confused.. > + err_printf(m, "[%08x] %08x %08x %08x %08x\n", > + (u32)(i*PAGE_SIZE) + elt*4, > + obj->pages[i][elt], > + obj->pages[i][elt+1], > + obj->pages[i][elt+2], > + obj->pages[i][elt+3]); > + } > + } > + } > + > if (error->overlay) > intel_overlay_print_error_state(m, error->overlay); > > @@ -625,6 +640,7 @@ static void i915_error_state_free(struct kref *error_ref) > } > > i915_error_object_free(error->semaphore_obj); > + i915_error_object_free(error->guc_log_obj); > > for (i = 0; i < error->vm_count; i++) > kfree(error->active_bo[i]); > @@ -1210,6 +1226,16 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, > } > } > > +static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv, > + struct drm_i915_error_state *error) Alignment. > +{ > + if (!dev_priv->guc.log.obj) > + return; > + > + error->guc_log_obj = i915_error_ggtt_object_create(dev_priv, > + dev_priv->guc.log.obj); > +} > + > /* FIXME: Since pin count/bound list is global, we duplicate what we capture per > * VM. > */ > @@ -1439,6 +1465,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, > i915_gem_capture_buffers(dev_priv, error); > i915_gem_record_fences(dev_priv, error); > i915_gem_record_rings(dev_priv, error); > + i915_gem_capture_guc_log_buffer(dev_priv, error); > > do_gettimeofday(&error->time); > > Regards, Tvrtko
On Fri, Aug 12, 2016 at 04:20:03PM +0100, Tvrtko Ursulin wrote: > > On 12/08/16 07:25, akash.goel@intel.com wrote: > >From: Akash Goel <akash.goel@intel.com> > > > >Added the dump of GuC log buffer to i915 error state, as the contents of > >GuC log buffer would also be useful to determine that why the GPU reset > >was triggered. > > > >Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> > >Signed-off-by: Akash Goel <akash.goel@intel.com> > >--- > > drivers/gpu/drm/i915/i915_drv.h | 1 + > > drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++ > > 2 files changed, 28 insertions(+) > > > >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > >index 28ffac5..4bd3790 100644 > >--- a/drivers/gpu/drm/i915/i915_drv.h > >+++ b/drivers/gpu/drm/i915/i915_drv.h > >@@ -509,6 +509,7 @@ struct drm_i915_error_state { > > struct intel_overlay_error_state *overlay; > > struct intel_display_error_state *display; > > struct drm_i915_error_object *semaphore_obj; > >+ struct drm_i915_error_object *guc_log_obj; > > > > struct drm_i915_error_engine { > > int engine_id; > >diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > >index eecb870..561b523 100644 > >--- a/drivers/gpu/drm/i915/i915_gpu_error.c > >+++ b/drivers/gpu/drm/i915/i915_gpu_error.c > >@@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > > } > > } > > > >+ if ((obj = error->guc_log_obj)) { > >+ err_printf(m, "GuC log buffer = 0x%08x\n", > >+ lower_32_bits(obj->gtt_offset)); > >+ for (i = 0; i < obj->page_count; i++) { > >+ for (elt = 0; elt < PAGE_SIZE/4; elt += 4) { > > Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like > it is counting in u32 * 4 chunks so it might be. Or I might be > confused.. There's (or will be) a function to dump the error object in a uniform manner. This patch is obsolete. -Chris
On 8/12/2016 9:02 PM, Chris Wilson wrote: > On Fri, Aug 12, 2016 at 04:20:03PM +0100, Tvrtko Ursulin wrote: >> >> On 12/08/16 07:25, akash.goel@intel.com wrote: >>> From: Akash Goel <akash.goel@intel.com> >>> >>> Added the dump of GuC log buffer to i915 error state, as the contents of >>> GuC log buffer would also be useful to determine that why the GPU reset >>> was triggered. >>> >>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Signed-off-by: Akash Goel <akash.goel@intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_drv.h | 1 + >>> drivers/gpu/drm/i915/i915_gpu_error.c | 27 +++++++++++++++++++++++++++ >>> 2 files changed, 28 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h >>> index 28ffac5..4bd3790 100644 >>> --- a/drivers/gpu/drm/i915/i915_drv.h >>> +++ b/drivers/gpu/drm/i915/i915_drv.h >>> @@ -509,6 +509,7 @@ struct drm_i915_error_state { >>> struct intel_overlay_error_state *overlay; >>> struct intel_display_error_state *display; >>> struct drm_i915_error_object *semaphore_obj; >>> + struct drm_i915_error_object *guc_log_obj; >>> >>> struct drm_i915_error_engine { >>> int engine_id; >>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c >>> index eecb870..561b523 100644 >>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c >>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c >>> @@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, >>> } >>> } >>> >>> + if ((obj = error->guc_log_obj)) { >>> + err_printf(m, "GuC log buffer = 0x%08x\n", >>> + lower_32_bits(obj->gtt_offset)); >>> + for (i = 0; i < obj->page_count; i++) { >>> + for (elt = 0; elt < PAGE_SIZE/4; elt += 4) { >> >> Should the condition be PAGE_SIZE / 16 ? I am not sure, looks like >> it is counting in u32 * 4 chunks so it might be. Or I might be >> confused.. It will be PAGE_SIZE / 4 only. It took me some iterations to get it right. PAGE_SIZE/4 is number of dwords and elt+=4 is covering 4 dwords in every iteration > > There's (or will be) a function to dump the error object in a uniform > manner. This patch is obsolete. There is a print_error_obj() function, but that prints one dword per line. For GuC log buffer its better (for ease of interpretation) to print 4 dwords per line as each sample if of 4 dwords, also headers are of 8 dwords. Other benefit is that it reduces the line count of the error state file (Compared to other captured buffers like ring buffer, batch buffers, status page, size of Log buffer is more, 76 KB). Best regards Akash > -Chris >
On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote: > On 8/12/2016 9:02 PM, Chris Wilson wrote: > >There's (or will be) a function to dump the error object in a uniform > >manner. This patch is obsolete. > > There is a print_error_obj() function, but that prints one dword per line. It used to. It will shortly be a compressed stream. Pretty printing is left to userspace. -Chris
On 8/12/2016 9:22 PM, Chris Wilson wrote: > On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote: >> On 8/12/2016 9:02 PM, Chris Wilson wrote: >>> There's (or will be) a function to dump the error object in a uniform >>> manner. This patch is obsolete. >> >> There is a print_error_obj() function, but that prints one dword per line. > > It used to. It will shortly be a compressed stream. > Pretty printing is left to userspace. But invariably, we only will be interpreting the error state or Guc log buffer dump, and it will be really convenient if we can have 4 dwords per line matching the log sample size. Best regards Akash > -Chris >
On Fri, Aug 12, 2016 at 09:34:23PM +0530, Goel, Akash wrote: > > > On 8/12/2016 9:22 PM, Chris Wilson wrote: > >On Fri, Aug 12, 2016 at 09:16:03PM +0530, Goel, Akash wrote: > >>On 8/12/2016 9:02 PM, Chris Wilson wrote: > >>>There's (or will be) a function to dump the error object in a uniform > >>>manner. This patch is obsolete. > >> > >>There is a print_error_obj() function, but that prints one dword per line. > > > >It used to. It will shortly be a compressed stream. > > >Pretty printing is left to userspace. > But invariably, we only will be interpreting the error state or Guc > log buffer dump, and it will be really convenient if we can have 4 > dwords per line matching the log sample size. That's fine. Do it in userspace. -Chris
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 28ffac5..4bd3790 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -509,6 +509,7 @@ struct drm_i915_error_state { struct intel_overlay_error_state *overlay; struct intel_display_error_state *display; struct drm_i915_error_object *semaphore_obj; + struct drm_i915_error_object *guc_log_obj; struct drm_i915_error_engine { int engine_id; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index eecb870..561b523 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -546,6 +546,21 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, } } + if ((obj = error->guc_log_obj)) { + err_printf(m, "GuC log buffer = 0x%08x\n", + lower_32_bits(obj->gtt_offset)); + for (i = 0; i < obj->page_count; i++) { + for (elt = 0; elt < PAGE_SIZE/4; elt += 4) { + err_printf(m, "[%08x] %08x %08x %08x %08x\n", + (u32)(i*PAGE_SIZE) + elt*4, + obj->pages[i][elt], + obj->pages[i][elt+1], + obj->pages[i][elt+2], + obj->pages[i][elt+3]); + } + } + } + if (error->overlay) intel_overlay_print_error_state(m, error->overlay); @@ -625,6 +640,7 @@ static void i915_error_state_free(struct kref *error_ref) } i915_error_object_free(error->semaphore_obj); + i915_error_object_free(error->guc_log_obj); for (i = 0; i < error->vm_count; i++) kfree(error->active_bo[i]); @@ -1210,6 +1226,16 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, } } +static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv, + struct drm_i915_error_state *error) +{ + if (!dev_priv->guc.log.obj) + return; + + error->guc_log_obj = i915_error_ggtt_object_create(dev_priv, + dev_priv->guc.log.obj); +} + /* FIXME: Since pin count/bound list is global, we duplicate what we capture per * VM. */ @@ -1439,6 +1465,7 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, i915_gem_capture_buffers(dev_priv, error); i915_gem_record_fences(dev_priv, error); i915_gem_record_rings(dev_priv, error); + i915_gem_capture_guc_log_buffer(dev_priv, error); do_gettimeofday(&error->time);