diff mbox

[v6,1/3] drm/i915: Add GuC/HuC firmware details to error state

Message ID 20171026173657.49648-1-michal.wajdeczko@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michal Wajdeczko Oct. 26, 2017, 5:36 p.m. UTC
Include GuC and HuC firmware details in captured error state
to provide additional debug information. To reuse existing
uc firmware pretty printer, introduce new drm-printer variant
that works with our i915_error_state_buf output. Also update
uc firmware pretty printer to accept const input.

v2: don't rely on current caps (Chris)
    dump correct fw info (Michal)
v3: simplify capture of custom paths (Chris)
v4: improve 'why' comment (Joonas)
    trim output if no fw path (Michal)
    group code around uc error state (Michal)
v5: use error in cleanup_uc (Michal)

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |  5 +++
 drivers/gpu/drm/i915/i915_gpu_error.c | 65 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_uc_fw.c    |  6 +++-
 drivers/gpu/drm/i915/intel_uc_fw.h    |  2 +-
 4 files changed, 76 insertions(+), 2 deletions(-)

Comments

Chris Wilson Nov. 6, 2017, 2:15 p.m. UTC | #1
Quoting Michal Wajdeczko (2017-10-26 18:36:55)
> Include GuC and HuC firmware details in captured error state
> to provide additional debug information. To reuse existing
> uc firmware pretty printer, introduce new drm-printer variant
> that works with our i915_error_state_buf output. Also update
> uc firmware pretty printer to accept const input.
> 
> v2: don't rely on current caps (Chris)
>     dump correct fw info (Michal)
> v3: simplify capture of custom paths (Chris)
> v4: improve 'why' comment (Joonas)
>     trim output if no fw path (Michal)
>     group code around uc error state (Michal)
> v5: use error in cleanup_uc (Michal)
> 
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  5 +++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 65 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_uc_fw.c    |  6 +++-
>  drivers/gpu/drm/i915/intel_uc_fw.h    |  2 +-
>  4 files changed, 76 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 366ba74..f19f0fa 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -911,6 +911,11 @@ struct i915_gpu_state {
>         struct intel_device_info device_info;
>         struct i915_params params;
>  
> +       struct i915_error_uc {
> +               struct intel_uc_fw guc_fw;
> +               struct intel_uc_fw huc_fw;
> +       } uc;
> +
>         /* Generic register state */
>         u32 eir;
>         u32 pgtbl_er;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 653fb69..4500fc8 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -30,6 +30,8 @@
>  #include <generated/utsrelease.h>
>  #include <linux/stop_machine.h>
>  #include <linux/zlib.h>
> +#include <drm/drm_print.h>
> +
>  #include "i915_drv.h"
>  
>  static const char *engine_str(int engine)
> @@ -175,6 +177,21 @@ static void i915_error_puts(struct drm_i915_error_state_buf *e,
>  #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
>  #define err_puts(e, s) i915_error_puts(e, s)
>  
> +static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
> +{
> +       i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
> +}
> +
> +static inline struct drm_printer
> +i915_error_printer(struct drm_i915_error_state_buf *e)
> +{
> +       struct drm_printer p = {
> +               .printfn = __i915_printfn_error,
> +               .arg = e,
> +       };
> +       return p;
> +}
> +
>  #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
>  
>  struct compress {
> @@ -589,11 +606,26 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m,
>                    pdev->subsystem_device);
>  }
>  
> +static void err_print_uc(struct drm_i915_error_state_buf *m,
> +                        const struct i915_error_uc *error_uc)
> +{
> +       struct drm_printer p = i915_error_printer(m);
> +       const struct i915_gpu_state *error =
> +               container_of(error_uc, typeof(*error), uc);
> +
> +       if (!error->device_info.has_guc)
> +               return;

I am still not keen on how derived state is mixed in with checking
whether or not a piece of fw was presented to HW before the hang, but it
is still better than before.

> +
> +       intel_uc_fw_dump(&error_uc->guc_fw, &p);
> +       intel_uc_fw_dump(&error_uc->huc_fw, &p);
> +}
> +
>  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>                             const struct i915_gpu_state *error)
>  {
>         struct drm_i915_private *dev_priv = m->i915;
>         struct drm_i915_error_object *obj;
> +
>         int i, j;
>  
>         if (!error) {
> @@ -773,6 +805,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  
>         err_print_capabilities(m, &error->device_info);
>         err_print_params(m, &error->params);
> +       err_print_uc(m, &error->uc);
>  
>         if (m->bytes == 0 && m->err)
>                 return m->err;
> @@ -831,6 +864,14 @@ static __always_inline void free_param(const char *type, void *x)
>                 kfree(*(void **)x);
>  }
>  
> +static void cleanup_uc_state(struct i915_gpu_state *error)
> +{
> +       struct i915_error_uc *error_uc = &error->uc;
> +
> +       kfree(error_uc->guc_fw.path);
> +       kfree(error_uc->huc_fw.path);
> +}
> +
>  void __i915_gpu_state_free(struct kref *error_ref)
>  {
>         struct i915_gpu_state *error =
> @@ -870,6 +911,8 @@ void __i915_gpu_state_free(struct kref *error_ref)
>         I915_PARAMS_FOR_EACH(FREE);
>  #undef FREE
>  
> +       cleanup_uc_state(error);
> +
>         kfree(error);
>  }
>  
> @@ -1559,6 +1602,26 @@ static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv,
>         error->pinned_bo = bo;
>  }
>  
> +static void capture_uc_state(struct i915_gpu_state *error)
> +{
> +       struct drm_i915_private *i915 = error->i915;
> +       struct i915_error_uc *error_uc = &error->uc;
> +
> +       /* Capturing uC state won't be useful if there is no GuC */
> +       if (!error->device_info.has_guc)
> +               return;
> +
> +       error_uc->guc_fw = i915->guc.fw;
> +       error_uc->huc_fw = i915->huc.fw;
> +
> +       /* Non-default firmware paths will be specified by the modparam.
> +        * As modparams are generally accessible from the userspace make
> +        * explicit copies of the firmware paths.
> +        */
> +       error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
> +       error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
> +}
> +
>  static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv,
>                                             struct i915_gpu_state *error)
>  {
> @@ -1710,6 +1773,8 @@ static int capture(void *data)
>         I915_PARAMS_FOR_EACH(DUP);
>  #undef DUP
>  
> +       capture_uc_state(error);
> +
>         i915_capture_gen_state(error->i915, error);
>         i915_capture_reg_state(error->i915, error);
>         i915_gem_record_fences(error->i915, error);
> diff --git a/drivers/gpu/drm/i915/intel_uc_fw.c b/drivers/gpu/drm/i915/intel_uc_fw.c
> index 973888e..79a8797 100644
> --- a/drivers/gpu/drm/i915/intel_uc_fw.c
> +++ b/drivers/gpu/drm/i915/intel_uc_fw.c
> @@ -299,10 +299,14 @@ void intel_uc_fw_fini(struct intel_uc_fw *uc_fw)
>   *
>   * Pretty printer for uC firmware.
>   */
> -void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p)
> +void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p)
>  {
>         drm_printf(p, "%s firmware: %s\n",
>                    intel_uc_fw_type_repr(uc_fw->type), uc_fw->path);
> +
> +       if (!uc_fw->path)
> +               return;

This could be NULL simply due to allocation failure. You still want the
status and version info.

As the path isn't dereferenced here, it is safe enough to drop this
chunk, as you currently don't even try and pretty-print the error state
unless it is enabled.

Removed the chunk and applied.
-Chris
Joonas Lahtinen Nov. 8, 2017, 11:27 a.m. UTC | #2
On Mon, 2017-11-06 at 14:15 +0000, Chris Wilson wrote:
> Quoting Michal Wajdeczko (2017-10-26 18:36:55)
> > Include GuC and HuC firmware details in captured error state
> > to provide additional debug information. To reuse existing
> > uc firmware pretty printer, introduce new drm-printer variant
> > that works with our i915_error_state_buf output. Also update
> > uc firmware pretty printer to accept const input.
> > 
> > v2: don't rely on current caps (Chris)
> >     dump correct fw info (Michal)
> > v3: simplify capture of custom paths (Chris)
> > v4: improve 'why' comment (Joonas)
> >     trim output if no fw path (Michal)
> >     group code around uc error state (Michal)
> > v5: use error in cleanup_uc (Michal)
> > 
> > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

<SNIP>

> 
> Removed the chunk and applied.

So, this got merged. What is the next series to look forward to? I
again see multiple GuC series in my inbox and on the mailing list.

Regards, Joonas
Michal Wajdeczko Nov. 8, 2017, 12:20 p.m. UTC | #3
On Wed, 08 Nov 2017 12:27:20 +0100, Joonas Lahtinen  
<joonas.lahtinen@linux.intel.com> wrote:

> On Mon, 2017-11-06 at 14:15 +0000, Chris Wilson wrote:
>> Quoting Michal Wajdeczko (2017-10-26 18:36:55)
>> > Include GuC and HuC firmware details in captured error state
>> > to provide additional debug information. To reuse existing
>> > uc firmware pretty printer, introduce new drm-printer variant
>> > that works with our i915_error_state_buf output. Also update
>> > uc firmware pretty printer to accept const input.
>> >
>> > v2: don't rely on current caps (Chris)
>> >     dump correct fw info (Michal)
>> > v3: simplify capture of custom paths (Chris)
>> > v4: improve 'why' comment (Joonas)
>> >     trim output if no fw path (Michal)
>> >     group code around uc error state (Michal)
>> > v5: use error in cleanup_uc (Michal)
>> >
>> > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>
> <SNIP>
>
>>
>> Removed the chunk and applied.
>
> So, this got merged. What is the next series to look forward to? I
> again see multiple GuC series in my inbox and on the mailing list.
>

I think we should wait now for new series from Sujaritha

Michal
Michal Wajdeczko Nov. 8, 2017, 8:21 p.m. UTC | #4
On Wed, 08 Nov 2017 13:20:16 +0100, Michal Wajdeczko  
<michal.wajdeczko@intel.com> wrote:

> On Wed, 08 Nov 2017 12:27:20 +0100, Joonas Lahtinen  
> <joonas.lahtinen@linux.intel.com> wrote:
>
>> On Mon, 2017-11-06 at 14:15 +0000, Chris Wilson wrote:
>>> Quoting Michal Wajdeczko (2017-10-26 18:36:55)
>>> > Include GuC and HuC firmware details in captured error state
>>> > to provide additional debug information. To reuse existing
>>> > uc firmware pretty printer, introduce new drm-printer variant
>>> > that works with our i915_error_state_buf output. Also update
>>> > uc firmware pretty printer to accept const input.
>>> >
>>> > v2: don't rely on current caps (Chris)
>>> >     dump correct fw info (Michal)
>>> > v3: simplify capture of custom paths (Chris)
>>> > v4: improve 'why' comment (Joonas)
>>> >     trim output if no fw path (Michal)
>>> >     group code around uc error state (Michal)
>>> > v5: use error in cleanup_uc (Michal)
>>> >
>>> > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>
>> <SNIP>
>>
>>>
>>> Removed the chunk and applied.
>>
>> So, this got merged. What is the next series to look forward to? I
>> again see multiple GuC series in my inbox and on the mailing list.
>>
>
> I think we should wait now for new series from Sujaritha
>

I forgot about this series (already reviewed by Sagar) that
you can look at while waiting for Sujaritha's update:

https://patchwork.freedesktop.org/series/33135/

Michal
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 366ba74..f19f0fa 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -911,6 +911,11 @@  struct i915_gpu_state {
 	struct intel_device_info device_info;
 	struct i915_params params;
 
+	struct i915_error_uc {
+		struct intel_uc_fw guc_fw;
+		struct intel_uc_fw huc_fw;
+	} uc;
+
 	/* Generic register state */
 	u32 eir;
 	u32 pgtbl_er;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 653fb69..4500fc8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -30,6 +30,8 @@ 
 #include <generated/utsrelease.h>
 #include <linux/stop_machine.h>
 #include <linux/zlib.h>
+#include <drm/drm_print.h>
+
 #include "i915_drv.h"
 
 static const char *engine_str(int engine)
@@ -175,6 +177,21 @@  static void i915_error_puts(struct drm_i915_error_state_buf *e,
 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
 #define err_puts(e, s) i915_error_puts(e, s)
 
+static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
+{
+	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
+}
+
+static inline struct drm_printer
+i915_error_printer(struct drm_i915_error_state_buf *e)
+{
+	struct drm_printer p = {
+		.printfn = __i915_printfn_error,
+		.arg = e,
+	};
+	return p;
+}
+
 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
 
 struct compress {
@@ -589,11 +606,26 @@  static void err_print_pciid(struct drm_i915_error_state_buf *m,
 		   pdev->subsystem_device);
 }
 
+static void err_print_uc(struct drm_i915_error_state_buf *m,
+			 const struct i915_error_uc *error_uc)
+{
+	struct drm_printer p = i915_error_printer(m);
+	const struct i915_gpu_state *error =
+		container_of(error_uc, typeof(*error), uc);
+
+	if (!error->device_info.has_guc)
+		return;
+
+	intel_uc_fw_dump(&error_uc->guc_fw, &p);
+	intel_uc_fw_dump(&error_uc->huc_fw, &p);
+}
+
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			    const struct i915_gpu_state *error)
 {
 	struct drm_i915_private *dev_priv = m->i915;
 	struct drm_i915_error_object *obj;
+
 	int i, j;
 
 	if (!error) {
@@ -773,6 +805,7 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 
 	err_print_capabilities(m, &error->device_info);
 	err_print_params(m, &error->params);
+	err_print_uc(m, &error->uc);
 
 	if (m->bytes == 0 && m->err)
 		return m->err;
@@ -831,6 +864,14 @@  static __always_inline void free_param(const char *type, void *x)
 		kfree(*(void **)x);
 }
 
+static void cleanup_uc_state(struct i915_gpu_state *error)
+{
+	struct i915_error_uc *error_uc = &error->uc;
+
+	kfree(error_uc->guc_fw.path);
+	kfree(error_uc->huc_fw.path);
+}
+
 void __i915_gpu_state_free(struct kref *error_ref)
 {
 	struct i915_gpu_state *error =
@@ -870,6 +911,8 @@  void __i915_gpu_state_free(struct kref *error_ref)
 	I915_PARAMS_FOR_EACH(FREE);
 #undef FREE
 
+	cleanup_uc_state(error);
+
 	kfree(error);
 }
 
@@ -1559,6 +1602,26 @@  static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv,
 	error->pinned_bo = bo;
 }
 
+static void capture_uc_state(struct i915_gpu_state *error)
+{
+	struct drm_i915_private *i915 = error->i915;
+	struct i915_error_uc *error_uc = &error->uc;
+
+	/* Capturing uC state won't be useful if there is no GuC */
+	if (!error->device_info.has_guc)
+		return;
+
+	error_uc->guc_fw = i915->guc.fw;
+	error_uc->huc_fw = i915->huc.fw;
+
+	/* Non-default firmware paths will be specified by the modparam.
+	 * As modparams are generally accessible from the userspace make
+	 * explicit copies of the firmware paths.
+	 */
+	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
+	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
+}
+
 static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv,
 					    struct i915_gpu_state *error)
 {
@@ -1710,6 +1773,8 @@  static int capture(void *data)
 	I915_PARAMS_FOR_EACH(DUP);
 #undef DUP
 
+	capture_uc_state(error);
+
 	i915_capture_gen_state(error->i915, error);
 	i915_capture_reg_state(error->i915, error);
 	i915_gem_record_fences(error->i915, error);
diff --git a/drivers/gpu/drm/i915/intel_uc_fw.c b/drivers/gpu/drm/i915/intel_uc_fw.c
index 973888e..79a8797 100644
--- a/drivers/gpu/drm/i915/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/intel_uc_fw.c
@@ -299,10 +299,14 @@  void intel_uc_fw_fini(struct intel_uc_fw *uc_fw)
  *
  * Pretty printer for uC firmware.
  */
-void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p)
+void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p)
 {
 	drm_printf(p, "%s firmware: %s\n",
 		   intel_uc_fw_type_repr(uc_fw->type), uc_fw->path);
+
+	if (!uc_fw->path)
+		return;
+
 	drm_printf(p, "\tstatus: fetch %s, load %s\n",
 		   intel_uc_fw_status_repr(uc_fw->fetch_status),
 		   intel_uc_fw_status_repr(uc_fw->load_status));
diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h b/drivers/gpu/drm/i915/intel_uc_fw.h
index 1329036..5394d9d 100644
--- a/drivers/gpu/drm/i915/intel_uc_fw.h
+++ b/drivers/gpu/drm/i915/intel_uc_fw.h
@@ -116,6 +116,6 @@  int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,
 		       int (*xfer)(struct intel_uc_fw *uc_fw,
 				   struct i915_vma *vma));
 void intel_uc_fw_fini(struct intel_uc_fw *uc_fw);
-void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p);
+void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p);
 
 #endif