diff mbox series

[RFC,2/2] drm/i915: Use ABI engine class in error state ecode

Message ID 20201104134743.916027-2-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/2] drm/i915: Improve record of hung engines in error state | expand

Commit Message

Tvrtko Ursulin Nov. 4, 2020, 1:47 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Instead of printing out the internal engine mask, which can change between
kernel versions making it difficult to map to actual engines, present a
bitmask of hanging engines ABI classes. For example:

  [drm] GPU HANG: ecode 9:24dffffd:8, in gem_exec_schedu [1334]

Notice the swapped the order of ecode and bitmask which makes the new
versus old bug reports are obvious.

Engine ABI class is useful to quickly categorize render vs media etc hangs
in bug reports. Considering virtual engine even more so than the current
scheme.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

Comments

Chris Wilson Nov. 4, 2020, 11:30 p.m. UTC | #1
Quoting Tvrtko Ursulin (2020-11-04 13:47:43)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Instead of printing out the internal engine mask, which can change between
> kernel versions making it difficult to map to actual engines, present a
> bitmask of hanging engines ABI classes. For example:
> 
>   [drm] GPU HANG: ecode 9:24dffffd:8, in gem_exec_schedu [1334]
> 
> Notice the swapped the order of ecode and bitmask which makes the new
> versus old bug reports are obvious.
> 
> Engine ABI class is useful to quickly categorize render vs media etc hangs
> in bug reports. Considering virtual engine even more so than the current
> scheme.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 857db66cc4a3..e7d9af184d58 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1659,17 +1659,16 @@ static u32 generate_ecode(const struct intel_engine_coredump *ee)
>  static const char *error_msg(struct i915_gpu_coredump *error)
>  {
>         struct intel_engine_coredump *first = NULL;
> +       unsigned int hung_classes = 0;
>         struct intel_gt_coredump *gt;
> -       intel_engine_mask_t engines;
>         int len;
>  
> -       engines = 0;
>         for (gt = error->gt; gt; gt = gt->next) {
>                 struct intel_engine_coredump *cs;
>  
>                 for (cs = gt->engine; cs; cs = cs->next) {
>                         if (cs->hung) {
> -                               engines |= cs->engine->mask;
> +                               hung_classes |= BIT(cs->engine->uabi_class);

Your argument makes sense.

>                                 if (!first)
>                                         first = cs;
>                         }
> @@ -1677,9 +1676,11 @@ static const char *error_msg(struct i915_gpu_coredump *error)
>         }
>  
>         len = scnprintf(error->error_msg, sizeof(error->error_msg),
> -                       "GPU HANG: ecode %d:%x:%08x",
> -                       INTEL_GEN(error->i915), engines,
> -                       generate_ecode(first));
> +                       "GPU HANG: ecode %d:%08x:%x",
> +                       INTEL_GEN(error->i915),
> +                       generate_ecode(first),
> +                       hung_classes);

I vote for keeping gen:engines:ecode order, for me that is biggest to
smallest.
-Chris
Tvrtko Ursulin Nov. 5, 2020, 9:31 a.m. UTC | #2
On 04/11/2020 23:30, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-11-04 13:47:43)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Instead of printing out the internal engine mask, which can change between
>> kernel versions making it difficult to map to actual engines, present a
>> bitmask of hanging engines ABI classes. For example:
>>
>>    [drm] GPU HANG: ecode 9:24dffffd:8, in gem_exec_schedu [1334]
>>
>> Notice the swapped the order of ecode and bitmask which makes the new
>> versus old bug reports are obvious.
>>
>> Engine ABI class is useful to quickly categorize render vs media etc hangs
>> in bug reports. Considering virtual engine even more so than the current
>> scheme.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++++++------
>>   1 file changed, 7 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index 857db66cc4a3..e7d9af184d58 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -1659,17 +1659,16 @@ static u32 generate_ecode(const struct intel_engine_coredump *ee)
>>   static const char *error_msg(struct i915_gpu_coredump *error)
>>   {
>>          struct intel_engine_coredump *first = NULL;
>> +       unsigned int hung_classes = 0;
>>          struct intel_gt_coredump *gt;
>> -       intel_engine_mask_t engines;
>>          int len;
>>   
>> -       engines = 0;
>>          for (gt = error->gt; gt; gt = gt->next) {
>>                  struct intel_engine_coredump *cs;
>>   
>>                  for (cs = gt->engine; cs; cs = cs->next) {
>>                          if (cs->hung) {
>> -                               engines |= cs->engine->mask;
>> +                               hung_classes |= BIT(cs->engine->uabi_class);
> 
> Your argument makes sense.
> 
>>                                  if (!first)
>>                                          first = cs;
>>                          }
>> @@ -1677,9 +1676,11 @@ static const char *error_msg(struct i915_gpu_coredump *error)
>>          }
>>   
>>          len = scnprintf(error->error_msg, sizeof(error->error_msg),
>> -                       "GPU HANG: ecode %d:%x:%08x",
>> -                       INTEL_GEN(error->i915), engines,
>> -                       generate_ecode(first));
>> +                       "GPU HANG: ecode %d:%08x:%x",
>> +                       INTEL_GEN(error->i915),
>> +                       generate_ecode(first),
>> +                       hung_classes);
> 
> I vote for keeping gen:engines:ecode order, for me that is biggest to
> smallest.

It would not be obvious what kind of bits are in the mask then, say 
looking from the ecode in maybe bugzilla titles and two different bugs 
may be incorrectly marked as duplicate. Maybe instead of the order we 
could change the separator(s)? Or add prefix/suffix to the mask?

Regards,

Tvrtko
Chris Wilson Nov. 5, 2020, 9:49 a.m. UTC | #3
Quoting Tvrtko Ursulin (2020-11-05 09:31:07)
> 
> On 04/11/2020 23:30, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-11-04 13:47:43)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Instead of printing out the internal engine mask, which can change between
> >> kernel versions making it difficult to map to actual engines, present a
> >> bitmask of hanging engines ABI classes. For example:
> >>
> >>    [drm] GPU HANG: ecode 9:24dffffd:8, in gem_exec_schedu [1334]
> >>
> >> Notice the swapped the order of ecode and bitmask which makes the new
> >> versus old bug reports are obvious.
> >>
> >> Engine ABI class is useful to quickly categorize render vs media etc hangs
> >> in bug reports. Considering virtual engine even more so than the current
> >> scheme.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++++++------
> >>   1 file changed, 7 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> >> index 857db66cc4a3..e7d9af184d58 100644
> >> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> >> @@ -1659,17 +1659,16 @@ static u32 generate_ecode(const struct intel_engine_coredump *ee)
> >>   static const char *error_msg(struct i915_gpu_coredump *error)
> >>   {
> >>          struct intel_engine_coredump *first = NULL;
> >> +       unsigned int hung_classes = 0;
> >>          struct intel_gt_coredump *gt;
> >> -       intel_engine_mask_t engines;
> >>          int len;
> >>   
> >> -       engines = 0;
> >>          for (gt = error->gt; gt; gt = gt->next) {
> >>                  struct intel_engine_coredump *cs;
> >>   
> >>                  for (cs = gt->engine; cs; cs = cs->next) {
> >>                          if (cs->hung) {
> >> -                               engines |= cs->engine->mask;
> >> +                               hung_classes |= BIT(cs->engine->uabi_class);
> > 
> > Your argument makes sense.
> > 
> >>                                  if (!first)
> >>                                          first = cs;
> >>                          }
> >> @@ -1677,9 +1676,11 @@ static const char *error_msg(struct i915_gpu_coredump *error)
> >>          }
> >>   
> >>          len = scnprintf(error->error_msg, sizeof(error->error_msg),
> >> -                       "GPU HANG: ecode %d:%x:%08x",
> >> -                       INTEL_GEN(error->i915), engines,
> >> -                       generate_ecode(first));
> >> +                       "GPU HANG: ecode %d:%08x:%x",
> >> +                       INTEL_GEN(error->i915),
> >> +                       generate_ecode(first),
> >> +                       hung_classes);
> > 
> > I vote for keeping gen:engines:ecode order, for me that is biggest to
> > smallest.
> 
> It would not be obvious what kind of bits are in the mask then, say 
> looking from the ecode in maybe bugzilla titles and two different bugs 
> may be incorrectly marked as duplicate.

No one should be marking bugs as duplicate based on this string. It
really is that useless.

> Maybe instead of the order we 
> could change the separator(s)? Or add prefix/suffix to the mask?

I don't see the point; we've changed the construction several times.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 857db66cc4a3..e7d9af184d58 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1659,17 +1659,16 @@  static u32 generate_ecode(const struct intel_engine_coredump *ee)
 static const char *error_msg(struct i915_gpu_coredump *error)
 {
 	struct intel_engine_coredump *first = NULL;
+	unsigned int hung_classes = 0;
 	struct intel_gt_coredump *gt;
-	intel_engine_mask_t engines;
 	int len;
 
-	engines = 0;
 	for (gt = error->gt; gt; gt = gt->next) {
 		struct intel_engine_coredump *cs;
 
 		for (cs = gt->engine; cs; cs = cs->next) {
 			if (cs->hung) {
-				engines |= cs->engine->mask;
+				hung_classes |= BIT(cs->engine->uabi_class);
 				if (!first)
 					first = cs;
 			}
@@ -1677,9 +1676,11 @@  static const char *error_msg(struct i915_gpu_coredump *error)
 	}
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%x:%08x",
-			INTEL_GEN(error->i915), engines,
-			generate_ecode(first));
+			"GPU HANG: ecode %d:%08x:%x",
+			INTEL_GEN(error->i915),
+			generate_ecode(first),
+			hung_classes);
+
 	if (first && first->context.pid) {
 		/* Just show the first executing process, more is confusing */
 		len += scnprintf(error->error_msg + len,