diff mbox series

[3/3] drm/i915: Log catastrophic errors on gen11

Message ID 20190412153723.31931-3-mika.kuoppala@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/i915: Shortcut readiness to reset check | expand

Commit Message

Mika Kuoppala April 12, 2019, 3:37 p.m. UTC
Add a log entry to indicate if engine reported an intr
condition that is a sign of halted command streamer.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 24 +++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h |  4 ++++
 2 files changed, 25 insertions(+), 3 deletions(-)

Comments

Chris Wilson April 12, 2019, 3:42 p.m. UTC | #1
Quoting Mika Kuoppala (2019-04-12 16:37:23)
> Add a log entry to indicate if engine reported an intr
> condition that is a sign of halted command streamer.

It's a user error. Isn't that we spam the "there has been an error;
resetting the gpu" message enough?

You could just look at the iir in the error capture for the same effect.

> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 24 +++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_reg.h |  4 ++++
>  2 files changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d934545445e1..c64ef1291d62 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1480,7 +1480,7 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  }
>  
>  static void
> -gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> +gen8_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
>  {
>         bool tasklet = false;
>  
> @@ -1496,6 +1496,20 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>                 tasklet_hi_schedule(&engine->execlists.tasklet);
>  }
>  
> +static void
> +gen11_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
> +{
> +       const u32 error_intrs =
> +               GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
> +                GT_CATASTROPHIC_ERROR_INTERRUPT |
> +               GT_CS_MASTER_ERROR_INTERRUPT;
> +
> +       if (likely(!(error_intrs & iir)))

Likely?

> +               return gen8_cs_irq_handler(engine, iir);
> +
> +       DRM_ERROR("%s: intr 0x%08x\n", engine->name, iir);
> +}
> +
>  static void gen8_gt_irq_ack(struct drm_i915_private *i915,
>                             u32 master_ctl, u32 gt_iir[4])
>  {
> @@ -3002,7 +3016,7 @@ gen11_engine_irq_handler(struct drm_i915_private * const i915,
>                 engine = NULL;
>  
>         if (likely(engine))
> -               return gen8_cs_irq_handler(engine, iir);
> +               return gen11_cs_irq_handler(engine, iir);
>  
>         WARN_ONCE(1, "unhandled engine interrupt class=0x%x, instance=0x%x\n",
>                   class, instance);
> @@ -4120,7 +4134,11 @@ static int gen8_irq_postinstall(struct drm_device *dev)
>  
>  static void gen11_gt_irq_postinstall(struct drm_i915_private *dev_priv)
>  {
> -       const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> +       const u32 irqs = GT_CATASTROPHIC_ERROR_INTERRUPT |
> +               GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
> +               GT_CS_MASTER_ERROR_INTERRUPT |
> +               GT_RENDER_USER_INTERRUPT |
> +               GT_CONTEXT_SWITCH_INTERRUPT;
>  
>         BUILD_BUG_ON(irqs & 0xffff0000);
>  
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index c1c0f7ab03e9..fb1cff071f7a 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2996,6 +2996,10 @@ enum i915_power_well_id {
>  #define GT_RENDER_DEBUG_INTERRUPT              (1 <<  1)
>  #define GT_RENDER_USER_INTERRUPT               (1 <<  0)
>  
> +#define GT_CATASTROPHIC_ERROR_INTERRUPT                (1 << 15) /* gen11 */
> +#define GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT     (1 << 7)  /* gen11 */
> +#define GT_CS_MASTER_ERROR_INTERRUPT           (1 << 3) /* gen11 */
> +
>  #define PM_VEBOX_CS_ERROR_INTERRUPT            (1 << 12) /* hsw+ */
>  #define PM_VEBOX_USER_INTERRUPT                        (1 << 10) /* hsw+ */
>  
> -- 
> 2.17.1
>
Mika Kuoppala April 12, 2019, 3:47 p.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-04-12 16:37:23)
>> Add a log entry to indicate if engine reported an intr
>> condition that is a sign of halted command streamer.
>
> It's a user error. Isn't that we spam the "there has been an error;
> resetting the gpu" message enough?
>
> You could just look at the iir in the error capture for the same effect.
>

Yeah it is debatable if this adds any value at all in this form.
Could drop this and introduce only if I get motivation
to handle reset faster straight from encountering error
intr.

>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_irq.c | 24 +++++++++++++++++++++---
>>  drivers/gpu/drm/i915/i915_reg.h |  4 ++++
>>  2 files changed, 25 insertions(+), 3 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index d934545445e1..c64ef1291d62 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -1480,7 +1480,7 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>>  }
>>  
>>  static void
>> -gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>> +gen8_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
>>  {
>>         bool tasklet = false;
>>  
>> @@ -1496,6 +1496,20 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>>                 tasklet_hi_schedule(&engine->execlists.tasklet);
>>  }
>>  
>> +static void
>> +gen11_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
>> +{
>> +       const u32 error_intrs =
>> +               GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
>> +                GT_CATASTROPHIC_ERROR_INTERRUPT |
>> +               GT_CS_MASTER_ERROR_INTERRUPT;
>> +
>> +       if (likely(!(error_intrs & iir)))
>
> Likely?
>

Likely to not have errors.
-Mika

>> +               return gen8_cs_irq_handler(engine, iir);
>> +
>> +       DRM_ERROR("%s: intr 0x%08x\n", engine->name, iir);
>> +}
>> +
>>  static void gen8_gt_irq_ack(struct drm_i915_private *i915,
>>                             u32 master_ctl, u32 gt_iir[4])
>>  {
>> @@ -3002,7 +3016,7 @@ gen11_engine_irq_handler(struct drm_i915_private * const i915,
>>                 engine = NULL;
>>  
>>         if (likely(engine))
>> -               return gen8_cs_irq_handler(engine, iir);
>> +               return gen11_cs_irq_handler(engine, iir);
>>  
>>         WARN_ONCE(1, "unhandled engine interrupt class=0x%x, instance=0x%x\n",
>>                   class, instance);
>> @@ -4120,7 +4134,11 @@ static int gen8_irq_postinstall(struct drm_device *dev)
>>  
>>  static void gen11_gt_irq_postinstall(struct drm_i915_private *dev_priv)
>>  {
>> -       const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
>> +       const u32 irqs = GT_CATASTROPHIC_ERROR_INTERRUPT |
>> +               GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
>> +               GT_CS_MASTER_ERROR_INTERRUPT |
>> +               GT_RENDER_USER_INTERRUPT |
>> +               GT_CONTEXT_SWITCH_INTERRUPT;
>>  
>>         BUILD_BUG_ON(irqs & 0xffff0000);
>>  
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index c1c0f7ab03e9..fb1cff071f7a 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -2996,6 +2996,10 @@ enum i915_power_well_id {
>>  #define GT_RENDER_DEBUG_INTERRUPT              (1 <<  1)
>>  #define GT_RENDER_USER_INTERRUPT               (1 <<  0)
>>  
>> +#define GT_CATASTROPHIC_ERROR_INTERRUPT                (1 << 15) /* gen11 */
>> +#define GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT     (1 << 7)  /* gen11 */
>> +#define GT_CS_MASTER_ERROR_INTERRUPT           (1 << 3) /* gen11 */
>> +
>>  #define PM_VEBOX_CS_ERROR_INTERRUPT            (1 << 12) /* hsw+ */
>>  #define PM_VEBOX_USER_INTERRUPT                        (1 << 10) /* hsw+ */
>>  
>> -- 
>> 2.17.1
>>
Chris Wilson April 12, 2019, 3:53 p.m. UTC | #3
Quoting Mika Kuoppala (2019-04-12 16:47:11)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2019-04-12 16:37:23)
> >> Add a log entry to indicate if engine reported an intr
> >> condition that is a sign of halted command streamer.
> >
> > It's a user error. Isn't that we spam the "there has been an error;
> > resetting the gpu" message enough?
> >
> > You could just look at the iir in the error capture for the same effect.
> >
> 
> Yeah it is debatable if this adds any value at all in this form.
> Could drop this and introduce only if I get motivation
> to handle reset faster straight from encountering error
> intr.
> 
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_irq.c | 24 +++++++++++++++++++++---
> >>  drivers/gpu/drm/i915/i915_reg.h |  4 ++++
> >>  2 files changed, 25 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >> index d934545445e1..c64ef1291d62 100644
> >> --- a/drivers/gpu/drm/i915/i915_irq.c
> >> +++ b/drivers/gpu/drm/i915/i915_irq.c
> >> @@ -1480,7 +1480,7 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
> >>  }
> >>  
> >>  static void
> >> -gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> >> +gen8_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
> >>  {
> >>         bool tasklet = false;
> >>  
> >> @@ -1496,6 +1496,20 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> >>                 tasklet_hi_schedule(&engine->execlists.tasklet);
> >>  }
> >>  
> >> +static void
> >> +gen11_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
> >> +{
> >> +       const u32 error_intrs =
> >> +               GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
> >> +                GT_CATASTROPHIC_ERROR_INTERRUPT |
> >> +               GT_CS_MASTER_ERROR_INTERRUPT;
> >> +
> >> +       if (likely(!(error_intrs & iir)))
> >
> > Likely?
> >
> 
> Likely to not have errors.

Oh, this is the CS/USER_INTERRUPT handler. Yeah, much more likely. So
much more, I'd push more for this to be polled by the error handler :)
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d934545445e1..c64ef1291d62 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1480,7 +1480,7 @@  static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 }
 
 static void
-gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
+gen8_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
 {
 	bool tasklet = false;
 
@@ -1496,6 +1496,20 @@  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 		tasklet_hi_schedule(&engine->execlists.tasklet);
 }
 
+static void
+gen11_cs_irq_handler(struct intel_engine_cs *engine, const u32 iir)
+{
+	const u32 error_intrs =
+		GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
+		 GT_CATASTROPHIC_ERROR_INTERRUPT |
+		GT_CS_MASTER_ERROR_INTERRUPT;
+
+	if (likely(!(error_intrs & iir)))
+		return gen8_cs_irq_handler(engine, iir);
+
+	DRM_ERROR("%s: intr 0x%08x\n", engine->name, iir);
+}
+
 static void gen8_gt_irq_ack(struct drm_i915_private *i915,
 			    u32 master_ctl, u32 gt_iir[4])
 {
@@ -3002,7 +3016,7 @@  gen11_engine_irq_handler(struct drm_i915_private * const i915,
 		engine = NULL;
 
 	if (likely(engine))
-		return gen8_cs_irq_handler(engine, iir);
+		return gen11_cs_irq_handler(engine, iir);
 
 	WARN_ONCE(1, "unhandled engine interrupt class=0x%x, instance=0x%x\n",
 		  class, instance);
@@ -4120,7 +4134,11 @@  static int gen8_irq_postinstall(struct drm_device *dev)
 
 static void gen11_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 {
-	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
+	const u32 irqs = GT_CATASTROPHIC_ERROR_INTERRUPT |
+		GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT |
+		GT_CS_MASTER_ERROR_INTERRUPT |
+		GT_RENDER_USER_INTERRUPT |
+		GT_CONTEXT_SWITCH_INTERRUPT;
 
 	BUILD_BUG_ON(irqs & 0xffff0000);
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index c1c0f7ab03e9..fb1cff071f7a 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2996,6 +2996,10 @@  enum i915_power_well_id {
 #define GT_RENDER_DEBUG_INTERRUPT		(1 <<  1)
 #define GT_RENDER_USER_INTERRUPT		(1 <<  0)
 
+#define GT_CATASTROPHIC_ERROR_INTERRUPT		(1 << 15) /* gen11 */
+#define GT_LEGACY_CONTEXT_PPGTT_PAGE_FAULT	(1 << 7)  /* gen11 */
+#define GT_CS_MASTER_ERROR_INTERRUPT		(1 << 3) /* gen11 */
+
 #define PM_VEBOX_CS_ERROR_INTERRUPT		(1 << 12) /* hsw+ */
 #define PM_VEBOX_USER_INTERRUPT			(1 << 10) /* hsw+ */