diff mbox series

[3/8] drm/i915: Verify GT workaround state at runtime

Message ID 20181130113201.13007-4-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Restore workarounds after engine reset and unify their handling | expand

Commit Message

Tvrtko Ursulin Nov. 30, 2018, 11:31 a.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Since we now have all the GT workarounds in a table, by adding a simple
shared helper function we can now verify that their values are still
applied after some interesting events in the lifetime of the driver.

At this stage these are the driver initialization and engine reset.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c          |  4 +++
 drivers/gpu/drm/i915/i915_gem.c          |  3 ++
 drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
 4 files changed, 55 insertions(+)

Comments

Chris Wilson Nov. 30, 2018, 11:38 a.m. UTC | #1
Quoting Tvrtko Ursulin (2018-11-30 11:31:56)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Since we now have all the GT workarounds in a table, by adding a simple
> shared helper function we can now verify that their values are still
> applied after some interesting events in the lifetime of the driver.
> 
> At this stage these are the driver initialization and engine reset.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c          |  4 +++
>  drivers/gpu/drm/i915/i915_gem.c          |  3 ++
>  drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
>  4 files changed, 55 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 2f3dc1cf83a6..14d019c9455b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -53,6 +53,7 @@
>  #include "i915_vgpu.h"
>  #include "intel_drv.h"
>  #include "intel_uc.h"
> +#include "intel_workarounds.h"
>  
>  static struct drm_driver driver;
>  
> @@ -2362,6 +2363,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>                 goto out;
>         }
>  
> +       /* Catch GT workarounds affected by engine reset. */
> +       intel_gt_workarounds_verify(engine->i915, engine->name);

I'd rather, quite strongly, not have this inside i915_reset_engine()
itself. i915_reset_engine() is [designed to be] called from atomic
context. Adding I915_READ() here is questionable. My preference is to
have all this inside selftests/intel_workarounds where we can cross
check the gt/all-engines after device reset and other engine reset.
-Chris
Chris Wilson Nov. 30, 2018, 11:54 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-11-30 11:31:56)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Since we now have all the GT workarounds in a table, by adding a simple
> shared helper function we can now verify that their values are still
> applied after some interesting events in the lifetime of the driver.
> 
> At this stage these are the driver initialization and engine reset.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c          |  4 +++
>  drivers/gpu/drm/i915/i915_gem.c          |  3 ++
>  drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
>  4 files changed, 55 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 2f3dc1cf83a6..14d019c9455b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -53,6 +53,7 @@
>  #include "i915_vgpu.h"
>  #include "intel_drv.h"
>  #include "intel_uc.h"
> +#include "intel_workarounds.h"
>  
>  static struct drm_driver driver;
>  
> @@ -2362,6 +2363,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>                 goto out;
>         }
>  
> +       /* Catch GT workarounds affected by engine reset. */
> +       intel_gt_workarounds_verify(engine->i915, engine->name);
> +
>         /*
>          * The request that caused the hang is stuck on elsp, we know the
>          * active request and can drop it, adjust head to skip the offending
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 18adb3dd1fcd..1eff471d4366 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5334,7 +5334,10 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
>                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
>                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
>  
> +       /* Apply the GT workarounds... */
>         intel_gt_workarounds_apply(dev_priv);
> +       /* ...and determine whether they are sticking. */
> +       intel_gt_workarounds_verify(dev_priv, "init");
>  
>         i915_gem_init_swizzling(dev_priv);
>  
> diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
> index be63a2af3481..a5c0d206b2a4 100644
> --- a/drivers/gpu/drm/i915/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/intel_workarounds.c
> @@ -981,6 +981,52 @@ void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv)
>         wa_list_apply(dev_priv, &dev_priv->gt_wa_list);
>  }
>  
> +static void
> +wa_fail(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
> +{
> +       DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x, mask=%x)\n",
> +                 name, from,
> +                 i915_mmio_reg_offset(wa->reg),
> +                 cur, cur & wa->mask, wa->val, wa->mask);
> +}
> +
> +static void
> +wa_verify_bits(const struct i915_wa *wa, u32 cur, const char *name,
> +              const char *from)
> +{
> +       u32 bits = wa->mask;
> +       u32 cur_ = cur;
> +       u32 val_ = wa->val;

Make sure wa->mask is set to ~0u for whole registers, which it must be
for the wa_fail.

if ((cur ^ wa->val) & wa->mask) {
	wa_fail(wa, cur, name, whom);
	return false; /* make it bool now to save churn later? */
}

-Chris
Tvrtko Ursulin Nov. 30, 2018, 12:02 p.m. UTC | #3
On 30/11/2018 11:38, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-11-30 11:31:56)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Since we now have all the GT workarounds in a table, by adding a simple
>> shared helper function we can now verify that their values are still
>> applied after some interesting events in the lifetime of the driver.
>>
>> At this stage these are the driver initialization and engine reset.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.c          |  4 +++
>>   drivers/gpu/drm/i915/i915_gem.c          |  3 ++
>>   drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
>>   4 files changed, 55 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>> index 2f3dc1cf83a6..14d019c9455b 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -53,6 +53,7 @@
>>   #include "i915_vgpu.h"
>>   #include "intel_drv.h"
>>   #include "intel_uc.h"
>> +#include "intel_workarounds.h"
>>   
>>   static struct drm_driver driver;
>>   
>> @@ -2362,6 +2363,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>>                  goto out;
>>          }
>>   
>> +       /* Catch GT workarounds affected by engine reset. */
>> +       intel_gt_workarounds_verify(engine->i915, engine->name);
> 
> I'd rather, quite strongly, not have this inside i915_reset_engine()
> itself. i915_reset_engine() is [designed to be] called from atomic
> context. Adding I915_READ() here is questionable. My preference is to
> have all this inside selftests/intel_workarounds where we can cross
> check the gt/all-engines after device reset and other engine reset.

I wasn't sure myself how much value checking it here adds, but regarding 
I915_READs, they are happening during engine->init_hw anyway. So I don't 
think a few more would harm. But yes, as said, since I was unsure myself 
I am happy to drop this hunk.

Regards,

Tvrtko
Chris Wilson Nov. 30, 2018, 2:36 p.m. UTC | #4
Quoting Tvrtko Ursulin (2018-11-30 12:02:56)
> 
> On 30/11/2018 11:38, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-11-30 11:31:56)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Since we now have all the GT workarounds in a table, by adding a simple
> >> shared helper function we can now verify that their values are still
> >> applied after some interesting events in the lifetime of the driver.
> >>
> >> At this stage these are the driver initialization and engine reset.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_drv.c          |  4 +++
> >>   drivers/gpu/drm/i915/i915_gem.c          |  3 ++
> >>   drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
> >>   drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
> >>   4 files changed, 55 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> >> index 2f3dc1cf83a6..14d019c9455b 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.c
> >> +++ b/drivers/gpu/drm/i915/i915_drv.c
> >> @@ -53,6 +53,7 @@
> >>   #include "i915_vgpu.h"
> >>   #include "intel_drv.h"
> >>   #include "intel_uc.h"
> >> +#include "intel_workarounds.h"
> >>   
> >>   static struct drm_driver driver;
> >>   
> >> @@ -2362,6 +2363,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> >>                  goto out;
> >>          }
> >>   
> >> +       /* Catch GT workarounds affected by engine reset. */
> >> +       intel_gt_workarounds_verify(engine->i915, engine->name);
> > 
> > I'd rather, quite strongly, not have this inside i915_reset_engine()
> > itself. i915_reset_engine() is [designed to be] called from atomic
> > context. Adding I915_READ() here is questionable. My preference is to
> > have all this inside selftests/intel_workarounds where we can cross
> > check the gt/all-engines after device reset and other engine reset.
> 
> I wasn't sure myself how much value checking it here adds, but regarding 
> I915_READs, they are happening during engine->init_hw anyway. So I don't 
> think a few more would harm. But yes, as said, since I was unsure myself 
> I am happy to drop this hunk.

Adding the I915_READ isn't the end-of-the-world, I am more wary of the
slipperly slope and suddenly finding ourselves with a per-engine reset
that is no longer suitable for fast resets, possibly from hardirq
context. (Although that's more likely to be softirq really.)

In this case, I think we can satisfy ourselves with a comprehensive yet
fast test suite. And there's no harm in having the checks on idle for
runtime consistency checking (we can even inject SRM before idling for
completeness).
-Chris
Tvrtko Ursulin Nov. 30, 2018, 3:21 p.m. UTC | #5
On 30/11/2018 11:54, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-11-30 11:31:56)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Since we now have all the GT workarounds in a table, by adding a simple
>> shared helper function we can now verify that their values are still
>> applied after some interesting events in the lifetime of the driver.
>>
>> At this stage these are the driver initialization and engine reset.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.c          |  4 +++
>>   drivers/gpu/drm/i915/i915_gem.c          |  3 ++
>>   drivers/gpu/drm/i915/intel_workarounds.c | 46 ++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/intel_workarounds.h |  2 ++
>>   4 files changed, 55 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>> index 2f3dc1cf83a6..14d019c9455b 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -53,6 +53,7 @@
>>   #include "i915_vgpu.h"
>>   #include "intel_drv.h"
>>   #include "intel_uc.h"
>> +#include "intel_workarounds.h"
>>   
>>   static struct drm_driver driver;
>>   
>> @@ -2362,6 +2363,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>>                  goto out;
>>          }
>>   
>> +       /* Catch GT workarounds affected by engine reset. */
>> +       intel_gt_workarounds_verify(engine->i915, engine->name);
>> +
>>          /*
>>           * The request that caused the hang is stuck on elsp, we know the
>>           * active request and can drop it, adjust head to skip the offending
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 18adb3dd1fcd..1eff471d4366 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -5334,7 +5334,10 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
>>                  I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
>>                             LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
>>   
>> +       /* Apply the GT workarounds... */
>>          intel_gt_workarounds_apply(dev_priv);
>> +       /* ...and determine whether they are sticking. */
>> +       intel_gt_workarounds_verify(dev_priv, "init");
>>   
>>          i915_gem_init_swizzling(dev_priv);
>>   
>> diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
>> index be63a2af3481..a5c0d206b2a4 100644
>> --- a/drivers/gpu/drm/i915/intel_workarounds.c
>> +++ b/drivers/gpu/drm/i915/intel_workarounds.c
>> @@ -981,6 +981,52 @@ void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv)
>>          wa_list_apply(dev_priv, &dev_priv->gt_wa_list);
>>   }
>>   
>> +static void
>> +wa_fail(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
>> +{
>> +       DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x, mask=%x)\n",
>> +                 name, from,
>> +                 i915_mmio_reg_offset(wa->reg),
>> +                 cur, cur & wa->mask, wa->val, wa->mask);
>> +}
>> +
>> +static void
>> +wa_verify_bits(const struct i915_wa *wa, u32 cur, const char *name,
>> +              const char *from)
>> +{
>> +       u32 bits = wa->mask;
>> +       u32 cur_ = cur;
>> +       u32 val_ = wa->val;
> 
> Make sure wa->mask is set to ~0u for whole registers, which it must be
> for the wa_fail.

It is (wa_write), unless you spotted a miss?

> if ((cur ^ wa->val) & wa->mask) {
> 	wa_fail(wa, cur, name, whom);

Not sure if I feel like laughing or crying at my implementation. :) I 
was cleaning up the lot when I realized it all reduces to simplicity, 
but obviously missed this one..

> 	return false; /* make it bool now to save churn later? */
> }

Okay.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2f3dc1cf83a6..14d019c9455b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -53,6 +53,7 @@ 
 #include "i915_vgpu.h"
 #include "intel_drv.h"
 #include "intel_uc.h"
+#include "intel_workarounds.h"
 
 static struct drm_driver driver;
 
@@ -2362,6 +2363,9 @@  int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 		goto out;
 	}
 
+	/* Catch GT workarounds affected by engine reset. */
+	intel_gt_workarounds_verify(engine->i915, engine->name);
+
 	/*
 	 * The request that caused the hang is stuck on elsp, we know the
 	 * active request and can drop it, adjust head to skip the offending
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 18adb3dd1fcd..1eff471d4366 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5334,7 +5334,10 @@  int i915_gem_init_hw(struct drm_i915_private *dev_priv)
 		I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
 			   LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
 
+	/* Apply the GT workarounds... */
 	intel_gt_workarounds_apply(dev_priv);
+	/* ...and determine whether they are sticking. */
+	intel_gt_workarounds_verify(dev_priv, "init");
 
 	i915_gem_init_swizzling(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
index be63a2af3481..a5c0d206b2a4 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -981,6 +981,52 @@  void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv)
 	wa_list_apply(dev_priv, &dev_priv->gt_wa_list);
 }
 
+static void
+wa_fail(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
+{
+	DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x, mask=%x)\n",
+		  name, from,
+		  i915_mmio_reg_offset(wa->reg),
+		  cur, cur & wa->mask, wa->val, wa->mask);
+}
+
+static void
+wa_verify_bits(const struct i915_wa *wa, u32 cur, const char *name,
+	       const char *from)
+{
+	u32 bits = wa->mask;
+	u32 cur_ = cur;
+	u32 val_ = wa->val;
+
+	while (bits) {
+		if ((bits & 1) && ((cur_ & 1) != (val_ & 1))) {
+			wa_fail(wa, cur, name, from);
+			break;
+		}
+
+		bits >>= 1;
+		cur_ >>= 1;
+		val_ >>= 1;
+	}
+}
+
+static void wa_list_verify(struct drm_i915_private *dev_priv,
+			   const struct i915_wa_list *wal,
+			   const char *from)
+{
+	struct i915_wa *wa;
+	unsigned int i;
+
+	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
+		wa_verify_bits(wa, I915_READ(wa->reg), wal->name, from);
+}
+
+void intel_gt_workarounds_verify(struct drm_i915_private *dev_priv,
+				 const char *from)
+{
+	wa_list_verify(dev_priv, &dev_priv->gt_wa_list, from);
+}
+
 struct whitelist {
 	i915_reg_t reg[RING_MAX_NONPRIV_SLOTS];
 	unsigned int count;
diff --git a/drivers/gpu/drm/i915/intel_workarounds.h b/drivers/gpu/drm/i915/intel_workarounds.h
index 2998767d51ca..845c18dc110d 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.h
+++ b/drivers/gpu/drm/i915/intel_workarounds.h
@@ -33,6 +33,8 @@  int intel_ctx_workarounds_emit(struct i915_request *rq);
 
 void intel_gt_workarounds_init(struct drm_i915_private *dev_priv);
 void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv);
+void intel_gt_workarounds_verify(struct drm_i915_private *dev_priv,
+				 const char *from);
 
 void intel_whitelist_workarounds_apply(struct intel_engine_cs *engine);