diff mbox series

[17/17] drm/i915/selftests: Exercise rc6 handling

Message ID 20191119100929.2628356-17-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/17] drm/i915/gem: Manually dump the debug trace on GEM_BUG_ON | expand

Commit Message

Chris Wilson Nov. 19, 2019, 10:09 a.m. UTC
Reading from CTX_INFO upsets rc6, requiring us to detect and prevent
possible rc6 context corruption. Poke at the bear!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rc6.c           |   4 +
 drivers/gpu/drm/i915/gt/selftest_gt_pm.c      |  13 ++
 drivers/gpu/drm/i915/gt/selftest_rc6.c        | 146 ++++++++++++++++++
 drivers/gpu/drm/i915/gt/selftest_rc6.h        |  12 ++
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 5 files changed, 176 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gt/selftest_rc6.c
 create mode 100644 drivers/gpu/drm/i915/gt/selftest_rc6.h

Comments

Andi Shyti Nov. 19, 2019, 3:24 p.m. UTC | #1
Hi Chris,

On Tue, Nov 19, 2019 at 10:09:29AM +0000, Chris Wilson wrote:
> Reading from CTX_INFO upsets rc6, requiring us to detect and prevent
> possible rc6 context corruption. Poke at the bear!
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Imre Deak <imre.deak@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

it looks good,

Reviewed-by: Andi Shyti <andi.shyti@intel.com>
Tested-by: Andi Shyti <andi.shyti@intel.com>

just a question, though

> diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> index d1752f15702a..1d5bf93d1258 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> @@ -6,6 +6,7 @@
>   */
>  
>  #include "selftest_llc.h"
> +#include "selftest_rc6.h"
>  
>  static int live_gt_resume(void *arg)
>  {
> @@ -58,3 +59,15 @@ int intel_gt_pm_live_selftests(struct drm_i915_private *i915)
>  
>  	return intel_gt_live_subtests(tests, &i915->gt);
>  }
> +
> +int intel_gt_pm_late_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_rc6_ctx),
> +	};
> +
> +	if (intel_gt_is_wedged(&i915->gt))
> +		return 0;
> +
> +	return intel_gt_live_subtests(tests, &i915->gt);
> +}

are you thinking of making this file a hub for all power
management selftests? wouldn't it be more neat if rc6, rps and
others have their own selftests declared independently,
considering that we might want to add more of them?

> +static const u32 *__live_rc6_ctx(struct intel_context *ce)
> +{
> +	struct i915_request *rq;
> +	u32 const *result;

'const' here? you like reckless life! :)

Andi
Chris Wilson Nov. 19, 2019, 3:27 p.m. UTC | #2
Quoting Andi Shyti (2019-11-19 15:24:59)
> Hi Chris,
> 
> On Tue, Nov 19, 2019 at 10:09:29AM +0000, Chris Wilson wrote:
> > Reading from CTX_INFO upsets rc6, requiring us to detect and prevent
> > possible rc6 context corruption. Poke at the bear!
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Imre Deak <imre.deak@intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> 
> it looks good,
> 
> Reviewed-by: Andi Shyti <andi.shyti@intel.com>
> Tested-by: Andi Shyti <andi.shyti@intel.com>
> 
> just a question, though
> 
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> > index d1752f15702a..1d5bf93d1258 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
> > @@ -6,6 +6,7 @@
> >   */
> >  
> >  #include "selftest_llc.h"
> > +#include "selftest_rc6.h"
> >  
> >  static int live_gt_resume(void *arg)
> >  {
> > @@ -58,3 +59,15 @@ int intel_gt_pm_live_selftests(struct drm_i915_private *i915)
> >  
> >       return intel_gt_live_subtests(tests, &i915->gt);
> >  }
> > +
> > +int intel_gt_pm_late_selftests(struct drm_i915_private *i915)
> > +{
> > +     static const struct i915_subtest tests[] = {
> > +             SUBTEST(live_rc6_ctx),
> > +     };
> > +
> > +     if (intel_gt_is_wedged(&i915->gt))
> > +             return 0;
> > +
> > +     return intel_gt_live_subtests(tests, &i915->gt);
> > +}
> 
> are you thinking of making this file a hub for all power
> management selftests? wouldn't it be more neat if rc6, rps and
> others have their own selftests declared independently,
> considering that we might want to add more of them?

The hub is over at intel_gt_pm_selftests :)

I started by adding this one there, except that this deliberately breaks
the system and so should be run at the end of CI and rebooted
afterwards.

> 
> > +static const u32 *__live_rc6_ctx(struct intel_context *ce)
> > +{
> > +     struct i915_request *rq;
> > +     u32 const *result;
> 
> 'const' here? you like reckless life! :)

Shame on me.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c
index 977a617a196d..7a0bc6dde009 100644
--- a/drivers/gpu/drm/i915/gt/intel_rc6.c
+++ b/drivers/gpu/drm/i915/gt/intel_rc6.c
@@ -783,3 +783,7 @@  u64 intel_rc6_residency_us(struct intel_rc6 *rc6, i915_reg_t reg)
 {
 	return DIV_ROUND_UP_ULL(intel_rc6_residency_ns(rc6, reg), 1000);
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_rc6.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
index d1752f15702a..1d5bf93d1258 100644
--- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c
@@ -6,6 +6,7 @@ 
  */
 
 #include "selftest_llc.h"
+#include "selftest_rc6.h"
 
 static int live_gt_resume(void *arg)
 {
@@ -58,3 +59,15 @@  int intel_gt_pm_live_selftests(struct drm_i915_private *i915)
 
 	return intel_gt_live_subtests(tests, &i915->gt);
 }
+
+int intel_gt_pm_late_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_rc6_ctx),
+	};
+
+	if (intel_gt_is_wedged(&i915->gt))
+		return 0;
+
+	return intel_gt_live_subtests(tests, &i915->gt);
+}
diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
new file mode 100644
index 000000000000..c8b729f7e93e
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
@@ -0,0 +1,146 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "intel_context.h"
+#include "intel_engine_pm.h"
+#include "intel_gt_requests.h"
+#include "intel_ring.h"
+#include "selftest_rc6.h"
+
+#include "selftests/i915_random.h"
+
+static const u32 *__live_rc6_ctx(struct intel_context *ce)
+{
+	struct i915_request *rq;
+	u32 const *result;
+	u32 cmd;
+	u32 *cs;
+
+	rq = intel_context_create_request(ce);
+	if (IS_ERR(rq))
+		return ERR_CAST(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		i915_request_add(rq);
+		return cs;
+	}
+
+	cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT;
+	if (INTEL_GEN(rq->i915) >= 8)
+		cmd++;
+
+	*cs++ = cmd;
+	*cs++ = i915_mmio_reg_offset(GEN8_RC6_CTX_INFO);
+	*cs++ = ce->timeline->hwsp_offset + 8;
+	*cs++ = 0;
+	intel_ring_advance(rq, cs);
+
+	result = rq->hwsp_seqno + 2;
+	i915_request_add(rq);
+
+	return result;
+}
+
+static struct intel_engine_cs **
+randomised_engines(struct intel_gt *gt,
+		   struct rnd_state *prng,
+		   unsigned int *count)
+{
+	struct intel_engine_cs *engine, **engines;
+	enum intel_engine_id id;
+	int n;
+
+	n = 0;
+	for_each_engine(engine, gt, id)
+		n++;
+	if (!n)
+		return NULL;
+
+	engines = kmalloc_array(n, sizeof(*engines), GFP_KERNEL);
+	if (!engines)
+		return NULL;
+
+	n = 0;
+	for_each_engine(engine, gt, id)
+		engines[n++] = engine;
+
+	i915_prandom_shuffle(engines, sizeof(*engines), n, prng);
+
+	*count = n;
+	return engines;
+}
+
+int live_rc6_ctx(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs **engines;
+	unsigned int n, count;
+	I915_RND_STATE(prng);
+	int err = 0;
+
+	/* A read of CTX_INFO upsets rc6. Poke the bear! */
+	if (INTEL_GEN(gt->i915) < 8)
+		return 0;
+
+	engines = randomised_engines(gt, &prng, &count);
+	if (!engines)
+		return 0;
+
+	for (n = 0; n < count; n++) {
+		struct intel_engine_cs *engine = engines[n];
+		int pass;
+
+		for (pass = 0; pass < 2; pass++) {
+			struct intel_context *ce;
+			unsigned int resets =
+				i915_reset_engine_count(&gt->i915->gpu_error,
+							engine);
+			const u32 *res;
+
+			/* Use a sacrifical context */
+			ce = intel_context_create(engine->kernel_context->gem_context,
+						  engine);
+			if (IS_ERR(ce)) {
+				err = PTR_ERR(ce);
+				goto out;
+			}
+
+			intel_engine_pm_get(engine);
+			res = __live_rc6_ctx(ce);
+			intel_engine_pm_put(engine);
+			intel_context_put(ce);
+			if (IS_ERR(res)) {
+				err = PTR_ERR(res);
+				goto out;
+			}
+
+			if (intel_gt_wait_for_idle(gt, HZ / 5) == -ETIME) {
+				intel_gt_set_wedged(gt);
+				err = -ETIME;
+				goto out;
+			}
+
+			intel_gt_pm_wait_for_idle(gt);
+			pr_debug("%s: CTX_INFO=%0x\n",
+				 engine->name, READ_ONCE(*res));
+
+			if (resets !=
+			    i915_reset_engine_count(&gt->i915->gpu_error,
+						    engine)) {
+				pr_err("%s: GPU reset required\n",
+				       engine->name);
+				add_taint_for_CI(TAINT_WARN);
+				err = -EIO;
+				goto out;
+			}
+		}
+	}
+
+out:
+	kfree(engines);
+	return err;
+}
diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.h b/drivers/gpu/drm/i915/gt/selftest_rc6.h
new file mode 100644
index 000000000000..230c6b4c7dc0
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.h
@@ -0,0 +1,12 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef SELFTEST_RC6_H
+#define SELFTEST_RC6_H
+
+int live_rc6_ctx(void *arg);
+
+#endif /* SELFTEST_RC6_H */
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index 11b40bc58e6d..beff59ee9f6f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -39,3 +39,4 @@  selftest(hangcheck, intel_hangcheck_live_selftests)
 selftest(execlists, intel_execlists_live_selftests)
 selftest(guc, intel_guc_live_selftest)
 selftest(perf, i915_perf_live_selftests)
+selftest(late_gt_pm, intel_gt_pm_late_selftests)