Message ID | 20191119100929.2628356-17-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/17] drm/i915/gem: Manually dump the debug trace on GEM_BUG_ON | expand |
Hi Chris, On Tue, Nov 19, 2019 at 10:09:29AM +0000, Chris Wilson wrote: > Reading from CTX_INFO upsets rc6, requiring us to detect and prevent > possible rc6 context corruption. Poke at the bear! > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Imre Deak <imre.deak@intel.com> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> it looks good, Reviewed-by: Andi Shyti <andi.shyti@intel.com> Tested-by: Andi Shyti <andi.shyti@intel.com> just a question, though > diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > index d1752f15702a..1d5bf93d1258 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > @@ -6,6 +6,7 @@ > */ > > #include "selftest_llc.h" > +#include "selftest_rc6.h" > > static int live_gt_resume(void *arg) > { > @@ -58,3 +59,15 @@ int intel_gt_pm_live_selftests(struct drm_i915_private *i915) > > return intel_gt_live_subtests(tests, &i915->gt); > } > + > +int intel_gt_pm_late_selftests(struct drm_i915_private *i915) > +{ > + static const struct i915_subtest tests[] = { > + SUBTEST(live_rc6_ctx), > + }; > + > + if (intel_gt_is_wedged(&i915->gt)) > + return 0; > + > + return intel_gt_live_subtests(tests, &i915->gt); > +} are you thinking of making this file a hub for all power management selftests? wouldn't it be more neat if rc6, rps and others have their own selftests declared independently, considering that we might want to add more of them? > +static const u32 *__live_rc6_ctx(struct intel_context *ce) > +{ > + struct i915_request *rq; > + u32 const *result; 'const' here? you like reckless life! :) Andi
Quoting Andi Shyti (2019-11-19 15:24:59) > Hi Chris, > > On Tue, Nov 19, 2019 at 10:09:29AM +0000, Chris Wilson wrote: > > Reading from CTX_INFO upsets rc6, requiring us to detect and prevent > > possible rc6 context corruption. Poke at the bear! > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Imre Deak <imre.deak@intel.com> > > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > > it looks good, > > Reviewed-by: Andi Shyti <andi.shyti@intel.com> > Tested-by: Andi Shyti <andi.shyti@intel.com> > > just a question, though > > > diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > > index d1752f15702a..1d5bf93d1258 100644 > > --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > > +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c > > @@ -6,6 +6,7 @@ > > */ > > > > #include "selftest_llc.h" > > +#include "selftest_rc6.h" > > > > static int live_gt_resume(void *arg) > > { > > @@ -58,3 +59,15 @@ int intel_gt_pm_live_selftests(struct drm_i915_private *i915) > > > > return intel_gt_live_subtests(tests, &i915->gt); > > } > > + > > +int intel_gt_pm_late_selftests(struct drm_i915_private *i915) > > +{ > > + static const struct i915_subtest tests[] = { > > + SUBTEST(live_rc6_ctx), > > + }; > > + > > + if (intel_gt_is_wedged(&i915->gt)) > > + return 0; > > + > > + return intel_gt_live_subtests(tests, &i915->gt); > > +} > > are you thinking of making this file a hub for all power > management selftests? wouldn't it be more neat if rc6, rps and > others have their own selftests declared independently, > considering that we might want to add more of them? The hub is over at intel_gt_pm_selftests :) I started by adding this one there, except that this deliberately breaks the system and so should be run at the end of CI and rebooted afterwards. > > > +static const u32 *__live_rc6_ctx(struct intel_context *ce) > > +{ > > + struct i915_request *rq; > > + u32 const *result; > > 'const' here? you like reckless life! :) Shame on me. -Chris
diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index 977a617a196d..7a0bc6dde009 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -783,3 +783,7 @@ u64 intel_rc6_residency_us(struct intel_rc6 *rc6, i915_reg_t reg) { return DIV_ROUND_UP_ULL(intel_rc6_residency_ns(rc6, reg), 1000); } + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_rc6.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c index d1752f15702a..1d5bf93d1258 100644 --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c @@ -6,6 +6,7 @@ */ #include "selftest_llc.h" +#include "selftest_rc6.h" static int live_gt_resume(void *arg) { @@ -58,3 +59,15 @@ int intel_gt_pm_live_selftests(struct drm_i915_private *i915) return intel_gt_live_subtests(tests, &i915->gt); } + +int intel_gt_pm_late_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_rc6_ctx), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c new file mode 100644 index 000000000000..c8b729f7e93e --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -0,0 +1,146 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt_requests.h" +#include "intel_ring.h" +#include "selftest_rc6.h" + +#include "selftests/i915_random.h" + +static const u32 *__live_rc6_ctx(struct intel_context *ce) +{ + struct i915_request *rq; + u32 const *result; + u32 cmd; + u32 *cs; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return ERR_CAST(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return cs; + } + + cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT; + if (INTEL_GEN(rq->i915) >= 8) + cmd++; + + *cs++ = cmd; + *cs++ = i915_mmio_reg_offset(GEN8_RC6_CTX_INFO); + *cs++ = ce->timeline->hwsp_offset + 8; + *cs++ = 0; + intel_ring_advance(rq, cs); + + result = rq->hwsp_seqno + 2; + i915_request_add(rq); + + return result; +} + +static struct intel_engine_cs ** +randomised_engines(struct intel_gt *gt, + struct rnd_state *prng, + unsigned int *count) +{ + struct intel_engine_cs *engine, **engines; + enum intel_engine_id id; + int n; + + n = 0; + for_each_engine(engine, gt, id) + n++; + if (!n) + return NULL; + + engines = kmalloc_array(n, sizeof(*engines), GFP_KERNEL); + if (!engines) + return NULL; + + n = 0; + for_each_engine(engine, gt, id) + engines[n++] = engine; + + i915_prandom_shuffle(engines, sizeof(*engines), n, prng); + + *count = n; + return engines; +} + +int live_rc6_ctx(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs **engines; + unsigned int n, count; + I915_RND_STATE(prng); + int err = 0; + + /* A read of CTX_INFO upsets rc6. Poke the bear! */ + if (INTEL_GEN(gt->i915) < 8) + return 0; + + engines = randomised_engines(gt, &prng, &count); + if (!engines) + return 0; + + for (n = 0; n < count; n++) { + struct intel_engine_cs *engine = engines[n]; + int pass; + + for (pass = 0; pass < 2; pass++) { + struct intel_context *ce; + unsigned int resets = + i915_reset_engine_count(>->i915->gpu_error, + engine); + const u32 *res; + + /* Use a sacrifical context */ + ce = intel_context_create(engine->kernel_context->gem_context, + engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + intel_engine_pm_get(engine); + res = __live_rc6_ctx(ce); + intel_engine_pm_put(engine); + intel_context_put(ce); + if (IS_ERR(res)) { + err = PTR_ERR(res); + goto out; + } + + if (intel_gt_wait_for_idle(gt, HZ / 5) == -ETIME) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out; + } + + intel_gt_pm_wait_for_idle(gt); + pr_debug("%s: CTX_INFO=%0x\n", + engine->name, READ_ONCE(*res)); + + if (resets != + i915_reset_engine_count(>->i915->gpu_error, + engine)) { + pr_err("%s: GPU reset required\n", + engine->name); + add_taint_for_CI(TAINT_WARN); + err = -EIO; + goto out; + } + } + } + +out: + kfree(engines); + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.h b/drivers/gpu/drm/i915/gt/selftest_rc6.h new file mode 100644 index 000000000000..230c6b4c7dc0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.h @@ -0,0 +1,12 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef SELFTEST_RC6_H +#define SELFTEST_RC6_H + +int live_rc6_ctx(void *arg); + +#endif /* SELFTEST_RC6_H */ diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index 11b40bc58e6d..beff59ee9f6f 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -39,3 +39,4 @@ selftest(hangcheck, intel_hangcheck_live_selftests) selftest(execlists, intel_execlists_live_selftests) selftest(guc, intel_guc_live_selftest) selftest(perf, i915_perf_live_selftests) +selftest(late_gt_pm, intel_gt_pm_late_selftests)
Reading from CTX_INFO upsets rc6, requiring us to detect and prevent possible rc6 context corruption. Poke at the bear! Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Imre Deak <imre.deak@intel.com> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> --- drivers/gpu/drm/i915/gt/intel_rc6.c | 4 + drivers/gpu/drm/i915/gt/selftest_gt_pm.c | 13 ++ drivers/gpu/drm/i915/gt/selftest_rc6.c | 146 ++++++++++++++++++ drivers/gpu/drm/i915/gt/selftest_rc6.h | 12 ++ .../drm/i915/selftests/i915_live_selftests.h | 1 + 5 files changed, 176 insertions(+) create mode 100644 drivers/gpu/drm/i915/gt/selftest_rc6.c create mode 100644 drivers/gpu/drm/i915/gt/selftest_rc6.h