Message ID | 20200326231810.16852-2-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/2] drm/i915/execlists: Prevent GPU death on ELSP[1] promotion to idle context | expand |
On 26/03/2020 23:18, Chris Wilson wrote: > Upon a GPU reset, we copy the default context image over top of the > guilty image. This will rollback the CTX_TIMESTAMP register to before > our value of ce->runtime.last. Reset both back to 0 so that we do not > encounter an underflow on the next schedule out after resume. > > This should not be a huge issue in practice, as hangs should be rare in > correct code. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/gt/intel_lrc.c | 24 +++++++++++++----------- > 1 file changed, 13 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index 4edda15eba26..47cec545a069 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -238,6 +238,17 @@ __execlists_update_reg_state(const struct intel_context *ce, > const struct intel_engine_cs *engine, > u32 head); > > +static u32 intel_context_get_runtime(const struct intel_context *ce) > +{ > + /* > + * We can use either ppHWSP[16] which is recorded before the context > + * switch (and so excludes the cost of context switches) or use the > + * value from the context image itself, which is saved/restored earlier > + * and so includes the cost of the save. 
> + */ > + return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); > +} > + > static void mark_eio(struct i915_request *rq) > { > if (i915_request_completed(rq)) > @@ -1154,6 +1165,7 @@ static void restore_default_state(struct intel_context *ce, > engine->context_size - PAGE_SIZE); > > execlists_init_reg_state(regs, ce, engine, ce->ring, false); > + ce->runtime.last = intel_context_get_runtime(ce); > } > > static void reset_active(struct i915_request *rq, > @@ -1195,17 +1207,6 @@ static void reset_active(struct i915_request *rq, > ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; > } > > -static u32 intel_context_get_runtime(const struct intel_context *ce) > -{ > - /* > - * We can use either ppHWSP[16] which is recorded before the context > - * switch (and so excludes the cost of context switches) or use the > - * value from the context image itself, which is saved/restored earlier > - * and so includes the cost of the save. > - */ > - return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); > -} > - > static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) > { > #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) > @@ -4581,6 +4582,7 @@ static void init_common_reg_state(u32 * const regs, > regs[CTX_CONTEXT_CONTROL] = ctl; > > regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; > + regs[CTX_TIMESTAMP] = 0; > } > > static void init_wa_bb_reg_state(u32 * const regs, > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 4edda15eba26..47cec545a069 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -238,6 +238,17 @@ __execlists_update_reg_state(const struct intel_context *ce, const struct intel_engine_cs *engine, u32 head); +static u32 intel_context_get_runtime(const struct intel_context *ce) +{ + /* + * We can use either ppHWSP[16] which is recorded before the context + * switch (and so excludes the cost of context switches) or use the + * value from the context image itself, which is saved/restored earlier + * and so includes the cost of the save. + */ + return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); +} + static void mark_eio(struct i915_request *rq) { if (i915_request_completed(rq)) @@ -1154,6 +1165,7 @@ static void restore_default_state(struct intel_context *ce, engine->context_size - PAGE_SIZE); execlists_init_reg_state(regs, ce, engine, ce->ring, false); + ce->runtime.last = intel_context_get_runtime(ce); } static void reset_active(struct i915_request *rq, @@ -1195,17 +1207,6 @@ static void reset_active(struct i915_request *rq, ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; } -static u32 intel_context_get_runtime(const struct intel_context *ce) -{ - /* - * We can use either ppHWSP[16] which is recorded before the context - * switch (and so excludes the cost of context switches) or use the - * value from the context image itself, which is saved/restored earlier - * and so includes the cost of the save. - */ - return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); -} - static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) { #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) @@ -4581,6 +4582,7 @@ static void init_common_reg_state(u32 * const regs, regs[CTX_CONTEXT_CONTROL] = ctl; regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; + regs[CTX_TIMESTAMP] = 0; } static void init_wa_bb_reg_state(u32 * const regs,
Upon a GPU reset, we copy the default context image over top of the guilty image. This will roll back the CTX_TIMESTAMP register to before our value of ce->runtime.last. Reset both back to 0 so that we do not encounter an underflow on the next schedule out after resume. This should not be a huge issue in practice, as hangs should be rare in correct code. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/gt/intel_lrc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-)