Message ID | 20180320100449.1360-2-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 3/20/2018 3:04 AM, Chris Wilson wrote: > Not all callers want the GPU error to handled in the same way, so expose > a control parameter. In the first instance, some callers do not want the > heavyweight error capture so add a bit to request the state to be > captured and saved. > > v2: Pass msg down to i915_reset/i915_reset_engine so that we include the > reason for the reset in the dev_notice(), superseding the earlier option > to not print that notice. > v3: Stash the reason inside the i915->gpu_error to handover to the direct > reset from the blocking waiter. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Jeff McGee <jeff.mcgee@intel.com> > Cc: Mika Kuoppala <mika.kuoppala@intel.com> > Cc: Michel Thierry <michel.thierry@intel.com> Reviewed-by: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 4 +- > drivers/gpu/drm/i915/i915_drv.c | 17 ++++---- > drivers/gpu/drm/i915/i915_drv.h | 10 ++--- > drivers/gpu/drm/i915/i915_gpu_error.h | 3 ++ > drivers/gpu/drm/i915/i915_irq.c | 55 ++++++++++++++---------- > drivers/gpu/drm/i915/i915_request.c | 2 +- > drivers/gpu/drm/i915/intel_hangcheck.c | 13 +++--- > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 13 +++--- > 8 files changed, 62 insertions(+), 55 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 964ea1a12357..7816cd53100a 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val) > engine->hangcheck.stalled = true; > } > > - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", > - val); > + i915_handle_error(i915, val, I915_ERROR_CAPTURE, > + "Manually set wedged engine mask = %llx", val); > > wait_on_bit(&i915->gpu_error.flags, > I915_RESET_HANDOFF, > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index 1021bf40e236..6b04cc3be513 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1869,7 +1869,6 @@ static int i915_resume_switcheroo(struct drm_device *dev) > /** > * i915_reset - reset chip after a hang > * @i915: #drm_i915_private to reset > - * @flags: Instructions > * > * Reset the chip. Useful if a hang is detected. Marks the device as wedged > * on failure. > @@ -1884,7 +1883,7 @@ static int i915_resume_switcheroo(struct drm_device *dev) > * - re-init interrupt state > * - re-init display > */ > -void i915_reset(struct drm_i915_private *i915, unsigned int flags) > +void i915_reset(struct drm_i915_private *i915) > { > struct i915_gpu_error *error = &i915->gpu_error; > int ret; > @@ -1901,8 +1900,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags) > if (!i915_gem_unset_wedged(i915)) > goto wakeup; > > - if (!(flags & I915_RESET_QUIET)) > - dev_notice(i915->drm.dev, "Resetting chip after gpu hang\n"); > + if (error->reason) > + dev_notice(i915->drm.dev, > + "Resetting chip for %s\n", error->reason); > error->reset_count++; > > disable_irq(i915->drm.irq); > @@ -2003,7 +2003,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, > /** > * i915_reset_engine - reset GPU engine to recover from a hang > * @engine: engine to reset > - * @flags: options > + * @msg: reason for GPU reset; or NULL for no dev_notice() > * > * Reset a specific GPU engine. Useful if a hang is detected. > * Returns zero on successful reset or otherwise an error code. > @@ -2013,7 +2013,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, > * - reset engine (which will force the engine to idle) > * - re-init/configure engine > */ > -int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) > +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > { > struct i915_gpu_error *error = &engine->i915->gpu_error; > struct i915_request *active_request; > @@ -2028,10 +2028,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) > goto out; > } > > - if (!(flags & I915_RESET_QUIET)) { > + if (msg) > dev_notice(engine->i915->drm.dev, > - "Resetting %s after gpu hang\n", engine->name); > - } > + "Resetting %s for %s\n", engine->name, msg); > error->reset_engine_count[engine->id]++; > > if (!engine->i915->guc.execbuf_client) > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index e27ba8fb64e6..c9c3b2ba6a86 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev); > extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); > extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); > > -#define I915_RESET_QUIET BIT(0) > -extern void i915_reset(struct drm_i915_private *i915, unsigned int flags); > -extern int i915_reset_engine(struct intel_engine_cs *engine, > - unsigned int flags); > +extern void i915_reset(struct drm_i915_private *i915); > +extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg); > > extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv); > extern int intel_reset_guc(struct drm_i915_private *dev_priv); > @@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) > &dev_priv->gpu_error.hangcheck_work, delay); > } > > -__printf(3, 4) > +__printf(4, 5) > void i915_handle_error(struct drm_i915_private *dev_priv, > u32 engine_mask, > + unsigned long flags, > const char *fmt, ...); > +#define I915_ERROR_CAPTURE BIT(0) > > extern void intel_irq_init(struct drm_i915_private *dev_priv); > extern void intel_irq_fini(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index ebbdf37e2879..ac5760673cc9 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -269,6 +269,9 @@ struct i915_gpu_error { > /** Number of times an engine has been reset */ > u32 reset_engine_count[I915_NUM_ENGINES]; > > + /** Reason for the current *global* reset */ > + const char *reason; > + > /** > * Waitqueue to signal when a hang is detected. Used to for waiters > * to release the struct_mutex for the reset to procede. > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 44eef355e12c..fa7310766217 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg) > return IRQ_HANDLED; > } > > -/** > - * i915_reset_device - do process context error handling work > - * @dev_priv: i915 device private > - * > - * Fire an error uevent so userspace can see that a hang or error > - * was detected. > - */ > -static void i915_reset_device(struct drm_i915_private *dev_priv) > +static void i915_reset_device(struct drm_i915_private *dev_priv, > + const char *msg) > { > + struct i915_gpu_error *error = &dev_priv->gpu_error; > struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; > char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; > char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; > @@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv) > i915_wedge_on_timeout(&w, dev_priv, 5*HZ) { > intel_prepare_reset(dev_priv); > > + error->reason = msg; > + > /* Signal that locked waiters should reset the GPU */ > - set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags); > - wake_up_all(&dev_priv->gpu_error.wait_queue); > + set_bit(I915_RESET_HANDOFF, &error->flags); > + wake_up_all(&error->wait_queue); > > /* Wait for anyone holding the lock to wakeup, without > * blocking indefinitely on struct_mutex. > */ > do { > if (mutex_trylock(&dev_priv->drm.struct_mutex)) { > - i915_reset(dev_priv, 0); > + i915_reset(dev_priv); > mutex_unlock(&dev_priv->drm.struct_mutex); > } > - } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags, > + } while (wait_on_bit_timeout(&error->flags, > I915_RESET_HANDOFF, > TASK_UNINTERRUPTIBLE, > 1)); > > + error->reason = NULL; > + > intel_finish_reset(dev_priv); > } > > - if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) > - kobject_uevent_env(kobj, > - KOBJ_CHANGE, reset_done_event); > + if (!test_bit(I915_WEDGED, &error->flags)) > + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); > } > > static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > @@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > * i915_handle_error - handle a gpu error > * @dev_priv: i915 device private > * @engine_mask: mask representing engines that are hung > + * @flags: control flags > * @fmt: Error message format string > * > * Do some basic checking of register state at error time and > @@ -2965,16 +2964,23 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > */ > void i915_handle_error(struct drm_i915_private *dev_priv, > u32 engine_mask, > + unsigned long flags, > const char *fmt, ...) > { > struct intel_engine_cs *engine; > unsigned int tmp; > - va_list args; > char error_msg[80]; > + char *msg = NULL; > > - va_start(args, fmt); > - vscnprintf(error_msg, sizeof(error_msg), fmt, args); > - va_end(args); > + if (fmt) { > + va_list args; > + > + va_start(args, fmt); > + vscnprintf(error_msg, sizeof(error_msg), fmt, args); > + va_end(args); > + > + msg = error_msg; > + } > > /* > * In most cases it's guaranteed that we get here with an RPM > @@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > intel_runtime_pm_get(dev_priv); > > engine_mask &= INTEL_INFO(dev_priv)->ring_mask; > - i915_capture_error_state(dev_priv, engine_mask, error_msg); > - i915_clear_error_registers(dev_priv); > + > + if (flags & I915_ERROR_CAPTURE) { > + i915_capture_error_state(dev_priv, engine_mask, msg); > + i915_clear_error_registers(dev_priv); > + } > > /* > * Try engine reset when available. We fall back to full reset if > @@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > &dev_priv->gpu_error.flags)) > continue; > > - if (i915_reset_engine(engine, 0) == 0) > + if (i915_reset_engine(engine, msg) == 0) > engine_mask &= ~intel_engine_flag(engine); > > clear_bit(I915_RESET_ENGINE + engine->id, > @@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > TASK_UNINTERRUPTIBLE); > } > > - i915_reset_device(dev_priv); > + i915_reset_device(dev_priv, msg); > > for_each_engine(engine, dev_priv, tmp) { > clear_bit(I915_RESET_ENGINE + engine->id, > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 43c7134a9b93..2325886d1d55 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -1229,7 +1229,7 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request) > return false; > > __set_current_state(TASK_RUNNING); > - i915_reset(request->i915, 0); > + i915_reset(request->i915); > return true; > } > > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c > index c8ea510629fa..fd0ffb8328d0 100644 > --- a/drivers/gpu/drm/i915/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c > @@ -246,9 +246,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) > */ > tmp = I915_READ_CTL(engine); > if (tmp & RING_WAIT) { > - i915_handle_error(dev_priv, BIT(engine->id), > - "Kicking stuck wait on %s", > - engine->name); > + i915_handle_error(dev_priv, BIT(engine->id), 0, > + "stuck wait on %s", engine->name); > I915_WRITE_CTL(engine, tmp); > return ENGINE_WAIT_KICK; > } > @@ -258,8 +257,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) > default: > return ENGINE_DEAD; > case 1: > - i915_handle_error(dev_priv, ALL_ENGINES, > - "Kicking stuck semaphore on %s", > + i915_handle_error(dev_priv, ALL_ENGINES, 0, > + "stuck semaphore on %s", > engine->name); > I915_WRITE_CTL(engine, tmp); > return ENGINE_WAIT_KICK; > @@ -386,13 +385,13 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, > if (stuck != hung) > hung &= ~stuck; > len = scnprintf(msg, sizeof(msg), > - "%s on ", stuck == hung ? "No progress" : "Hang"); > + "%s on ", stuck == hung ? "no progress" : "hang"); > for_each_engine_masked(engine, i915, hung, tmp) > len += scnprintf(msg + len, sizeof(msg) - len, > "%s, ", engine->name); > msg[len-2] = '\0'; > > - return i915_handle_error(i915, hung, "%s", msg); > + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); > } > > /* > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index df7898c8edcb..4372826998aa 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -433,7 +433,7 @@ static int igt_global_reset(void *arg) > mutex_lock(&i915->drm.struct_mutex); > reset_count = i915_reset_count(&i915->gpu_error); > > - i915_reset(i915, I915_RESET_QUIET); > + i915_reset(i915); > > if (i915_reset_count(&i915->gpu_error) == reset_count) { > pr_err("No GPU reset recorded!\n"); > @@ -518,7 +518,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) > engine->hangcheck.seqno = > intel_engine_get_seqno(engine); > > - err = i915_reset_engine(engine, I915_RESET_QUIET); > + err = i915_reset_engine(engine, NULL); > if (err) { > pr_err("i915_reset_engine failed\n"); > break; > @@ -725,7 +725,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, > engine->hangcheck.seqno = > intel_engine_get_seqno(engine); > > - err = i915_reset_engine(engine, I915_RESET_QUIET); > + err = i915_reset_engine(engine, NULL); > if (err) { > pr_err("i915_reset_engine(%s:%s) failed, err=%d\n", > engine->name, active ? "active" : "idle", err); > @@ -865,7 +865,6 @@ static int igt_wait_reset(void *arg) > __func__, rq->fence.seqno, hws_seqno(&h, rq)); > intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); > > - i915_reset(i915, 0); > i915_gem_set_wedged(i915); > > err = -EIO; > @@ -962,7 +961,6 @@ static int igt_reset_queue(void *arg) > i915_request_put(rq); > i915_request_put(prev); > > - i915_reset(i915, 0); > i915_gem_set_wedged(i915); > > err = -EIO; > @@ -971,7 +969,7 @@ static int igt_reset_queue(void *arg) > > reset_count = fake_hangcheck(prev); > > - i915_reset(i915, I915_RESET_QUIET); > + i915_reset(i915); > > GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, > &i915->gpu_error.flags)); > @@ -1069,7 +1067,6 @@ static int igt_handle_error(void *arg) > __func__, rq->fence.seqno, hws_seqno(&h, rq)); > intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); > > - i915_reset(i915, 0); > i915_gem_set_wedged(i915); > > err = -EIO; > @@ -1084,7 +1081,7 @@ static int igt_handle_error(void *arg) > engine->hangcheck.stalled = true; > engine->hangcheck.seqno = intel_engine_get_seqno(engine); > > - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); > + i915_handle_error(i915, intel_engine_flag(engine), 0, NULL); > > xchg(&i915->gpu_error.first_error, error); > >
Quoting Michel Thierry (2018-03-20 14:11:03) > On 3/20/2018 3:04 AM, Chris Wilson wrote: > > Not all callers want the GPU error to handled in the same way, so expose > > a control parameter. In the first instance, some callers do not want the > > heavyweight error capture so add a bit to request the state to be > > captured and saved. > > > > v2: Pass msg down to i915_reset/i915_reset_engine so that we include the > > reason for the reset in the dev_notice(), superseding the earlier option > > to not print that notice. > > v3: Stash the reason inside the i915->gpu_error to handover to the direct > > reset from the blocking waiter. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Jeff McGee <jeff.mcgee@intel.com> > > Cc: Mika Kuoppala <mika.kuoppala@intel.com> > > Cc: Michel Thierry <michel.thierry@intel.com> > > Reviewed-by: Michel Thierry <michel.thierry@intel.com> Thanks for the pointing out the regression and the review, pushed. -Chris
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 964ea1a12357..7816cd53100a 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val) engine->hangcheck.stalled = true; } - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", - val); + i915_handle_error(i915, val, I915_ERROR_CAPTURE, + "Manually set wedged engine mask = %llx", val); wait_on_bit(&i915->gpu_error.flags, I915_RESET_HANDOFF, diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 1021bf40e236..6b04cc3be513 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1869,7 +1869,6 @@ static int i915_resume_switcheroo(struct drm_device *dev) /** * i915_reset - reset chip after a hang * @i915: #drm_i915_private to reset - * @flags: Instructions * * Reset the chip. Useful if a hang is detected. Marks the device as wedged * on failure. @@ -1884,7 +1883,7 @@ static int i915_resume_switcheroo(struct drm_device *dev) * - re-init interrupt state * - re-init display */ -void i915_reset(struct drm_i915_private *i915, unsigned int flags) +void i915_reset(struct drm_i915_private *i915) { struct i915_gpu_error *error = &i915->gpu_error; int ret; @@ -1901,8 +1900,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags) if (!i915_gem_unset_wedged(i915)) goto wakeup; - if (!(flags & I915_RESET_QUIET)) - dev_notice(i915->drm.dev, "Resetting chip after gpu hang\n"); + if (error->reason) + dev_notice(i915->drm.dev, + "Resetting chip for %s\n", error->reason); error->reset_count++; disable_irq(i915->drm.irq); @@ -2003,7 +2003,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, /** * i915_reset_engine - reset GPU engine to recover from a hang * @engine: engine to reset - * @flags: options + * @msg: reason for GPU reset; or NULL for no dev_notice() * * Reset a specific GPU engine. Useful if a hang is detected. * Returns zero on successful reset or otherwise an error code. @@ -2013,7 +2013,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, * - reset engine (which will force the engine to idle) * - re-init/configure engine */ -int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) { struct i915_gpu_error *error = &engine->i915->gpu_error; struct i915_request *active_request; @@ -2028,10 +2028,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) goto out; } - if (!(flags & I915_RESET_QUIET)) { + if (msg) dev_notice(engine->i915->drm.dev, - "Resetting %s after gpu hang\n", engine->name); - } + "Resetting %s for %s\n", engine->name, msg); error->reset_engine_count[engine->id]++; if (!engine->i915->guc.execbuf_client) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e27ba8fb64e6..c9c3b2ba6a86 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev); extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); -#define I915_RESET_QUIET BIT(0) -extern void i915_reset(struct drm_i915_private *i915, unsigned int flags); -extern int i915_reset_engine(struct intel_engine_cs *engine, - unsigned int flags); +extern void i915_reset(struct drm_i915_private *i915); +extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg); extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv); extern int intel_reset_guc(struct drm_i915_private *dev_priv); @@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) &dev_priv->gpu_error.hangcheck_work, delay); } -__printf(3, 4) +__printf(4, 5) void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...); +#define I915_ERROR_CAPTURE BIT(0) extern void intel_irq_init(struct drm_i915_private *dev_priv); extern void intel_irq_fini(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index ebbdf37e2879..ac5760673cc9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -269,6 +269,9 @@ struct i915_gpu_error { /** Number of times an engine has been reset */ u32 reset_engine_count[I915_NUM_ENGINES]; + /** Reason for the current *global* reset */ + const char *reason; + /** * Waitqueue to signal when a hang is detected. Used to for waiters * to release the struct_mutex for the reset to procede. diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 44eef355e12c..fa7310766217 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg) return IRQ_HANDLED; } -/** - * i915_reset_device - do process context error handling work - * @dev_priv: i915 device private - * - * Fire an error uevent so userspace can see that a hang or error - * was detected. - */ -static void i915_reset_device(struct drm_i915_private *dev_priv) +static void i915_reset_device(struct drm_i915_private *dev_priv, + const char *msg) { + struct i915_gpu_error *error = &dev_priv->gpu_error; struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; @@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv) i915_wedge_on_timeout(&w, dev_priv, 5*HZ) { intel_prepare_reset(dev_priv); + error->reason = msg; + /* Signal that locked waiters should reset the GPU */ - set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags); - wake_up_all(&dev_priv->gpu_error.wait_queue); + set_bit(I915_RESET_HANDOFF, &error->flags); + wake_up_all(&error->wait_queue); /* Wait for anyone holding the lock to wakeup, without * blocking indefinitely on struct_mutex. */ do { if (mutex_trylock(&dev_priv->drm.struct_mutex)) { - i915_reset(dev_priv, 0); + i915_reset(dev_priv); mutex_unlock(&dev_priv->drm.struct_mutex); } - } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags, + } while (wait_on_bit_timeout(&error->flags, I915_RESET_HANDOFF, TASK_UNINTERRUPTIBLE, 1)); + error->reason = NULL; + intel_finish_reset(dev_priv); } - if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) - kobject_uevent_env(kobj, - KOBJ_CHANGE, reset_done_event); + if (!test_bit(I915_WEDGED, &error->flags)) + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); } static void i915_clear_error_registers(struct drm_i915_private *dev_priv) @@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) * i915_handle_error - handle a gpu error * @dev_priv: i915 device private * @engine_mask: mask representing engines that are hung + * @flags: control flags * @fmt: Error message format string * * Do some basic checking of register state at error time and @@ -2965,16 +2964,23 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) */ void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...) { struct intel_engine_cs *engine; unsigned int tmp; - va_list args; char error_msg[80]; + char *msg = NULL; - va_start(args, fmt); - vscnprintf(error_msg, sizeof(error_msg), fmt, args); - va_end(args); + if (fmt) { + va_list args; + + va_start(args, fmt); + vscnprintf(error_msg, sizeof(error_msg), fmt, args); + va_end(args); + + msg = error_msg; + } /* * In most cases it's guaranteed that we get here with an RPM @@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv, intel_runtime_pm_get(dev_priv); engine_mask &= INTEL_INFO(dev_priv)->ring_mask; - i915_capture_error_state(dev_priv, engine_mask, error_msg); - i915_clear_error_registers(dev_priv); + + if (flags & I915_ERROR_CAPTURE) { + i915_capture_error_state(dev_priv, engine_mask, msg); + i915_clear_error_registers(dev_priv); + } /* * Try engine reset when available. We fall back to full reset if @@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, &dev_priv->gpu_error.flags)) continue; - if (i915_reset_engine(engine, 0) == 0) + if (i915_reset_engine(engine, msg) == 0) engine_mask &= ~intel_engine_flag(engine); clear_bit(I915_RESET_ENGINE + engine->id, @@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, TASK_UNINTERRUPTIBLE); } - i915_reset_device(dev_priv); + i915_reset_device(dev_priv, msg); for_each_engine(engine, dev_priv, tmp) { clear_bit(I915_RESET_ENGINE + engine->id, diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 43c7134a9b93..2325886d1d55 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1229,7 +1229,7 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request) return false; __set_current_state(TASK_RUNNING); - i915_reset(request->i915, 0); + i915_reset(request->i915); return true; } diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index c8ea510629fa..fd0ffb8328d0 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -246,9 +246,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) */ tmp = I915_READ_CTL(engine); if (tmp & RING_WAIT) { - i915_handle_error(dev_priv, BIT(engine->id), - "Kicking stuck wait on %s", - engine->name); + i915_handle_error(dev_priv, BIT(engine->id), 0, + "stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); return ENGINE_WAIT_KICK; } @@ -258,8 +257,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) default: return ENGINE_DEAD; case 1: - i915_handle_error(dev_priv, ALL_ENGINES, - "Kicking stuck semaphore on %s", + i915_handle_error(dev_priv, ALL_ENGINES, 0, + "stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); return ENGINE_WAIT_KICK; @@ -386,13 +385,13 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, if (stuck != hung) hung &= ~stuck; len = scnprintf(msg, sizeof(msg), - "%s on ", stuck == hung ? "No progress" : "Hang"); + "%s on ", stuck == hung ? "no progress" : "hang"); for_each_engine_masked(engine, i915, hung, tmp) len += scnprintf(msg + len, sizeof(msg) - len, "%s, ", engine->name); msg[len-2] = '\0'; - return i915_handle_error(i915, hung, "%s", msg); + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); } /* diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index df7898c8edcb..4372826998aa 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -433,7 +433,7 @@ static int igt_global_reset(void *arg) mutex_lock(&i915->drm.struct_mutex); reset_count = i915_reset_count(&i915->gpu_error); - i915_reset(i915, I915_RESET_QUIET); + i915_reset(i915); if (i915_reset_count(&i915->gpu_error) == reset_count) { pr_err("No GPU reset recorded!\n"); @@ -518,7 +518,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) engine->hangcheck.seqno = intel_engine_get_seqno(engine); - err = i915_reset_engine(engine, I915_RESET_QUIET); + err = i915_reset_engine(engine, NULL); if (err) { pr_err("i915_reset_engine failed\n"); break; @@ -725,7 +725,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, engine->hangcheck.seqno = intel_engine_get_seqno(engine); - err = i915_reset_engine(engine, I915_RESET_QUIET); + err = i915_reset_engine(engine, NULL); if (err) { pr_err("i915_reset_engine(%s:%s) failed, err=%d\n", engine->name, active ? "active" : "idle", err); @@ -865,7 +865,6 @@ static int igt_wait_reset(void *arg) __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; @@ -962,7 +961,6 @@ static int igt_reset_queue(void *arg) i915_request_put(rq); i915_request_put(prev); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; @@ -971,7 +969,7 @@ static int igt_reset_queue(void *arg) reset_count = fake_hangcheck(prev); - i915_reset(i915, I915_RESET_QUIET); + i915_reset(i915); GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); @@ -1069,7 +1067,6 @@ static int igt_handle_error(void *arg) __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; @@ -1084,7 +1081,7 @@ static int igt_handle_error(void *arg) engine->hangcheck.stalled = true; engine->hangcheck.seqno = intel_engine_get_seqno(engine); - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); + i915_handle_error(i915, intel_engine_flag(engine), 0, NULL); xchg(&i915->gpu_error.first_error, error);
Not all callers want the GPU error to handled in the same way, so expose a control parameter. In the first instance, some callers do not want the heavyweight error capture so add a bit to request the state to be captured and saved. v2: Pass msg down to i915_reset/i915_reset_engine so that we include the reason for the reset in the dev_notice(), superseding the earlier option to not print that notice. v3: Stash the reason inside the i915->gpu_error to handover to the direct reset from the blocking waiter. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Jeff McGee <jeff.mcgee@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Michel Thierry <michel.thierry@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 4 +- drivers/gpu/drm/i915/i915_drv.c | 17 ++++---- drivers/gpu/drm/i915/i915_drv.h | 10 ++--- drivers/gpu/drm/i915/i915_gpu_error.h | 3 ++ drivers/gpu/drm/i915/i915_irq.c | 55 ++++++++++++++---------- drivers/gpu/drm/i915/i915_request.c | 2 +- drivers/gpu/drm/i915/intel_hangcheck.c | 13 +++--- drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 13 +++--- 8 files changed, 62 insertions(+), 55 deletions(-)