Message ID | 20180316215001.12391-2-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 16/03/18 14:50, Chris Wilson wrote: > Not all callers want the GPU error to handled in the same way, so expose > a control parameter. In the first instance, some callers do not want the > heavyweight error capture so add a bit to request the state to be > captured and saved. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Jeff McGee <jeff.mcgee@intel.com> > Cc: Mika Kuoppala <mika.kuoppala@intel.com> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- > drivers/gpu/drm/i915/i915_drv.h | 4 +++- > drivers/gpu/drm/i915/i915_irq.c | 22 ++++++++++++++-------- > drivers/gpu/drm/i915/intel_hangcheck.c | 6 +++--- > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +- > 5 files changed, 23 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 5378863e3238..03b74a92caed 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -3952,8 +3952,8 @@ i915_wedged_set(void *data, u64 val) > engine->hangcheck.stalled = true; > } > > - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", > - val); > + i915_handle_error(i915, val, I915_ERROR_CAPTURE, > + "Manually set wedged engine mask = %llx", val); > > wait_on_bit(&i915->gpu_error.flags, > I915_RESET_HANDOFF, > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index e27ba8fb64e6..53009ba50640 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2751,10 +2751,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) > &dev_priv->gpu_error.hangcheck_work, delay); > } > > -__printf(3, 4) > +__printf(4, 5) > void i915_handle_error(struct drm_i915_private *dev_priv, > u32 engine_mask, > + unsigned long flags, > const char *fmt, ...); > +#define I915_ERROR_CAPTURE BIT(0) > Should this be in the new i915_gpu_error.h? > extern void intel_irq_init(struct drm_i915_private *dev_priv); > extern void intel_irq_fini(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 44eef355e12c..b3a4dc7cb26c 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2955,6 +2955,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > * i915_handle_error - handle a gpu error > * @dev_priv: i915 device private > * @engine_mask: mask representing engines that are hung > + * @flags: control flags > * @fmt: Error message format string > * > * Do some basic checking of register state at error time and > @@ -2965,16 +2966,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > */ > void i915_handle_error(struct drm_i915_private *dev_priv, > u32 engine_mask, > + unsigned long flags, > const char *fmt, ...) > { > struct intel_engine_cs *engine; > unsigned int tmp; > - va_list args; > - char error_msg[80]; > - > - va_start(args, fmt); > - vscnprintf(error_msg, sizeof(error_msg), fmt, args); > - va_end(args); > > /* > * In most cases it's guaranteed that we get here with an RPM > @@ -2986,8 +2982,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > intel_runtime_pm_get(dev_priv); > > engine_mask &= INTEL_INFO(dev_priv)->ring_mask; > - i915_capture_error_state(dev_priv, engine_mask, error_msg); > - i915_clear_error_registers(dev_priv); > + > + if (flags & I915_ERROR_CAPTURE) { > + char error_msg[80]; > + va_list args; > + > + va_start(args, fmt); > + vscnprintf(error_msg, sizeof(error_msg), fmt, args); > + va_end(args); > + > + i915_capture_error_state(dev_priv, engine_mask, error_msg); > + i915_clear_error_registers(dev_priv); > + } else DRM_INFO or DRM_NOTE the error_msg ? Otherwise the 'kicking wait/semaphore' text from below will never be printed. > > /* > * Try engine reset when available. We fall back to full reset if > diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c > index 42e45ae87393..13d1a269c771 100644 > --- a/drivers/gpu/drm/i915/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/intel_hangcheck.c > @@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) > */ > tmp = I915_READ_CTL(engine); > if (tmp & RING_WAIT) { > - i915_handle_error(dev_priv, 0, > + i915_handle_error(dev_priv, 0, 0, > "Kicking stuck wait on %s", > engine->name); > I915_WRITE_CTL(engine, tmp); > @@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) > default: > return ENGINE_DEAD; > case 1: > - i915_handle_error(dev_priv, 0, > + i915_handle_error(dev_priv, 0, 0, > "Kicking stuck semaphore on %s", > engine->name); > I915_WRITE_CTL(engine, tmp); > @@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, > "%s, ", engine->name); > msg[len-2] = '\0'; > > - return i915_handle_error(i915, hung, "%s", msg); > + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); > } > > /* > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index df7898c8edcb..84aad2b4825d 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -1084,7 +1084,7 @@ static int igt_handle_error(void *arg) > engine->hangcheck.stalled = true; > engine->hangcheck.seqno = intel_engine_get_seqno(engine); > > - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); > + i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__); > > xchg(&i915->gpu_error.first_error, error); > >
Quoting Michel Thierry (2018-03-19 16:48:01) > On 16/03/18 14:50, Chris Wilson wrote: > > Not all callers want the GPU error to handled in the same way, so expose > > a control parameter. In the first instance, some callers do not want the > > heavyweight error capture so add a bit to request the state to be > > captured and saved. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Jeff McGee <jeff.mcgee@intel.com> > > Cc: Mika Kuoppala <mika.kuoppala@intel.com> > > --- > > drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- > > drivers/gpu/drm/i915/i915_drv.h | 4 +++- > > drivers/gpu/drm/i915/i915_irq.c | 22 ++++++++++++++-------- > > drivers/gpu/drm/i915/intel_hangcheck.c | 6 +++--- > > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +- > > 5 files changed, 23 insertions(+), 15 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > > index 5378863e3238..03b74a92caed 100644 > > --- a/drivers/gpu/drm/i915/i915_debugfs.c > > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > > @@ -3952,8 +3952,8 @@ i915_wedged_set(void *data, u64 val) > > engine->hangcheck.stalled = true; > > } > > > > - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", > > - val); > > + i915_handle_error(i915, val, I915_ERROR_CAPTURE, > > + "Manually set wedged engine mask = %llx", val); > > > > wait_on_bit(&i915->gpu_error.flags, > > I915_RESET_HANDOFF, > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > > index e27ba8fb64e6..53009ba50640 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.h > > +++ b/drivers/gpu/drm/i915/i915_drv.h > > @@ -2751,10 +2751,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) > > &dev_priv->gpu_error.hangcheck_work, delay); > > } > > > > -__printf(3, 4) > > +__printf(4, 5) > > void i915_handle_error(struct drm_i915_private *dev_priv, > > u32 engine_mask, > > + unsigned long flags, > > const char *fmt, ...); > > +#define I915_ERROR_CAPTURE BIT(0) > > > Should this be in the new i915_gpu_error.h? And move it over from i915_irq.c to somewhere more useful? Probably intel_hangcheck.c since i915_gpu_error.c is conditionally compiled. > > extern void intel_irq_init(struct drm_i915_private *dev_priv); > > extern void intel_irq_fini(struct drm_i915_private *dev_priv); > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > > index 44eef355e12c..b3a4dc7cb26c 100644 > > --- a/drivers/gpu/drm/i915/i915_irq.c > > +++ b/drivers/gpu/drm/i915/i915_irq.c > > @@ -2955,6 +2955,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > > * i915_handle_error - handle a gpu error > > * @dev_priv: i915 device private > > * @engine_mask: mask representing engines that are hung > > + * @flags: control flags > > * @fmt: Error message format string > > * > > * Do some basic checking of register state at error time and > > @@ -2965,16 +2966,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) > > */ > > void i915_handle_error(struct drm_i915_private *dev_priv, > > u32 engine_mask, > > + unsigned long flags, > > const char *fmt, ...) > > { > > struct intel_engine_cs *engine; > > unsigned int tmp; > > - va_list args; > > - char error_msg[80]; > > - > > - va_start(args, fmt); > > - vscnprintf(error_msg, sizeof(error_msg), fmt, args); > > - va_end(args); > > > > /* > > * In most cases it's guaranteed that we get here with an RPM > > @@ -2986,8 +2982,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > > intel_runtime_pm_get(dev_priv); > > > > engine_mask &= INTEL_INFO(dev_priv)->ring_mask; > > - i915_capture_error_state(dev_priv, engine_mask, error_msg); > > - i915_clear_error_registers(dev_priv); > > + > > + if (flags & I915_ERROR_CAPTURE) { > > + char error_msg[80]; > > + va_list args; > > + > > + va_start(args, fmt); > > + vscnprintf(error_msg, sizeof(error_msg), fmt, args); > > + va_end(args); > > + > > + i915_capture_error_state(dev_priv, engine_mask, error_msg); > > + i915_clear_error_registers(dev_priv); > > + } > else > DRM_INFO or DRM_NOTE the error_msg ? > > Otherwise the 'kicking wait/semaphore' text from below will never be > printed. I liked it disappearing ;) So we should feed it to i915_reset(const char *reason). That could then probably replace I915_RESET_QUIET. -Chris
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 5378863e3238..03b74a92caed 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3952,8 +3952,8 @@ i915_wedged_set(void *data, u64 val) engine->hangcheck.stalled = true; } - i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", - val); + i915_handle_error(i915, val, I915_ERROR_CAPTURE, + "Manually set wedged engine mask = %llx", val); wait_on_bit(&i915->gpu_error.flags, I915_RESET_HANDOFF, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e27ba8fb64e6..53009ba50640 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2751,10 +2751,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) &dev_priv->gpu_error.hangcheck_work, delay); } -__printf(3, 4) +__printf(4, 5) void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...); +#define I915_ERROR_CAPTURE BIT(0) extern void intel_irq_init(struct drm_i915_private *dev_priv); extern void intel_irq_fini(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 44eef355e12c..b3a4dc7cb26c 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2955,6 +2955,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) * i915_handle_error - handle a gpu error * @dev_priv: i915 device private * @engine_mask: mask representing engines that are hung + * @flags: control flags * @fmt: Error message format string * * Do some basic checking of register state at error time and @@ -2965,16 +2966,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) */ void i915_handle_error(struct drm_i915_private *dev_priv, u32 engine_mask, + unsigned long flags, const char *fmt, ...) { struct intel_engine_cs *engine; unsigned int tmp; - va_list args; - char error_msg[80]; - - va_start(args, fmt); - vscnprintf(error_msg, sizeof(error_msg), fmt, args); - va_end(args); /* * In most cases it's guaranteed that we get here with an RPM @@ -2986,8 +2982,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv, intel_runtime_pm_get(dev_priv); engine_mask &= INTEL_INFO(dev_priv)->ring_mask; - i915_capture_error_state(dev_priv, engine_mask, error_msg); - i915_clear_error_registers(dev_priv); + + if (flags & I915_ERROR_CAPTURE) { + char error_msg[80]; + va_list args; + + va_start(args, fmt); + vscnprintf(error_msg, sizeof(error_msg), fmt, args); + va_end(args); + + i915_capture_error_state(dev_priv, engine_mask, error_msg); + i915_clear_error_registers(dev_priv); + } /* * Try engine reset when available. We fall back to full reset if diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 42e45ae87393..13d1a269c771 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) */ tmp = I915_READ_CTL(engine); if (tmp & RING_WAIT) { - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, 0, 0, "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) default: return ENGINE_DEAD; case 1: - i915_handle_error(dev_priv, 0, + i915_handle_error(dev_priv, 0, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); @@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, "%s, ", engine->name); msg[len-2] = '\0'; - return i915_handle_error(i915, hung, "%s", msg); + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); } /* diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index df7898c8edcb..84aad2b4825d 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -1084,7 +1084,7 @@ static int igt_handle_error(void *arg) engine->hangcheck.stalled = true; engine->hangcheck.seqno = intel_engine_get_seqno(engine); - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); + i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__); xchg(&i915->gpu_error.first_error, error);
Not all callers want the GPU error to handled in the same way, so expose a control parameter. In the first instance, some callers do not want the heavyweight error capture so add a bit to request the state to be captured and saved. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Jeff McGee <jeff.mcgee@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- drivers/gpu/drm/i915/i915_drv.h | 4 +++- drivers/gpu/drm/i915/i915_irq.c | 22 ++++++++++++++-------- drivers/gpu/drm/i915/intel_hangcheck.c | 6 +++--- drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +- 5 files changed, 23 insertions(+), 15 deletions(-)