diff mbox

[07/20] drm/i915: Watchdog timeout: Hang detection integration into error handler

Message ID 1452706112-8617-8-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

arun.siluvery@linux.intel.com Jan. 13, 2016, 5:28 p.m. UTC
From: Tomas Elf <tomas.elf@intel.com>

This patch enables watchdog timeout hang detection as an entry point into the
driver error handler. This form of hang detection bypasses the promotion logic
normally used by the periodic hang checker (which escalates a per-engine reset
to a full GPU reset if the engine was reset too recently) and instead allows
for direct access to the per-engine hang recovery path.

NOTE: I don't know if Ben Widawsky had any part in this code from 3 years
ago. There have been so many people involved in this already that I am in no
position to know. If I've missed anyone's Signed-off-by line, please let me
know.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.h     |  6 +++---
 drivers/gpu/drm/i915/i915_irq.c     | 43 ++++++++++++++++++++++---------------
 3 files changed, 30 insertions(+), 21 deletions(-)

Comments

Chris Wilson Jan. 13, 2016, 9:13 p.m. UTC | #1
On Wed, Jan 13, 2016 at 05:28:19PM +0000, Arun Siluvery wrote:
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -__printf(4, 5)
> -void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
> -		       const char *fmt, ...);
> +__printf(5, 6)
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask,
> +		       bool watchdog, bool wedged, const char *fmt, ...);
>  
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
>  int intel_irq_install(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 8937c82..0710724 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2726,6 +2726,7 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>   *			If a previous engine reset was attempted too recently
>   *			or if one of the current engine resets fails we fall
>   *			back to legacy full GPU reset.
> + * @watchdog: 		true = Engine hang detected by hardware watchdog.
>   * @wedged: 		true = Hang detected, invoke hang recovery.

A bitmask and 2 booleans? Whilst this isn't going to be the most widely
used of functions, those parameters are just inviting trouble.

>   * @fmt, ...: 		Error message describing reason for error.
>   *
> @@ -2737,8 +2738,8 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>   * reset the associated engine. Failing that, try to fall back to legacy
>   * full GPU reset recovery mode.
>   */
> -void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
> -		       const char *fmt, ...)
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask,
> +                       bool watchdog, bool wedged, const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	va_list args;
> @@ -2776,20 +2777,27 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
>  			u32 i;
>  
>  			for_each_ring(engine, dev_priv, i) {
> -				u32 now, last_engine_reset_timediff;

Oops skipped a patch, I'll be back.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6d1b6c3..dabddda 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4720,7 +4720,7 @@  i915_wedged_set(void *data, u64 val)
 
 	intel_runtime_pm_get(dev_priv);
 
-	i915_handle_error(dev, 0x0, val,
+	i915_handle_error(dev, 0x0, false, val,
 			  "Manually setting wedged to %llu", val);
 
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 072ca37..80e6d01 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2766,9 +2766,9 @@  static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-__printf(4, 5)
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
-		       const char *fmt, ...);
+__printf(5, 6)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+		       bool watchdog, bool wedged, const char *fmt, ...);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 int intel_irq_install(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8937c82..0710724 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2726,6 +2726,7 @@  static void i915_report_and_clear_eir(struct drm_device *dev)
  *			If a previous engine reset was attempted too recently
  *			or if one of the current engine resets fails we fall
  *			back to legacy full GPU reset.
+ * @watchdog: 		true = Engine hang detected by hardware watchdog.
  * @wedged: 		true = Hang detected, invoke hang recovery.
  * @fmt, ...: 		Error message describing reason for error.
  *
@@ -2737,8 +2738,8 @@  static void i915_report_and_clear_eir(struct drm_device *dev)
  * reset the associated engine. Failing that, try to fall back to legacy
  * full GPU reset recovery mode.
  */
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
-		       const char *fmt, ...)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+                       bool watchdog, bool wedged, const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	va_list args;
@@ -2776,20 +2777,27 @@  void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 			u32 i;
 
 			for_each_ring(engine, dev_priv, i) {
-				u32 now, last_engine_reset_timediff;
 
 				if (!(intel_ring_flag(engine) & engine_mask))
 					continue;
 
-				/* Measure the time since this engine was last reset */
-				now = get_seconds();
-				last_engine_reset_timediff =
-					now - engine->hangcheck.last_engine_reset_time;
-
-				full_reset = last_engine_reset_timediff <
-					i915.gpu_reset_promotion_time;
-
-				engine->hangcheck.last_engine_reset_time = now;
+				if (!watchdog) {
+					/* Measure the time since this engine was last reset */
+					u32 now = get_seconds();
+					u32 last_engine_reset_timediff =
+						now - engine->hangcheck.last_engine_reset_time;
+
+					full_reset = last_engine_reset_timediff <
+						i915.gpu_reset_promotion_time;
+
+					engine->hangcheck.last_engine_reset_time = now;
+				} else {
+					/*
+					 * Watchdog timeout always results
+					 * in engine reset.
+					 */
+					full_reset = false;
+				}
 
 				/*
 				 * This engine was not reset too recently - go ahead
@@ -2800,10 +2808,11 @@  void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 				 * This can still be overridden by a global
 				 * reset e.g. if per-engine reset fails.
 				 */
-				if (!full_reset)
+				if (watchdog || !full_reset)
 					atomic_or(I915_ENGINE_RESET_IN_PROGRESS,
 						&engine->hangcheck.flags);
-				else
+
+				if (full_reset)
 					break;
 
 			} /* for_each_ring */
@@ -3187,7 +3196,7 @@  ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev, intel_ring_flag(ring), false,
+		i915_handle_error(dev, intel_ring_flag(ring), false, false,
 				  "Kicking stuck wait on %s",
 				  ring->name);
 		I915_WRITE_CTL(ring, tmp);
@@ -3199,7 +3208,7 @@  ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			i915_handle_error(dev, intel_ring_flag(ring), false,
+			i915_handle_error(dev, intel_ring_flag(ring), false, false,
 					  "Kicking stuck semaphore on %s",
 					  ring->name);
 			I915_WRITE_CTL(ring, tmp);
@@ -3349,7 +3358,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	if (engine_mask) {
-		i915_handle_error(dev, engine_mask, true, "Ring hung (0x%02x)", engine_mask);
+		i915_handle_error(dev, engine_mask, false, true, "Ring hung (0x%02x)", engine_mask);
 		goto out;
 	}