diff mbox

[2/3] drm/i915/tdr: Prepare error handler to accept mask of hung engines

Message ID 1458331676-567-3-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

arun.siluvery@linux.intel.com March 18, 2016, 8:07 p.m. UTC
In preparation for engine reset, the wedged argument of i915_handle_error()
is extended from a bool to a mask of the engines that are hung. This mask
is passed down to the error state capture functions, which are updated
accordingly.

The engine reset recovery mechanism uses this mask to schedule recovery
work for those particular engines.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |  4 ++--
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 ++++----
 drivers/gpu/drm/i915/i915_irq.c       | 16 ++++++++--------
 3 files changed, 14 insertions(+), 14 deletions(-)

Comments

Mika Kuoppala March 22, 2016, 12:04 p.m. UTC | #1
Arun Siluvery <arun.siluvery@linux.intel.com> writes:

> [ text/plain ]
> In preparation for engine reset, the wedged argument of i915_handle_error()
> is extended to reflect as a mask of engines that are hung. This is further
> passed down to error state capture functions which are also updated.
>
> Engine reset recovery mechanism uses this mask and schedules recovery work
> for those particular engines.
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  4 ++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |  8 ++++----
>  drivers/gpu/drm/i915/i915_irq.c       | 16 ++++++++--------
>  3 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 549a232..49ac065 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2735,7 +2735,7 @@ bool intel_hpd_pin_to_port(enum hpd_pin pin, enum port *port);
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
>  __printf(3, 4)
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask,
>  		       const char *fmt, ...);
>  
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
> @@ -3321,7 +3321,7 @@ static inline void i915_error_state_buf_release(
>  {
>  	kfree(eb->buf);
>  }
> -void i915_capture_error_state(struct drm_device *dev, bool wedge,
> +void i915_capture_error_state(struct drm_device *dev, u32 engine_mask,
>  			      const char *error_msg);
>  void i915_error_state_get(struct drm_device *dev,
>  			  struct i915_error_state_file_priv *error_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index db8600a..1f8ff06 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1301,7 +1301,7 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
>  
>  static void i915_error_capture_msg(struct drm_device *dev,
>  				   struct drm_i915_error_state *error,
> -				   bool wedged,
> +				   u32 engine_mask,
>  				   const char *error_msg)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -1324,7 +1324,7 @@ static void i915_error_capture_msg(struct drm_device *dev,
>  	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
>  		  ", reason: %s, action: %s",
>  		  error_msg,
> -		  wedged ? "reset" : "continue");
> +		  engine_mask ? "reset" : "continue");
>  }
>  
>  static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
> @@ -1347,7 +1347,7 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
>   * out a structure which becomes available in debugfs for user level tools
>   * to pick up.
>   */
> -void i915_capture_error_state(struct drm_device *dev, bool wedged,
> +void i915_capture_error_state(struct drm_device *dev, u32 engine_mask,
>  			      const char *error_msg)
>  {
>  	static bool warned;
> @@ -1375,7 +1375,7 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
>  	error->overlay = intel_overlay_capture_error_state(dev);
>  	error->display = intel_display_capture_error_state(dev);
>  
> -	i915_error_capture_msg(dev, error, wedged, error_msg);
> +	i915_error_capture_msg(dev, error, engine_mask, error_msg);
>  	DRM_INFO("%s\n", error->error_msg);
>  
>  	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 8f3e330..a55a7cc 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2653,14 +2653,14 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>  /**
>   * i915_handle_error - handle a gpu error
>   * @dev: drm device
> - *
> + * @engine_mask: mask representing engines that are hung
>   * Do some basic checking of register state at error time and
>   * dump it to the syslog.  Also call i915_capture_error_state() to make
>   * sure we get a record and make it available in debugfs.  Fire a uevent
>   * so userspace knows something bad happened (should trigger collection
>   * of a ring dump etc.).
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged,
> +void i915_handle_error(struct drm_device *dev, u32 engine_mask,
>  		       const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -2671,10 +2671,10 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
>  	va_end(args);
>  
> -	i915_capture_error_state(dev, wedged, error_msg);
> +	i915_capture_error_state(dev, engine_mask, error_msg);
>  	i915_report_and_clear_eir(dev);
>  
> -	if (wedged) {
> +	if (engine_mask) {
>  		atomic_or(I915_RESET_IN_PROGRESS_FLAG,
>  				&dev_priv->gpu_error.reset_counter);
>  
> @@ -3033,7 +3033,7 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
>  	 */
>  	tmp = I915_READ_CTL(engine);
>  	if (tmp & RING_WAIT) {
> -		i915_handle_error(dev, false,
> +		i915_handle_error(dev, 0,
>  				  "Kicking stuck wait on %s",
>  				  engine->name);
>  		I915_WRITE_CTL(engine, tmp);
> @@ -3045,7 +3045,7 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
>  		default:
>  			return HANGCHECK_HUNG;
>  		case 1:
> -			i915_handle_error(dev, false,
> +			i915_handle_error(dev, 0,
>  					  "Kicking stuck semaphore on %s",
>  					  engine->name);
>  			I915_WRITE_CTL(engine, tmp);
> @@ -3189,12 +3189,12 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 engine->name);
> -			rings_hung++;
> +			rings_hung |= intel_engine_flag(engine);

We can change the int to u32 when we rename rings_hung to engines_hung.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

>  		}
>  	}
>  
>  	if (rings_hung) {
> -		i915_handle_error(dev, true, "Ring hung");
> +		i915_handle_error(dev, rings_hung, "Engine(s) hung");
>  		goto out;
>  	}
>  
> -- 
> 1.9.1
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 549a232..49ac065 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2735,7 +2735,7 @@  bool intel_hpd_pin_to_port(enum hpd_pin pin, enum port *port);
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
 __printf(3, 4)
-void i915_handle_error(struct drm_device *dev, bool wedged,
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		       const char *fmt, ...);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
@@ -3321,7 +3321,7 @@  static inline void i915_error_state_buf_release(
 {
 	kfree(eb->buf);
 }
-void i915_capture_error_state(struct drm_device *dev, bool wedge,
+void i915_capture_error_state(struct drm_device *dev, u32 engine_mask,
 			      const char *error_msg);
 void i915_error_state_get(struct drm_device *dev,
 			  struct i915_error_state_file_priv *error_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index db8600a..1f8ff06 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1301,7 +1301,7 @@  static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
 
 static void i915_error_capture_msg(struct drm_device *dev,
 				   struct drm_i915_error_state *error,
-				   bool wedged,
+				   u32 engine_mask,
 				   const char *error_msg)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -1324,7 +1324,7 @@  static void i915_error_capture_msg(struct drm_device *dev,
 	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
 		  ", reason: %s, action: %s",
 		  error_msg,
-		  wedged ? "reset" : "continue");
+		  engine_mask ? "reset" : "continue");
 }
 
 static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
@@ -1347,7 +1347,7 @@  static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_device *dev, bool wedged,
+void i915_capture_error_state(struct drm_device *dev, u32 engine_mask,
 			      const char *error_msg)
 {
 	static bool warned;
@@ -1375,7 +1375,7 @@  void i915_capture_error_state(struct drm_device *dev, bool wedged,
 	error->overlay = intel_overlay_capture_error_state(dev);
 	error->display = intel_display_capture_error_state(dev);
 
-	i915_error_capture_msg(dev, error, wedged, error_msg);
+	i915_error_capture_msg(dev, error, engine_mask, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
 
 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8f3e330..a55a7cc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2653,14 +2653,14 @@  static void i915_report_and_clear_eir(struct drm_device *dev)
 /**
  * i915_handle_error - handle a gpu error
  * @dev: drm device
- *
+ * @engine_mask: mask representing engines that are hung
  * Do some basic checking of register state at error time and
  * dump it to the syslog.  Also call i915_capture_error_state() to make
  * sure we get a record and make it available in debugfs.  Fire a uevent
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-void i915_handle_error(struct drm_device *dev, bool wedged,
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
 		       const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -2671,10 +2671,10 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
 	va_end(args);
 
-	i915_capture_error_state(dev, wedged, error_msg);
+	i915_capture_error_state(dev, engine_mask, error_msg);
 	i915_report_and_clear_eir(dev);
 
-	if (wedged) {
+	if (engine_mask) {
 		atomic_or(I915_RESET_IN_PROGRESS_FLAG,
 				&dev_priv->gpu_error.reset_counter);
 
@@ -3033,7 +3033,7 @@  ring_stuck(struct intel_engine_cs *engine, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(engine);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev, false,
+		i915_handle_error(dev, 0,
 				  "Kicking stuck wait on %s",
 				  engine->name);
 		I915_WRITE_CTL(engine, tmp);
@@ -3045,7 +3045,7 @@  ring_stuck(struct intel_engine_cs *engine, u64 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			i915_handle_error(dev, false,
+			i915_handle_error(dev, 0,
 					  "Kicking stuck semaphore on %s",
 					  engine->name);
 			I915_WRITE_CTL(engine, tmp);
@@ -3189,12 +3189,12 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 engine->name);
-			rings_hung++;
+			rings_hung |= intel_engine_flag(engine);
 		}
 	}
 
 	if (rings_hung) {
-		i915_handle_error(dev, true, "Ring hung");
+		i915_handle_error(dev, rings_hung, "Engine(s) hung");
 		goto out;
 	}