diff mbox

[v2] drm/i915: Record more information about the hanging contexts

Message ID 20170129092433.10483-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Jan. 29, 2017, 9:24 a.m. UTC
Include extra information such as the user_handle and hw_id so that
userspace can identify which of their contexts hung, useful if they are
performing self-diagnostics.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       | 14 +++++--
 drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++-------------
 2 files changed, 59 insertions(+), 32 deletions(-)

Comments

Mika Kuoppala Jan. 30, 2017, 3:24 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Include extra information such as the user_handle and hw_id so that
> userspace can identify which of their contexts hung, useful if they are
> performing self-diagnostics.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       | 14 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++-------------
>  2 files changed, 59 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c1fde816db63..7e7bc4504c94 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -970,6 +970,16 @@ struct drm_i915_error_state {
>  		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
>  		struct intel_instdone instdone;
>  
> +		struct drm_i915_error_context {
> +			char comm[TASK_COMM_LEN];
> +			int pid;

s/int/pid_t/ (the field this replaces was declared as pid_t)

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> +			u32 handle;
> +			u32 hw_id;
> +			int ban_score;
> +			int active;
> +			int guilty;
> +		} context;
> +
>  		struct drm_i915_error_object {
>  			u64 gtt_offset;
>  			u64 gtt_size;
> @@ -1003,10 +1013,6 @@ struct drm_i915_error_state {
>  				u32 pp_dir_base;
>  			};
>  		} vm_info;
> -
> -		pid_t pid;
> -		char comm[TASK_COMM_LEN];
> -		int context_bans;
>  	} engine[I915_NUM_ENGINES];
>  
>  	struct drm_i915_error_buffer {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e5375323eb06..5283fe815a4d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
>  		   erq->head, erq->tail);
>  }
>  
> +static void error_print_context(struct drm_i915_error_state_buf *m,
> +				const char *header,
> +				struct drm_i915_error_context *ctx)
> +{
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
> +		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> +		   ctx->ban_score, ctx->guilty, ctx->active);
> +}
> +
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
>  			       struct drm_i915_error_engine *ee)
>  {
> @@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  
>  	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
>  	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
> +	error_print_context(m, "  Active context: ", &ee->context);
>  }
>  
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
> @@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  
>  	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>  		if (error->engine[i].hangcheck_stalled &&
> -		    error->engine[i].pid != -1) {
> -			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
> +		    error->engine[i].context.pid) {
> +			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
>  				   engine_str(i),
> -				   error->engine[i].comm,
> -				   error->engine[i].pid,
> -				   error->engine[i].context_bans);
> +				   error->engine[i].context.comm,
> +				   error->engine[i].context.pid,
> +				   error->engine[i].context.ban_score);
>  		}
>  	}
>  	err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		obj = ee->batchbuffer;
>  		if (obj) {
>  			err_puts(m, dev_priv->engine[i]->name);
> -			if (ee->pid != -1)
> -				err_printf(m, " (submitted by %s [%d], bans %d)",
> -					   ee->comm,
> -					   ee->pid,
> -					   ee->context_bans);
> +			if (ee->context.pid)
> +				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> +					   ee->context.comm,
> +					   ee->context.pid,
> +					   ee->context.handle,
> +					   ee->context.hw_id,
> +					   ee->context.ban_score);
>  			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>  				   upper_32_bits(obj->gtt_offset),
>  				   lower_32_bits(obj->gtt_offset));
> @@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
>  				       &ee->execlist[n]);
>  }
>  
> +static void record_context(struct drm_i915_error_context *e,
> +			   struct i915_gem_context *ctx)
> +{
> +	if (ctx->pid) {
> +		struct task_struct *task;
> +
> +		rcu_read_lock();
> +		task = pid_task(ctx->pid, PIDTYPE_PID);
> +		if (task) {
> +			strcpy(e->comm, task->comm);
> +			e->pid = task->pid;
> +		}
> +		rcu_read_unlock();
> +	}
> +
> +	e->handle = ctx->user_handle;
> +	e->hw_id = ctx->hw_id;
> +	e->ban_score = ctx->ban_score;
> +	e->guilty = ctx->guilty_count;
> +	e->active = ctx->active_count;
> +}
> +
>  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  				  struct drm_i915_error_state *error)
>  {
> @@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  		struct drm_i915_error_engine *ee = &error->engine[i];
>  		struct drm_i915_gem_request *request;
>  
> -		ee->pid = -1;
>  		ee->engine_id = -1;
>  
>  		if (!engine)
> @@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  		request = i915_gem_find_active_request(engine);
>  		if (request) {
>  			struct intel_ring *ring;
> -			struct pid *pid;
>  
>  			ee->vm = request->ctx->ppgtt ?
>  				&request->ctx->ppgtt->base : &ggtt->base;
>  
> +			record_context(&ee->context, request->ctx);
> +
>  			/* We need to copy these to an anonymous buffer
>  			 * as the simplest method to avoid being overwritten
>  			 * by userspace.
> @@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  				i915_error_object_create(dev_priv,
>  							 request->ctx->engine[i].state);
>  
> -			pid = request->ctx->pid;
> -			if (pid) {
> -				struct task_struct *task;
> -
> -				rcu_read_lock();
> -				task = pid_task(pid, PIDTYPE_PID);
> -				if (task) {
> -					strcpy(ee->comm, task->comm);
> -					ee->pid = task->pid;
> -				}
> -				rcu_read_unlock();
> -			}
> -
>  			error->simulated |=
>  				i915_gem_context_no_error_capture(request->ctx);
>  
> @@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
>  			"GPU HANG: ecode %d:%d:0x%08x",
>  			INTEL_GEN(dev_priv), engine_id, ecode);
>  
> -	if (engine_id != -1 && error->engine[engine_id].pid != -1)
> +	if (engine_id != -1 && error->engine[engine_id].context.pid)
>  		len += scnprintf(error->error_msg + len,
>  				 sizeof(error->error_msg) - len,
>  				 ", in %s [%d]",
> -				 error->engine[engine_id].comm,
> -				 error->engine[engine_id].pid);
> +				 error->engine[engine_id].context.comm,
> +				 error->engine[engine_id].context.pid);
>  
>  	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
>  		  ", reason: %s, action: %s",
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c1fde816db63..7e7bc4504c94 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -970,6 +970,16 @@  struct drm_i915_error_state {
 		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
 		struct intel_instdone instdone;
 
+		struct drm_i915_error_context {
+			char comm[TASK_COMM_LEN];
+			int pid;
+			u32 handle;
+			u32 hw_id;
+			int ban_score;
+			int active;
+			int guilty;
+		} context;
+
 		struct drm_i915_error_object {
 			u64 gtt_offset;
 			u64 gtt_size;
@@ -1003,10 +1013,6 @@  struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
-
-		pid_t pid;
-		char comm[TASK_COMM_LEN];
-		int context_bans;
 	} engine[I915_NUM_ENGINES];
 
 	struct drm_i915_error_buffer {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e5375323eb06..5283fe815a4d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -384,6 +384,15 @@  static void error_print_request(struct drm_i915_error_state_buf *m,
 		   erq->head, erq->tail);
 }
 
+static void error_print_context(struct drm_i915_error_state_buf *m,
+				const char *header,
+				struct drm_i915_error_context *ctx)
+{
+	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
+		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
+		   ctx->ban_score, ctx->guilty, ctx->active);
+}
+
 static void error_print_engine(struct drm_i915_error_state_buf *m,
 			       struct drm_i915_error_engine *ee)
 {
@@ -457,6 +466,7 @@  static void error_print_engine(struct drm_i915_error_state_buf *m,
 
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
+	error_print_context(m, "  Active context: ", &ee->context);
 }
 
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -562,12 +572,12 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].pid != -1) {
-			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
+		    error->engine[i].context.pid) {
+			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
 				   engine_str(i),
-				   error->engine[i].comm,
-				   error->engine[i].pid,
-				   error->engine[i].context_bans);
+				   error->engine[i].context.comm,
+				   error->engine[i].context.pid,
+				   error->engine[i].context.ban_score);
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -658,11 +668,13 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		obj = ee->batchbuffer;
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
-			if (ee->pid != -1)
-				err_printf(m, " (submitted by %s [%d], bans %d)",
-					   ee->comm,
-					   ee->pid,
-					   ee->context_bans);
+			if (ee->context.pid)
+				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
+					   ee->context.comm,
+					   ee->context.pid,
+					   ee->context.handle,
+					   ee->context.hw_id,
+					   ee->context.ban_score);
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
@@ -1267,6 +1279,28 @@  static void error_record_engine_execlists(struct intel_engine_cs *engine,
 				       &ee->execlist[n]);
 }
 
+static void record_context(struct drm_i915_error_context *e,
+			   struct i915_gem_context *ctx)
+{
+	if (ctx->pid) {
+		struct task_struct *task;
+
+		rcu_read_lock();
+		task = pid_task(ctx->pid, PIDTYPE_PID);
+		if (task) {
+			strcpy(e->comm, task->comm);
+			e->pid = task->pid;
+		}
+		rcu_read_unlock();
+	}
+
+	e->handle = ctx->user_handle;
+	e->hw_id = ctx->hw_id;
+	e->ban_score = ctx->ban_score;
+	e->guilty = ctx->guilty_count;
+	e->active = ctx->active_count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct drm_i915_error_state *error)
 {
@@ -1281,7 +1315,6 @@  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 		struct drm_i915_error_engine *ee = &error->engine[i];
 		struct drm_i915_gem_request *request;
 
-		ee->pid = -1;
 		ee->engine_id = -1;
 
 		if (!engine)
@@ -1296,11 +1329,12 @@  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 		request = i915_gem_find_active_request(engine);
 		if (request) {
 			struct intel_ring *ring;
-			struct pid *pid;
 
 			ee->vm = request->ctx->ppgtt ?
 				&request->ctx->ppgtt->base : &ggtt->base;
 
+			record_context(&ee->context, request->ctx);
+
 			/* We need to copy these to an anonymous buffer
 			 * as the simplest method to avoid being overwritten
 			 * by userspace.
@@ -1318,19 +1352,6 @@  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				i915_error_object_create(dev_priv,
 							 request->ctx->engine[i].state);
 
-			pid = request->ctx->pid;
-			if (pid) {
-				struct task_struct *task;
-
-				rcu_read_lock();
-				task = pid_task(pid, PIDTYPE_PID);
-				if (task) {
-					strcpy(ee->comm, task->comm);
-					ee->pid = task->pid;
-				}
-				rcu_read_unlock();
-			}
-
 			error->simulated |=
 				i915_gem_context_no_error_capture(request->ctx);
 
@@ -1534,12 +1555,12 @@  static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
 			"GPU HANG: ecode %d:%d:0x%08x",
 			INTEL_GEN(dev_priv), engine_id, ecode);
 
-	if (engine_id != -1 && error->engine[engine_id].pid != -1)
+	if (engine_id != -1 && error->engine[engine_id].context.pid)
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[engine_id].comm,
-				 error->engine[engine_id].pid);
+				 error->engine[engine_id].context.comm,
+				 error->engine[engine_id].context.pid);
 
 	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
 		  ", reason: %s, action: %s",