diff mbox series

[RFC,1/2] drm/i915: Improve record of hung engines in error state

Message ID 20201104122043.876567-1-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/2] drm/i915: Improve record of hung engines in error state | expand

Commit Message

Tvrtko Ursulin Nov. 4, 2020, 12:20 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Between events which trigger engine and GPU resets and capturing the error
state we lose information on which engine triggered the reset. Improve
this by passing in the hung engine mask down to error capture.

Result is that the list of engines in user visible "GPU HANG: ecode
<gen>:<engines>:<ecode>, <process>" is now a list of hanging and not just
active engines. Most importantly the displayed process is now the one
which was actually hung.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c   |  2 ++
 drivers/gpu/drm/i915/gt/intel_reset.c |  2 +-
 drivers/gpu/drm/i915/i915_debugfs.c   |  2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c | 35 ++++++++++++++++++---------
 drivers/gpu/drm/i915/i915_gpu_error.h |  7 ++++--
 5 files changed, 32 insertions(+), 16 deletions(-)

Comments

Chris Wilson Nov. 4, 2020, 12:30 p.m. UTC | #1
Quoting Tvrtko Ursulin (2020-11-04 12:20:42)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Between events which trigger engine and GPU resets and capturing the error
> state we lose information on which engine triggered the reset. Improve
> this by passing in the hung engine mask down to error capture.
> 
> Result is that the list of engines in user visible "GPU HANG: ecode
> <gen>:<engines>:<ecode>, <process>" is now a list of hanging and not just
> active engines. Most importantly the displayed process is now the one
> which was actually hung.

You could also suggest to only include the hanging engine in the report,
as is intended to be the normal means of generating the report

> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 0220b0992808..3a7ca90a3436 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -59,6 +59,7 @@ struct i915_request_coredump {
>  struct intel_engine_coredump {
>         const struct intel_engine_cs *engine;
>  
> +       bool hung;
>         bool simulated;
>         u32 reset_count;
>  
> @@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
>  __printf(2, 3)
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
>  
> -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
> -void i915_capture_error_state(struct drm_i915_private *i915);
> +struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
> +                                           intel_engine_mask_t engine_mask);
> +void i915_capture_error_state(struct intel_gt *gt,
> +                             intel_engine_mask_t engine_mask);

Don't forget the stubs.
-Chris
Tvrtko Ursulin Nov. 4, 2020, 1:03 p.m. UTC | #2
On 04/11/2020 12:30, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-11-04 12:20:42)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Between events which trigger engine and GPU resets and capturing the error
>> state we lose information on which engine triggered the reset. Improve
>> this by passing in the hung engine mask down to error capture.
>>
>> Result is that the list of engines in user visible "GPU HANG: ecode
>> <gen>:<engines>:<ecode>, <process>" is now a list of hanging and not just
>> active engines. Most importantly the displayed process is now the one
>> which was actually hung.
> 
> You could also suggest to only include the hanging engine in the report,
> as is intended to be the normal means of generating the report

I thought it is potentially useful to have a full picture, but can do 
that as well.

>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
>> index 0220b0992808..3a7ca90a3436 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>> @@ -59,6 +59,7 @@ struct i915_request_coredump {
>>   struct intel_engine_coredump {
>>          const struct intel_engine_cs *engine;
>>   
>> +       bool hung;
>>          bool simulated;
>>          u32 reset_count;
>>   
>> @@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
>>   __printf(2, 3)
>>   void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
>>   
>> -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
>> -void i915_capture_error_state(struct drm_i915_private *i915);
>> +struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
>> +                                           intel_engine_mask_t engine_mask);
>> +void i915_capture_error_state(struct intel_gt *gt,
>> +                             intel_engine_mask_t engine_mask);
> 
> Don't forget the stubs.

Right, thanks.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index f3eb68a76a25..8a51c1c3a091 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -3037,6 +3037,8 @@  static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
 	if (!cap->error->gt->engine)
 		goto err_gt;
 
+	cap->error->gt->engine->hung = true;
+
 	return cap;
 
 err_gt:
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 4e5e13dc95da..9fb4306b2900 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1251,7 +1251,7 @@  void intel_gt_handle_error(struct intel_gt *gt,
 	engine_mask &= gt->info.engine_mask;
 
 	if (flags & I915_ERROR_CAPTURE) {
-		i915_capture_error_state(gt->i915);
+		i915_capture_error_state(gt, engine_mask);
 		intel_gt_clear_error_registers(gt, engine_mask);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 200f6b86f864..77e76b665098 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -725,7 +725,7 @@  static int i915_gpu_info_open(struct inode *inode, struct file *file)
 
 	gpu = NULL;
 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
-		gpu = i915_gpu_coredump(i915);
+		gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
 	if (IS_ERR(gpu))
 		return PTR_ERR(gpu);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index b3f3a2e07408..857db66cc4a3 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -570,6 +570,7 @@  static void error_print_engine(struct drm_i915_error_state_buf *m,
 				   ee->vm_info.pp_dir_base);
 		}
 	}
+	err_printf(m, "  hung: %u\n", ee->hung);
 	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
 	for (n = 0; n < ee->num_ports; n++) {
@@ -1456,6 +1457,7 @@  capture_engine(struct intel_engine_cs *engine,
 
 static void
 gt_record_engines(struct intel_gt_coredump *gt,
+		  intel_engine_mask_t engine_mask,
 		  struct i915_vma_compress *compress)
 {
 	struct intel_engine_cs *engine;
@@ -1471,6 +1473,8 @@  gt_record_engines(struct intel_gt_coredump *gt,
 		if (!ee)
 			continue;
 
+		ee->hung = engine->mask & engine_mask;
+
 		gt->simulated |= ee->simulated;
 		if (ee->simulated) {
 			kfree(ee);
@@ -1663,11 +1667,13 @@  static const char *error_msg(struct i915_gpu_coredump *error)
 	for (gt = error->gt; gt; gt = gt->next) {
 		struct intel_engine_coredump *cs;
 
-		if (gt->engine && !first)
-			first = gt->engine;
-
-		for (cs = gt->engine; cs; cs = cs->next)
-			engines |= cs->engine->mask;
+		for (cs = gt->engine; cs; cs = cs->next) {
+			if (cs->hung) {
+				engines |= cs->engine->mask;
+				if (!first)
+					first = cs;
+			}
+		}
 	}
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
@@ -1781,8 +1787,10 @@  void i915_vma_capture_finish(struct intel_gt_coredump *gt,
 	kfree(compress);
 }
 
-struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
+struct i915_gpu_coredump *
+i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
 {
+	struct drm_i915_private *i915 = gt->i915;
 	struct i915_gpu_coredump *error;
 
 	/* Check if GPU capture has been disabled */
@@ -1794,7 +1802,7 @@  struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
 	if (!error)
 		return ERR_PTR(-ENOMEM);
 
-	error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
+	error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
 	if (error->gt) {
 		struct i915_vma_compress *compress;
 
@@ -1806,7 +1814,7 @@  struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
 		}
 
 		gt_record_info(error->gt);
-		gt_record_engines(error->gt, compress);
+		gt_record_engines(error->gt, engine_mask, compress);
 
 		if (INTEL_INFO(i915)->has_gt_uc)
 			error->gt->uc = gt_record_uc(error->gt, compress);
@@ -1853,20 +1861,23 @@  void i915_error_state_store(struct i915_gpu_coredump *error)
 
 /**
  * i915_capture_error_state - capture an error record for later analysis
- * @i915: i915 device
+ * @gt: intel_gt which originated the hang
+ * @engine_mask: hung engines
+ *
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_i915_private *i915)
+void i915_capture_error_state(struct intel_gt *gt,
+			      intel_engine_mask_t engine_mask)
 {
 	struct i915_gpu_coredump *error;
 
-	error = i915_gpu_coredump(i915);
+	error = i915_gpu_coredump(gt, engine_mask);
 	if (IS_ERR(error)) {
-		cmpxchg(&i915->gpu_error.first_error, NULL, error);
+		cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
 		return;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 0220b0992808..3a7ca90a3436 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -59,6 +59,7 @@  struct i915_request_coredump {
 struct intel_engine_coredump {
 	const struct intel_engine_cs *engine;
 
+	bool hung;
 	bool simulated;
 	u32 reset_count;
 
@@ -218,8 +219,10 @@  struct drm_i915_error_state_buf {
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 
-struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *i915);
+struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
+					    intel_engine_mask_t engine_mask);
+void i915_capture_error_state(struct intel_gt *gt,
+			      intel_engine_mask_t engine_mask);
 
 struct i915_gpu_coredump *
 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);