diff mbox series

[v2,2/2] drm/i915/guc: Dump error capture to dmesg on CTB error

Message ID 20230418181744.3251240-3-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Add support for dumping error captures via kernel logging | expand

Commit Message

John Harrison April 18, 2023, 6:17 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

In the past, there have been sporadic CTB failures which proved hard
to reproduce manually. The most effective solution was to dump the GuC
log at the point of failure and let the CI system do the repro. It is
preferable not to dump the GuC log via dmesg for all issues as it is
not always necessary and is not helpful for end users. But rather than
trying to re-invent the code to do this each time it is wanted, commit
the code but for DEBUG_GUC builds only.

v2: Use IS_ENABLED for testing config options.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 53 +++++++++++++++++++++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  6 +++
 2 files changed, 59 insertions(+)

Comments

Vinay Belgaumkar May 16, 2023, 7:17 p.m. UTC | #1
On 4/18/2023 11:17 AM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> In the past, there have been sporadic CTB failures which proved hard
> to reproduce manually. The most effective solution was to dump the GuC
> log at the point of failure and let the CI system do the repro. It is
> preferable not to dump the GuC log via dmesg for all issues as it is
> not always necessary and is not helpful for end users. But rather than
> trying to re-invent the code to do this each time it is wanted, commit
> the code but for DEBUG_GUC builds only.
>
> v2: Use IS_ENABLED for testing config options.

LGTM,

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>

>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 53 +++++++++++++++++++++++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  6 +++
>   2 files changed, 59 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index 1803a633ed648..dc5cd712f1ff5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -13,6 +13,30 @@
>   #include "intel_guc_ct.h"
>   #include "intel_guc_print.h"
>   
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
> +enum {
> +	CT_DEAD_ALIVE = 0,
> +	CT_DEAD_SETUP,
> +	CT_DEAD_WRITE,
> +	CT_DEAD_DEADLOCK,
> +	CT_DEAD_H2G_HAS_ROOM,
> +	CT_DEAD_READ,
> +	CT_DEAD_PROCESS_FAILED,
> +};
> +
> +static void ct_dead_ct_worker_func(struct work_struct *w);
> +
> +#define CT_DEAD(ct, reason)	\
> +	do { \
> +		if (!(ct)->dead_ct_reported) { \
> +			(ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; \
> +			queue_work(system_unbound_wq, &(ct)->dead_ct_worker); \
> +		} \
> +	} while (0)
> +#else
> +#define CT_DEAD(ct, reason)	do { } while (0)
> +#endif
> +
>   static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct)
>   {
>   	return container_of(ct, struct intel_guc, ct);
> @@ -93,6 +117,9 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct)
>   	spin_lock_init(&ct->requests.lock);
>   	INIT_LIST_HEAD(&ct->requests.pending);
>   	INIT_LIST_HEAD(&ct->requests.incoming);
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
> +	INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func);
> +#endif
>   	INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func);
>   	tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func);
>   	init_waitqueue_head(&ct->wq);
> @@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>   
>   	ct->enabled = true;
>   	ct->stall_time = KTIME_MAX;
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
> +	ct->dead_ct_reported = false;
> +	ct->dead_ct_reason = CT_DEAD_ALIVE;
> +#endif
>   
>   	return 0;
>   
>   err_out:
>   	CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err));
> +	CT_DEAD(ct, SETUP);
>   	return err;
>   }
>   
> @@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct,
>   corrupted:
>   	CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
>   		 desc->head, desc->tail, desc->status);
> +	CT_DEAD(ct, WRITE);
>   	ctb->broken = true;
>   	return -EPIPE;
>   }
> @@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct)
>   		CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head);
>   		CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail);
>   
> +		CT_DEAD(ct, DEADLOCK);
>   		ct->ctbs.send.broken = true;
>   	}
>   
> @@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw)
>   			 head, ctb->size);
>   		desc->status |= GUC_CTB_STATUS_OVERFLOW;
>   		ctb->broken = true;
> +		CT_DEAD(ct, H2G_HAS_ROOM);
>   		return false;
>   	}
>   
> @@ -908,6 +943,7 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
>   	CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
>   		 desc->head, desc->tail, desc->status);
>   	ctb->broken = true;
> +	CT_DEAD(ct, READ);
>   	return -EPIPE;
>   }
>   
> @@ -1057,6 +1093,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
>   	if (unlikely(err)) {
>   		CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n",
>   			 ERR_PTR(err), 4 * request->size, request->msg);
> +		CT_DEAD(ct, PROCESS_FAILED);
>   		ct_free_msg(request);
>   	}
>   
> @@ -1233,3 +1270,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct,
>   	drm_printf(p, "Tail: %u\n",
>   		   ct->ctbs.recv.desc->tail);
>   }
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
> +static void ct_dead_ct_worker_func(struct work_struct *w)
> +{
> +	struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker);
> +	struct intel_guc *guc = ct_to_guc(ct);
> +
> +	if (ct->dead_ct_reported)
> +		return;
> +
> +	ct->dead_ct_reported = true;
> +
> +	guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason);
> +	intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U);
> +}
> +#endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index f709a19c7e214..818415b64f4d1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -85,6 +85,12 @@ struct intel_guc_ct {
>   
>   	/** @stall_time: time of first time a CTB submission is stalled */
>   	ktime_t stall_time;
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
> +	int dead_ct_reason;
> +	bool dead_ct_reported;
> +	struct work_struct dead_ct_worker;
> +#endif
>   };
>   
>   void intel_guc_ct_init_early(struct intel_guc_ct *ct);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 1803a633ed648..dc5cd712f1ff5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -13,6 +13,30 @@ 
 #include "intel_guc_ct.h"
 #include "intel_guc_print.h"
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+enum {
+	CT_DEAD_ALIVE = 0,
+	CT_DEAD_SETUP,
+	CT_DEAD_WRITE,
+	CT_DEAD_DEADLOCK,
+	CT_DEAD_H2G_HAS_ROOM,
+	CT_DEAD_READ,
+	CT_DEAD_PROCESS_FAILED,
+};
+
+static void ct_dead_ct_worker_func(struct work_struct *w);
+
+#define CT_DEAD(ct, reason)	\
+	do { \
+		if (!(ct)->dead_ct_reported) { \
+			(ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; \
+			queue_work(system_unbound_wq, &(ct)->dead_ct_worker); \
+		} \
+	} while (0)
+#else
+#define CT_DEAD(ct, reason)	do { } while (0)
+#endif
+
 static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct)
 {
 	return container_of(ct, struct intel_guc, ct);
@@ -93,6 +117,9 @@  void intel_guc_ct_init_early(struct intel_guc_ct *ct)
 	spin_lock_init(&ct->requests.lock);
 	INIT_LIST_HEAD(&ct->requests.pending);
 	INIT_LIST_HEAD(&ct->requests.incoming);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+	INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func);
+#endif
 	INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func);
 	tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func);
 	init_waitqueue_head(&ct->wq);
@@ -319,11 +346,16 @@  int intel_guc_ct_enable(struct intel_guc_ct *ct)
 
 	ct->enabled = true;
 	ct->stall_time = KTIME_MAX;
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+	ct->dead_ct_reported = false;
+	ct->dead_ct_reason = CT_DEAD_ALIVE;
+#endif
 
 	return 0;
 
 err_out:
 	CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err));
+	CT_DEAD(ct, SETUP);
 	return err;
 }
 
@@ -434,6 +466,7 @@  static int ct_write(struct intel_guc_ct *ct,
 corrupted:
 	CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
 		 desc->head, desc->tail, desc->status);
+	CT_DEAD(ct, WRITE);
 	ctb->broken = true;
 	return -EPIPE;
 }
@@ -504,6 +537,7 @@  static inline bool ct_deadlocked(struct intel_guc_ct *ct)
 		CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head);
 		CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail);
 
+		CT_DEAD(ct, DEADLOCK);
 		ct->ctbs.send.broken = true;
 	}
 
@@ -552,6 +586,7 @@  static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw)
 			 head, ctb->size);
 		desc->status |= GUC_CTB_STATUS_OVERFLOW;
 		ctb->broken = true;
+		CT_DEAD(ct, H2G_HAS_ROOM);
 		return false;
 	}
 
@@ -908,6 +943,7 @@  static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
 	CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
 		 desc->head, desc->tail, desc->status);
 	ctb->broken = true;
+	CT_DEAD(ct, READ);
 	return -EPIPE;
 }
 
@@ -1057,6 +1093,7 @@  static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
 	if (unlikely(err)) {
 		CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n",
 			 ERR_PTR(err), 4 * request->size, request->msg);
+		CT_DEAD(ct, PROCESS_FAILED);
 		ct_free_msg(request);
 	}
 
@@ -1233,3 +1270,19 @@  void intel_guc_ct_print_info(struct intel_guc_ct *ct,
 	drm_printf(p, "Tail: %u\n",
 		   ct->ctbs.recv.desc->tail);
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+static void ct_dead_ct_worker_func(struct work_struct *w)
+{
+	struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker);
+	struct intel_guc *guc = ct_to_guc(ct);
+
+	if (ct->dead_ct_reported)
+		return;
+
+	ct->dead_ct_reported = true;
+
+	guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason);
+	intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U);
+}
+#endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
index f709a19c7e214..818415b64f4d1 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -85,6 +85,12 @@  struct intel_guc_ct {
 
 	/** @stall_time: time of first time a CTB submission is stalled */
 	ktime_t stall_time;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+	int dead_ct_reason;
+	bool dead_ct_reported;
+	struct work_struct dead_ct_worker;
+#endif
 };
 
 void intel_guc_ct_init_early(struct intel_guc_ct *ct);