Message ID | 20230418181744.3251240-3-John.C.Harrison@Intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add support for dumping error captures via kernel logging | expand |
On 4/18/2023 11:17 AM, John.C.Harrison@Intel.com wrote: > From: John Harrison <John.C.Harrison@Intel.com> > > In the past, there have been sporadic CTB failures which proved hard > to reproduce manually. The most effective solution was to dump the GuC > log at the point of failure and let the CI system do the repro. It is > preferable not to dump the GuC log via dmesg for all issues as it is > not always necessary and is not helpful for end users. But rather than > trying to re-invent the code to do this each time it is wanted, commit > the code but for DEBUG_GUC builds only. > > v2: Use IS_ENABLED for testing config options. LGTM, Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com> > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > --- > drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 53 +++++++++++++++++++++++ > drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 6 +++ > 2 files changed, 59 insertions(+) > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > index 1803a633ed648..dc5cd712f1ff5 100644 > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > @@ -13,6 +13,30 @@ > #include "intel_guc_ct.h" > #include "intel_guc_print.h" > > +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) > +enum { > + CT_DEAD_ALIVE = 0, > + CT_DEAD_SETUP, > + CT_DEAD_WRITE, > + CT_DEAD_DEADLOCK, > + CT_DEAD_H2G_HAS_ROOM, > + CT_DEAD_READ, > + CT_DEAD_PROCESS_FAILED, > +}; > + > +static void ct_dead_ct_worker_func(struct work_struct *w); > + > +#define CT_DEAD(ct, reason) \ > + do { \ > + if (!(ct)->dead_ct_reported) { \ > + (ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; \ > + queue_work(system_unbound_wq, &(ct)->dead_ct_worker); \ > + } \ > + } while (0) > +#else > +#define CT_DEAD(ct, reason) do { } while (0) > +#endif > + > static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct) > { > return container_of(ct, struct intel_guc, ct); > @@ -93,6 +117,9 @@ void 
intel_guc_ct_init_early(struct intel_guc_ct *ct) > spin_lock_init(&ct->requests.lock); > INIT_LIST_HEAD(&ct->requests.pending); > INIT_LIST_HEAD(&ct->requests.incoming); > +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) > + INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func); > +#endif > INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func); > tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func); > init_waitqueue_head(&ct->wq); > @@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct) > > ct->enabled = true; > ct->stall_time = KTIME_MAX; > +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) > + ct->dead_ct_reported = false; > + ct->dead_ct_reason = CT_DEAD_ALIVE; > +#endif > > return 0; > > err_out: > CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err)); > + CT_DEAD(ct, SETUP); > return err; > } > > @@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct, > corrupted: > CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n", > desc->head, desc->tail, desc->status); > + CT_DEAD(ct, WRITE); > ctb->broken = true; > return -EPIPE; > } > @@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct) > CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head); > CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail); > > + CT_DEAD(ct, DEADLOCK); > ct->ctbs.send.broken = true; > } > > @@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw) > head, ctb->size); > desc->status |= GUC_CTB_STATUS_OVERFLOW; > ctb->broken = true; > + CT_DEAD(ct, H2G_HAS_ROOM); > return false; > } > > @@ -908,6 +943,7 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg) > CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n", > desc->head, desc->tail, desc->status); > ctb->broken = true; > + CT_DEAD(ct, READ); > return -EPIPE; > } > > @@ -1057,6 +1093,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct) > if (unlikely(err)) { > 
CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n", > ERR_PTR(err), 4 * request->size, request->msg); > + CT_DEAD(ct, PROCESS_FAILED); > ct_free_msg(request); > } > > @@ -1233,3 +1270,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct, > drm_printf(p, "Tail: %u\n", > ct->ctbs.recv.desc->tail); > } > + > +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) > +static void ct_dead_ct_worker_func(struct work_struct *w) > +{ > + struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker); > + struct intel_guc *guc = ct_to_guc(ct); > + > + if (ct->dead_ct_reported) > + return; > + > + ct->dead_ct_reported = true; > + > + guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason); > + intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U); > +} > +#endif > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h > index f709a19c7e214..818415b64f4d1 100644 > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h > @@ -85,6 +85,12 @@ struct intel_guc_ct { > > /** @stall_time: time of first time a CTB submission is stalled */ > ktime_t stall_time; > + > +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) > + int dead_ct_reason; > + bool dead_ct_reported; > + struct work_struct dead_ct_worker; > +#endif > }; > > void intel_guc_ct_init_early(struct intel_guc_ct *ct);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c index 1803a633ed648..dc5cd712f1ff5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c @@ -13,6 +13,30 @@ #include "intel_guc_ct.h" #include "intel_guc_print.h" +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) +enum { + CT_DEAD_ALIVE = 0, + CT_DEAD_SETUP, + CT_DEAD_WRITE, + CT_DEAD_DEADLOCK, + CT_DEAD_H2G_HAS_ROOM, + CT_DEAD_READ, + CT_DEAD_PROCESS_FAILED, +}; + +static void ct_dead_ct_worker_func(struct work_struct *w); + +#define CT_DEAD(ct, reason) \ + do { \ + if (!(ct)->dead_ct_reported) { \ + (ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; \ + queue_work(system_unbound_wq, &(ct)->dead_ct_worker); \ + } \ + } while (0) +#else +#define CT_DEAD(ct, reason) do { } while (0) +#endif + static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct) { return container_of(ct, struct intel_guc, ct); @@ -93,6 +117,9 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct) spin_lock_init(&ct->requests.lock); INIT_LIST_HEAD(&ct->requests.pending); INIT_LIST_HEAD(&ct->requests.incoming); +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) + INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func); +#endif INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func); tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func); init_waitqueue_head(&ct->wq); @@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct) ct->enabled = true; ct->stall_time = KTIME_MAX; +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) + ct->dead_ct_reported = false; + ct->dead_ct_reason = CT_DEAD_ALIVE; +#endif return 0; err_out: CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err)); + CT_DEAD(ct, SETUP); return err; } @@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct, corrupted: CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n", desc->head, desc->tail, desc->status); + CT_DEAD(ct, WRITE); ctb->broken = true; return 
-EPIPE; } @@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct) CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head); CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail); + CT_DEAD(ct, DEADLOCK); ct->ctbs.send.broken = true; } @@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw) head, ctb->size); desc->status |= GUC_CTB_STATUS_OVERFLOW; ctb->broken = true; + CT_DEAD(ct, H2G_HAS_ROOM); return false; } @@ -908,6 +943,7 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg) CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n", desc->head, desc->tail, desc->status); ctb->broken = true; + CT_DEAD(ct, READ); return -EPIPE; } @@ -1057,6 +1093,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct) if (unlikely(err)) { CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n", ERR_PTR(err), 4 * request->size, request->msg); + CT_DEAD(ct, PROCESS_FAILED); ct_free_msg(request); } @@ -1233,3 +1270,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct, drm_printf(p, "Tail: %u\n", ct->ctbs.recv.desc->tail); } + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) +static void ct_dead_ct_worker_func(struct work_struct *w) +{ + struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker); + struct intel_guc *guc = ct_to_guc(ct); + + if (ct->dead_ct_reported) + return; + + ct->dead_ct_reported = true; + + guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason); + intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U); +} +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h index f709a19c7e214..818415b64f4d1 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h @@ -85,6 +85,12 @@ struct intel_guc_ct { /** @stall_time: time of first time a CTB submission is stalled */ ktime_t stall_time; + +#if 
IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC) + int dead_ct_reason; + bool dead_ct_reported; + struct work_struct dead_ct_worker; +#endif }; void intel_guc_ct_init_early(struct intel_guc_ct *ct);