Message ID | 20230426205713.512695-12-rodrigo.vivi@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce xe_devcoredump. | expand |
On Wed, Apr 26, 2023 at 04:57:10PM -0400, Rodrigo Vivi wrote: > Let's continue to add our existent simple logs to devcoredump one > by one. Any format change should come on follow-up work. > > Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com> > --- > drivers/gpu/drm/xe/xe_devcoredump.c | 45 +++++++++++++++++++++++ > drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++ > 2 files changed, 49 insertions(+) > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c > index 0e7ec654a9f2..1ffd12646a99 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump.c > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c > @@ -9,10 +9,13 @@ > #include <linux/devcoredump.h> > #include <generated/utsrelease.h> > > +#include "xe_device.h" > #include "xe_engine.h" > +#include "xe_force_wake.h" > #include "xe_gt.h" > #include "xe_guc_ct.h" > #include "xe_guc_submit.h" > +#include "xe_hw_engine.h" > > /** > * DOC: Xe device coredump > @@ -62,6 +65,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > struct drm_printer p; > struct drm_print_iterator iter; > struct timespec64 ts; > + struct xe_engine *e; > + struct xe_hw_engine *hwe; > + enum xe_hw_engine_id id; > > /* Our device is gone already... */ > if (!data || !coredump_to_xe(coredump)) > @@ -75,6 +81,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > mutex_lock(&coredump->lock); > > ss = &coredump->snapshot; > + e = coredump->faulty_engine; > p = drm_coredump_printer(&iter); > > drm_printf(&p, "**** Xe Device Coredump ****\n"); > @@ -92,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p); > xe_guc_engine_snapshot_print(coredump->snapshot.ge, &p); > > + drm_printf(&p, "\n**** HW Engines ****\n"); > + for_each_hw_engine(hwe, e->gt, id) > + xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], &p); > + > mutex_unlock(&coredump->lock); > > return count - iter.remain; > @@ -100,6 +111,8 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > static void xe_devcoredump_free(void *data) > { > struct xe_devcoredump *coredump = data; > + struct xe_hw_engine *hwe; > + enum xe_hw_engine_id id; > > /* Our device is gone. Nothing to do... */ > if (!data || !coredump_to_xe(coredump)) > @@ -109,6 +122,8 @@ static void xe_devcoredump_free(void *data) > > xe_guc_ct_snapshot_free(coredump->snapshot.ct); > xe_guc_engine_snapshot_free(coredump->snapshot.ge); > + for_each_hw_engine(hwe, coredump->faulty_engine->gt, id) > + xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]); > > coredump->faulty_engine = NULL; > drm_info(&coredump_to_xe(coredump)->drm, > @@ -122,13 +137,43 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump) > struct xe_devcoredump_snapshot *ss = &coredump->snapshot; > struct xe_engine *e = coredump->faulty_engine; > struct xe_guc *guc = engine_to_guc(e); > + struct xe_hw_engine *hwe; > + enum xe_hw_engine_id id; > + u32 adj_logical_mask = e->logical_mask; > + u32 width_mask = (0x1 << e->width) - 1; > + int i; > + bool cookie; > > lockdep_assert_held(&coredump->lock); > ss->snapshot_time = ktime_get_real(); > ss->boot_time = ktime_get_boottime(); > > + cookie = dma_fence_begin_signalling(); Why the annotation here? Otherwise LGTM. Matt > + for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) { > + if (adj_logical_mask & BIT(i)) { > + adj_logical_mask |= width_mask << i; > + i += e->width; > + } else { > + ++i; > + } > + } > + > + xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL); > + > coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct); > coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e); > + > + for_each_hw_engine(hwe, e->gt, id) { > + if (hwe->class != e->hwe->class || > + !(BIT(hwe->logical_instance) & adj_logical_mask)) { > + coredump->snapshot.hwe[id] = NULL; > + continue; > + } > + coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe); > + } > + > + xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL); > + dma_fence_end_signalling(cookie); > } > > /** > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h > index e055b266af70..8b17ecf1b6e6 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h > @@ -9,6 +9,8 @@ > #include <linux/ktime.h> > #include <linux/mutex.h> > > +#include "xe_hw_engine_types.h" > + > struct xe_device; > > /** > @@ -29,6 +31,8 @@ struct xe_devcoredump_snapshot { > struct xe_guc_ct_snapshot *ct; > /** @ge: Guc Engine snapshot */ > struct xe_guc_submit_engine_snapshot *ge; > + /** @hwe: HW Engine snapshot array */ > + struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES]; > }; > > /** > -- > 2.39.2 >
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 0e7ec654a9f2..1ffd12646a99 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -9,10 +9,13 @@ #include <linux/devcoredump.h> #include <generated/utsrelease.h> +#include "xe_device.h" #include "xe_engine.h" +#include "xe_force_wake.h" #include "xe_gt.h" #include "xe_guc_ct.h" #include "xe_guc_submit.h" +#include "xe_hw_engine.h" /** * DOC: Xe device coredump @@ -62,6 +65,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, struct drm_printer p; struct drm_print_iterator iter; struct timespec64 ts; + struct xe_engine *e; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; /* Our device is gone already... */ if (!data || !coredump_to_xe(coredump)) @@ -75,6 +81,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, mutex_lock(&coredump->lock); ss = &coredump->snapshot; + e = coredump->faulty_engine; p = drm_coredump_printer(&iter); drm_printf(&p, "**** Xe Device Coredump ****\n"); @@ -92,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p); xe_guc_engine_snapshot_print(coredump->snapshot.ge, &p); + drm_printf(&p, "\n**** HW Engines ****\n"); + for_each_hw_engine(hwe, e->gt, id) + xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], &p); + mutex_unlock(&coredump->lock); return count - iter.remain; @@ -100,6 +111,8 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, static void xe_devcoredump_free(void *data) { struct xe_devcoredump *coredump = data; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; /* Our device is gone. Nothing to do... */ if (!data || !coredump_to_xe(coredump)) @@ -109,6 +122,8 @@ static void xe_devcoredump_free(void *data) xe_guc_ct_snapshot_free(coredump->snapshot.ct); xe_guc_engine_snapshot_free(coredump->snapshot.ge); + for_each_hw_engine(hwe, coredump->faulty_engine->gt, id) + xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]); coredump->faulty_engine = NULL; drm_info(&coredump_to_xe(coredump)->drm, @@ -122,13 +137,43 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump) struct xe_devcoredump_snapshot *ss = &coredump->snapshot; struct xe_engine *e = coredump->faulty_engine; struct xe_guc *guc = engine_to_guc(e); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 adj_logical_mask = e->logical_mask; + u32 width_mask = (0x1 << e->width) - 1; + int i; + bool cookie; lockdep_assert_held(&coredump->lock); ss->snapshot_time = ktime_get_real(); ss->boot_time = ktime_get_boottime(); + cookie = dma_fence_begin_signalling(); + for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) { + if (adj_logical_mask & BIT(i)) { + adj_logical_mask |= width_mask << i; + i += e->width; + } else { + ++i; + } + } + + xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL); + coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct); coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e); + + for_each_hw_engine(hwe, e->gt, id) { + if (hwe->class != e->hwe->class || + !(BIT(hwe->logical_instance) & adj_logical_mask)) { + coredump->snapshot.hwe[id] = NULL; + continue; + } + coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe); + } + + xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL); + dma_fence_end_signalling(cookie); } /** diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h index e055b266af70..8b17ecf1b6e6 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -9,6 +9,8 @@ #include <linux/ktime.h> #include <linux/mutex.h> +#include "xe_hw_engine_types.h" + struct xe_device; /** @@ -29,6 +31,8 @@ struct xe_devcoredump_snapshot { struct xe_guc_ct_snapshot *ct; /** @ge: Guc Engine snapshot */ struct xe_guc_submit_engine_snapshot *ge; + /** @hwe: HW Engine snapshot array */ + struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES]; }; /**
Let's continue to add our existent simple logs to devcoredump one by one. Any format change should come on follow-up work. Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com> --- drivers/gpu/drm/xe/xe_devcoredump.c | 45 +++++++++++++++++++++++ drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++ 2 files changed, 49 insertions(+)