diff mbox series

[11/14] drm/xe: Add HW Engine snapshot to xe_devcoredump.

Message ID 20230426205713.512695-12-rodrigo.vivi@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce xe_devcoredump. | expand

Commit Message

Rodrigo Vivi April 26, 2023, 8:57 p.m. UTC
Let's continue to add our existing simple logs to devcoredump one
by one. Any format change should come in follow-up work.

Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       | 45 +++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  4 ++
 2 files changed, 49 insertions(+)

Comments

Matthew Brost May 2, 2023, 3:30 p.m. UTC | #1
On Wed, Apr 26, 2023 at 04:57:10PM -0400, Rodrigo Vivi wrote:
> Let's continue to add our existent simple logs to devcoredump one
> by one. Any format change should come on follow-up work.
> 
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       | 45 +++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  4 ++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 0e7ec654a9f2..1ffd12646a99 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -9,10 +9,13 @@
>  #include <linux/devcoredump.h>
>  #include <generated/utsrelease.h>
>  
> +#include "xe_device.h"
>  #include "xe_engine.h"
> +#include "xe_force_wake.h"
>  #include "xe_gt.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
> +#include "xe_hw_engine.h"
>  
>  /**
>   * DOC: Xe device coredump
> @@ -62,6 +65,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	struct drm_printer p;
>  	struct drm_print_iterator iter;
>  	struct timespec64 ts;
> +	struct xe_engine *e;
> +	struct xe_hw_engine *hwe;
> +	enum xe_hw_engine_id id;
>  
>  	/* Our device is gone already... */
>  	if (!data || !coredump_to_xe(coredump))
> @@ -75,6 +81,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	mutex_lock(&coredump->lock);
>  
>  	ss = &coredump->snapshot;
> +	e = coredump->faulty_engine;
>  	p = drm_coredump_printer(&iter);
>  
>  	drm_printf(&p, "**** Xe Device Coredump ****\n");
> @@ -92,6 +99,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
>  	xe_guc_engine_snapshot_print(coredump->snapshot.ge, &p);
>  
> +	drm_printf(&p, "\n**** HW Engines ****\n");
> +	for_each_hw_engine(hwe, e->gt, id)
> +		xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], &p);
> +
>  	mutex_unlock(&coredump->lock);
>  
>  	return count - iter.remain;
> @@ -100,6 +111,8 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  static void xe_devcoredump_free(void *data)
>  {
>  	struct xe_devcoredump *coredump = data;
> +	struct xe_hw_engine *hwe;
> +	enum xe_hw_engine_id id;
>  
>  	/* Our device is gone. Nothing to do... */
>  	if (!data || !coredump_to_xe(coredump))
> @@ -109,6 +122,8 @@ static void xe_devcoredump_free(void *data)
>  
>  	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>  	xe_guc_engine_snapshot_free(coredump->snapshot.ge);
> +	for_each_hw_engine(hwe, coredump->faulty_engine->gt, id)
> +		xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]);
>  
>  	coredump->faulty_engine = NULL;
>  	drm_info(&coredump_to_xe(coredump)->drm,
> @@ -122,13 +137,43 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump)
>  	struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
>  	struct xe_engine *e = coredump->faulty_engine;
>  	struct xe_guc *guc = engine_to_guc(e);
> +	struct xe_hw_engine *hwe;
> +	enum xe_hw_engine_id id;
> +	u32 adj_logical_mask = e->logical_mask;
> +	u32 width_mask = (0x1 << e->width) - 1;
> +	int i;
> +	bool cookie;
>  
>  	lockdep_assert_held(&coredump->lock);
>  	ss->snapshot_time = ktime_get_real();
>  	ss->boot_time = ktime_get_boottime();
>  
> +	cookie = dma_fence_begin_signalling();

Why the dma_fence signalling annotation here? Otherwise LGTM.

Matt

> +	for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
> +		if (adj_logical_mask & BIT(i)) {
> +			adj_logical_mask |= width_mask << i;
> +			i += e->width;
> +		} else {
> +			++i;
> +		}
> +	}
> +
> +	xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
> +
>  	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct);
>  	coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e);
> +
> +	for_each_hw_engine(hwe, e->gt, id) {
> +		if (hwe->class != e->hwe->class ||
> +		    !(BIT(hwe->logical_instance) & adj_logical_mask)) {
> +			coredump->snapshot.hwe[id] = NULL;
> +			continue;
> +		}
> +		coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
> +	}
> +
> +	xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
> +	dma_fence_end_signalling(cookie);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index e055b266af70..8b17ecf1b6e6 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -9,6 +9,8 @@
>  #include <linux/ktime.h>
>  #include <linux/mutex.h>
>  
> +#include "xe_hw_engine_types.h"
> +
>  struct xe_device;
>  
>  /**
> @@ -29,6 +31,8 @@ struct xe_devcoredump_snapshot {
>  	struct xe_guc_ct_snapshot *ct;
>  	/** @ge: Guc Engine snapshot */
>  	struct xe_guc_submit_engine_snapshot *ge;
> +	/** @hwe: HW Engine snapshot array */
> +	struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
>  };
>  
>  /**
> -- 
> 2.39.2
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 0e7ec654a9f2..1ffd12646a99 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -9,10 +9,13 @@ 
 #include <linux/devcoredump.h>
 #include <generated/utsrelease.h>
 
+#include "xe_device.h"
 #include "xe_engine.h"
+#include "xe_force_wake.h"
 #include "xe_gt.h"
 #include "xe_guc_ct.h"
 #include "xe_guc_submit.h"
+#include "xe_hw_engine.h"
 
 /**
  * DOC: Xe device coredump
@@ -62,6 +65,9 @@  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	struct drm_printer p;
 	struct drm_print_iterator iter;
 	struct timespec64 ts;
+	struct xe_engine *e;
+	struct xe_hw_engine *hwe;
+	enum xe_hw_engine_id id;
 
 	/* Our device is gone already... */
 	if (!data || !coredump_to_xe(coredump))
@@ -75,6 +81,7 @@  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	mutex_lock(&coredump->lock);
 
 	ss = &coredump->snapshot;
+	e = coredump->faulty_engine;
 	p = drm_coredump_printer(&iter);
 
 	drm_printf(&p, "**** Xe Device Coredump ****\n");
@@ -92,6 +99,10 @@  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
 	xe_guc_engine_snapshot_print(coredump->snapshot.ge, &p);
 
+	drm_printf(&p, "\n**** HW Engines ****\n");
+	for_each_hw_engine(hwe, e->gt, id)
+		xe_hw_engine_snapshot_print(coredump->snapshot.hwe[id], &p);
+
 	mutex_unlock(&coredump->lock);
 
 	return count - iter.remain;
@@ -100,6 +111,8 @@  static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 static void xe_devcoredump_free(void *data)
 {
 	struct xe_devcoredump *coredump = data;
+	struct xe_hw_engine *hwe;
+	enum xe_hw_engine_id id;
 
 	/* Our device is gone. Nothing to do... */
 	if (!data || !coredump_to_xe(coredump))
@@ -109,6 +122,8 @@  static void xe_devcoredump_free(void *data)
 
 	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
 	xe_guc_engine_snapshot_free(coredump->snapshot.ge);
+	for_each_hw_engine(hwe, coredump->faulty_engine->gt, id)
+		xe_hw_engine_snapshot_free(coredump->snapshot.hwe[id]);
 
 	coredump->faulty_engine = NULL;
 	drm_info(&coredump_to_xe(coredump)->drm,
@@ -122,13 +137,43 @@  static void devcoredump_snapshot(struct xe_devcoredump *coredump)
 	struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
 	struct xe_engine *e = coredump->faulty_engine;
 	struct xe_guc *guc = engine_to_guc(e);
+	struct xe_hw_engine *hwe;
+	enum xe_hw_engine_id id;
+	u32 adj_logical_mask = e->logical_mask;
+	u32 width_mask = (0x1 << e->width) - 1;
+	int i;
+	bool cookie;
 
 	lockdep_assert_held(&coredump->lock);
 	ss->snapshot_time = ktime_get_real();
 	ss->boot_time = ktime_get_boottime();
 
+	cookie = dma_fence_begin_signalling();
+	for (i = 0; e->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
+		if (adj_logical_mask & BIT(i)) {
+			adj_logical_mask |= width_mask << i;
+			i += e->width;
+		} else {
+			++i;
+		}
+	}
+
+	xe_force_wake_get(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
+
 	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct);
 	coredump->snapshot.ge = xe_guc_engine_snapshot_capture(e);
+
+	for_each_hw_engine(hwe, e->gt, id) {
+		if (hwe->class != e->hwe->class ||
+		    !(BIT(hwe->logical_instance) & adj_logical_mask)) {
+			coredump->snapshot.hwe[id] = NULL;
+			continue;
+		}
+		coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
+	}
+
+	xe_force_wake_put(gt_to_fw(e->gt), XE_FORCEWAKE_ALL);
+	dma_fence_end_signalling(cookie);
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index e055b266af70..8b17ecf1b6e6 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -9,6 +9,8 @@ 
 #include <linux/ktime.h>
 #include <linux/mutex.h>
 
+#include "xe_hw_engine_types.h"
+
 struct xe_device;
 
 /**
@@ -29,6 +31,8 @@  struct xe_devcoredump_snapshot {
 	struct xe_guc_ct_snapshot *ct;
 	/** @ge: Guc Engine snapshot */
 	struct xe_guc_submit_engine_snapshot *ge;
+	/** @hwe: HW Engine snapshot array */
+	struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
 };
 
 /**