diff mbox series

[v13,09/13] drm/i915/guc: Check sizing of guc_capture output

Message ID 20220321164527.2500062-10-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series Add GuC Error Capture Support | expand

Commit Message

Teres Alexis, Alan Previn March 21, 2022, 4:45 p.m. UTC
Add intel_guc_capture_output_min_size_est function to
provide a reasonable minimum size for error-capture
region before allocating the shared buffer.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 48 +++++++++++++++++++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h    |  1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  7 ++-
 3 files changed, 55 insertions(+), 1 deletion(-)

Comments

Lucas De Marchi March 22, 2022, 4:55 p.m. UTC | #1
On Mon, Mar 21, 2022 at 09:45:23AM -0700, Alan Previn wrote:
>Add intel_guc_capture_output_min_size_est function to
>provide a reasonable minimum size for error-capture
>region before allocating the shared buffer.
>
>Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
>Reviewed-by: Matthew Brost <matthew.brost@intel.com>
>---
> .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 48 +++++++++++++++++++
> .../gpu/drm/i915/gt/uc/intel_guc_capture.h    |  1 +
> drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  7 ++-
> 3 files changed, 55 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
>index 63ef407a2fd0..f87fee216430 100644
>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
>@@ -663,6 +663,54 @@ intel_guc_capture_getnullheader(struct intel_guc *guc,
> 	return 0;
> }
>
>+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3

missing blank line here

>+int
>+intel_guc_capture_output_min_size_est(struct intel_guc *guc)
>+{
>+	struct intel_gt *gt = guc_to_gt(guc);
>+	struct intel_engine_cs *engine;
>+	enum intel_engine_id id;
>+	int worst_min_size = 0, num_regs = 0;
>+	size_t tmp = 0;
>+
>+	/*
>+	 * If every single engine-instance suffered a failure in quick succession but
>+	 * were all unrelated, then a burst of multiple error-capture events would dump
>+	 * registers for every one engine instance, one at a time. In this case, GuC
>+	 * would even dump the global-registers repeatedly.
>+	 *
>+	 * For each engine instance, there would be 1 x guc_state_capture_group_t output
>+	 * followed by 3 x guc_state_capture_t lists. The latter is how the register
>+	 * dumps are split across different register types (where the '3' are global vs class
>+	 * vs instance). Finally, let's multiply the whole thing by 3x (just so we are
>+	 * not limited to just 1 round of data in a worst case full register dump log)
>+	 *
>+	 * NOTE: intel_guc_log that allocates the log buffer would round this size up to
>+	 * a power of two.
>+	 */
>+
>+	for_each_engine(engine, gt, id) {
>+		worst_min_size += sizeof(struct guc_state_capture_group_header_t) +
>+				  (3 * sizeof(struct guc_state_capture_header_t));
>+
>+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
>+			num_regs += tmp;
>+
>+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
>+						   engine->class, &tmp)) {
>+			num_regs += tmp;
>+		}
>+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
>+						   engine->class, &tmp)) {
>+			num_regs += tmp;
>+		}
>+	}
>+
>+	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
>+
>+	return (worst_min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER);
>+}
>+
> static void
> guc_capture_free_ads_cache(struct intel_guc_state_capture *gc)
> {
>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
>index 8de7704e12eb..540d72079462 100644
>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
>@@ -11,6 +11,7 @@
> struct guc_gt_system_info;
> struct intel_guc;
>
>+int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
> int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
> 			      void **outptr);
> int intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
>index fe4b2d3f305d..ed05b1a04f9c 100644
>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
>@@ -7,10 +7,11 @@
> #include <linux/string_helpers.h>
>
> #include "gt/intel_gt.h"
>+#include "intel_guc_capture.h"
>+#include "intel_guc_log.h"
> #include "i915_drv.h"
> #include "i915_irq.h"
> #include "i915_memcpy.h"
>-#include "intel_guc_log.h"

This seems to be an artifact of rebasing or you tried to sort the
includes... when sorting make sure you use LANG=C to avoid locale
differences wrt sorting

The previous placement of intel_guc_log.h was actually the correct one.
I'm squashing this and the previous blank line I mentioned as a fixup.

thanks
Lucas De Marchi

>
> static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
>
>@@ -466,6 +467,10 @@ int intel_guc_log_create(struct intel_guc_log *log)
> 	 *  |         Capture logs          |
> 	 *  +===============================+ + CAPTURE_SIZE
> 	 */
>+	if (intel_guc_capture_output_min_size_est(guc) > CAPTURE_BUFFER_SIZE)
>+		DRM_WARN("GuC log buffer for state_capture maybe too small. %d < %d\n",
>+			 CAPTURE_BUFFER_SIZE, intel_guc_capture_output_min_size_est(guc));
>+
> 	guc_log_size = PAGE_SIZE + CRASH_BUFFER_SIZE + DEBUG_BUFFER_SIZE +
> 		       CAPTURE_BUFFER_SIZE;
>
>-- 
>2.25.1
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 63ef407a2fd0..f87fee216430 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -663,6 +663,54 @@  intel_guc_capture_getnullheader(struct intel_guc *guc,
 	return 0;
 }
 
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+int
+intel_guc_capture_output_min_size_est(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int worst_min_size = 0, num_regs = 0;
+	size_t tmp = 0;
+
+	/*
+	 * If every single engine-instance suffered a failure in quick succession but
+	 * were all unrelated, then a burst of multiple error-capture events would dump
+	 * registers for every one engine instance, one at a time. In this case, GuC
+	 * would even dump the global-registers repeatedly.
+	 *
+	 * For each engine instance, there would be 1 x guc_state_capture_group_t output
+	 * followed by 3 x guc_state_capture_t lists. The latter is how the register
+	 * dumps are split across different register types (where the '3' are global vs class
+	 * vs instance). Finally, let's multiply the whole thing by 3x (just so we are
+	 * not limited to just 1 round of data in a worst case full register dump log)
+	 *
+	 * NOTE: intel_guc_log that allocates the log buffer would round this size up to
+	 * a power of two.
+	 */
+
+	for_each_engine(engine, gt, id) {
+		worst_min_size += sizeof(struct guc_state_capture_group_header_t) +
+				  (3 * sizeof(struct guc_state_capture_header_t));
+
+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
+			num_regs += tmp;
+
+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
+						   engine->class, &tmp)) {
+			num_regs += tmp;
+		}
+		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
+						   engine->class, &tmp)) {
+			num_regs += tmp;
+		}
+	}
+
+	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
+
+	return (worst_min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER);
+}
+
 static void
 guc_capture_free_ads_cache(struct intel_guc_state_capture *gc)
 {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
index 8de7704e12eb..540d72079462 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
@@ -11,6 +11,7 @@ 
 struct guc_gt_system_info;
 struct intel_guc;
 
+int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
 int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
 			      void **outptr);
 int intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index fe4b2d3f305d..ed05b1a04f9c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -7,10 +7,11 @@ 
 #include <linux/string_helpers.h>
 
 #include "gt/intel_gt.h"
+#include "intel_guc_capture.h"
+#include "intel_guc_log.h"
 #include "i915_drv.h"
 #include "i915_irq.h"
 #include "i915_memcpy.h"
-#include "intel_guc_log.h"
 
 static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
 
@@ -466,6 +467,10 @@  int intel_guc_log_create(struct intel_guc_log *log)
 	 *  |         Capture logs          |
 	 *  +===============================+ + CAPTURE_SIZE
 	 */
+	if (intel_guc_capture_output_min_size_est(guc) > CAPTURE_BUFFER_SIZE)
+		DRM_WARN("GuC log buffer for state_capture maybe too small. %d < %d\n",
+			 CAPTURE_BUFFER_SIZE, intel_guc_capture_output_min_size_est(guc));
+
 	guc_log_size = PAGE_SIZE + CRASH_BUFFER_SIZE + DEBUG_BUFFER_SIZE +
 		       CAPTURE_BUFFER_SIZE;