diff mbox series

[v3,1/1] drm/i915/guc: Fix GuC error capture sizing estimation and reporting

Message ID 20221006053129.663793-2-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series Fix Guc-Err-Capture sizing warning | expand

Commit Message

Teres Alexis, Alan Previn Oct. 6, 2022, 5:31 a.m. UTC
During GuC error capture initialization, we estimate the amount of size
we need for the error-capture-region of the shared GuC-log-buffer.
This calculation was incorrect so fix that. With the fixed calculation
we can reduce the allocation of error-capture region from 4MB to 1MB
(see note2 below for reasoning). Additionally, switch from drm_notice to
drm_debug for the 3X spare size check since that would be impossible to
hit without redesigning gpu_coredump framework to hold multiple captures.

NOTE1: Even for 1x the min size estimation case, actually running out
of space is a corner case because it can only occur if all engine
instances get reset all at once and i915 isn't able extract the capture
data fast enough within G2H handler worker.

NOTE2: With the corrected calculation, a DG2 part required ~77K and a PVC
required ~115K (1X min-est-size that is calculated as one-shot all-engine-
reset scenario).

Fixes d7c15d76a5547: drm/i915/guc: Check sizing of guc_capture output

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 29 ++++++++++++-------
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  6 ++--
 2 files changed, 21 insertions(+), 14 deletions(-)

Comments

John Harrison Oct. 6, 2022, 6:39 p.m. UTC | #1
On 10/5/2022 22:31, Alan Previn wrote:
> During GuC error capture initialization, we estimate the amount of size
> we need for the error-capture-region of the shared GuC-log-buffer.
> This calculation was incorrect so fix that. With the fixed calculation
> we can reduce the allocation of error-capture region from 4MB to 1MB
> (see note2 below for reasoning). Additionally, switch from drm_notice to
> drm_debug for the 3X spare size check since that would be impossible to
> hit without redesigning gpu_coredump framework to hold multiple captures.
>
> NOTE1: Even for 1x the min size estimation case, actually running out
> of space is a corner case because it can only occur if all engine
> instances get reset all at once and i915 isn't able extract the capture
> data fast enough within G2H handler worker.
The 'fast enough handler' issue is about multiple captures not a single 
capture. Whereas min_size is the largest possible single capture size.

>
> NOTE2: With the corrected calculation, a DG2 part required ~77K and a PVC
> required ~115K (1X min-est-size that is calculated as one-shot all-engine-
> reset scenario).
>
> Fixes d7c15d76a5547: drm/i915/guc: Check sizing of guc_capture output
>
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>   .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 29 ++++++++++++-------
>   drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  6 ++--
>   2 files changed, 21 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index 8f1165146013..9fa76f384abc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> @@ -502,8 +502,9 @@ intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 cl
>   	if (!num_regs)
>   		return -ENODATA;
>   
> -	*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> -			   (num_regs * sizeof(struct guc_mmio_reg)));
> +	if (size)
> +		*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> +				   (num_regs * sizeof(struct guc_mmio_reg)));
>   
>   	return 0;
>   }
> @@ -606,7 +607,7 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
> -	int worst_min_size = 0, num_regs = 0;
> +	int worst_min_size = 0;
>   	size_t tmp = 0;
>   
>   	if (!guc->capture)
> @@ -628,20 +629,18 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   					 (3 * sizeof(struct guc_state_capture_header_t));
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   	}
>   
> -	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
> -
>   	return worst_min_size;
>   }
>   
> @@ -658,15 +657,23 @@ static void check_guc_capture_size(struct intel_guc *guc)
>   	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
>   	u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
>   
> +	/*
> +	 * NOTE: min_size is much smaller than the capture region allocation (DG2: <80K vs 1MB)
> +	 * Additionally, its based on space needed to fit all engines getting reset at once
> +	 * within the same G2H handler task slot. This is very unlikely. However, if GuC really
> +	 * does run out of space for whatever reason, we will see an separate warning message
> +	 * when processing the G2H event capture-notification, search for:
> +	 * INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
> +	 */
>   	if (min_size < 0)
>   		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
>   			 min_size);
>   	else if (min_size > buffer_size)
> -		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
> +		drm_warn(&i915->drm, "GuC error state capture buffer maybe small: %d < %d\n",
>   			 buffer_size, min_size);
>   	else if (spare_size > buffer_size)
> -		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
> -			   buffer_size, spare_size, min_size);
> +		drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
> +			buffer_size, spare_size, min_size);
>   }
>   
>   /*
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> index 55d3ef93e86f..68331c538b0a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> @@ -16,15 +16,15 @@
>   #if defined(CONFIG_DRM_I915_DEBUG_GUC)
>   #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_2M
>   #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_16M
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_4M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
>   #elif defined(CONFIG_DRM_I915_DEBUG_GEM)
>   #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_1M
>   #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_2M
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_4M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
>   #else
>   #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_8K
>   #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_64K
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_2M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
>   #endif
I would have pulled these outside the if/else ladder not that they are 
all the same. But doesn't really matter.

Reviewed-by: John Harrison <John.C.Harrison@Intel.com>

>   
>   static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 8f1165146013..9fa76f384abc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -502,8 +502,9 @@  intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 cl
 	if (!num_regs)
 		return -ENODATA;
 
-	*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
-			   (num_regs * sizeof(struct guc_mmio_reg)));
+	if (size)
+		*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
+				   (num_regs * sizeof(struct guc_mmio_reg)));
 
 	return 0;
 }
@@ -606,7 +607,7 @@  guc_capture_output_min_size_est(struct intel_guc *guc)
 	struct intel_gt *gt = guc_to_gt(guc);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	int worst_min_size = 0, num_regs = 0;
+	int worst_min_size = 0;
 	size_t tmp = 0;
 
 	if (!guc->capture)
@@ -628,20 +629,18 @@  guc_capture_output_min_size_est(struct intel_guc *guc)
 					 (3 * sizeof(struct guc_state_capture_header_t));
 
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
-			num_regs += tmp;
+			worst_min_size += tmp;
 
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
 						   engine->class, &tmp)) {
-			num_regs += tmp;
+			worst_min_size += tmp;
 		}
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
 						   engine->class, &tmp)) {
-			num_regs += tmp;
+			worst_min_size += tmp;
 		}
 	}
 
-	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
-
 	return worst_min_size;
 }
 
@@ -658,15 +657,23 @@  static void check_guc_capture_size(struct intel_guc *guc)
 	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
 	u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
 
+	/*
+	 * NOTE: min_size is much smaller than the capture region allocation (DG2: <80K vs 1MB)
+	 * Additionally, its based on space needed to fit all engines getting reset at once
+	 * within the same G2H handler task slot. This is very unlikely. However, if GuC really
+	 * does run out of space for whatever reason, we will see an separate warning message
+	 * when processing the G2H event capture-notification, search for:
+	 * INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
+	 */
 	if (min_size < 0)
 		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
 			 min_size);
 	else if (min_size > buffer_size)
-		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
+		drm_warn(&i915->drm, "GuC error state capture buffer maybe small: %d < %d\n",
 			 buffer_size, min_size);
 	else if (spare_size > buffer_size)
-		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
-			   buffer_size, spare_size, min_size);
+		drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
+			buffer_size, spare_size, min_size);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index 55d3ef93e86f..68331c538b0a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -16,15 +16,15 @@ 
 #if defined(CONFIG_DRM_I915_DEBUG_GUC)
 #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_2M
 #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_16M
-#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_4M
+#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
 #elif defined(CONFIG_DRM_I915_DEBUG_GEM)
 #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_1M
 #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_2M
-#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_4M
+#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
 #else
 #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE	SZ_8K
 #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE	SZ_64K
-#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_2M
+#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE	SZ_1M
 #endif
 
 static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);