diff mbox series

[2/7] drm/i915/guc: Fix capture size warning and bump the size

Message ID 20220728022028.2190627-3-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Fixes and improvements to GuC logging and error capture | expand

Commit Message

John Harrison July 28, 2022, 2:20 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

There was a size check to warn if the GuC error state capture buffer
allocation would be too small to fit a reasonable amount of capture
data for the current platform. Unfortunately, the test was done too
early in the boot sequence and was actually testing 'if(-ENODEV >
size)'.

Move the check to be later. The check is only used to print a warning
message, so it doesn't really matter how early or late it is done.
Note that it is not possible to dynamically size the buffer because
the allocation needs to be done before the engine information is
available (at least, it would be in the intended two-phase GuC init
process).

Now that the check works, it is reporting size too small for newer
platforms. The check includes a 3x oversample multiplier to allow for
multiple error captures to be bufferd by GuC before i915 has a chance
to read them out. This is less important than simply being big enough
to fit the first capture.

So a) bump the default size to be large enough for one capture minimum
and b) make the warning only if one capture won't fit, instead use a
notice for the 3x size.

Note that the size estimate is a worst case scenario. Actual captures
will likely be smaller.

Lastly, use drm_warn istead of DRM_WARN as the former provides more
infmration and the latter is deprecated.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 40 ++++++++++++++-----
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h    |  1 -
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  4 --
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h    |  4 +-
 4 files changed, 31 insertions(+), 18 deletions(-)

Comments

Teres Alexis, Alan Previn Aug. 2, 2022, 5:46 p.m. UTC | #1
Straight forward change - LGTM.

Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com>


On Wed, 2022-07-27 at 19:20 -0700, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> There was a size check to warn if the GuC error state capture buffer
> allocation would be too small to fit a reasonable amount of capture
> data for the current platform. Unfortunately, the test was done too
> early in the boot sequence and was actually testing 'if(-ENODEV >
> size)'.
> 
> Move the check to be later. The check is only used to print a warning
> message, so it doesn't really matter how early or late it is done.
> Note that it is not possible to dynamically size the buffer because
> the allocation needs to be done before the engine information is
> available (at least, it would be in the intended two-phase GuC init
> process).
> 
> Now that the check works, it is reporting size too small for newer
> platforms. The check includes a 3x oversample multiplier to allow for
> multiple error captures to be bufferd by GuC before i915 has a chance
> to read them out. This is less important than simply being big enough
> to fit the first capture.
> 
> So a) bump the default size to be large enough for one capture minimum
> and b) make the warning only if one capture won't fit, instead use a
> notice for the 3x size.
> 
> Note that the size estimate is a worst case scenario. Actual captures
> will likely be smaller.
> 
> Lastly, use drm_warn istead of DRM_WARN as the former provides more
> infmration and the latter is deprecated.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>  .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 40 ++++++++++++++-----
>  .../gpu/drm/i915/gt/uc/intel_guc_capture.h    |  1 -
>  drivers/gpu/drm/i915/gt/uc/intel_guc_log.c    |  4 --
>  drivers/gpu/drm/i915/gt/uc/intel_guc_log.h    |  4 +-
>  4 files changed, 31 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index 75257bd20ff01..b54b7883320b1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> @@ -600,10 +600,8 @@ intel_guc_capture_getnullheader(struct intel_guc *guc,
>  	return 0;
>  }
>  
> -#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
> -
> -int
> -intel_guc_capture_output_min_size_est(struct intel_guc *guc)
> +static int
> +guc_capture_output_min_size_est(struct intel_guc *guc)
>  {
>  	struct intel_gt *gt = guc_to_gt(guc);
>  	struct intel_engine_cs *engine;
> @@ -623,13 +621,8 @@ intel_guc_capture_output_min_size_est(struct intel_guc *guc)
>  	 * For each engine instance, there would be 1 x guc_state_capture_group_t output
>  	 * followed by 3 x guc_state_capture_t lists. The latter is how the register
>  	 * dumps are split across different register types (where the '3' are global vs class
> -	 * vs instance). Finally, let's multiply the whole thing by 3x (just so we are
> -	 * not limited to just 1 round of data in a worst case full register dump log)
> -	 *
> -	 * NOTE: intel_guc_log that allocates the log buffer would round this size up to
> -	 * a power of two.
> +	 * vs instance).
>  	 */
> -
>  	for_each_engine(engine, gt, id) {
>  		worst_min_size += sizeof(struct guc_state_capture_group_header_t) +
>  					 (3 * sizeof(struct guc_state_capture_header_t));
> @@ -649,7 +642,30 @@ intel_guc_capture_output_min_size_est(struct intel_guc *guc)
>  
>  	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
>  
> -	return (worst_min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER);
> +	return worst_min_size;
> +}
> +
> +/*
> + * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
> + * before the i915 can read the data out and process it
> + */
> +#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
> +
> +static void check_guc_capture_size(struct intel_guc *guc)
> +{
> +	struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
> +	int min_size = guc_capture_output_min_size_est(guc);
> +	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
> +
> +	if (min_size < 0)
> +		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
> +			 min_size);
> +	else if (min_size > CAPTURE_BUFFER_SIZE)
> +		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
> +			 CAPTURE_BUFFER_SIZE, min_size);
> +	else if (spare_size > CAPTURE_BUFFER_SIZE)
> +		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
> +			   CAPTURE_BUFFER_SIZE, spare_size, min_size);
>  }
>  
>  /*
> @@ -1580,5 +1596,7 @@ int intel_guc_capture_init(struct intel_guc *guc)
>  	INIT_LIST_HEAD(&guc->capture->outlist);
>  	INIT_LIST_HEAD(&guc->capture->cachelist);
>  
> +	check_guc_capture_size(guc);
> +
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
> index d3d7bd0b6db64..fbd3713c7832d 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
> @@ -21,7 +21,6 @@ int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *m,
>  void intel_guc_capture_get_matching_node(struct intel_gt *gt, struct intel_engine_coredump *ee,
>  					 struct intel_context *ce);
>  void intel_guc_capture_process(struct intel_guc *guc);
> -int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
>  int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
>  			      void **outptr);
>  int intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> index 492bbf419d4df..991d4a02248dc 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> @@ -487,10 +487,6 @@ int intel_guc_log_create(struct intel_guc_log *log)
>  
>  	GEM_BUG_ON(log->vma);
>  
> -	if (intel_guc_capture_output_min_size_est(guc) > CAPTURE_BUFFER_SIZE)
> -		DRM_WARN("GuC log buffer for state_capture maybe too small. %d < %d\n",
> -			 CAPTURE_BUFFER_SIZE, intel_guc_capture_output_min_size_est(guc));
> -
>  	guc_log_size = intel_guc_log_size(log);
>  
>  	vma = intel_guc_allocate_vma(guc, guc_log_size);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
> index 18007e639be99..dc9715411d626 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
> @@ -22,11 +22,11 @@ struct intel_guc;
>  #elif defined(CONFIG_DRM_I915_DEBUG_GEM)
>  #define CRASH_BUFFER_SIZE	SZ_1M
>  #define DEBUG_BUFFER_SIZE	SZ_2M
> -#define CAPTURE_BUFFER_SIZE	SZ_1M
> +#define CAPTURE_BUFFER_SIZE	SZ_4M
>  #else
>  #define CRASH_BUFFER_SIZE	SZ_8K
>  #define DEBUG_BUFFER_SIZE	SZ_64K
> -#define CAPTURE_BUFFER_SIZE	SZ_16K
> +#define CAPTURE_BUFFER_SIZE	SZ_2M
>  #endif
>  
>  /*
> -- 
> 2.37.1
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 75257bd20ff01..b54b7883320b1 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -600,10 +600,8 @@  intel_guc_capture_getnullheader(struct intel_guc *guc,
 	return 0;
 }
 
-#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
-
-int
-intel_guc_capture_output_min_size_est(struct intel_guc *guc)
+static int
+guc_capture_output_min_size_est(struct intel_guc *guc)
 {
 	struct intel_gt *gt = guc_to_gt(guc);
 	struct intel_engine_cs *engine;
@@ -623,13 +621,8 @@  intel_guc_capture_output_min_size_est(struct intel_guc *guc)
 	 * For each engine instance, there would be 1 x guc_state_capture_group_t output
 	 * followed by 3 x guc_state_capture_t lists. The latter is how the register
 	 * dumps are split across different register types (where the '3' are global vs class
-	 * vs instance). Finally, let's multiply the whole thing by 3x (just so we are
-	 * not limited to just 1 round of data in a worst case full register dump log)
-	 *
-	 * NOTE: intel_guc_log that allocates the log buffer would round this size up to
-	 * a power of two.
+	 * vs instance).
 	 */
-
 	for_each_engine(engine, gt, id) {
 		worst_min_size += sizeof(struct guc_state_capture_group_header_t) +
 					 (3 * sizeof(struct guc_state_capture_header_t));
@@ -649,7 +642,30 @@  intel_guc_capture_output_min_size_est(struct intel_guc *guc)
 
 	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
 
-	return (worst_min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER);
+	return worst_min_size;
+}
+
+/*
+ * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
+ * before the i915 can read the data out and process it
+ */
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+
+static void check_guc_capture_size(struct intel_guc *guc)
+{
+	struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
+	int min_size = guc_capture_output_min_size_est(guc);
+	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
+
+	if (min_size < 0)
+		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
+			 min_size);
+	else if (min_size > CAPTURE_BUFFER_SIZE)
+		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
+			 CAPTURE_BUFFER_SIZE, min_size);
+	else if (spare_size > CAPTURE_BUFFER_SIZE)
+		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
+			   CAPTURE_BUFFER_SIZE, spare_size, min_size);
 }
 
 /*
@@ -1580,5 +1596,7 @@  int intel_guc_capture_init(struct intel_guc *guc)
 	INIT_LIST_HEAD(&guc->capture->outlist);
 	INIT_LIST_HEAD(&guc->capture->cachelist);
 
+	check_guc_capture_size(guc);
+
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
index d3d7bd0b6db64..fbd3713c7832d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
@@ -21,7 +21,6 @@  int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *m,
 void intel_guc_capture_get_matching_node(struct intel_gt *gt, struct intel_engine_coredump *ee,
 					 struct intel_context *ce);
 void intel_guc_capture_process(struct intel_guc *guc);
-int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
 int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
 			      void **outptr);
 int intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index 492bbf419d4df..991d4a02248dc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -487,10 +487,6 @@  int intel_guc_log_create(struct intel_guc_log *log)
 
 	GEM_BUG_ON(log->vma);
 
-	if (intel_guc_capture_output_min_size_est(guc) > CAPTURE_BUFFER_SIZE)
-		DRM_WARN("GuC log buffer for state_capture maybe too small. %d < %d\n",
-			 CAPTURE_BUFFER_SIZE, intel_guc_capture_output_min_size_est(guc));
-
 	guc_log_size = intel_guc_log_size(log);
 
 	vma = intel_guc_allocate_vma(guc, guc_log_size);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
index 18007e639be99..dc9715411d626 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
@@ -22,11 +22,11 @@  struct intel_guc;
 #elif defined(CONFIG_DRM_I915_DEBUG_GEM)
 #define CRASH_BUFFER_SIZE	SZ_1M
 #define DEBUG_BUFFER_SIZE	SZ_2M
-#define CAPTURE_BUFFER_SIZE	SZ_1M
+#define CAPTURE_BUFFER_SIZE	SZ_4M
 #else
 #define CRASH_BUFFER_SIZE	SZ_8K
 #define DEBUG_BUFFER_SIZE	SZ_64K
-#define CAPTURE_BUFFER_SIZE	SZ_16K
+#define CAPTURE_BUFFER_SIZE	SZ_2M
 #endif
 
 /*