diff mbox series

drm/i915: Remove unnecessary memory quiescing for aux inval

Message ID 20230920111131.2696-1-nirmoy.das@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Remove unnecessary memory quiescing for aux inval | expand

Commit Message

Das, Nirmoy Sept. 20, 2023, 11:11 a.m. UTC
i915 already does memory quiesce before signaling
breadcrumb so remove extra memory quiescing for aux
invalidation which can cause unnecessary side effects.

Fixes: 78a6ccd65fa3 ("drm/i915/gt: Ensure memory quiesced before invalidation")
Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
Cc: Andi Shyti <andi.shyti@linux.intel.com>
Cc: <stable@vger.kernel.org> # v5.8+
Cc: Andrzej Hajda <andrzej.hajda@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Tejas Upadhyay <tejas.upadhyay@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Cc: Tapani Pälli <tapani.palli@intel.com>
Cc: Mark Janes <mark.janes@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 50 ++++++++++++------------
 1 file changed, 26 insertions(+), 24 deletions(-)

Comments

Tapani Pälli Sept. 20, 2023, 12:36 p.m. UTC | #1
I tested this first against tests that were failing for Mesa and it 
fixes all of the regressed cases (TGL LP). Also did a full run of all 
KHR-GLES31* and KHR-GLES32* test groups in the Khronos CTS suite, no 
regressions observed.

Tested-by: Tapani Pälli <tapani.palli@intel.com>


On 20.9.2023 14.11, Nirmoy Das wrote:
> i915 already does memory quiesce before signaling
> breadcrumb so remove extra memory quiescing for aux
> invalidation which can cause unnecessary side effects.
>
> Fixes: 78a6ccd65fa3 ("drm/i915/gt: Ensure memory quiesced before invalidation")
> Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
> Cc: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: <stable@vger.kernel.org> # v5.8+
> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Cc: Tejas Upadhyay <tejas.upadhyay@intel.com>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> Cc: Tapani Pälli <tapani.palli@intel.com>
> Cc: Mark Janes <mark.janes@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 50 ++++++++++++------------
>   1 file changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index 0143445dba83..5001670046a0 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -248,11 +248,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
>   {
>   	struct intel_engine_cs *engine = rq->engine;
>   
> -	/*
> -	 * On Aux CCS platforms the invalidation of the Aux
> -	 * table requires quiescing memory traffic beforehand
> -	 */
> -	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
> +	if (mode & EMIT_FLUSH) {
>   		u32 bit_group_0 = 0;
>   		u32 bit_group_1 = 0;
>   		int err;
> @@ -264,13 +260,6 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
>   
>   		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
>   
> -		/*
> -		 * When required, in MTL and beyond platforms we
> -		 * need to set the CCS_FLUSH bit in the pipe control
> -		 */
> -		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
> -			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
> -
>   		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
>   		bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
>   		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
> @@ -800,14 +789,15 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
>   {
>   	struct drm_i915_private *i915 = rq->i915;
>   	struct intel_gt *gt = rq->engine->gt;
> -	u32 flags = (PIPE_CONTROL_CS_STALL |
> -		     PIPE_CONTROL_TLB_INVALIDATE |
> -		     PIPE_CONTROL_TILE_CACHE_FLUSH |
> -		     PIPE_CONTROL_FLUSH_L3 |
> -		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> -		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> -		     PIPE_CONTROL_DC_FLUSH_ENABLE |
> -		     PIPE_CONTROL_FLUSH_ENABLE);
> +	u32 bit_group_0 = PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
> +	u32 bit_group_1 = (PIPE_CONTROL_CS_STALL |
> +			   PIPE_CONTROL_TLB_INVALIDATE |
> +			   PIPE_CONTROL_TILE_CACHE_FLUSH |
> +			   PIPE_CONTROL_FLUSH_L3 |
> +			   PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +			   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +			   PIPE_CONTROL_DC_FLUSH_ENABLE |
> +			   PIPE_CONTROL_FLUSH_ENABLE);
>   
>   	/* Wa_14016712196 */
>   	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915))
> @@ -817,14 +807,26 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
>   
>   	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
>   		/* Wa_1409600907 */
> -		flags |= PIPE_CONTROL_DEPTH_STALL;
> +		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
>   
>   	if (!HAS_3D_PIPELINE(rq->i915))
> -		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
> +		bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
>   	else if (rq->engine->class == COMPUTE_CLASS)
> -		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
> +		bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
> +
> +	/*
> +	 * On Aux CCS platforms the invalidation of the Aux
> +	 * table requires quiescing memory traffic beforehand.
> +	 * gen12_emit_fini_breadcrumb_rcs() does this for us as
> +	 * all memory traffic has to be completed before we signal
> +	 * the breadcrumb. In MTL and beyond platforms, we need to also
> +	 * add CCS_FLUSH bit in the pipe control for that purpose.
> +	 */
> +
> +	if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
> +		bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
>   
> -	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);
> +	cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1, 0);
>   
>   	/*XXX: Look at gen8_emit_fini_breadcrumb_rcs */
>   	cs = gen12_emit_ggtt_write_rcs(cs,
Matt Roper Sept. 20, 2023, 5:28 p.m. UTC | #2
On Wed, Sep 20, 2023 at 01:11:31PM +0200, Nirmoy Das wrote:
> i915 already does memory quiesce before signaling
> breadcrumb so remove extra memory quiescing for aux
> invalidation which can cause unnecessary side effects.

This explanation seems confusing to me.  If we've already performed the
necessary flushing and quiesced all cache<->memory traffic, then
performing another flush should just be a noop, right?  If we're seeing
side effects, then doesn't that imply that there was still something in
the cache that hadn't made its way to memory yet?

Is the breadcrumb code flushing all the necessary bits?  What about
PIPE_CONTROL_CCS_FLUSH?


Matt

> 
> Fixes: 78a6ccd65fa3 ("drm/i915/gt: Ensure memory quiesced before invalidation")
> Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
> Cc: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: <stable@vger.kernel.org> # v5.8+
> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Cc: Tejas Upadhyay <tejas.upadhyay@intel.com>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
> Cc: Tapani Pälli <tapani.palli@intel.com>
> Cc: Mark Janes <mark.janes@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 50 ++++++++++++------------
>  1 file changed, 26 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index 0143445dba83..5001670046a0 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -248,11 +248,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
>  {
>  	struct intel_engine_cs *engine = rq->engine;
>  
> -	/*
> -	 * On Aux CCS platforms the invalidation of the Aux
> -	 * table requires quiescing memory traffic beforehand
> -	 */
> -	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
> +	if (mode & EMIT_FLUSH) {
>  		u32 bit_group_0 = 0;
>  		u32 bit_group_1 = 0;
>  		int err;
> @@ -264,13 +260,6 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
>  
>  		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
>  
> -		/*
> -		 * When required, in MTL and beyond platforms we
> -		 * need to set the CCS_FLUSH bit in the pipe control
> -		 */
> -		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
> -			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
> -
>  		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
>  		bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
>  		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
> @@ -800,14 +789,15 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
>  {
>  	struct drm_i915_private *i915 = rq->i915;
>  	struct intel_gt *gt = rq->engine->gt;
> -	u32 flags = (PIPE_CONTROL_CS_STALL |
> -		     PIPE_CONTROL_TLB_INVALIDATE |
> -		     PIPE_CONTROL_TILE_CACHE_FLUSH |
> -		     PIPE_CONTROL_FLUSH_L3 |
> -		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> -		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> -		     PIPE_CONTROL_DC_FLUSH_ENABLE |
> -		     PIPE_CONTROL_FLUSH_ENABLE);
> +	u32 bit_group_0 = PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
> +	u32 bit_group_1 = (PIPE_CONTROL_CS_STALL |
> +			   PIPE_CONTROL_TLB_INVALIDATE |
> +			   PIPE_CONTROL_TILE_CACHE_FLUSH |
> +			   PIPE_CONTROL_FLUSH_L3 |
> +			   PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +			   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +			   PIPE_CONTROL_DC_FLUSH_ENABLE |
> +			   PIPE_CONTROL_FLUSH_ENABLE);
>  
>  	/* Wa_14016712196 */
>  	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915))
> @@ -817,14 +807,26 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
>  
>  	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
>  		/* Wa_1409600907 */
> -		flags |= PIPE_CONTROL_DEPTH_STALL;
> +		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
>  
>  	if (!HAS_3D_PIPELINE(rq->i915))
> -		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
> +		bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
>  	else if (rq->engine->class == COMPUTE_CLASS)
> -		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
> +		bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
> +
> +	/*
> +	 * On Aux CCS platforms the invalidation of the Aux
> +	 * table requires quiescing memory traffic beforehand.
> +	 * gen12_emit_fini_breadcrumb_rcs() does this for us as
> +	 * all memory traffic has to be completed before we signal
> +	 * the breadcrumb. In MTL and beyond platforms, we need to also
> +	 * add CCS_FLUSH bit in the pipe control for that purpose.
> +	 */
> +
> +	if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
> +		bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
>  
> -	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);
> +	cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1, 0);
>  
>  	/*XXX: Look at gen8_emit_fini_breadcrumb_rcs */
>  	cs = gen12_emit_ggtt_write_rcs(cs,
> -- 
> 2.41.0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 0143445dba83..5001670046a0 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -248,11 +248,7 @@  int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
 {
 	struct intel_engine_cs *engine = rq->engine;
 
-	/*
-	 * On Aux CCS platforms the invalidation of the Aux
-	 * table requires quiescing memory traffic beforehand
-	 */
-	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
+	if (mode & EMIT_FLUSH) {
 		u32 bit_group_0 = 0;
 		u32 bit_group_1 = 0;
 		int err;
@@ -264,13 +260,6 @@  int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
 
 		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
 
-		/*
-		 * When required, in MTL and beyond platforms we
-		 * need to set the CCS_FLUSH bit in the pipe control
-		 */
-		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
-			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
-
 		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
 		bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
 		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
@@ -800,14 +789,15 @@  u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	struct drm_i915_private *i915 = rq->i915;
 	struct intel_gt *gt = rq->engine->gt;
-	u32 flags = (PIPE_CONTROL_CS_STALL |
-		     PIPE_CONTROL_TLB_INVALIDATE |
-		     PIPE_CONTROL_TILE_CACHE_FLUSH |
-		     PIPE_CONTROL_FLUSH_L3 |
-		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
-		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-		     PIPE_CONTROL_DC_FLUSH_ENABLE |
-		     PIPE_CONTROL_FLUSH_ENABLE);
+	u32 bit_group_0 = PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
+	u32 bit_group_1 = (PIPE_CONTROL_CS_STALL |
+			   PIPE_CONTROL_TLB_INVALIDATE |
+			   PIPE_CONTROL_TILE_CACHE_FLUSH |
+			   PIPE_CONTROL_FLUSH_L3 |
+			   PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+			   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+			   PIPE_CONTROL_DC_FLUSH_ENABLE |
+			   PIPE_CONTROL_FLUSH_ENABLE);
 
 	/* Wa_14016712196 */
 	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915))
@@ -817,14 +807,26 @@  u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 
 	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
 		/* Wa_1409600907 */
-		flags |= PIPE_CONTROL_DEPTH_STALL;
+		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
 
 	if (!HAS_3D_PIPELINE(rq->i915))
-		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
+		bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
 	else if (rq->engine->class == COMPUTE_CLASS)
-		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
+		bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
+
+	/*
+	 * On Aux CCS platforms the invalidation of the Aux
+	 * table requires quiescing memory traffic beforehand.
+	 * gen12_emit_fini_breadcrumb_rcs() does this for us as
+	 * all memory traffic has to be completed before we signal
+	 * the breadcrumb. In MTL and beyond platforms, we need to also
+	 * add CCS_FLUSH bit in the pipe control for that purpose.
+	 */
+
+	if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
+		bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
 
-	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);
+	cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1, 0);
 
 	/*XXX: Look at gen8_emit_fini_breadcrumb_rcs */
 	cs = gen12_emit_ggtt_write_rcs(cs,