drm/i915/ringbuffer: Delay after invalidating gen6+ xcs

Message ID 20180828170429.28492-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series drm/i915/ringbuffer: Delay after invalidating gen6+ xcs

Commit Message

Chris Wilson Aug. 28, 2018, 5:04 p.m. UTC
During stress testing of full-ppgtt (on Baytrail at least), we found
that the invalidation around a context/mm switch was insufficient (writes
would go astray). Adding a second MI_FLUSH_DW barrier prevents this, but
it is unclear whether this is merely a delaying tactic or whether it
truly serialises with the TLB invalidation. Either way, it is
empirically required.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107715
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 101 +++++++++++-------------
 1 file changed, 47 insertions(+), 54 deletions(-)
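
In essence, when invalidation is requested the helper now emits the
MI_FLUSH_DW sequence twice: first with the TLB-invalidate flags, then
again as a plain flush. A condensed sketch of the loop in the diff
below (error handling elided; emit_flush_dw() is shorthand for the
four dwords the patch emits on each pass):

	do {
		/* MI_FLUSH_DW + post-sync write to the HWS scratch page */
		emit_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);

		/* second pass: a plain flush chasing the invalidation */
		if (!(mode & EMIT_INVALIDATE))
			break;
		mode &= ~EMIT_INVALIDATE;
	} while (1);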

Comments

Ville Syrjälä Aug. 30, 2018, 3:45 p.m. UTC | #1
On Tue, Aug 28, 2018 at 06:04:29PM +0100, Chris Wilson wrote:
> During stress testing of full-ppgtt (on Baytrail at least), we found
> that the invalidation around a context/mm switch was insufficient (writes
> would go astray). Adding a second MI_FLUSH_DW barrier prevents this, but
> it is unclear whether this is merely a delaying tactic or whether it
> truly serialises with the TLB invalidation. Either way, it is
> empirically required.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107715
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Matthew Auld <matthew.william.auld@gmail.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 101 +++++++++++-------------
>  1 file changed, 47 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index d40f55a8dc34..952b6269bab0 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1944,40 +1944,62 @@ static void gen6_bsd_submit_request(struct i915_request *request)
>  	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>  }
>  
> -static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
> +static int emit_mi_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
>  {
>  	u32 cmd, *cs;
>  
> -	cs = intel_ring_begin(rq, 4);
> -	if (IS_ERR(cs))
> -		return PTR_ERR(cs);
> +	do {
> +		cs = intel_ring_begin(rq, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
>  
> -	cmd = MI_FLUSH_DW;
> +		cmd = MI_FLUSH_DW;
>  
> -	/* We always require a command barrier so that subsequent
> -	 * commands, such as breadcrumb interrupts, are strictly ordered
> -	 * wrt the contents of the write cache being flushed to memory
> -	 * (and thus being coherent from the CPU).
> -	 */
> -	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
> +		/*
> +		 * We always require a command barrier so that subsequent
> +		 * commands, such as breadcrumb interrupts, are strictly ordered
> +		 * wrt the contents of the write cache being flushed to memory
> +		 * (and thus being coherent from the CPU).
> +		 */
> +		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
>  
> -	/*
> -	 * Bspec vol 1c.5 - video engine command streamer:
> -	 * "If ENABLED, all TLBs will be invalidated once the flush
> -	 * operation is complete. This bit is only valid when the
> -	 * Post-Sync Operation field is a value of 1h or 3h."
> -	 */
> -	if (mode & EMIT_INVALIDATE)
> -		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
> +		/*
> +		 * Bspec vol 1c.3 - blitter engine command streamer:
> +		 * "If ENABLED, all TLBs will be invalidated once the flush
> +		 * operation is complete. This bit is only valid when the
> +		 * Post-Sync Operation field is a value of 1h or 3h."
> +		 */
> +		if (mode & EMIT_INVALIDATE)
> +			cmd |= invflags;
> +		*cs++ = cmd;
> +		*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
> +		*cs++ = 0;
> +		*cs++ = MI_NOOP;
> +		intel_ring_advance(rq, cs);
> +
> +		/*
> +		 * Not only do we need a full barrier (post-sync write) after
> +		 * invalidating the TLBs, but we need to wait a little bit
> +		 * longer. Whether this is merely delaying us, or the
> +		 * subsequent flush is a key part of serialising with the
> +		 * post-sync op, this extra pass appears vital before a
> +		 * mm switch!
> +		 */
> +		if (!(mode & EMIT_INVALIDATE))
> +			break;
> +
> +		mode &= ~EMIT_INVALIDATE;
> +	} while (1);

I find the loop thingy somewhat hard to read. I'd probably have
written it as something like

{
	if (mode & EMIT_INVALIDATE) {
		err = mi_flush_dw(rq, invflags);
		if (err)
			return err;
	}

	return mi_flush_dw(rq, 0);
}

with the dword emission pulled out into a little mi_flush_dw() helper.

Either way it seems to do what it says, so
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

Patch

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d40f55a8dc34..952b6269bab0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1944,40 +1944,62 @@ static void gen6_bsd_submit_request(struct i915_request *request)
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 }
 
-static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
+static int emit_mi_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
 {
 	u32 cmd, *cs;
 
-	cs = intel_ring_begin(rq, 4);
-	if (IS_ERR(cs))
-		return PTR_ERR(cs);
+	do {
+		cs = intel_ring_begin(rq, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
 
-	cmd = MI_FLUSH_DW;
+		cmd = MI_FLUSH_DW;
 
-	/* We always require a command barrier so that subsequent
-	 * commands, such as breadcrumb interrupts, are strictly ordered
-	 * wrt the contents of the write cache being flushed to memory
-	 * (and thus being coherent from the CPU).
-	 */
-	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
+		/*
+		 * We always require a command barrier so that subsequent
+		 * commands, such as breadcrumb interrupts, are strictly ordered
+		 * wrt the contents of the write cache being flushed to memory
+		 * (and thus being coherent from the CPU).
+		 */
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
 
-	/*
-	 * Bspec vol 1c.5 - video engine command streamer:
-	 * "If ENABLED, all TLBs will be invalidated once the flush
-	 * operation is complete. This bit is only valid when the
-	 * Post-Sync Operation field is a value of 1h or 3h."
-	 */
-	if (mode & EMIT_INVALIDATE)
-		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
+		/*
+		 * Bspec vol 1c.3 - blitter engine command streamer:
+		 * "If ENABLED, all TLBs will be invalidated once the flush
+		 * operation is complete. This bit is only valid when the
+		 * Post-Sync Operation field is a value of 1h or 3h."
+		 */
+		if (mode & EMIT_INVALIDATE)
+			cmd |= invflags;
+		*cs++ = cmd;
+		*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
+		*cs++ = 0;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(rq, cs);
+
+		/*
+		 * Not only do we need a full barrier (post-sync write) after
+		 * invalidating the TLBs, but we need to wait a little bit
+		 * longer. Whether this is merely delaying us, or the
+		 * subsequent flush is a key part of serialising with the
+		 * post-sync op, this extra pass appears vital before a
+		 * mm switch!
+		 */
+		if (!(mode & EMIT_INVALIDATE))
+			break;
+
+		mode &= ~EMIT_INVALIDATE;
+	} while (1);
 
-	*cs++ = cmd;
-	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
-	*cs++ = 0;
-	*cs++ = MI_NOOP;
-	intel_ring_advance(rq, cs);
 	return 0;
 }
 
+static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
+{
+	return emit_mi_flush_dw(rq, mode,
+				MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
+}
+
 static int
 hsw_emit_bb_start(struct i915_request *rq,
 		  u64 offset, u32 len,
@@ -2022,36 +2044,7 @@ gen6_emit_bb_start(struct i915_request *rq,
 
 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
 {
-	u32 cmd, *cs;
-
-	cs = intel_ring_begin(rq, 4);
-	if (IS_ERR(cs))
-		return PTR_ERR(cs);
-
-	cmd = MI_FLUSH_DW;
-
-	/* We always require a command barrier so that subsequent
-	 * commands, such as breadcrumb interrupts, are strictly ordered
-	 * wrt the contents of the write cache being flushed to memory
-	 * (and thus being coherent from the CPU).
-	 */
-	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
-
-	/*
-	 * Bspec vol 1c.3 - blitter engine command streamer:
-	 * "If ENABLED, all TLBs will be invalidated once the flush
-	 * operation is complete. This bit is only valid when the
-	 * Post-Sync Operation field is a value of 1h or 3h."
-	 */
-	if (mode & EMIT_INVALIDATE)
-		cmd |= MI_INVALIDATE_TLB;
-	*cs++ = cmd;
-	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
-	*cs++ = 0;
-	*cs++ = MI_NOOP;
-	intel_ring_advance(rq, cs);
-
-	return 0;
+	return emit_mi_flush_dw(rq, mode, MI_INVALIDATE_TLB);
 }
 
 static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv,