diff mbox series

[3/9] drm/i915/ringbuffer: Pull the render flush into breadcrumb emission

Message ID 20181219145747.19835-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/9] drm/i915: Remove HW semaphores for gen7 inter-engine synchronisation | expand

Commit Message

Chris Wilson Dec. 19, 2018, 2:57 p.m. UTC
In preparation for removing the manual EMIT_FLUSH prior to emitting the
breadcrumb implement the flush inline with writing the breadcrumb for
ringbuffer emission.

With a combined flush+breadcrumb, we can use a single operation to both
flush and after the flush is complete (post-sync) write the breadcrumb.
This gives us a strongly ordered operation that should be sufficient to
serialise the write before we emit the interrupt; and therefore we may
take the opportunity to remove the irq_seqno_barrier w/a for gen6+.
Although using the PIPECONTROL to write the breadcrumb is slower than
MI_STORE_DWORD_IMM, by combining the operations into one and removing the
extra flush (next patch) it is faster.

For gen2-5, we simply combine the MI_FLUSH into the breadcrumb emission,
though maybe we could find a solution here to the seqno-vs-interrupt
issue on Ironlake by mixing up the flush? The answer is no, adding an
MI_FLUSH before the interrupt is insufficient.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 104 ++++++++++++++++++++++--
 1 file changed, 97 insertions(+), 7 deletions(-)

Comments

Mika Kuoppala Dec. 28, 2018, 12:03 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> In preparation for removing the manual EMIT_FLUSH prior to emitting the
> breadcrumb implement the flush inline with writing the breadcrumb for
> ringbuffer emission.
>
> With a combined flush+breadcrumb, we can use a single operation to both
> flush and after the flush is complete (post-sync) write the breadcrumb.
> This gives us a strongly ordered operation that should be sufficient to
> serialise the write before we emit the interrupt; and therefore we may
> take the opportunity to remove the irq_seqno_barrier w/a for gen6+.
> Although using the PIPECONTROL to write the breadcrumb is slower than
> MI_STORE_DWORD_IMM, by combining the operations into one and removing the
> extra flush (next patch) it is faster
>
> For gen2-5, we simply combine the MI_FLUSH into the breadcrumb emission,
> though maybe we could find a solution here to the seqno-vs-interrupt
> issue on Ironlake by mixing up the flush? The answer is no, adding an
> MI_FLUSH before the interrupt is insufficient.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 104 ++++++++++++++++++++++--
>  1 file changed, 97 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index d5a9edbcf1be..169c54995ca9 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -217,7 +217,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
>   * really our business.  That leaves only stall at scoreboard.
>   */
>  static int
> -intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
> +gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
>  {
>  	u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
>  	u32 *cs;
> @@ -257,7 +257,7 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
>  	int ret;
>  
>  	/* Force SNB workarounds for PIPE_CONTROL flushes */
> -	ret = intel_emit_post_sync_nonzero_flush(rq);
> +	ret = gen6_emit_post_sync_nonzero_flush(rq);
>  	if (ret)
>  		return ret;
>  
> @@ -300,6 +300,37 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
>  	return 0;
>  }
>  
> +static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +{
> +	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_QW_WRITE;
> +	*cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
> +	*cs++ = 0;
> +
> +	/* Finally we can flush and with it emit the breadcrumb */
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +		 PIPE_CONTROL_DC_FLUSH_ENABLE |
> +		 PIPE_CONTROL_QW_WRITE |
> +		 PIPE_CONTROL_CS_STALL);
> +	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
> +	*cs++ = rq->global_seqno;
> +
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +	assert_ring_tail_valid(rq->ring, rq->tail);
> +}
> +static const int gen6_rcs_emit_breadcrumb_sz = 14;
> +
>  static int
>  gen7_render_ring_cs_stall_wa(struct i915_request *rq)
>  {
> @@ -379,6 +410,39 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
>  	return 0;
>  }
>  
> +static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +{
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +		 PIPE_CONTROL_DC_FLUSH_ENABLE |
> +		 PIPE_CONTROL_FLUSH_ENABLE |
> +		 PIPE_CONTROL_QW_WRITE |
> +		 PIPE_CONTROL_GLOBAL_GTT_IVB |
> +		 PIPE_CONTROL_CS_STALL);

There is a faint scent of engine->flush_flags in the air but
I think you favour opencoding everything behind these
indirect calls.

But we can take a second look later, with gen11 flush
improvements.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +	*cs++ = intel_hws_seqno_address(rq->engine);
> +	*cs++ = rq->global_seqno;
> +
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +	assert_ring_tail_valid(rq->ring, rq->tail);
> +}
> +static const int gen7_rcs_emit_breadcrumb_sz = 6;
> +
> +static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +{
> +	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
> +	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
> +	*cs++ = rq->global_seqno;
> +	*cs++ = MI_USER_INTERRUPT;
> +
> +	rq->tail = intel_ring_offset(rq, cs);
> +	assert_ring_tail_valid(rq->ring, rq->tail);
> +}
> +static const int gen6_xcs_emit_breadcrumb_sz = 4;
> +
>  static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
>  {
>  	/*
> @@ -777,16 +841,20 @@ static void i9xx_submit_request(struct i915_request *request)
>  
>  static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
> +	*cs++ = MI_FLUSH;
> +
>  	*cs++ = MI_STORE_DWORD_INDEX;
>  	*cs++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
>  	*cs++ = rq->global_seqno;
> +
>  	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
>  }
>  
> -static const int i9xx_emit_breadcrumb_sz = 4;
> +static const int i9xx_emit_breadcrumb_sz = 6;
>  
>  static void
>  gen5_seqno_barrier(struct intel_engine_cs *engine)
> @@ -2050,7 +2118,6 @@ static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
>  	if (INTEL_GEN(dev_priv) >= 6) {
>  		engine->irq_enable = gen6_irq_enable;
>  		engine->irq_disable = gen6_irq_disable;
> -		engine->irq_seqno_barrier = gen6_seqno_barrier;
>  	} else if (INTEL_GEN(dev_priv) >= 5) {
>  		engine->irq_enable = gen5_irq_enable;
>  		engine->irq_disable = gen5_irq_disable;
> @@ -2122,11 +2189,18 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
>  
>  	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
>  
> -	if (INTEL_GEN(dev_priv) >= 6) {
> +	if (INTEL_GEN(dev_priv) >= 7) {
>  		engine->init_context = intel_rcs_ctx_init;
>  		engine->emit_flush = gen7_render_ring_flush;
> -		if (IS_GEN(dev_priv, 6))
> -			engine->emit_flush = gen6_render_ring_flush;
> +		engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb;
> +		engine->emit_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
> +		engine->irq_seqno_barrier = gen6_seqno_barrier;
> +	} else if (IS_GEN(dev_priv, 6)) {
> +		engine->init_context = intel_rcs_ctx_init;
> +		engine->emit_flush = gen6_render_ring_flush;
> +		engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb;
> +		engine->emit_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
> +		engine->irq_seqno_barrier = gen6_seqno_barrier;
>  	} else if (IS_GEN(dev_priv, 5)) {
>  		engine->emit_flush = gen4_render_ring_flush;
>  	} else {
> @@ -2161,6 +2235,10 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
>  			engine->set_default_submission = gen6_bsd_set_default_submission;
>  		engine->emit_flush = gen6_bsd_ring_flush;
>  		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
> +
> +		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> +		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> +		engine->irq_seqno_barrier = gen6_seqno_barrier;
>  	} else {
>  		engine->emit_flush = bsd_ring_flush;
>  		if (IS_GEN(dev_priv, 5))
> @@ -2176,11 +2254,17 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_private *dev_priv = engine->i915;
>  
> +	GEM_BUG_ON(INTEL_GEN(dev_priv) < 6);
> +
>  	intel_ring_default_vfuncs(dev_priv, engine);
>  
>  	engine->emit_flush = gen6_ring_flush;
>  	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
>  
> +	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> +	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> +	engine->irq_seqno_barrier = gen6_seqno_barrier;
> +
>  	return intel_init_ring_buffer(engine);
>  }
>  
> @@ -2188,6 +2272,8 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_private *dev_priv = engine->i915;
>  
> +	GEM_BUG_ON(INTEL_GEN(dev_priv) < 7);
> +
>  	intel_ring_default_vfuncs(dev_priv, engine);
>  
>  	engine->emit_flush = gen6_ring_flush;
> @@ -2195,5 +2281,9 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
>  	engine->irq_enable = hsw_vebox_irq_enable;
>  	engine->irq_disable = hsw_vebox_irq_disable;
>  
> +	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> +	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> +	engine->irq_seqno_barrier = gen6_seqno_barrier;
> +
>  	return intel_init_ring_buffer(engine);
>  }
> -- 
> 2.20.0
Chris Wilson Dec. 28, 2018, 3:11 p.m. UTC | #2
Quoting Mika Kuoppala (2018-12-28 12:03:26)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > In preparation for removing the manual EMIT_FLUSH prior to emitting the
> > breadcrumb implement the flush inline with writing the breadcrumb for
> > ringbuffer emission.
> >
> > With a combined flush+breadcrumb, we can use a single operation to both
> > flush and after the flush is complete (post-sync) write the breadcrumb.
> > This gives us a strongly ordered operation that should be sufficient to
> > serialise the write before we emit the interrupt; and therefore we may
> > take the opportunity to remove the irq_seqno_barrier w/a for gen6+.
> > Although using the PIPECONTROL to write the breadcrumb is slower than
> > MI_STORE_DWORD_IMM, by combining the operations into one and removing the
> > extra flush (next patch) it is faster
> >
> > For gen2-5, we simply combine the MI_FLUSH into the breadcrumb emission,
> > though maybe we could find a solution here to the seqno-vs-interrupt
> > issue on Ironlake by mixing up the flush? The answer is no, adding an
> > MI_FLUSH before the interrupt is insufficient.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/intel_ringbuffer.c | 104 ++++++++++++++++++++++--
> >  1 file changed, 97 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index d5a9edbcf1be..169c54995ca9 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -217,7 +217,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
> >   * really our business.  That leaves only stall at scoreboard.
> >   */
> >  static int
> > -intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
> > +gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
> >  {
> >       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
> >       u32 *cs;
> > @@ -257,7 +257,7 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
> >       int ret;
> >  
> >       /* Force SNB workarounds for PIPE_CONTROL flushes */
> > -     ret = intel_emit_post_sync_nonzero_flush(rq);
> > +     ret = gen6_emit_post_sync_nonzero_flush(rq);
> >       if (ret)
> >               return ret;
> >  
> > @@ -300,6 +300,37 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
> >       return 0;
> >  }
> >  
> > +static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> > +{
> > +     /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = PIPE_CONTROL_QW_WRITE;
> > +     *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
> > +     *cs++ = 0;
> > +
> > +     /* Finally we can flush and with it emit the breadcrumb */
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > +              PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +              PIPE_CONTROL_DC_FLUSH_ENABLE |
> > +              PIPE_CONTROL_QW_WRITE |
> > +              PIPE_CONTROL_CS_STALL);
> > +     *cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
> > +     *cs++ = rq->global_seqno;
> > +
> > +     *cs++ = MI_USER_INTERRUPT;
> > +     *cs++ = MI_NOOP;
> > +
> > +     rq->tail = intel_ring_offset(rq, cs);
> > +     assert_ring_tail_valid(rq->ring, rq->tail);
> > +}
> > +static const int gen6_rcs_emit_breadcrumb_sz = 14;
> > +
> >  static int
> >  gen7_render_ring_cs_stall_wa(struct i915_request *rq)
> >  {
> > @@ -379,6 +410,39 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
> >       return 0;
> >  }
> >  
> > +static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> > +{
> > +     *cs++ = GFX_OP_PIPE_CONTROL(4);
> > +     *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > +              PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +              PIPE_CONTROL_DC_FLUSH_ENABLE |
> > +              PIPE_CONTROL_FLUSH_ENABLE |
> > +              PIPE_CONTROL_QW_WRITE |
> > +              PIPE_CONTROL_GLOBAL_GTT_IVB |
> > +              PIPE_CONTROL_CS_STALL);
> 
> There is a faint scent of engine->flush_flags in the air but
> I think you favour opencoding everything behind these
> indirect calls.

Maybe an EMIT_FLUSH | EMIT_DWORD + u32 parameter. I did also at one
stage consider replacing the manual emit_breadcrumb_sz with a dummy
emission during module init (i.e. emit the tail and count). That still
seems like an improvement as the manual sz is quite easy to break.

The reason I stuck with extending ->emit_breadcrumb with an inline flush
is that it becomes hairy in the next few patches, and I prefer getting
stuck into the mess without too much indirection. But if we do see a way
of coalescing, or see a cleaner abstraction, we can always come back.
Perhaps though, this might be the last time we need to venture in here?
:)
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d5a9edbcf1be..169c54995ca9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -217,7 +217,7 @@  gen4_render_ring_flush(struct i915_request *rq, u32 mode)
  * really our business.  That leaves only stall at scoreboard.
  */
 static int
-intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
+gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
 {
 	u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 	u32 *cs;
@@ -257,7 +257,7 @@  gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 	int ret;
 
 	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(rq);
+	ret = gen6_emit_post_sync_nonzero_flush(rq);
 	if (ret)
 		return ret;
 
@@ -300,6 +300,37 @@  gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
+static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*cs++ = 0;
+	*cs++ = 0;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_QW_WRITE;
+	*cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = 0;
+
+	/* Finally we can flush and with it emit the breadcrumb */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		 PIPE_CONTROL_DC_FLUSH_ENABLE |
+		 PIPE_CONTROL_QW_WRITE |
+		 PIPE_CONTROL_CS_STALL);
+	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = rq->global_seqno;
+
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen6_rcs_emit_breadcrumb_sz = 14;
+
 static int
 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 {
@@ -379,6 +410,39 @@  gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
+static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		 PIPE_CONTROL_DC_FLUSH_ENABLE |
+		 PIPE_CONTROL_FLUSH_ENABLE |
+		 PIPE_CONTROL_QW_WRITE |
+		 PIPE_CONTROL_GLOBAL_GTT_IVB |
+		 PIPE_CONTROL_CS_STALL);
+	*cs++ = intel_hws_seqno_address(rq->engine);
+	*cs++ = rq->global_seqno;
+
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen7_rcs_emit_breadcrumb_sz = 6;
+
+static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
+	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+	*cs++ = rq->global_seqno;
+	*cs++ = MI_USER_INTERRUPT;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen6_xcs_emit_breadcrumb_sz = 4;
+
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
 {
 	/*
@@ -777,16 +841,20 @@  static void i9xx_submit_request(struct i915_request *request)
 
 static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
+	*cs++ = MI_FLUSH;
+
 	*cs++ = MI_STORE_DWORD_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
 	*cs++ = rq->global_seqno;
+
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static const int i9xx_emit_breadcrumb_sz = 4;
+static const int i9xx_emit_breadcrumb_sz = 6;
 
 static void
 gen5_seqno_barrier(struct intel_engine_cs *engine)
@@ -2050,7 +2118,6 @@  static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
 	if (INTEL_GEN(dev_priv) >= 6) {
 		engine->irq_enable = gen6_irq_enable;
 		engine->irq_disable = gen6_irq_disable;
-		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else if (INTEL_GEN(dev_priv) >= 5) {
 		engine->irq_enable = gen5_irq_enable;
 		engine->irq_disable = gen5_irq_disable;
@@ -2122,11 +2189,18 @@  int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
 
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
 
-	if (INTEL_GEN(dev_priv) >= 6) {
+	if (INTEL_GEN(dev_priv) >= 7) {
 		engine->init_context = intel_rcs_ctx_init;
 		engine->emit_flush = gen7_render_ring_flush;
-		if (IS_GEN(dev_priv, 6))
-			engine->emit_flush = gen6_render_ring_flush;
+		engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
+	} else if (IS_GEN(dev_priv, 6)) {
+		engine->init_context = intel_rcs_ctx_init;
+		engine->emit_flush = gen6_render_ring_flush;
+		engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else if (IS_GEN(dev_priv, 5)) {
 		engine->emit_flush = gen4_render_ring_flush;
 	} else {
@@ -2161,6 +2235,10 @@  int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 			engine->set_default_submission = gen6_bsd_set_default_submission;
 		engine->emit_flush = gen6_bsd_ring_flush;
 		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
+
+		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else {
 		engine->emit_flush = bsd_ring_flush;
 		if (IS_GEN(dev_priv, 5))
@@ -2176,11 +2254,17 @@  int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
+	GEM_BUG_ON(INTEL_GEN(dev_priv) < 6);
+
 	intel_ring_default_vfuncs(dev_priv, engine);
 
 	engine->emit_flush = gen6_ring_flush;
 	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
 
+	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+	engine->irq_seqno_barrier = gen6_seqno_barrier;
+
 	return intel_init_ring_buffer(engine);
 }
 
@@ -2188,6 +2272,8 @@  int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
+	GEM_BUG_ON(INTEL_GEN(dev_priv) < 7);
+
 	intel_ring_default_vfuncs(dev_priv, engine);
 
 	engine->emit_flush = gen6_ring_flush;
@@ -2195,5 +2281,9 @@  int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 	engine->irq_enable = hsw_vebox_irq_enable;
 	engine->irq_disable = hsw_vebox_irq_disable;
 
+	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+	engine->irq_seqno_barrier = gen6_seqno_barrier;
+
 	return intel_init_ring_buffer(engine);
 }