Message ID | 20181219145747.19835-3-chris@chris-wilson.co.uk (mailing list archive)
---|---
State | New, archived
Series | [1/9] drm/i915: Remove HW semaphores for gen7 inter-engine synchronisation
Chris Wilson <chris@chris-wilson.co.uk> writes:

> In preparation for removing the manual EMIT_FLUSH prior to emitting the
> breadcrumb, implement the flush inline with writing the breadcrumb for
> ringbuffer emission.
>
[snip quoted commit message and patch; the full diff is reproduced below]
>
> +static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +{
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> +		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +		 PIPE_CONTROL_DC_FLUSH_ENABLE |
> +		 PIPE_CONTROL_FLUSH_ENABLE |
> +		 PIPE_CONTROL_QW_WRITE |
> +		 PIPE_CONTROL_GLOBAL_GTT_IVB |
> +		 PIPE_CONTROL_CS_STALL);

There is a faint scent of engine->flush_flags in the air but I think you
favour opencoding everything behind these indirect calls. But we can take
a second look later, with the gen11 flush improvements.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
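For illustration, the engine->flush_flags idea floated above could look roughly like the sketch below. This is hypothetical only: no flush_flags field exists in this series, and the shared emitter is an assumption, not the patch's code.

/*
 * Hypothetical sketch: stash the gen-specific PIPE_CONTROL cache-flush
 * bits in a per-engine field chosen once at init, so gen6/gen7 could
 * share one breadcrumb emitter. engine->flush_flags is invented here
 * purely for illustration.
 */
static void rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = rq->engine->flush_flags |	/* e.g. RT/depth/DC flushes */
		PIPE_CONTROL_QW_WRITE |
		PIPE_CONTROL_CS_STALL;
	*cs++ = intel_hws_seqno_address(rq->engine);
	*cs++ = rq->global_seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

In practice the gen6 post-sync-nonzero workaround and the PIPE_CONTROL_GLOBAL_GTT vs PIPE_CONTROL_GLOBAL_GTT_IVB addressing difference mean the two bodies would not collapse quite this cleanly, which is presumably why the patch opencodes each variant.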
Quoting Mika Kuoppala (2018-12-28 12:03:26)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
[snip quoted patch]
> > +static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> > +{
> > +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> > +	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> > +		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +		 PIPE_CONTROL_DC_FLUSH_ENABLE |
> > +		 PIPE_CONTROL_FLUSH_ENABLE |
> > +		 PIPE_CONTROL_QW_WRITE |
> > +		 PIPE_CONTROL_GLOBAL_GTT_IVB |
> > +		 PIPE_CONTROL_CS_STALL);
>
> There is a faint scent of engine->flush_flags in the air but
> I think you favour opencoding everything behind these
> indirect calls.

Maybe an EMIT_FLUSH | EMIT_DWORD + u32 parameter.

I did also at one stage consider replacing the manual emit_breadcrumb_sz
with a dummy emission during module init (i.e. emit the tail and count).
That still seems like an improvement as the manual sz is quite easy to
break.

The reason I stuck with extending ->emit_breadcrumb with an inline flush
is that it becomes hairy in the next few patches, and I prefer getting
stuck into the mess without too much indirection. But if we do see a way
of coalescing, or see a cleaner abstraction, we can always come back.

Perhaps though, this might be the last time we need to venture in here? :)
-Chris
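The dummy-emission idea mentioned above could be prototyped along the following lines. This is a hypothetical sketch, not part of the series: measure_breadcrumb_sz() and its throwaway request/ring frame are invented for illustration, setting up just enough state for intel_ring_offset() and assert_ring_tail_valid() to operate on.

/*
 * Hypothetical sketch: derive engine->emit_breadcrumb_sz by performing
 * one dummy emission at init time rather than hand-counting dwords.
 * Everything below is an assumption for illustration.
 */
static int measure_breadcrumb_sz(struct intel_engine_cs *engine)
{
	struct measure {
		struct i915_request rq;
		struct intel_ring ring;
		u32 cs[32];
	} *frame;
	int sz;

	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
	if (!frame)
		return -ENOMEM;

	/* Minimal request/ring state so intel_ring_offset() works */
	frame->rq.i915 = engine->i915;
	frame->rq.engine = engine;
	frame->rq.ring = &frame->ring;
	frame->ring.vaddr = frame->cs;
	frame->ring.size = sizeof(frame->cs);	/* power of two for wrap */
	frame->ring.effective_size = frame->ring.size;

	/* The vfunc leaves rq.tail pointing just past what it wrote */
	engine->emit_breadcrumb(&frame->rq, frame->cs);
	sz = frame->rq.tail / sizeof(u32);

	kfree(frame);
	return sz;
}

Engine init could then assign engine->emit_breadcrumb_sz = measure_breadcrumb_sz(engine) once the vfunc has been chosen, so a stale hand-counted constant could no longer slip through.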
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d5a9edbcf1be..169c54995ca9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -217,7 +217,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
  * really our business. That leaves only stall at scoreboard.
  */
 static int
-intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
+gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
 {
 	u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 	u32 *cs;
@@ -257,7 +257,7 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 	int ret;
 
 	/* Force SNB workarounds for PIPE_CONTROL flushes */
-	ret = intel_emit_post_sync_nonzero_flush(rq);
+	ret = gen6_emit_post_sync_nonzero_flush(rq);
 	if (ret)
 		return ret;
 
@@ -300,6 +300,37 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
+static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
+	*cs++ = 0;
+	*cs++ = 0;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_QW_WRITE;
+	*cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = 0;
+
+	/* Finally we can flush and with it emit the breadcrumb */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		 PIPE_CONTROL_DC_FLUSH_ENABLE |
+		 PIPE_CONTROL_QW_WRITE |
+		 PIPE_CONTROL_CS_STALL);
+	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = rq->global_seqno;
+
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen6_rcs_emit_breadcrumb_sz = 14;
+
 static int
 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 {
@@ -379,6 +410,39 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
+static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		 PIPE_CONTROL_DC_FLUSH_ENABLE |
+		 PIPE_CONTROL_FLUSH_ENABLE |
+		 PIPE_CONTROL_QW_WRITE |
+		 PIPE_CONTROL_GLOBAL_GTT_IVB |
+		 PIPE_CONTROL_CS_STALL);
+	*cs++ = intel_hws_seqno_address(rq->engine);
+	*cs++ = rq->global_seqno;
+
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen7_rcs_emit_breadcrumb_sz = 6;
+
+static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
+	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+	*cs++ = rq->global_seqno;
+	*cs++ = MI_USER_INTERRUPT;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen6_xcs_emit_breadcrumb_sz = 4;
+
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
 {
 	/*
@@ -777,16 +841,20 @@ static void i9xx_submit_request(struct i915_request *request)
 
 static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
+	*cs++ = MI_FLUSH;
+
 	*cs++ = MI_STORE_DWORD_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
 	*cs++ = rq->global_seqno;
+
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static const int i9xx_emit_breadcrumb_sz = 4;
+static const int i9xx_emit_breadcrumb_sz = 6;
 
 static void
 gen5_seqno_barrier(struct intel_engine_cs *engine)
@@ -2050,7 +2118,6 @@ static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
 	if (INTEL_GEN(dev_priv) >= 6) {
 		engine->irq_enable = gen6_irq_enable;
 		engine->irq_disable = gen6_irq_disable;
-		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else if (INTEL_GEN(dev_priv) >= 5) {
 		engine->irq_enable = gen5_irq_enable;
 		engine->irq_disable = gen5_irq_disable;
@@ -2122,11 +2189,18 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
 
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
 
-	if (INTEL_GEN(dev_priv) >= 6) {
+	if (INTEL_GEN(dev_priv) >= 7) {
 		engine->init_context = intel_rcs_ctx_init;
 		engine->emit_flush = gen7_render_ring_flush;
-		if (IS_GEN(dev_priv, 6))
-			engine->emit_flush = gen6_render_ring_flush;
+		engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
+	} else if (IS_GEN(dev_priv, 6)) {
+		engine->init_context = intel_rcs_ctx_init;
+		engine->emit_flush = gen6_render_ring_flush;
+		engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else if (IS_GEN(dev_priv, 5)) {
 		engine->emit_flush = gen4_render_ring_flush;
 	} else {
@@ -2161,6 +2235,10 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 		engine->set_default_submission = gen6_bsd_set_default_submission;
 		engine->emit_flush = gen6_bsd_ring_flush;
 		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
+
+		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+		engine->irq_seqno_barrier = gen6_seqno_barrier;
 	} else {
 		engine->emit_flush = bsd_ring_flush;
 		if (IS_GEN(dev_priv, 5))
@@ -2176,11 +2254,17 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
+	GEM_BUG_ON(INTEL_GEN(dev_priv) < 6);
+
 	intel_ring_default_vfuncs(dev_priv, engine);
 
 	engine->emit_flush = gen6_ring_flush;
 	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
 
+	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+	engine->irq_seqno_barrier = gen6_seqno_barrier;
+
 	return intel_init_ring_buffer(engine);
 }
 
@@ -2188,6 +2272,8 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
+	GEM_BUG_ON(INTEL_GEN(dev_priv) < 7);
+
 	intel_ring_default_vfuncs(dev_priv, engine);
 
 	engine->emit_flush = gen6_ring_flush;
@@ -2195,5 +2281,9 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 	engine->irq_enable = hsw_vebox_irq_enable;
 	engine->irq_disable = hsw_vebox_irq_disable;
 
+	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+	engine->irq_seqno_barrier = gen6_seqno_barrier;
+
 	return intel_init_ring_buffer(engine);
 }
In preparation for removing the manual EMIT_FLUSH prior to emitting the
breadcrumb, implement the flush inline with writing the breadcrumb for
ringbuffer emission.

With a combined flush+breadcrumb, we can use a single operation to both
flush and, after the flush is complete (post-sync), write the breadcrumb.
This gives us a strongly ordered operation that should be sufficient to
serialise the write before we emit the interrupt; and therefore we may
take the opportunity to remove the irq_seqno_barrier w/a for gen6+.
Although using the PIPECONTROL to write the breadcrumb is slower than
MI_STORE_DWORD_IMM, by combining the operations into one and removing the
extra flush (next patch) it is faster.

For gen2-5, we simply combine the MI_FLUSH into the breadcrumb emission,
though maybe we could find a solution here to the seqno-vs-interrupt
issue on Ironlake by mixing up the flush? The answer is no; adding an
MI_FLUSH before the interrupt is insufficient.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 104 ++++++++++++++++++++++--
 1 file changed, 97 insertions(+), 7 deletions(-)