diff mbox series

[02/33] drm/i915: Measure the required reserved size for request emission

Message ID 20190125023005.1007-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/33] drm/i915/execlists: Move RPCS setup to context pin | expand

Commit Message

Chris Wilson Jan. 25, 2019, 2:29 a.m. UTC
Instead of tediously and fragilely counting up the number of dwords
required to emit the breadcrumb to seal a request, fake a request and
measure it automatically once during engine setup.

The downside is that this requires a fair amount of mocking to create a
proper breadcrumb. Still, should be less error prone in future as the
breadcrumb size fluctuates!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_engine_cs.c       | 49 ++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c             | 12 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.c      | 24 +++++++---
 drivers/gpu/drm/i915/intel_ringbuffer.h      |  2 +-
 drivers/gpu/drm/i915/selftests/mock_engine.c |  4 +-
 5 files changed, 77 insertions(+), 14 deletions(-)

Comments

Mika Kuoppala Jan. 25, 2019, 8:34 a.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Instead of tediously and fragilely counting up the number of dwords
> required to emit the breadcrumb to seal a request, fake a request and
> measure it automatically once during engine setup.
>
> The downside is that this requires a fair amount of mocking to create a
> proper breadcrumb. Still, should be less error prone in future as the
> breadcrumb size fluctuates!

We are quick to notice, but this method saves brains and time,
review time.

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_engine_cs.c       | 49 ++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c             | 12 +++--
>  drivers/gpu/drm/i915/intel_ringbuffer.c      | 24 +++++++---
>  drivers/gpu/drm/i915/intel_ringbuffer.h      |  2 +-
>  drivers/gpu/drm/i915/selftests/mock_engine.c |  4 +-
>  5 files changed, 77 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 2f3c71f6d313..883ba208d1c2 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -604,6 +604,47 @@ static void __intel_context_unpin(struct i915_gem_context *ctx,
>  	intel_context_unpin(to_intel_context(ctx, engine));
>  }
>  
> +struct measure_breadcrumb {
> +	struct i915_request rq;
> +	struct i915_timeline timeline;
> +	struct intel_ring ring;
> +	u32 cs[1024];
> +};
> +
> +static int measure_breadcrumb_sz(struct intel_engine_cs *engine)
> +{
> +	struct measure_breadcrumb *frame;
> +	unsigned int dw;
> +
> +	GEM_BUG_ON(!engine->i915->gt.scratch);
> +
> +	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
> +	if (!frame)
> +		return -ENOMEM;
> +
> +	i915_timeline_init(engine->i915, &frame->timeline, engine->name);

You could init with a NULL name. This is so short-lived
and we don't expect to debug it. If it ever leaks into the wild,
the blowout would be instant. As it stands, the name is the same as
the real deal.

> +
> +	frame->ring.timeline = &frame->timeline;
> +	frame->ring.vaddr = frame->cs;
> +	frame->ring.size = sizeof(frame->cs);
> +	frame->ring.effective_size = frame->ring.size;
> +	frame->ring.space = frame->ring.size - 8;

Why 2 dwords short? Just curious as it doesn't seem
to matter in this use case.

> +	INIT_LIST_HEAD(&frame->ring.request_list);
> +
> +	frame->rq.i915 = engine->i915;
> +	frame->rq.engine = engine;
> +	frame->rq.ring = &frame->ring;
> +	frame->rq.timeline = &frame->timeline;
> +
> +	dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs;
> +	GEM_BUG_ON(dw != engine->emit_breadcrumb_sz);

Peace of mind provided =)

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +
> +	i915_timeline_fini(&frame->timeline);
> +	kfree(frame);
> +
> +	return dw;
> +}
> +
>  /**
>   * intel_engines_init_common - initialize cengine state which might require hw access
>   * @engine: Engine to initialize.
> @@ -657,8 +698,16 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>  	if (ret)
>  		goto err_breadcrumbs;
>  
> +	ret = measure_breadcrumb_sz(engine);
> +	if (ret < 0)
> +		goto err_status_page;
> +
> +	engine->emit_breadcrumb_sz = ret;
> +
>  	return 0;
>  
> +err_status_page:
> +	cleanup_status_page(engine);
>  err_breadcrumbs:
>  	intel_engine_fini_breadcrumbs(engine);
>  err_unpin_preempt:
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9155cc675924..d2299425cf2f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2051,15 +2051,17 @@ static int gen8_emit_flush_render(struct i915_request *request,
>   * used as a workaround for not being allowed to do lite
>   * restore with HEAD==TAIL (WaIdleLiteRestore).
>   */
> -static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
> +static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
>  {
>  	/* Ensure there's always at least one preemption point per-request. */
>  	*cs++ = MI_ARB_CHECK;
>  	*cs++ = MI_NOOP;
>  	request->wa_tail = intel_ring_offset(request, cs);
> +
> +	return cs;
>  }
>  
> -static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
> +static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
>  {
>  	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
>  	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
> @@ -2071,11 +2073,11 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
>  	request->tail = intel_ring_offset(request, cs);
>  	assert_ring_tail_valid(request->ring, request->tail);
>  
> -	gen8_emit_wa_tail(request, cs);
> +	return gen8_emit_wa_tail(request, cs);
>  }
>  static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
>  
> -static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
> +static u32 *gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>  {
>  	/* We're using qword write, seqno should be aligned to 8 bytes. */
>  	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
> @@ -2095,7 +2097,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>  	request->tail = intel_ring_offset(request, cs);
>  	assert_ring_tail_valid(request->ring, request->tail);
>  
> -	gen8_emit_wa_tail(request, cs);
> +	return gen8_emit_wa_tail(request, cs);
>  }
>  static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index e39e483d8d16..107c4934e2fa 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -299,7 +299,7 @@ gen6_render_ring_flush(struct i915_request *rq, u32 mode)
>  	return 0;
>  }
>  
> -static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
>  	*cs++ = GFX_OP_PIPE_CONTROL(4);
> @@ -327,6 +327,8 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int gen6_rcs_emit_breadcrumb_sz = 14;
>  
> @@ -409,7 +411,7 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
>  	return 0;
>  }
>  
> -static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	*cs++ = GFX_OP_PIPE_CONTROL(4);
>  	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> @@ -427,10 +429,12 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int gen7_rcs_emit_breadcrumb_sz = 6;
>  
> -static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
>  	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
> @@ -439,11 +443,13 @@ static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int gen6_xcs_emit_breadcrumb_sz = 4;
>  
>  #define GEN7_XCS_WA 32
> -static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	int i;
>  
> @@ -466,6 +472,8 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
>  #undef GEN7_XCS_WA
> @@ -861,7 +869,7 @@ static void i9xx_submit_request(struct i915_request *request)
>  			intel_ring_set_tail(request->ring, request->tail));
>  }
>  
> -static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	*cs++ = MI_FLUSH;
>  
> @@ -874,11 +882,13 @@ static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int i9xx_emit_breadcrumb_sz = 6;
>  
>  #define GEN5_WA_STORES 8 /* must be at least 1! */
> -static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  {
>  	int i;
>  
> @@ -895,6 +905,8 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>  
>  	rq->tail = intel_ring_offset(rq, cs);
>  	assert_ring_tail_valid(rq->ring, rq->tail);
> +
> +	return cs;
>  }
>  static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2;
>  #undef GEN5_WA_STORES
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c3ef0f9bf321..479bd53d4ac6 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -470,7 +470,7 @@ struct intel_engine_cs {
>  					 unsigned int dispatch_flags);
>  #define I915_DISPATCH_SECURE BIT(0)
>  #define I915_DISPATCH_PINNED BIT(1)
> -	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
> +	u32		*(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
>  	int		emit_breadcrumb_sz;
>  
>  	/* Pass the request to the hardware queue (e.g. directly into
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 442ec2aeec81..905318b7ae18 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -159,9 +159,9 @@ static int mock_emit_flush(struct i915_request *request,
>  	return 0;
>  }
>  
> -static void mock_emit_breadcrumb(struct i915_request *request,
> -				 u32 *flags)
> +static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs)
>  {
> +	return cs;
>  }
>  
>  static void mock_submit_request(struct i915_request *request)
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 25, 2019, 9:52 a.m. UTC | #2
Quoting Mika Kuoppala (2019-01-25 08:34:37)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Instead of tediously and fragilely counting up the number of dwords
> > required to emit the breadcrumb to seal a request, fake a request and
> > measure it automatically once during engine setup.
> >
> > The downside is that this requires a fair amount of mocking to create a
> > proper breadcrumb. Still, should be less error prone in future as the
> > breadcrumb size fluctuates!
> 
> We are quick to notice, but this method saves brains and time,
> review time.
> 
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/intel_engine_cs.c       | 49 ++++++++++++++++++++
> >  drivers/gpu/drm/i915/intel_lrc.c             | 12 +++--
> >  drivers/gpu/drm/i915/intel_ringbuffer.c      | 24 +++++++---
> >  drivers/gpu/drm/i915/intel_ringbuffer.h      |  2 +-
> >  drivers/gpu/drm/i915/selftests/mock_engine.c |  4 +-
> >  5 files changed, 77 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> > index 2f3c71f6d313..883ba208d1c2 100644
> > --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> > @@ -604,6 +604,47 @@ static void __intel_context_unpin(struct i915_gem_context *ctx,
> >       intel_context_unpin(to_intel_context(ctx, engine));
> >  }
> >  
> > +struct measure_breadcrumb {
> > +     struct i915_request rq;
> > +     struct i915_timeline timeline;
> > +     struct intel_ring ring;
> > +     u32 cs[1024];
> > +};
> > +
> > +static int measure_breadcrumb_sz(struct intel_engine_cs *engine)
> > +{
> > +     struct measure_breadcrumb *frame;
> > +     unsigned int dw;
> > +
> > +     GEM_BUG_ON(!engine->i915->gt.scratch);
> > +
> > +     frame = kzalloc(sizeof(*frame), GFP_KERNEL);
> > +     if (!frame)
> > +             return -ENOMEM;
> > +
> > +     i915_timeline_init(engine->i915, &frame->timeline, engine->name);
> 
> You could init with a NULL name. This is so short-lived
> and we don't expect to debug it. If it ever leaks into the wild,
> the blowout would be instant. As it stands, the name is the same as
> the real deal.

Good point. I was just cutting and pasting for convenience, but having
the same name may be doubly confusing for the strange bugs where it might
matter :)

> > +     frame->ring.timeline = &frame->timeline;
> > +     frame->ring.vaddr = frame->cs;
> > +     frame->ring.size = sizeof(frame->cs);
> > +     frame->ring.effective_size = frame->ring.size;
> > +     frame->ring.space = frame->ring.size - 8;
> 
> Why 2 dwords short? Just curious as it doesn't seem
> to matter in this use case.

Just the rules of the HW: head steps in qword jumps. But we should just
use intel_ring_update_space() ftw.

> > +     INIT_LIST_HEAD(&frame->ring.request_list);
> > +
> > +     frame->rq.i915 = engine->i915;
> > +     frame->rq.engine = engine;
> > +     frame->rq.ring = &frame->ring;
> > +     frame->rq.timeline = &frame->timeline;
> > +
> > +     dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs;
> > +     GEM_BUG_ON(dw != engine->emit_breadcrumb_sz);
> 
> Peace of mind provided =)

For full peace of mind, see the earlier runs with the BUG active.
https://patchwork.freedesktop.org/series/55683/
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 2f3c71f6d313..883ba208d1c2 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -604,6 +604,47 @@  static void __intel_context_unpin(struct i915_gem_context *ctx,
 	intel_context_unpin(to_intel_context(ctx, engine));
 }
 
+struct measure_breadcrumb {
+	struct i915_request rq;
+	struct i915_timeline timeline;
+	struct intel_ring ring;
+	u32 cs[1024];
+};
+
+static int measure_breadcrumb_sz(struct intel_engine_cs *engine)
+{
+	struct measure_breadcrumb *frame;
+	unsigned int dw;
+
+	GEM_BUG_ON(!engine->i915->gt.scratch);
+
+	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+	if (!frame)
+		return -ENOMEM;
+
+	i915_timeline_init(engine->i915, &frame->timeline, engine->name);
+
+	frame->ring.timeline = &frame->timeline;
+	frame->ring.vaddr = frame->cs;
+	frame->ring.size = sizeof(frame->cs);
+	frame->ring.effective_size = frame->ring.size;
+	frame->ring.space = frame->ring.size - 8;
+	INIT_LIST_HEAD(&frame->ring.request_list);
+
+	frame->rq.i915 = engine->i915;
+	frame->rq.engine = engine;
+	frame->rq.ring = &frame->ring;
+	frame->rq.timeline = &frame->timeline;
+
+	dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs;
+	GEM_BUG_ON(dw != engine->emit_breadcrumb_sz);
+
+	i915_timeline_fini(&frame->timeline);
+	kfree(frame);
+
+	return dw;
+}
+
 /**
  * intel_engines_init_common - initialize cengine state which might require hw access
  * @engine: Engine to initialize.
@@ -657,8 +698,16 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
 	if (ret)
 		goto err_breadcrumbs;
 
+	ret = measure_breadcrumb_sz(engine);
+	if (ret < 0)
+		goto err_status_page;
+
+	engine->emit_breadcrumb_sz = ret;
+
 	return 0;
 
+err_status_page:
+	cleanup_status_page(engine);
 err_breadcrumbs:
 	intel_engine_fini_breadcrumbs(engine);
 err_unpin_preempt:
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 9155cc675924..d2299425cf2f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2051,15 +2051,17 @@  static int gen8_emit_flush_render(struct i915_request *request,
  * used as a workaround for not being allowed to do lite
  * restore with HEAD==TAIL (WaIdleLiteRestore).
  */
-static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
+static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
 {
 	/* Ensure there's always at least one preemption point per-request. */
 	*cs++ = MI_ARB_CHECK;
 	*cs++ = MI_NOOP;
 	request->wa_tail = intel_ring_offset(request, cs);
+
+	return cs;
 }
 
-static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
+static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 {
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
@@ -2071,11 +2073,11 @@  static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
 
-	gen8_emit_wa_tail(request, cs);
+	return gen8_emit_wa_tail(request, cs);
 }
 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
-static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
+static u32 *gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
@@ -2095,7 +2097,7 @@  static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
 
-	gen8_emit_wa_tail(request, cs);
+	return gen8_emit_wa_tail(request, cs);
 }
 static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index e39e483d8d16..107c4934e2fa 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -299,7 +299,7 @@  gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
-static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
 	*cs++ = GFX_OP_PIPE_CONTROL(4);
@@ -327,6 +327,8 @@  static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int gen6_rcs_emit_breadcrumb_sz = 14;
 
@@ -409,7 +411,7 @@  gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 	return 0;
 }
 
-static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	*cs++ = GFX_OP_PIPE_CONTROL(4);
 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
@@ -427,10 +429,12 @@  static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int gen7_rcs_emit_breadcrumb_sz = 6;
 
-static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
 	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
@@ -439,11 +443,13 @@  static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int gen6_xcs_emit_breadcrumb_sz = 4;
 
 #define GEN7_XCS_WA 32
-static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	int i;
 
@@ -466,6 +472,8 @@  static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
 #undef GEN7_XCS_WA
@@ -861,7 +869,7 @@  static void i9xx_submit_request(struct i915_request *request)
 			intel_ring_set_tail(request->ring, request->tail));
 }
 
-static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	*cs++ = MI_FLUSH;
 
@@ -874,11 +882,13 @@  static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int i9xx_emit_breadcrumb_sz = 6;
 
 #define GEN5_WA_STORES 8 /* must be at least 1! */
-static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	int i;
 
@@ -895,6 +905,8 @@  static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
 }
 static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2;
 #undef GEN5_WA_STORES
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c3ef0f9bf321..479bd53d4ac6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -470,7 +470,7 @@  struct intel_engine_cs {
 					 unsigned int dispatch_flags);
 #define I915_DISPATCH_SECURE BIT(0)
 #define I915_DISPATCH_PINNED BIT(1)
-	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
+	u32		*(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
 	int		emit_breadcrumb_sz;
 
 	/* Pass the request to the hardware queue (e.g. directly into
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 442ec2aeec81..905318b7ae18 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -159,9 +159,9 @@  static int mock_emit_flush(struct i915_request *request,
 	return 0;
 }
 
-static void mock_emit_breadcrumb(struct i915_request *request,
-				 u32 *flags)
+static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs)
 {
+	return cs;
 }
 
 static void mock_submit_request(struct i915_request *request)