diff mbox series

[26/34] drm/i915: Identify active requests

Message ID 20190121222117.23305-27-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/34] drm/i915/execlists: Mark up priority boost on preemption | expand

Commit Message

Chris Wilson Jan. 21, 2019, 10:21 p.m. UTC
To allow requests to forgo a common execution timeline, one question we
need to be able to answer is "is this request running?". To track
whether a request has started on HW, we can emit a breadcrumb at the
beginning of the request and check its timeline's HWSP to see if the
breadcrumb has advanced past the start of this request. (This is in
contrast to the global timeline where we need only ask if we are on the
global timeline and if the timeline has advanced past the end of the
previous request.)

There is still confusion from a preempted request, which has already
started but relinquished the HW to a high priority request. For the
common case, this discrepancy should be negligible. However, for
identification of hung requests, knowing which one was running at the
time of the hang will be much more important.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c   |  6 +++
 drivers/gpu/drm/i915/i915_request.c          |  9 ++--
 drivers/gpu/drm/i915/i915_request.h          |  1 +
 drivers/gpu/drm/i915/i915_timeline.c         |  1 +
 drivers/gpu/drm/i915/i915_timeline.h         |  2 +
 drivers/gpu/drm/i915/intel_engine_cs.c       |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c             | 47 ++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.c      | 43 ++++++++++--------
 drivers/gpu/drm/i915/intel_ringbuffer.h      |  6 ++-
 drivers/gpu/drm/i915/selftests/mock_engine.c |  2 +-
 10 files changed, 86 insertions(+), 35 deletions(-)

Comments

Tvrtko Ursulin Jan. 22, 2019, 3:34 p.m. UTC | #1
On 21/01/2019 22:21, Chris Wilson wrote:
> To allow requests to forgo a common execution timeline, one question we
> need to be able to answer is "is this request running?". To track
> whether a request has started on HW, we can emit a breadcrumb at the
> beginning of the request and check its timeline's HWSP to see if the
> breadcrumb has advanced past the start of this request. (This is in
> contrast to the global timeline where we need only ask if we are on the
> global timeline and if the timeline has advanced past the end of the
> previous request.)
> 
> There is still confusion from a preempted request, which has already
> started but relinquished the HW to a high priority request. For the
> common case, this discrepancy should be negligible. However, for
> identification of hung requests, knowing which one was running at the
> time of the hang will be much more important.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c   |  6 +++
>   drivers/gpu/drm/i915/i915_request.c          |  9 ++--
>   drivers/gpu/drm/i915/i915_request.h          |  1 +
>   drivers/gpu/drm/i915/i915_timeline.c         |  1 +
>   drivers/gpu/drm/i915/i915_timeline.h         |  2 +
>   drivers/gpu/drm/i915/intel_engine_cs.c       |  4 +-
>   drivers/gpu/drm/i915/intel_lrc.c             | 47 ++++++++++++++++----
>   drivers/gpu/drm/i915/intel_ringbuffer.c      | 43 ++++++++++--------
>   drivers/gpu/drm/i915/intel_ringbuffer.h      |  6 ++-
>   drivers/gpu/drm/i915/selftests/mock_engine.c |  2 +-
>   10 files changed, 86 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index f250109e1f66..defe7d60bb88 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1976,6 +1976,12 @@ static int eb_submit(struct i915_execbuffer *eb)
>   			return err;
>   	}
>   
> +	if (eb->engine->emit_init_breadcrumb) {
> +		err = eb->engine->emit_init_breadcrumb(eb->request);
> +		if (err)
> +			return err;
> +	}
> +
>   	err = eb->engine->emit_bb_start(eb->request,
>   					eb->batch->node.start +
>   					eb->batch_start_offset,
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index bb2885f1dc1e..0a8a2a1bf55d 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -333,6 +333,7 @@ void i915_request_retire_upto(struct i915_request *rq)
>   
>   static u32 timeline_get_seqno(struct i915_timeline *tl)
>   {
> +	tl->seqno += tl->has_initial_breadcrumb;
>   	return ++tl->seqno;

return tl->seqno += 1 + tl->has_initial_breadcrumb?

Not sure if it would make any difference in the code.

>   }
>   
> @@ -382,8 +383,8 @@ void __i915_request_submit(struct i915_request *request)
>   		intel_engine_enable_signaling(request, false);
>   	spin_unlock(&request->lock);
>   
> -	engine->emit_breadcrumb(request,
> -				request->ring->vaddr + request->postfix);
> +	engine->emit_fini_breadcrumb(request,
> +				     request->ring->vaddr + request->postfix);
>   
>   	/* Transfer from per-context onto the global per-engine timeline */
>   	move_to_timeline(request, &engine->timeline);
> @@ -657,7 +658,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 * around inside i915_request_add() there is sufficient space at
>   	 * the beginning of the ring as well.
>   	 */
> -	rq->reserved_space = 2 * engine->emit_breadcrumb_sz * sizeof(u32);
> +	rq->reserved_space = 2 * engine->emit_fini_breadcrumb_sz * sizeof(u32);

Logic being fini breadcrumb is at least as big as the init one? I can't 
think of any easy asserts to verify that.

Also, a little bit of ring space wastage but I guess we don't care.

>   
>   	/*
>   	 * Record the position of the start of the request so that
> @@ -908,7 +909,7 @@ void i915_request_add(struct i915_request *request)
>   	 * GPU processing the request, we never over-estimate the
>   	 * position of the ring's HEAD.
>   	 */
> -	cs = intel_ring_begin(request, engine->emit_breadcrumb_sz);
> +	cs = intel_ring_begin(request, engine->emit_fini_breadcrumb_sz);
>   	GEM_BUG_ON(IS_ERR(cs));
>   	request->postfix = intel_ring_offset(request, cs);
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 96c586d6ff4d..340d6216791c 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -344,6 +344,7 @@ static inline bool i915_request_started(const struct i915_request *rq)
>   	if (i915_request_signaled(rq))
>   		return true;
>   
> +	/* Remember: started but may have since been preempted! */
>   	return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index 007348b1b469..7bc9164733bc 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -132,6 +132,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
>   	timeline->i915 = i915;
>   	timeline->name = name;
>   	timeline->pin_count = 0;
> +	timeline->has_initial_breadcrumb = !hwsp;
>   
>   	timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>   	if (!hwsp) {
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index ab736e2e5707..8caeb66d1cd5 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -48,6 +48,8 @@ struct i915_timeline {
>   	struct i915_vma *hwsp_ggtt;
>   	u32 hwsp_offset;
>   
> +	bool has_initial_breadcrumb;
> +
>   	/**
>   	 * List of breadcrumbs associated with GPU requests currently
>   	 * outstanding.
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index e532b4b27239..2a4c547240a1 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1239,7 +1239,9 @@ static void print_request(struct drm_printer *m,
>   	drm_printf(m, "%s%x%s [%llx:%llx]%s @ %dms: %s\n",
>   		   prefix,
>   		   rq->global_seqno,
> -		   i915_request_completed(rq) ? "!" : "",
> +		   i915_request_completed(rq) ? "!" :
> +		   i915_request_started(rq) ? "*" :
> +		   "",
>   		   rq->fence.context, rq->fence.seqno,
>   		   buf,
>   		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 1bf178ca3e00..0a2d53f19625 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -649,7 +649,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		 * WaIdleLiteRestore:bdw,skl
>   		 * Apply the wa NOOPs to prevent
>   		 * ring:HEAD == rq:TAIL as we resubmit the
> -		 * request. See gen8_emit_breadcrumb() for
> +		 * request. See gen8_emit_fini_breadcrumb() for
>   		 * where we prepare the padding after the
>   		 * end of the request.
>   		 */
> @@ -1294,6 +1294,34 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	return __execlists_context_pin(engine, ctx, ce);
>   }
>   
> +static int gen8_emit_init_breadcrumb(struct i915_request *rq)
> +{
> +	u32 *cs;
> +
> +	GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
> +
> +	cs = intel_ring_begin(rq, 6);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/*
> +	 * Check if we have been preempted before we even get started.
> +	 *
> +	 * After this point i915_request_started() reports true, even if
> +	 * we get preempted and so are no longer running.
> +	 */
> +	*cs++ = MI_ARB_CHECK;
> +	*cs++ = MI_NOOP;
> +
> +	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +	*cs++ = i915_timeline_seqno_address(rq->timeline);
> +	*cs++ = 0;
> +	*cs++ = rq->fence.seqno - 1;
> +
> +	intel_ring_advance(rq, cs);
> +	return 0;
> +}
> +
>   static int emit_pdps(struct i915_request *rq)
>   {
>   	const struct intel_engine_cs * const engine = rq->engine;
> @@ -2049,7 +2077,7 @@ static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
>   	request->wa_tail = intel_ring_offset(request, cs);
>   }
>   
> -static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
> +static void gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>   {
>   	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
>   	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
> @@ -2070,9 +2098,9 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
>   
>   	gen8_emit_wa_tail(request, cs);
>   }
> -static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
> +static const int gen8_emit_fini_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
>   
> -static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
> +static void gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>   {
>   	cs = gen8_emit_ggtt_write_rcs(cs,
>   				      request->fence.seqno,
> @@ -2096,7 +2124,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>   
>   	gen8_emit_wa_tail(request, cs);
>   }
> -static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
> +static const int gen8_emit_fini_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
>   
>   static int gen8_init_rcs_context(struct i915_request *rq)
>   {
> @@ -2188,8 +2216,9 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   	engine->request_alloc = execlists_request_alloc;
>   
>   	engine->emit_flush = gen8_emit_flush;
> -	engine->emit_breadcrumb = gen8_emit_breadcrumb;
> -	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> +	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
> +	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
> +	engine->emit_fini_breadcrumb_sz = gen8_emit_fini_breadcrumb_sz;
>   
>   	engine->set_default_submission = intel_execlists_set_default_submission;
>   
> @@ -2302,8 +2331,8 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
>   	/* Override some for render ring. */
>   	engine->init_context = gen8_init_rcs_context;
>   	engine->emit_flush = gen8_emit_flush_render;
> -	engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
> -	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
> +	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
> +	engine->emit_fini_breadcrumb_sz = gen8_emit_fini_breadcrumb_rcs_sz;
>   
>   	ret = logical_ring_init(engine);
>   	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 751bd4e7da42..f6b30eb46263 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1594,6 +1594,7 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   		err = PTR_ERR(timeline);
>   		goto err;
>   	}
> +	GEM_BUG_ON(timeline->has_initial_breadcrumb);
>   
>   	ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
>   	i915_timeline_put(timeline);
> @@ -1947,6 +1948,7 @@ static int ring_request_alloc(struct i915_request *request)
>   	int ret;
>   
>   	GEM_BUG_ON(!request->hw_context->pin_count);
> +	GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
>   
>   	/*
>   	 * Flush enough space to reduce the likelihood of waiting after
> @@ -2283,11 +2285,16 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
>   	engine->context_pin = intel_ring_context_pin;
>   	engine->request_alloc = ring_request_alloc;
>   
> -	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
> -	engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
> +	/*
> +	 * Using a global execution timeline; the previous final breadcrumb is
> +	 * equivalent to our next initial bread so we can elide
> +	 * engine->emit_init_breadcrumb().
> +	 */
> +	engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
> +	engine->emit_fini_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
>   	if (IS_GEN(dev_priv, 5)) {
> -		engine->emit_breadcrumb = gen5_emit_breadcrumb;
> -		engine->emit_breadcrumb_sz = gen5_emit_breadcrumb_sz;
> +		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
> +		engine->emit_fini_breadcrumb_sz = gen5_emit_breadcrumb_sz;
>   	}
>   
>   	engine->set_default_submission = i9xx_set_default_submission;
> @@ -2317,13 +2324,13 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
>   	if (INTEL_GEN(dev_priv) >= 7) {
>   		engine->init_context = intel_rcs_ctx_init;
>   		engine->emit_flush = gen7_render_ring_flush;
> -		engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb;
> -		engine->emit_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
> +		engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
> +		engine->emit_fini_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
>   	} else if (IS_GEN(dev_priv, 6)) {
>   		engine->init_context = intel_rcs_ctx_init;
>   		engine->emit_flush = gen6_render_ring_flush;
> -		engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb;
> -		engine->emit_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
> +		engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
> +		engine->emit_fini_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
>   	} else if (IS_GEN(dev_priv, 5)) {
>   		engine->emit_flush = gen4_render_ring_flush;
>   	} else {
> @@ -2360,11 +2367,11 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
>   		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
>   
>   		if (IS_GEN(dev_priv, 6)) {
> -			engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> -			engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> +			engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
> +			engine->emit_fini_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
>   		} else {
> -			engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> -			engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
> +			engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
> +			engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
>   		}
>   	} else {
>   		engine->emit_flush = bsd_ring_flush;
> @@ -2389,11 +2396,11 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
>   	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
>   
>   	if (IS_GEN(dev_priv, 6)) {
> -		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> -		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> +		engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
> +		engine->emit_fini_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
>   	} else {
> -		engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> -		engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
> +		engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
> +		engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
>   	}
>   
>   	return intel_init_ring_buffer(engine);
> @@ -2412,8 +2419,8 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
>   	engine->irq_enable = hsw_vebox_irq_enable;
>   	engine->irq_disable = hsw_vebox_irq_disable;
>   
> -	engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> -	engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
> +	engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
> +	engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
>   
>   	return intel_init_ring_buffer(engine);
>   }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a792bacf2930..d3d4f3667afb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -463,8 +463,10 @@ struct intel_engine_cs {
>   					 unsigned int dispatch_flags);
>   #define I915_DISPATCH_SECURE BIT(0)
>   #define I915_DISPATCH_PINNED BIT(1)
> -	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
> -	int		emit_breadcrumb_sz;
> +	int		(*emit_init_breadcrumb)(struct i915_request *rq);
> +	void		(*emit_fini_breadcrumb)(struct i915_request *rq,
> +						u32 *cs);
> +	unsigned int	emit_fini_breadcrumb_sz;
>   
>   	/* Pass the request to the hardware queue (e.g. directly into
>   	 * the legacy ringbuffer or to the end of an execlist).
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index c0a408828415..2515cffb4490 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -227,7 +227,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	engine->base.context_pin = mock_context_pin;
>   	engine->base.request_alloc = mock_request_alloc;
>   	engine->base.emit_flush = mock_emit_flush;
> -	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
> +	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>   	engine->base.submit_request = mock_submit_request;
>   
>   	if (i915_timeline_init(i915,
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson Jan. 22, 2019, 3:45 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-01-22 15:34:07)
> 
> On 21/01/2019 22:21, Chris Wilson wrote:
> > To allow requests to forgo a common execution timeline, one question we
> > need to be able to answer is "is this request running?". To track
> > whether a request has started on HW, we can emit a breadcrumb at the
> > beginning of the request and check its timeline's HWSP to see if the
> > breadcrumb has advanced past the start of this request. (This is in
> > contrast to the global timeline where we need only ask if we are on the
> > global timeline and if the timeline has advanced past the end of the
> > previous request.)
> > 
> > There is still confusion from a preempted request, which has already
> > started but relinquished the HW to a high priority request. For the
> > common case, this discrepancy should be negligible. However, for
> > identification of hung requests, knowing which one was running at the
> > time of the hang will be much more important.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_execbuffer.c   |  6 +++
> >   drivers/gpu/drm/i915/i915_request.c          |  9 ++--
> >   drivers/gpu/drm/i915/i915_request.h          |  1 +
> >   drivers/gpu/drm/i915/i915_timeline.c         |  1 +
> >   drivers/gpu/drm/i915/i915_timeline.h         |  2 +
> >   drivers/gpu/drm/i915/intel_engine_cs.c       |  4 +-
> >   drivers/gpu/drm/i915/intel_lrc.c             | 47 ++++++++++++++++----
> >   drivers/gpu/drm/i915/intel_ringbuffer.c      | 43 ++++++++++--------
> >   drivers/gpu/drm/i915/intel_ringbuffer.h      |  6 ++-
> >   drivers/gpu/drm/i915/selftests/mock_engine.c |  2 +-
> >   10 files changed, 86 insertions(+), 35 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index f250109e1f66..defe7d60bb88 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -1976,6 +1976,12 @@ static int eb_submit(struct i915_execbuffer *eb)
> >                       return err;
> >       }
> >   
> > +     if (eb->engine->emit_init_breadcrumb) {
> > +             err = eb->engine->emit_init_breadcrumb(eb->request);
> > +             if (err)
> > +                     return err;
> > +     }
> > +
> >       err = eb->engine->emit_bb_start(eb->request,
> >                                       eb->batch->node.start +
> >                                       eb->batch_start_offset,
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index bb2885f1dc1e..0a8a2a1bf55d 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -333,6 +333,7 @@ void i915_request_retire_upto(struct i915_request *rq)
> >   
> >   static u32 timeline_get_seqno(struct i915_timeline *tl)
> >   {
> > +     tl->seqno += tl->has_initial_breadcrumb;
> >       return ++tl->seqno;
> 
> return tl->seqno += 1 + tl->has_initial_breadcrumb?
> 
> Not sure if it would make any difference in the code.

Identical code generation, but looks better than conditional increment
then pre-increment.

> > @@ -382,8 +383,8 @@ void __i915_request_submit(struct i915_request *request)
> >               intel_engine_enable_signaling(request, false);
> >       spin_unlock(&request->lock);
> >   
> > -     engine->emit_breadcrumb(request,
> > -                             request->ring->vaddr + request->postfix);
> > +     engine->emit_fini_breadcrumb(request,
> > +                                  request->ring->vaddr + request->postfix);
> >   
> >       /* Transfer from per-context onto the global per-engine timeline */
> >       move_to_timeline(request, &engine->timeline);
> > @@ -657,7 +658,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> >        * around inside i915_request_add() there is sufficient space at
> >        * the beginning of the ring as well.
> >        */
> > -     rq->reserved_space = 2 * engine->emit_breadcrumb_sz * sizeof(u32);
> > +     rq->reserved_space = 2 * engine->emit_fini_breadcrumb_sz * sizeof(u32);
> 
> Logic being fini breadcrumb is at least as big as the init one? I can't 
> think of any easy asserts to verify that.

We emit engine->emit_init_breadcrumbs() normally, it's just
engine->emit_fini_breadcrumbs() that is in the reserved portion.

The factor of 2 is to waste space on wraparound.
 
> Also, a little bit of ring space wastage but I guess we don't care.

We don't actually waste space, we only use emit_fini_breadcrumbs_sz, we
just flush enough of the ring for 2*sz to be sure that even if we have
to wrap, there's enough room at the start of the ring for our emit.

So overzealous on flushing if the ring is full, in which case we are
throttling a millisecond earlier than is strictly required (given that
the ring already contains about a few seconds worth of batches)

The real problem here is that throttling one client, strangles them all.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index f250109e1f66..defe7d60bb88 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1976,6 +1976,12 @@  static int eb_submit(struct i915_execbuffer *eb)
 			return err;
 	}
 
+	if (eb->engine->emit_init_breadcrumb) {
+		err = eb->engine->emit_init_breadcrumb(eb->request);
+		if (err)
+			return err;
+	}
+
 	err = eb->engine->emit_bb_start(eb->request,
 					eb->batch->node.start +
 					eb->batch_start_offset,
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index bb2885f1dc1e..0a8a2a1bf55d 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -333,6 +333,7 @@  void i915_request_retire_upto(struct i915_request *rq)
 
 static u32 timeline_get_seqno(struct i915_timeline *tl)
 {
+	tl->seqno += tl->has_initial_breadcrumb;
 	return ++tl->seqno;
 }
 
@@ -382,8 +383,8 @@  void __i915_request_submit(struct i915_request *request)
 		intel_engine_enable_signaling(request, false);
 	spin_unlock(&request->lock);
 
-	engine->emit_breadcrumb(request,
-				request->ring->vaddr + request->postfix);
+	engine->emit_fini_breadcrumb(request,
+				     request->ring->vaddr + request->postfix);
 
 	/* Transfer from per-context onto the global per-engine timeline */
 	move_to_timeline(request, &engine->timeline);
@@ -657,7 +658,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 * around inside i915_request_add() there is sufficient space at
 	 * the beginning of the ring as well.
 	 */
-	rq->reserved_space = 2 * engine->emit_breadcrumb_sz * sizeof(u32);
+	rq->reserved_space = 2 * engine->emit_fini_breadcrumb_sz * sizeof(u32);
 
 	/*
 	 * Record the position of the start of the request so that
@@ -908,7 +909,7 @@  void i915_request_add(struct i915_request *request)
 	 * GPU processing the request, we never over-estimate the
 	 * position of the ring's HEAD.
 	 */
-	cs = intel_ring_begin(request, engine->emit_breadcrumb_sz);
+	cs = intel_ring_begin(request, engine->emit_fini_breadcrumb_sz);
 	GEM_BUG_ON(IS_ERR(cs));
 	request->postfix = intel_ring_offset(request, cs);
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 96c586d6ff4d..340d6216791c 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -344,6 +344,7 @@  static inline bool i915_request_started(const struct i915_request *rq)
 	if (i915_request_signaled(rq))
 		return true;
 
+	/* Remember: started but may have since been preempted! */
 	return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 007348b1b469..7bc9164733bc 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -132,6 +132,7 @@  int i915_timeline_init(struct drm_i915_private *i915,
 	timeline->i915 = i915;
 	timeline->name = name;
 	timeline->pin_count = 0;
+	timeline->has_initial_breadcrumb = !hwsp;
 
 	timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
 	if (!hwsp) {
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index ab736e2e5707..8caeb66d1cd5 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -48,6 +48,8 @@  struct i915_timeline {
 	struct i915_vma *hwsp_ggtt;
 	u32 hwsp_offset;
 
+	bool has_initial_breadcrumb;
+
 	/**
 	 * List of breadcrumbs associated with GPU requests currently
 	 * outstanding.
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index e532b4b27239..2a4c547240a1 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1239,7 +1239,9 @@  static void print_request(struct drm_printer *m,
 	drm_printf(m, "%s%x%s [%llx:%llx]%s @ %dms: %s\n",
 		   prefix,
 		   rq->global_seqno,
-		   i915_request_completed(rq) ? "!" : "",
+		   i915_request_completed(rq) ? "!" :
+		   i915_request_started(rq) ? "*" :
+		   "",
 		   rq->fence.context, rq->fence.seqno,
 		   buf,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1bf178ca3e00..0a2d53f19625 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -649,7 +649,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		 * WaIdleLiteRestore:bdw,skl
 		 * Apply the wa NOOPs to prevent
 		 * ring:HEAD == rq:TAIL as we resubmit the
-		 * request. See gen8_emit_breadcrumb() for
+		 * request. See gen8_emit_fini_breadcrumb() for
 		 * where we prepare the padding after the
 		 * end of the request.
 		 */
@@ -1294,6 +1294,34 @@  execlists_context_pin(struct intel_engine_cs *engine,
 	return __execlists_context_pin(engine, ctx, ce);
 }
 
+static int gen8_emit_init_breadcrumb(struct i915_request *rq)
+{
+	u32 *cs;
+
+	GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
+
+	cs = intel_ring_begin(rq, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/*
+	 * Check if we have been preempted before we even get started.
+	 *
+	 * After this point i915_request_started() reports true, even if
+	 * we get preempted and so are no longer running.
+	 */
+	*cs++ = MI_ARB_CHECK;
+	*cs++ = MI_NOOP;
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = i915_timeline_seqno_address(rq->timeline);
+	*cs++ = 0;
+	*cs++ = rq->fence.seqno - 1;
+
+	intel_ring_advance(rq, cs);
+	return 0;
+}
+
 static int emit_pdps(struct i915_request *rq)
 {
 	const struct intel_engine_cs * const engine = rq->engine;
@@ -2049,7 +2077,7 @@  static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
 	request->wa_tail = intel_ring_offset(request, cs);
 }
 
-static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
+static void gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
 {
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
@@ -2070,9 +2098,9 @@  static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
+static const int gen8_emit_fini_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
 
-static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
+static void gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
 	cs = gen8_emit_ggtt_write_rcs(cs,
 				      request->fence.seqno,
@@ -2096,7 +2124,7 @@  static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
+static const int gen8_emit_fini_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct i915_request *rq)
 {
@@ -2188,8 +2216,9 @@  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->request_alloc = execlists_request_alloc;
 
 	engine->emit_flush = gen8_emit_flush;
-	engine->emit_breadcrumb = gen8_emit_breadcrumb;
-	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
+	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
+	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
+	engine->emit_fini_breadcrumb_sz = gen8_emit_fini_breadcrumb_sz;
 
 	engine->set_default_submission = intel_execlists_set_default_submission;
 
@@ -2302,8 +2331,8 @@  int logical_render_ring_init(struct intel_engine_cs *engine)
 	/* Override some for render ring. */
 	engine->init_context = gen8_init_rcs_context;
 	engine->emit_flush = gen8_emit_flush_render;
-	engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
-	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
+	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
+	engine->emit_fini_breadcrumb_sz = gen8_emit_fini_breadcrumb_rcs_sz;
 
 	ret = logical_ring_init(engine);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 751bd4e7da42..f6b30eb46263 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1594,6 +1594,7 @@  static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 		err = PTR_ERR(timeline);
 		goto err;
 	}
+	GEM_BUG_ON(timeline->has_initial_breadcrumb);
 
 	ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
 	i915_timeline_put(timeline);
@@ -1947,6 +1948,7 @@  static int ring_request_alloc(struct i915_request *request)
 	int ret;
 
 	GEM_BUG_ON(!request->hw_context->pin_count);
+	GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
 
 	/*
 	 * Flush enough space to reduce the likelihood of waiting after
@@ -2283,11 +2285,16 @@  static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 	engine->context_pin = intel_ring_context_pin;
 	engine->request_alloc = ring_request_alloc;
 
-	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
-	engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
+	/*
+	 * Using a global execution timeline; the previous final breadcrumb is
+	 * equivalent to our next initial bread so we can elide
+	 * engine->emit_init_breadcrumb().
+	 */
+	engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
+	engine->emit_fini_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
 	if (IS_GEN(dev_priv, 5)) {
-		engine->emit_breadcrumb = gen5_emit_breadcrumb;
-		engine->emit_breadcrumb_sz = gen5_emit_breadcrumb_sz;
+		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
+		engine->emit_fini_breadcrumb_sz = gen5_emit_breadcrumb_sz;
 	}
 
 	engine->set_default_submission = i9xx_set_default_submission;
@@ -2317,13 +2324,13 @@  int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
 	if (INTEL_GEN(dev_priv) >= 7) {
 		engine->init_context = intel_rcs_ctx_init;
 		engine->emit_flush = gen7_render_ring_flush;
-		engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb;
-		engine->emit_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
+		engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
+		engine->emit_fini_breadcrumb_sz = gen7_rcs_emit_breadcrumb_sz;
 	} else if (IS_GEN(dev_priv, 6)) {
 		engine->init_context = intel_rcs_ctx_init;
 		engine->emit_flush = gen6_render_ring_flush;
-		engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb;
-		engine->emit_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
+		engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
+		engine->emit_fini_breadcrumb_sz = gen6_rcs_emit_breadcrumb_sz;
 	} else if (IS_GEN(dev_priv, 5)) {
 		engine->emit_flush = gen4_render_ring_flush;
 	} else {
@@ -2360,11 +2367,11 @@  int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
 
 		if (IS_GEN(dev_priv, 6)) {
-			engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
-			engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+			engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
+			engine->emit_fini_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
 		} else {
-			engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
-			engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
+			engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
+			engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
 		}
 	} else {
 		engine->emit_flush = bsd_ring_flush;
@@ -2389,11 +2396,11 @@  int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
 	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
 
 	if (IS_GEN(dev_priv, 6)) {
-		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
-		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+		engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
+		engine->emit_fini_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
 	} else {
-		engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
-		engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
+		engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
+		engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
 	}
 
 	return intel_init_ring_buffer(engine);
@@ -2412,8 +2419,8 @@  int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 	engine->irq_enable = hsw_vebox_irq_enable;
 	engine->irq_disable = hsw_vebox_irq_disable;
 
-	engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
-	engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
+	engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
+	engine->emit_fini_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
 
 	return intel_init_ring_buffer(engine);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a792bacf2930..d3d4f3667afb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -463,8 +463,10 @@  struct intel_engine_cs {
 					 unsigned int dispatch_flags);
 #define I915_DISPATCH_SECURE BIT(0)
 #define I915_DISPATCH_PINNED BIT(1)
-	void		(*emit_breadcrumb)(struct i915_request *rq, u32 *cs);
-	int		emit_breadcrumb_sz;
+	int		(*emit_init_breadcrumb)(struct i915_request *rq);
+	void		(*emit_fini_breadcrumb)(struct i915_request *rq,
+						u32 *cs);
+	unsigned int	emit_fini_breadcrumb_sz;
 
 	/* Pass the request to the hardware queue (e.g. directly into
 	 * the legacy ringbuffer or to the end of an execlist).
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index c0a408828415..2515cffb4490 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -227,7 +227,7 @@  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	engine->base.context_pin = mock_context_pin;
 	engine->base.request_alloc = mock_request_alloc;
 	engine->base.emit_flush = mock_emit_flush;
-	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
+	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
 	engine->base.submit_request = mock_submit_request;
 
 	if (i915_timeline_init(i915,