[v3,11/14] HACK drm/i915/scheduler: emulate a scheduler for guc

Message ID 20161114085703.16540-11-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Nov. 14, 2016, 8:57 a.m. UTC
This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  3 --
 3 files changed, 76 insertions(+), 16 deletions(-)

Comments

Tvrtko Ursulin Nov. 14, 2016, 11:31 a.m. UTC | #1
On 14/11/2016 08:57, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue; however, it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

Don't know what to do with this one. It feels like it should be a 
separate patch so it can be performance evaluated properly?

It is also not clear to me why we don't need any similar limiting for 
the execlists request merging?

Regards,

Tvrtko

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 76 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 4462112725ef..088f5a99ecfc 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	rq->previous_context = engine->last_context;
>  	engine->last_context = rq->ctx;
>
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }
>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb8a75f6ca16..18dce4c66d56 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);
>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d13a335ad83a..ffab255e55a7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>
Chris Wilson Nov. 14, 2016, 2:40 p.m. UTC | #2
On Mon, Nov 14, 2016 at 11:31:11AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 08:57, Chris Wilson wrote:
> >This emulates execlists on top of the GuC in order to defer submission of
> >requests to the hardware. This deferral allows time for high priority
> >requests to gazump their way to the head of the queue; however, it nerfs
> >the GuC by converting it back into a simple execlist (where the CPU has
> >to wake up after every request to feed new commands into the GuC).
> 
> Don't know what to do with this one. It feels like it should be a
> separate patch so it can be performance evaluated properly?

Yes. It is not clear if this is the right approach for the guc, since
the firmware may have other ideas on how to do scheduling. It is an
interesting thought experiment into how easy it would be to add
scheduling on top!
 
> It is also not clear to me why we don't need any similar limiting
> for the execlists request merging?

This uses exactly the same merging strategy as execlists (with the
exception of not supporting gvt's single request dispatch), so we should
be merging a ringfill of requests from one context if available. Which
of course has its downsides wrt scheduling latency without preemption,
and I'm not sure there is even a right answer without preemption +
timeslicing - the choice is more or less merge everything, or merge
nothing, so we stick with the status quo of merge everything until
proven otherwise.
-Chris
Tvrtko Ursulin Dec. 1, 2016, 10:45 a.m. UTC | #3
On 14/11/2016 08:57, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue; however, it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

As it is starting to sink in that we'll have to add this hack sooner or 
later, review comments below.

Also, would you be OK to rebase this or would you prefer to delegate it?

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 76 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 4462112725ef..088f5a99ecfc 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	rq->previous_context = engine->last_context;
>  	engine->last_context = rq->ctx;
>
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }

Confused me at first here until I noticed engine->submit_request will be 
the execlist_submit_request later. Perhaps it would be good to rename a 
lot of things now, like engine->request_queue, 
intel_engine_submit_request and maybe more?

>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {

Not sure if GVT comes into the picture here, but it does not sound like 
it would do any harm to use can_merge_ctx here?
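
Something along these lines, perhaps (untested sketch; assumes the
execlists can_merge_ctx() helper is made visible to the GuC path):

	if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
		/* can_merge_ctx() would also catch GVT's
		 * single-port-submission contexts, not just
		 * ctx inequality. */
		if (port != engine->execlist_port)
			break;
		...
	}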

> +			if (port != engine->execlist_port)
> +				break;

It may be overkill for the first version, but I was thinking that we 
don't have to limit it to two at a time. And it would depend on 
measuring of course. But perhaps it would make sense to do the 
generalisation of the number of supported ports straight away.
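
Hand-wavy sketch of the kind of thing I mean (nr_ports is invented
here, e.g. sized from a module parameter):

	const struct execlist_port * const last_port =
		engine->execlist_port + nr_ports - 1;
	...
	if (last && cursor->ctx != last->ctx) {
		if (port == last_port)
			break;
		...
	}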

> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}

We could theoretically share most of the execlist_dequeue and just do a 
couple things differently depending on the mode.

Looks like one could be a new engine->submit_ports vfunc. And there is 
also the lite restore WA and sw signalling to design in nicely, but it 
may be worth sharing the code. It would be renamed to something like 
scheduler_dequeue.
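
Rough shape (sketch only, names invented): the shared dequeue would
populate the ports under engine->timeline->lock and then hand over to
a backend hook,

	engine->submit_ports(engine);

with execlists writing the ELSP from its implementation and the GuC
backend appending its work items / ringing the doorbell from its own.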

> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb8a75f6ca16..18dce4c66d56 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);

This would be better made conditional on GuC submission, just to avoid 
calling tasklet_schedule twice (occasionally) in execlist mode.

>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d13a335ad83a..ffab255e55a7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>

Regards,

Tvrtko
Chris Wilson Dec. 1, 2016, 11:18 a.m. UTC | #4
On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 08:57, Chris Wilson wrote:
> >This emulates execlists on top of the GuC in order to defer submission of
> >requests to the hardware. This deferral allows time for high priority
> >requests to gazump their way to the head of the queue; however, it nerfs
> >the GuC by converting it back into a simple execlist (where the CPU has
> >to wake up after every request to feed new commands into the GuC).
> 
> As it is starting to sink in that we'll have to add this hack sooner
> or later, review comments below.
> 
> Also, would you be OK to rebase this or would you prefer to delegate it?

It's been very easy to keep it current.

> >@@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
> > 	spin_unlock(&client->wq_lock);
> > }
> 
> Confused me at first here until I noticed engine->submit_request
> will be the execlist_submit_request later. Perhaps it would be good
> to rename a lot of things now, like engine->request_queue,
> intel_engine_submit_request and maybe more?

Looking at the lifecycle of the request and getting clear names for the
phases and their vfuncs would be sensible. It may also be worthwhile to
decide that some are not engine tasks and belong to a new entity
(dev_priv->gt.X ?)
 
> >+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> >+{
> >+	struct execlist_port *port = engine->execlist_port;
> >+	struct drm_i915_gem_request *last = port[0].request;
> >+	unsigned long flags;
> >+	struct rb_node *rb;
> >+	bool submit = false;
> >+
> >+	spin_lock_irqsave(&engine->timeline->lock, flags);
> >+	rb = engine->execlist_first;
> >+	while (rb) {
> >+		struct drm_i915_gem_request *cursor =
> >+			rb_entry(rb, typeof(*cursor), priotree.node);
> >+
> >+		if (last && cursor->ctx != last->ctx) {
> 
> Not sure if GVT comes into the picture here, but it does not sound
> like it would do any harm to use can_merge_ctx here?

I wasn't sure what path GVT would take either. So just went with the
simple version that looked as similar to the current guc submission as
possible. Also offloading the scheduling to the guc via semaphores will
likely make this whole chain look completely different.

> >+			if (port != engine->execlist_port)
> >+				break;
> 
> It may be overkill for the first version, but I was thinking that
> we don't have to limit it to two at a time. And it would depend on
> measuring of course. But perhaps it would make sense to do the
> generalisation of the number of supported ports straight away.

Definitely. I was just looking at a minimal conversion, hence reusing
the existing tracking, and limits.

> We could theoretically share most of the execlist_dequeue and just
> do a couple things differently depending on the mode.

Yes, there were just a couple of intrusive warts that I felt justified
keeping the routines separate. But mostly it was the merge_ctx
decision that looked to be backend specific.
 
> Looks like one could be a new engine->submit_ports vfunc. And there
> is also the lite restore WA and sw signalling to design in nicely,
> but it may be worth sharing the code. It would be renamed to
> something like scheduler_dequeue.

> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index cb8a75f6ca16..18dce4c66d56 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
> > static __always_inline void
> > gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
> > {
> >-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> >+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> >+		tasklet_schedule(&engine->irq_tasklet);
> 
> This would be better made conditional on GuC submission, just to
> avoid calling tasklet_schedule twice (occasionally) in execlist mode.

It's not a huge cost if we do schedule the tasklet twice (in the same
interrupt context at least). Otherwise, yes we are incurring more mmio
reads to determine that the CSB didn't advance. On the other hand, I
really don't want to have if (i915.enable_guc_submission) here. Maybe
if (engine->user_irq_tasklet) tasklet_schedule(engine->user_irq_tasklet)?
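
i.e. something like this in gen8_cs_irq_handler (sketch only;
user_irq_tasklet would be a new, possibly NULL, tasklet pointer on
intel_engine_cs):

	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
		if (engine->user_irq_tasklet)
			tasklet_schedule(engine->user_irq_tasklet);
		notify_ring(engine);
	}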

Speaking of CSB, the spec keeps hinting that "CSB occupies a whole
cacheline, making the read cheap". I'm not aware of how we can do a UC
read of a whole cacheline in a single transfer. Ideas? movntdqa I thought was
specifically WC.
-Chris
Tvrtko Ursulin Dec. 1, 2016, 12:45 p.m. UTC | #5
On 01/12/2016 11:18, Chris Wilson wrote:
> On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
>>
>> On 14/11/2016 08:57, Chris Wilson wrote:
>>> This emulates execlists on top of the GuC in order to defer submission of
>>> requests to the hardware. This deferral allows time for high priority
>>> requests to gazump their way to the head of the queue; however, it nerfs
>>> the GuC by converting it back into a simple execlist (where the CPU has
>>> to wake up after every request to feed new commands into the GuC).
>>
>> As it is starting to sink in that we'll have to add this hack sooner
>> or later, review comments below.
>>
>> Also, would you be OK to rebase this or would you prefer to delegate it?
>
> It's been very easy to keep it current.
>
>>> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>>> 	spin_unlock(&client->wq_lock);
>>> }
>>
>> Confused me at first here until I noticed engine->submit_request
>> will be the execlist_submit_request later. Perhaps it would be good
>> to rename a lot of things now, like engine->request_queue,
>> intel_engine_submit_request and maybe more?
>
> Looking at the lifecycle of the request and getting clear names for the
> phases and their vfuncs would be sensible. It may also be worthwhile to
> decide that some are not engine tasks and belong to a new entity
> (dev_priv->gt.X ?)

Like submit_request? Makes sense yes.

>>> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>>> +{
>>> +	struct execlist_port *port = engine->execlist_port;
>>> +	struct drm_i915_gem_request *last = port[0].request;
>>> +	unsigned long flags;
>>> +	struct rb_node *rb;
>>> +	bool submit = false;
>>> +
>>> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>>> +	rb = engine->execlist_first;
>>> +	while (rb) {
>>> +		struct drm_i915_gem_request *cursor =
>>> +			rb_entry(rb, typeof(*cursor), priotree.node);
>>> +
>>> +		if (last && cursor->ctx != last->ctx) {
>>
>> Not sure if GVT comes into the picture here, but it does not sound
>> like it would do any harm to use can_merge_ctx here?
>
> I wasn't sure what path GVT would take either. So just went with the
> simple version that looked as similar to the current guc submission as
> possible. Also offloading the scheduling to the guc via semaphores will
> likely make this whole chain look completely different.

Hmm I am not up to speed with that. So you are saying it doesn't make 
sense to unify this?

>>> +			if (port != engine->execlist_port)
>>> +				break;
>>
>> It may be overkill for the first version, but I was thinking that
>> we don't have to limit it to two at a time. And it would depend on
>> measuring of course. But perhaps it would make sense to do the
>> generalisation of the number of supported ports straight away.
>
> Definitely. I was just looking at a minimal conversion, hence reusing
> the existing tracking, and limits.

Definitely leave it for later, or definitely it makes sense to 
generalise right now? I was just thinking that when someone goes to test 
this and finds the throughput regresses, it might be easier to just 
say please try i915.guc_submit_ports=8 or something.
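
For illustration, something like this (sketch only, following the
existing enable_guc_submission parameter; the member would also need
adding to struct i915_params):

	/* i915_params.c */
	module_param_named(guc_submit_ports, i915.guc_submit_ports, int, 0400);
	MODULE_PARM_DESC(guc_submit_ports,
		"Number of GuC submission ports to keep filled (default: 2)");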

>> We could theoretically share most of the execlist_dequeue and just
>> do a couple things differently depending on the mode.
>
> Yes, there were just a couple of intrusive warts that I felt justified
> in keeping the routines separate. But mostly it was the merge_ctx
> decision that looked to be backend specific.

Cool.

>> Looks like one could be a new engine->submit_ports vfunc. And there
>> is also the lite restore WA and sw signalling to design in nicely,
>> but it may be worth sharing the code. It would be renamed to
>> something like scheduler_dequeue.
>
>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index cb8a75f6ca16..18dce4c66d56 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>>> static __always_inline void
>>> gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>>> {
>>> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>>> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
>>> +		tasklet_schedule(&engine->irq_tasklet);
>>
>> This would be better made conditional on GuC submission, just to
>> avoid calling tasklet_schedule twice (occasionally) in execlist mode.
>
> It's not a huge cost if we do schedule the tasklet twice (in the same
> interrupt context at least). Otherwise, yes we are incurring more mmio
> reads to determine that the CSB didn't advance. On the other hand, I
> really don't want to have if (i915.enable_guc_submission) here. Maybe
> if (engine->user_irq_tasklet) tasklet_schedule(engine->user_irq_tasklet)
> ?

Hm, I don't know. One conditional plus a redundant engine struct member 
or just one conditional. Maybe dev_priv->gt.handle_user_interrupt and 
dev_priv->gt.handle_ctx_switch_interrupt? You won't like the indirection 
I am sure. Go with the user_irq_tasklet then.

> Speaking of CSB, the spec keeps hinting that "CSB occupies a whole
> cacheline, making the read cheap". I'm not aware of how we can do a UC
> read of a whole cacheline in a single transfer. Ideas? movntdqa I thought was
> specifically WC.

Unfortunately I don't know. Low-level x86 is not my strong area. I only 
learnt of movntdqa when Akash and you started using it for example.

Regards,

Tvrtko
Chris Wilson Dec. 1, 2016, 1:01 p.m. UTC | #6
On Thu, Dec 01, 2016 at 12:45:18PM +0000, Tvrtko Ursulin wrote:
> 
> On 01/12/2016 11:18, Chris Wilson wrote:
> >On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 14/11/2016 08:57, Chris Wilson wrote:
> >>>+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> >>>+{
> >>>+	struct execlist_port *port = engine->execlist_port;
> >>>+	struct drm_i915_gem_request *last = port[0].request;
> >>>+	unsigned long flags;
> >>>+	struct rb_node *rb;
> >>>+	bool submit = false;
> >>>+
> >>>+	spin_lock_irqsave(&engine->timeline->lock, flags);
> >>>+	rb = engine->execlist_first;
> >>>+	while (rb) {
> >>>+		struct drm_i915_gem_request *cursor =
> >>>+			rb_entry(rb, typeof(*cursor), priotree.node);
> >>>+
> >>>+		if (last && cursor->ctx != last->ctx) {
> >>
> >>Not sure if GVT comes into the picture here, but it does not sound
> >>like it would do any harm to use can_merge_ctx here?
> >
> >I wasn't sure what path GVT would take either. So just went with the
> >simple version that looked as similar to the current guc submission as
> >possible. Also offloading the scheduling to the guc via semaphores will
> >likely make this whole chain look completely different.
> 
> Hmm I am not up to speed with that. So you are saying it doesn't
> make sense to unify this?

Just not sure yet. Too much duplication, too much engineering are both
traps we may make for ourselves.

> >>>+			if (port != engine->execlist_port)
> >>>+				break;
> >>
> >>It may be overkill for the first version, but I was thinking that
> >>we don't have to limit it to two at a time. And it would depend on
> >>measuring of course. But perhaps it would make sense to do the
> >>generalisation of the number of supported ports straight away.
> >
> >Definitely. I was just looking at a minimal conversion, hence reusing
> >the existing tracking, and limits.
> 
> Definitely leave it for later, or definitely it makes sense to
> generalise right now? I was just thinking that when someone goes to
> test this and finds the throughput regresses, it might be
> easier to just say please try i915.guc_submit_ports=8 or something.

It was "definitely not worth it in this patch and definitely makes sense
to investigate". Very rapid diminishing returns: it comes down to how
many requests will complete in the service time of the first irq. You'll
be looking at the no-op switching workloads that stress the driver,
rather than the actual workloads that stress the system. The cheapest
typical ping-pong is GL client -> display server -> GL client (though
OpenCL may beat that), and for that GL sequence, 3 ports would easily
cover us. [1 active, 2 pending slots really.]
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 4462112725ef..088f5a99ecfc 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -469,7 +469,7 @@  int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
 	freespace -= gc->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -479,7 +479,7 @@  int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		gc->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 
 	return ret;
 }
@@ -491,9 +491,9 @@  void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	gc->wq_rsvd -= wqi_size;
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -644,7 +644,7 @@  static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	rq->previous_context = engine->last_context;
 	engine->last_context = rq->ctx;
 
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
@@ -665,6 +665,70 @@  static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	spin_unlock(&client->wq_lock);
 }
 
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			dma_fence_enable_sw_signaling(&last->fence);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		dma_fence_enable_sw_signaling(&last->fence);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			rq = port[1].request;
+			port[0].request = rq;
+			port[1].request = NULL;
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -1531,16 +1595,13 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
-		list_for_each_entry(request,
-				    &engine->timeline->requests, link) {
+		list_for_each_entry(request, &engine->timeline->requests, link)
 			client->wq_rsvd += sizeof(struct guc_wq_item);
-			if (i915_sw_fence_done(&request->submit))
-				i915_guc_submit(request);
-		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index cb8a75f6ca16..18dce4c66d56 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1341,8 +1341,10 @@  static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		tasklet_schedule(&engine->irq_tasklet);
 		notify_ring(engine);
+	}
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		tasklet_schedule(&engine->irq_tasklet);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d13a335ad83a..ffab255e55a7 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1425,9 +1425,6 @@  static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
 	if (request->ctx != port[0].request->ctx) {