
[10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

Message ID 20190130021906.17933-10-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset

Commit Message

Chris Wilson Jan. 30, 2019, 2:19 a.m. UTC
Having introduced per-context seqno, we now have a means to identify
progress across the system without fear of rollback, as befell the
global_seqno. That is, we can program a MI_SEMAPHORE_WAIT operation in
advance of submission, safe in the knowledge that our target seqno and
address are stable.

However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that busy-spin will be completed quickly. To achieve this we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for default and
above priority requests (so that idle priority tasks never themselves
hog the GPU waiting for others).

But what about AB-BA deadlocks? If you remove B, there can be no
deadlock... The issue is that with a deep ELSP queue, we can queue up a
pair of AB-BA on different engines, thus forming a classic mutual
exclusion deadlock. We side-step that issue by restricting the queue
depth to avoid having multiple semaphores in flight, so that we only
ever take one set of locks at a time.
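
As an illustration, one schedule that would form such a cycle (the
request and engine names here are purely illustrative):

  engine A ELSP: [ A1: semaphore wait on B2 ] [ A2 ]
  engine B ELSP: [ B1: semaphore wait on A2 ] [ B2 ]

A1 busy-spins until B2 advances its seqno, but B2 sits behind the
spinning B1; B1 busy-spins until A2 advances its seqno, but A2 sits
behind the spinning A1, and no CPU lock is held for lockdep to notice.
With the queue depth restricted so that only one semaphore is in
flight, we only ever take one set of these "locks" at a time and the
cycle cannot close.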

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c       | 153 +++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_request.h       |   1 +
 drivers/gpu/drm/i915/i915_scheduler.c     |   1 +
 drivers/gpu/drm/i915/i915_scheduler.h     |   1 +
 drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
 drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
 drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
 drivers/gpu/drm/i915/intel_lrc.c          |  14 +-
 8 files changed, 178 insertions(+), 4 deletions(-)

Comments

Tvrtko Ursulin Jan. 31, 2019, 1:19 p.m. UTC | #1
On 30/01/2019 02:19, Chris Wilson wrote:
> Having introduced per-context seqno, we now have a means to identify
> progress across the system without fear of rollback as befell the
> global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> advance of submission safe in the knowledge that our target seqno and
> address is stable.
> 
> However, since we are telling the GPU to busy-spin on the target address
> until it matches the signaling seqno, we only want to do so when we are
> sure that busy-spin will be completed quickly. To achieve this we only
> submit the request to HW once the signaler is itself executing (modulo
> preemption causing us to wait longer), and we only do so for default and
> above priority requests (so that idle priority tasks never themselves
> hog the GPU waiting for others).

It could be milliseconds though. I think apart from media-bench saying 
this is faster, we would need to look at performance per Watt as well.

RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
idea how to tune it.

Eventually, do we dare add this without a runtime switch? (There, I
mentioned the taboo.)

What about signal mode and handling this via context switches?

> But what about AB-BA deadlocks? If you remove B, there can be no deadlock...
> The issue is that with a deep ELSP queue, we can queue up a pair of
> AB-BA on different engines, thus forming a classic mutual exclusion
> deadlock. We side-step that issue by restricting the queue depth to
> avoid having multiple semaphores in flight and so we only ever take one
> set of locks at a time.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_request.c       | 153 +++++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_request.h       |   1 +
>   drivers/gpu/drm/i915/i915_scheduler.c     |   1 +
>   drivers/gpu/drm/i915/i915_scheduler.h     |   1 +
>   drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
>   drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
>   drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
>   drivers/gpu/drm/i915/intel_lrc.c          |  14 +-
>   8 files changed, 178 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 07e4c3c68ecd..6d825cd28ae6 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -22,8 +22,9 @@
>    *
>    */
>   
> -#include <linux/prefetch.h>
>   #include <linux/dma-fence-array.h>
> +#include <linux/irq_work.h>
> +#include <linux/prefetch.h>
>   #include <linux/sched.h>
>   #include <linux/sched/clock.h>
>   #include <linux/sched/signal.h>
> @@ -326,6 +327,76 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> +struct execute_cb {
> +	struct list_head link;
> +	struct irq_work work;
> +	struct i915_sw_fence *fence;
> +};
> +
> +static void irq_execute_cb(struct irq_work *wrk)
> +{
> +	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
> +
> +	i915_sw_fence_complete(cb->fence);
> +	kfree(cb);
> +}
> +
> +static void __notify_execute_cb(struct i915_request *rq)
> +{
> +	struct execute_cb *cb;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (list_empty(&rq->execute_cb))
> +		return;
> +
> +	list_for_each_entry(cb, &rq->execute_cb, link)
> +		irq_work_queue(&cb->work);
> +
> +	/*
> +	 * XXX Rollback on __i915_request_unsubmit()
> +	 *
> +	 * In the future, perhaps when we have an active time-slicing scheduler,
> +	 * it will be interesting to unsubmit parallel execution and remove
> +	 * busywaits from the GPU until their master is restarted. This is
> +	 * quite hairy, we have to carefully rollback the fence and do a
> +	 * preempt-to-idle cycle on the target engine, all the while the
> +	 * master execute_cb may refire.
> +	 */
> +	INIT_LIST_HEAD(&rq->execute_cb);
> +}
> +
> +static int
> +i915_request_await_execution(struct i915_request *rq,
> +			     struct i915_request *signal,
> +			     gfp_t gfp)
> +{
> +	struct execute_cb *cb;
> +	unsigned long flags;
> +
> +	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags))
> +		return 0;
> +
> +	cb = kmalloc(sizeof(*cb), gfp);
> +	if (!cb)
> +		return -ENOMEM;
> +
> +	cb->fence = &rq->submit;
> +	i915_sw_fence_await(cb->fence);
> +	init_irq_work(&cb->work, irq_execute_cb);
> +
> +	spin_lock_irqsave(&signal->lock, flags);
> +	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) {
> +		i915_sw_fence_complete(cb->fence);
> +		kfree(cb);
> +	} else {
> +		list_add_tail(&cb->link, &signal->execute_cb);
> +	}
> +	spin_unlock_irqrestore(&signal->lock, flags);
> +
> +	return 0;
> +}
> +
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -373,6 +444,7 @@ void __i915_request_submit(struct i915_request *request)
>   	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
>   	    !i915_request_enable_breadcrumb(request))
>   		intel_engine_queue_breadcrumbs(engine);
> +	__notify_execute_cb(request);
>   	spin_unlock(&request->lock);
>   
>   	engine->emit_fini_breadcrumb(request,
> @@ -613,6 +685,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	}
>   
>   	INIT_LIST_HEAD(&rq->active_list);
> +	INIT_LIST_HEAD(&rq->execute_cb);
>   
>   	tl = ce->ring->timeline;
>   	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> @@ -700,6 +773,81 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	return ERR_PTR(ret);
>   }
>   
> +static int
> +emit_semaphore_wait(struct i915_request *to,
> +		    struct i915_request *from,
> +		    gfp_t gfp)
> +{
> +	u32 *cs;
> +	int err;
> +
> +	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> +
> +	err = i915_timeline_read_lock(from->timeline, to);
> +	if (err)
> +		return err;
> +
> +	/*
> +	 * If we know our signaling request has started, we know that it
> +	 * must, at least, have passed its initial breadcrumb and that its
> +	 * seqno can only increase, therefore any change in its breadcrumb
> +	 * must indicate completion. By using a "not equal to start" compare
> +	 * we avoid the murky issue of how to handle seqno wraparound in an
> +	 * async environment (short answer, we must stop the world whenever
> +	 * any context wraps!) as the likelihood of missing one request then
> +	 * seeing the same start value for a new request is 1 in 2^31, and
> +	 * even then we know that the new request has started and is in
> +	 * progress, so we are sure it will complete soon enough (not to
> +	 * worry about).
> +	 */
> +	if (i915_request_started(from)) {
> +		cs = intel_ring_begin(to, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_SEMAPHORE_WAIT |
> +			MI_SEMAPHORE_GLOBAL_GTT |
> +			MI_SEMAPHORE_POLL |
> +			MI_SEMAPHORE_SAD_NEQ_SDD;
> +		*cs++ = from->fence.seqno - 1;
> +		*cs++ = from->timeline->hwsp_offset;
> +		*cs++ = 0;
> +
> +		intel_ring_advance(to, cs);
> +	} else {
> +		int err;
> +
> +		err = i915_request_await_execution(to, from, gfp);
> +		if (err)
> +			return err;
> +
> +		cs = intel_ring_begin(to, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		/*
> +		 * Using greater-than-or-equal here means we have to worry
> +		 * about seqno wraparound. To side step that issue, we swap
> +		 * the timeline HWSP upon wrapping, so that everyone listening
> +		 * for the old (pre-wrap) values do not see the much smaller
> +		 * (post-wrap) values than they were expecting (and so wait
> +		 * forever).
> +		 */
> +		*cs++ = MI_SEMAPHORE_WAIT |
> +			MI_SEMAPHORE_GLOBAL_GTT |
> +			MI_SEMAPHORE_POLL |
> +			MI_SEMAPHORE_SAD_GTE_SDD;
> +		*cs++ = from->fence.seqno;
> +		*cs++ = from->timeline->hwsp_offset;
> +		*cs++ = 0;
> +
> +		intel_ring_advance(to, cs);
> +	}

Would it not work to have a single path which emits the wait on NEQ
from->fence.seqno - 1, with just the i915_request_await_execution call
made conditional on i915_request_started?

In the !started case, having added the await, we would know the
semaphore wait would not run until after the dependency has started, and
NEQ would be true when it completes. The same as the started path above.

> +
> +	to->sched.semaphore = true;
> +	return 0;
> +}
> +
>   static int
>   i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   {
> @@ -723,6 +871,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
>   						       &from->submit,
>   						       I915_FENCE_GFP);
> +	} else if (HAS_EXECLISTS(to->i915) &&
> +		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
> +		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>   	} else {
>   		ret = i915_sw_fence_await_dma_fence(&to->submit,
>   						    &from->fence, 0,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 40f3e8dcbdd5..66a374ee177a 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -127,6 +127,7 @@ struct i915_request {
>   	 */
>   	struct i915_sw_fence submit;
>   	wait_queue_entry_t submitq;
> +	struct list_head execute_cb;
>   
>   	/*
>   	 * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index d01683167c77..aa6c663dca09 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -29,6 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node)
>   	INIT_LIST_HEAD(&node->waiters_list);
>   	INIT_LIST_HEAD(&node->link);
>   	node->attr.priority = I915_PRIORITY_INVALID;
> +	node->semaphore = false;
>   }
>   
>   static struct i915_dependency *
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index dbe9cb7ecd82..d764cf10536f 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -72,6 +72,7 @@ struct i915_sched_node {
>   	struct list_head waiters_list; /* those after us, they depend upon us */
>   	struct list_head link;
>   	struct i915_sched_attr attr;
> +	bool semaphore;
>   };
>   
>   struct i915_dependency {
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 7c58b049ecb5..8d1400d378d7 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
>   	__i915_sw_fence_notify(fence, FENCE_FREE);
>   }
>   
> -static void i915_sw_fence_complete(struct i915_sw_fence *fence)
> +void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   
> @@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -static void i915_sw_fence_await(struct i915_sw_fence *fence)
> +void i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 0e055ea0179f..6dec9e1d1102 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> +void i915_sw_fence_await(struct i915_sw_fence *fence);
> +void i915_sw_fence_complete(struct i915_sw_fence *fence);
> +
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
>   {
>   	return atomic_read(&fence->pending) <= 0;
> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> index b96a31bc1080..0efaadd3bc32 100644
> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> @@ -106,7 +106,12 @@
>   #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
>   #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
>   #define   MI_SEMAPHORE_POLL		(1<<15)
> +#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
>   #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
> +#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
> +#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
> +#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
> +#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
>   #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>   #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
>   #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e97ce54138d3..80d17b75b2e6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -421,7 +421,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   	 * in the priority queue, but they will not gain immediate access to
>   	 * the GPU.
>   	 */
> -	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
> +	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
> +	    i915_request_started(active)) {
>   		prio |= ACTIVE_PRIORITY;
>   		active->sched.attr.priority = prio;
>   		list_move_tail(&active->sched.link,
> @@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
>   {
>   	GEM_BUG_ON(!assert_priority_queue(prev, next));
>   
> +	/*
> +	 * To avoid AB-BA deadlocks, we simply restrict ourselves to only
> +	 * submitting one semaphore (think HW spinlock) to HW at a time. This
> +	 * prevents the execution callback on a later semaphore from being
> +	 * queued on another engine, so no cycle can be formed. Preemption
> +	 * rules should mean that if this semaphore is preempted, its
> +	 * dependency chain is preserved and suitably promoted via PI.
> +	 */
> +	if (prev->sched.semaphore && !i915_request_started(prev))
> +		return false;
> +
>   	if (!can_merge_ctx(prev->hw_context, next->hw_context))
>   		return false;
>   
> 

Regards,

Tvrtko
Chris Wilson Jan. 31, 2019, 1:39 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Having introduced per-context seqno, we now have a means to identify
> > progress across the system without fear of rollback as befell the
> > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > advance of submission safe in the knowledge that our target seqno and
> > address is stable.
> > 
> > However, since we are telling the GPU to busy-spin on the target address
> > until it matches the signaling seqno, we only want to do so when we are
> > sure that busy-spin will be completed quickly. To achieve this we only
> > submit the request to HW once the signaler is itself executing (modulo
> > preemption causing us to wait longer), and we only do so for default and
> > above priority requests (so that idle priority tasks never themselves
> > hog the GPU waiting for others).
> 
> It could be milliseconds though. I think apart from media-bench saying 
> this is faster, we would need to look at performance per Watt as well.

All throughput measurements are substantially faster, as you would
expect, and inter-engine latency decreased. I would hope it would
powergate/rc6 the EU while the CS was spinning, but I don't know :)
 
> RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
> idea how to tune it.
> 
> Eventually, do we dare adding this without a runtime switch? (There, I 
> mentioned the taboo.)

Yes :p

> What about signal mode and handling this via context switches?

That's 99% of the timeslicing scheduler right there -- the handling of
deferred work with the complication of it impacting other engines.
 
> > +static int
> > +emit_semaphore_wait(struct i915_request *to,
> > +                 struct i915_request *from,
> > +                 gfp_t gfp)
> > +{
> > +     u32 *cs;
> > +     int err;
> > +
> > +     GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> > +
> > +     err = i915_timeline_read_lock(from->timeline, to);
> > +     if (err)
> > +             return err;
> > +
> > +     /*
> > +      * If we know our signaling request has started, we know that it
> > +      * must, at least, have passed its initial breadcrumb and that its
> > +      * seqno can only increase, therefore any change in its breadcrumb
> > +      * must indicate completion. By using a "not equal to start" compare
> > +      * we avoid the murky issue of how to handle seqno wraparound in an
> > +      * async environment (short answer, we must stop the world whenever
> > +      * any context wraps!) as the likelihood of missing one request then
> > +      * seeing the same start value for a new request is 1 in 2^31, and
> > +      * even then we know that the new request has started and is in
> > +      * progress, so we are sure it will complete soon enough (not to
> > +      * worry about).
> > +      */
> > +     if (i915_request_started(from)) {
> > +             cs = intel_ring_begin(to, 4);
> > +             if (IS_ERR(cs))
> > +                     return PTR_ERR(cs);
> > +
> > +             *cs++ = MI_SEMAPHORE_WAIT |
> > +                     MI_SEMAPHORE_GLOBAL_GTT |
> > +                     MI_SEMAPHORE_POLL |
> > +                     MI_SEMAPHORE_SAD_NEQ_SDD;
> > +             *cs++ = from->fence.seqno - 1;
> > +             *cs++ = from->timeline->hwsp_offset;
> > +             *cs++ = 0;
> > +
> > +             intel_ring_advance(to, cs);
> > +     } else {
> > +             int err;
> > +
> > +             err = i915_request_await_execution(to, from, gfp);
> > +             if (err)
> > +                     return err;
> > +
> > +             cs = intel_ring_begin(to, 4);
> > +             if (IS_ERR(cs))
> > +                     return PTR_ERR(cs);
> > +
> > +             /*
> > +              * Using greater-than-or-equal here means we have to worry
> > +              * about seqno wraparound. To side step that issue, we swap
> > +              * the timeline HWSP upon wrapping, so that everyone listening
> > +              * for the old (pre-wrap) values do not see the much smaller
> > +              * (post-wrap) values than they were expecting (and so wait
> > +              * forever).
> > +              */
> > +             *cs++ = MI_SEMAPHORE_WAIT |
> > +                     MI_SEMAPHORE_GLOBAL_GTT |
> > +                     MI_SEMAPHORE_POLL |
> > +                     MI_SEMAPHORE_SAD_GTE_SDD;
> > +             *cs++ = from->fence.seqno;
> > +             *cs++ = from->timeline->hwsp_offset;
> > +             *cs++ = 0;
> > +
> > +             intel_ring_advance(to, cs);
> > +     }
> 
> Would it not work to have a single path which emits the wait on NEQ 
> from->fence.seqno - 1, just i915_request_await_execution conditional on 
> i915_request_started?
> 
> In the !started case, having added the await, we would know the 
> semaphore wait would not run until after the dependency has started, and 
> NEQ would be true when it completes. The same as the above started path.

We may have previously submitted the signaler in a very long queue to
its engine, so we cannot determine its position, in which case we could
sample a long time before it even begins. Even if we launch both
requests on their different engines at the same time, we could still
sample before the signaler has started.
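
To make that window concrete (values purely illustrative, assuming the
start value used by the NEQ compare, from->fence.seqno - 1, is what the
signaler's initial breadcrumb writes to the HWSP): if the HWSP still
reads 40, say because the previous request on that timeline (seqno 41)
has started but not yet completed, and the signaler's seqno is 42, then
a waiter polling "not equal to 41" that samples at this point sees
40 != 41 and falls through immediately, long before 42 is ever written.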

I should remove the current NEQ path; it predates my committing to
handling the HWSP across wraparounds, and is now just needless
complication.
-Chris
Chris Wilson Jan. 31, 2019, 4:20 p.m. UTC | #3
Quoting Chris Wilson (2019-01-31 13:39:50)
> Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> > 
> > On 30/01/2019 02:19, Chris Wilson wrote:
> > > Having introduced per-context seqno, we now have a means to identify
> > > progress across the system without fear of rollback as befell the
> > > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > > advance of submission safe in the knowledge that our target seqno and
> > > address is stable.
> > > 
> > > However, since we are telling the GPU to busy-spin on the target address
> > > until it matches the signaling seqno, we only want to do so when we are
> > > sure that busy-spin will be completed quickly. To achieve this we only
> > > submit the request to HW once the signaler is itself executing (modulo
> > > preemption causing us to wait longer), and we only do so for default and
> > > above priority requests (so that idle priority tasks never themselves
> > > hog the GPU waiting for others).
> > 
> > It could be milliseconds though. I think apart from media-bench saying 
> > this is faster, we would need to look at performance per Watt as well.
> 
> All throughput measurements are substantially faster, as you would
> expect, and inter-engine latency decreased. I would hope it would
> powergate/rc6 the EU while the CS was spinning, but I don't know :)

Fwiw, it's about the power cost of simply spinning with the CS without
any additional cost of utilizing the engine.
-Chris
Chris Wilson Jan. 31, 2019, 5:21 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Having introduced per-context seqno, we now have a means to identify
> > progress across the system without fear of rollback as befell the
> > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > advance of submission safe in the knowledge that our target seqno and
> > address is stable.
> > 
> > However, since we are telling the GPU to busy-spin on the target address
> > until it matches the signaling seqno, we only want to do so when we are
> > sure that busy-spin will be completed quickly. To achieve this we only
> > submit the request to HW once the signaler is itself executing (modulo
> > preemption causing us to wait longer), and we only do so for default and
> > above priority requests (so that idle priority tasks never themselves
> > hog the GPU waiting for others).
> 
> It could be milliseconds though. I think apart from media-bench saying 
> this is faster, we would need to look at performance per Watt as well.
> 
> RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
> idea how to tune it.
> 
> Eventually, do we dare adding this without a runtime switch? (There, I 
> mentioned the taboo.)

Yes, we could make it a context setparam. I used priority here, as the
argument that idle workloads don't want the extra power draw makes sense.

Downside of making it opt-in: nobody benefits. Still, it's pretty
limited to media workloads at the moment (who else uses multiple rings
atm?), but even there reducing latency for desktop video is justifiable
imo.

(Now having said that, I should go out and find a video player to
benchmark... Maybe we can demonstrate reduced frame drop for Kodi. If I
say "Kodi, Kodi, Kodi" I summon a Kodi dev right?)

Downside of making it opt-out: everybody gets to experience our bugs,
and the onus is on us to make the right choice.

> > @@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
> >   {
> >       GEM_BUG_ON(!assert_priority_queue(prev, next));
> >   
> > +     /*
> > +      * To avoid AB-BA deadlocks, we simply restrict ourselves to only
> > +      * submitting one semaphore (think HW spinlock) to HW at a time. This
> > +      * prevents the execution callback on a later sempahore from being
> > +      * queued on another engine, so no cycle can be formed. Preemption
> > +      * rules should mean that if this semaphore is preempted, its
> > +      * dependency chain is preserved and suitably promoted via PI.
> > +      */
> > +     if (prev->sched.semaphore && !i915_request_started(prev))
> > +             return false;

The other way I was thinking we could solve this is to move the
execute_cb from i915_request_submit to the point where we actually
insert the request into ELSP[0] (or do the promotion from ELSP[1]).

I don't like either much. I don't really want to walk the list of
requests for port0 checking for execute_cb, but I also don't like
arbitrarily splitting contexts (however, there seem to be reasons to do
that anyway).

It all depends on how fast we can service CS interrupts, and that needs
to always be fast. :|
-Chris
Chris Wilson Feb. 1, 2019, 9:03 a.m. UTC | #5
Quoting Chris Wilson (2019-01-31 16:20:37)
> Quoting Chris Wilson (2019-01-31 13:39:50)
> > Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> > > 
> > > On 30/01/2019 02:19, Chris Wilson wrote:
> > > > Having introduced per-context seqno, we now have a means to identify
> > > > progress across the system without fear of rollback as befell the
> > > > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > > > advance of submission safe in the knowledge that our target seqno and
> > > > address is stable.
> > > > 
> > > > However, since we are telling the GPU to busy-spin on the target address
> > > > until it matches the signaling seqno, we only want to do so when we are
> > > > sure that busy-spin will be completed quickly. To achieve this we only
> > > > submit the request to HW once the signaler is itself executing (modulo
> > > > preemption causing us to wait longer), and we only do so for default and
> > > > above priority requests (so that idle priority tasks never themselves
> > > > hog the GPU waiting for others).
> > > 
> > > It could be milliseconds though. I think apart from media-bench saying 
> > > this is faster, we would need to look at performance per Watt as well.
> > 
> > All throughput measurements are substantially faster, as you would
> > expect, and inter-engine latency decreased. I would hope it would
> > powergate/rc6 the EU while the CS was spinning, but I don't know :)
> 
> Fwiw, it's about the power cost of simply spinning with the CS without
> any additional cost of utilizing the engine.

Another interesting data point is that the *total* energy consumed for a
latency-bound test that passes a piece of work from one engine to the
next was reduced by 30%, with a speed increase of nearly 100% (on glk).
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 07e4c3c68ecd..6d825cd28ae6 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@ 
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
@@ -326,6 +327,76 @@  void i915_request_retire_upto(struct i915_request *rq)
 	} while (tmp != rq);
 }
 
+struct execute_cb {
+	struct list_head link;
+	struct irq_work work;
+	struct i915_sw_fence *fence;
+};
+
+static void irq_execute_cb(struct irq_work *wrk)
+{
+	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+	i915_sw_fence_complete(cb->fence);
+	kfree(cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+	struct execute_cb *cb;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (list_empty(&rq->execute_cb))
+		return;
+
+	list_for_each_entry(cb, &rq->execute_cb, link)
+		irq_work_queue(&cb->work);
+
+	/*
+	 * XXX Rollback on __i915_request_unsubmit()
+	 *
+	 * In the future, perhaps when we have an active time-slicing scheduler,
+	 * it will be interesting to unsubmit parallel execution and remove
+	 * busywaits from the GPU until their master is restarted. This is
+	 * quite hairy, we have to carefully rollback the fence and do a
+	 * preempt-to-idle cycle on the target engine, all the while the
+	 * master execute_cb may refire.
+	 */
+	INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+			     struct i915_request *signal,
+			     gfp_t gfp)
+{
+	struct execute_cb *cb;
+	unsigned long flags;
+
+	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags))
+		return 0;
+
+	cb = kmalloc(sizeof(*cb), gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = &rq->submit;
+	i915_sw_fence_await(cb->fence);
+	init_irq_work(&cb->work, irq_execute_cb);
+
+	spin_lock_irqsave(&signal->lock, flags);
+	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) {
+		i915_sw_fence_complete(cb->fence);
+		kfree(cb);
+	} else {
+		list_add_tail(&cb->link, &signal->execute_cb);
+	}
+	spin_unlock_irqrestore(&signal->lock, flags);
+
+	return 0;
+}
+
 static void move_to_timeline(struct i915_request *request,
 			     struct i915_timeline *timeline)
 {
@@ -373,6 +444,7 @@  void __i915_request_submit(struct i915_request *request)
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
 	    !i915_request_enable_breadcrumb(request))
 		intel_engine_queue_breadcrumbs(engine);
+	__notify_execute_cb(request);
 	spin_unlock(&request->lock);
 
 	engine->emit_fini_breadcrumb(request,
@@ -613,6 +685,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	}
 
 	INIT_LIST_HEAD(&rq->active_list);
+	INIT_LIST_HEAD(&rq->execute_cb);
 
 	tl = ce->ring->timeline;
 	ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -700,6 +773,81 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	return ERR_PTR(ret);
 }
 
+static int
+emit_semaphore_wait(struct i915_request *to,
+		    struct i915_request *from,
+		    gfp_t gfp)
+{
+	u32 *cs;
+	int err;
+
+	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+
+	err = i915_timeline_read_lock(from->timeline, to);
+	if (err)
+		return err;
+
+	/*
+	 * If we know our signaling request has started, we know that it
+	 * must, at least, have passed its initial breadcrumb and that its
+	 * seqno can only increase, therefore any change in its breadcrumb
+	 * must indicate completion. By using a "not equal to start" compare
+	 * we avoid the murky issue of how to handle seqno wraparound in an
+	 * async environment (short answer, we must stop the world whenever
+	 * any context wraps!) as the likelihood of missing one request then
+	 * seeing the same start value for a new request is 1 in 2^31, and
+	 * even then we know that the new request has started and is in
+	 * progress, so we are sure it will complete soon enough (not to
+	 * worry about).
+	 */
+	if (i915_request_started(from)) {
+		cs = intel_ring_begin(to, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_NEQ_SDD;
+		*cs++ = from->fence.seqno - 1;
+		*cs++ = from->timeline->hwsp_offset;
+		*cs++ = 0;
+
+		intel_ring_advance(to, cs);
+	} else {
+		int err;
+
+		err = i915_request_await_execution(to, from, gfp);
+		if (err)
+			return err;
+
+		cs = intel_ring_begin(to, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		/*
+		 * Using greater-than-or-equal here means we have to worry
+		 * about seqno wraparound. To side step that issue, we swap
+		 * the timeline HWSP upon wrapping, so that everyone listening
+		 * for the old (pre-wrap) values do not see the much smaller
+		 * (post-wrap) values than they were expecting (and so wait
+		 * forever).
+		 */
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = from->fence.seqno;
+		*cs++ = from->timeline->hwsp_offset;
+		*cs++ = 0;
+
+		intel_ring_advance(to, cs);
+	}
+
+	to->sched.semaphore = true;
+	return 0;
+}
+
 static int
 i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
@@ -723,6 +871,9 @@  i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
 						       I915_FENCE_GFP);
+	} else if (HAS_EXECLISTS(to->i915) &&
+		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	} else {
 		ret = i915_sw_fence_await_dma_fence(&to->submit,
 						    &from->fence, 0,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 40f3e8dcbdd5..66a374ee177a 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -127,6 +127,7 @@  struct i915_request {
 	 */
 	struct i915_sw_fence submit;
 	wait_queue_entry_t submitq;
+	struct list_head execute_cb;
 
 	/*
 	 * A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index d01683167c77..aa6c663dca09 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -29,6 +29,7 @@  void i915_sched_node_init(struct i915_sched_node *node)
 	INIT_LIST_HEAD(&node->waiters_list);
 	INIT_LIST_HEAD(&node->link);
 	node->attr.priority = I915_PRIORITY_INVALID;
+	node->semaphore = false;
 }
 
 static struct i915_dependency *
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index dbe9cb7ecd82..d764cf10536f 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -72,6 +72,7 @@  struct i915_sched_node {
 	struct list_head waiters_list; /* those after us, they depend upon us */
 	struct list_head link;
 	struct i915_sched_attr attr;
+	bool semaphore;
 };
 
 struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..8d1400d378d7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@  static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
 	__i915_sw_fence_notify(fence, FENCE_FREE);
 }
 
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 
@@ -202,7 +202,7 @@  static void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..6dec9e1d1102 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@  int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
 {
 	return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index b96a31bc1080..0efaadd3bc32 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -106,7 +106,12 @@ 
 #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
 #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
 #define   MI_SEMAPHORE_POLL		(1<<15)
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
 #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
 #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
 #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
 #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e97ce54138d3..80d17b75b2e6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -421,7 +421,8 @@  __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	 * in the priority queue, but they will not gain immediate access to
 	 * the GPU.
 	 */
-	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
+	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
+	    i915_request_started(active)) {
 		prio |= ACTIVE_PRIORITY;
 		active->sched.attr.priority = prio;
 		list_move_tail(&active->sched.link,
@@ -605,6 +606,17 @@  static bool can_merge_rq(const struct i915_request *prev,
 {
 	GEM_BUG_ON(!assert_priority_queue(prev, next));
 
+	/*
+	 * To avoid AB-BA deadlocks, we simply restrict ourselves to only
+	 * submitting one semaphore (think HW spinlock) to HW at a time. This
+	 * prevents the execution callback on a later semaphore from being
+	 * queued on another engine, so no cycle can be formed. Preemption
+	 * rules should mean that if this semaphore is preempted, its
+	 * dependency chain is preserved and suitably promoted via PI.
+	 */
+	if (prev->sched.semaphore && !i915_request_started(prev))
+		return false;
+
 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
 		return false;