[1/2] drm/i915: signal first fence from irq handler if complete

Message ID 20170217155150.7837-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Feb. 17, 2017, 3:51 p.m. UTC
As execlists and other non-semaphore multi-engine devices coordinate
between engines using interrupts, we can shave off a few tens of
microseconds of scheduling latency by doing the fence signaling from the
interrupt as opposed to an RT kthread. (Realistically the delay adds
about 1% to an individual cross-engine workload.) We only signal the
first fence in order to limit the amount of work we move into the
interrupt handler. We also have to remember that our breadcrumbs may be
unordered with respect to the interrupt and so we still require the
waiter process to perform some heavyweight coherency fixups, as well as
traversing the tree of waiters.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h          | 19 +++++++++-------
 drivers/gpu/drm/i915/i915_gem_request.c  |  1 +
 drivers/gpu/drm/i915/i915_gem_request.h  |  2 ++
 drivers/gpu/drm/i915/i915_irq.c          | 38 ++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 10 +--------
 drivers/gpu/drm/i915/intel_ringbuffer.h  | 30 +++++++++----------------
 6 files changed, 62 insertions(+), 38 deletions(-)
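
For context, the pattern being optimized is cross-engine coordination
through a dma-fence callback: once engine A's fence signals, a callback
submits the next request to engine B. A schematic sketch using the
generic dma-fence API (struct pending_request and queue_to_hardware()
are hypothetical, not driver code):

	/* When fence A signals, submit request B. Every microsecond
	 * between the hw interrupt and dma_fence_signal() is added to
	 * B's submission latency -- hence signaling the first fence
	 * directly from the irq handler rather than waking the
	 * signaler kthread first.
	 */
	struct pending_request {
		struct dma_fence_cb cb;
		/* ... description of the request to queue ... */
	};

	static void submit_next(struct dma_fence *f, struct dma_fence_cb *cb)
	{
		struct pending_request *b = container_of(cb, typeof(*b), cb);

		queue_to_hardware(b); /* hypothetical submission hook */
	}

	/* ... b describes the pending request for engine B ... */
	dma_fence_add_callback(fence_a, &b->cb, submit_next);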

Comments

Tvrtko Ursulin Feb. 22, 2017, 4:53 p.m. UTC | #1
On 17/02/2017 15:51, Chris Wilson wrote:
> As execlists and other non-semaphore multi-engine devices coordinate
> between engines using interrupts, we can shave off a few tens of
> microseconds of scheduling latency by doing the fence signaling from the
> interrupt as opposed to an RT kthread. (Realistically the delay adds
> about 1% to an individual cross-engine workload.) We only signal the
> first fence in order to limit the amount of work we move into the
> interrupt handler. We also have to remember that our breadcrumbs may be
> unordered with respect to the interrupt and so we still require the
> waiter process to perform some heavyweight coherency fixups, as well as
> traversing the tree of waiters.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h          | 19 +++++++++-------
>  drivers/gpu/drm/i915/i915_gem_request.c  |  1 +
>  drivers/gpu/drm/i915/i915_gem_request.h  |  2 ++
>  drivers/gpu/drm/i915/i915_irq.c          | 38 ++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 10 +--------
>  drivers/gpu/drm/i915/intel_ringbuffer.h  | 30 +++++++++----------------
>  6 files changed, 62 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5005922f267b..2592a15d7727 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -4005,7 +4005,10 @@ static inline bool
>  __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  {
>  	struct intel_engine_cs *engine = req->engine;
> -	u32 seqno = i915_gem_request_global_seqno(req);
> +	u32 seqno;
> +
> +	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &req->fence.flags))
> +		return true;
>
>  	/* The request was dequeued before we were awoken. We check after
>  	 * inspecting the hw to confirm that this was the same request
> @@ -4013,6 +4016,7 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  	 * the request execution are sufficient to ensure that a check
>  	 * after reading the value from hw matches this request.
>  	 */
> +	seqno = i915_gem_request_global_seqno(req);
>  	if (!seqno)
>  		return false;
>
> @@ -4034,9 +4038,8 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  	 * is woken.
>  	 */
>  	if (engine->irq_seqno_barrier &&
> -	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
>  	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
> -		struct task_struct *tsk;
> +		unsigned long flags;
>
>  		/* The ordering of irq_posted versus applying the barrier
>  		 * is crucial. The clearing of the current irq_posted must
> @@ -4058,17 +4061,17 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
>  		 * the seqno before we believe it coherent since they see
>  		 * irq_posted == false but we are still running).
>  		 */
> -		rcu_read_lock();
> -		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> -		if (tsk && tsk != current)
> +		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> +		if (engine->breadcrumbs.first_wait &&
> +		    engine->breadcrumbs.first_wait->tsk != current)
>  			/* Note that if the bottom-half is changed as we
>  			 * are sending the wake-up, the new bottom-half will
>  			 * be woken by whomever made the change. We only have
>  			 * to worry about when we steal the irq-posted for
>  			 * ourself.
>  			 */
> -			wake_up_process(tsk);
> -		rcu_read_unlock();
> +			wake_up_process(engine->breadcrumbs.first_wait->tsk);
> +		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);

Worth caching &engine->breadcrumbs maybe?

>
>  		if (__i915_gem_request_completed(req, seqno))
>  			return true;
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index e22eacec022d..2e7bdb0cf069 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1083,6 +1083,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>  	}
>
>  	wait.tsk = current;
> +	wait.request = req;

I guess this was in the preemption prep tree already. If you decide to
keep the intel_wait_init helper, you could add req to it.

>
>  restart:
>  	do {
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 5f73d8c0a38a..0efee879df23 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -32,10 +32,12 @@
>
>  struct drm_file;
>  struct drm_i915_gem_object;
> +struct drm_i915_gem_request;
>
>  struct intel_wait {
>  	struct rb_node node;
>  	struct task_struct *tsk;
> +	struct drm_i915_gem_request *request;
>  	u32 seqno;
>  };
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 57fa1bf78a85..0c370c687c2a 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1033,10 +1033,44 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
>
>  static void notify_ring(struct intel_engine_cs *engine)
>  {
> +	struct drm_i915_gem_request *rq = NULL;
> +	struct intel_wait *wait;
> +
> +	if (!intel_engine_has_waiter(engine))
> +		return;
> +
> +	trace_i915_gem_request_notify(engine);
>  	atomic_inc(&engine->irq_count);
>  	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
> -	if (intel_engine_wakeup(engine))
> -		trace_i915_gem_request_notify(engine);
> +
> +	rcu_read_lock();

Not sure this is required from an irq handler. But I don't think it
harms either, so maybe it has some usefulness in documenting things.

> +
> +	spin_lock(&engine->breadcrumbs.lock);
> +	wait = engine->breadcrumbs.first_wait;
> +	if (wait) {
> +		/* We use a callback from the dma-fence to submit
> +		 * requests after waiting on our own requests. To
> +		 * ensure minimum delay in queuing the next request to
> +		 * hardware, signal the fence now rather than wait for
> +		 * the signaler to be woken up. We still wake up the
> +		 * waiter in order to handle the irq-seqno coherency
> +		 * issues (we may receive the interrupt before the
> +		 * seqno is written, see __i915_request_irq_complete())
> +		 * and to handle coalescing of multiple seqno updates
> +		 * and many waiters.
> +		 */
> +		if (i915_seqno_passed(intel_engine_get_seqno(engine),
> +				      wait->seqno))
> +			rq = wait->request;
> +
> +		wake_up_process(wait->tsk);
> +	}
> +	spin_unlock(&engine->breadcrumbs.lock);
> +
> +	if (rq)
> +		dma_fence_signal(&rq->fence);
> +
> +	rcu_read_unlock();
>  }
>
>  static void vlv_c0_read(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 3a080c7345d5..860372653a59 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -276,7 +276,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  	}
>  	rb_link_node(&wait->node, parent, p);
>  	rb_insert_color(&wait->node, &b->waiters);
> -	GEM_BUG_ON(!first && !rcu_access_pointer(b->irq_seqno_bh));
>
>  	if (completed) {
>  		struct rb_node *next = rb_next(completed);
> @@ -285,7 +284,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  		if (next && next != &wait->node) {
>  			GEM_BUG_ON(first);
>  			b->first_wait = to_wait(next);
> -			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
>  			/* As there is a delay between reading the current
>  			 * seqno, processing the completed tasks and selecting
>  			 * the next waiter, we may have missed the interrupt
> @@ -312,7 +310,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  	if (first) {
>  		GEM_BUG_ON(rb_first(&b->waiters) != &wait->node);
>  		b->first_wait = wait;
> -		rcu_assign_pointer(b->irq_seqno_bh, wait->tsk);
>  		/* After assigning ourselves as the new bottom-half, we must
>  		 * perform a cursory check to prevent a missed interrupt.
>  		 * Either we miss the interrupt whilst programming the hardware,
> @@ -323,7 +320,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
>  		 */
>  		__intel_breadcrumbs_enable_irq(b);
>  	}
> -	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh));
>  	GEM_BUG_ON(!b->first_wait);
>  	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
>
> @@ -371,8 +367,6 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  		const int priority = wakeup_priority(b, wait->tsk);
>  		struct rb_node *next;
>
> -		GEM_BUG_ON(rcu_access_pointer(b->irq_seqno_bh) != wait->tsk);
> -
>  		/* We are the current bottom-half. Find the next candidate,
>  		 * the first waiter in the queue on the remaining oldest
>  		 * request. As multiple seqnos may complete in the time it
> @@ -414,13 +408,11 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  			 * exception rather than a seqno completion.
>  			 */
>  			b->first_wait = to_wait(next);
> -			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
>  			if (b->first_wait->seqno != wait->seqno)
>  				__intel_breadcrumbs_enable_irq(b);
>  			wake_up_process(b->first_wait->tsk);
>  		} else {
>  			b->first_wait = NULL;
> -			rcu_assign_pointer(b->irq_seqno_bh, NULL);
>  			__intel_breadcrumbs_disable_irq(b);
>  		}
>  	} else {
> @@ -434,7 +426,6 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
>  	GEM_BUG_ON(b->first_wait == wait);
>  	GEM_BUG_ON(rb_first(&b->waiters) !=
>  		   (b->first_wait ? &b->first_wait->node : NULL));
> -	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh) ^ RB_EMPTY_ROOT(&b->waiters));
>  }
>
>  void intel_engine_remove_wait(struct intel_engine_cs *engine,
> @@ -598,6 +589,7 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request)
>  		return;
>
>  	request->signaling.wait.tsk = b->signaler;
> +	request->signaling.wait.request = request;
>  	request->signaling.wait.seqno = seqno;
>  	i915_gem_request_get(request);
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 45d2c2fa946e..1271b6ebdd4d 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -235,8 +235,6 @@ struct intel_engine_cs {
>  	 * the overhead of waking that client is much preferred.
>  	 */
>  	struct intel_breadcrumbs {
> -		struct task_struct __rcu *irq_seqno_bh; /* bh for interrupts */
> -
>  		spinlock_t lock; /* protects the lists of requests; irqsafe */
>  		struct rb_root waiters; /* sorted by retirement, priority */
>  		struct rb_root signals; /* sorted by retirement */
> @@ -602,31 +600,25 @@ void intel_engine_cancel_signaling(struct drm_i915_gem_request *request);
>
>  static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
>  {
> -	return rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh);
> +	return READ_ONCE(engine->breadcrumbs.first_wait);
>  }
>
> -static inline bool intel_engine_wakeup(const struct intel_engine_cs *engine)
> +static inline bool intel_engine_wakeup(struct intel_engine_cs *engine)
>  {
> -	bool wakeup = false;
> +	struct intel_wait *wait = NULL;
> +	unsigned long flags;
>
> -	/* Note that for this not to dangerously chase a dangling pointer,
> -	 * we must hold the rcu_read_lock here.
> -	 *
> -	 * Also note that tsk is likely to be in !TASK_RUNNING state so an
> -	 * early test for tsk->state != TASK_RUNNING before wake_up_process()
> -	 * is unlikely to be beneficial.
> -	 */
>  	if (intel_engine_has_waiter(engine)) {
> -		struct task_struct *tsk;
> +		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> +
> +		wait = engine->breadcrumbs.first_wait;
> +		if (wait)
> +			wake_up_process(wait->tsk);
>
> -		rcu_read_lock();
> -		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> -		if (tsk)
> -			wakeup = wake_up_process(tsk);
> -		rcu_read_unlock();
> +		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
>  	}
>
> -	return wakeup;
> +	return wait;
>  }
>
>  void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine);
>

Looks OK.

Last time I tested it, there was a definite latency improvement but also
a slight throughput regression on crazy microbenchmarks like gem_latency,
IIRC. But I think overall it is a good thing for more realistic workloads.

I leave the ordering w.r.t. the preemption prep series to you. I can
apply my r-b at that point; not sure if it makes sense now?

Regards,

Tvrtko
Chris Wilson Feb. 22, 2017, 5:10 p.m. UTC | #2
On Wed, Feb 22, 2017 at 04:53:35PM +0000, Tvrtko Ursulin wrote:
> 
> On 17/02/2017 15:51, Chris Wilson wrote:
> >@@ -4034,9 +4038,8 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
> > 	 * is woken.
> > 	 */
> > 	if (engine->irq_seqno_barrier &&
> >-	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
> > 	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
> >-		struct task_struct *tsk;
> >+		unsigned long flags;
> >
> > 		/* The ordering of irq_posted versus applying the barrier
> > 		 * is crucial. The clearing of the current irq_posted must
> >@@ -4058,17 +4061,17 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
> > 		 * the seqno before we believe it coherent since they see
> > 		 * irq_posted == false but we are still running).
> > 		 */
> >-		rcu_read_lock();
> >-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> >-		if (tsk && tsk != current)
> >+		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> >+		if (engine->breadcrumbs.first_wait &&
> >+		    engine->breadcrumbs.first_wait->tsk != current)
> > 			/* Note that if the bottom-half is changed as we
> > 			 * are sending the wake-up, the new bottom-half will
> > 			 * be woken by whomever made the change. We only have
> > 			 * to worry about when we steal the irq-posted for
> > 			 * ourself.
> > 			 */
> >-			wake_up_process(tsk);
> >-		rcu_read_unlock();
> >+			wake_up_process(engine->breadcrumbs.first_wait->tsk);
> >+		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
> 
> Worth caching &engine->breadcrumbs maybe?

I'll ask gcc.

> >
> > 		if (__i915_gem_request_completed(req, seqno))
> > 			return true;
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index e22eacec022d..2e7bdb0cf069 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -1083,6 +1083,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
> > 	}
> >
> > 	wait.tsk = current;
> >+	wait.request = req;
> 
> I guess this was in the preemption prep tree already. If you decide to
> keep the intel_wait_init helper, you could add req to it.

Challenge being not all users of intel_wait_init have a request :)
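
For illustration only, one hypothetical shape such a helper could take
(this is not the in-tree code; callers without a request would pass NULL
and fill in wait->seqno themselves):

	static inline void intel_wait_init(struct intel_wait *wait,
					   struct drm_i915_gem_request *rq)
	{
		wait->tsk = current;
		wait->request = rq; /* may be NULL for seqno-only waiters */
		wait->seqno = rq ? i915_gem_request_global_seqno(rq) : 0;
	}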

> > restart:
> > 	do {
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> >index 5f73d8c0a38a..0efee879df23 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.h
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.h
> >@@ -32,10 +32,12 @@
> >
> > struct drm_file;
> > struct drm_i915_gem_object;
> >+struct drm_i915_gem_request;
> >
> > struct intel_wait {
> > 	struct rb_node node;
> > 	struct task_struct *tsk;
> >+	struct drm_i915_gem_request *request;
> > 	u32 seqno;
> > };
> >
> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index 57fa1bf78a85..0c370c687c2a 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -1033,10 +1033,44 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
> >
> > static void notify_ring(struct intel_engine_cs *engine)
> > {
> >+	struct drm_i915_gem_request *rq = NULL;
> >+	struct intel_wait *wait;
> >+
> >+	if (!intel_engine_has_waiter(engine))
> >+		return;
> >+
> >+	trace_i915_gem_request_notify(engine);
> > 	atomic_inc(&engine->irq_count);
> > 	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
> >-	if (intel_engine_wakeup(engine))
> >-		trace_i915_gem_request_notify(engine);
> >+
> >+	rcu_read_lock();
> 
> Not sure this is required from an irq handler. But I don't think it
> harms either, so maybe it has some usefulness in documenting things.

That's what I said! It's not, but it's cheap enough that I think it is
worth keeping to document the game being played with rq.
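
The game in question, annotated (a sketch of the notify_ring() flow in
the patch, on the assumption that requests are freed via an RCU-deferred
mechanism, so the read-side section keeps rq's memory valid):

	rcu_read_lock();	/* rq's memory cannot be reclaimed inside */

	spin_lock(&engine->breadcrumbs.lock);
	wait = engine->breadcrumbs.first_wait;
	/* rq is borrowed from wait->request under the lock */
	spin_unlock(&engine->breadcrumbs.lock);

	if (rq)
		dma_fence_signal(&rq->fence);	/* safe: still in RCU section */

	rcu_read_unlock();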

> Looks OK.
> 
> Last time I tested it, there was a definite latency improvement but
> also a slight throughput regression on crazy microbenchmarks like
> gem_latency, IIRC. But I think overall it is a good thing for more
> realistic workloads.

Yup. It's a big one for gem_exec_whisper, which is latency-sensitive (and
gem_exec_latency). I didn't see gem_latency (using tests/gem_latency)
suffer too much. To be precise, which gem_latency are you thinking of?
:) I presume ezbench -b gem:latency ?

I did look at trying to reduce the cost of the spinlocks (by trying to
share them between requests, or the engine->timeline and breadcrumbs)
but came away bruised and battered.

> I leave the ordering w.r.t. the preemption prep series to you. I can
> apply my r-b at that point; not sure if it makes sense now?

It's in the queue, and I'll resend it once the earlier patches land so
that CI has a run over it, and to catch any more ideas.
-Chris
Chris Wilson Feb. 22, 2017, 6:09 p.m. UTC | #3
On Wed, Feb 22, 2017 at 04:53:35PM +0000, Tvrtko Ursulin wrote:
> 
> On 17/02/2017 15:51, Chris Wilson wrote:
> > 	if (engine->irq_seqno_barrier &&
> >-	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
> > 	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
> >-		struct task_struct *tsk;
> >+		unsigned long flags;
> >
> > 		/* The ordering of irq_posted versus applying the barrier
> > 		 * is crucial. The clearing of the current irq_posted must
> >@@ -4058,17 +4061,17 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
> > 		 * the seqno before we believe it coherent since they see
> > 		 * irq_posted == false but we are still running).
> > 		 */
> >-		rcu_read_lock();
> >-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
> >-		if (tsk && tsk != current)
> >+		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
> >+		if (engine->breadcrumbs.first_wait &&
> >+		    engine->breadcrumbs.first_wait->tsk != current)
> > 			/* Note that if the bottom-half is changed as we
> > 			 * are sending the wake-up, the new bottom-half will
> > 			 * be woken by whomever made the change. We only have
> > 			 * to worry about when we steal the irq-posted for
> > 			 * ourself.
> > 			 */
> >-			wake_up_process(tsk);
> >-		rcu_read_unlock();
> >+			wake_up_process(engine->breadcrumbs.first_wait->tsk);
> >+		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
> 
> Worth caching &engine->breadcrumbs maybe?

Makes no difference to object code, but makes it more pleasant to read,
so yes.
-Chris
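
For reference, the cached-pointer variant being agreed on would read
roughly as follows (a sketch of the suggestion, not the committed code):

	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	unsigned long flags;

	spin_lock_irqsave(&b->lock, flags);
	if (b->first_wait && b->first_wait->tsk != current)
		/* see the bottom-half comment in __i915_request_irq_complete() */
		wake_up_process(b->first_wait->tsk);
	spin_unlock_irqrestore(&b->lock, flags);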

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5005922f267b..2592a15d7727 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4005,7 +4005,10 @@  static inline bool
 __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 {
 	struct intel_engine_cs *engine = req->engine;
-	u32 seqno = i915_gem_request_global_seqno(req);
+	u32 seqno;
+
+	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &req->fence.flags))
+		return true;
 
 	/* The request was dequeued before we were awoken. We check after
 	 * inspecting the hw to confirm that this was the same request
@@ -4013,6 +4016,7 @@  __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 	 * the request execution are sufficient to ensure that a check
 	 * after reading the value from hw matches this request.
 	 */
+	seqno = i915_gem_request_global_seqno(req);
 	if (!seqno)
 		return false;
 
@@ -4034,9 +4038,8 @@  __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 	 * is woken.
 	 */
 	if (engine->irq_seqno_barrier &&
-	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
 	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
-		struct task_struct *tsk;
+		unsigned long flags;
 
 		/* The ordering of irq_posted versus applying the barrier
 		 * is crucial. The clearing of the current irq_posted must
@@ -4058,17 +4061,17 @@  __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 		 * the seqno before we believe it coherent since they see
 		 * irq_posted == false but we are still running).
 		 */
-		rcu_read_lock();
-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
-		if (tsk && tsk != current)
+		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
+		if (engine->breadcrumbs.first_wait &&
+		    engine->breadcrumbs.first_wait->tsk != current)
 			/* Note that if the bottom-half is changed as we
 			 * are sending the wake-up, the new bottom-half will
 			 * be woken by whomever made the change. We only have
 			 * to worry about when we steal the irq-posted for
 			 * ourself.
 			 */
-			wake_up_process(tsk);
-		rcu_read_unlock();
+			wake_up_process(engine->breadcrumbs.first_wait->tsk);
+		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
 
 		if (__i915_gem_request_completed(req, seqno))
 			return true;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index e22eacec022d..2e7bdb0cf069 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1083,6 +1083,7 @@  long i915_wait_request(struct drm_i915_gem_request *req,
 	}
 
 	wait.tsk = current;
+	wait.request = req;
 
 restart:
 	do {
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 5f73d8c0a38a..0efee879df23 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -32,10 +32,12 @@ 
 
 struct drm_file;
 struct drm_i915_gem_object;
+struct drm_i915_gem_request;
 
 struct intel_wait {
 	struct rb_node node;
 	struct task_struct *tsk;
+	struct drm_i915_gem_request *request;
 	u32 seqno;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 57fa1bf78a85..0c370c687c2a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1033,10 +1033,44 @@  static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 
 static void notify_ring(struct intel_engine_cs *engine)
 {
+	struct drm_i915_gem_request *rq = NULL;
+	struct intel_wait *wait;
+
+	if (!intel_engine_has_waiter(engine))
+		return;
+
+	trace_i915_gem_request_notify(engine);
 	atomic_inc(&engine->irq_count);
 	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
-	if (intel_engine_wakeup(engine))
-		trace_i915_gem_request_notify(engine);
+
+	rcu_read_lock();
+
+	spin_lock(&engine->breadcrumbs.lock);
+	wait = engine->breadcrumbs.first_wait;
+	if (wait) {
+		/* We use a callback from the dma-fence to submit
+		 * requests after waiting on our own requests. To
+		 * ensure minimum delay in queuing the next request to
+		 * hardware, signal the fence now rather than wait for
+		 * the signaler to be woken up. We still wake up the
+		 * waiter in order to handle the irq-seqno coherency
+		 * issues (we may receive the interrupt before the
+		 * seqno is written, see __i915_request_irq_complete())
+		 * and to handle coalescing of multiple seqno updates
+		 * and many waiters.
+		 */
+		if (i915_seqno_passed(intel_engine_get_seqno(engine),
+				      wait->seqno))
+			rq = wait->request;
+
+		wake_up_process(wait->tsk);
+	}
+	spin_unlock(&engine->breadcrumbs.lock);
+
+	if (rq)
+		dma_fence_signal(&rq->fence);
+
+	rcu_read_unlock();
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 3a080c7345d5..860372653a59 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -276,7 +276,6 @@  static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 	}
 	rb_link_node(&wait->node, parent, p);
 	rb_insert_color(&wait->node, &b->waiters);
-	GEM_BUG_ON(!first && !rcu_access_pointer(b->irq_seqno_bh));
 
 	if (completed) {
 		struct rb_node *next = rb_next(completed);
@@ -285,7 +284,6 @@  static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 		if (next && next != &wait->node) {
 			GEM_BUG_ON(first);
 			b->first_wait = to_wait(next);
-			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
 			/* As there is a delay between reading the current
 			 * seqno, processing the completed tasks and selecting
 			 * the next waiter, we may have missed the interrupt
@@ -312,7 +310,6 @@  static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 	if (first) {
 		GEM_BUG_ON(rb_first(&b->waiters) != &wait->node);
 		b->first_wait = wait;
-		rcu_assign_pointer(b->irq_seqno_bh, wait->tsk);
 		/* After assigning ourselves as the new bottom-half, we must
 		 * perform a cursory check to prevent a missed interrupt.
 		 * Either we miss the interrupt whilst programming the hardware,
@@ -323,7 +320,6 @@  static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 		 */
 		__intel_breadcrumbs_enable_irq(b);
 	}
-	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh));
 	GEM_BUG_ON(!b->first_wait);
 	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
 
@@ -371,8 +367,6 @@  static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
 		const int priority = wakeup_priority(b, wait->tsk);
 		struct rb_node *next;
 
-		GEM_BUG_ON(rcu_access_pointer(b->irq_seqno_bh) != wait->tsk);
-
 		/* We are the current bottom-half. Find the next candidate,
 		 * the first waiter in the queue on the remaining oldest
 		 * request. As multiple seqnos may complete in the time it
@@ -414,13 +408,11 @@  static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
 			 * exception rather than a seqno completion.
 			 */
 			b->first_wait = to_wait(next);
-			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
 			if (b->first_wait->seqno != wait->seqno)
 				__intel_breadcrumbs_enable_irq(b);
 			wake_up_process(b->first_wait->tsk);
 		} else {
 			b->first_wait = NULL;
-			rcu_assign_pointer(b->irq_seqno_bh, NULL);
 			__intel_breadcrumbs_disable_irq(b);
 		}
 	} else {
@@ -434,7 +426,6 @@  static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
 	GEM_BUG_ON(b->first_wait == wait);
 	GEM_BUG_ON(rb_first(&b->waiters) !=
 		   (b->first_wait ? &b->first_wait->node : NULL));
-	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh) ^ RB_EMPTY_ROOT(&b->waiters));
 }
 
 void intel_engine_remove_wait(struct intel_engine_cs *engine,
@@ -598,6 +589,7 @@  void intel_engine_enable_signaling(struct drm_i915_gem_request *request)
 		return;
 
 	request->signaling.wait.tsk = b->signaler;
+	request->signaling.wait.request = request;
 	request->signaling.wait.seqno = seqno;
 	i915_gem_request_get(request);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 45d2c2fa946e..1271b6ebdd4d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -235,8 +235,6 @@  struct intel_engine_cs {
 	 * the overhead of waking that client is much preferred.
 	 */
 	struct intel_breadcrumbs {
-		struct task_struct __rcu *irq_seqno_bh; /* bh for interrupts */
-
 		spinlock_t lock; /* protects the lists of requests; irqsafe */
 		struct rb_root waiters; /* sorted by retirement, priority */
 		struct rb_root signals; /* sorted by retirement */
@@ -602,31 +600,25 @@  void intel_engine_cancel_signaling(struct drm_i915_gem_request *request);
 
 static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
 {
-	return rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh);
+	return READ_ONCE(engine->breadcrumbs.first_wait);
 }
 
-static inline bool intel_engine_wakeup(const struct intel_engine_cs *engine)
+static inline bool intel_engine_wakeup(struct intel_engine_cs *engine)
 {
-	bool wakeup = false;
+	struct intel_wait *wait = NULL;
+	unsigned long flags;
 
-	/* Note that for this not to dangerously chase a dangling pointer,
-	 * we must hold the rcu_read_lock here.
-	 *
-	 * Also note that tsk is likely to be in !TASK_RUNNING state so an
-	 * early test for tsk->state != TASK_RUNNING before wake_up_process()
-	 * is unlikely to be beneficial.
-	 */
 	if (intel_engine_has_waiter(engine)) {
-		struct task_struct *tsk;
+		spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
+
+		wait = engine->breadcrumbs.first_wait;
+		if (wait)
+			wake_up_process(wait->tsk);
 
-		rcu_read_lock();
-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
-		if (tsk)
-			wakeup = wake_up_process(tsk);
-		rcu_read_unlock();
+		spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
 	}
 
-	return wakeup;
+	return wait;
 }
 
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine);