[08/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
diff mbox series

Message ID 20190226102404.29153-8-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/11] drm/i915: Skip scanning for signalers if we are already inflight
Related show

Commit Message

Chris Wilson Feb. 26, 2019, 10:24 a.m. UTC
Having introduced per-context seqno, we now have a means to identity
progress across the system without feel of rollback as befell the
global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
advance of submission safe in the knowledge that our target seqno and
address is stable.

However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that busy-spin will be completed quickly. To achieve this we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for default and
above priority requests (so that idle priority tasks never themselves
hog the GPU waiting for others).

As might be reasonably expected, HW semaphores excel in inter-engine
synchronisation microbenchmarks (where the reduced latency / increased
throughput more than offset the power cost of spinning on a second ring)
and have significant improvement for single clients that utilize multiple
engines (typically media players), without regressing multiple clients
that can saturate the system.

v3: Drop the older NEQ branch, now we pin the signaler's HWSP anyway.
v4: Tell the world and include it as part of scheduler caps.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c           |   2 +-
 drivers/gpu/drm/i915/i915_request.c       | 138 +++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_request.h       |   1 +
 drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
 drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
 drivers/gpu/drm/i915/intel_engine_cs.c    |   1 +
 drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
 drivers/gpu/drm/i915/intel_lrc.c          |   1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h   |   7 ++
 include/uapi/drm/i915_drm.h               |   1 +
 10 files changed, 158 insertions(+), 5 deletions(-)

Comments

Tvrtko Ursulin Feb. 28, 2019, 10:49 a.m. UTC | #1
On 26/02/2019 10:24, Chris Wilson wrote:
> Having introduced per-context seqno, we now have a means to identity
> progress across the system without feel of rollback as befell the
> global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> advance of submission safe in the knowledge that our target seqno and
> address is stable.
> 
> However, since we are telling the GPU to busy-spin on the target address
> until it matches the signaling seqno, we only want to do so when we are
> sure that busy-spin will be completed quickly. To achieve this we only
> submit the request to HW once the signaler is itself executing (modulo
> preemption causing us to wait longer), and we only do so for default and
> above priority requests (so that idle priority tasks never themselves
> hog the GPU waiting for others).
> 
> As might be reasonably expected, HW semaphores excel in inter-engine
> synchronisation microbenchmarks (where the reduced latency / increased
> throughput more than offset the power cost of spinning on a second ring)
> and have significant improvement for single clients that utilize multiple
> engines (typically media players), without regressing multiple clients
> that can saturate the system.
> 
> v3: Drop the older NEQ branch, now we pin the signaler's HWSP anyway.
> v4: Tell the world and include it as part of scheduler caps.

I think I asked for a table of results showing power, and power/perf to 
be put in the commit message.

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.c           |   2 +-
>   drivers/gpu/drm/i915/i915_request.c       | 138 +++++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_request.h       |   1 +
>   drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
>   drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
>   drivers/gpu/drm/i915/intel_engine_cs.c    |   1 +
>   drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
>   drivers/gpu/drm/i915/intel_lrc.c          |   1 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h   |   7 ++
>   include/uapi/drm/i915_drm.h               |   1 +
>   10 files changed, 158 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c6354f6cdbdb..c08abdef5eb6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -351,7 +351,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
>   		value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
>   		break;
>   	case I915_PARAM_HAS_SEMAPHORES:
> -		value = 0;
> +		value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
>   		break;
>   	case I915_PARAM_HAS_SECURE_BATCHES:
>   		value = capable(CAP_SYS_ADMIN);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index d354967d6ae8..59e30b8c4ee9 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -22,8 +22,9 @@
>    *
>    */
>   
> -#include <linux/prefetch.h>
>   #include <linux/dma-fence-array.h>
> +#include <linux/irq_work.h>
> +#include <linux/prefetch.h>
>   #include <linux/sched.h>
>   #include <linux/sched/clock.h>
>   #include <linux/sched/signal.h>
> @@ -32,9 +33,16 @@
>   #include "i915_active.h"
>   #include "i915_reset.h"
>   
> +struct execute_cb {
> +	struct list_head link;
> +	struct irq_work work;
> +	struct i915_sw_fence *fence;
> +};
> +
>   static struct i915_global_request {
>   	struct kmem_cache *slab_requests;
>   	struct kmem_cache *slab_dependencies;
> +	struct kmem_cache *slab_execute_cbs;
>   } global;
>   
>   static const char *i915_fence_get_driver_name(struct dma_fence *fence)
> @@ -325,6 +333,69 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> +static void irq_execute_cb(struct irq_work *wrk)
> +{
> +	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
> +
> +	i915_sw_fence_complete(cb->fence);
> +	kmem_cache_free(global.slab_execute_cbs, cb);
> +}
> +
> +static void __notify_execute_cb(struct i915_request *rq)
> +{
> +	struct execute_cb *cb;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (list_empty(&rq->execute_cb))
> +		return;
> +
> +	list_for_each_entry(cb, &rq->execute_cb, link)
> +		irq_work_queue(&cb->work);
> +
> +	/*
> +	 * XXX Rollback on __i915_request_unsubmit()
> +	 *
> +	 * In the future, perhaps when we have an active time-slicing scheduler,
> +	 * it will be interesting to unsubmit parallel execution and remove
> +	 * busywaits from the GPU until their master is restarted. This is
> +	 * quite hairy, we have to carefully rollback the fence and do a
> +	 * preempt-to-idle cycle on the target engine, all the while the
> +	 * master execute_cb may refire.
> +	 */
> +	INIT_LIST_HEAD(&rq->execute_cb);
> +}
> +
> +static int
> +i915_request_await_execution(struct i915_request *rq,
> +			     struct i915_request *signal,
> +			     gfp_t gfp)
> +{
> +	struct execute_cb *cb;
> +
> +	if (i915_request_is_active(signal))
> +		return 0;
> +
> +	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
> +	if (!cb)
> +		return -ENOMEM;
> +
> +	cb->fence = &rq->submit;
> +	i915_sw_fence_await(cb->fence);
> +	init_irq_work(&cb->work, irq_execute_cb);
> +
> +	spin_lock_irq(&signal->lock);
> +	if (i915_request_is_active(signal)) {
> +		i915_sw_fence_complete(cb->fence);
> +		kmem_cache_free(global.slab_execute_cbs, cb);
> +	} else {
> +		list_add_tail(&cb->link, &signal->execute_cb);
> +	}
> +	spin_unlock_irq(&signal->lock);
> +
> +	return 0;
> +}
> +
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -361,6 +432,8 @@ void __i915_request_submit(struct i915_request *request)
>   	    !i915_request_enable_breadcrumb(request))
>   		intel_engine_queue_breadcrumbs(engine);
>   
> +	__notify_execute_cb(request);
> +
>   	spin_unlock(&request->lock);
>   
>   	engine->emit_fini_breadcrumb(request,
> @@ -608,6 +681,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	}
>   
>   	INIT_LIST_HEAD(&rq->active_list);
> +	INIT_LIST_HEAD(&rq->execute_cb);
>   
>   	tl = ce->ring->timeline;
>   	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> @@ -696,6 +770,52 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	return ERR_PTR(ret);
>   }
>   
> +static int
> +emit_semaphore_wait(struct i915_request *to,
> +		    struct i915_request *from,
> +		    gfp_t gfp)
> +{
> +	u32 hwsp_offset;
> +	u32 *cs;
> +	int err;
> +
> +	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> +	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
> +
> +	/* We need to pin the signaler's HWSP until we are finished reading. */
> +	err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
> +	if (err)
> +		return err;
> +
> +	/* Only submit our spinner after the signaler is running! */
> +	err = i915_request_await_execution(to, from, gfp);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(to, 4);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/*
> +	 * Using greater-than-or-equal here means we have to worry
> +	 * about seqno wraparound. To side step that issue, we swap
> +	 * the timeline HWSP upon wrapping, so that everyone listening
> +	 * for the old (pre-wrap) values do not see the much smaller
> +	 * (post-wrap) values than they were expecting (and so wait
> +	 * forever).
> +	 */
> +	*cs++ = MI_SEMAPHORE_WAIT |
> +		MI_SEMAPHORE_GLOBAL_GTT |
> +		MI_SEMAPHORE_POLL |
> +		MI_SEMAPHORE_SAD_GTE_SDD;
> +	*cs++ = from->fence.seqno;
> +	*cs++ = hwsp_offset;
> +	*cs++ = 0;
> +
> +	intel_ring_advance(to, cs);
> +	return 0;
> +}
> +
>   static int
>   i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   {
> @@ -717,6 +837,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
>   						       &from->submit,
>   						       I915_FENCE_GFP);
> +	} else if (intel_engine_has_semaphores(to->engine) &&
> +		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
> +		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>   	} else {
>   		ret = i915_sw_fence_await_dma_fence(&to->submit,
>   						    &from->fence, 0,
> @@ -1208,14 +1331,23 @@ int __init i915_global_request_init(void)
>   	if (!global.slab_requests)
>   		return -ENOMEM;
>   
> +	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
> +					     SLAB_HWCACHE_ALIGN |
> +					     SLAB_RECLAIM_ACCOUNT |
> +					     SLAB_TYPESAFE_BY_RCU);
> +	if (!global.slab_execute_cbs)
> +		goto err_requests;
> +
>   	global.slab_dependencies = KMEM_CACHE(i915_dependency,
>   					      SLAB_HWCACHE_ALIGN |
>   					      SLAB_RECLAIM_ACCOUNT);
>   	if (!global.slab_dependencies)
> -		goto err_requests;
> +		goto err_execute_cbs;
>   
>   	return 0;
>   
> +err_execute_cbs:
> +	kmem_cache_destroy(global.slab_execute_cbs);
>   err_requests:
>   	kmem_cache_destroy(global.slab_requests);
>   	return -ENOMEM;
> @@ -1224,11 +1356,13 @@ int __init i915_global_request_init(void)
>   void i915_global_request_shrink(void)
>   {
>   	kmem_cache_shrink(global.slab_dependencies);
> +	kmem_cache_shrink(global.slab_execute_cbs);
>   	kmem_cache_shrink(global.slab_requests);
>   }
>   
>   void i915_global_request_exit(void)
>   {
>   	kmem_cache_destroy(global.slab_dependencies);
> +	kmem_cache_destroy(global.slab_execute_cbs);
>   	kmem_cache_destroy(global.slab_requests);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index ea1e6f0ade53..3e0d62d35226 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -129,6 +129,7 @@ struct i915_request {
>   	 */
>   	struct i915_sw_fence submit;
>   	wait_queue_entry_t submitq;
> +	struct list_head execute_cb;
>   
>   	/*
>   	 * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 7c58b049ecb5..8d1400d378d7 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
>   	__i915_sw_fence_notify(fence, FENCE_FREE);
>   }
>   
> -static void i915_sw_fence_complete(struct i915_sw_fence *fence)
> +void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   
> @@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -static void i915_sw_fence_await(struct i915_sw_fence *fence)
> +void i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 0e055ea0179f..6dec9e1d1102 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> +void i915_sw_fence_await(struct i915_sw_fence *fence);
> +void i915_sw_fence_complete(struct i915_sw_fence *fence);
> +
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
>   {
>   	return atomic_read(&fence->pending) <= 0;
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index ef49b1b0537b..7e0fe433ab17 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -616,6 +616,7 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
>   	} map[] = {
>   #define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) }
>   		MAP(PREEMPTION, PREEMPTION),
> +		MAP(SEMAPHORES, SEMAPHORES),
>   #undef MAP
>   	};
>   	struct intel_engine_cs *engine;
> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> index b96a31bc1080..0efaadd3bc32 100644
> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> @@ -106,7 +106,12 @@
>   #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
>   #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
>   #define   MI_SEMAPHORE_POLL		(1<<15)
> +#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
>   #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
> +#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
> +#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
> +#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
> +#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
>   #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>   #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
>   #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index f57cfe2fc078..53d6f7fdb50e 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2324,6 +2324,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>   	engine->park = NULL;
>   	engine->unpark = NULL;
>   
> +	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
>   	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>   	if (engine->i915->preempt_context)
>   		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b8ec7e40a59b..2e7264119ec4 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -499,6 +499,7 @@ struct intel_engine_cs {
>   #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>   #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>   #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> +#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
>   	unsigned int flags;
>   
>   	/*
> @@ -576,6 +577,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
>   	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
>   }
>   
> +static inline bool
> +intel_engine_has_semaphores(const struct intel_engine_cs *engine)
> +{
> +	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
> +}
> +
>   void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
>   
>   static inline bool __execlists_need_preempt(int prio, int last)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 8304a7f1ec3f..b10eea3f6d24 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -479,6 +479,7 @@ typedef struct drm_i915_irq_wait {
>   #define   I915_SCHEDULER_CAP_ENABLED	(1ul << 0)
>   #define   I915_SCHEDULER_CAP_PRIORITY	(1ul << 1)
>   #define   I915_SCHEDULER_CAP_PREEMPTION	(1ul << 2)
> +#define   I915_SCHEDULER_CAP_SEMAPHORES	(1ul << 3)
>   
>   #define I915_PARAM_HUC_STATUS		 42
>   
>

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c6354f6cdbdb..c08abdef5eb6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -351,7 +351,7 @@  static int i915_getparam_ioctl(struct drm_device *dev, void *data,
 		value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
 		break;
 	case I915_PARAM_HAS_SEMAPHORES:
-		value = 0;
+		value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
 		break;
 	case I915_PARAM_HAS_SECURE_BATCHES:
 		value = capable(CAP_SYS_ADMIN);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index d354967d6ae8..59e30b8c4ee9 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@ 
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
@@ -32,9 +33,16 @@ 
 #include "i915_active.h"
 #include "i915_reset.h"
 
+struct execute_cb {
+	struct list_head link;
+	struct irq_work work;
+	struct i915_sw_fence *fence;
+};
+
 static struct i915_global_request {
 	struct kmem_cache *slab_requests;
 	struct kmem_cache *slab_dependencies;
+	struct kmem_cache *slab_execute_cbs;
 } global;
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
@@ -325,6 +333,69 @@  void i915_request_retire_upto(struct i915_request *rq)
 	} while (tmp != rq);
 }
 
+static void irq_execute_cb(struct irq_work *wrk)
+{
+	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+	i915_sw_fence_complete(cb->fence);
+	kmem_cache_free(global.slab_execute_cbs, cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+	struct execute_cb *cb;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (list_empty(&rq->execute_cb))
+		return;
+
+	list_for_each_entry(cb, &rq->execute_cb, link)
+		irq_work_queue(&cb->work);
+
+	/*
+	 * XXX Rollback on __i915_request_unsubmit()
+	 *
+	 * In the future, perhaps when we have an active time-slicing scheduler,
+	 * it will be interesting to unsubmit parallel execution and remove
+	 * busywaits from the GPU until their master is restarted. This is
+	 * quite hairy, we have to carefully rollback the fence and do a
+	 * preempt-to-idle cycle on the target engine, all the while the
+	 * master execute_cb may refire.
+	 */
+	INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+			     struct i915_request *signal,
+			     gfp_t gfp)
+{
+	struct execute_cb *cb;
+
+	if (i915_request_is_active(signal))
+		return 0;
+
+	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = &rq->submit;
+	i915_sw_fence_await(cb->fence);
+	init_irq_work(&cb->work, irq_execute_cb);
+
+	spin_lock_irq(&signal->lock);
+	if (i915_request_is_active(signal)) {
+		i915_sw_fence_complete(cb->fence);
+		kmem_cache_free(global.slab_execute_cbs, cb);
+	} else {
+		list_add_tail(&cb->link, &signal->execute_cb);
+	}
+	spin_unlock_irq(&signal->lock);
+
+	return 0;
+}
+
 static void move_to_timeline(struct i915_request *request,
 			     struct i915_timeline *timeline)
 {
@@ -361,6 +432,8 @@  void __i915_request_submit(struct i915_request *request)
 	    !i915_request_enable_breadcrumb(request))
 		intel_engine_queue_breadcrumbs(engine);
 
+	__notify_execute_cb(request);
+
 	spin_unlock(&request->lock);
 
 	engine->emit_fini_breadcrumb(request,
@@ -608,6 +681,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	}
 
 	INIT_LIST_HEAD(&rq->active_list);
+	INIT_LIST_HEAD(&rq->execute_cb);
 
 	tl = ce->ring->timeline;
 	ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -696,6 +770,52 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	return ERR_PTR(ret);
 }
 
+static int
+emit_semaphore_wait(struct i915_request *to,
+		    struct i915_request *from,
+		    gfp_t gfp)
+{
+	u32 hwsp_offset;
+	u32 *cs;
+	int err;
+
+	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
+
+	/* We need to pin the signaler's HWSP until we are finished reading. */
+	err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
+	if (err)
+		return err;
+
+	/* Only submit our spinner after the signaler is running! */
+	err = i915_request_await_execution(to, from, gfp);
+	if (err)
+		return err;
+
+	cs = intel_ring_begin(to, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/*
+	 * Using greater-than-or-equal here means we have to worry
+	 * about seqno wraparound. To side step that issue, we swap
+	 * the timeline HWSP upon wrapping, so that everyone listening
+	 * for the old (pre-wrap) values do not see the much smaller
+	 * (post-wrap) values than they were expecting (and so wait
+	 * forever).
+	 */
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_SAD_GTE_SDD;
+	*cs++ = from->fence.seqno;
+	*cs++ = hwsp_offset;
+	*cs++ = 0;
+
+	intel_ring_advance(to, cs);
+	return 0;
+}
+
 static int
 i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
@@ -717,6 +837,9 @@  i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
 						       I915_FENCE_GFP);
+	} else if (intel_engine_has_semaphores(to->engine) &&
+		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	} else {
 		ret = i915_sw_fence_await_dma_fence(&to->submit,
 						    &from->fence, 0,
@@ -1208,14 +1331,23 @@  int __init i915_global_request_init(void)
 	if (!global.slab_requests)
 		return -ENOMEM;
 
+	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
+					     SLAB_HWCACHE_ALIGN |
+					     SLAB_RECLAIM_ACCOUNT |
+					     SLAB_TYPESAFE_BY_RCU);
+	if (!global.slab_execute_cbs)
+		goto err_requests;
+
 	global.slab_dependencies = KMEM_CACHE(i915_dependency,
 					      SLAB_HWCACHE_ALIGN |
 					      SLAB_RECLAIM_ACCOUNT);
 	if (!global.slab_dependencies)
-		goto err_requests;
+		goto err_execute_cbs;
 
 	return 0;
 
+err_execute_cbs:
+	kmem_cache_destroy(global.slab_execute_cbs);
 err_requests:
 	kmem_cache_destroy(global.slab_requests);
 	return -ENOMEM;
@@ -1224,11 +1356,13 @@  int __init i915_global_request_init(void)
 void i915_global_request_shrink(void)
 {
 	kmem_cache_shrink(global.slab_dependencies);
+	kmem_cache_shrink(global.slab_execute_cbs);
 	kmem_cache_shrink(global.slab_requests);
 }
 
 void i915_global_request_exit(void)
 {
 	kmem_cache_destroy(global.slab_dependencies);
+	kmem_cache_destroy(global.slab_execute_cbs);
 	kmem_cache_destroy(global.slab_requests);
 }
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index ea1e6f0ade53..3e0d62d35226 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -129,6 +129,7 @@  struct i915_request {
 	 */
 	struct i915_sw_fence submit;
 	wait_queue_entry_t submitq;
+	struct list_head execute_cb;
 
 	/*
 	 * A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..8d1400d378d7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@  static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
 	__i915_sw_fence_notify(fence, FENCE_FREE);
 }
 
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 
@@ -202,7 +202,7 @@  static void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..6dec9e1d1102 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@  int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
 {
 	return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index ef49b1b0537b..7e0fe433ab17 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -616,6 +616,7 @@  void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
 	} map[] = {
 #define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) }
 		MAP(PREEMPTION, PREEMPTION),
+		MAP(SEMAPHORES, SEMAPHORES),
 #undef MAP
 	};
 	struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index b96a31bc1080..0efaadd3bc32 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -106,7 +106,12 @@ 
 #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
 #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
 #define   MI_SEMAPHORE_POLL		(1<<15)
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
 #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
 #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
 #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
 #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f57cfe2fc078..53d6f7fdb50e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2324,6 +2324,7 @@  void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->park = NULL;
 	engine->unpark = NULL;
 
+	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
 	if (engine->i915->preempt_context)
 		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b8ec7e40a59b..2e7264119ec4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -499,6 +499,7 @@  struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
 	unsigned int flags;
 
 	/*
@@ -576,6 +577,12 @@  intel_engine_has_preemption(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
 }
 
+static inline bool
+intel_engine_has_semaphores(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
+}
+
 void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
 
 static inline bool __execlists_need_preempt(int prio, int last)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 8304a7f1ec3f..b10eea3f6d24 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -479,6 +479,7 @@  typedef struct drm_i915_irq_wait {
 #define   I915_SCHEDULER_CAP_ENABLED	(1ul << 0)
 #define   I915_SCHEDULER_CAP_PRIORITY	(1ul << 1)
 #define   I915_SCHEDULER_CAP_PREEMPTION	(1ul << 2)
+#define   I915_SCHEDULER_CAP_SEMAPHORES	(1ul << 3)
 
 #define I915_PARAM_HUC_STATUS		 42