diff mbox series

[19/38] drm/i915: Allow contexts to share a single timeline across all engines

Message ID 20190301140404.26690-19-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/38] drm/i915/execlists: Suppress redundant preemption | expand

Commit Message

Chris Wilson March 1, 2019, 2:03 p.m. UTC
Previously, our view has always been to run the engines independently
within a context. (Multiple engines happened before we had contexts and
timelines, so they always operated independently and that behaviour
persisted into contexts.) However, at the user level the context often
represents a single timeline (e.g. GL contexts) and userspace must
ensure that the individual engines are serialised to present that
ordering to the client (or forget about this detail entirely and hope no
one notices - a fair ploy if the client can only directly control one
engine themselves ;)

In the next patch, we will want to construct a set of engines that
operate as one, that have a single timeline interwoven between them, to
present a single virtual engine to the user. (They submit to the virtual
engine, then we decide which engine to execute on.)

To that end, we want to be able to create contexts which have a single
timeline (fence context) shared between all engines, rather than multiple
timelines.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_context.c       | 32 ++++++++++++---
 drivers/gpu/drm/i915/i915_gem_context.h       |  3 ++
 drivers/gpu/drm/i915/i915_request.c           | 10 ++++-
 drivers/gpu/drm/i915/i915_request.h           |  5 ++-
 drivers/gpu/drm/i915/i915_sw_fence.c          | 39 ++++++++++++++++---
 drivers/gpu/drm/i915/i915_sw_fence.h          | 13 ++++++-
 drivers/gpu/drm/i915/intel_lrc.c              |  5 ++-
 .../gpu/drm/i915/selftests/i915_gem_context.c | 19 +++++----
 drivers/gpu/drm/i915/selftests/mock_context.c |  2 +-
 include/uapi/drm/i915_drm.h                   |  3 +-
 10 files changed, 104 insertions(+), 27 deletions(-)

Comments

Tvrtko Ursulin March 5, 2019, 3:54 p.m. UTC | #1
On 01/03/2019 14:03, Chris Wilson wrote:
> Previously, our view has been always to run the engines independently
> within a context. (Multiple engines happened before we had contexts and
> timelines, so they always operated independently and that behaviour
> persisted into contexts.) However, at the user level the context often
> represents a single timeline (e.g. GL contexts) and userspace must
> ensure that the individual engines are serialised to present that
> ordering to the client (or forgot about this detail entirely and hope no
> one notices - a fair ploy if the client can only directly control one
> engine themselves ;)
> 
> In the next patch, we will want to construct a set of engines that
> operate as one, that have a single timeline interwoven between them, to
> present a single virtual engine to the user. (They submit to the virtual
> engine, then we decide which engine to execute on based.)
> 
> To that end, we want to be able to create contexts which have a single
> timeline (fence context) shared between all engines, rather than multiple
> timelines.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c       | 32 ++++++++++++---
>   drivers/gpu/drm/i915/i915_gem_context.h       |  3 ++
>   drivers/gpu/drm/i915/i915_request.c           | 10 ++++-
>   drivers/gpu/drm/i915/i915_request.h           |  5 ++-
>   drivers/gpu/drm/i915/i915_sw_fence.c          | 39 ++++++++++++++++---
>   drivers/gpu/drm/i915/i915_sw_fence.h          | 13 ++++++-
>   drivers/gpu/drm/i915/intel_lrc.c              |  5 ++-
>   .../gpu/drm/i915/selftests/i915_gem_context.c | 19 +++++----
>   drivers/gpu/drm/i915/selftests/mock_context.c |  2 +-
>   include/uapi/drm/i915_drm.h                   |  3 +-
>   10 files changed, 104 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index f883d99653a3..d8e2228636ba 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -239,6 +239,9 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
>   			ce->ops->destroy(ce);
>   	}
>   
> +	if (ctx->timeline)
> +		i915_timeline_put(ctx->timeline);
> +
>   	kfree(ctx->name);
>   	put_pid(ctx->pid);
>   
> @@ -478,12 +481,17 @@ static void __assign_ppgtt(struct i915_gem_context *ctx,
>   
>   static struct i915_gem_context *
>   i915_gem_create_context(struct drm_i915_private *dev_priv,
> -			struct drm_i915_file_private *file_priv)
> +			struct drm_i915_file_private *file_priv,
> +			unsigned int flags)
>   {
>   	struct i915_gem_context *ctx;
>   
>   	lockdep_assert_held(&dev_priv->drm.struct_mutex);
>   
> +	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
> +	    !HAS_EXECLISTS(dev_priv))
> +		return ERR_PTR(-EINVAL);
> +
>   	/* Reap the most stale context */
>   	contexts_free_first(dev_priv);
>   
> @@ -506,6 +514,18 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
>   		i915_ppgtt_put(ppgtt);
>   	}
>   
> +	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
> +		struct i915_timeline *timeline;
> +
> +		timeline = i915_timeline_create(dev_priv, ctx->name, NULL);
> +		if (IS_ERR(timeline)) {
> +			__destroy_hw_context(ctx, file_priv);
> +			return ERR_CAST(timeline);
> +		}
> +
> +		ctx->timeline = timeline;
> +	}
> +
>   	trace_i915_context_create(ctx);
>   
>   	return ctx;
> @@ -534,7 +554,7 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>   	if (ret)
>   		return ERR_PTR(ret);
>   
> -	ctx = i915_gem_create_context(to_i915(dev), NULL);
> +	ctx = i915_gem_create_context(to_i915(dev), NULL, 0);
>   	if (IS_ERR(ctx))
>   		goto out;
>   
> @@ -570,7 +590,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>   	struct i915_gem_context *ctx;
>   	int err;
>   
> -	ctx = i915_gem_create_context(i915, NULL);
> +	ctx = i915_gem_create_context(i915, NULL, 0);
>   	if (IS_ERR(ctx))
>   		return ctx;
>   
> @@ -702,7 +722,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
>   	idr_init_base(&file_priv->vm_idr, 1);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
> -	ctx = i915_gem_create_context(i915, file_priv);
> +	ctx = i915_gem_create_context(i915, file_priv, 0);
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	if (IS_ERR(ctx)) {
>   		idr_destroy(&file_priv->context_idr);
> @@ -818,7 +838,7 @@ last_request_on_engine(struct i915_timeline *timeline,
>   
>   	rq = i915_active_request_raw(&timeline->last_request,
>   				     &engine->i915->drm.struct_mutex);
> -	if (rq && rq->engine == engine) {
> +	if (rq && rq->engine->mask & engine->mask) {
>   		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
>   			  timeline->name, engine->name,
>   			  rq->fence.context, rq->fence.seqno);
> @@ -1476,7 +1496,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>   	if (ret)
>   		return ret;
>   
> -	ctx = i915_gem_create_context(i915, file_priv);
> +	ctx = i915_gem_create_context(i915, file_priv, args->flags);
>   	mutex_unlock(&dev->struct_mutex);
>   	if (IS_ERR(ctx))
>   		return PTR_ERR(ctx);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 97cf9d3d07ae..e1f270c098f0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -43,6 +43,7 @@ struct drm_i915_private;
>   struct drm_i915_file_private;
>   struct i915_hw_ppgtt;
>   struct i915_request;
> +struct i915_timeline;
>   struct i915_vma;
>   struct intel_ring;
>   
> @@ -78,6 +79,8 @@ struct i915_gem_context {
>   	/** file_priv: owning file descriptor */
>   	struct drm_i915_file_private *file_priv;
>   
> +	struct i915_timeline *timeline;
> +
>   	/**
>   	 * @ppgtt: unique address space (GTT)
>   	 *
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 9d111eedad5a..e0807a61dcf4 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1045,8 +1045,14 @@ void i915_request_add(struct i915_request *request)
>   	prev = i915_active_request_raw(&timeline->last_request,
>   				       &request->i915->drm.struct_mutex);
>   	if (prev && !i915_request_completed(prev)) {
> -		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
> -					     &request->submitq);
> +		if (is_power_of_2(prev->engine->mask | engine->mask))
> +			i915_sw_fence_await_sw_fence(&request->submit,
> +						     &prev->submit,
> +						     &request->submitq);
> +		else
> +			__i915_sw_fence_await_dma_fence(&request->submit,
> +							&prev->fence,
> +							&request->dmaq);

Drop a comment here explaining what's happening in this if block.

The subtlety of why we need special flavours of await helper - new ones
which use the built-in callback storage, vs the existing ones which
allocate it - totally escapes me at the moment.

It's probably a good idea to put a paragraph in the commit message 
explaining what new sw fence facility needs to be added to implement 
this and why.

>   		if (engine->schedule)
>   			__i915_sched_node_add_dependency(&request->sched,
>   							 &prev->sched,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 3e0d62d35226..d4113074a8f6 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -128,7 +128,10 @@ struct i915_request {
>   	 * It is used by the driver to then queue the request for execution.
>   	 */
>   	struct i915_sw_fence submit;
> -	wait_queue_entry_t submitq;
> +	union {
> +		wait_queue_entry_t submitq;
> +		struct i915_sw_dma_fence_cb dmaq;
> +	};

Union deserves a comment as well I think, like this is for that and that 
is for this, and only one can be in use at a time because of the third 
thing.

>   	struct list_head execute_cb;
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 8d1400d378d7..5387aafd3424 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -359,11 +359,6 @@ int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
>   }
>   
> -struct i915_sw_dma_fence_cb {
> -	struct dma_fence_cb base;
> -	struct i915_sw_fence *fence;
> -};
> -
>   struct i915_sw_dma_fence_cb_timer {
>   	struct i915_sw_dma_fence_cb base;
>   	struct dma_fence *dma;
> @@ -480,6 +475,40 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   	return ret;
>   }
>   
> +static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
> +				     struct dma_fence_cb *data)
> +{
> +	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
> +
> +	i915_sw_fence_complete(cb->fence);
> +}
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb)
> +{
> +	int ret;
> +
> +	debug_fence_assert(fence);
> +
> +	if (dma_fence_is_signaled(dma))
> +		return 0;
> +
> +	cb->fence = fence;
> +	i915_sw_fence_await(fence);
> +
> +	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
> +	if (ret == 0) {
> +		ret = 1;
> +	} else {
> +		i915_sw_fence_complete(fence);
> +		if (ret == -ENOENT) /* fence already signaled */
> +			ret = 0;
> +	}
> +
> +	return ret;
> +}
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 6dec9e1d1102..9cb5c3b307a6 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -9,14 +9,13 @@
>   #ifndef _I915_SW_FENCE_H_
>   #define _I915_SW_FENCE_H_
>   
> +#include <linux/dma-fence.h>
>   #include <linux/gfp.h>
>   #include <linux/kref.h>
>   #include <linux/notifier.h> /* for NOTIFY_DONE */
>   #include <linux/wait.h>
>   
>   struct completion;
> -struct dma_fence;
> -struct dma_fence_ops;
>   struct reservation_object;
>   
>   struct i915_sw_fence {
> @@ -68,10 +67,20 @@ int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
>   int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   				     struct i915_sw_fence *after,
>   				     gfp_t gfp);
> +
> +struct i915_sw_dma_fence_cb {
> +	struct dma_fence_cb base;
> +	struct i915_sw_fence *fence;
> +};
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb);
>   int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   				  struct dma_fence *dma,
>   				  unsigned long timeout,
>   				  gfp_t gfp);
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 50a7c87705c8..d50a33c578c5 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2853,7 +2853,10 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>   		goto error_deref_obj;
>   	}
>   
> -	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
> +	if (ctx->timeline)
> +		timeline = i915_timeline_get(ctx->timeline);
> +	else
> +		timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
>   	if (IS_ERR(timeline)) {
>   		ret = PTR_ERR(timeline);
>   		goto error_deref_obj;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 9133afc03135..15f016ca8e0d 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -76,7 +76,7 @@ static int live_nop_switch(void *arg)
>   	}
>   
>   	for (n = 0; n < nctx; n++) {
> -		ctx[n] = i915_gem_create_context(i915, file->driver_priv);
> +		ctx[n] = i915_gem_create_context(i915, file->driver_priv, 0);
>   		if (IS_ERR(ctx[n])) {
>   			err = PTR_ERR(ctx[n]);
>   			goto out_unlock;
> @@ -526,7 +526,8 @@ static int igt_ctx_exec(void *arg)
>   			struct i915_gem_context *ctx;
>   			intel_wakeref_t wakeref;
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_unlock;
> @@ -623,7 +624,8 @@ static int igt_shared_ctx_exec(void *arg)
>   		if (err)
>   			goto out_unlock;
>   
> -		parent = i915_gem_create_context(i915, file->driver_priv);
> +		parent = i915_gem_create_context(i915,
> +						 file->driver_priv, 0);
>   		if (IS_ERR(parent)) {
>   			err = PTR_ERR(parent);
>   			if (err == -ENODEV) /* no logical ctx support */
> @@ -645,7 +647,8 @@ static int igt_shared_ctx_exec(void *arg)
>   			if (ctx)
>   				__destroy_hw_context(ctx, file->driver_priv);
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_unlock;
> @@ -1091,7 +1094,7 @@ __igt_ctx_sseu(struct drm_i915_private *i915,
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   
> -	ctx = i915_gem_create_context(i915, file->driver_priv);
> +	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx)) {
>   		ret = PTR_ERR(ctx);
>   		goto out_unlock;
> @@ -1201,7 +1204,7 @@ static int igt_ctx_readonly(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx = i915_gem_create_context(i915, file->driver_priv);
> +	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx)) {
>   		err = PTR_ERR(ctx);
>   		goto out_unlock;
> @@ -1527,13 +1530,13 @@ static int igt_vm_isolation(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx_a = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_a = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_a)) {
>   		err = PTR_ERR(ctx_a);
>   		goto out_unlock;
>   	}
>   
> -	ctx_b = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_b = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_b)) {
>   		err = PTR_ERR(ctx_b);
>   		goto out_unlock;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index 5eddf9fcfe8a..5d0ff2293abc 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -95,7 +95,7 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
>   {
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   
> -	return i915_gem_create_context(i915, file->driver_priv);
> +	return i915_gem_create_context(i915, file->driver_priv, 0);
>   }
>   
>   struct i915_gem_context *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index eec635fb2e1c..451d2f36830b 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1452,8 +1452,9 @@ struct drm_i915_gem_context_create_ext {
>   	__u32 ctx_id; /* output: id of new context*/
>   	__u32 flags;
>   #define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS	(1u << 0)
> +#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE	(1u << 1)

And some kernel doc please.

>   #define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
> -	(-(I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS << 1))
> +	(-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
>   	__u64 extensions;
>   };
>   
> 

Looks fine, no complaints, apart from needing help to remind me how some 
things work.

Regards,

Tvrtko
Chris Wilson March 5, 2019, 4:26 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-03-05 15:54:25)
> 
> On 01/03/2019 14:03, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index 9d111eedad5a..e0807a61dcf4 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -1045,8 +1045,14 @@ void i915_request_add(struct i915_request *request)
> >       prev = i915_active_request_raw(&timeline->last_request,
> >                                      &request->i915->drm.struct_mutex);
> >       if (prev && !i915_request_completed(prev)) {
> > -             i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
> > -                                          &request->submitq);
> > +             if (is_power_of_2(prev->engine->mask | engine->mask))
> > +                     i915_sw_fence_await_sw_fence(&request->submit,
> > +                                                  &prev->submit,
> > +                                                  &request->submitq);
> > +             else
> > +                     __i915_sw_fence_await_dma_fence(&request->submit,
> > +                                                     &prev->fence,
> > +                                                     &request->dmaq);
> 
> Drop a comment here explaining what's happening in this if block.
> 
> The subtlety of why we need a special flavours of await helper, new 
> which use the builtin call back storage, vs using the existing ones 
> which allocate that, totally escapes me at the moment.
> 
> It's probably a good idea to put a paragraph in the commit message 
> explaining what new sw fence facility needs to be added to implement 
> this and why.

Alternatively, we could add this fence during request_alloc and use
an actual malloc rather than rely on the embedded struct. We still have
to bypass the usual await call as that filters out awaits on the same
timeline (because we know we have this dependency). But since we always
have to do this allocation, why not keep on embedding it in the request
itself?

I'm leaning towards keeping the embedded fence for tracking the
dependency along the timeline (rather than kmalloc) as surely there will
be others later. Surely.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index f883d99653a3..d8e2228636ba 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -239,6 +239,9 @@  static void i915_gem_context_free(struct i915_gem_context *ctx)
 			ce->ops->destroy(ce);
 	}
 
+	if (ctx->timeline)
+		i915_timeline_put(ctx->timeline);
+
 	kfree(ctx->name);
 	put_pid(ctx->pid);
 
@@ -478,12 +481,17 @@  static void __assign_ppgtt(struct i915_gem_context *ctx,
 
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *dev_priv,
-			struct drm_i915_file_private *file_priv)
+			struct drm_i915_file_private *file_priv,
+			unsigned int flags)
 {
 	struct i915_gem_context *ctx;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
+	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
+	    !HAS_EXECLISTS(dev_priv))
+		return ERR_PTR(-EINVAL);
+
 	/* Reap the most stale context */
 	contexts_free_first(dev_priv);
 
@@ -506,6 +514,18 @@  i915_gem_create_context(struct drm_i915_private *dev_priv,
 		i915_ppgtt_put(ppgtt);
 	}
 
+	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
+		struct i915_timeline *timeline;
+
+		timeline = i915_timeline_create(dev_priv, ctx->name, NULL);
+		if (IS_ERR(timeline)) {
+			__destroy_hw_context(ctx, file_priv);
+			return ERR_CAST(timeline);
+		}
+
+		ctx->timeline = timeline;
+	}
+
 	trace_i915_context_create(ctx);
 
 	return ctx;
@@ -534,7 +554,7 @@  i915_gem_context_create_gvt(struct drm_device *dev)
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = i915_gem_create_context(to_i915(dev), NULL);
+	ctx = i915_gem_create_context(to_i915(dev), NULL, 0);
 	if (IS_ERR(ctx))
 		goto out;
 
@@ -570,7 +590,7 @@  i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 	struct i915_gem_context *ctx;
 	int err;
 
-	ctx = i915_gem_create_context(i915, NULL);
+	ctx = i915_gem_create_context(i915, NULL, 0);
 	if (IS_ERR(ctx))
 		return ctx;
 
@@ -702,7 +722,7 @@  int i915_gem_context_open(struct drm_i915_private *i915,
 	idr_init_base(&file_priv->vm_idr, 1);
 
 	mutex_lock(&i915->drm.struct_mutex);
-	ctx = i915_gem_create_context(i915, file_priv);
+	ctx = i915_gem_create_context(i915, file_priv, 0);
 	mutex_unlock(&i915->drm.struct_mutex);
 	if (IS_ERR(ctx)) {
 		idr_destroy(&file_priv->context_idr);
@@ -818,7 +838,7 @@  last_request_on_engine(struct i915_timeline *timeline,
 
 	rq = i915_active_request_raw(&timeline->last_request,
 				     &engine->i915->drm.struct_mutex);
-	if (rq && rq->engine == engine) {
+	if (rq && rq->engine->mask & engine->mask) {
 		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
 			  timeline->name, engine->name,
 			  rq->fence.context, rq->fence.seqno);
@@ -1476,7 +1496,7 @@  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		return ret;
 
-	ctx = i915_gem_create_context(i915, file_priv);
+	ctx = i915_gem_create_context(i915, file_priv, args->flags);
 	mutex_unlock(&dev->struct_mutex);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 97cf9d3d07ae..e1f270c098f0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -43,6 +43,7 @@  struct drm_i915_private;
 struct drm_i915_file_private;
 struct i915_hw_ppgtt;
 struct i915_request;
+struct i915_timeline;
 struct i915_vma;
 struct intel_ring;
 
@@ -78,6 +79,8 @@  struct i915_gem_context {
 	/** file_priv: owning file descriptor */
 	struct drm_i915_file_private *file_priv;
 
+	struct i915_timeline *timeline;
+
 	/**
 	 * @ppgtt: unique address space (GTT)
 	 *
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9d111eedad5a..e0807a61dcf4 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1045,8 +1045,14 @@  void i915_request_add(struct i915_request *request)
 	prev = i915_active_request_raw(&timeline->last_request,
 				       &request->i915->drm.struct_mutex);
 	if (prev && !i915_request_completed(prev)) {
-		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
-					     &request->submitq);
+		if (is_power_of_2(prev->engine->mask | engine->mask))
+			i915_sw_fence_await_sw_fence(&request->submit,
+						     &prev->submit,
+						     &request->submitq);
+		else
+			__i915_sw_fence_await_dma_fence(&request->submit,
+							&prev->fence,
+							&request->dmaq);
 		if (engine->schedule)
 			__i915_sched_node_add_dependency(&request->sched,
 							 &prev->sched,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 3e0d62d35226..d4113074a8f6 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -128,7 +128,10 @@  struct i915_request {
 	 * It is used by the driver to then queue the request for execution.
 	 */
 	struct i915_sw_fence submit;
-	wait_queue_entry_t submitq;
+	union {
+		wait_queue_entry_t submitq;
+		struct i915_sw_dma_fence_cb dmaq;
+	};
 	struct list_head execute_cb;
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 8d1400d378d7..5387aafd3424 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -359,11 +359,6 @@  int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
 }
 
-struct i915_sw_dma_fence_cb {
-	struct dma_fence_cb base;
-	struct i915_sw_fence *fence;
-};
-
 struct i915_sw_dma_fence_cb_timer {
 	struct i915_sw_dma_fence_cb base;
 	struct dma_fence *dma;
@@ -480,6 +475,40 @@  int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 	return ret;
 }
 
+static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
+				     struct dma_fence_cb *data)
+{
+	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+	i915_sw_fence_complete(cb->fence);
+}
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb)
+{
+	int ret;
+
+	debug_fence_assert(fence);
+
+	if (dma_fence_is_signaled(dma))
+		return 0;
+
+	cb->fence = fence;
+	i915_sw_fence_await(fence);
+
+	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
+	if (ret == 0) {
+		ret = 1;
+	} else {
+		i915_sw_fence_complete(fence);
+		if (ret == -ENOENT) /* fence already signaled */
+			ret = 0;
+	}
+
+	return ret;
+}
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 6dec9e1d1102..9cb5c3b307a6 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -9,14 +9,13 @@ 
 #ifndef _I915_SW_FENCE_H_
 #define _I915_SW_FENCE_H_
 
+#include <linux/dma-fence.h>
 #include <linux/gfp.h>
 #include <linux/kref.h>
 #include <linux/notifier.h> /* for NOTIFY_DONE */
 #include <linux/wait.h>
 
 struct completion;
-struct dma_fence;
-struct dma_fence_ops;
 struct reservation_object;
 
 struct i915_sw_fence {
@@ -68,10 +67,20 @@  int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 				     struct i915_sw_fence *after,
 				     gfp_t gfp);
+
+struct i915_sw_dma_fence_cb {
+	struct dma_fence_cb base;
+	struct i915_sw_fence *fence;
+};
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb);
 int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 				  struct dma_fence *dma,
 				  unsigned long timeout,
 				  gfp_t gfp);
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 50a7c87705c8..d50a33c578c5 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2853,7 +2853,10 @@  static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 		goto error_deref_obj;
 	}
 
-	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
+	if (ctx->timeline)
+		timeline = i915_timeline_get(ctx->timeline);
+	else
+		timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
 	if (IS_ERR(timeline)) {
 		ret = PTR_ERR(timeline);
 		goto error_deref_obj;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 9133afc03135..15f016ca8e0d 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -76,7 +76,7 @@  static int live_nop_switch(void *arg)
 	}
 
 	for (n = 0; n < nctx; n++) {
-		ctx[n] = i915_gem_create_context(i915, file->driver_priv);
+		ctx[n] = i915_gem_create_context(i915, file->driver_priv, 0);
 		if (IS_ERR(ctx[n])) {
 			err = PTR_ERR(ctx[n]);
 			goto out_unlock;
@@ -526,7 +526,8 @@  static int igt_ctx_exec(void *arg)
 			struct i915_gem_context *ctx;
 			intel_wakeref_t wakeref;
 
-			ctx = i915_gem_create_context(i915, file->driver_priv);
+			ctx = i915_gem_create_context(i915,
+						      file->driver_priv, 0);
 			if (IS_ERR(ctx)) {
 				err = PTR_ERR(ctx);
 				goto out_unlock;
@@ -623,7 +624,8 @@  static int igt_shared_ctx_exec(void *arg)
 		if (err)
 			goto out_unlock;
 
-		parent = i915_gem_create_context(i915, file->driver_priv);
+		parent = i915_gem_create_context(i915,
+						 file->driver_priv, 0);
 		if (IS_ERR(parent)) {
 			err = PTR_ERR(parent);
 			if (err == -ENODEV) /* no logical ctx support */
@@ -645,7 +647,8 @@  static int igt_shared_ctx_exec(void *arg)
 			if (ctx)
 				__destroy_hw_context(ctx, file->driver_priv);
 
-			ctx = i915_gem_create_context(i915, file->driver_priv);
+			ctx = i915_gem_create_context(i915,
+						      file->driver_priv, 0);
 			if (IS_ERR(ctx)) {
 				err = PTR_ERR(ctx);
 				goto out_unlock;
@@ -1091,7 +1094,7 @@  __igt_ctx_sseu(struct drm_i915_private *i915,
 
 	mutex_lock(&i915->drm.struct_mutex);
 
-	ctx = i915_gem_create_context(i915, file->driver_priv);
+	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx)) {
 		ret = PTR_ERR(ctx);
 		goto out_unlock;
@@ -1201,7 +1204,7 @@  static int igt_ctx_readonly(void *arg)
 	if (err)
 		goto out_unlock;
 
-	ctx = i915_gem_create_context(i915, file->driver_priv);
+	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto out_unlock;
@@ -1527,13 +1530,13 @@  static int igt_vm_isolation(void *arg)
 	if (err)
 		goto out_unlock;
 
-	ctx_a = i915_gem_create_context(i915, file->driver_priv);
+	ctx_a = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx_a)) {
 		err = PTR_ERR(ctx_a);
 		goto out_unlock;
 	}
 
-	ctx_b = i915_gem_create_context(i915, file->driver_priv);
+	ctx_b = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx_b)) {
 		err = PTR_ERR(ctx_b);
 		goto out_unlock;
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 5eddf9fcfe8a..5d0ff2293abc 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -95,7 +95,7 @@  live_context(struct drm_i915_private *i915, struct drm_file *file)
 {
 	lockdep_assert_held(&i915->drm.struct_mutex);
 
-	return i915_gem_create_context(i915, file->driver_priv);
+	return i915_gem_create_context(i915, file->driver_priv, 0);
 }
 
 struct i915_gem_context *
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index eec635fb2e1c..451d2f36830b 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1452,8 +1452,9 @@  struct drm_i915_gem_context_create_ext {
 	__u32 ctx_id; /* output: id of new context*/
 	__u32 flags;
 #define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS	(1u << 0)
+#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE	(1u << 1)
 #define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
-	(-(I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS << 1))
+	(-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
 	__u64 extensions;
 };