
[1/3] drm/i915/gem: Don't leak non-persistent requests on changing engines

Message ID 20200207111124.2762388-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [1/3] drm/i915/gem: Don't leak non-persistent requests on changing engines

Commit Message

Chris Wilson Feb. 7, 2020, 11:11 a.m. UTC
If we have a set of active engines marked as non-persistent, we
lose track of them when the user replaces those engines with
I915_CONTEXT_PARAM_ENGINES. Part of our uABI contract is that
non-persistent requests are terminated if they are no longer
tracked by the user's context (in order to prevent a lost request
causing an untracked, and so unstoppable, GPU hang), so we need to
apply the same context cancellation upon changing engines.

v2: Track stale engines[] so we only reap at context closure.

Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional")
Testcase: igt/gem_ctx_persistence/replace
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 118 ++++++++++++++++--
 .../gpu/drm/i915/gem/i915_gem_context_types.h |  13 +-
 drivers/gpu/drm/i915/i915_sw_fence.c          |  15 ++-
 drivers/gpu/drm/i915/i915_sw_fence.h          |   2 +-
 4 files changed, 135 insertions(+), 13 deletions(-)

Comments

Tvrtko Ursulin Feb. 7, 2020, 4:46 p.m. UTC | #1
If you want quick & dirty feedback, read below; if you want something
smarter, wait some more. :)

On 07/02/2020 11:11, Chris Wilson wrote:
> If we have a set of active engines marked as non-persistent, we
> lose track of them when the user replaces those engines with
> I915_CONTEXT_PARAM_ENGINES. Part of our uABI contract is that
> non-persistent requests are terminated if they are no longer
> tracked by the user's context (in order to prevent a lost request
> causing an untracked, and so unstoppable, GPU hang), so we need to
> apply the same context cancellation upon changing engines.
> 
> v2: Track stale engines[] so we only reap at context closure.
> 
> Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional")
> Testcase: igt/gem_ctx_persistence/replace
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 118 ++++++++++++++++--
>   .../gpu/drm/i915/gem/i915_gem_context_types.h |  13 +-
>   drivers/gpu/drm/i915/i915_sw_fence.c          |  15 ++-
>   drivers/gpu/drm/i915/i915_sw_fence.h          |   2 +-
>   4 files changed, 135 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 52a749691a8d..579215161c21 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -270,7 +270,8 @@ static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
>   	if (!e)
>   		return ERR_PTR(-ENOMEM);
>   
> -	init_rcu_head(&e->rcu);
> +	e->ctx = ctx;
> +
>   	for_each_engine(engine, gt, id) {
>   		struct intel_context *ce;
>   
> @@ -450,7 +451,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
>   	return engine;
>   }
>   
> -static void kill_context(struct i915_gem_context *ctx)
> +static void kill_engines(struct i915_gem_engines *engines)
>   {
>   	struct i915_gem_engines_iter it;
>   	struct intel_context *ce;
> @@ -462,7 +463,7 @@ static void kill_context(struct i915_gem_context *ctx)
>   	 * However, we only care about pending requests, so only include
>   	 * engines on which there are incomplete requests.
>   	 */
> -	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
> +	for_each_gem_engine(ce, engines, it) {
>   		struct intel_engine_cs *engine;
>   
>   		if (intel_context_set_banned(ce))
> @@ -484,10 +485,41 @@ static void kill_context(struct i915_gem_context *ctx)
>   			 * the context from the GPU, we have to resort to a full
>   			 * reset. We hope the collateral damage is worth it.
>   			 */
> -			__reset_context(ctx, engine);
> +			__reset_context(engines->ctx, engine);
>   	}
>   }
>   
> +static void kill_stale_engines(struct i915_gem_context *ctx)
> +{
> +	struct i915_gem_engines *pos, *next;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ctx->stale.lock, flags);
> +	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
> +		if (!i915_sw_fence_await(&pos->fence))
> +			continue;

When is this path hit?

> +
> +		spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +
> +		kill_engines(pos);
> +
> +		spin_lock_irqsave(&ctx->stale.lock, flags);
> +		list_safe_reset_next(pos, next, link);
> +		list_del_init(&pos->link);
> +
> +		i915_sw_fence_complete(&pos->fence);

This will trigger FENCE_FREE below?

> +	}
> +	spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +}
> +
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	if (!list_empty(&ctx->stale.engines))
> +		kill_stale_engines(ctx);
> +
> +	kill_engines(__context_engines_static(ctx));
> +}
> +
>   static void set_closed_name(struct i915_gem_context *ctx)
>   {
>   	char *s;
> @@ -602,6 +634,9 @@ __create_context(struct drm_i915_private *i915)
>   	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>   	mutex_init(&ctx->mutex);
>   
> +	spin_lock_init(&ctx->stale.lock);
> +	INIT_LIST_HEAD(&ctx->stale.engines);
> +
>   	mutex_init(&ctx->engines_mutex);
>   	e = default_engines(ctx);
>   	if (IS_ERR(e)) {
> @@ -1529,6 +1564,71 @@ static const i915_user_extension_fn set_engines__extensions[] = {
>   	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
>   };
>   
> +static int engines_notify(struct i915_sw_fence *fence,
> +			  enum i915_sw_fence_notify state)
> +{
> +	struct i915_gem_engines *engines =
> +		container_of(fence, typeof(*engines), fence);
> +
> +	switch (state) {
> +	case FENCE_COMPLETE:
> +		if (!list_empty(&engines->link)) {

Why is it safe to look at the state of engines->link outside the lock?
We can have a race between context close and a completion event on a
stale engine, right?

> +			struct i915_gem_context *ctx = engines->ctx;
> +			unsigned long flags;
> +
> +			spin_lock_irqsave(&ctx->stale.lock, flags);
> +			list_del(&engines->link);
> +			spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +		}
> +		break;
> +
> +	case FENCE_FREE:
> +		init_rcu_head(&engines->rcu);
> +		call_rcu(&engines->rcu, free_engines_rcu);
> +		break;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static void engines_idle_release(struct i915_gem_engines *engines)
> +{
> +	struct i915_gem_engines_iter it;
> +	struct intel_context *ce;
> +	unsigned long flags;
> +
> +	GEM_BUG_ON(!engines);
> +	i915_sw_fence_init(&engines->fence, engines_notify);
> +
> +	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
> +	list_add(&engines->link, &engines->ctx->stale.engines);
> +	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
> +
> +	for_each_gem_engine(ce, engines, it) {
> +		struct dma_fence *fence;
> +		int err;
> +
> +		if (!ce->timeline)
> +			continue;

When does this happen?

> +
> +		fence = i915_active_fence_get(&ce->timeline->last_request);
> +		if (!fence)
> +			continue;
> +
> +		err = i915_sw_fence_await_dma_fence(&engines->fence,
> +						    fence, 0,
> +						    GFP_KERNEL);
> +
> +		dma_fence_put(fence);
> +		if (err < 0) {
> +			kill_engines(engines);
> +			break;

Okay to leave already setup awaits active in this case?

> +		}
> +	}
> +
> +	i915_sw_fence_commit(&engines->fence);
> +}
> +
>   static int
>   set_engines(struct i915_gem_context *ctx,
>   	    const struct drm_i915_gem_context_param *args)
> @@ -1571,7 +1671,8 @@ set_engines(struct i915_gem_context *ctx,
>   	if (!set.engines)
>   		return -ENOMEM;
>   
> -	init_rcu_head(&set.engines->rcu);
> +	set.engines->ctx = ctx;
> +
>   	for (n = 0; n < num_engines; n++) {
>   		struct i915_engine_class_instance ci;
>   		struct intel_engine_cs *engine;
> @@ -1631,7 +1732,8 @@ set_engines(struct i915_gem_context *ctx,
>   	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
>   	mutex_unlock(&ctx->engines_mutex);
>   
> -	call_rcu(&set.engines->rcu, free_engines_rcu);
> +	/* Keep track of old engine sets for kill_context() */
> +	engines_idle_release(set.engines);
>   
>   	return 0;
>   }
> @@ -1646,7 +1748,6 @@ __copy_engines(struct i915_gem_engines *e)
>   	if (!copy)
>   		return ERR_PTR(-ENOMEM);
>   
> -	init_rcu_head(&copy->rcu);
>   	for (n = 0; n < e->num_engines; n++) {
>   		if (e->engines[n])
>   			copy->engines[n] = intel_context_get(e->engines[n]);
> @@ -1890,7 +1991,8 @@ static int clone_engines(struct i915_gem_context *dst,
>   	if (!clone)
>   		goto err_unlock;
>   
> -	init_rcu_head(&clone->rcu);
> +	clone->ctx = dst;
> +
>   	for (n = 0; n < e->num_engines; n++) {
>   		struct intel_engine_cs *engine;
>   
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index 017ca803ab47..8d996dde8046 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -20,6 +20,7 @@
>   #include "gt/intel_context_types.h"
>   
>   #include "i915_scheduler.h"
> +#include "i915_sw_fence.h"
>   
>   struct pid;
>   
> @@ -30,7 +31,12 @@ struct intel_timeline;
>   struct intel_ring;
>   
>   struct i915_gem_engines {
> -	struct rcu_head rcu;
> +	union {
> +		struct rcu_head rcu;
> +		struct list_head link;
> +	};
> +	struct i915_sw_fence fence;
> +	struct i915_gem_context *ctx;
>   	unsigned int num_engines;
>   	struct intel_context *engines[];
>   };
> @@ -173,6 +179,11 @@ struct i915_gem_context {
>   	 * context in messages.
>   	 */
>   	char name[TASK_COMM_LEN + 8];
> +
> +	struct {
> +		struct spinlock lock;
> +		struct list_head engines;
> +	} stale;
>   };
>   
>   #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 51ba97daf2a0..9a20b7246f91 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -211,10 +211,19 @@ void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -void i915_sw_fence_await(struct i915_sw_fence *fence)
> +bool i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
> -	debug_fence_assert(fence);
> -	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> +	int old, new;
> +
> +	new = atomic_read(&fence->pending);
> +	do {
> +		if (new < 1)
> +			return false;
> +
> +		old = new++;
> +	} while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);
> +
> +	return true;

No idea what's happening here. Why was the existing code inadequate and 
what are you changing?

Regards,

Tvrtko

>   }
>   
>   void __i915_sw_fence_init(struct i915_sw_fence *fence,
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 19e806ce43bc..30a863353ee6 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -91,7 +91,7 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> -void i915_sw_fence_await(struct i915_sw_fence *fence);
> +bool i915_sw_fence_await(struct i915_sw_fence *fence);
>   void i915_sw_fence_complete(struct i915_sw_fence *fence);
>   
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
>
Chris Wilson Feb. 7, 2020, 4:55 p.m. UTC | #2
Quoting Tvrtko Ursulin (2020-02-07 16:46:55)
> 
> If you want quick & dirty feedback, read below; if you want something
> smarter, wait some more. :)
> 
> On 07/02/2020 11:11, Chris Wilson wrote:
> > +static void kill_stale_engines(struct i915_gem_context *ctx)
> > +{
> > +     struct i915_gem_engines *pos, *next;
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&ctx->stale.lock, flags);
> > +     list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
> > +             if (!i915_sw_fence_await(&pos->fence))
> > +                     continue;
> 
> When is this path hit?

Race with the interrupt callback.

> > +
> > +             spin_unlock_irqrestore(&ctx->stale.lock, flags);
> > +
> > +             kill_engines(pos);
> > +
> > +             spin_lock_irqsave(&ctx->stale.lock, flags);
> > +             list_safe_reset_next(pos, next, link);
> > +             list_del_init(&pos->link);
> > +
> > +             i915_sw_fence_complete(&pos->fence);
> 
> This will trigger FENCE_FREE below?

Yes, the final completion sends both notifications.
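
For reference, a rough sketch of that completion path (paraphrased from
i915_sw_fence.c, not a verbatim copy): once the last pending count is
released, FENCE_COMPLETE fires and, provided the notifier returns
NOTIFY_DONE, FENCE_FREE follows in the same call.

/* Paraphrase of __i915_sw_fence_complete(), illustrative only. */
static void sw_fence_complete_sketch(struct i915_sw_fence *fence)
{
	if (!atomic_dec_and_test(&fence->pending))
		return; /* other awaits still outstanding */

	/* first notification: the fence has signalled */
	if (__i915_sw_fence_notify(fence, FENCE_COMPLETE) != NOTIFY_DONE)
		return;

	/* waiters are woken here, then the container is handed back */
	__i915_sw_fence_notify(fence, FENCE_FREE);
}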

> > +static int engines_notify(struct i915_sw_fence *fence,
> > +                       enum i915_sw_fence_notify state)
> > +{
> > +     struct i915_gem_engines *engines =
> > +             container_of(fence, typeof(*engines), fence);
> > +
> > +     switch (state) {
> > +     case FENCE_COMPLETE:
> > +             if (!list_empty(&engines->link)) {
> 
> Why is it safe to look at the state of engines->link outside the lock?
> We can have a race between context close and a completion event on a
> stale engine, right?

There is no race :)

It's just coordination with kill_stale_engines().

> > +static void engines_idle_release(struct i915_gem_engines *engines)
> > +{
> > +     struct i915_gem_engines_iter it;
> > +     struct intel_context *ce;
> > +     unsigned long flags;
> > +
> > +     GEM_BUG_ON(!engines);
> > +     i915_sw_fence_init(&engines->fence, engines_notify);
> > +
> > +     spin_lock_irqsave(&engines->ctx->stale.lock, flags);
> > +     list_add(&engines->link, &engines->ctx->stale.engines);
> > +     spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
> > +
> > +     for_each_gem_engine(ce, engines, it) {
> > +             struct dma_fence *fence;
> > +             int err;
> > +
> > +             if (!ce->timeline)
> > +                     continue;
> 
> When does this happen?

Replacing the default engines before use. Or any engine set prior to
use.
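
A minimal userspace sketch of that scenario (an illustration against
the i915 uAPI, assuming an already-open i915 fd and ignoring error
handling): the context's default engines are replaced before any
execbuf has pinned them, so ce->timeline is still NULL when
engines_idle_release() walks the old set.

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Replace the engine map of a never-used context: no requests, no timelines. */
static void replace_engines_before_use(int i915_fd)
{
	struct drm_i915_gem_context_create create = {};
	struct drm_i915_gem_context_param param = {};
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = {};

	ioctl(i915_fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);

	engines.engines[0].engine_class = I915_ENGINE_CLASS_RENDER;
	engines.engines[0].engine_instance = 0;

	param.ctx_id = create.ctx_id;
	param.param = I915_CONTEXT_PARAM_ENGINES;
	param.size = sizeof(engines);
	param.value = (uintptr_t)&engines;

	/*
	 * No request has ever run on the default engines, so there is
	 * nothing for engines_idle_release() to await.
	 */
	ioctl(i915_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &param);
}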

> > +
> > +             fence = i915_active_fence_get(&ce->timeline->last_request);
> > +             if (!fence)
> > +                     continue;
> > +
> > +             err = i915_sw_fence_await_dma_fence(&engines->fence,
> > +                                                 fence, 0,
> > +                                                 GFP_KERNEL);
> > +
> > +             dma_fence_put(fence);
> > +             if (err < 0) {
> > +                     kill_engines(engines);
> > +                     break;
> 
> Okay to leave already setup awaits active in this case?

Yes. They will be signaled. It may seem a bit harsh, but we fell into an
unlikely error path and have to do something.

> > -void i915_sw_fence_await(struct i915_sw_fence *fence)
> > +bool i915_sw_fence_await(struct i915_sw_fence *fence)
> >   {
> > -     debug_fence_assert(fence);
> > -     WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> > +     int old, new;
> > +
> > +     new = atomic_read(&fence->pending);
> > +     do {
> > +             if (new < 1)
> > +                     return false;
> > +
> > +             old = new++;
> > +     } while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);
> > +
> > +     return true;
> 
> No idea what's happening here. Why was the existing code inadequate and 
> what are you changing?

I needed an await_if_busy to handle the race with the interrupts.
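
The semantics being added amount to "take an await reference only if
the fence is still pending". A standalone sketch of the same
compare-and-swap pattern, written with C11 atomics purely for
illustration (the kernel code above uses atomic_cmpxchg() on
fence->pending):

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Increment "pending" unless it has already dropped to zero,
 * i.e. unless the fence has already signalled.
 */
static bool await_if_busy(atomic_int *pending)
{
	int old = atomic_load(pending);

	do {
		if (old < 1)
			return false; /* already signalled; too late to await */
	} while (!atomic_compare_exchange_weak(pending, &old, old + 1));

	return true; /* caller now owes one i915_sw_fence_complete() */
}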
-Chris
Chris Wilson Feb. 10, 2020, 4:06 p.m. UTC | #3
Quoting Patchwork (2020-02-10 16:00:41)
> #### Possible fixes ####
>
>   * {igt@gem_ctx_persistence@replace-hostile@vcs0}:
>     - shard-kbl:          [FAIL][27] ([i915#1154]) -> [PASS][28] +9 similar issues
>    [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7886/shard-kbl1/igt@gem_ctx_persistence@replace-hostile@vcs0.html
>    [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16478/shard-kbl1/igt@gem_ctx_persistence@replace-hostile@vcs0.html
> 
>   * {igt@gem_ctx_persistence@replace@vcs0}:
>     - shard-apl:          [FAIL][29] ([i915#1154]) -> [PASS][30] +7 similar issues
>    [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7886/shard-apl4/igt@gem_ctx_persistence@replace@vcs0.html
>    [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16478/shard-apl3/igt@gem_ctx_persistence@replace@vcs0.html
>     - shard-iclb:         [FAIL][31] ([i915#1154]) -> [PASS][32] +7 similar issues
>    [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7886/shard-iclb2/igt@gem_ctx_persistence@replace@vcs0.html
>    [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16478/shard-iclb7/igt@gem_ctx_persistence@replace@vcs0.html
>     - shard-glk:          [FAIL][33] ([i915#1154]) -> [PASS][34] +7 similar issues
>    [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7886/shard-glk9/igt@gem_ctx_persistence@replace@vcs0.html
>    [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16478/shard-glk9/igt@gem_ctx_persistence@replace@vcs0.html
> 
>   * {igt@gem_ctx_persistence@replace@vecs0}:
>     - shard-tglb:         [FAIL][35] ([i915#1154]) -> [PASS][36] +9 similar issues
>    [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7886/shard-tglb1/igt@gem_ctx_persistence@replace@vecs0.html
>    [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16478/shard-tglb7/igt@gem_ctx_persistence@replace@vecs0.html

Yay \o/
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 52a749691a8d..579215161c21 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -270,7 +270,8 @@  static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
 	if (!e)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&e->rcu);
+	e->ctx = ctx;
+
 	for_each_engine(engine, gt, id) {
 		struct intel_context *ce;
 
@@ -450,7 +451,7 @@  static struct intel_engine_cs *active_engine(struct intel_context *ce)
 	return engine;
 }
 
-static void kill_context(struct i915_gem_context *ctx)
+static void kill_engines(struct i915_gem_engines *engines)
 {
 	struct i915_gem_engines_iter it;
 	struct intel_context *ce;
@@ -462,7 +463,7 @@  static void kill_context(struct i915_gem_context *ctx)
 	 * However, we only care about pending requests, so only include
 	 * engines on which there are incomplete requests.
 	 */
-	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
+	for_each_gem_engine(ce, engines, it) {
 		struct intel_engine_cs *engine;
 
 		if (intel_context_set_banned(ce))
@@ -484,10 +485,41 @@  static void kill_context(struct i915_gem_context *ctx)
 			 * the context from the GPU, we have to resort to a full
 			 * reset. We hope the collateral damage is worth it.
 			 */
-			__reset_context(ctx, engine);
+			__reset_context(engines->ctx, engine);
 	}
 }
 
+static void kill_stale_engines(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines *pos, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->stale.lock, flags);
+	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
+		if (!i915_sw_fence_await(&pos->fence))
+			continue;
+
+		spin_unlock_irqrestore(&ctx->stale.lock, flags);
+
+		kill_engines(pos);
+
+		spin_lock_irqsave(&ctx->stale.lock, flags);
+		list_safe_reset_next(pos, next, link);
+		list_del_init(&pos->link);
+
+		i915_sw_fence_complete(&pos->fence);
+	}
+	spin_unlock_irqrestore(&ctx->stale.lock, flags);
+}
+
+static void kill_context(struct i915_gem_context *ctx)
+{
+	if (!list_empty(&ctx->stale.engines))
+		kill_stale_engines(ctx);
+
+	kill_engines(__context_engines_static(ctx));
+}
+
 static void set_closed_name(struct i915_gem_context *ctx)
 {
 	char *s;
@@ -602,6 +634,9 @@  __create_context(struct drm_i915_private *i915)
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 	mutex_init(&ctx->mutex);
 
+	spin_lock_init(&ctx->stale.lock);
+	INIT_LIST_HEAD(&ctx->stale.engines);
+
 	mutex_init(&ctx->engines_mutex);
 	e = default_engines(ctx);
 	if (IS_ERR(e)) {
@@ -1529,6 +1564,71 @@  static const i915_user_extension_fn set_engines__extensions[] = {
 	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
+static int engines_notify(struct i915_sw_fence *fence,
+			  enum i915_sw_fence_notify state)
+{
+	struct i915_gem_engines *engines =
+		container_of(fence, typeof(*engines), fence);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		if (!list_empty(&engines->link)) {
+			struct i915_gem_context *ctx = engines->ctx;
+			unsigned long flags;
+
+			spin_lock_irqsave(&ctx->stale.lock, flags);
+			list_del(&engines->link);
+			spin_unlock_irqrestore(&ctx->stale.lock, flags);
+		}
+		break;
+
+	case FENCE_FREE:
+		init_rcu_head(&engines->rcu);
+		call_rcu(&engines->rcu, free_engines_rcu);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void engines_idle_release(struct i915_gem_engines *engines)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	unsigned long flags;
+
+	GEM_BUG_ON(!engines);
+	i915_sw_fence_init(&engines->fence, engines_notify);
+
+	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
+	list_add(&engines->link, &engines->ctx->stale.engines);
+	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
+
+	for_each_gem_engine(ce, engines, it) {
+		struct dma_fence *fence;
+		int err;
+
+		if (!ce->timeline)
+			continue;
+
+		fence = i915_active_fence_get(&ce->timeline->last_request);
+		if (!fence)
+			continue;
+
+		err = i915_sw_fence_await_dma_fence(&engines->fence,
+						    fence, 0,
+						    GFP_KERNEL);
+
+		dma_fence_put(fence);
+		if (err < 0) {
+			kill_engines(engines);
+			break;
+		}
+	}
+
+	i915_sw_fence_commit(&engines->fence);
+}
+
 static int
 set_engines(struct i915_gem_context *ctx,
 	    const struct drm_i915_gem_context_param *args)
@@ -1571,7 +1671,8 @@  set_engines(struct i915_gem_context *ctx,
 	if (!set.engines)
 		return -ENOMEM;
 
-	init_rcu_head(&set.engines->rcu);
+	set.engines->ctx = ctx;
+
 	for (n = 0; n < num_engines; n++) {
 		struct i915_engine_class_instance ci;
 		struct intel_engine_cs *engine;
@@ -1631,7 +1732,8 @@  set_engines(struct i915_gem_context *ctx,
 	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
 	mutex_unlock(&ctx->engines_mutex);
 
-	call_rcu(&set.engines->rcu, free_engines_rcu);
+	/* Keep track of old engine sets for kill_context() */
+	engines_idle_release(set.engines);
 
 	return 0;
 }
@@ -1646,7 +1748,6 @@  __copy_engines(struct i915_gem_engines *e)
 	if (!copy)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&copy->rcu);
 	for (n = 0; n < e->num_engines; n++) {
 		if (e->engines[n])
 			copy->engines[n] = intel_context_get(e->engines[n]);
@@ -1890,7 +1991,8 @@  static int clone_engines(struct i915_gem_context *dst,
 	if (!clone)
 		goto err_unlock;
 
-	init_rcu_head(&clone->rcu);
+	clone->ctx = dst;
+
 	for (n = 0; n < e->num_engines; n++) {
 		struct intel_engine_cs *engine;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 017ca803ab47..8d996dde8046 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -20,6 +20,7 @@ 
 #include "gt/intel_context_types.h"
 
 #include "i915_scheduler.h"
+#include "i915_sw_fence.h"
 
 struct pid;
 
@@ -30,7 +31,12 @@  struct intel_timeline;
 struct intel_ring;
 
 struct i915_gem_engines {
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct list_head link;
+	};
+	struct i915_sw_fence fence;
+	struct i915_gem_context *ctx;
 	unsigned int num_engines;
 	struct intel_context *engines[];
 };
@@ -173,6 +179,11 @@  struct i915_gem_context {
 	 * context in messages.
 	 */
 	char name[TASK_COMM_LEN + 8];
+
+	struct {
+		struct spinlock lock;
+		struct list_head engines;
+	} stale;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 51ba97daf2a0..9a20b7246f91 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -211,10 +211,19 @@  void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-void i915_sw_fence_await(struct i915_sw_fence *fence)
+bool i915_sw_fence_await(struct i915_sw_fence *fence)
 {
-	debug_fence_assert(fence);
-	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+	int old, new;
+
+	new = atomic_read(&fence->pending);
+	do {
+		if (new < 1)
+			return false;
+
+		old = new++;
+	} while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);
+
+	return true;
 }
 
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 19e806ce43bc..30a863353ee6 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -91,7 +91,7 @@  int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
-void i915_sw_fence_await(struct i915_sw_fence *fence);
+bool i915_sw_fence_await(struct i915_sw_fence *fence);
 void i915_sw_fence_complete(struct i915_sw_fence *fence);
 
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)