[3/5] drm/i915: Remove requirement for holding i915_request.lock for breadcrumbs

Message ID 20200716113357.7644-3-chris@chris-wilson.co.uk
State New, archived
Series [1/5] drm/i915: Be wary of data races when reading the active execlists

Commit Message

Chris Wilson July 16, 2020, 11:33 a.m. UTC
Since the breadcrumb enabling/cancelling itself is serialised by the
breadcrumbs.irq_lock, with a bit of care we can remove the outer
serialisation with i915_request.lock for concurrent
dma_fence_enable_signaling(). This has the important side-effect of
eliminating the nested i915_request.lock within request submission.

The challenge in serialisation is around the unsubmission where we take
an active request that wants a breadcrumb on the signaling engine and
put it to sleep. We do not want a concurrent
dma_fence_enable_signaling() to attach a breadcrumb as we unsubmit, so
we must mark the request as no longer active before serialising with the
concurrent enable-signaling.
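
For illustration, the resulting ordering can be sketched like so (a
simplified paraphrase of the __i915_request_unsubmit() and
i915_request_enable_breadcrumb() hunks below, not a literal excerpt):

	/* __i915_request_unsubmit(), under engine->active.lock */
	clear_bit_unlock(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq); /* serialises on b->irq_lock */

	/* i915_request_enable_breadcrumb(), once b->irq_lock is held */
	if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags))
		insert_breadcrumb(rq, b); /* skipped after unsubmit clears ACTIVE */

The clear_bit_unlock() provides the release barrier that makes the
"no longer active" store visible before we go on to detach the
breadcrumb under b->irq_lock.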

On retire, we serialise with the concurrent enable-signaling, but
instead of clearing ACTIVE, we mark it as SIGNALED.
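
A matching sketch of the retire side (again paraphrasing the hunks
below; i915_request_enable_breadcrumb() is called by the dma-fence
core with rq->lock, the fence lock, already held):

	/* i915_request_retire(), under spin_lock_irq(&rq->lock) */
	dma_fence_signal_locked(&rq->fence); /* sets DMA_FENCE_FLAG_SIGNALED_BIT */
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);

	/* i915_request_enable_breadcrumb(), entry check */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
		return true; /* already retired, nothing to attach */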

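A further subtlety: from dma_fence_enable_signaling() we are outside the
submit/unsubmit paths, so rq->engine may change under us before we can
take its breadcrumb lock. The patch handles this with a
lock-and-revalidate loop (adapted from the hunk below):

	b = &READ_ONCE(rq->engine)->breadcrumbs;
	spin_lock(&b->irq_lock);
	while (unlikely(b != &READ_ONCE(rq->engine)->breadcrumbs)) {
		/* the engine changed before we took its lock; retry */
		spin_unlock(&b->irq_lock);
		b = &READ_ONCE(rq->engine)->breadcrumbs;
		spin_lock(&b->irq_lock);
	}
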
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 130 +++++++++++++-------
 drivers/gpu/drm/i915/gt/intel_lrc.c         |  14 ---
 drivers/gpu/drm/i915/i915_request.c         |  39 +++---
 3 files changed, 100 insertions(+), 83 deletions(-)

Comments

Tvrtko Ursulin July 16, 2020, 3:09 p.m. UTC | #1
On 16/07/2020 12:33, Chris Wilson wrote:
> Since the breadcrumb enabling/cancelling itself is serialised by the
> breadcrumbs.irq_lock, with a bit of care we can remove the outer
> serialisation with i915_request.lock for concurrent
> dma_fence_enable_signaling(). This has the important side-effect of
> eliminating the nested i915_request.lock within request submission.
> 
> The challenge in serialisation is around the unsubmission where we take
> an active request that wants a breadcrumb on the signaling engine and
> put it to sleep. We do not want a concurrent
> dma_fence_enable_signaling() to attach a breadcrumb as we unsubmit, so
> we must mark the request as no longer active before serialising with the
> concurrent enable-signaling.
> 
> On retire, we serialise with the concurrent enable-signaling, but
> instead of clearing ACTIVE, we mark it as SIGNALED.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 130 +++++++++++++-------
>   drivers/gpu/drm/i915/gt/intel_lrc.c         |  14 ---
>   drivers/gpu/drm/i915/i915_request.c         |  39 +++---
>   3 files changed, 100 insertions(+), 83 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> index 91786310c114..a0f52417238c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> @@ -220,17 +220,17 @@ static void signal_irq_work(struct irq_work *work)
>   	}
>   }
>   
> -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
> +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   {
>   	struct intel_engine_cs *engine =
>   		container_of(b, struct intel_engine_cs, breadcrumbs);
>   
>   	lockdep_assert_held(&b->irq_lock);
>   	if (b->irq_armed)
> -		return true;
> +		return;
>   
>   	if (!intel_gt_pm_get_if_awake(engine->gt))
> -		return false;
> +		return;
>   
>   	/*
>   	 * The breadcrumb irq will be disarmed on the interrupt after the
> @@ -250,8 +250,6 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   
>   	if (!b->irq_enabled++)
>   		irq_enable(engine);
> -
> -	return true;
>   }
>   
>   void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> @@ -310,57 +308,99 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
>   {
>   }
>   
> -bool i915_request_enable_breadcrumb(struct i915_request *rq)
> +static void insert_breadcrumb(struct i915_request *rq,
> +			      struct intel_breadcrumbs *b)
>   {
> -	lockdep_assert_held(&rq->lock);
> +	struct intel_context *ce = rq->context;
> +	struct list_head *pos;
>   
> -	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> -		return true;
> +	if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
> +		return;
>   
> -	if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
> -		struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
> -		struct intel_context *ce = rq->context;
> -		struct list_head *pos;
> +	__intel_breadcrumbs_arm_irq(b);
>   
> -		spin_lock(&b->irq_lock);
> +	/*
> +	 * We keep the seqno in retirement order, so we can break
> +	 * inside intel_engine_signal_breadcrumbs as soon as we've
> +	 * passed the last completed request (or seen a request that
> +	 * hasn't event started). We could walk the timeline->requests,
> +	 * but keeping a separate signalers_list has the advantage of
> +	 * hopefully being much smaller than the full list and so
> +	 * provides faster iteration and detection when there are no
> +	 * more interrupts required for this context.
> +	 *
> +	 * We typically expect to add new signalers in order, so we
> +	 * start looking for our insertion point from the tail of
> +	 * the list.
> +	 */
> +	list_for_each_prev(pos, &ce->signals) {
> +		struct i915_request *it =
> +			list_entry(pos, typeof(*it), signal_link);
>   
> -		if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
> -			goto unlock;
> +		if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno))
> +			break;
> +	}
> +	list_add(&rq->signal_link, pos);
> +	if (pos == &ce->signals) /* catch transitions from empty list */
> +		list_move_tail(&ce->signal_link, &b->signalers);
> +	GEM_BUG_ON(!check_signal_order(ce, rq));
>   
> -		if (!__intel_breadcrumbs_arm_irq(b))
> -			goto unlock;
> +	set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> +}
>   
> -		/*
> -		 * We keep the seqno in retirement order, so we can break
> -		 * inside intel_engine_signal_breadcrumbs as soon as we've
> -		 * passed the last completed request (or seen a request that
> -		 * hasn't event started). We could walk the timeline->requests,
> -		 * but keeping a separate signalers_list has the advantage of
> -		 * hopefully being much smaller than the full list and so
> -		 * provides faster iteration and detection when there are no
> -		 * more interrupts required for this context.
> -		 *
> -		 * We typically expect to add new signalers in order, so we
> -		 * start looking for our insertion point from the tail of
> -		 * the list.
> -		 */
> -		list_for_each_prev(pos, &ce->signals) {
> -			struct i915_request *it =
> -				list_entry(pos, typeof(*it), signal_link);
> +bool i915_request_enable_breadcrumb(struct i915_request *rq)
> +{
> +	struct intel_breadcrumbs *b;
>   
> -			if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno))
> -				break;
> -		}
> -		list_add(&rq->signal_link, pos);
> -		if (pos == &ce->signals) /* catch transitions from empty list */
> -			list_move_tail(&ce->signal_link, &b->signalers);
> -		GEM_BUG_ON(!check_signal_order(ce, rq));
> +	/* Seralises with i915_request_retire() using rq->lock */

Serialises

> +	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> +		return true;
>   
> -		set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> -unlock:
> +	/*
> +	 * Peek at i915_request_submit()/i915_request_unsubmit() status.
> +	 *
> +	 * If the request is not yet active (and not signaled), we will
> +	 * attach the breadcrumb later.
> +	 */
> +	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags))
> +		return true;
> +
> +	/*
> +	 * rq->engine is locked by rq->engine->active.lock. That however
> +	 * is not known until after rq->engine has been dereferenced and
> +	 * the lock acquired. Hence we acquire the lock and then validate
> +	 * that rq->engine still matches the lock we hold for it.
> +	 *
> +	 * Here, we are using the breadcrumb lock as a proxy for the
> +	 * rq->engine->active.lock, and we know that since the breadcrumb
> +	 * will be serialised within i915_request_submit/i915_request_unsubmit,
> +	 * the engine cannot change while active as long as we hold the
> +	 * breadcrumb lock on that engine.
> +	 *
> +	 * From the dma_fence_enable_signaling() path, we are outside of the
> +	 * request submit/unsubmit path, and so we must be more careful to
> +	 * acquire the right lock.
> +	 */
> +	b = &READ_ONCE(rq->engine)->breadcrumbs;
> +	spin_lock(&b->irq_lock);
> +	while (unlikely(b != &READ_ONCE(rq->engine)->breadcrumbs)) {
>   		spin_unlock(&b->irq_lock);
> +		b = &READ_ONCE(rq->engine)->breadcrumbs;
> +		spin_lock(&b->irq_lock);
>   	}
>   
> +	/*
> +	 * Now that we are finally serialised with request submit/unsubmit,
> +	 * [with b->irq_lock] and with i915_request_reitre() [via checking

retire

> +	 * SIGNALED with rq->lock] confirm the request is indeed active. If
> +	 * it is no longer active, the breadcrumb will be attached upon
> +	 * i915_request_submit().
> +	 */
> +	if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags))
> +		insert_breadcrumb(rq, b);
> +
> +	spin_unlock(&b->irq_lock);
> +
>   	return !__request_completed(rq);
>   }
>   
> @@ -368,8 +408,6 @@ void i915_request_cancel_breadcrumb(struct i915_request *rq)
>   {
>   	struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
>   
> -	lockdep_assert_held(&rq->lock);
> -
>   	/*
>   	 * We must wait for b->irq_lock so that we know the interrupt handler
>   	 * has released its reference to the intel_context and has completed
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 29c0fde8b4df..21c16e31c4fe 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1148,20 +1148,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		} else {
>   			struct intel_engine_cs *owner = rq->context->engine;
>   
> -			/*
> -			 * Decouple the virtual breadcrumb before moving it
> -			 * back to the virtual engine -- we don't want the
> -			 * request to complete in the background and try
> -			 * and cancel the breadcrumb on the virtual engine
> -			 * (instead of the old engine where it is linked)!
> -			 */
> -			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> -				     &rq->fence.flags)) {
> -				spin_lock_nested(&rq->lock,
> -						 SINGLE_DEPTH_NESTING);
> -				i915_request_cancel_breadcrumb(rq);
> -				spin_unlock(&rq->lock);
> -			}
>   			WRITE_ONCE(rq->engine, owner);
>   			owner->submit_request(rq);
>   			active = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 2ef17b11ca4b..8c345ead04a6 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -320,11 +320,12 @@ bool i915_request_retire(struct i915_request *rq)
>   		dma_fence_signal_locked(&rq->fence);
>   	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
>   		i915_request_cancel_breadcrumb(rq);
> +	spin_unlock_irq(&rq->lock);
> +
>   	if (i915_request_has_waitboost(rq)) {
>   		GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
>   		atomic_dec(&rq->engine->gt->rps.num_waiters);
>   	}
> -	spin_unlock_irq(&rq->lock);
>   
>   	/*
>   	 * We only loosely track inflight requests across preemption,
> @@ -608,17 +609,9 @@ bool __i915_request_submit(struct i915_request *request)
>   	 */
>   	__notify_execute_cb_irq(request);
>   
> -	/* We may be recursing from the signal callback of another i915 fence */
> -	if (!i915_request_signaled(request)) {
> -		spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
> -
> -		if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> -			     &request->fence.flags) &&
> -		    !i915_request_enable_breadcrumb(request))
> -			intel_engine_signal_breadcrumbs(engine);
> -
> -		spin_unlock(&request->lock);
> -	}
> +	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
> +	    !i915_request_enable_breadcrumb(request))
> +		intel_engine_signal_breadcrumbs(engine);
>   
>   	return result;
>   }
> @@ -640,27 +633,27 @@ void __i915_request_unsubmit(struct i915_request *request)
>   {
>   	struct intel_engine_cs *engine = request->engine;
>   
> +	/*
> +	 * Only unwind in reverse order, required so that the per-context list
> +	 * is kept in seqno/ring order.
> +	 */
>   	RQ_TRACE(request, "\n");
>   
>   	GEM_BUG_ON(!irqs_disabled());
>   	lockdep_assert_held(&engine->active.lock);
>   
>   	/*
> -	 * Only unwind in reverse order, required so that the per-context list
> -	 * is kept in seqno/ring order.
> +	 * Before we remove this breadcrumb from the signal list, we have
> +	 * to ensure that a concurrent dma_fence_enable_signaling() does not
> +	 * attach itself. We first mark the request as no longer active and
> +	 * make sure that is visible to other cores, and then remove the
> +	 * breadcrumb if attached.
>   	 */
> -
> -	/* We may be recursing from the signal callback of another i915 fence */
> -	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
> -
> +	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
> +	clear_bit_unlock(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
>   	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
>   		i915_request_cancel_breadcrumb(request);
>   
> -	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
> -	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
> -
> -	spin_unlock(&request->lock);
> -
>   	/* We've already spun, don't charge on resubmitting. */
>   	if (request->sched.semaphores && i915_request_started(request))
>   		request->sched.semaphores = 0;
> 

I did not find a hole (race) after quite a bit of straining the grey
matter, so let's see..

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko