diff mbox series

[v2] drm/i915/gt: Ensure irqs' status does not change with spin_unlock

Message ID 5fnb3l7s5hr3yfehkpvf4jgcunm6qclxagvssdobvfxbwtjiyc@jzko7kb7svud (mailing list archive)
State New
Headers show
Series [v2] drm/i915/gt: Ensure irqs' status does not change with spin_unlock | expand

Commit Message

Krzysztof Karas Jan. 14, 2025, 9 a.m. UTC
spin_unlock() function enables irqs regardless of their state
before spin_lock() was called. This might result in an interrupt
while holding a lock further down in the execution, as seen in
GitLab issue #13399.

Try to remedy the problem by saving irq state before spin lock
acquisition.

v2: add irqs' state save/restore calls to all locks/unlocks in
 signal_irq_work() execution (Maciej)

Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
---
This issue is hit rarely on CI and I was not able to reproduce
it locally. There might be more places where we should save and
restore irq state, so I am not adding a "Closes" label for the
issue yet.

 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 21 ++++++++++++-------
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
 2 files changed, 16 insertions(+), 10 deletions(-)

Comments

Tvrtko Ursulin Jan. 14, 2025, 10:06 a.m. UTC | #1
Hi,

On 14/01/2025 09:00, Krzysztof Karas wrote:
> spin_unlock() function enables irqs regardless of their state

It doesn't, you confuse spin_unlock with spin_unlock_irq.

> before spin_lock() was called. This might result in an interrupt
> while holding a lock further down in the execution, as seen in
> GitLab issue #13399.
> 
> Try to remedy the problem by saving irq state before spin lock
> acquisition.

Please check guc_lrc_desc_unpin(). It gets called from the
destroyed_worker_func, as hinted by lockdep in 13399. There is a plain
spin_lock() in there (in contradiction with itself). Fixing that one may
be all that is needed to fix this correctly.

If that turns out right then also:

Fixes: 2f2cc53b5fe7 ("drm/i915/guc: Close deregister-context race 
against CT-loss")
Cc: <stable@vger.kernel.org> # v6.9+

Regards,

Tvrtko

> 
> v2: add irqs' state save/restore calls to all locks/unlocks in
>   signal_irq_work() execution (Maciej)
> 
> Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
> ---
> This issue is hit rarely on CI and I was not able to reproduce
> it locally. There might be more places where we should save and
> restore irq state, so I am not adding "Closes" label for the
> issue yet.
> 
>   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 21 ++++++++++++-------
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
>   2 files changed, 16 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> index cc866773ba6f..dd5542726b41 100644
> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> @@ -53,13 +53,15 @@ static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   
>   static void intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
>   {
> +	unsigned long flags;
> +
>   	if (!b->irq_engine)
>   		return;
>   
> -	spin_lock(&b->irq_lock);
> +	spin_lock_irqsave(&b->irq_lock, flags);
>   	if (!b->irq_armed)
>   		__intel_breadcrumbs_arm_irq(b);
> -	spin_unlock(&b->irq_lock);
> +	spin_unlock_irqrestore(&b->irq_lock, flags);
>   }
>   
>   static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
> @@ -76,10 +78,12 @@ static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>   
>   static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>   {
> -	spin_lock(&b->irq_lock);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&b->irq_lock, flags);
>   	if (b->irq_armed)
>   		__intel_breadcrumbs_disarm_irq(b);
> -	spin_unlock(&b->irq_lock);
> +	spin_unlock_irqrestore(&b->irq_lock, flags);
>   }
>   
>   static void add_signaling_context(struct intel_breadcrumbs *b,
> @@ -173,6 +177,7 @@ static void signal_irq_work(struct irq_work *work)
>   	const ktime_t timestamp = ktime_get();
>   	struct llist_node *signal, *sn;
>   	struct intel_context *ce;
> +	unsigned long flags;
>   
>   	signal = NULL;
>   	if (unlikely(!llist_empty(&b->signaled_requests)))
> @@ -226,10 +231,10 @@ static void signal_irq_work(struct irq_work *work)
>   			 * spinlock as the callback chain may end up adding
>   			 * more signalers to the same context or engine.
>   			 */
> -			spin_lock(&ce->signal_lock);
> +			spin_lock_irqsave(&ce->signal_lock, flags);
>   			list_del_rcu(&rq->signal_link);
>   			release = remove_signaling_context(b, ce);
> -			spin_unlock(&ce->signal_lock);
> +			spin_unlock_irqrestore(&ce->signal_lock, flags);
>   			if (release) {
>   				if (intel_timeline_is_last(ce->timeline, rq))
>   					add_retire(b, ce->timeline);
> @@ -254,11 +259,11 @@ static void signal_irq_work(struct irq_work *work)
>   		if (rq->engine->sched_engine->retire_inflight_request_prio)
>   			rq->engine->sched_engine->retire_inflight_request_prio(rq);
>   
> -		spin_lock(&rq->lock);
> +		spin_lock_irqsave(&rq->lock, flags);
>   		list_replace(&rq->fence.cb_list, &cb_list);
>   		__dma_fence_signal__timestamp(&rq->fence, timestamp);
>   		__dma_fence_signal__notify(&rq->fence, &cb_list);
> -		spin_unlock(&rq->lock);
> +		spin_unlock_irqrestore(&rq->lock, flags);
>   
>   		i915_request_put(rq);
>   	}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 12f1ba7ca9c1..e9102f7246f5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -4338,10 +4338,11 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq,
>   static void guc_retire_inflight_request_prio(struct i915_request *rq)
>   {
>   	struct intel_context *ce = request_to_scheduling_context(rq);
> +	unsigned long flags;
>   
> -	spin_lock(&ce->guc_state.lock);
> +	spin_lock_irqsave(&ce->guc_state.lock, flags);
>   	guc_prio_fini(rq, ce);
> -	spin_unlock(&ce->guc_state.lock);
> +	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>   }
>   
>   static void sanitize_hwsp(struct intel_engine_cs *engine)
Krzysztof Karas Jan. 14, 2025, 11:54 a.m. UTC | #2
Hi Tvrtko,

> On 14/01/2025 09:00, Krzysztof Karas wrote:
> > spin_unlock() function enables irqs regardless of their state
> 
> It doesn't, you confuse spin_unlock with spin_unlock_irq.
> 
> > before spin_lock() was called. This might result in an interrupt
> > while holding a lock further down in the execution, as seen in
> > GitLab issue #13399.
> > 
> > Try to remedy the problem by saving irq state before spin lock
> > acquisition.
> 
> Please check guc_lrc_desc_unpin(). It gets called from the
> destroyed_worker_func as  hinted by lockdep in 13399. There is a plain
> spin_lock() in there (in contradiction with itself). Fixing that one may be
> all that is needed to fix this correctly.
> 
> If that turns out right then also:
> 
> Fixes: 2f2cc53b5fe7 ("drm/i915/guc: Close deregister-context race against
> CT-loss")
> Cc: <stable@vger.kernel.org> # v6.9+
Thanks for explaining. I'll try that out then.

Krzysztof

> 
> Regards,
> 
> Tvrtko
> 
> > 
> > v2: add irqs' state save/restore calls to all locks/unlocks in
> >   signal_irq_work() execution (Maciej)
> > 
> > Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
> > ---
> > This issue is hit rarely on CI and I was not able to reproduce
> > it locally. There might be more places where we should save and
> > restore irq state, so I am not adding "Closes" label for the
> > issue yet.
> > 
> >   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 21 ++++++++++++-------
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  5 +++--
> >   2 files changed, 16 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > index cc866773ba6f..dd5542726b41 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > @@ -53,13 +53,15 @@ static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
> >   static void intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
> >   {
> > +	unsigned long flags;
> > +
> >   	if (!b->irq_engine)
> >   		return;
> > -	spin_lock(&b->irq_lock);
> > +	spin_lock_irqsave(&b->irq_lock, flags);
> >   	if (!b->irq_armed)
> >   		__intel_breadcrumbs_arm_irq(b);
> > -	spin_unlock(&b->irq_lock);
> > +	spin_unlock_irqrestore(&b->irq_lock, flags);
> >   }
> >   static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
> > @@ -76,10 +78,12 @@ static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
> >   static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
> >   {
> > -	spin_lock(&b->irq_lock);
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&b->irq_lock, flags);
> >   	if (b->irq_armed)
> >   		__intel_breadcrumbs_disarm_irq(b);
> > -	spin_unlock(&b->irq_lock);
> > +	spin_unlock_irqrestore(&b->irq_lock, flags);
> >   }
> >   static void add_signaling_context(struct intel_breadcrumbs *b,
> > @@ -173,6 +177,7 @@ static void signal_irq_work(struct irq_work *work)
> >   	const ktime_t timestamp = ktime_get();
> >   	struct llist_node *signal, *sn;
> >   	struct intel_context *ce;
> > +	unsigned long flags;
> >   	signal = NULL;
> >   	if (unlikely(!llist_empty(&b->signaled_requests)))
> > @@ -226,10 +231,10 @@ static void signal_irq_work(struct irq_work *work)
> >   			 * spinlock as the callback chain may end up adding
> >   			 * more signalers to the same context or engine.
> >   			 */
> > -			spin_lock(&ce->signal_lock);
> > +			spin_lock_irqsave(&ce->signal_lock, flags);
> >   			list_del_rcu(&rq->signal_link);
> >   			release = remove_signaling_context(b, ce);
> > -			spin_unlock(&ce->signal_lock);
> > +			spin_unlock_irqrestore(&ce->signal_lock, flags);
> >   			if (release) {
> >   				if (intel_timeline_is_last(ce->timeline, rq))
> >   					add_retire(b, ce->timeline);
> > @@ -254,11 +259,11 @@ static void signal_irq_work(struct irq_work *work)
> >   		if (rq->engine->sched_engine->retire_inflight_request_prio)
> >   			rq->engine->sched_engine->retire_inflight_request_prio(rq);
> > -		spin_lock(&rq->lock);
> > +		spin_lock_irqsave(&rq->lock, flags);
> >   		list_replace(&rq->fence.cb_list, &cb_list);
> >   		__dma_fence_signal__timestamp(&rq->fence, timestamp);
> >   		__dma_fence_signal__notify(&rq->fence, &cb_list);
> > -		spin_unlock(&rq->lock);
> > +		spin_unlock_irqrestore(&rq->lock, flags);
> >   		i915_request_put(rq);
> >   	}
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 12f1ba7ca9c1..e9102f7246f5 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -4338,10 +4338,11 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq,
> >   static void guc_retire_inflight_request_prio(struct i915_request *rq)
> >   {
> >   	struct intel_context *ce = request_to_scheduling_context(rq);
> > +	unsigned long flags;
> > -	spin_lock(&ce->guc_state.lock);
> > +	spin_lock_irqsave(&ce->guc_state.lock, flags);
> >   	guc_prio_fini(rq, ce);
> > -	spin_unlock(&ce->guc_state.lock);
> > +	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> >   }
> >   static void sanitize_hwsp(struct intel_engine_cs *engine)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index cc866773ba6f..dd5542726b41 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -53,13 +53,15 @@  static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 
 static void intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
+	unsigned long flags;
+
 	if (!b->irq_engine)
 		return;
 
-	spin_lock(&b->irq_lock);
+	spin_lock_irqsave(&b->irq_lock, flags);
 	if (!b->irq_armed)
 		__intel_breadcrumbs_arm_irq(b);
-	spin_unlock(&b->irq_lock);
+	spin_unlock_irqrestore(&b->irq_lock, flags);
 }
 
 static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
@@ -76,10 +78,12 @@  static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
 
 static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
 {
-	spin_lock(&b->irq_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&b->irq_lock, flags);
 	if (b->irq_armed)
 		__intel_breadcrumbs_disarm_irq(b);
-	spin_unlock(&b->irq_lock);
+	spin_unlock_irqrestore(&b->irq_lock, flags);
 }
 
 static void add_signaling_context(struct intel_breadcrumbs *b,
@@ -173,6 +177,7 @@  static void signal_irq_work(struct irq_work *work)
 	const ktime_t timestamp = ktime_get();
 	struct llist_node *signal, *sn;
 	struct intel_context *ce;
+	unsigned long flags;
 
 	signal = NULL;
 	if (unlikely(!llist_empty(&b->signaled_requests)))
@@ -226,10 +231,10 @@  static void signal_irq_work(struct irq_work *work)
 			 * spinlock as the callback chain may end up adding
 			 * more signalers to the same context or engine.
 			 */
-			spin_lock(&ce->signal_lock);
+			spin_lock_irqsave(&ce->signal_lock, flags);
 			list_del_rcu(&rq->signal_link);
 			release = remove_signaling_context(b, ce);
-			spin_unlock(&ce->signal_lock);
+			spin_unlock_irqrestore(&ce->signal_lock, flags);
 			if (release) {
 				if (intel_timeline_is_last(ce->timeline, rq))
 					add_retire(b, ce->timeline);
@@ -254,11 +259,11 @@  static void signal_irq_work(struct irq_work *work)
 		if (rq->engine->sched_engine->retire_inflight_request_prio)
 			rq->engine->sched_engine->retire_inflight_request_prio(rq);
 
-		spin_lock(&rq->lock);
+		spin_lock_irqsave(&rq->lock, flags);
 		list_replace(&rq->fence.cb_list, &cb_list);
 		__dma_fence_signal__timestamp(&rq->fence, timestamp);
 		__dma_fence_signal__notify(&rq->fence, &cb_list);
-		spin_unlock(&rq->lock);
+		spin_unlock_irqrestore(&rq->lock, flags);
 
 		i915_request_put(rq);
 	}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 12f1ba7ca9c1..e9102f7246f5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4338,10 +4338,11 @@  static void guc_bump_inflight_request_prio(struct i915_request *rq,
 static void guc_retire_inflight_request_prio(struct i915_request *rq)
 {
 	struct intel_context *ce = request_to_scheduling_context(rq);
+	unsigned long flags;
 
-	spin_lock(&ce->guc_state.lock);
+	spin_lock_irqsave(&ce->guc_state.lock, flags);
 	guc_prio_fini(rq, ce);
-	spin_unlock(&ce->guc_state.lock);
+	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 }
 
 static void sanitize_hwsp(struct intel_engine_cs *engine)