diff mbox series

[4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

Message ID 20211211005612.8575-5-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Fix stealing guc_ids + test | expand

Commit Message

Matthew Brost Dec. 11, 2021, 12:56 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue was
full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ++++++++++++-------
 1 file changed, 28 insertions(+), 17 deletions(-)

Comments

John Harrison Dec. 11, 2021, 1:07 a.m. UTC | #1
On 12/10/2021 16:56, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> While attempting to debug a CT deadlock issue in various CI failures
> (most easily reproduced with gem_ctx_create/basic-files), I was seeing
> CPU deadlock errors being reported. This were because the context
> destroy loop was blocking waiting on H2G space from inside an IRQ
> spinlock. There was deadlock as such, it's just that the H2G queue was
There was *no* deadlock as such

John.

> full of context destroy commands and GuC was taking a long time to
> process them. However, the kernel was seeing the large amount of time
> spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
> then happen (heartbeat failures, CT deadlock errors, outstanding H2G
> WARNs, etc.).
>
> Re-working the loop to only acquire the spinlock around the list
> management (which is all it is meant to protect) rather than the
> entire destroy operation seems to fix all the above issues.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ++++++++++++-------
>   1 file changed, 28 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 36c2965db49b..96fcf869e3ff 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
>   	unsigned long flags;
>   	bool disabled;
>   
> -	lockdep_assert_held(&guc->submission_state.lock);
>   	GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
>   	GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
>   	GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
> @@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
>   	}
>   	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>   	if (unlikely(disabled)) {
> -		__release_guc_id(guc, ce);
> +		release_guc_id(guc, ce);
>   		__guc_context_destroy(ce);
>   		return;
>   	}
> @@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context *ce)
>   
>   static void guc_flush_destroyed_contexts(struct intel_guc *guc)
>   {
> -	struct intel_context *ce, *cn;
> +	struct intel_context *ce;
>   	unsigned long flags;
>   
>   	GEM_BUG_ON(!submission_disabled(guc) &&
>   		   guc_submission_initialized(guc));
>   
> -	spin_lock_irqsave(&guc->submission_state.lock, flags);
> -	list_for_each_entry_safe(ce, cn,
> -				 &guc->submission_state.destroyed_contexts,
> -				 destroyed_link) {
> -		list_del_init(&ce->destroyed_link);
> -		__release_guc_id(guc, ce);
> +	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
> +		spin_lock_irqsave(&guc->submission_state.lock, flags);
> +		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
> +					      struct intel_context,
> +					      destroyed_link);
> +		if (ce)
> +			list_del_init(&ce->destroyed_link);
> +		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> +
> +		if (!ce)
> +			break;
> +
> +		release_guc_id(guc, ce);
>   		__guc_context_destroy(ce);
>   	}
> -	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
>   }
>   
>   static void deregister_destroyed_contexts(struct intel_guc *guc)
>   {
> -	struct intel_context *ce, *cn;
> +	struct intel_context *ce;
>   	unsigned long flags;
>   
> -	spin_lock_irqsave(&guc->submission_state.lock, flags);
> -	list_for_each_entry_safe(ce, cn,
> -				 &guc->submission_state.destroyed_contexts,
> -				 destroyed_link) {
> -		list_del_init(&ce->destroyed_link);
> +	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
> +		spin_lock_irqsave(&guc->submission_state.lock, flags);
> +		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
> +					      struct intel_context,
> +					      destroyed_link);
> +		if (ce)
> +			list_del_init(&ce->destroyed_link);
> +		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> +
> +		if (!ce)
> +			break;
> +
>   		guc_lrc_desc_unpin(ce);
>   	}
> -	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
>   }
>   
>   static void destroyed_worker_func(struct work_struct *w)
Matthew Brost Dec. 11, 2021, 1:10 a.m. UTC | #2
On Fri, Dec 10, 2021 at 05:07:12PM -0800, John Harrison wrote:
> On 12/10/2021 16:56, Matthew Brost wrote:
> > From: John Harrison <John.C.Harrison@Intel.com>
> > 
> > While attempting to debug a CT deadlock issue in various CI failures
> > (most easily reproduced with gem_ctx_create/basic-files), I was seeing
> > CPU deadlock errors being reported. This were because the context
> > destroy loop was blocking waiting on H2G space from inside an IRQ
> > spinlock. There was deadlock as such, it's just that the H2G queue was
> There was *no* deadlock as such
> 

Let's fix this up when applying the series.

With that:
Reviewed-by: Matthew Brost <matthew.brost@intel.com>

> John.
> 
> > full of context destroy commands and GuC was taking a long time to
> > process them. However, the kernel was seeing the large amount of time
> > spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
> > then happen (heartbeat failures, CT deadlock errors, outstanding H2G
> > WARNs, etc.).
> > 
> > Re-working the loop to only acquire the spinlock around the list
> > management (which is all it is meant to protect) rather than the
> > entire destroy operation seems to fix all the above issues.
> > 
> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > ---
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ++++++++++++-------
> >   1 file changed, 28 insertions(+), 17 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 36c2965db49b..96fcf869e3ff 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
> >   	unsigned long flags;
> >   	bool disabled;
> > -	lockdep_assert_held(&guc->submission_state.lock);
> >   	GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
> >   	GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
> >   	GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
> > @@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
> >   	}
> >   	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> >   	if (unlikely(disabled)) {
> > -		__release_guc_id(guc, ce);
> > +		release_guc_id(guc, ce);
> >   		__guc_context_destroy(ce);
> >   		return;
> >   	}
> > @@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context *ce)
> >   static void guc_flush_destroyed_contexts(struct intel_guc *guc)
> >   {
> > -	struct intel_context *ce, *cn;
> > +	struct intel_context *ce;
> >   	unsigned long flags;
> >   	GEM_BUG_ON(!submission_disabled(guc) &&
> >   		   guc_submission_initialized(guc));
> > -	spin_lock_irqsave(&guc->submission_state.lock, flags);
> > -	list_for_each_entry_safe(ce, cn,
> > -				 &guc->submission_state.destroyed_contexts,
> > -				 destroyed_link) {
> > -		list_del_init(&ce->destroyed_link);
> > -		__release_guc_id(guc, ce);
> > +	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
> > +		spin_lock_irqsave(&guc->submission_state.lock, flags);
> > +		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
> > +					      struct intel_context,
> > +					      destroyed_link);
> > +		if (ce)
> > +			list_del_init(&ce->destroyed_link);
> > +		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> > +
> > +		if (!ce)
> > +			break;
> > +
> > +		release_guc_id(guc, ce);
> >   		__guc_context_destroy(ce);
> >   	}
> > -	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> >   }
> >   static void deregister_destroyed_contexts(struct intel_guc *guc)
> >   {
> > -	struct intel_context *ce, *cn;
> > +	struct intel_context *ce;
> >   	unsigned long flags;
> > -	spin_lock_irqsave(&guc->submission_state.lock, flags);
> > -	list_for_each_entry_safe(ce, cn,
> > -				 &guc->submission_state.destroyed_contexts,
> > -				 destroyed_link) {
> > -		list_del_init(&ce->destroyed_link);
> > +	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
> > +		spin_lock_irqsave(&guc->submission_state.lock, flags);
> > +		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
> > +					      struct intel_context,
> > +					      destroyed_link);
> > +		if (ce)
> > +			list_del_init(&ce->destroyed_link);
> > +		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> > +
> > +		if (!ce)
> > +			break;
> > +
> >   		guc_lrc_desc_unpin(ce);
> >   	}
> > -	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
> >   }
> >   static void destroyed_worker_func(struct work_struct *w)
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@  static inline void guc_lrc_desc_unpin(struct intel_context *ce)
 	unsigned long flags;
 	bool disabled;
 
-	lockdep_assert_held(&guc->submission_state.lock);
 	GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
 	GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
 	GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@  static inline void guc_lrc_desc_unpin(struct intel_context *ce)
 	}
 	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 	if (unlikely(disabled)) {
-		__release_guc_id(guc, ce);
+		release_guc_id(guc, ce);
 		__guc_context_destroy(ce);
 		return;
 	}
@@ -2694,36 +2693,48 @@  static void __guc_context_destroy(struct intel_context *ce)
 
 static void guc_flush_destroyed_contexts(struct intel_guc *guc)
 {
-	struct intel_context *ce, *cn;
+	struct intel_context *ce;
 	unsigned long flags;
 
 	GEM_BUG_ON(!submission_disabled(guc) &&
 		   guc_submission_initialized(guc));
 
-	spin_lock_irqsave(&guc->submission_state.lock, flags);
-	list_for_each_entry_safe(ce, cn,
-				 &guc->submission_state.destroyed_contexts,
-				 destroyed_link) {
-		list_del_init(&ce->destroyed_link);
-		__release_guc_id(guc, ce);
+	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+		spin_lock_irqsave(&guc->submission_state.lock, flags);
+		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+					      struct intel_context,
+					      destroyed_link);
+		if (ce)
+			list_del_init(&ce->destroyed_link);
+		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+		if (!ce)
+			break;
+
+		release_guc_id(guc, ce);
 		__guc_context_destroy(ce);
 	}
-	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void deregister_destroyed_contexts(struct intel_guc *guc)
 {
-	struct intel_context *ce, *cn;
+	struct intel_context *ce;
 	unsigned long flags;
 
-	spin_lock_irqsave(&guc->submission_state.lock, flags);
-	list_for_each_entry_safe(ce, cn,
-				 &guc->submission_state.destroyed_contexts,
-				 destroyed_link) {
-		list_del_init(&ce->destroyed_link);
+	while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+		spin_lock_irqsave(&guc->submission_state.lock, flags);
+		ce = list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+					      struct intel_context,
+					      destroyed_link);
+		if (ce)
+			list_del_init(&ce->destroyed_link);
+		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+		if (!ce)
+			break;
+
 		guc_lrc_desc_unpin(ce);
 	}
-	spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void destroyed_worker_func(struct work_struct *w)