diff mbox series

[v2,1/3] drm/i915/guc: Flush context destruction worker at suspend

Message ID 20230815011210.1188379-2-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series Resolve suspend-resume racing with GuC destroy-context-worker | expand

Commit Message

Teres Alexis, Alan Previn Aug. 15, 2023, 1:12 a.m. UTC
When suspending, flush the context-guc-id
deregistration worker at the final stages of
intel_gt_suspend_late when we finally call gt_sanitize
that eventually leads down to __uc_sanitize so that
the deregistration worker doesn't fire off later as
we reset the GuC microcontroller.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h | 2 ++
 drivers/gpu/drm/i915/gt/uc/intel_uc.c             | 2 ++
 3 files changed, 9 insertions(+)

Comments

Rodrigo Vivi Aug. 15, 2023, 1:53 p.m. UTC | #1
On Mon, Aug 14, 2023 at 06:12:08PM -0700, Alan Previn wrote:
> When suspending, flush the context-guc-id
> deregistration worker at the final stages of
> intel_gt_suspend_late when we finally call gt_sanitize
> that eventually leads down to __uc_sanitize so that
> the deregistration worker doesn't fire off later as
> we reset the GuC microcontroller.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +++++
>  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h | 2 ++
>  drivers/gpu/drm/i915/gt/uc/intel_uc.c             | 2 ++
>  3 files changed, 9 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a0e3ef1c65d2..050572bb8dbe 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1578,6 +1578,11 @@ static void guc_flush_submissions(struct intel_guc *guc)
>  	spin_unlock_irqrestore(&sched_engine->lock, flags);
>  }
>  
> +void intel_guc_submission_flush_work(struct intel_guc *guc)
> +{
> +	flush_work(&guc->submission_state.destroyed_worker);
> +}
> +
>  static void guc_flush_destroyed_contexts(struct intel_guc *guc);
>  
>  void intel_guc_submission_reset_prepare(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index c57b29cdb1a6..b6df75622d3b 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -38,6 +38,8 @@ int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
>  				   bool interruptible,
>  				   long timeout);
>  
> +void intel_guc_submission_flush_work(struct intel_guc *guc);
> +
>  static inline bool intel_guc_submission_is_supported(struct intel_guc *guc)
>  {
>  	return guc->submission_supported;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 98b103375b7a..eb3554cb5ea4 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -693,6 +693,8 @@ void intel_uc_suspend(struct intel_uc *uc)
>  		return;
>  	}
>  
> +	intel_guc_submission_flush_work(guc);
> +

what happens if a new job comes exactly here?
This still sounds a bit racy, although this already looks
much cleaner than the previous version.

>  	with_intel_runtime_pm(&uc_to_gt(uc)->i915->runtime_pm, wakeref) {
>  		err = intel_guc_suspend(guc);
>  		if (err)
> -- 
> 2.39.0
>
Teres Alexis, Alan Previn Aug. 25, 2023, 6:48 p.m. UTC | #2
Thanks again Rodrigo for reviewing and apologies for my tardy replies.
We are stil testing on shipping platforms and these latest patches seemed
to have reduced the frequency and solved the "system hangs" while suspending
but its still causing issues so we continue to debug. (issue is that
its running thousands of cycles overnight on multiple systems and takes
time). 

[snip]

> > @@ -693,6 +693,8 @@ void intel_uc_suspend(struct intel_uc *uc)
> >  		return;
> >  	}
> >  
> > +	intel_guc_submission_flush_work(guc);
> > +
> 
> what happens if a new job comes exactly here?
> This still sounds a bit racy, although this already looks
> much cleaner than the previous version.
alan: intel_uc_suspend is called from suspend-late or init-failure.
and the very next function -> intel_guc_suspend will soft reset the guc
and eventually nuke it. this is the most appropriate "latests as
possible" place to put this.
> 
> >  	with_intel_runtime_pm(&uc_to_gt(uc)->i915->runtime_pm, wakeref) {
> >  		err = intel_guc_suspend(guc);
> >  		if (err)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..050572bb8dbe 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1578,6 +1578,11 @@  static void guc_flush_submissions(struct intel_guc *guc)
 	spin_unlock_irqrestore(&sched_engine->lock, flags);
 }
 
+void intel_guc_submission_flush_work(struct intel_guc *guc)
+{
+	flush_work(&guc->submission_state.destroyed_worker);
+}
+
 static void guc_flush_destroyed_contexts(struct intel_guc *guc);
 
 void intel_guc_submission_reset_prepare(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c57b29cdb1a6..b6df75622d3b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -38,6 +38,8 @@  int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
 				   bool interruptible,
 				   long timeout);
 
+void intel_guc_submission_flush_work(struct intel_guc *guc);
+
 static inline bool intel_guc_submission_is_supported(struct intel_guc *guc)
 {
 	return guc->submission_supported;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 98b103375b7a..eb3554cb5ea4 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -693,6 +693,8 @@  void intel_uc_suspend(struct intel_uc *uc)
 		return;
 	}
 
+	intel_guc_submission_flush_work(guc);
+
 	with_intel_runtime_pm(&uc_to_gt(uc)->i915->runtime_pm, wakeref) {
 		err = intel_guc_suspend(guc);
 		if (err)