diff mbox series

drm/i915/gt: Mark the GT as dead when mmio is unreliable

Message ID 20240807091014.469992-1-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/gt: Mark the GT as dead when mmio is unreliable | expand

Commit Message

Andi Shyti Aug. 7, 2024, 9:10 a.m. UTC
From: Chris Wilson <chris.p.wilson@intel.com>

After we detect that mmio is returning all 0xff, we believe that the GPU
has dropped off the pci bus and is dead. Mark the device as wedged such
that we can propagate the failure back to userspace and wait for
recovery.

Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt.h       |  6 ++++++
 drivers/gpu/drm/i915/gt/intel_gt_types.h |  2 ++
 drivers/gpu/drm/i915/gt/intel_reset.c    | 12 +++++++++++-
 drivers/gpu/drm/i915/intel_uncore.c      |  7 +++++--
 4 files changed, 24 insertions(+), 3 deletions(-)

Comments

Cavitt, Jonathan Aug. 7, 2024, 2:06 p.m. UTC | #1
-----Original Message-----
From: Andi Shyti <andi.shyti@linux.intel.com> 
Sent: Wednesday, August 7, 2024 2:10 AM
To: intel-gfx <intel-gfx@lists.freedesktop.org>; dri-devel <dri-devel@lists.freedesktop.org>
Cc: Chris Wilson <chris.p.wilson@linux.intel.com>; Das, Nirmoy <nirmoy.das@intel.com>; Cavitt, Jonathan <jonathan.cavitt@intel.com>; Andi Shyti <andi.shyti@linux.intel.com>
Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable
> 
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> After we detect that mmio is returning all 0xff, we believe that the GPU
> has dropped off the pci bus and is dead. Mark the device as wedged such
> that we can propagate the failure back to userspace and wait for
> recovery.
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>

LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

> ---
>  drivers/gpu/drm/i915/gt/intel_gt.h       |  6 ++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h |  2 ++
>  drivers/gpu/drm/i915/gt/intel_reset.c    | 12 +++++++++++-
>  drivers/gpu/drm/i915/intel_uncore.c      |  7 +++++--
>  4 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index b5e114d284ad..b73555889d50 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
>  void intel_gt_bind_context_set_ready(struct intel_gt *gt);
>  void intel_gt_bind_context_set_unready(struct intel_gt *gt);
>  bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
> +
> +static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
> +{
> +	queue_work(system_highpri_wq, &gt->wedge);
> +}
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index cfdd2ad5e954..bcee084b1f27 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -292,6 +292,8 @@ struct intel_gt {
>  	struct gt_defaults defaults;
>  	struct kobject *sysfs_defaults;
>  
> +	struct work_struct wedge;
> +
>  	struct i915_perf_gt perf;
>  
>  	/** link: &ggtt.gt_list */
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 735cd23a43c6..8f1ea95471ef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
>  	GT_TRACE(gt, "end\n");
>  }
>  
> +static void set_wedged_work(struct work_struct *w)
> +{
> +	struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
> +	intel_wakeref_t wf;
> +
> +	with_intel_runtime_pm(gt->uncore->rpm, wf)
> +		__intel_gt_set_wedged(gt);
> +}
> +
>  void intel_gt_set_wedged(struct intel_gt *gt)
>  {
>  	intel_wakeref_t wakeref;
> @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt)
>  	init_waitqueue_head(&gt->reset.queue);
>  	mutex_init(&gt->reset.mutex);
>  	init_srcu_struct(&gt->reset.backoff_srcu);
> +	INIT_WORK(&gt->wedge, set_wedged_work);
>  
>  	/*
>  	 * While undesirable to wait inside the shrinker, complain anyway.
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work)
>  	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
>  
>  	gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
> -	intel_gt_set_wedged(w->gt);
> +	set_wedged_work(&w->gt->wedge);
>  }
>  
>  void __intel_init_wedge(struct intel_wedge_me *w,
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 2eba289d88ad..6aa179a3e92a 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -24,6 +24,7 @@
>  #include <drm/drm_managed.h>
>  #include <linux/pm_runtime.h>
>  
> +#include "gt/intel_gt.h"
>  #include "gt/intel_engine_regs.h"
>  #include "gt/intel_gt_regs.h"
>  
> @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
>  	if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
>  		return;
>  
> -	if (fw_ack(d) == ~0)
> +	if (fw_ack(d) == ~0) {
>  		drm_err(&d->uncore->i915->drm,
>  			"%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
>  			intel_uncore_forcewake_domain_to_str(d->id));
> -	else
> +		intel_gt_set_wedged_async(d->uncore->gt);
> +	} else {
>  		drm_err(&d->uncore->i915->drm,
>  			"%s: timed out waiting for forcewake ack to clear.\n",
>  			intel_uncore_forcewake_domain_to_str(d->id));
> +	}
>  
>  	add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
>  }
> -- 
> 2.45.2
> 
>
Andi Shyti Aug. 9, 2024, 11:15 a.m. UTC | #2
Hi,

> Series:  drm/i915/gt: Mark the GT as dead when mmio is unreliable (rev5)
> URL:     https://patchwork.freedesktop.org/series/136975/
> State:   failure
> Details: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_136975v5/index.html

...

> Possible new issues
> 
> Here are the unknown changes that may have been introduced in
> Patchwork_136975v5_full:
> 
> IGT changes
> 
> Possible regressions
> 
>   • igt@gem_cs_tlb@engines@bcs0:
> 
>       □ shard-dg1: NOTRUN -> INCOMPLETE
>   • igt@i915_pm_rps@thresholds:

For an excess of zeal I kept testing this series because this
test was consistently failing but it's absolutely unrelated.

This deserves an igt fix.

Cc'eing Kasa to see whether there is any action that needs to be
taken and Tejas to set up CI filters.

I'm going ahead and merge the patch.

Thanks,
Andi
Andi Shyti Aug. 9, 2024, 12:07 p.m. UTC | #3
Hi,

On Wed, Aug 07, 2024 at 10:10:14AM +0100, Andi Shyti wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> After we detect that mmio is returning all 0xff, we believe that the GPU
> has dropped off the pci bus and is dead. Mark the device as wedged such
> that we can propagate the failure back to userspace and wait for
> recovery.
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>

merged to drm-intel-gt-next.

Thanks,
Andi
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
index b5e114d284ad..b73555889d50 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt.h
@@ -208,4 +208,10 @@  enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
 void intel_gt_bind_context_set_ready(struct intel_gt *gt);
 void intel_gt_bind_context_set_unready(struct intel_gt *gt);
 bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
+
+static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
+{
+	queue_work(system_highpri_wq, &gt->wedge);
+}
+
 #endif /* __INTEL_GT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index cfdd2ad5e954..bcee084b1f27 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -292,6 +292,8 @@  struct intel_gt {
 	struct gt_defaults defaults;
 	struct kobject *sysfs_defaults;
 
+	struct work_struct wedge;
+
 	struct i915_perf_gt perf;
 
 	/** link: &ggtt.gt_list */
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 735cd23a43c6..8f1ea95471ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1013,6 +1013,15 @@  static void __intel_gt_set_wedged(struct intel_gt *gt)
 	GT_TRACE(gt, "end\n");
 }
 
+static void set_wedged_work(struct work_struct *w)
+{
+	struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
+	intel_wakeref_t wf;
+
+	with_intel_runtime_pm(gt->uncore->rpm, wf)
+		__intel_gt_set_wedged(gt);
+}
+
 void intel_gt_set_wedged(struct intel_gt *gt)
 {
 	intel_wakeref_t wakeref;
@@ -1614,6 +1623,7 @@  void intel_gt_init_reset(struct intel_gt *gt)
 	init_waitqueue_head(&gt->reset.queue);
 	mutex_init(&gt->reset.mutex);
 	init_srcu_struct(&gt->reset.backoff_srcu);
+	INIT_WORK(&gt->wedge, set_wedged_work);
 
 	/*
 	 * While undesirable to wait inside the shrinker, complain anyway.
@@ -1640,7 +1650,7 @@  static void intel_wedge_me(struct work_struct *work)
 	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
 
 	gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
-	intel_gt_set_wedged(w->gt);
+	set_wedged_work(&w->gt->wedge);
 }
 
 void __intel_init_wedge(struct intel_wedge_me *w,
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 2eba289d88ad..6aa179a3e92a 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -24,6 +24,7 @@ 
 #include <drm/drm_managed.h>
 #include <linux/pm_runtime.h>
 
+#include "gt/intel_gt.h"
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_gt_regs.h"
 
@@ -180,14 +181,16 @@  fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
 	if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
 		return;
 
-	if (fw_ack(d) == ~0)
+	if (fw_ack(d) == ~0) {
 		drm_err(&d->uncore->i915->drm,
 			"%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
 			intel_uncore_forcewake_domain_to_str(d->id));
-	else
+		intel_gt_set_wedged_async(d->uncore->gt);
+	} else {
 		drm_err(&d->uncore->i915->drm,
 			"%s: timed out waiting for forcewake ack to clear.\n",
 			intel_uncore_forcewake_domain_to_str(d->id));
+	}
 
 	add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
 }