Message ID | 20240807091014.469992-1-andi.shyti@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915/gt: Mark the GT as dead when mmio is unreliable | expand |
-----Original Message----- From: Andi Shyti <andi.shyti@linux.intel.com> Sent: Wednesday, August 7, 2024 2:10 AM To: intel-gfx <intel-gfx@lists.freedesktop.org>; dri-devel <dri-devel@lists.freedesktop.org> Cc: Chris Wilson <chris.p.wilson@linux.intel.com>; Das, Nirmoy <nirmoy.das@intel.com>; Cavitt, Jonathan <jonathan.cavitt@intel.com>; Andi Shyti <andi.shyti@linux.intel.com> Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable > > From: Chris Wilson <chris.p.wilson@intel.com> > > After we detect that mmio is returning all 0xff, we believe that the GPU > has dropped off the pci bus and is dead. Mark the device as wedged such > that we can propagate the failure back to userspace and wait for > recovery. > > Signed-off-by: Chris Wilson <chris.p.wilson@intel.com> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> LGTM. Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com> -Jonathan Cavitt > --- > drivers/gpu/drm/i915/gt/intel_gt.h | 6 ++++++ > drivers/gpu/drm/i915/gt/intel_gt_types.h | 2 ++ > drivers/gpu/drm/i915/gt/intel_reset.c | 12 +++++++++++- > drivers/gpu/drm/i915/intel_uncore.c | 7 +++++-- > 4 files changed, 24 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h > index b5e114d284ad..b73555889d50 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt.h > @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, > void intel_gt_bind_context_set_ready(struct intel_gt *gt); > void intel_gt_bind_context_set_unready(struct intel_gt *gt); > bool intel_gt_is_bind_context_ready(struct intel_gt *gt); > + > +static inline void intel_gt_set_wedged_async(struct intel_gt *gt) > +{ > + queue_work(system_highpri_wq, &gt->wedge); > +} > + > #endif /* __INTEL_GT_H__ */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h > index cfdd2ad5e954..bcee084b1f27 100644 > --- 
a/drivers/gpu/drm/i915/gt/intel_gt_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h > @@ -292,6 +292,8 @@ struct intel_gt { > struct gt_defaults defaults; > struct kobject *sysfs_defaults; > > + struct work_struct wedge; > + > struct i915_perf_gt perf; > > /** link: &ggtt.gt_list */ > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c > index 735cd23a43c6..8f1ea95471ef 100644 > --- a/drivers/gpu/drm/i915/gt/intel_reset.c > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c > @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) > GT_TRACE(gt, "end\n"); > } > > +static void set_wedged_work(struct work_struct *w) > +{ > + struct intel_gt *gt = container_of(w, struct intel_gt, wedge); > + intel_wakeref_t wf; > + > + with_intel_runtime_pm(gt->uncore->rpm, wf) > + __intel_gt_set_wedged(gt); > +} > + > void intel_gt_set_wedged(struct intel_gt *gt) > { > intel_wakeref_t wakeref; > @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt) > init_waitqueue_head(&gt->reset.queue); > mutex_init(&gt->reset.mutex); > init_srcu_struct(&gt->reset.backoff_srcu); > + INIT_WORK(&gt->wedge, set_wedged_work); > > /* > * While undesirable to wait inside the shrinker, complain anyway. 
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work) > struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); > > gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name); > - intel_gt_set_wedged(w->gt); > + set_wedged_work(&w->gt->wedge); > } > > void __intel_init_wedge(struct intel_wedge_me *w, > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > index 2eba289d88ad..6aa179a3e92a 100644 > --- a/drivers/gpu/drm/i915/intel_uncore.c > +++ b/drivers/gpu/drm/i915/intel_uncore.c > @@ -24,6 +24,7 @@ > #include <drm/drm_managed.h> > #include <linux/pm_runtime.h> > > +#include "gt/intel_gt.h" > #include "gt/intel_engine_regs.h" > #include "gt/intel_gt_regs.h" > > @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d) > if (!wait_ack_clear(d, FORCEWAKE_KERNEL)) > return; > > - if (fw_ack(d) == ~0) > + if (fw_ack(d) == ~0) { > drm_err(&d->uncore->i915->drm, > "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n", > intel_uncore_forcewake_domain_to_str(d->id)); > - else > + intel_gt_set_wedged_async(d->uncore->gt); > + } else { > drm_err(&d->uncore->i915->drm, > "%s: timed out waiting for forcewake ack to clear.\n", > intel_uncore_forcewake_domain_to_str(d->id)); > + } > > add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */ > } > -- > 2.45.2 > >
Hi, On Wed, Aug 07, 2024 at 10:10:14AM +0100, Andi Shyti wrote: > From: Chris Wilson <chris.p.wilson@intel.com> > > After we detect that mmio is returning all 0xff, we believe that the GPU > has dropped off the pci bus and is dead. Mark the device as wedged such > that we can propagate the failure back to userspace and wait for > recovery. > > Signed-off-by: Chris Wilson <chris.p.wilson@intel.com> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> merged to drm-intel-gt-next. Thanks, Andi
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index b5e114d284ad..b73555889d50 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, void intel_gt_bind_context_set_ready(struct intel_gt *gt); void intel_gt_bind_context_set_unready(struct intel_gt *gt); bool intel_gt_is_bind_context_ready(struct intel_gt *gt); + +static inline void intel_gt_set_wedged_async(struct intel_gt *gt) +{ + queue_work(system_highpri_wq, &gt->wedge); +} + #endif /* __INTEL_GT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index cfdd2ad5e954..bcee084b1f27 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -292,6 +292,8 @@ struct intel_gt { struct gt_defaults defaults; struct kobject *sysfs_defaults; + struct work_struct wedge; + struct i915_perf_gt perf; /** link: &ggtt.gt_list */ diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 735cd23a43c6..8f1ea95471ef 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) GT_TRACE(gt, "end\n"); } +static void set_wedged_work(struct work_struct *w) +{ + struct intel_gt *gt = container_of(w, struct intel_gt, wedge); + intel_wakeref_t wf; + + with_intel_runtime_pm(gt->uncore->rpm, wf) + __intel_gt_set_wedged(gt); +} + void intel_gt_set_wedged(struct intel_gt *gt) { intel_wakeref_t wakeref; @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt) init_waitqueue_head(&gt->reset.queue); mutex_init(&gt->reset.mutex); init_srcu_struct(&gt->reset.backoff_srcu); + INIT_WORK(&gt->wedge, set_wedged_work); /* * While undesirable to wait inside the shrinker, complain anyway. 
@@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work) struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name); - intel_gt_set_wedged(w->gt); + set_wedged_work(&w->gt->wedge); } void __intel_init_wedge(struct intel_wedge_me *w, diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 2eba289d88ad..6aa179a3e92a 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -24,6 +24,7 @@ #include <drm/drm_managed.h> #include <linux/pm_runtime.h> +#include "gt/intel_gt.h" #include "gt/intel_engine_regs.h" #include "gt/intel_gt_regs.h" @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d) if (!wait_ack_clear(d, FORCEWAKE_KERNEL)) return; - if (fw_ack(d) == ~0) + if (fw_ack(d) == ~0) { drm_err(&d->uncore->i915->drm, "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n", intel_uncore_forcewake_domain_to_str(d->id)); - else + intel_gt_set_wedged_async(d->uncore->gt); + } else { drm_err(&d->uncore->i915->drm, "%s: timed out waiting for forcewake ack to clear.\n", intel_uncore_forcewake_domain_to_str(d->id)); + } add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */ }