diff mbox

[v2,2/2] drm/i915: Abandon the reset if we fail to stop the engines

Message ID 20171027104011.2341-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Oct. 27, 2017, 10:40 a.m. UTC
Some machines, *cough* snb *cough*, fail catastrophically if asked to
reset the GPU under certain conditions. The initial guess is that this
is when the rings are still busy at the time of the reset request
(because that's a pattern we've seen elsewhere, hence why we do try
gen3_stop_engines() before reset) so abandon the reset and leave the
device wedged, if gen3_stop_engines() fails.

v2: Only give up if not idle after emptying the ring.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103240
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_uncore.c | 43 ++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 10 deletions(-)

Comments

Mika Kuoppala Oct. 27, 2017, 12:18 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Some machines, *cough* snb *cough*, fail catastrophically if asked to
> reset the GPU under certain conditions. The initial guess is that this
> is when the rings are still busy at the time of the reset request
> (because that's a pattern we've seen elsewhere, hence why we do try
> gen3_stop_engines() before reset) so abandon the reset and leave the
> device wedged, if gen3_stop_engines() fails.
>
> v2: Only give up if not idle after emptying the ring.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103240
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_uncore.c | 43 ++++++++++++++++++++++++++++---------
>  1 file changed, 33 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 96ee6b2754be..f80dbff3595f 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1372,7 +1372,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> +static bool gen3_stop_engine(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_private *dev_priv = engine->i915;
>  	const u32 base = engine->mmio_base;
> @@ -1392,26 +1392,46 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>  	I915_WRITE_FW(RING_HEAD(base), 0);
>  	I915_WRITE_FW(RING_TAIL(base), 0);
>  
> +	/* Check acts as a post */
> +	if (intel_wait_for_register_fw(dev_priv,
> +				       mode,
> +				       MODE_IDLE,
> +				       MODE_IDLE,
> +				       1000)) {
> +		DRM_DEBUG_DRIVER("%s: timed out after clearing ring\n",
> +				 engine->name);
> +		return false;
> +	}
> +

I recall that this bailout was the reason I didn't want to
use the stop_ring in intel_ringbuffer.c

Now if you choose to reintroduce the bailout, I think you can
make a generic stop_engine and get rid of the copy.

-Mika

>  	/* The ring must be empty before it is disabled */
>  	I915_WRITE_FW(RING_CTL(base), 0);
> +	POSTING_READ_FW(RING_CTL(base));
>  
> -	/* Check acts as a post */
> -	if (I915_READ_FW(RING_HEAD(base)) != 0)
> -		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> -				 engine->name);
> +	return true;
>  }
>  
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> +static int i915_stop_engines(struct drm_i915_private *dev_priv,
> +			     unsigned engine_mask)
>  {
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
> +	bool idle;
>  
>  	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> +		return true;
>  
> +	idle = true;
>  	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> +		idle &= gen3_stop_engine(engine);
> +	if (idle)
> +		return true;
> +
> +	dev_err(dev_priv->drm.dev, "Failed to stop all engines\n");
> +	for_each_engine_masked(engine, dev_priv, engine_mask, id) {
> +		struct drm_printer p = drm_debug_printer(__func__);
> +		intel_engine_dump(engine, &p);
> +	}
> +	return false;
>  }
>  
>  static bool i915_reset_complete(struct pci_dev *pdev)
> @@ -1772,7 +1792,10 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  		 *
>  		 * FIXME: Wa for more modern gens needs to be validated
>  		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> +		if (!i915_stop_engines(dev_priv, engine_mask)) {
> +			ret = -EIO;
> +			break;
> +		}
>  
>  		ret = -ENODEV;
>  		if (reset)
> -- 
> 2.15.0.rc2
Chris Wilson Oct. 27, 2017, 12:34 p.m. UTC | #2
Quoting Mika Kuoppala (2017-10-27 13:18:44)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Some machines, *cough* snb *cough*, fail catastrophically if asked to
> > reset the GPU under certain conditions. The initial guess is that this
> > is when the rings are still busy at the time of the reset request
> > (because that's a pattern we've seen elsewhere, hence why we do try
> > gen3_stop_engines() before reset) so abandon the reset and leave the
> > device wedged, if gen3_stop_engines() fails.
> >
> > v2: Only give up if not idle after emptying the ring.
> >
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103240
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_uncore.c | 43 ++++++++++++++++++++++++++++---------
> >  1 file changed, 33 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> > index 96ee6b2754be..f80dbff3595f 100644
> > --- a/drivers/gpu/drm/i915/intel_uncore.c
> > +++ b/drivers/gpu/drm/i915/intel_uncore.c
> > @@ -1372,7 +1372,7 @@ int i915_reg_read_ioctl(struct drm_device *dev,
> >       return ret;
> >  }
> >  
> > -static void gen3_stop_engine(struct intel_engine_cs *engine)
> > +static bool gen3_stop_engine(struct intel_engine_cs *engine)
> >  {
> >       struct drm_i915_private *dev_priv = engine->i915;
> >       const u32 base = engine->mmio_base;
> > @@ -1392,26 +1392,46 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
> >       I915_WRITE_FW(RING_HEAD(base), 0);
> >       I915_WRITE_FW(RING_TAIL(base), 0);
> >  
> > +     /* Check acts as a post */
> > +     if (intel_wait_for_register_fw(dev_priv,
> > +                                    mode,
> > +                                    MODE_IDLE,
> > +                                    MODE_IDLE,
> > +                                    1000)) {
> > +             DRM_DEBUG_DRIVER("%s: timed out after clearing ring\n",
> > +                              engine->name);
> > +             return false;
> > +     }
> > +
> 
> I recall that this bailout was the reason I didn't want to
> use the stop_ring in intel_ringbuffer.c
> 
> Now if you choose to reintroduce the bailout, I think you can
> make a generic stop_engine and get rid of the copy.

This bailout isn't looking too hot. For sure it prevents snb from
eating itself in this situation, but it looks too general and the
collateral damage is unacceptable.

Atm, one lead I have is that the RING_CTL change doesn't take. That may
indeed relate to Ville's observation that it only takes effect when the
CP hits an arbitration point. But in the igt, we use... oh no we don't,
not for inject hang.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 96ee6b2754be..f80dbff3595f 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1372,7 +1372,7 @@  int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
+static bool gen3_stop_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	const u32 base = engine->mmio_base;
@@ -1392,26 +1392,46 @@  static void gen3_stop_engine(struct intel_engine_cs *engine)
 	I915_WRITE_FW(RING_HEAD(base), 0);
 	I915_WRITE_FW(RING_TAIL(base), 0);
 
+	/* Check acts as a post */
+	if (intel_wait_for_register_fw(dev_priv,
+				       mode,
+				       MODE_IDLE,
+				       MODE_IDLE,
+				       1000)) {
+		DRM_DEBUG_DRIVER("%s: timed out after clearing ring\n",
+				 engine->name);
+		return false;
+	}
+
 	/* The ring must be empty before it is disabled */
 	I915_WRITE_FW(RING_CTL(base), 0);
+	POSTING_READ_FW(RING_CTL(base));
 
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
+	return true;
 }
 
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
+static int i915_stop_engines(struct drm_i915_private *dev_priv,
+			     unsigned engine_mask)
 {
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	bool idle;
 
 	if (INTEL_GEN(dev_priv) < 3)
-		return;
+		return true;
 
+	idle = true;
 	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
+		idle &= gen3_stop_engine(engine);
+	if (idle)
+		return true;
+
+	dev_err(dev_priv->drm.dev, "Failed to stop all engines\n");
+	for_each_engine_masked(engine, dev_priv, engine_mask, id) {
+		struct drm_printer p = drm_debug_printer(__func__);
+		intel_engine_dump(engine, &p);
+	}
+	return false;
 }
 
 static bool i915_reset_complete(struct pci_dev *pdev)
@@ -1772,7 +1792,10 @@  int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		 *
 		 * FIXME: Wa for more modern gens needs to be validated
 		 */
-		i915_stop_engines(dev_priv, engine_mask);
+		if (!i915_stop_engines(dev_priv, engine_mask)) {
+			ret = -EIO;
+			break;
+		}
 
 		ret = -ENODEV;
 		if (reset)