drm/i915: Abandon the reset if we fail to stop the engines

Message ID	20171026121212.22900-1-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Thu, 26 Oct 2017 13:12:12 +0100 Message-Id: <20171026121212.22900-1-chris@chris-wilson.co.uk> Subject: [Intel-gfx] [PATCH] drm/i915: Abandon the reset if we fail to stop the engines Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Message ID

20171026121212.22900-1-chris@chris-wilson.co.uk (mailing list archive)

State

New, archived

Headers

From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Thu, 26 Oct 2017 13:12:12 +0100
Message-Id: <20171026121212.22900-1-chris@chris-wilson.co.uk>
Subject: [Intel-gfx] [PATCH] drm/i915: Abandon the reset if we fail to stop
	the engines
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Commit Message

Chris Wilson Oct. 26, 2017, 12:12 p.m. UTC

Some machines, *cough* snb *cough*, fail catastrophically if asked to
reset the GPU under certain conditions. The initial guess is that this
is when the rings are still busy at the time of the reset request
(because that's a pattern we've seen elsewhere, hence why we do try
i915_stop_engines() before reset) so abandon the reset and leave the
device wedged if i915_stop_engines() fails.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103240
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
Whee! Let's see how much breaks!
-Chris
---
 drivers/gpu/drm/i915/intel_uncore.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

Comments

Chris Wilson Oct. 26, 2017, 12:46 p.m. UTC | #1

Quoting Patchwork (2017-10-26 13:33:06)
> == Series Details ==
> 
> Series: drm/i915: Abandon the reset if we fail to stop the engines
> URL   : https://patchwork.freedesktop.org/series/32692/
> State : failure
> 
> == Summary ==
> 
> Series 32692v1 drm/i915: Abandon the reset if we fail to stop the engines
> https://patchwork.freedesktop.org/api/1.0/series/32692/revisions/1/mbox/
> 
> Test core_auth:
>         Subgroup basic-auth:
>                 pass       -> SKIP       (fi-ilk-650)
> Test core_prop_blob:
>         Subgroup basic:
>                 pass       -> SKIP       (fi-ilk-650)
> Test debugfs_test:
>         Subgroup read_all_entries:
>                 pass       -> SKIP       (fi-ilk-650)
> Test drv_getparams_basic:
>         Subgroup basic-eu-total:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-subslice-total:
>                 pass       -> SKIP       (fi-ilk-650)
> Test drv_hangman:
>         Subgroup error-state-basic:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_basic:
>         Subgroup bad-close:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup create-close:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup create-fd-close:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_busy:
>         Subgroup basic-busy-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-hang-default:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_close_race:
>         Subgroup basic-process:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-threads:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_cpu_reloc:
>         Subgroup basic:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_cs_tlb:
>         Subgroup basic-default:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_exec_basic:
>         Subgroup basic-bsd:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-render:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup gtt-bsd:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup gtt-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup gtt-render:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup readonly-bsd:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup readonly-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup readonly-render:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_exec_create:
>         Subgroup basic:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_exec_fence:
>         Subgroup basic-busy-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wait-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-await-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup await-hang-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup nb-await-default:
>                 pass       -> SKIP       (fi-ilk-650)
> Test gem_exec_flush:
>         Subgroup basic-batch-kernel-default-uc:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-batch-kernel-default-wb:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-uc-pro-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-uc-prw-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-uc-ro-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-uc-rw-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-uc-set-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wb-pro-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wb-prw-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wb-ro-before-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wb-ro-default:
>                 pass       -> SKIP       (fi-ilk-650)
>         Subgroup basic-wb-rw-before-default:
> WARNING: Long output truncated
> fi-cnl-y failed to connect after reboot

That's suspiciously little fallout compared to what I expected. We have a
severe lack of GPU reset stress testing.
-Chris

Ville Syrjälä Oct. 26, 2017, 12:59 p.m. UTC | #2

On Thu, Oct 26, 2017 at 01:12:12PM +0100, Chris Wilson wrote:
> Some machines, *cough* snb *cough*, fail catastrophically if asked to
> reset the GPU under certain conditions.

Did we try skipping the gen6_rps_disable() already?

> The initial guess is that this
> is when the rings are still busy at the time of the reset request
> (because that's a pattern we've seen elsewhere, hence why we do try
> gen3_stop_engines() before reset) so abandon the reset and leave the
> device wedged, if gen3_stop_engines() fails.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103240
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
> Whee! Let's see how much breaks!
> -Chris
> ---
>  drivers/gpu/drm/i915/intel_uncore.c | 33 ++++++++++++++++++++++++++-------
>  1 file changed, 26 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 20e3c65c0999..c9a254b6125f 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1372,20 +1372,23 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> +static bool gen3_stop_engine(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_private *dev_priv = engine->i915;
>  	const u32 base = engine->mmio_base;
>  	const i915_reg_t mode = RING_MI_MODE(base);
>  
> +
>  	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
>  	if (intel_wait_for_register_fw(dev_priv,
>  				       mode,
>  				       MODE_IDLE,
>  				       MODE_IDLE,
> -				       500))
> +				       500)) {
>  		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
>  				 engine->name);
> +		return false;
> +	}
>  
>  	I915_WRITE_FW(RING_CTL(base), 0);
>  	I915_WRITE_FW(RING_HEAD(base), 0);
> @@ -1395,19 +1398,32 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>  	if (I915_READ_FW(RING_HEAD(base)) != 0)
>  		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
>  				 engine->name);
> +
> +	return true;
>  }
>  
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> +static int i915_stop_engines(struct drm_i915_private *dev_priv,
> +			     unsigned engine_mask)
>  {
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
> +	bool idle;
>  
>  	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> +		return true;
>  
> +	idle = true;
>  	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> +		idle &= gen3_stop_engine(engine);
> +	if (idle)
> +		return idle;
> +
> +	dev_err(dev_priv->drm.dev, "Failed to stop all engines\n");
> +	for_each_engine_masked(engine, dev_priv, engine_mask, id) {
> +		struct drm_printer p = drm_debug_printer(__func__);
> +		intel_engine_dump(engine, &p);
> +	}
> +	return false;
>  }
>  
>  static bool i915_reset_complete(struct pci_dev *pdev)
> @@ -1768,7 +1784,10 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  		 *
>  		 * FIXME: Wa for more modern gens needs to be validated
>  		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> +		if (!i915_stop_engines(dev_priv, engine_mask)) {
> +			ret = -EIO;
> +			break;
> +		}
>  
>  		ret = -ENODEV;
>  		if (reset)
> -- 
> 2.15.0.rc2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Chris Wilson Oct. 26, 2017, 1:11 p.m. UTC | #3

Quoting Ville Syrjälä (2017-10-26 13:59:05)
> On Thu, Oct 26, 2017 at 01:12:12PM +0100, Chris Wilson wrote:
> > Some machines, *cough* snb *cough*, fail catastrophically if asked to
> > reset the GPU under certain conditions.
> 
> Did we try skipping the gen6_rps_disable() already?

I had thought we had taken that out a while ago... 

commit f2a91d1a6f5960c08f1ca60bd076f4dc020c50c6
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Wed Sep 21 14:51:06 2016 +0100

    drm/i915: Restore current RPS state after reset

removes the frobbing inside i915_reset() itself, but still talks about
RPS needing to be restored... Ok, that's the post-reset stuff to make
sure that the hw/sw tracking align.

We are not touching rc6/rps prior to hitting GDRST. Maybe we should?
(Unless I'm blind and overlooked something in the reset.)
-Chris

Ville Syrjälä Oct. 26, 2017, 1:30 p.m. UTC | #4

On Thu, Oct 26, 2017 at 02:11:22PM +0100, Chris Wilson wrote:
> Quoting Ville Syrjälä (2017-10-26 13:59:05)
> > On Thu, Oct 26, 2017 at 01:12:12PM +0100, Chris Wilson wrote:
> > > Some machines, *cough* snb *cough*, fail catastrophically if asked to
> > > reset the GPU under certain conditions.
> > 
> > Did we try skipping the gen6_rps_disable() already?
> 
> I had thought we had taken that out a while ago... 
> 
> commit f2a91d1a6f5960c08f1ca60bd076f4dc020c50c6
> Author: Chris Wilson <chris@chris-wilson.co.uk>
> Date:   Wed Sep 21 14:51:06 2016 +0100
> 
>     drm/i915: Restore current RPS state after reset
> 
> removes the frobbing inside i915_reset() itself, but still talks about
> RPS needing to be restored... Ok, that's the post-reset stuff to make
> sure that the hw/sw tracking align.

Hmm. Right. It looks like we do the disable+re-enable back to back after
the reset. I guess at that point it should be safe, assuming the reset
actually worked. 

> 
> We are not touching rc6/rps prior to hitting GDRST. Maybe we should?

Based on what I remember that would be more dangerous if the engine is
stuck in a bad way.

So I guess these reset problems are something else then.

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 20e3c65c0999..c9a254b6125f 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1372,20 +1372,23 @@  int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
+static bool gen3_stop_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	const u32 base = engine->mmio_base;
 	const i915_reg_t mode = RING_MI_MODE(base);
 
+
 	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
 	if (intel_wait_for_register_fw(dev_priv,
 				       mode,
 				       MODE_IDLE,
 				       MODE_IDLE,
-				       500))
+				       500)) {
 		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
 				 engine->name);
+		return false;
+	}
 
 	I915_WRITE_FW(RING_CTL(base), 0);
 	I915_WRITE_FW(RING_HEAD(base), 0);
@@ -1395,19 +1398,32 @@  static void gen3_stop_engine(struct intel_engine_cs *engine)
 	if (I915_READ_FW(RING_HEAD(base)) != 0)
 		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
 				 engine->name);
+
+	return true;
 }
 
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
+static int i915_stop_engines(struct drm_i915_private *dev_priv,
+			     unsigned engine_mask)
 {
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	bool idle;
 
 	if (INTEL_GEN(dev_priv) < 3)
-		return;
+		return true;
 
+	idle = true;
 	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
+		idle &= gen3_stop_engine(engine);
+	if (idle)
+		return idle;
+
+	dev_err(dev_priv->drm.dev, "Failed to stop all engines\n");
+	for_each_engine_masked(engine, dev_priv, engine_mask, id) {
+		struct drm_printer p = drm_debug_printer(__func__);
+		intel_engine_dump(engine, &p);
+	}
+	return false;
 }
 
 static bool i915_reset_complete(struct pci_dev *pdev)
@@ -1768,7 +1784,10 @@  int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		 *
 		 * FIXME: Wa for more modern gens needs to be validated
 		 */
-		i915_stop_engines(dev_priv, engine_mask);
+		if (!i915_stop_engines(dev_priv, engine_mask)) {
+			ret = -EIO;
+			break;
+		}
 
 		ret = -ENODEV;
 		if (reset)