[10/15] drm/i915: Avoid sleeping inside per-engine reset

Message ID	20180328211857.32457-11-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Wed, 28 Mar 2018 22:18:52 +0100 Message-Id: <20180328211857.32457-11-chris@chris-wilson.co.uk> In-Reply-To: <20180328211857.32457-1-chris@chris-wilson.co.uk> References: <20180328211857.32457-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 10/15] drm/i915: Avoid sleeping inside per-engine reset Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Message ID

20180328211857.32457-11-chris@chris-wilson.co.uk (mailing list archive)

State

New, archived

Headers

From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Wed, 28 Mar 2018 22:18:52 +0100
Message-Id: <20180328211857.32457-11-chris@chris-wilson.co.uk>
In-Reply-To: <20180328211857.32457-1-chris@chris-wilson.co.uk>
References: <20180328211857.32457-1-chris@chris-wilson.co.uk>
MIME-Version: 1.0
Subject: [Intel-gfx] [PATCH 10/15] drm/i915: Avoid sleeping inside
	per-engine reset
Precedence: list
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Commit Message

Chris Wilson March 28, 2018, 9:18 p.m. UTC

Only sleep and repeat when asked for a full device reset (ALL_ENGINES)
and avoid using sleeping waits when asked for a per-engine reset. The
goal is to be able to use a per-engine reset from hardirq/softirq/timer
context. A consequence is that our individual wait timeouts are a
thousand times shorter, on the order of a hundred microseconds rather
than hundreds of millisecond. This may make hitting the timeouts more
common, but hopefully the fallover to the full-device reset will be
sufficient to pick up the pieces.

Note, that the sleeps inside older gen (pre-gen8) have been left as they
are only used in full device reset mode.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_uncore.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

Comments

Michel Thierry March 28, 2018, 9:47 p.m. UTC | #1

On 28/03/18 14:18, Chris Wilson wrote:
> Only sleep and repeat when asked for a full device reset (ALL_ENGINES)
> and avoid using sleeping waits when asked for a per-engine reset. The
> goal is to be able to use a per-engine reset from hardirq/softirq/timer
> context. A consequence is that our individual wait timeouts are a
> thousand times shorter, on the order of a hundred microseconds rather
> than hundreds of millisecond. This may make hitting the timeouts more
> common, but hopefully the fallover to the full-device reset will be
> sufficient to pick up the pieces.
> 
> Note, that the sleeps inside older gen (pre-gen8) have been left as they
> are only used in full device reset mode.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_uncore.c | 31 ++++++++++++++++---------------
>   1 file changed, 16 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 44c4654443ba..9fa60d8f897e 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1702,11 +1702,10 @@ static void gen3_stop_engine(struct intel_engine_cs *engine)
>   	const i915_reg_t mode = RING_MI_MODE(base);
>   
>   	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
> -	if (intel_wait_for_register_fw(dev_priv,
> -				       mode,
> -				       MODE_IDLE,
> -				       MODE_IDLE,
> -				       500))
> +	if (__intel_wait_for_register_fw(dev_priv,
> +					 mode, MODE_IDLE, MODE_IDLE,
> +					 500, 0,
> +					 NULL))

I had to read the commit message several times to understand the change 
from 500ms to 500us is correct.

>   		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
>   				 engine->name);
>   
> @@ -1860,9 +1859,10 @@ static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
>   	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
>   
>   	/* Wait for the device to ack the reset requests */
> -	err = intel_wait_for_register_fw(dev_priv,
> -					  GEN6_GDRST, hw_domain_mask, 0,
> -					  500);
> +	err = __intel_wait_for_register_fw(dev_priv,
> +					   GEN6_GDRST, hw_domain_mask, 0,
> +					   500, 0,
> +					   NULL);
>   	if (err)
>   		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
>   				 hw_domain_mask);
> @@ -2027,11 +2027,12 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
>   	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
>   		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
>   
> -	ret = intel_wait_for_register_fw(dev_priv,
> -					 RING_RESET_CTL(engine->mmio_base),
> -					 RESET_CTL_READY_TO_RESET,
> -					 RESET_CTL_READY_TO_RESET,
> -					 700);
> +	ret = __intel_wait_for_register_fw(dev_priv,
> +					   RING_RESET_CTL(engine->mmio_base),
> +					   RESET_CTL_READY_TO_RESET,
> +					   RESET_CTL_READY_TO_RESET,
> +					   700, 0,
> +					   NULL);
>   	if (ret)
>   		DRM_ERROR("%s: reset request timeout\n", engine->name);
>   
> @@ -2094,7 +2095,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>   	int retry;
>   	int ret;
>   
> -	might_sleep();
> +	might_sleep_if(engine_mask == ALL_ENGINES);

I think this should also be checking for intel_has_reset_engine.

If i915.reset is not 2, engine_mask can be != ALL_ENGINES and still be a 
full device reset.

>   
>   	/* If the power well sleeps during the reset, the reset
>   	 * request may be dropped and never completes (causing -EIO).
> @@ -2120,7 +2121,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>   			GEM_TRACE("engine_mask=%x\n", engine_mask);
>   			ret = reset(dev_priv, engine_mask);
>   		}
> -		if (ret != -ETIMEDOUT)
> +		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
>   			break;
>   
>   		cond_resched();
>

Chris Wilson March 28, 2018, 9:52 p.m. UTC | #2

Quoting Michel Thierry (2018-03-28 22:47:55)
> On 28/03/18 14:18, Chris Wilson wrote:
> > @@ -2094,7 +2095,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
> >       int retry;
> >       int ret;
> >   
> > -     might_sleep();
> > +     might_sleep_if(engine_mask == ALL_ENGINES);
> 
> I think this should also be checking for intel_has_reset_engine.
> 
> If i915.reset is not 2, engine_mask can be != ALL_ENGINES and still be a 
> full device reset.

Can it?

i915_reset -> intel_gpu_reset(ALL_ENGINES);
i915_reset_engine -> intel_gt_reset_engine -> intel_gpu_reset(BIT(engine->id));

Plus a couple of others poking at intel_gpu_reset(ALL_ENGINES);

Have I missed someone using intel_gpu_reset() directly?
-Chris

Michel Thierry March 28, 2018, 9:56 p.m. UTC | #3

On 28/03/18 14:52, Chris Wilson wrote:
> Quoting Michel Thierry (2018-03-28 22:47:55)
>> On 28/03/18 14:18, Chris Wilson wrote:
>>> @@ -2094,7 +2095,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>>>        int retry;
>>>        int ret;
>>>    
>>> -     might_sleep();
>>> +     might_sleep_if(engine_mask == ALL_ENGINES);
>>
>> I think this should also be checking for intel_has_reset_engine.
>>
>> If i915.reset is not 2, engine_mask can be != ALL_ENGINES and still be a
>> full device reset.
> 
> Can it?
> 
> i915_reset -> intel_gpu_reset(ALL_ENGINES);
> i915_reset_engine -> intel_gt_reset_engine -> intel_gpu_reset(BIT(engine->id));
> 
> Plus a couple of others poking at intel_gpu_reset(ALL_ENGINES);
> 
> Have I missed someone using intel_gpu_reset() directly?

No, you're right, I didn't see i915_reset was passing ALL_ENGINES.

And as you also noted, the only one passing something different than 
ALL_ENGINES mask is intel_gt_reset_engine.

> -Chris
> 

Reviewed-by: Michel Thierry <michel.thierry@intel.com>

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 44c4654443ba..9fa60d8f897e 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1702,11 +1702,10 @@  static void gen3_stop_engine(struct intel_engine_cs *engine)
 	const i915_reg_t mode = RING_MI_MODE(base);
 
 	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
-	if (intel_wait_for_register_fw(dev_priv,
-				       mode,
-				       MODE_IDLE,
-				       MODE_IDLE,
-				       500))
+	if (__intel_wait_for_register_fw(dev_priv,
+					 mode, MODE_IDLE, MODE_IDLE,
+					 500, 0,
+					 NULL))
 		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
 				 engine->name);
 
@@ -1860,9 +1859,10 @@  static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
 	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
 
 	/* Wait for the device to ack the reset requests */
-	err = intel_wait_for_register_fw(dev_priv,
-					  GEN6_GDRST, hw_domain_mask, 0,
-					  500);
+	err = __intel_wait_for_register_fw(dev_priv,
+					   GEN6_GDRST, hw_domain_mask, 0,
+					   500, 0,
+					   NULL);
 	if (err)
 		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
 				 hw_domain_mask);
@@ -2027,11 +2027,12 @@  static int gen8_reset_engine_start(struct intel_engine_cs *engine)
 	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
 		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
 
-	ret = intel_wait_for_register_fw(dev_priv,
-					 RING_RESET_CTL(engine->mmio_base),
-					 RESET_CTL_READY_TO_RESET,
-					 RESET_CTL_READY_TO_RESET,
-					 700);
+	ret = __intel_wait_for_register_fw(dev_priv,
+					   RING_RESET_CTL(engine->mmio_base),
+					   RESET_CTL_READY_TO_RESET,
+					   RESET_CTL_READY_TO_RESET,
+					   700, 0,
+					   NULL);
 	if (ret)
 		DRM_ERROR("%s: reset request timeout\n", engine->name);
 
@@ -2094,7 +2095,7 @@  int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 	int retry;
 	int ret;
 
-	might_sleep();
+	might_sleep_if(engine_mask == ALL_ENGINES);
 
 	/* If the power well sleeps during the reset, the reset
 	 * request may be dropped and never completes (causing -EIO).
@@ -2120,7 +2121,7 @@  int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 			GEM_TRACE("engine_mask=%x\n", engine_mask);
 			ret = reset(dev_priv, engine_mask);
 		}
-		if (ret != -ETIMEDOUT)
+		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
 
 		cond_resched();

[10/15] drm/i915: Avoid sleeping inside per-engine reset

Commit Message

Comments

Patch