diff mbox

[4/9] drm/i915/tdr: Add support for per engine reset recovery

Message ID 20161216202010.7983-5-michel.thierry@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Thierry Dec. 16, 2016, 8:20 p.m. UTC
From: Arun Siluvery <arun.siluvery@linux.intel.com>

This change implements support for per-engine reset as an initial, less
intrusive hang recovery option to be attempted before falling back to the
legacy full GPU reset recovery mode if necessary. This is only supported
from Gen8 onwards.

Hangchecker determines which engines are hung and invokes error handler to
recover from it. Error handler schedules recovery for each of those engines
that are hung. The recovery procedure is as follows,
 - identifies the request that caused the hang and it is dropped
 - force engine to idle: this is done by issuing a reset request
 - reset and re-init engine
 - restart submissions to the engine

If engine reset fails then we fall back to heavy weight full gpu reset
which resets all engines and reinitiazes complete state of HW and SW.

v2: Rebase.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_drv.h     |  3 ++
 drivers/gpu/drm/i915/i915_gem.c     |  2 +-
 drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
 drivers/gpu/drm/i915/intel_lrc.h    |  1 +
 drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
 6 files changed, 108 insertions(+), 7 deletions(-)

Comments

Chris Wilson Dec. 16, 2016, 8:45 p.m. UTC | #1
On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
> From: Arun Siluvery <arun.siluvery@linux.intel.com>
> 
> This change implements support for per-engine reset as an initial, less
> intrusive hang recovery option to be attempted before falling back to the
> legacy full GPU reset recovery mode if necessary. This is only supported
> from Gen8 onwards.
> 
> Hangchecker determines which engines are hung and invokes error handler to
> recover from it. Error handler schedules recovery for each of those engines
> that are hung. The recovery procedure is as follows,
>  - identifies the request that caused the hang and it is dropped
>  - force engine to idle: this is done by issuing a reset request
>  - reset and re-init engine
>  - restart submissions to the engine
> 
> If engine reset fails then we fall back to heavy weight full gpu reset
> which resets all engines and reinitiazes complete state of HW and SW.
> 
> v2: Rebase.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_drv.h     |  3 ++
>  drivers/gpu/drm/i915/i915_gem.c     |  2 +-
>  drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
>  drivers/gpu/drm/i915/intel_lrc.h    |  1 +
>  drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
>  6 files changed, 108 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index e5688edd62cd..a034793bc246 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
>   *
>   * Reset a specific GPU engine. Useful if a hang is detected.
>   * Returns zero on successful reset or otherwise an error code.
> + *
> + * Procedure is fairly simple:
> + *  - identifies the request that caused the hang and it is dropped
> + *  - force engine to idle: this is done by issuing a reset request
> + *  - reset engine
> + *  - restart submissions to the engine
>   */
>  int i915_reset_engine(struct intel_engine_cs *engine)

What's the serialisation between potential callers of
i915_reset_engine()?

>  {
>  	int ret;
>  	struct drm_i915_private *dev_priv = engine->i915;
>  
> -	/* FIXME: replace me with engine reset sequence */
> -	ret = -ENODEV;
> +	/*
> +	 * We need to first idle the engine by issuing a reset request,
> +	 * then perform soft reset and re-initialize hw state, for all of
> +	 * this GT power need to be awake so ensure it does throughout the
> +	 * process
> +	 */
> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> +
> +	/*
> +	 * the request that caused the hang is stuck on elsp, identify the
> +	 * active request and drop it, adjust head to skip the offending
> +	 * request to resume executing remaining requests in the queue.
> +	 */
> +	i915_gem_reset_engine(engine);

Must freeze the engine and irqs first, before calling
i915_gem_reset_engine() (i.e. something like disable_engines_irq,
cancelling tasklet)

Eeek note that the current i915_gem_reset_engine() is lacking a
spinlock.

> +
> +	ret = intel_engine_reset_begin(engine);
> +	if (ret) {
> +		DRM_ERROR("Failed to disable %s\n", engine->name);
> +		goto error;
> +	}
> +
> +	ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
> +	if (ret) {
> +		DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
> +		intel_engine_reset_cancel(engine);
> +		goto error;
> +	}
> +
> +	ret = engine->init_hw(engine);
> +	if (ret)
> +		goto error;
>  
> +	intel_engine_reset_cancel(engine);
> +	intel_execlists_restart_submission(engine);

engine->init_hw(engine) *is* intel_execlists_restart_submission.

> +
> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +	return 0;
> +
> +error:
>  	/* use full gpu reset to recover on error */
>  	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
>  
> +	/* Engine reset is performed without taking struct_mutex, since it
> +	 * failed we now fallback to full gpu reset. Wakeup any waiters
> +	 * which should now see the reset_in_progress and release
> +	 * struct_mutex for us to continue recovery.
> +	 */
> +	rcu_read_lock();
> +	intel_engine_wakeup(engine);
> +	rcu_read_unlock();
> +
> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>  	return ret;
>  }
Michel Thierry Dec. 19, 2016, 5:02 a.m. UTC | #2
On 12/16/2016 12:45 PM, Chris Wilson wrote:
> On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>>
>> This change implements support for per-engine reset as an initial, less
>> intrusive hang recovery option to be attempted before falling back to the
>> legacy full GPU reset recovery mode if necessary. This is only supported
>> from Gen8 onwards.
>>
>> Hangchecker determines which engines are hung and invokes error handler to
>> recover from it. Error handler schedules recovery for each of those engines
>> that are hung. The recovery procedure is as follows,
>>  - identifies the request that caused the hang and it is dropped
>>  - force engine to idle: this is done by issuing a reset request
>>  - reset and re-init engine
>>  - restart submissions to the engine
>>
>> If engine reset fails then we fall back to heavy weight full gpu reset
>> which resets all engines and reinitiazes complete state of HW and SW.
>>
>> v2: Rebase.
>>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
>>  drivers/gpu/drm/i915/i915_drv.h     |  3 ++
>>  drivers/gpu/drm/i915/i915_gem.c     |  2 +-
>>  drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
>>  drivers/gpu/drm/i915/intel_lrc.h    |  1 +
>>  drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
>>  6 files changed, 108 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>> index e5688edd62cd..a034793bc246 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
>>   *
>>   * Reset a specific GPU engine. Useful if a hang is detected.
>>   * Returns zero on successful reset or otherwise an error code.
>> + *
>> + * Procedure is fairly simple:
>> + *  - identifies the request that caused the hang and it is dropped
>> + *  - force engine to idle: this is done by issuing a reset request
>> + *  - reset engine
>> + *  - restart submissions to the engine
>>   */
>>  int i915_reset_engine(struct intel_engine_cs *engine)
>
> What's the serialisation between potential callers of
> i915_reset_engine()?
>

I haven't seen simultaneous calls happening yet, would a 
reset_engine-specific mutex be enough?

>>  {
>>  	int ret;
>>  	struct drm_i915_private *dev_priv = engine->i915;
>>
>> -	/* FIXME: replace me with engine reset sequence */
>> -	ret = -ENODEV;
>> +	/*
>> +	 * We need to first idle the engine by issuing a reset request,
>> +	 * then perform soft reset and re-initialize hw state, for all of
>> +	 * this GT power need to be awake so ensure it does throughout the
>> +	 * process
>> +	 */
>> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>> +
>> +	/*
>> +	 * the request that caused the hang is stuck on elsp, identify the
>> +	 * active request and drop it, adjust head to skip the offending
>> +	 * request to resume executing remaining requests in the queue.
>> +	 */
>> +	i915_gem_reset_engine(engine);
>
> Must freeze the engine and irqs first, before calling
> i915_gem_reset_engine() (i.e. something like disable_engines_irq,
> cancelling tasklet)
>
Will do.

> Eeek note that the current i915_gem_reset_engine() is lacking a
> spinlock.
>

The new mutex (for i915_reset_engine) should cover this.

>> +
>> +	ret = intel_engine_reset_begin(engine);
>> +	if (ret) {
>> +		DRM_ERROR("Failed to disable %s\n", engine->name);
>> +		goto error;
>> +	}
>> +
>> +	ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
>> +	if (ret) {
>> +		DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
>> +		intel_engine_reset_cancel(engine);
>> +		goto error;
>> +	}
>> +
>> +	ret = engine->init_hw(engine);
>> +	if (ret)
>> +		goto error;
>>
>> +	intel_engine_reset_cancel(engine);
>> +	intel_execlists_restart_submission(engine);
>
> engine->init_hw(engine) *is* intel_execlists_restart_submission.
>

I'll make these changes,
Thanks.

>> +
>> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>> +	return 0;
>> +
>> +error:
>>  	/* use full gpu reset to recover on error */
>>  	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
>>
>> +	/* Engine reset is performed without taking struct_mutex, since it
>> +	 * failed we now fallback to full gpu reset. Wakeup any waiters
>> +	 * which should now see the reset_in_progress and release
>> +	 * struct_mutex for us to continue recovery.
>> +	 */
>> +	rcu_read_lock();
>> +	intel_engine_wakeup(engine);
>> +	rcu_read_unlock();
>> +
>> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>  	return ret;
>>  }
>
Chris Wilson Dec. 19, 2016, 9:51 a.m. UTC | #3
On Sun, Dec 18, 2016 at 09:02:33PM -0800, Michel Thierry wrote:
> 
> 
> On 12/16/2016 12:45 PM, Chris Wilson wrote:
> >On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
> >>From: Arun Siluvery <arun.siluvery@linux.intel.com>
> >>
> >>This change implements support for per-engine reset as an initial, less
> >>intrusive hang recovery option to be attempted before falling back to the
> >>legacy full GPU reset recovery mode if necessary. This is only supported
> >>from Gen8 onwards.
> >>
> >>Hangchecker determines which engines are hung and invokes error handler to
> >>recover from it. Error handler schedules recovery for each of those engines
> >>that are hung. The recovery procedure is as follows,
> >> - identifies the request that caused the hang and it is dropped
> >> - force engine to idle: this is done by issuing a reset request
> >> - reset and re-init engine
> >> - restart submissions to the engine
> >>
> >>If engine reset fails then we fall back to heavy weight full gpu reset
> >>which resets all engines and reinitiazes complete state of HW and SW.
> >>
> >>v2: Rebase.
> >>
> >>Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >>Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> >>Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> >>Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> >>---
> >> drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
> >> drivers/gpu/drm/i915/i915_drv.h     |  3 ++
> >> drivers/gpu/drm/i915/i915_gem.c     |  2 +-
> >> drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
> >> drivers/gpu/drm/i915/intel_lrc.h    |  1 +
> >> drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
> >> 6 files changed, 108 insertions(+), 7 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> >>index e5688edd62cd..a034793bc246 100644
> >>--- a/drivers/gpu/drm/i915/i915_drv.c
> >>+++ b/drivers/gpu/drm/i915/i915_drv.c
> >>@@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
> >>  *
> >>  * Reset a specific GPU engine. Useful if a hang is detected.
> >>  * Returns zero on successful reset or otherwise an error code.
> >>+ *
> >>+ * Procedure is fairly simple:
> >>+ *  - identifies the request that caused the hang and it is dropped
> >>+ *  - force engine to idle: this is done by issuing a reset request
> >>+ *  - reset engine
> >>+ *  - restart submissions to the engine
> >>  */
> >> int i915_reset_engine(struct intel_engine_cs *engine)
> >
> >What's the serialisation between potential callers of
> >i915_reset_engine()?
> >
> 
> I haven't seen simultaneous calls happening yet, would a
> reset_engine-specific mutex be enough?

That is what it more or less boils down to. But first, do we ensure we
don't declare a second hang on this engine before the first reset is
complete? Then we only a barrier between a single engine reset and a
full reset.

> >> {
> >> 	int ret;
> >> 	struct drm_i915_private *dev_priv = engine->i915;
> >>
> >>-	/* FIXME: replace me with engine reset sequence */
> >>-	ret = -ENODEV;
> >>+	/*
> >>+	 * We need to first idle the engine by issuing a reset request,
> >>+	 * then perform soft reset and re-initialize hw state, for all of
> >>+	 * this GT power need to be awake so ensure it does throughout the
> >>+	 * process
> >>+	 */
> >>+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> >>+
> >>+	/*
> >>+	 * the request that caused the hang is stuck on elsp, identify the
> >>+	 * active request and drop it, adjust head to skip the offending
> >>+	 * request to resume executing remaining requests in the queue.
> >>+	 */
> >>+	i915_gem_reset_engine(engine);
> >
> >Must freeze the engine and irqs first, before calling
> >i915_gem_reset_engine() (i.e. something like disable_engines_irq,
> >cancelling tasklet)
> >
> Will do.
> 
> >Eeek note that the current i915_gem_reset_engine() is lacking a
> >spinlock.
> >
> 
> The new mutex (for i915_reset_engine) should cover this.

No. We need to protect these lists from concurrent manipulation in the
fences. The spinlock protection there needs to be extended here, being
covered by struct_mutex is no longer sufficient protection for the
current code.
-Chris
Mika Kuoppala Dec. 19, 2016, 2:24 p.m. UTC | #4
Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>> 
>> This change implements support for per-engine reset as an initial, less
>> intrusive hang recovery option to be attempted before falling back to the
>> legacy full GPU reset recovery mode if necessary. This is only supported
>> from Gen8 onwards.
>> 
>> Hangchecker determines which engines are hung and invokes error handler to
>> recover from it. Error handler schedules recovery for each of those engines
>> that are hung. The recovery procedure is as follows,
>>  - identifies the request that caused the hang and it is dropped
>>  - force engine to idle: this is done by issuing a reset request
>>  - reset and re-init engine
>>  - restart submissions to the engine
>> 
>> If engine reset fails then we fall back to heavy weight full gpu reset
>> which resets all engines and reinitiazes complete state of HW and SW.
>> 
>> v2: Rebase.
>> 
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
>>  drivers/gpu/drm/i915/i915_drv.h     |  3 ++
>>  drivers/gpu/drm/i915/i915_gem.c     |  2 +-
>>  drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
>>  drivers/gpu/drm/i915/intel_lrc.h    |  1 +
>>  drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
>>  6 files changed, 108 insertions(+), 7 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>> index e5688edd62cd..a034793bc246 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.c
>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>> @@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
>>   *
>>   * Reset a specific GPU engine. Useful if a hang is detected.
>>   * Returns zero on successful reset or otherwise an error code.
>> + *
>> + * Procedure is fairly simple:
>> + *  - identifies the request that caused the hang and it is dropped
>> + *  - force engine to idle: this is done by issuing a reset request
>> + *  - reset engine
>> + *  - restart submissions to the engine
>>   */
>>  int i915_reset_engine(struct intel_engine_cs *engine)
>
> What's the serialisation between potential callers of
> i915_reset_engine()?
>

I feel that making 'reset_in_progress' per engine feature
would clarify this and would be more fitting as now it is
the one engine that can be in reset at particular point in time,
decoupled with others.

Having global 'reset_in_progress' and then a per engine
resets just doesn't feel right.

-Mika

>>  {
>>  	int ret;
>>  	struct drm_i915_private *dev_priv = engine->i915;
>>  
>> -	/* FIXME: replace me with engine reset sequence */
>> -	ret = -ENODEV;
>> +	/*
>> +	 * We need to first idle the engine by issuing a reset request,
>> +	 * then perform soft reset and re-initialize hw state, for all of
>> +	 * this GT power need to be awake so ensure it does throughout the
>> +	 * process
>> +	 */
>> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>> +
>> +	/*
>> +	 * the request that caused the hang is stuck on elsp, identify the
>> +	 * active request and drop it, adjust head to skip the offending
>> +	 * request to resume executing remaining requests in the queue.
>> +	 */
>> +	i915_gem_reset_engine(engine);
>
> Must freeze the engine and irqs first, before calling
> i915_gem_reset_engine() (i.e. something like disable_engines_irq,
> cancelling tasklet)
>
> Eeek note that the current i915_gem_reset_engine() is lacking a
> spinlock.
>
>> +
>> +	ret = intel_engine_reset_begin(engine);
>> +	if (ret) {
>> +		DRM_ERROR("Failed to disable %s\n", engine->name);
>> +		goto error;
>> +	}
>> +
>> +	ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
>> +	if (ret) {
>> +		DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
>> +		intel_engine_reset_cancel(engine);
>> +		goto error;
>> +	}
>> +
>> +	ret = engine->init_hw(engine);
>> +	if (ret)
>> +		goto error;
>>  
>> +	intel_engine_reset_cancel(engine);
>> +	intel_execlists_restart_submission(engine);
>
> engine->init_hw(engine) *is* intel_execlists_restart_submission.
>
>> +
>> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>> +	return 0;
>> +
>> +error:
>>  	/* use full gpu reset to recover on error */
>>  	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
>>  
>> +	/* Engine reset is performed without taking struct_mutex, since it
>> +	 * failed we now fallback to full gpu reset. Wakeup any waiters
>> +	 * which should now see the reset_in_progress and release
>> +	 * struct_mutex for us to continue recovery.
>> +	 */
>> +	rcu_read_lock();
>> +	intel_engine_wakeup(engine);
>> +	rcu_read_unlock();
>> +
>> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>  	return ret;
>>  }
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson Dec. 19, 2016, 2:42 p.m. UTC | #5
On Mon, Dec 19, 2016 at 04:24:18PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
> >> From: Arun Siluvery <arun.siluvery@linux.intel.com>
> >> 
> >> This change implements support for per-engine reset as an initial, less
> >> intrusive hang recovery option to be attempted before falling back to the
> >> legacy full GPU reset recovery mode if necessary. This is only supported
> >> from Gen8 onwards.
> >> 
> >> Hangchecker determines which engines are hung and invokes error handler to
> >> recover from it. Error handler schedules recovery for each of those engines
> >> that are hung. The recovery procedure is as follows,
> >>  - identifies the request that caused the hang and it is dropped
> >>  - force engine to idle: this is done by issuing a reset request
> >>  - reset and re-init engine
> >>  - restart submissions to the engine
> >> 
> >> If engine reset fails then we fall back to heavy weight full gpu reset
> >> which resets all engines and reinitiazes complete state of HW and SW.
> >> 
> >> v2: Rebase.
> >> 
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> >> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> >> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
> >>  drivers/gpu/drm/i915/i915_drv.h     |  3 ++
> >>  drivers/gpu/drm/i915/i915_gem.c     |  2 +-
> >>  drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
> >>  drivers/gpu/drm/i915/intel_lrc.h    |  1 +
> >>  drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
> >>  6 files changed, 108 insertions(+), 7 deletions(-)
> >> 
> >> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> >> index e5688edd62cd..a034793bc246 100644
> >> --- a/drivers/gpu/drm/i915/i915_drv.c
> >> +++ b/drivers/gpu/drm/i915/i915_drv.c
> >> @@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
> >>   *
> >>   * Reset a specific GPU engine. Useful if a hang is detected.
> >>   * Returns zero on successful reset or otherwise an error code.
> >> + *
> >> + * Procedure is fairly simple:
> >> + *  - identifies the request that caused the hang and it is dropped
> >> + *  - force engine to idle: this is done by issuing a reset request
> >> + *  - reset engine
> >> + *  - restart submissions to the engine
> >>   */
> >>  int i915_reset_engine(struct intel_engine_cs *engine)
> >
> > What's the serialisation between potential callers of
> > i915_reset_engine()?
> >
> 
> I feel that making 'reset_in_progress' per engine feature
> would clarify this and would be more fitting as now it is
> the one engine that can be in reset at particular point in time,
> decoupled with others.

That is not what "reset_in_progress" means. It principally means
MUTEX_BACKOFF.
-Chris
Michel Thierry Jan. 5, 2017, 11:55 p.m. UTC | #6
On 19/12/16 01:51, Chris Wilson wrote:
> On Sun, Dec 18, 2016 at 09:02:33PM -0800, Michel Thierry wrote:
>>
>>
>> On 12/16/2016 12:45 PM, Chris Wilson wrote:
>>> On Fri, Dec 16, 2016 at 12:20:05PM -0800, Michel Thierry wrote:
>>>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>>>>
>>>> This change implements support for per-engine reset as an initial, less
>>>> intrusive hang recovery option to be attempted before falling back to the
>>>> legacy full GPU reset recovery mode if necessary. This is only supported
>>> >from Gen8 onwards.
>>>>
>>>> Hangchecker determines which engines are hung and invokes error handler to
>>>> recover from it. Error handler schedules recovery for each of those engines
>>>> that are hung. The recovery procedure is as follows,
>>>> - identifies the request that caused the hang and it is dropped
>>>> - force engine to idle: this is done by issuing a reset request
>>>> - reset and re-init engine
>>>> - restart submissions to the engine
>>>>
>>>> If engine reset fails then we fall back to heavy weight full gpu reset
>>>> which resets all engines and reinitiazes complete state of HW and SW.
>>>>
>>>> v2: Rebase.
>>>>
>>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>>> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
>>>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>>>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>>>> ---
>>>> drivers/gpu/drm/i915/i915_drv.c     | 56 +++++++++++++++++++++++++++++++++++--
>>>> drivers/gpu/drm/i915/i915_drv.h     |  3 ++
>>>> drivers/gpu/drm/i915/i915_gem.c     |  2 +-
>>>> drivers/gpu/drm/i915/intel_lrc.c    | 12 ++++++++
>>>> drivers/gpu/drm/i915/intel_lrc.h    |  1 +
>>>> drivers/gpu/drm/i915/intel_uncore.c | 41 ++++++++++++++++++++++++---
>>>> 6 files changed, 108 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>>>> index e5688edd62cd..a034793bc246 100644
>>>> --- a/drivers/gpu/drm/i915/i915_drv.c
>>>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>>>> @@ -1830,18 +1830,70 @@ void i915_reset(struct drm_i915_private *dev_priv)
>>>>  *
>>>>  * Reset a specific GPU engine. Useful if a hang is detected.
>>>>  * Returns zero on successful reset or otherwise an error code.
>>>> + *
>>>> + * Procedure is fairly simple:
>>>> + *  - identifies the request that caused the hang and it is dropped
>>>> + *  - force engine to idle: this is done by issuing a reset request
>>>> + *  - reset engine
>>>> + *  - restart submissions to the engine
>>>>  */
>>>> int i915_reset_engine(struct intel_engine_cs *engine)
>>>
>>> What's the serialisation between potential callers of
>>> i915_reset_engine()?
>>>
>>
>> I haven't seen simultaneous calls happening yet, would a
>> reset_engine-specific mutex be enough?
>
> That is what it more or less boils down to. But first, do we ensure we
> don't declare a second hang on this engine before the first reset is
> complete? Then we only a barrier between a single engine reset and a
> full reset.
>
>>>> {
>>>> 	int ret;
>>>> 	struct drm_i915_private *dev_priv = engine->i915;
>>>>
>>>> -	/* FIXME: replace me with engine reset sequence */
>>>> -	ret = -ENODEV;
>>>> +	/*
>>>> +	 * We need to first idle the engine by issuing a reset request,
>>>> +	 * then perform soft reset and re-initialize hw state, for all of
>>>> +	 * this GT power need to be awake so ensure it does throughout the
>>>> +	 * process
>>>> +	 */
>>>> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>>>> +
>>>> +	/*
>>>> +	 * the request that caused the hang is stuck on elsp, identify the
>>>> +	 * active request and drop it, adjust head to skip the offending
>>>> +	 * request to resume executing remaining requests in the queue.
>>>> +	 */
>>>> +	i915_gem_reset_engine(engine);
>>>
>>> Must freeze the engine and irqs first, before calling
>>> i915_gem_reset_engine() (i.e. something like disable_engines_irq,
>>> cancelling tasklet)
>>>
>> Will do.
>>
>>> Eeek note that the current i915_gem_reset_engine() is lacking a
>>> spinlock.
>>>
>>
>> The new mutex (for i915_reset_engine) should cover this.
>
> No. We need to protect these lists from concurrent manipulation in the
> fences. The spinlock protection there needs to be extended here, being
> covered by struct_mutex is no longer sufficient protection for the
> current code.

I think you already addressed the last point in "drm/i915: Prevent 
timeline updates whilst performing reset" 
(00c25e3f40083a6d5f1111955baccd287ee49258), right?

But looking at these (and the interaction with possible waiters), do you 
think it is still achievable to reset a engine without grabbing the 
struct_mutex? Then both i915_reset (chip) and i915_reset_engine can use 
the "reset_in_progress".

Thanks
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index e5688edd62cd..a034793bc246 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1830,18 +1830,70 @@  void i915_reset(struct drm_i915_private *dev_priv)
  *
  * Reset a specific GPU engine. Useful if a hang is detected.
  * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is fairly simple:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - force engine to idle: this is done by issuing a reset request
+ *  - reset engine
+ *  - restart submissions to the engine
  */
 int i915_reset_engine(struct intel_engine_cs *engine)
 {
 	int ret;
 	struct drm_i915_private *dev_priv = engine->i915;
 
-	/* FIXME: replace me with engine reset sequence */
-	ret = -ENODEV;
+	/*
+	 * We need to first idle the engine by issuing a reset request,
+	 * then perform soft reset and re-initialize hw state, for all of
+	 * this GT power need to be awake so ensure it does throughout the
+	 * process
+	 */
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	/*
+	 * the request that caused the hang is stuck on elsp, identify the
+	 * active request and drop it, adjust head to skip the offending
+	 * request to resume executing remaining requests in the queue.
+	 */
+	i915_gem_reset_engine(engine);
+
+	ret = intel_engine_reset_begin(engine);
+	if (ret) {
+		DRM_ERROR("Failed to disable %s\n", engine->name);
+		goto error;
+	}
+
+	ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+	if (ret) {
+		DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+		intel_engine_reset_cancel(engine);
+		goto error;
+	}
+
+	ret = engine->init_hw(engine);
+	if (ret)
+		goto error;
 
+	intel_engine_reset_cancel(engine);
+	intel_execlists_restart_submission(engine);
+
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+	return 0;
+
+error:
 	/* use full gpu reset to recover on error */
 	set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags);
 
+	/* Engine reset is performed without taking struct_mutex, since it
+	 * failed we now fallback to full gpu reset. Wakeup any waiters
+	 * which should now see the reset_in_progress and release
+	 * struct_mutex for us to continue recovery.
+	 */
+	rcu_read_lock();
+	intel_engine_wakeup(engine);
+	rcu_read_unlock();
+
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6b4e8ee19905..a97cc8f50ade 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3009,6 +3009,8 @@  extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
 extern void i915_reset(struct drm_i915_private *dev_priv);
 extern bool intel_has_engine_reset(struct drm_i915_private *dev_priv);
+extern int intel_engine_reset_begin(struct intel_engine_cs *engine);
+extern int intel_engine_reset_cancel(struct intel_engine_cs *engine);
 extern int i915_reset_engine(struct intel_engine_cs *engine);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
@@ -3397,6 +3399,7 @@  static inline u32 i915_reset_count(struct i915_gpu_error *error)
 
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
+void i915_gem_reset_engine(struct intel_engine_cs *engine);
 void i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
 int __must_check i915_gem_init_hw(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f86a71d9fe37..d2e9d30bf755 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2718,7 +2718,7 @@  static void reset_request(struct drm_i915_gem_request *request)
 	memset(vaddr + head, 0, request->postfix - head);
 }
 
-static void i915_gem_reset_engine(struct intel_engine_cs *engine)
+void i915_gem_reset_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request;
 	struct i915_gem_context *incomplete_ctx;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index bc6aec619a9d..266ad7703862 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -676,6 +676,18 @@  static void execlists_submit_request(struct drm_i915_gem_request *request)
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
+void intel_execlists_restart_submission(struct intel_engine_cs *engine)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+
+	if (execlists_elsp_idle(engine))
+		tasklet_hi_schedule(&engine->irq_tasklet);
+
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+}
+
 static struct intel_engine_cs *
 pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
 {
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 8df97a97f18d..da1a169e5bb6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -96,6 +96,7 @@  int intel_lr_rcs_context_setup_trtt(struct i915_gem_context *ctx);
 int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
 void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
+void intel_execlists_restart_submission(struct intel_engine_cs *engine);
 bool intel_execlists_idle(struct drm_i915_private *dev_priv);
 
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 97ce324570ad..9105c166a6d5 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1758,7 +1758,7 @@  int intel_wait_for_register(struct drm_i915_private *dev_priv,
 	return ret;
 }
 
-static int gen8_request_engine_reset(struct intel_engine_cs *engine)
+static int gen8_engine_reset_begin(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 	int ret;
@@ -1777,7 +1777,7 @@  static int gen8_request_engine_reset(struct intel_engine_cs *engine)
 	return ret;
 }
 
-static void gen8_unrequest_engine_reset(struct intel_engine_cs *engine)
+static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
@@ -1792,14 +1792,14 @@  static int gen8_reset_engines(struct drm_i915_private *dev_priv,
 	unsigned int tmp;
 
 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		if (gen8_request_engine_reset(engine))
+		if (gen8_engine_reset_begin(engine))
 			goto not_ready;
 
 	return gen6_reset_engines(dev_priv, engine_mask);
 
 not_ready:
 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		gen8_unrequest_engine_reset(engine);
+		gen8_engine_reset_cancel(engine);
 
 	return -EIO;
 }
@@ -1881,6 +1881,39 @@  int intel_guc_reset(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+/*
+ * On gen8+ a reset request has to be issued via the reset control register
+ * before a GPU engine can be reset in order to stop the command streamer
+ * and idle the engine. This replaces the legacy way of stopping an engine
+ * by writing to the stop ring bit in the MI_MODE register.
+ */
+int intel_engine_reset_begin(struct intel_engine_cs *engine)
+{
+	if (!intel_has_engine_reset(engine->i915)) {
+		DRM_ERROR("Engine Reset not supported on Gen%d\n",
+			  INTEL_INFO(engine->i915)->gen);
+		return -EINVAL;
+	}
+
+	return gen8_engine_reset_begin(engine);
+}
+
+/*
+ * It is possible to back off from a previously issued reset request by simply
+ * clearing the reset request bit in the reset control register.
+ */
+int intel_engine_reset_cancel(struct intel_engine_cs *engine)
+{
+	if (!intel_has_engine_reset(engine->i915)) {
+		DRM_ERROR("Request to clear reset not supported on Gen%d\n",
+			  INTEL_INFO(engine->i915)->gen);
+		return -EINVAL;
+	}
+
+	gen8_engine_reset_cancel(engine);
+	return 0;
+}
+
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);