diff mbox series

[04/19] drm/i915/selftests: Check we can recover a wedged device

Message ID 20181212134149.26981-5-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/19] drm/i915: Return immediately if trylock fails for direct-reclaim | expand

Commit Message

Chris Wilson Dec. 12, 2018, 1:41 p.m. UTC
After declaring a terminally wedged device, we allow ourselves to
recover on the next GPU reset (manually triggered), or resume. Check
that resetting a wedged device does work.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 .../gpu/drm/i915/selftests/intel_hangcheck.c  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

Comments

Tvrtko Ursulin Dec. 13, 2018, 8:22 a.m. UTC | #1
On 12/12/2018 13:41, Chris Wilson wrote:
> After declaring a terminally wedged device, we allow ourselves to
> recover on the next GPU reset (manually triggered), or resume. Check
> that resetting a wedged device does work.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   .../gpu/drm/i915/selftests/intel_hangcheck.c  | 23 +++++++++++++++++++
>   1 file changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 805e40aff407..65c318d14077 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -385,6 +385,28 @@ static int igt_global_reset(void *arg)
>   	return err;
>   }
>   
> +static int igt_wedged_reset(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +
> +	/* Check that we can recover a wedged device with a GPU reset */
> +
> +	igt_global_reset_lock(i915);
> +	mutex_lock(&i915->drm.struct_mutex);
> +
> +	i915_gem_set_wedged(i915);
> +	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
> +
> +	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
> +	i915_reset(i915, ALL_ENGINES, NULL);
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> +
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	igt_global_reset_unlock(i915);
> +
> +	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> +}
> +
>   static bool wait_for_idle(struct intel_engine_cs *engine)
>   {
>   	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
> @@ -1452,6 +1474,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
>   		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
> +		SUBTEST(igt_wedged_reset),
>   		SUBTEST(igt_hang_sanitycheck),
>   		SUBTEST(igt_reset_idle_engine),
>   		SUBTEST(igt_reset_active_engine),
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Tvrtko Ursulin Dec. 13, 2018, 8:28 a.m. UTC | #2
On 12/12/2018 13:41, Chris Wilson wrote:
> After declaring a terminally wedged device, we allow ourselves to
> recover on the next GPU reset (manually triggered), or resume. Check
> that resetting a wedged device does work.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   .../gpu/drm/i915/selftests/intel_hangcheck.c  | 23 +++++++++++++++++++
>   1 file changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 805e40aff407..65c318d14077 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -385,6 +385,28 @@ static int igt_global_reset(void *arg)
>   	return err;
>   }
>   
> +static int igt_wedged_reset(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +
> +	/* Check that we can recover a wedged device with a GPU reset */
> +
> +	igt_global_reset_lock(i915);
> +	mutex_lock(&i915->drm.struct_mutex);

rpm get?

> +
> +	i915_gem_set_wedged(i915);
> +	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
> +
> +	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
> +	i915_reset(i915, ALL_ENGINES, NULL);
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> +
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	igt_global_reset_unlock(i915);
> +
> +	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> +}
> +
>   static bool wait_for_idle(struct intel_engine_cs *engine)
>   {
>   	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
> @@ -1452,6 +1474,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
>   		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
> +		SUBTEST(igt_wedged_reset),
>   		SUBTEST(igt_hang_sanitycheck),
>   		SUBTEST(igt_reset_idle_engine),
>   		SUBTEST(igt_reset_active_engine),
>
Chris Wilson Dec. 13, 2018, 8:39 a.m. UTC | #3
Quoting Tvrtko Ursulin (2018-12-13 08:28:54)
> 
> On 12/12/2018 13:41, Chris Wilson wrote:
> > After declaring a terminally wedged device, we allow ourselves to
> > recover on the next GPU reset (manually triggered), or resume. Check
> > that resetting a wedged device does work.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >   .../gpu/drm/i915/selftests/intel_hangcheck.c  | 23 +++++++++++++++++++
> >   1 file changed, 23 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 805e40aff407..65c318d14077 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -385,6 +385,28 @@ static int igt_global_reset(void *arg)
> >       return err;
> >   }
> >   
> > +static int igt_wedged_reset(void *arg)
> > +{
> > +     struct drm_i915_private *i915 = arg;
> > +
> > +     /* Check that we can recover a wedged device with a GPU reset */
> > +
> > +     igt_global_reset_lock(i915);
> > +     mutex_lock(&i915->drm.struct_mutex);
> 
> rpm get?

True.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 805e40aff407..65c318d14077 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -385,6 +385,31 @@  static int igt_global_reset(void *arg)
 	return err;
 }
 
+static int igt_wedged_reset(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+
+	/* Check that we can recover a wedged device with a GPU reset */
+
+	igt_global_reset_lock(i915);
+	mutex_lock(&i915->drm.struct_mutex);
+	/* Keep the device awake while we wedge and reset it */
+	intel_runtime_pm_get(i915);
+
+	i915_gem_set_wedged(i915);
+	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
+
+	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
+	i915_reset(i915, ALL_ENGINES, NULL);
+	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
+
+	intel_runtime_pm_put(i915);
+	mutex_unlock(&i915->drm.struct_mutex);
+	igt_global_reset_unlock(i915);
+
+	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+}
+
 static bool wait_for_idle(struct intel_engine_cs *engine)
 {
 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
@@ -1452,6 +1477,7 @@  int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
 		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
+		SUBTEST(igt_wedged_reset),
 		SUBTEST(igt_hang_sanitycheck),
 		SUBTEST(igt_reset_idle_engine),
 		SUBTEST(igt_reset_active_engine),