diff mbox

drm/i915/selftests: Wait for idle between idle resets as well

Message ID 20180411120346.27618-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 11, 2018, 12:03 p.m. UTC
Even though we weren't injecting guilty requests to be reset, we could
still fall over the issue of resetting the same request too fast -- where
the GPU refuses to start again. (Although it is interesting to note that
reloading the driver is sufficient, suggesting that we could recover if
we delayed the setup after reset?) Continue to paper over the problem by
adding a small delay by waiting for the engine to idle between tests,
and ensure that the engines are idle before starting the idle tests.

References: 028666793a02 ("drm/i915/selftests: Avoid repeatedly harming the same innocent context")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 .../gpu/drm/i915/selftests/intel_hangcheck.c  | 48 ++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

Comments

Chris Wilson April 26, 2018, 10:08 a.m. UTC | #1
Quoting Chris Wilson (2018-04-11 13:03:46)
> Even though we weren't injecting guilty requests to be reset, we could
> still fall over the issue of resetting the same request too fast -- where
> the GPU refuses to start again. (Although it is interesting to note that
> reloading the driver is sufficient, suggesting that we could recover if
> we delayed the setup after reset?) Continue to paper over the problem by
> adding a small delay by waiting for the engine to idle between tests,
> and ensure that the engines are idle before starting the idle tests.
> 
> References: 028666793a02 ("drm/i915/selftests: Avoid repeatedly harming the same innocent context")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Ping?

> ---
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 48 ++++++++++++++++++-
>  1 file changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 24f913f26a7b..7e23e6a6719c 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -454,6 +454,11 @@ static int igt_global_reset(void *arg)
>         return err;
>  }
>  
> +static bool wait_for_idle(struct intel_engine_cs *engine)
> +{
> +       return wait_for(intel_engine_is_idle(engine), 50) == 0;
> +}
> +
>  static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  {
>         struct intel_engine_cs *engine;
> @@ -481,6 +486,13 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>                 if (active && !intel_engine_can_store_dword(engine))
>                         continue;
>  
> +               if (!wait_for_idle(engine)) {
> +                       pr_err("%s failed to idle before reset\n",
> +                              engine->name);
> +                       err = -EIO;
> +                       break;
> +               }
> +
>                 reset_count = i915_reset_count(&i915->gpu_error);
>                 reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
>                                                              engine);
> @@ -542,6 +554,19 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>                                 err = -EINVAL;
>                                 break;
>                         }
> +
> +                       if (!wait_for_idle(engine)) {
> +                               struct drm_printer p =
> +                                       drm_info_printer(i915->drm.dev);
> +
> +                               pr_err("%s failed to idle after reset\n",
> +                                      engine->name);
> +                               intel_engine_dump(engine, &p,
> +                                                 "%s\n", engine->name);
> +
> +                               err = -EIO;
> +                               break;
> +                       }
>                 } while (time_before(jiffies, end_time));
>                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>  
> @@ -696,6 +721,13 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
>                     !intel_engine_can_store_dword(engine))
>                         continue;
>  
> +               if (!wait_for_idle(engine)) {
> +                       pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
> +                              engine->name, test_name);
> +                       err = -EIO;
> +                       break;
> +               }
> +
>                 memset(threads, 0, sizeof(threads));
>                 for_each_engine(other, i915, tmp) {
>                         struct task_struct *tsk;
> @@ -772,6 +804,20 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
>                                 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
>                                 i915_request_put(rq);
>                         }
> +
> +                       if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
> +                               struct drm_printer p =
> +                                       drm_info_printer(i915->drm.dev);
> +
> +                               pr_err("i915_reset_engine(%s:%s):"
> +                                      " failed to idle after reset\n",
> +                                      engine->name, test_name);
> +                               intel_engine_dump(engine, &p,
> +                                                 "%s\n", engine->name);
> +
> +                               err = -EIO;
> +                               break;
> +                       }
>                 } while (time_before(jiffies, end_time));
>                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> @@ -981,7 +1027,7 @@ static int wait_for_others(struct drm_i915_private *i915,
>                 if (engine == exclude)
>                         continue;
>  
> -               if (wait_for(intel_engine_is_idle(engine), 10))
> +               if (!wait_for_idle(engine))
>                         return -EIO;
>         }
>  
> -- 
> 2.17.0
>
Michał Winiarski April 26, 2018, 4:21 p.m. UTC | #2
On Wed, Apr 11, 2018 at 01:03:46PM +0100, Chris Wilson wrote:
> Even though we weren't injecting guilty requests to be reset, we could
> still fall over the issue of resetting the same request too fast -- where
> the GPU refuses to start again. (Although it is interesting to note that
> reloading the driver is sufficient, suggesting that we could recover if
> we delayed the setup after reset?) Continue to paper over the problem by
> adding a small delay by waiting for the engine to idle between tests,
> and ensure that the engines are idle before starting the idle tests.
> 
> References: 028666793a02 ("drm/i915/selftests: Avoid repeatedly harming the same innocent context")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 48 ++++++++++++++++++-
>  1 file changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 24f913f26a7b..7e23e6a6719c 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -454,6 +454,11 @@ static int igt_global_reset(void *arg)
>  	return err;
>  }
>  

#define IGT_IDLE_TIMEOUT 50 ?
It should even fit within a line.

With or without that:

Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>

-Michał

> +static bool wait_for_idle(struct intel_engine_cs *engine)
> +{
> +	return wait_for(intel_engine_is_idle(engine), 50) == 0;
> +}
> +
>  static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  {
>  	struct intel_engine_cs *engine;
> @@ -481,6 +486,13 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  		if (active && !intel_engine_can_store_dword(engine))
>  			continue;
>  
> +		if (!wait_for_idle(engine)) {
> +			pr_err("%s failed to idle before reset\n",
> +			       engine->name);
> +			err = -EIO;
> +			break;
> +		}
> +
>  		reset_count = i915_reset_count(&i915->gpu_error);
>  		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
>  							     engine);
> @@ -542,6 +554,19 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  				err = -EINVAL;
>  				break;
>  			}
> +
> +			if (!wait_for_idle(engine)) {
> +				struct drm_printer p =
> +					drm_info_printer(i915->drm.dev);
> +
> +				pr_err("%s failed to idle after reset\n",
> +				       engine->name);
> +				intel_engine_dump(engine, &p,
> +						  "%s\n", engine->name);
> +
> +				err = -EIO;
> +				break;
> +			}
>  		} while (time_before(jiffies, end_time));
>  		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>  
> @@ -696,6 +721,13 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
>  		    !intel_engine_can_store_dword(engine))
>  			continue;
>  
> +		if (!wait_for_idle(engine)) {
> +			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
> +			       engine->name, test_name);
> +			err = -EIO;
> +			break;
> +		}
> +
>  		memset(threads, 0, sizeof(threads));
>  		for_each_engine(other, i915, tmp) {
>  			struct task_struct *tsk;
> @@ -772,6 +804,20 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
>  				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
>  				i915_request_put(rq);
>  			}
> +
> +			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
> +				struct drm_printer p =
> +					drm_info_printer(i915->drm.dev);
> +
> +				pr_err("i915_reset_engine(%s:%s):"
> +				       " failed to idle after reset\n",
> +				       engine->name, test_name);
> +				intel_engine_dump(engine, &p,
> +						  "%s\n", engine->name);
> +
> +				err = -EIO;
> +				break;
> +			}
>  		} while (time_before(jiffies, end_time));
>  		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>  		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> @@ -981,7 +1027,7 @@ static int wait_for_others(struct drm_i915_private *i915,
>  		if (engine == exclude)
>  			continue;
>  
> -		if (wait_for(intel_engine_is_idle(engine), 10))
> +		if (!wait_for_idle(engine))
>  			return -EIO;
>  	}
>  
> -- 
> 2.17.0
>
Chris Wilson April 26, 2018, 4:34 p.m. UTC | #3
Quoting Michał Winiarski (2018-04-26 17:21:39)
> On Wed, Apr 11, 2018 at 01:03:46PM +0100, Chris Wilson wrote:
> > Even though we weren't injecting guilty requests to be reset, we could
> > still fall over the issue of resetting the same request too fast -- where
> > the GPU refuses to start again. (Although it is interesting to note that
> > reloading the driver is sufficient, suggesting that we could recover if
> > we delayed the setup after reset?) Continue to paper over the problem by
> > adding a small delay by waiting for the engine to idle between tests,
> > and ensure that the engines are idle before starting the idle tests.
> > 
> > References: 028666793a02 ("drm/i915/selftests: Avoid repeatedly harming the same innocent context")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 48 ++++++++++++++++++-
> >  1 file changed, 47 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 24f913f26a7b..7e23e6a6719c 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -454,6 +454,11 @@ static int igt_global_reset(void *arg)
> >       return err;
> >  }
> >  
> 
> #define IGT_IDLE_TIMEOUT 50 ?
> It should even fit within a line.
> 
> With or without that:
> 
> Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>

Done. Thanks for the review, pushed.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 24f913f26a7b..7e23e6a6719c 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -454,6 +454,11 @@  static int igt_global_reset(void *arg)
 	return err;
 }
 
+static bool wait_for_idle(struct intel_engine_cs *engine)
+{
+	return wait_for(intel_engine_is_idle(engine), 50) == 0;
+}
+
 static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 {
 	struct intel_engine_cs *engine;
@@ -481,6 +486,13 @@  static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 		if (active && !intel_engine_can_store_dword(engine))
 			continue;
 
+		if (!wait_for_idle(engine)) {
+			pr_err("%s failed to idle before reset\n",
+			       engine->name);
+			err = -EIO;
+			break;
+		}
+
 		reset_count = i915_reset_count(&i915->gpu_error);
 		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
 							     engine);
@@ -542,6 +554,19 @@  static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				err = -EINVAL;
 				break;
 			}
+
+			if (!wait_for_idle(engine)) {
+				struct drm_printer p =
+					drm_info_printer(i915->drm.dev);
+
+				pr_err("%s failed to idle after reset\n",
+				       engine->name);
+				intel_engine_dump(engine, &p,
+						  "%s\n", engine->name);
+
+				err = -EIO;
+				break;
+			}
 		} while (time_before(jiffies, end_time));
 		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 
@@ -696,6 +721,13 @@  static int __igt_reset_engines(struct drm_i915_private *i915,
 		    !intel_engine_can_store_dword(engine))
 			continue;
 
+		if (!wait_for_idle(engine)) {
+			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
+			       engine->name, test_name);
+			err = -EIO;
+			break;
+		}
+
 		memset(threads, 0, sizeof(threads));
 		for_each_engine(other, i915, tmp) {
 			struct task_struct *tsk;
@@ -772,6 +804,20 @@  static int __igt_reset_engines(struct drm_i915_private *i915,
 				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
 				i915_request_put(rq);
 			}
+
+			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
+				struct drm_printer p =
+					drm_info_printer(i915->drm.dev);
+
+				pr_err("i915_reset_engine(%s:%s):"
+				       " failed to idle after reset\n",
+				       engine->name, test_name);
+				intel_engine_dump(engine, &p,
+						  "%s\n", engine->name);
+
+				err = -EIO;
+				break;
+			}
 		} while (time_before(jiffies, end_time));
 		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
@@ -981,7 +1027,7 @@  static int wait_for_others(struct drm_i915_private *i915,
 		if (engine == exclude)
 			continue;
 
-		if (wait_for(intel_engine_is_idle(engine), 10))
+		if (!wait_for_idle(engine))
 			return -EIO;
 	}