[v2] drm/i915/selftests: Include the trace as a debug aide

Message ID 20180322074908.10838-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson March 22, 2018, 7:49 a.m. UTC
If we fail to reset the GPU in a timely fashion, dump the GEM trace so
that we can see what operations were in flight when the GPU got stuck.

v2: There's more than one timeout that deserves tracing!
v3: Silence checkpatch by not even using a product at all!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)
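
For context: GEM_TRACE() and GEM_TRACE_DUMP(), used below, are i915 debug helpers. The following is only a rough approximation of their definitions in drivers/gpu/drm/i915/i915_gem.h around this time, included as a reference and not part of the patch:

/* Sketch (assumed, not authoritative): with CONFIG_DRM_I915_TRACE_GEM
 * enabled, GEM_TRACE() feeds the ftrace ring buffer via trace_printk()
 * and GEM_TRACE_DUMP() spills that buffer to the kernel log; otherwise
 * both compile away to nothing.
 */
#if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
#define GEM_TRACE(...)   trace_printk(__VA_ARGS__)
#define GEM_TRACE_DUMP() ftrace_dump(DUMP_ALL)
#else
#define GEM_TRACE(...)   do { } while (0)
#define GEM_TRACE_DUMP() do { } while (0)
#endif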

Comments

Mika Kuoppala March 22, 2018, 2:26 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> that we can see what operations were in flight when the GPU got stuck.
>
> v2: There's more than one timeout that deserves tracing!
> v3: Silence checkpatch by not even using a product at all!
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 4372826998aa..9b235dae8dd9 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
>  {
>  	struct wedge_me *w = container_of(work, typeof(*w), work.work);
>  
> -	pr_err("%pS timed out, cancelling all further testing.\n",
> -	       w->symbol);
> +	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> +
> +	GEM_TRACE("%pS timed out.\n", w->symbol);
> +	GEM_TRACE_DUMP();
> +
>  	i915_gem_set_wedged(w->i915);
>  }
>  
> @@ -621,9 +624,19 @@ static int active_engine(void *data)
>  		mutex_unlock(&engine->i915->drm.struct_mutex);
>  
>  		if (old) {
> -			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> +			if (i915_request_wait(old, 0, HZ) < 0) {
> +				GEM_TRACE("%s timed out.\n", engine->name);
> +				GEM_TRACE_DUMP();
> +
> +				i915_gem_set_wedged(engine->i915);
> +				i915_request_put(old);
> +				err = -EIO;
> +				break;
> +			}

Using err = i915_request_wait() could have saved one extra request_put
but I dunno if it would be any cleaner.

>  			i915_request_put(old);
>  		}
> +
> +		cond_resched();

To give more slack for other engines and main thread to proceed?

>  	}
>  
>  	for (count = 0; count < ARRAY_SIZE(rq); count++)
> @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>  
>  	err = i915_subtests(tests, i915);
>  
> +	mutex_lock(&i915->drm.struct_mutex);
> +	flush_test(i915, I915_WAIT_LOCKED);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +

To wash out leftovers.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

>  	i915_modparams.enable_hangcheck = saved_hangcheck;
>  	intel_runtime_pm_put(i915);
>  
Chris Wilson March 22, 2018, 2:30 p.m. UTC | #2
Quoting Mika Kuoppala (2018-03-22 14:26:41)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > [snip]
> > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> >               mutex_unlock(&engine->i915->drm.struct_mutex);
> >  
> >               if (old) {
> > -                     i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > +                     if (i915_request_wait(old, 0, HZ) < 0) {
> > +                             GEM_TRACE("%s timed out.\n", engine->name);
> > +                             GEM_TRACE_DUMP();
> > +
> > +                             i915_gem_set_wedged(engine->i915);
> > +                             i915_request_put(old);
> > +                             err = -EIO;
> > +                             break;
> > +                     }
> 
> Using err = i915_request_wait() could have saved one extra request_put
> but I dunno if it would be any cleaner.

It's also -ETIME, which didn't fit my intention.
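
For illustration, a minimal sketch of the variant Mika suggests, assuming i915_request_wait() returns the remaining jiffies on success and -ETIME on timeout; the patch keeps the explicit check instead so a timeout is reported as -EIO (a hang) rather than -ETIME:

		err = i915_request_wait(old, 0, HZ);
		if (err < 0)			/* -ETIME on timeout */
			i915_gem_set_wedged(engine->i915);
		else
			err = 0;		/* drop the remaining jiffies */
		i915_request_put(old);		/* single put on both paths */
		if (err < 0)
			break;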

> 
> >                       i915_request_put(old);
> >               }
> > +
> > +             cond_resched();
> 
> To give more slack for other engines and main thread to proceed?

Yes. Otherwise, it spins mighty fine.
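
That is, each iteration of the active_engine() kthread loop becomes a voluntary scheduling point (a sketch of the shape of the loop, not the literal code):

	while (!kthread_should_stop()) {
		/* ... submit a fresh request, wait for and put the old one ... */

		/*
		 * Yield here so the other per-engine kthreads and the main
		 * test thread get CPU time; without it this loop can spin
		 * and starve them.
		 */
		cond_resched();
	}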
> 
> >       }
> >  
> >       for (count = 0; count < ARRAY_SIZE(rq); count++)
> > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> >  
> >       err = i915_subtests(tests, i915);
> >  
> > +     mutex_lock(&i915->drm.struct_mutex);
> > +     flush_test(i915, I915_WAIT_LOCKED);
> > +     mutex_unlock(&i915->drm.struct_mutex);
> > +
> 
> To wash out leftovers.

Yeah, from the early abort we left requests unaccounted for and needed
to grab the struct_mutex to run retire. Otherwise we hit assertions
later on about trying to unload the driver with requests still inflight.
-Chris
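
In other words, the teardown ends up doing roughly the following, with flush_test() assumed here to wait for the GPU to idle and retire completed requests while holding the lock:

	/* An early -EIO abort can leave requests that were never retired;
	 * flush and retire them under struct_mutex so the later
	 * driver-unload assertions about in-flight requests do not fire.
	 */
	mutex_lock(&i915->drm.struct_mutex);
	flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);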
jeff.mcgee@intel.com March 22, 2018, 7:29 p.m. UTC | #3
On Thu, Mar 22, 2018 at 02:30:09PM +0000, Chris Wilson wrote:
> [snip]

On this and the 3 others in this series...

Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
Chris Wilson March 22, 2018, 8:37 p.m. UTC | #4
Quoting Jeff McGee (2018-03-22 19:29:16)
> [snip]
> 
> On this and the 3 others in this series...
> 
> Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>

Much appreciated. Pushed, let's hope CI holds up and that we're ready to
start talking about the real changes required for forced preemption as
opposed to getting the existing code working.
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4372826998aa..9b235dae8dd9 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
 {
 	struct wedge_me *w = container_of(work, typeof(*w), work.work);
 
-	pr_err("%pS timed out, cancelling all further testing.\n",
-	       w->symbol);
+	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
+
+	GEM_TRACE("%pS timed out.\n", w->symbol);
+	GEM_TRACE_DUMP();
+
 	i915_gem_set_wedged(w->i915);
 }
 
@@ -621,9 +624,19 @@ static int active_engine(void *data)
 		mutex_unlock(&engine->i915->drm.struct_mutex);
 
 		if (old) {
-			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
+			if (i915_request_wait(old, 0, HZ) < 0) {
+				GEM_TRACE("%s timed out.\n", engine->name);
+				GEM_TRACE_DUMP();
+
+				i915_gem_set_wedged(engine->i915);
+				i915_request_put(old);
+				err = -EIO;
+				break;
+			}
 			i915_request_put(old);
 		}
+
+		cond_resched();
 	}
 
 	for (count = 0; count < ARRAY_SIZE(rq); count++)
@@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 
 	err = i915_subtests(tests, i915);
 
+	mutex_lock(&i915->drm.struct_mutex);
+	flush_test(i915, I915_WAIT_LOCKED);
+	mutex_unlock(&i915->drm.struct_mutex);
+
 	i915_modparams.enable_hangcheck = saved_hangcheck;
 	intel_runtime_pm_put(i915);