diff mbox series

[6/6] drm/i915/gt: Lift set-wedged engine dumping out of user paths

Message ID 20200127231540.3302516-6-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/6] drm/i915: Skip capturing errors from internal contexts | expand

Commit Message

Chris Wilson Jan. 27, 2020, 11:15 p.m. UTC
The user (e.g. gem_eio) can manipulate the driver into wedging itself,
allowing the user to trigger voluminous logging of inconsequential
details. If we lift the engine dump out of the intel_reset failure
handling and into direct calls to intel_gt_set_wedged(), we keep the
detailed logging for what we expect to be true HW or test failures,
without being tricked into producing it.

Reported-by: Tomi Sarvela <tomi.p.sarvela@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 30 +++++++++++++++++++--------
 1 file changed, 21 insertions(+), 9 deletions(-)

Comments

Mika Kuoppala Jan. 28, 2020, 12:34 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> The user (e.g. gem_eio) can manipulate the driver into wedging itself,
> allowing the user to trigger voluminous logging of inconsequential
> details. If we lift the dump to direct calls to intel_gt_set_wedged(),
> out of the intel_reset failure handling, we keep the detail logging for
> what we expect are true HW or test failures without being tricked.
>
> Reported-by: Tomi Sarvela <tomi.p.sarvela@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_reset.c | 30 +++++++++++++++++++--------
>  1 file changed, 21 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index beee0cf89bce..423a02506b2d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -800,13 +800,6 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
>  	if (test_bit(I915_WEDGED, &gt->reset.flags))
>  		return;
>  
> -	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
> -		struct drm_printer p = drm_debug_printer(__func__);
> -
> -		for_each_engine(engine, gt, id)
> -			intel_engine_dump(engine, &p, "%s\n", engine->name);
> -	}
> -
>  	GT_TRACE(gt, "start\n");
>  
>  	/*
> @@ -845,10 +838,29 @@ void intel_gt_set_wedged(struct intel_gt *gt)
>  {
>  	intel_wakeref_t wakeref;
>  
> +	if (test_bit(I915_WEDGED, &gt->reset.flags))
> +		return;
> +
> +	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
>  	mutex_lock(&gt->reset.mutex);
> -	with_intel_runtime_pm(gt->uncore->rpm, wakeref)
> -		__intel_gt_set_wedged(gt);
> +
> +	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {

As you inspect each engine separately, the precursory idle check
seems superfluous.

-Mika

> +		struct drm_printer p = drm_debug_printer(__func__);
> +		struct intel_engine_cs *engine;
> +		enum intel_engine_id id;
> +
> +		for_each_engine(engine, gt, id) {
> +			if (intel_engine_is_idle(engine))
> +				continue;
> +
> +			intel_engine_dump(engine, &p, "%s\n", engine->name);
> +		}
> +	}
> +
> +	__intel_gt_set_wedged(gt);
> +
>  	mutex_unlock(&gt->reset.mutex);
> +	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>  }
>  
>  static bool __intel_gt_unset_wedged(struct intel_gt *gt)
> -- 
> 2.25.0
Chris Wilson Jan. 28, 2020, 12:46 p.m. UTC | #2
Quoting Mika Kuoppala (2020-01-28 12:34:42)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > The user (e.g. gem_eio) can manipulate the driver into wedging itself,
> > allowing the user to trigger voluminous logging of inconsequential
> > details. If we lift the dump to direct calls to intel_gt_set_wedged(),
> > out of the intel_reset failure handling, we keep the detail logging for
> > what we expect are true HW or test failures without being tricked.
> >
> > Reported-by: Tomi Sarvela <tomi.p.sarvela@intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_reset.c | 30 +++++++++++++++++++--------
> >  1 file changed, 21 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> > index beee0cf89bce..423a02506b2d 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> > @@ -800,13 +800,6 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
> >       if (test_bit(I915_WEDGED, &gt->reset.flags))
> >               return;
> >  
> > -     if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
> > -             struct drm_printer p = drm_debug_printer(__func__);
> > -
> > -             for_each_engine(engine, gt, id)
> > -                     intel_engine_dump(engine, &p, "%s\n", engine->name);
> > -     }
> > -
> >       GT_TRACE(gt, "start\n");
> >  
> >       /*
> > @@ -845,10 +838,29 @@ void intel_gt_set_wedged(struct intel_gt *gt)
> >  {
> >       intel_wakeref_t wakeref;
> >  
> > +     if (test_bit(I915_WEDGED, &gt->reset.flags))
> > +             return;
> > +
> > +     wakeref = intel_runtime_pm_get(gt->uncore->rpm);
> >       mutex_lock(&gt->reset.mutex);
> > -     with_intel_runtime_pm(gt->uncore->rpm, wakeref)
> > -             __intel_gt_set_wedged(gt);
> > +
> > +     if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
> 
> As you inspect each engine separately the precursory idle check
> seems superfluous.

Picky. We may throw some other debug in between :)
-Chris
Mika Kuoppala Jan. 28, 2020, 12:50 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2020-01-28 12:34:42)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > The user (e.g. gem_eio) can manipulate the driver into wedging itself,
>> > allowing the user to trigger voluminous logging of inconsequential
>> > details. If we lift the dump to direct calls to intel_gt_set_wedged(),
>> > out of the intel_reset failure handling, we keep the detail logging for
>> > what we expect are true HW or test failures without being tricked.
>> >
>> > Reported-by: Tomi Sarvela <tomi.p.sarvela@intel.com>
>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> > Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/gt/intel_reset.c | 30 +++++++++++++++++++--------
>> >  1 file changed, 21 insertions(+), 9 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
>> > index beee0cf89bce..423a02506b2d 100644
>> > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
>> > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>> > @@ -800,13 +800,6 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
>> >       if (test_bit(I915_WEDGED, &gt->reset.flags))
>> >               return;
>> >  
>> > -     if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
>> > -             struct drm_printer p = drm_debug_printer(__func__);
>> > -
>> > -             for_each_engine(engine, gt, id)
>> > -                     intel_engine_dump(engine, &p, "%s\n", engine->name);
>> > -     }
>> > -
>> >       GT_TRACE(gt, "start\n");
>> >  
>> >       /*
>> > @@ -845,10 +838,29 @@ void intel_gt_set_wedged(struct intel_gt *gt)
>> >  {
>> >       intel_wakeref_t wakeref;
>> >  
>> > +     if (test_bit(I915_WEDGED, &gt->reset.flags))
>> > +             return;
>> > +
>> > +     wakeref = intel_runtime_pm_get(gt->uncore->rpm);
>> >       mutex_lock(&gt->reset.mutex);
>> > -     with_intel_runtime_pm(gt->uncore->rpm, wakeref)
>> > -             __intel_gt_set_wedged(gt);
>> > +
>> > +     if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
>> 
>> As you inspect each engine separately the precursory idle check
>> seems superfluous.
>
> Picky. We may throw some other debug in between :)

Not picky. Just my tinfoil hat too tight.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> -Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index beee0cf89bce..423a02506b2d 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -800,13 +800,6 @@  static void __intel_gt_set_wedged(struct intel_gt *gt)
 	if (test_bit(I915_WEDGED, &gt->reset.flags))
 		return;
 
-	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
-		struct drm_printer p = drm_debug_printer(__func__);
-
-		for_each_engine(engine, gt, id)
-			intel_engine_dump(engine, &p, "%s\n", engine->name);
-	}
-
 	GT_TRACE(gt, "start\n");
 
 	/*
@@ -845,10 +838,29 @@  void intel_gt_set_wedged(struct intel_gt *gt)
 {
 	intel_wakeref_t wakeref;
 
+	if (test_bit(I915_WEDGED, &gt->reset.flags))
+		return;
+
+	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
 	mutex_lock(&gt->reset.mutex);
-	with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-		__intel_gt_set_wedged(gt);
+
+	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
+		struct drm_printer p = drm_debug_printer(__func__);
+		struct intel_engine_cs *engine;
+		enum intel_engine_id id;
+
+		for_each_engine(engine, gt, id) {
+			if (intel_engine_is_idle(engine))
+				continue;
+
+			intel_engine_dump(engine, &p, "%s\n", engine->name);
+		}
+	}
+
+	__intel_gt_set_wedged(gt);
+
 	mutex_unlock(&gt->reset.mutex);
+	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
 }
 
 static bool __intel_gt_unset_wedged(struct intel_gt *gt)