diff mbox series

[2/2] drm/i915: Force reset on unready engine

Message ID 20180810140036.24240-2-mika.kuoppala@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/i915: Expose retry count to per gen reset logic | expand

Commit Message

Mika Kuoppala Aug. 10, 2018, 2 p.m. UTC
If engine reports that it is not ready for reset, we
give up. Evidence shows that forcing a per engine reset
on an engine which is not reporting to be ready for reset,
can bring it back into a working order. There is risk that
we corrupt the context image currently executing on that
engine. But that is a risk worth taking as if we unblock
the engine, we prevent a whole device wedging in a case
of full gpu reset.

Reset individual engine even if it reports that it is not
prepared for reset, but only if we aim for full gpu reset
and not on first reset attempt.

v2: force reset only on later attempts, readability (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 10 deletions(-)

Comments

Chris Wilson Aug. 10, 2018, 2:13 p.m. UTC | #1
Quoting Mika Kuoppala (2018-08-10 15:00:36)
> If engine reports that it is not ready for reset, we
> give up. Evidence shows that forcing a per engine reset
> on an engine which is not reporting to be ready for reset,
> can bring it back into a working order. There is risk that
> we corrupt the context image currently executing on that
> engine. But that is a risk worth taking as if we unblock
> the engine, we prevent a whole device wedging in a case
> of full gpu reset.
> 
> Reset individual engine even if it reports that it is not
> prepared for reset, but only if we aim for full gpu reset
> and not on first reset attempt.
> 
> v2: force reset only on later attempts, readability (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
>  1 file changed, 39 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 027d14574bfa..d24026839b17 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
>                                            RESET_CTL_READY_TO_RESET,
>                                            700, 0,
>                                            NULL);
> -       if (ret)
> -               DRM_ERROR("%s: reset request timeout\n", engine->name);
> -
>         return ret;
>  }
>  
> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
>                       _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
>  }
>  
> +static int reset_engines(struct drm_i915_private *i915,
> +                        unsigned int engine_mask,
> +                        unsigned int retry)
> +{
> +       if (INTEL_GEN(i915) >= 11)
> +               return gen11_reset_engines(i915, engine_mask);
> +       else
> +               return gen6_reset_engines(i915, engine_mask, retry);
> +}
> +
> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
> +                                        unsigned int retry)
> +{
> +       const bool force_reset_on_non_ready = retry >= 1;
> +       int ret;
> +
> +       ret = gen8_reset_engine_start(engine);
> +
> +       if (ret && force_reset_on_non_ready) {
> +               /*
> +                * Try to unblock a single non-ready engine by risking
> +                * context corruption.
> +                */
> +               ret = reset_engines(engine->i915,
> +                                   intel_engine_flag(engine),
> +                                   retry);
> +               if (!ret)
> +                       ret = gen8_reset_engine_start(engine);
> +
> +               DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
> +                         engine->name, ret);

This looks dubious now ;)

After the force you then do a reset in the caller. Twice the reset for
twice the unpreparedness.

> +       }
> +
> +       return ret;
> +}
> +
>  static int gen8_reset_engines(struct drm_i915_private *dev_priv,
>                               unsigned int engine_mask,
>                               unsigned int retry)
> @@ -2122,16 +2155,12 @@ static int gen8_reset_engines(struct drm_i915_private *dev_priv,
>         int ret;
>  
>         for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -               if (gen8_reset_engine_start(engine)) {
> -                       ret = -EIO;
> +               ret = gen8_prepare_engine_for_reset(engine, retry);
> +               if (ret)

if (ret && !force_reset_unread)

>                         goto not_ready;
> -               }
>         }
>  
> -       if (INTEL_GEN(dev_priv) >= 11)
> -               ret = gen11_reset_engines(dev_priv, engine_mask);
> -       else
> -               ret = gen6_reset_engines(dev_priv, engine_mask, retry);
> +       ret = reset_engines(dev_priv, engine_mask, retry);
>  
>  not_ready:
>         for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -- 
> 2.17.1
>
Mika Kuoppala Aug. 13, 2018, 8:18 a.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-08-10 15:00:36)
>> If engine reports that it is not ready for reset, we
>> give up. Evidence shows that forcing a per engine reset
>> on an engine which is not reporting to be ready for reset,
>> can bring it back into a working order. There is risk that
>> we corrupt the context image currently executing on that
>> engine. But that is a risk worth taking as if we unblock
>> the engine, we prevent a whole device wedging in a case
>> of full gpu reset.
>> 
>> Reset individual engine even if it reports that it is not
>> prepared for reset, but only if we aim for full gpu reset
>> and not on first reset attempt.
>> 
>> v2: force reset only on later attempts, readability (Chris)
>> 
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> ---
>>  drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
>>  1 file changed, 39 insertions(+), 10 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>> index 027d14574bfa..d24026839b17 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
>>                                            RESET_CTL_READY_TO_RESET,
>>                                            700, 0,
>>                                            NULL);
>> -       if (ret)
>> -               DRM_ERROR("%s: reset request timeout\n", engine->name);
>> -
>>         return ret;
>>  }
>>  
>> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
>>                       _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
>>  }
>>  
>> +static int reset_engines(struct drm_i915_private *i915,
>> +                        unsigned int engine_mask,
>> +                        unsigned int retry)
>> +{
>> +       if (INTEL_GEN(i915) >= 11)
>> +               return gen11_reset_engines(i915, engine_mask);
>> +       else
>> +               return gen6_reset_engines(i915, engine_mask, retry);
>> +}
>> +
>> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
>> +                                        unsigned int retry)
>> +{
>> +       const bool force_reset_on_non_ready = retry >= 1;
>> +       int ret;
>> +
>> +       ret = gen8_reset_engine_start(engine);
>> +
>> +       if (ret && force_reset_on_non_ready) {
>> +               /*
>> +                * Try to unblock a single non-ready engine by risking
>> +                * context corruption.
>> +                */
>> +               ret = reset_engines(engine->i915,
>> +                                   intel_engine_flag(engine),
>> +                                   retry);
>> +               if (!ret)
>> +                       ret = gen8_reset_engine_start(engine);
>> +
>> +               DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
>> +                         engine->name, ret);
>
> This looks dubious now ;)
>
> After the force you then do a reset in the caller. Twice the reset for
> twice the unpreparedness.

It is intentional. First we make the engine unstuck and then do
a full cycle with ready to reset involved. I don't know if
it really matters tho. It could be that the engine is already
in pristine condition after first.

I considered the engine reset here to only be the hammer,
and then the reset afterwards to be the validation.

-Mika

>
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>  static int gen8_reset_engines(struct drm_i915_private *dev_priv,
>>                               unsigned int engine_mask,
>>                               unsigned int retry)
>> @@ -2122,16 +2155,12 @@ static int gen8_reset_engines(struct drm_i915_private *dev_priv,
>>         int ret;
>>  
>>         for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
>> -               if (gen8_reset_engine_start(engine)) {
>> -                       ret = -EIO;
>> +               ret = gen8_prepare_engine_for_reset(engine, retry);
>> +               if (ret)
>
> if (ret && !force_reset_unread)
>
>>                         goto not_ready;
>> -               }
>>         }
>>  
>> -       if (INTEL_GEN(dev_priv) >= 11)
>> -               ret = gen11_reset_engines(dev_priv, engine_mask);
>> -       else
>> -               ret = gen6_reset_engines(dev_priv, engine_mask, retry);
>> +       ret = reset_engines(dev_priv, engine_mask, retry);
>>  
>>  not_ready:
>>         for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
>> -- 
>> 2.17.1
>>
Chris Wilson Aug. 13, 2018, 8:32 a.m. UTC | #3
Quoting Mika Kuoppala (2018-08-13 09:18:07)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2018-08-10 15:00:36)
> >> If engine reports that it is not ready for reset, we
> >> give up. Evidence shows that forcing a per engine reset
> >> on an engine which is not reporting to be ready for reset,
> >> can bring it back into a working order. There is risk that
> >> we corrupt the context image currently executing on that
> >> engine. But that is a risk worth taking as if we unblock
> >> the engine, we prevent a whole device wedging in a case
> >> of full gpu reset.
> >> 
> >> Reset individual engine even if it reports that it is not
> >> prepared for reset, but only if we aim for full gpu reset
> >> and not on first reset attempt.
> >> 
> >> v2: force reset only on later attempts, readability (Chris)
> >> 
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
> >>  1 file changed, 39 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> >> index 027d14574bfa..d24026839b17 100644
> >> --- a/drivers/gpu/drm/i915/intel_uncore.c
> >> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> >> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
> >>                                            RESET_CTL_READY_TO_RESET,
> >>                                            700, 0,
> >>                                            NULL);
> >> -       if (ret)
> >> -               DRM_ERROR("%s: reset request timeout\n", engine->name);
> >> -
> >>         return ret;
> >>  }
> >>  
> >> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
> >>                       _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> >>  }
> >>  
> >> +static int reset_engines(struct drm_i915_private *i915,
> >> +                        unsigned int engine_mask,
> >> +                        unsigned int retry)
> >> +{
> >> +       if (INTEL_GEN(i915) >= 11)
> >> +               return gen11_reset_engines(i915, engine_mask);
> >> +       else
> >> +               return gen6_reset_engines(i915, engine_mask, retry);
> >> +}
> >> +
> >> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
> >> +                                        unsigned int retry)
> >> +{
> >> +       const bool force_reset_on_non_ready = retry >= 1;
> >> +       int ret;
> >> +
> >> +       ret = gen8_reset_engine_start(engine);
> >> +
> >> +       if (ret && force_reset_on_non_ready) {
> >> +               /*
> >> +                * Try to unblock a single non-ready engine by risking
> >> +                * context corruption.
> >> +                */
> >> +               ret = reset_engines(engine->i915,
> >> +                                   intel_engine_flag(engine),
> >> +                                   retry);
> >> +               if (!ret)
> >> +                       ret = gen8_reset_engine_start(engine);
> >> +
> >> +               DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
> >> +                         engine->name, ret);
> >
> > This looks dubious now ;)
> >
> > After the force you then do a reset in the caller. Twice the reset for
> > twice the unpreparedness.
> 
> It is intentional. First we make the engine unstuck and then do
> a full cycle with ready to reset involved. I don't know if
> it really matters tho. It could be that the engine is already
> in pristine condition after first.

Looks extremely weird way of going about it. If you want to do a double
reset after the first try fails, try

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 4e5826045cbf..6fe137d7d455 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -450,6 +450,8 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
                        GEM_TRACE("engine_mask=%x\n", engine_mask);
                        preempt_disable();
                        ret = reset(i915, engine_mask);
+                       if (retry > 0 && !ret) /* Double check reset worked */
+                               ret = reset(i915, engine_mask);
                        preempt_enable();
                }
                if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)

as a separate patch.

P.S. you really need to review the i915_reset.c patch ;)
-Chris
Mika Kuoppala Aug. 13, 2018, 9:58 a.m. UTC | #4
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-08-13 09:18:07)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Mika Kuoppala (2018-08-10 15:00:36)
>> >> If engine reports that it is not ready for reset, we
>> >> give up. Evidence shows that forcing a per engine reset
>> >> on an engine which is not reporting to be ready for reset,
>> >> can bring it back into a working order. There is risk that
>> >> we corrupt the context image currently executing on that
>> >> engine. But that is a risk worth taking as if we unblock
>> >> the engine, we prevent a whole device wedging in a case
>> >> of full gpu reset.
>> >> 
>> >> Reset individual engine even if it reports that it is not
>> >> prepared for reset, but only if we aim for full gpu reset
>> >> and not on first reset attempt.
>> >> 
>> >> v2: force reset only on later attempts, readability (Chris)
>> >> 
>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> >> ---
>> >>  drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
>> >>  1 file changed, 39 insertions(+), 10 deletions(-)
>> >> 
>> >> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>> >> index 027d14574bfa..d24026839b17 100644
>> >> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> >> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> >> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
>> >>                                            RESET_CTL_READY_TO_RESET,
>> >>                                            700, 0,
>> >>                                            NULL);
>> >> -       if (ret)
>> >> -               DRM_ERROR("%s: reset request timeout\n", engine->name);
>> >> -
>> >>         return ret;
>> >>  }
>> >>  
>> >> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
>> >>                       _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
>> >>  }
>> >>  
>> >> +static int reset_engines(struct drm_i915_private *i915,
>> >> +                        unsigned int engine_mask,
>> >> +                        unsigned int retry)
>> >> +{
>> >> +       if (INTEL_GEN(i915) >= 11)
>> >> +               return gen11_reset_engines(i915, engine_mask);
>> >> +       else
>> >> +               return gen6_reset_engines(i915, engine_mask, retry);
>> >> +}
>> >> +
>> >> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
>> >> +                                        unsigned int retry)
>> >> +{
>> >> +       const bool force_reset_on_non_ready = retry >= 1;
>> >> +       int ret;
>> >> +
>> >> +       ret = gen8_reset_engine_start(engine);
>> >> +
>> >> +       if (ret && force_reset_on_non_ready) {
>> >> +               /*
>> >> +                * Try to unblock a single non-ready engine by risking
>> >> +                * context corruption.
>> >> +                */
>> >> +               ret = reset_engines(engine->i915,
>> >> +                                   intel_engine_flag(engine),
>> >> +                                   retry);
>> >> +               if (!ret)
>> >> +                       ret = gen8_reset_engine_start(engine);
>> >> +
>> >> +               DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
>> >> +                         engine->name, ret);
>> >
>> > This looks dubious now ;)
>> >
>> > After the force you then do a reset in the caller. Twice the reset for
>> > twice the unpreparedness.
>> 
>> It is intentional. First we make the engine unstuck and then do
>> a full cycle with ready to reset involved. I don't know if
>> it really matters tho. It could be that the engine is already
>> in pristine condition after first.
>
> Looks extremely weird way of going about it. If you want to do a double
> reset after the first try fails, try

The crux is: failing to reset or failing to preparing to reset.
It is the ready for reset dance not working that we are trying to
by doing extra per engine reset. So trying in the higher lever,
again with the same preconditions would not help.

-Mika

>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 4e5826045cbf..6fe137d7d455 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -450,6 +450,8 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
>                         GEM_TRACE("engine_mask=%x\n", engine_mask);
>                         preempt_disable();
>                         ret = reset(i915, engine_mask);
> +                       if (retry > 0 && !ret) /* Double check reset worked */
> +                               ret = reset(i915, engine_mask);
>                         preempt_enable();
>                 }
>                 if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
>
> as a separate patch.
>
> P.S. you really need to review the i915_reset.c patch ;)
> -Chris
Chris Wilson Aug. 13, 2018, 10:03 a.m. UTC | #5
Quoting Mika Kuoppala (2018-08-13 10:58:10)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2018-08-13 09:18:07)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> 
> >> > Quoting Mika Kuoppala (2018-08-10 15:00:36)
> >> >> If engine reports that it is not ready for reset, we
> >> >> give up. Evidence shows that forcing a per engine reset
> >> >> on an engine which is not reporting to be ready for reset,
> >> >> can bring it back into a working order. There is risk that
> >> >> we corrupt the context image currently executing on that
> >> >> engine. But that is a risk worth taking as if we unblock
> >> >> the engine, we prevent a whole device wedging in a case
> >> >> of full gpu reset.
> >> >> 
> >> >> Reset individual engine even if it reports that it is not
> >> >> prepared for reset, but only if we aim for full gpu reset
> >> >> and not on first reset attempt.
> >> >> 
> >> >> v2: force reset only on later attempts, readability (Chris)
> >> >> 
> >> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >> >> ---
> >> >>  drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------
> >> >>  1 file changed, 39 insertions(+), 10 deletions(-)
> >> >> 
> >> >> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> >> >> index 027d14574bfa..d24026839b17 100644
> >> >> --- a/drivers/gpu/drm/i915/intel_uncore.c
> >> >> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> >> >> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine)
> >> >>                                            RESET_CTL_READY_TO_RESET,
> >> >>                                            700, 0,
> >> >>                                            NULL);
> >> >> -       if (ret)
> >> >> -               DRM_ERROR("%s: reset request timeout\n", engine->name);
> >> >> -
> >> >>         return ret;
> >> >>  }
> >> >>  
> >> >> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
> >> >>                       _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> >> >>  }
> >> >>  
> >> >> +static int reset_engines(struct drm_i915_private *i915,
> >> >> +                        unsigned int engine_mask,
> >> >> +                        unsigned int retry)
> >> >> +{
> >> >> +       if (INTEL_GEN(i915) >= 11)
> >> >> +               return gen11_reset_engines(i915, engine_mask);
> >> >> +       else
> >> >> +               return gen6_reset_engines(i915, engine_mask, retry);
> >> >> +}
> >> >> +
> >> >> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
> >> >> +                                        unsigned int retry)
> >> >> +{
> >> >> +       const bool force_reset_on_non_ready = retry >= 1;
> >> >> +       int ret;
> >> >> +
> >> >> +       ret = gen8_reset_engine_start(engine);
> >> >> +
> >> >> +       if (ret && force_reset_on_non_ready) {
> >> >> +               /*
> >> >> +                * Try to unblock a single non-ready engine by risking
> >> >> +                * context corruption.
> >> >> +                */
> >> >> +               ret = reset_engines(engine->i915,
> >> >> +                                   intel_engine_flag(engine),
> >> >> +                                   retry);
> >> >> +               if (!ret)
> >> >> +                       ret = gen8_reset_engine_start(engine);
> >> >> +
> >> >> +               DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
> >> >> +                         engine->name, ret);
> >> >
> >> > This looks dubious now ;)
> >> >
> >> > After the force you then do a reset in the caller. Twice the reset for
> >> > twice the unpreparedness.
> >> 
> >> It is intentional. First we make the engine unstuck and then do
> >> a full cycle with ready to reset involved. I don't know if
> >> it really matters tho. It could be that the engine is already
> >> in pristine condition after first.
> >
> > Looks extremely weird way of going about it. If you want to do a double
> > reset after the first try fails, try
> 
> The crux is: failing to reset or failing to preparing to reset.
> It is the ready for reset dance not working that we are trying to
> by doing extra per engine reset. So trying in the higher lever,
> again with the same preconditions would not help.

No, you made it into two.

The first is that we don't want to skip the reset even if we fail to
prepare (as a last resort).

The second is that you added the double reset.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 027d14574bfa..d24026839b17 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -2099,9 +2099,6 @@  static int gen8_reset_engine_start(struct intel_engine_cs *engine)
 					   RESET_CTL_READY_TO_RESET,
 					   700, 0,
 					   NULL);
-	if (ret)
-		DRM_ERROR("%s: reset request timeout\n", engine->name);
-
 	return ret;
 }
 
@@ -2113,6 +2110,42 @@  static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
 		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
 }
 
+static int reset_engines(struct drm_i915_private *i915,
+			 unsigned int engine_mask,
+			 unsigned int retry)
+{
+	if (INTEL_GEN(i915) >= 11)
+		return gen11_reset_engines(i915, engine_mask);
+	else
+		return gen6_reset_engines(i915, engine_mask, retry);
+}
+
+static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine,
+					 unsigned int retry)
+{
+	const bool force_reset_on_non_ready = retry >= 1;
+	int ret;
+
+	ret = gen8_reset_engine_start(engine);
+
+	if (ret && force_reset_on_non_ready) {
+		/*
+		 * Try to unblock a single non-ready engine by risking
+		 * context corruption.
+		 */
+		ret = reset_engines(engine->i915,
+				    intel_engine_flag(engine),
+				    retry);
+		if (!ret)
+			ret = gen8_reset_engine_start(engine);
+
+		DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n",
+			  engine->name, ret);
+	}
+
+	return ret;
+}
+
 static int gen8_reset_engines(struct drm_i915_private *dev_priv,
 			      unsigned int engine_mask,
 			      unsigned int retry)
@@ -2122,16 +2155,12 @@  static int gen8_reset_engines(struct drm_i915_private *dev_priv,
 	int ret;
 
 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-		if (gen8_reset_engine_start(engine)) {
-			ret = -EIO;
+		ret = gen8_prepare_engine_for_reset(engine, retry);
+		if (ret)
 			goto not_ready;
-		}
 	}
 
-	if (INTEL_GEN(dev_priv) >= 11)
-		ret = gen11_reset_engines(dev_priv, engine_mask);
-	else
-		ret = gen6_reset_engines(dev_priv, engine_mask, retry);
+	ret = reset_engines(dev_priv, engine_mask, retry);
 
 not_ready:
 	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)