diff mbox series

[v2] drm/i915: fix SFC reset flow

Message ID 20190919015330.15435-1-daniele.ceraolospurio@intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915: fix SFC reset flow | expand

Commit Message

Daniele Ceraolo Spurio Sept. 19, 2019, 1:53 a.m. UTC
Our assumption that we can ask the HW to lock the SFC even if not
currently in use does not match the HW commitment. The expectation from
the HW is that SW will not try to lock the SFC if the engine is not
using it and if we do that the behavior is undefined; on ICL the HW
ends up returning the ack and ignoring our lock request, but this is
not guaranteed and we shouldn't expect it going forward.

Also, failing to get the ack while the SFC is in use means that we can't
cleanly reset it, so fail the engine reset in that scenario.

v2: drop rmw change, keep the log as debug and handle failure (Chris),
    improve comments (Tvrtko).

Reported-by: Owen Zhang <owen.zhang@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 51 +++++++++++++++++----------
 1 file changed, 33 insertions(+), 18 deletions(-)

Comments

Chris Wilson Sept. 19, 2019, 7:51 a.m. UTC | #1
Quoting Daniele Ceraolo Spurio (2019-09-19 02:53:30)
> Our assumption that the we can ask the HW to lock the SFC even if not
> currently in use does not match the HW commitment. The expectation from
> the HW is that SW will not try to lock the SFC if the engine is not
> using it and if we do that the behavior is undefined; on ICL the HW
> ends up to returning the ack and ignoring our lock request, but this is
> not guaranteed and we shouldn't expect it going forward.
> 
> Also, failing to get the ack while the SFC is in use means that we can't
> cleanly reset it, so fail the engine reset in that scenario.
> 
> v2: drop rmw change, keep the log as debug and handle failure (Chris),
>     improve comments (Tvrtko).
> 
> Reported-by: Owen Zhang <owen.zhang@intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/intel_reset.c | 51 +++++++++++++++++----------
>  1 file changed, 33 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 8327220ac558..797cf50625cb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -309,7 +309,7 @@ static int gen6_reset_engines(struct intel_gt *gt,
>         return gen6_hw_domain_reset(gt, hw_mask);
>  }
>  
> -static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
> +static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
>  {
>         struct intel_uncore *uncore = engine->uncore;
>         u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
> @@ -318,6 +318,7 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>         i915_reg_t sfc_usage;
>         u32 sfc_usage_bit;
>         u32 sfc_reset_bit;
> +       int ret;
>  
>         switch (engine->class) {
>         case VIDEO_DECODE_CLASS:
> @@ -352,28 +353,33 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>         }
>  
>         /*
> -        * Tell the engine that a software reset is going to happen. The engine
> -        * will then try to force lock the SFC (if currently locked, it will
> -        * remain so until we tell the engine it is safe to unlock; if currently
> -        * unlocked, it will ignore this and all new lock requests). If SFC
> -        * ends up being locked to the engine we want to reset, we have to reset
> -        * it as well (we will unlock it once the reset sequence is completed).
> +        * If the engine is using a SFC, tell the engine that a software reset
> +        * is going to happen. The engine will then try to force lock the SFC.
> +        * If SFC ends up being locked to the engine we want to reset, we have
> +        * to reset it as well (we will unlock it once the reset sequence is
> +        * completed).
>          */
> +       if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
> +               return 0;
> +
>         rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
>  
> -       if (__intel_wait_for_register_fw(uncore,
> -                                        sfc_forced_lock_ack,
> -                                        sfc_forced_lock_ack_bit,
> -                                        sfc_forced_lock_ack_bit,
> -                                        1000, 0, NULL)) {
> -               DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +       ret = __intel_wait_for_register_fw(uncore,
> +                                          sfc_forced_lock_ack,
> +                                          sfc_forced_lock_ack_bit,
> +                                          sfc_forced_lock_ack_bit,
> +                                          1000, 0, NULL);
> +
> +       /* was the SFC released while we were trying to lock it? */
> +       if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
>                 return 0;
> -       }
>  
> -       if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
> -               return sfc_reset_bit;
> +       if (ret)
> +               DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");

This unnerves me, as a failure in lock_sfc would result in lots of
full-gpu resets. However, I do believe that we catch those fallbacks in our
selftests -- so the tests that were triggering the wait timeout must be
taking the earlier return for sfc_usage == 0.

Not having per-engine reset is greatly upsetting; we are baking the
assumption we can reset engines & contexts independently into our system
management. We rely on it...

However, this passed on icl so it can't be all bad!
Acked-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Tvrtko Ursulin Sept. 19, 2019, 9:34 a.m. UTC | #2
On 19/09/2019 02:53, Daniele Ceraolo Spurio wrote:
> Our assumption that the we can ask the HW to lock the SFC even if not
> currently in use does not match the HW commitment. The expectation from
> the HW is that SW will not try to lock the SFC if the engine is not
> using it and if we do that the behavior is undefined; on ICL the HW
> ends up to returning the ack and ignoring our lock request, but this is
> not guaranteed and we shouldn't expect it going forward.
> 
> Also, failing to get the ack while the SFC is in use means that we can't
> cleanly reset it, so fail the engine reset in that scenario.
> 
> v2: drop rmw change, keep the log as debug and handle failure (Chris),
>      improve comments (Tvrtko).
> 
> Reported-by: Owen Zhang <owen.zhang@intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gt/intel_reset.c | 51 +++++++++++++++++----------
>   1 file changed, 33 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 8327220ac558..797cf50625cb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -309,7 +309,7 @@ static int gen6_reset_engines(struct intel_gt *gt,
>   	return gen6_hw_domain_reset(gt, hw_mask);
>   }
>   
> -static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
> +static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
>   {
>   	struct intel_uncore *uncore = engine->uncore;
>   	u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
> @@ -318,6 +318,7 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>   	i915_reg_t sfc_usage;
>   	u32 sfc_usage_bit;
>   	u32 sfc_reset_bit;
> +	int ret;
>   
>   	switch (engine->class) {
>   	case VIDEO_DECODE_CLASS:
> @@ -352,28 +353,33 @@ static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>   	}
>   
>   	/*
> -	 * Tell the engine that a software reset is going to happen. The engine
> -	 * will then try to force lock the SFC (if currently locked, it will
> -	 * remain so until we tell the engine it is safe to unlock; if currently
> -	 * unlocked, it will ignore this and all new lock requests). If SFC
> -	 * ends up being locked to the engine we want to reset, we have to reset
> -	 * it as well (we will unlock it once the reset sequence is completed).
> +	 * If the engine is using a SFC, tell the engine that a software reset
> +	 * is going to happen. The engine will then try to force lock the SFC.
> +	 * If SFC ends up being locked to the engine we want to reset, we have
> +	 * to reset it as well (we will unlock it once the reset sequence is
> +	 * completed).
>   	 */
> +	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
> +		return 0;
> +
>   	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
>   
> -	if (__intel_wait_for_register_fw(uncore,
> -					 sfc_forced_lock_ack,
> -					 sfc_forced_lock_ack_bit,
> -					 sfc_forced_lock_ack_bit,
> -					 1000, 0, NULL)) {
> -		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +	ret = __intel_wait_for_register_fw(uncore,
> +					   sfc_forced_lock_ack,
> +					   sfc_forced_lock_ack_bit,
> +					   sfc_forced_lock_ack_bit,
> +					   1000, 0, NULL);
> +
> +	/* was the SFC released while we were trying to lock it? */
> +	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
>   		return 0;
> -	}
>   
> -	if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
> -		return sfc_reset_bit;
> +	if (ret)
> +		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +	else
> +		*hw_mask |= sfc_reset_bit;
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static void gen11_unlock_sfc(struct intel_engine_cs *engine)
> @@ -430,12 +436,21 @@ static int gen11_reset_engines(struct intel_gt *gt,
>   		for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
>   			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
>   			hw_mask |= hw_engine_mask[engine->id];
> -			hw_mask |= gen11_lock_sfc(engine);
> +			ret = gen11_lock_sfc(engine, &hw_mask);
> +			if (ret)
> +				goto sfc_unlock;

Break on first failure looks unsafe to me. I think it would be more 
robust to continue, no? Like if we have been asked to reset multiple 
engines and only one failed, why not do the ones we can?

>   		}
>   	}
>   
>   	ret = gen6_hw_domain_reset(gt, hw_mask);
>   
> +sfc_unlock:
> +	/*
> +	 * we unlock the SFC based on the lock status and not the result of
> +	 * gen11_lock_sfc to make sure that we clean properly if something
> +	 * wrong happened during the lock (e.g. lock acquired after timeout
> +	 * expiration).
> +	 */
>   	if (engine_mask != ALL_ENGINES)
>   		for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
>   			gen11_unlock_sfc(engine);
> 

So you decided not to read the register and cross check?

Regards,

Tvrtko
Chris Wilson Sept. 19, 2019, 9:48 a.m. UTC | #3
Quoting Tvrtko Ursulin (2019-09-19 10:34:15)
> 
> On 19/09/2019 02:53, Daniele Ceraolo Spurio wrote:
> > Our assumption that the we can ask the HW to lock the SFC even if not
> > currently in use does not match the HW commitment. The expectation from
> > the HW is that SW will not try to lock the SFC if the engine is not
> > using it and if we do that the behavior is undefined; on ICL the HW
> > ends up to returning the ack and ignoring our lock request, but this is
> > not guaranteed and we shouldn't expect it going forward.
> > 
> > Also, failing to get the ack while the SFC is in use means that we can't
> > cleanly reset it, so fail the engine reset in that scenario.
> > 
> > v2: drop rmw change, keep the log as debug and handle failure (Chris),
> >      improve comments (Tvrtko).
> > 
> > Reported-by: Owen Zhang <owen.zhang@intel.com>
> > Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   static void gen11_unlock_sfc(struct intel_engine_cs *engine)
> > @@ -430,12 +436,21 @@ static int gen11_reset_engines(struct intel_gt *gt,
> >               for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
> >                       GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
> >                       hw_mask |= hw_engine_mask[engine->id];
> > -                     hw_mask |= gen11_lock_sfc(engine);
> > +                     ret = gen11_lock_sfc(engine, &hw_mask);
> > +                     if (ret)
> > +                             goto sfc_unlock;
> 
> Break on first failure looks unsafe to me. I think it would be more 
> robust to continue, no? Like if we have been asked to reset multiple 
> engines and only one failed, why not do the ones we can?

Any failure means a fallback to a full device reset. It doesn't matter
if we could reset one of two+ engines at that point, it's past the point
of no return.

> >       ret = gen6_hw_domain_reset(gt, hw_mask);
> >   
> > +sfc_unlock:
> > +     /*
> > +      * we unlock the SFC based on the lock status and not the result of
> > +      * gen11_lock_sfc to make sure that we clean properly if something
> > +      * wrong happened during the lock (e.g. lock acquired after timeout
> > +      * expiration).
> > +      */
> >       if (engine_mask != ALL_ENGINES)
> >               for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
> >                       gen11_unlock_sfc(engine);
> > 
> 
> So you decided not to read the register and cross check?

Very meh, that check didn't seem like it would improve our ability to
handle resets. If you wanted to actually check, we should check the
lock_ack_bit is cleared as well. (Or at least check that is clear before
we start the next lock_sfc().) I think skipping an unnecessary
gen11_lock_sfc() is a solid improvement by itself (I'm hopeful that's
enough to make icl more robust...). Tightening up the unlock can be done
separately.
-Chris
Tvrtko Ursulin Sept. 19, 2019, 9:58 a.m. UTC | #4
On 19/09/2019 10:34, Tvrtko Ursulin wrote:
> 
> On 19/09/2019 02:53, Daniele Ceraolo Spurio wrote:
>> Our assumption that the we can ask the HW to lock the SFC even if not
>> currently in use does not match the HW commitment. The expectation from
>> the HW is that SW will not try to lock the SFC if the engine is not
>> using it and if we do that the behavior is undefined; on ICL the HW
>> ends up to returning the ack and ignoring our lock request, but this is
>> not guaranteed and we shouldn't expect it going forward.
>>
>> Also, failing to get the ack while the SFC is in use means that we can't
>> cleanly reset it, so fail the engine reset in that scenario.
>>
>> v2: drop rmw change, keep the log as debug and handle failure (Chris),
>>      improve comments (Tvrtko).
>>
>> Reported-by: Owen Zhang <owen.zhang@intel.com>
>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   drivers/gpu/drm/i915/gt/intel_reset.c | 51 +++++++++++++++++----------
>>   1 file changed, 33 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
>> b/drivers/gpu/drm/i915/gt/intel_reset.c
>> index 8327220ac558..797cf50625cb 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
>> @@ -309,7 +309,7 @@ static int gen6_reset_engines(struct intel_gt *gt,
>>       return gen6_hw_domain_reset(gt, hw_mask);
>>   }
>> -static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
>> +static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
>>   {
>>       struct intel_uncore *uncore = engine->uncore;
>>       u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
>> @@ -318,6 +318,7 @@ static u32 gen11_lock_sfc(struct intel_engine_cs 
>> *engine)
>>       i915_reg_t sfc_usage;
>>       u32 sfc_usage_bit;
>>       u32 sfc_reset_bit;
>> +    int ret;
>>       switch (engine->class) {
>>       case VIDEO_DECODE_CLASS:
>> @@ -352,28 +353,33 @@ static u32 gen11_lock_sfc(struct intel_engine_cs 
>> *engine)
>>       }
>>       /*
>> -     * Tell the engine that a software reset is going to happen. The 
>> engine
>> -     * will then try to force lock the SFC (if currently locked, it will
>> -     * remain so until we tell the engine it is safe to unlock; if 
>> currently
>> -     * unlocked, it will ignore this and all new lock requests). If SFC
>> -     * ends up being locked to the engine we want to reset, we have 
>> to reset
>> -     * it as well (we will unlock it once the reset sequence is 
>> completed).
>> +     * If the engine is using a SFC, tell the engine that a software 
>> reset
>> +     * is going to happen. The engine will then try to force lock the 
>> SFC.
>> +     * If SFC ends up being locked to the engine we want to reset, we 
>> have
>> +     * to reset it as well (we will unlock it once the reset sequence is
>> +     * completed).
>>        */
>> +    if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
>> +        return 0;
>> +
>>       rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
>> -    if (__intel_wait_for_register_fw(uncore,
>> -                     sfc_forced_lock_ack,
>> -                     sfc_forced_lock_ack_bit,
>> -                     sfc_forced_lock_ack_bit,
>> -                     1000, 0, NULL)) {
>> -        DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
>> +    ret = __intel_wait_for_register_fw(uncore,
>> +                       sfc_forced_lock_ack,
>> +                       sfc_forced_lock_ack_bit,
>> +                       sfc_forced_lock_ack_bit,
>> +                       1000, 0, NULL);
>> +
>> +    /* was the SFC released while we were trying to lock it? */
>> +    if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
>>           return 0;
>> -    }
>> -    if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
>> -        return sfc_reset_bit;
>> +    if (ret)
>> +        DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
>> +    else
>> +        *hw_mask |= sfc_reset_bit;
>> -    return 0;
>> +    return ret;
>>   }
>>   static void gen11_unlock_sfc(struct intel_engine_cs *engine)
>> @@ -430,12 +436,21 @@ static int gen11_reset_engines(struct intel_gt *gt,
>>           for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
>>               GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
>>               hw_mask |= hw_engine_mask[engine->id];
>> -            hw_mask |= gen11_lock_sfc(engine);
>> +            ret = gen11_lock_sfc(engine, &hw_mask);
>> +            if (ret)
>> +                goto sfc_unlock;
> 
> Break on first failure looks unsafe to me. I think it would be more 
> robust to continue, no? Like if we have been asked to reset multiple 
> engines and only one failed, why not do the ones we can?

Chris corrected me on IRC explaining that as long as we fail to reset 
one engine from engine_mask we fall back to full reset anyway. So this 
early return is immaterial to end behavior and I have no further 
complaints. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> 
>>           }
>>       }
>>       ret = gen6_hw_domain_reset(gt, hw_mask);
>> +sfc_unlock:
>> +    /*
>> +     * we unlock the SFC based on the lock status and not the result of
>> +     * gen11_lock_sfc to make sure that we clean properly if something
>> +     * wrong happened during the lock (e.g. lock acquired after timeout
>> +     * expiration).
>> +     */
>>       if (engine_mask != ALL_ENGINES)
>>           for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
>>               gen11_unlock_sfc(engine);
>>
> 
> So you decided not to read the register and cross check?
> 
> Regards,
> 
> Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 8327220ac558..797cf50625cb 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -309,7 +309,7 @@  static int gen6_reset_engines(struct intel_gt *gt,
 	return gen6_hw_domain_reset(gt, hw_mask);
 }
 
-static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
+static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
 {
 	struct intel_uncore *uncore = engine->uncore;
 	u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
@@ -318,6 +318,7 @@  static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
 	i915_reg_t sfc_usage;
 	u32 sfc_usage_bit;
 	u32 sfc_reset_bit;
+	int ret;
 
 	switch (engine->class) {
 	case VIDEO_DECODE_CLASS:
@@ -352,28 +353,33 @@  static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
 	}
 
 	/*
-	 * Tell the engine that a software reset is going to happen. The engine
-	 * will then try to force lock the SFC (if currently locked, it will
-	 * remain so until we tell the engine it is safe to unlock; if currently
-	 * unlocked, it will ignore this and all new lock requests). If SFC
-	 * ends up being locked to the engine we want to reset, we have to reset
-	 * it as well (we will unlock it once the reset sequence is completed).
+	 * If the engine is using a SFC, tell the engine that a software reset
+	 * is going to happen. The engine will then try to force lock the SFC.
+	 * If SFC ends up being locked to the engine we want to reset, we have
+	 * to reset it as well (we will unlock it once the reset sequence is
+	 * completed).
 	 */
+	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
+		return 0;
+
 	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
 
-	if (__intel_wait_for_register_fw(uncore,
-					 sfc_forced_lock_ack,
-					 sfc_forced_lock_ack_bit,
-					 sfc_forced_lock_ack_bit,
-					 1000, 0, NULL)) {
-		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
+	ret = __intel_wait_for_register_fw(uncore,
+					   sfc_forced_lock_ack,
+					   sfc_forced_lock_ack_bit,
+					   sfc_forced_lock_ack_bit,
+					   1000, 0, NULL);
+
+	/* was the SFC released while we were trying to lock it? */
+	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
 		return 0;
-	}
 
-	if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
-		return sfc_reset_bit;
+	if (ret)
+		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
+	else
+		*hw_mask |= sfc_reset_bit;
 
-	return 0;
+	return ret;
 }
 
 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
@@ -430,12 +436,21 @@  static int gen11_reset_engines(struct intel_gt *gt,
 		for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
 			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
 			hw_mask |= hw_engine_mask[engine->id];
-			hw_mask |= gen11_lock_sfc(engine);
+			ret = gen11_lock_sfc(engine, &hw_mask);
+			if (ret)
+				goto sfc_unlock;
 		}
 	}
 
 	ret = gen6_hw_domain_reset(gt, hw_mask);
 
+sfc_unlock:
+	/*
+	 * we unlock the SFC based on the lock status and not the result of
+	 * gen11_lock_sfc to make sure that we clean properly if something
+	 * wrong happened during the lock (e.g. lock acquired after timeout
+	 * expiration).
+	 */
 	if (engine_mask != ALL_ENGINES)
 		for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
 			gen11_unlock_sfc(engine);