diff mbox series

[6/6] drm/i915/guc: Don't abort on CTB_UNUSED status

Message ID 20220728024225.2363663-7-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Random assortment of (mostly) GuC related patches | expand

Commit Message

John Harrison July 28, 2022, 2:42 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

When the KMD sends a CLIENT_RESET request to GuC (as part of the
suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
KMD then checked the CTB queue, it would see a non-zero status value
and report the buffer as corrupted.

Technically, no G2H messages should be received once the CLIENT_RESET
has been sent. However, if a context was outstanding on an engine then
it would get reset and a reset notification would be sent. So, don't
actually treat UNUSED as a catastrophic error. Just flag it up as
unexpected and keep going.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

Comments

Michal Wajdeczko July 28, 2022, 7:06 p.m. UTC | #1
On 28.07.2022 04:42, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> When the KMD sends a CLIENT_RESET request to GuC (as part of the
> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the

hmm, GuC shouldn't do that on CLIENT_RESET, GuC shall only mark CTB as
UNUSED when we explicitly disable CTB using CONTROL_CTB as only then CTB
descriptors are known to be valid

> KMD then checked the CTB queue, it would see a non-zero status value
> and report the buffer as corrupted.
> 
> Technically, no G2H messages should be received once the CLIENT_RESET
> has been sent. However, if a context was outstanding on an engine then
> it would get reset and a reset notification would be sent. So, don't
> actually treat UNUSED as a catastrophic error. Just flag it up as
> unexpected and keep going.

we should have already marked locally that CTB is disabled, either as
part of the explicit disabling of CTB with CONTROL_CTB, or implicit due
to issued CLIENT_RESET, but in both cases we shouldn't try to read CTB
any more, even if there are any outstanding messages ...

is this due to a race with ct->enabled ?

> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>  .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>  2 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> index df83c1cc7c7a6..28b8387f97b77 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> @@ -37,6 +37,7 @@
>   *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
>   *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
>   *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
>   *  +---+-------+--------------------------------------------------------------+
>   *  |...|       | RESERVED = MBZ                                               |
>   *  +---+-------+--------------------------------------------------------------+
> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>  	u32 tail;
>  	u32 status;
>  #define GUC_CTB_STATUS_NO_ERROR				0
> -#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
> -#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
> -#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
> +#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
> +#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
> +#define GUC_CTB_STATUS_MISMATCH				BIT(2)
> +#define GUC_CTB_STATUS_UNUSED				BIT(3)

nit: our goal was to use plain C definitions in ABI headers as much as
possible without introducing any dependency on external macros

>  	u32 reserved[13];
>  } __packed;
>  static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f01325cd1b625..11b5d4ddb19ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
>  	if (unlikely(ctb->broken))
>  		return -EPIPE;
>  
> -	if (unlikely(desc->status))
> -		goto corrupted;
> +	if (unlikely(desc->status)) {
> +		u32 status = desc->status;
> +
> +		if (status & GUC_CTB_STATUS_UNUSED) {
> +			/*
> +			 * Potentially valid if a CLIENT_RESET request resulted in
> +			 * contexts/engines being reset. But should never happen as
> +			 * no contexts should be active when CLIENT_RESET is sent.
> +			 */
> +			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
> +			status &= ~GUC_CTB_STATUS_UNUSED;

do you really want to continue reading messages from an already disabled CTB?
maybe instead of clearing GUC_CTB_STATUS_UNUSED bit we should just return?

Michal

> +		}
> +
> +		if (status)
> +			goto corrupted;
> +	}
>  
>  	GEM_BUG_ON(head > size);
>
John Harrison July 28, 2022, 7:38 p.m. UTC | #2
On 7/28/2022 12:06, Michal Wajdeczko wrote:
> On 28.07.2022 04:42, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> When the KMD sends a CLIENT_RESET request to GuC (as part of the
>> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
> hmm, GuC shouldn't do that on CLIENT_RESET, GuC shall only mark CTB as
> UNUSED when we explicitly disable CTB using CONTROL_CTB as only then CTB
> descriptors are known to be valid
GuC very definitely does do that.

>
>> KMD then checked the CTB queue, it would see a non-zero status value
>> and report the buffer as corrupted.
>>
>> Technically, no G2H messages should be received once the CLIENT_RESET
>> has been sent. However, if a context was outstanding on an engine then
>> it would get reset and a reset notification would be sent. So, don't
>> actually treat UNUSED as a catastrophic error. Just flag it up as
>> unexpected and keep going.
> we should have already marked locally that CTB is disabled, either as
> part of the explicit disabling of CTB with CONTROL_CTB, or implicit due
> to issued CLIENT_RESET, but in both cases we shouldn't try to read CTB
> any more, even it there are any outstanding messages ...
>
> is this due to a race with ct->enabled ?
As per review comments on previous revision, it was only hit during POC 
work of a hardware w/a that led to the G2H processing code being called 
even when no G2H message had been sent.

And you can't mark the CTB as disabled before sending a H2G message. 
That would result in not sending the CLIENT_RESET H2G at all. We do mark 
it disabled after having sent the message. But there will always exist a 
potential race condition where GuC sends us a message before we get that 
far.

>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>>   2 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> index df83c1cc7c7a6..28b8387f97b77 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> @@ -37,6 +37,7 @@
>>    *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
>>    *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
>>    *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
>> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
>>    *  +---+-------+--------------------------------------------------------------+
>>    *  |...|       | RESERVED = MBZ                                               |
>>    *  +---+-------+--------------------------------------------------------------+
>> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>>   	u32 tail;
>>   	u32 status;
>>   #define GUC_CTB_STATUS_NO_ERROR				0
>> -#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
>> -#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
>> -#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
>> +#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
>> +#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
>> +#define GUC_CTB_STATUS_MISMATCH				BIT(2)
>> +#define GUC_CTB_STATUS_UNUSED				BIT(3)
> nit: our goal was to use plain C definitions in ABI headers as much as
> possible without introducing any dependency on external macros
Except that checkpatch complains like a complainy thing.

>
>>   	u32 reserved[13];
>>   } __packed;
>>   static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> index f01325cd1b625..11b5d4ddb19ce 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
>>   	if (unlikely(ctb->broken))
>>   		return -EPIPE;
>>   
>> -	if (unlikely(desc->status))
>> -		goto corrupted;
>> +	if (unlikely(desc->status)) {
>> +		u32 status = desc->status;
>> +
>> +		if (status & GUC_CTB_STATUS_UNUSED) {
>> +			/*
>> +			 * Potentially valid if a CLIENT_RESET request resulted in
>> +			 * contexts/engines being reset. But should never happen as
>> +			 * no contexts should be active when CLIENT_RESET is sent.
>> +			 */
>> +			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
>> +			status &= ~GUC_CTB_STATUS_UNUSED;
> do you really want to continue read messages from already disabled CTB ?
> maybe instead of clearing GUC_CTB_STATUS_UNUSED bit we should just return?
GuC could have sent us a valid message right before shutting down its 
end of the CTB. We should still process that message. Note that the 
clear is only of the local status variable. The CTB itself is still 
marked as closed by GuC.

John.


>
> Michal
>
>> +		}
>> +
>> +		if (status)
>> +			goto corrupted;
>> +	}
>>   
>>   	GEM_BUG_ON(head > size);
>>
Daniele Ceraolo Spurio July 29, 2022, midnight UTC | #3
On 7/27/2022 7:42 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> When the KMD sends a CLIENT_RESET request to GuC (as part of the
> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
> KMD then checked the CTB queue, it would see a non-zero status value
> and report the buffer as corrupted.
>
> Technically, no G2H messages should be received once the CLIENT_RESET
> has been sent. However, if a context was outstanding on an engine then
> it would get reset and a reset notification would be sent. So, don't
> actually treat UNUSED as a catastrophic error. Just flag it up as
> unexpected and keep going.

Given that we disable CTs right after sending the CLIENT_RESET, there is 
only a small window for the kernel to receive a G2H interrupt before we 
turn everything off. If we want to support catching unexpected G2Hs 
coming at that time, maybe we should instead make sure all CT messages 
(if any) have been processed before the disable. Not a blocker for this 
patch, can be done as a follow-up.

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>   2 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> index df83c1cc7c7a6..28b8387f97b77 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> @@ -37,6 +37,7 @@
>    *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
>    *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
>    *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
>    *  +---+-------+--------------------------------------------------------------+
>    *  |...|       | RESERVED = MBZ                                               |
>    *  +---+-------+--------------------------------------------------------------+
> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>   	u32 tail;
>   	u32 status;
>   #define GUC_CTB_STATUS_NO_ERROR				0
> -#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
> -#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
> -#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
> +#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
> +#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
> +#define GUC_CTB_STATUS_MISMATCH				BIT(2)
> +#define GUC_CTB_STATUS_UNUSED				BIT(3)
>   	u32 reserved[13];
>   } __packed;
>   static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f01325cd1b625..11b5d4ddb19ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
>   	if (unlikely(ctb->broken))
>   		return -EPIPE;
>   
> -	if (unlikely(desc->status))
> -		goto corrupted;
> +	if (unlikely(desc->status)) {
> +		u32 status = desc->status;
> +
> +		if (status & GUC_CTB_STATUS_UNUSED) {
> +			/*
> +			 * Potentially valid if a CLIENT_RESET request resulted in
> +			 * contexts/engines being reset. But should never happen as
> +			 * no contexts should be active when CLIENT_RESET is sent.
> +			 */
> +			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
> +			status &= ~GUC_CTB_STATUS_UNUSED;
> +		}
> +
> +		if (status)
> +			goto corrupted;
> +	}
>   
>   	GEM_BUG_ON(head > size);
>
John Harrison July 29, 2022, 12:35 a.m. UTC | #4
On 7/28/2022 17:00, Ceraolo Spurio, Daniele wrote:
> On 7/27/2022 7:42 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> When the KMD sends a CLIENT_RESET request to GuC (as part of the
>> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
>> KMD then checked the CTB queue, it would see a non-zero status value
>> and report the buffer as corrupted.
>>
>> Technically, no G2H messages should be received once the CLIENT_RESET
>> has been sent. However, if a context was outstanding on an engine then
>> it would get reset and a reset notification would be sent. So, don't
>> actually treat UNUSED as a catastrophic error. Just flag it up as
>> unexpected and keep going.
>
> Given that we disable CTs right after sending the CLIENT_RESET, there 
> is only a small window for the kernel to receive a G2H interrupt 
> before we turn everything off. If we want to support catching 
> unexpected G2Hs coming at that time, maybe we should instead make sure 
> all CT messages (if any) have been processed before the disable. Not a 
> blocker for this patch, can be done as a follow-up.
Yeah, it gets messy. How do you check for messages in a CTB that is 
already marked as 'do not touch me'? The current check for available 
work (head != tail) is after the status check. This is specifically so 
that we don't try to process corrupted messages in a corrupted buffer. 
But by definition, if the send(CLIENT_RESET) call has returned then the 
status is already 'do not use'. Ideally, we would just want to flush out 
any pending interrupts before turning interrupts off in the sanitise 
code. But then, is there a race where the interrupt hasn't quite made it 
far enough by that time? Do we need to stall for a bit? How long?

As noted, in the case where we actually hit the issue the interrupt 
handler did get to run in the gap between sending the reset message and 
turning off the i915 side of the CTB. So we are basically into windows 
of opportunity and diminishing returns. Given that it is supposedly an 
impossible situation anyway, I'm not sure it is worth putting a complex 
solution in to solve. But yeah, can think more and maybe get some kind 
of extra check in there as a follow up.

John.


>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>
> Daniele
>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>>   2 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git 
>> a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h 
>> b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> index df83c1cc7c7a6..28b8387f97b77 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> @@ -37,6 +37,7 @@
>>    *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too 
>> large)     |
>>    *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated 
>> message)      |
>>    *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail 
>> modified)      |
>> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in 
>> use)         |
>>    * 
>> +---+-------+--------------------------------------------------------------+
>>    *  |...|       | RESERVED = 
>> MBZ                                               |
>>    * 
>> +---+-------+--------------------------------------------------------------+
>> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>>       u32 tail;
>>       u32 status;
>>   #define GUC_CTB_STATUS_NO_ERROR                0
>> -#define GUC_CTB_STATUS_OVERFLOW                (1 << 0)
>> -#define GUC_CTB_STATUS_UNDERFLOW            (1 << 1)
>> -#define GUC_CTB_STATUS_MISMATCH                (1 << 2)
>> +#define GUC_CTB_STATUS_OVERFLOW                BIT(0)
>> +#define GUC_CTB_STATUS_UNDERFLOW            BIT(1)
>> +#define GUC_CTB_STATUS_MISMATCH                BIT(2)
>> +#define GUC_CTB_STATUS_UNUSED                BIT(3)
>>       u32 reserved[13];
>>   } __packed;
>>   static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> index f01325cd1b625..11b5d4ddb19ce 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, 
>> struct ct_incoming_msg **msg)
>>       if (unlikely(ctb->broken))
>>           return -EPIPE;
>>   -    if (unlikely(desc->status))
>> -        goto corrupted;
>> +    if (unlikely(desc->status)) {
>> +        u32 status = desc->status;
>> +
>> +        if (status & GUC_CTB_STATUS_UNUSED) {
>> +            /*
>> +             * Potentially valid if a CLIENT_RESET request resulted in
>> +             * contexts/engines being reset. But should never happen as
>> +             * no contexts should be active when CLIENT_RESET is sent.
>> +             */
>> +            CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
>> +            status &= ~GUC_CTB_STATUS_UNUSED;
>> +        }
>> +
>> +        if (status)
>> +            goto corrupted;
>> +    }
>>         GEM_BUG_ON(head > size);
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
index df83c1cc7c7a6..28b8387f97b77 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
@@ -37,6 +37,7 @@ 
  *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
  *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
  *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
+ *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
  *  +---+-------+--------------------------------------------------------------+
  *  |...|       | RESERVED = MBZ                                               |
  *  +---+-------+--------------------------------------------------------------+
@@ -49,9 +50,10 @@  struct guc_ct_buffer_desc {
 	u32 tail;
 	u32 status;
 #define GUC_CTB_STATUS_NO_ERROR				0
-#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
-#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
-#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
+#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
+#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
+#define GUC_CTB_STATUS_MISMATCH				BIT(2)
+#define GUC_CTB_STATUS_UNUSED				BIT(3)
 	u32 reserved[13];
 } __packed;
 static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index f01325cd1b625..11b5d4ddb19ce 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -816,8 +816,22 @@  static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
 	if (unlikely(ctb->broken))
 		return -EPIPE;
 
-	if (unlikely(desc->status))
-		goto corrupted;
+	if (unlikely(desc->status)) {
+		u32 status = desc->status;
+
+		if (status & GUC_CTB_STATUS_UNUSED) {
+			/*
+			 * Potentially valid if a CLIENT_RESET request resulted in
+			 * contexts/engines being reset. But should never happen as
+			 * no contexts should be active when CLIENT_RESET is sent.
+			 */
+			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
+			status &= ~GUC_CTB_STATUS_UNUSED;
+		}
+
+		if (status)
+			goto corrupted;
+	}
 
 	GEM_BUG_ON(head > size);