diff mbox series

[11/12] drm/i915/guc: Don't abort on CTB_UNUSED status

Message ID 20220712233136.1044951-12-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Random assortment of (mostly) GuC related patches | expand

Commit Message

John Harrison July 12, 2022, 11:31 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

When the KMD sends a CLIENT_RESET request to GuC (as part of the
suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
KMD then checked the CTB queue, it would see a non-zero status value
and report the buffer as corrupted.

Technically, no G2H messages should be received once the CLIENT_RESET
has been sent. However, if a context was outstanding on an engine then
it would get reset and a reset notification would be sent. So, don't
actually treat UNUSED as a catastrophic error. Just flag it up as
unexpected and keep going.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

Comments

Tvrtko Ursulin July 18, 2022, 12:36 p.m. UTC | #1
On 13/07/2022 00:31, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> When the KMD sends a CLIENT_RESET request to GuC (as part of the
> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
> KMD then checked the CTB queue, it would see a non-zero status value
> and report the buffer as corrupted.
> 
> Technically, no G2H messages should be received once the CLIENT_RESET
> has been sent. However, if a context was outstanding on an engine then
> it would get reset and a reset notification would be sent. So, don't
> actually treat UNUSED as a catastrophic error. Just flag it up as
> unexpected and keep going.

Does it need a Fixes: tag?

Regards,

Tvrtko

> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>   2 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> index df83c1cc7c7a6..28b8387f97b77 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
> @@ -37,6 +37,7 @@
>    *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
>    *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
>    *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
>    *  +---+-------+--------------------------------------------------------------+
>    *  |...|       | RESERVED = MBZ                                               |
>    *  +---+-------+--------------------------------------------------------------+
> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>   	u32 tail;
>   	u32 status;
>   #define GUC_CTB_STATUS_NO_ERROR				0
> -#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
> -#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
> -#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
> +#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
> +#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
> +#define GUC_CTB_STATUS_MISMATCH				BIT(2)
> +#define GUC_CTB_STATUS_UNUSED				BIT(3)
>   	u32 reserved[13];
>   } __packed;
>   static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f01325cd1b625..11b5d4ddb19ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
>   	if (unlikely(ctb->broken))
>   		return -EPIPE;
>   
> -	if (unlikely(desc->status))
> -		goto corrupted;
> +	if (unlikely(desc->status)) {
> +		u32 status = desc->status;
> +
> +		if (status & GUC_CTB_STATUS_UNUSED) {
> +			/*
> +			 * Potentially valid if a CLIENT_RESET request resulted in
> +			 * contexts/engines being reset. But should never happen as
> +			 * no contexts should be active when CLIENT_RESET is sent.
> +			 */
> +			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
> +			status &= ~GUC_CTB_STATUS_UNUSED;
> +		}
> +
> +		if (status)
> +			goto corrupted;
> +	}
>   
>   	GEM_BUG_ON(head > size);
>
John Harrison July 19, 2022, 12:16 a.m. UTC | #2
On 7/18/2022 05:36, Tvrtko Ursulin wrote:
> On 13/07/2022 00:31, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> When the KMD sends a CLIENT_RESET request to GuC (as part of the
>> suspend sequence), GuC will mark the CTB buffer as 'UNUSED'. If the
>> KMD then checked the CTB queue, it would see a non-zero status value
>> and report the buffer as corrupted.
>>
>> Technically, no G2H messages should be received once the CLIENT_RESET
>> has been sent. However, if a context was outstanding on an engine then
>> it would get reset and a reset notification would be sent. So, don't
>> actually treat UNUSED as a catastrophic error. Just flag it up as
>> unexpected and keep going.
>
> Does it need a Fixes: tag?
Not really. It's a theoretical hole only. It was exposed during POC work 
for a nasty w/a that ultimately didn't need to go forwards. The POC was 
generating fake interrupts and causing us to check the CTB channel after 
the CLIENT_RESET had been processed. We have never had an actual 
instance of an outstanding request during CLIENT_RESET. That would be a 
much more serious bug - trying to suspend while an engine is still 
processing a request.

John.


>
> Regards,
>
> Tvrtko
>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   .../i915/gt/uc/abi/guc_communication_ctb_abi.h |  8 +++++---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c      | 18 ++++++++++++++++--
>>   2 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git 
>> a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h 
>> b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> index df83c1cc7c7a6..28b8387f97b77 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
>> @@ -37,6 +37,7 @@
>>    *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too 
>> large)     |
>>    *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated 
>> message)      |
>>    *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail 
>> modified)      |
>> + *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in 
>> use)         |
>>    * 
>> +---+-------+--------------------------------------------------------------+
>>    *  |...|       | RESERVED = 
>> MBZ                                               |
>>    * 
>> +---+-------+--------------------------------------------------------------+
>> @@ -49,9 +50,10 @@ struct guc_ct_buffer_desc {
>>       u32 tail;
>>       u32 status;
>>   #define GUC_CTB_STATUS_NO_ERROR                0
>> -#define GUC_CTB_STATUS_OVERFLOW                (1 << 0)
>> -#define GUC_CTB_STATUS_UNDERFLOW            (1 << 1)
>> -#define GUC_CTB_STATUS_MISMATCH                (1 << 2)
>> +#define GUC_CTB_STATUS_OVERFLOW                BIT(0)
>> +#define GUC_CTB_STATUS_UNDERFLOW            BIT(1)
>> +#define GUC_CTB_STATUS_MISMATCH                BIT(2)
>> +#define GUC_CTB_STATUS_UNUSED                BIT(3)
>>       u32 reserved[13];
>>   } __packed;
>>   static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> index f01325cd1b625..11b5d4ddb19ce 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> @@ -816,8 +816,22 @@ static int ct_read(struct intel_guc_ct *ct, 
>> struct ct_incoming_msg **msg)
>>       if (unlikely(ctb->broken))
>>           return -EPIPE;
>>   -    if (unlikely(desc->status))
>> -        goto corrupted;
>> +    if (unlikely(desc->status)) {
>> +        u32 status = desc->status;
>> +
>> +        if (status & GUC_CTB_STATUS_UNUSED) {
>> +            /*
>> +             * Potentially valid if a CLIENT_RESET request resulted in
>> +             * contexts/engines being reset. But should never happen as
>> +             * no contexts should be active when CLIENT_RESET is sent.
>> +             */
>> +            CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
>> +            status &= ~GUC_CTB_STATUS_UNUSED;
>> +        }
>> +
>> +        if (status)
>> +            goto corrupted;
>> +    }
>>         GEM_BUG_ON(head > size);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
index df83c1cc7c7a6..28b8387f97b77 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_communication_ctb_abi.h
@@ -37,6 +37,7 @@ 
  *  |   |       |   - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large)     |
  *  |   |       |   - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message)      |
  *  |   |       |   - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified)      |
+ *  |   |       |   - _`GUC_CTB_STATUS_UNUSED` = 8 (CTB is not in use)         |
  *  +---+-------+--------------------------------------------------------------+
  *  |...|       | RESERVED = MBZ                                               |
  *  +---+-------+--------------------------------------------------------------+
@@ -49,9 +50,10 @@  struct guc_ct_buffer_desc {
 	u32 tail;
 	u32 status;
 #define GUC_CTB_STATUS_NO_ERROR				0
-#define GUC_CTB_STATUS_OVERFLOW				(1 << 0)
-#define GUC_CTB_STATUS_UNDERFLOW			(1 << 1)
-#define GUC_CTB_STATUS_MISMATCH				(1 << 2)
+#define GUC_CTB_STATUS_OVERFLOW				BIT(0)
+#define GUC_CTB_STATUS_UNDERFLOW			BIT(1)
+#define GUC_CTB_STATUS_MISMATCH				BIT(2)
+#define GUC_CTB_STATUS_UNUSED				BIT(3)
 	u32 reserved[13];
 } __packed;
 static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index f01325cd1b625..11b5d4ddb19ce 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -816,8 +816,22 @@  static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
 	if (unlikely(ctb->broken))
 		return -EPIPE;
 
-	if (unlikely(desc->status))
-		goto corrupted;
+	if (unlikely(desc->status)) {
+		u32 status = desc->status;
+
+		if (status & GUC_CTB_STATUS_UNUSED) {
+			/*
+			 * Potentially valid if a CLIENT_RESET request resulted in
+			 * contexts/engines being reset. But should never happen as
+			 * no contexts should be active when CLIENT_RESET is sent.
+			 */
+			CT_ERROR(ct, "Unexpected G2H after GuC has stopped!\n");
+			status &= ~GUC_CTB_STATUS_UNUSED;
+		}
+
+		if (status)
+			goto corrupted;
+	}
 
 	GEM_BUG_ON(head > size);