[1/5] drm/i915/guc: Don't GEM_BUG_ON on corrupted G2H CTB

Message ID	20200115140822.55756-2-michal.wajdeczko@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=nAAw=3E=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 0E65724656 From: Michal Wajdeczko <michal.wajdeczko@intel.com> To: intel-gfx@lists.freedesktop.org Date: Wed, 15 Jan 2020 14:08:18 +0000 Message-Id: <20200115140822.55756-2-michal.wajdeczko@intel.com> In-Reply-To: <20200115140822.55756-1-michal.wajdeczko@intel.com> References: <20200115140822.55756-1-michal.wajdeczko@intel.com> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 1/5] drm/i915/guc: Don't GEM_BUG_ON on corrupted G2H CTB Precedence: list Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	Misc GuC CT improvements - part II \| expand [0/5] Misc GuC CT improvements - part II [1/5] drm/i915/guc: Don't GEM_BUG_ON on corrupted G2H CTB [2/5] i915/drm/guc: Don't pass CTB while writing [3/5] i915/drm/guc: Don't pass CTB while reading [4/5] drm/i915/guc: Switch to CT_ERROR in ct_read [5/5] drm/i915/guc: Introduce CT_DEBUG

Message ID

20200115140822.55756-2-michal.wajdeczko@intel.com (mailing list archive)

State

New, archived

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 0E65724656
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
To: intel-gfx@lists.freedesktop.org
Date: Wed, 15 Jan 2020 14:08:18 +0000
Message-Id: <20200115140822.55756-2-michal.wajdeczko@intel.com>
In-Reply-To: <20200115140822.55756-1-michal.wajdeczko@intel.com>
References: <20200115140822.55756-1-michal.wajdeczko@intel.com>
MIME-Version: 1.0
Subject: [Intel-gfx] [PATCH 1/5] drm/i915/guc: Don't GEM_BUG_ON on corrupted
 G2H CTB
Precedence: list
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Series

Misc GuC CT improvements - part II | expand

Commit Message

Michal Wajdeczko Jan. 15, 2020, 2:08 p.m. UTC

We should never BUG_ON on any corruption in the CTB descriptor, as
data there can also be modified by the GuC. Instead we can
use flag "is_in_error" to indicate that we will not process
any further messages over this CTB (until reset). While here
move descriptor error reporting to the function that actually
touches that descriptor.

Note that unexpected content of the specific CT messages, that
still complies with generic CT message format, shall not trigger
disabling whole CTB, as that might just indicate new unsupported
message types.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 42 ++++++++++++++---------
 1 file changed, 26 insertions(+), 16 deletions(-)

Comments

Daniele Ceraolo Spurio Jan. 16, 2020, 6:46 p.m. UTC | #1

On 1/15/20 6:08 AM, Michal Wajdeczko wrote:
> We should never BUG_ON on any corruption in CTB descriptor as
> data there can be also modified by the GuC. Instead we can
> use flag "is_in_error" to indicate that we will not process
> any further messages over this CTB (until reset). While here
> move descriptor error reporting to the function that actually
> touches that descriptor.
> 
> Note that unexpected content of the specific CT messages, that
> still complies with generic CT message format, shall not trigger
> disabling whole CTB, as that might just indicate new unsupported
> message types.
> 
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 42 ++++++++++++++---------
>   1 file changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index a55c336cc5ef..0d3556a820a3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -578,19 +578,29 @@ static inline bool ct_header_is_response(u32 header)
>   static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
>   {
>   	struct guc_ct_buffer_desc *desc = ctb->desc;
> -	u32 head = desc->head / 4;	/* in dwords */
> -	u32 tail = desc->tail / 4;	/* in dwords */
> -	u32 size = desc->size / 4;	/* in dwords */
> +	u32 head = desc->head;
> +	u32 tail = desc->tail;
> +	u32 size = desc->size;
>   	u32 *cmds = ctb->cmds;
> -	s32 available;			/* in dwords */
> +	s32 available;
>   	unsigned int len;
>   	unsigned int i;
>   
> -	GEM_BUG_ON(desc->size % 4);
> -	GEM_BUG_ON(desc->head % 4);
> -	GEM_BUG_ON(desc->tail % 4);
> -	GEM_BUG_ON(tail >= size);
> -	GEM_BUG_ON(head >= size);
> +	if (unlikely(desc->is_in_error))
> +		return -EPIPE;

How do we recover from this situation? Before, we marked the buffer as 
in_error but didn't stop processing of G2H, but with this return here we 
do. Do we need to reset the CTB desc to recover?

> +
> +	if (unlikely(!IS_ALIGNED(head, 4) ||
> +		     !IS_ALIGNED(tail, 4) ||
> +		     !IS_ALIGNED(size, 4) ||
> +		     (tail >= size) || (head >= size))) {
> +		DRM_ERROR("CT: Invalid data in descriptor\n");

nit: this log is redundant since we have a better message after the jump 
which includes the values

Daniele

> +		goto corrupted;
> +	}
> +
> +	/* later calculations will be done in dwords */
> +	head /= 4;
> +	tail /= 4;
> +	size /= 4;
>   
>   	/* tail == head condition indicates empty */
>   	available = tail - head;
> @@ -615,7 +625,7 @@ static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
>   			       size - head : available - 1), &cmds[head],
>   			  4 * (head + available - 1 > size ?
>   			       available - 1 - size + head : 0), &cmds[0]);
> -		return -EPROTO;
> +		goto corrupted;
>   	}
>   
>   	for (i = 1; i < len; i++) {
> @@ -626,6 +636,12 @@ static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
>   
>   	desc->head = head * 4;
>   	return 0;
> +
> +corrupted:
> +	DRM_ERROR("CT: Corrupted descriptor addr=%#x head=%u tail=%u size=%u\n",
> +		  desc->addr, desc->head, desc->tail, desc->size);
> +	desc->is_in_error = 1;
> +	return -EPIPE;
>   }
>   
>   /**
> @@ -836,10 +852,4 @@ void intel_guc_ct_event_handler(struct intel_guc_ct *ct)
>   		else
>   			err = ct_handle_request(ct, msg);
>   	} while (!err);
> -
> -	if (GEM_WARN_ON(err == -EPROTO)) {
> -		CT_ERROR(ct, "Corrupted message: %#x\n", msg[0]);
> -		ctb->desc->is_in_error = 1;
> -	}
>   }
> -
>

Michal Wajdeczko Jan. 16, 2020, 7:13 p.m. UTC | #2

On Thu, 16 Jan 2020 19:46:35 +0100, Daniele Ceraolo Spurio  
<daniele.ceraolospurio@intel.com> wrote:

>
>
> On 1/15/20 6:08 AM, Michal Wajdeczko wrote:
>> We should never BUG_ON on any corruption in CTB descriptor as
>> data there can be also modified by the GuC. Instead we can
>> use flag "is_in_error" to indicate that we will not process
>> any further messages over this CTB (until reset). While here
>> move descriptor error reporting to the function that actually
>> touches that descriptor.
>>  Note that unexpected content of the specific CT messages, that
>> still complies with generic CT message format, shall not trigger
>> disabling whole CTB, as that might just indicate new unsupported
>> message types.
>>  Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 42 ++++++++++++++---------
>>   1 file changed, 26 insertions(+), 16 deletions(-)
>>  diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c  
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> index a55c336cc5ef..0d3556a820a3 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> @@ -578,19 +578,29 @@ static inline bool ct_header_is_response(u32  
>> header)
>>   static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
>>   {
>>   	struct guc_ct_buffer_desc *desc = ctb->desc;
>> -	u32 head = desc->head / 4;	/* in dwords */
>> -	u32 tail = desc->tail / 4;	/* in dwords */
>> -	u32 size = desc->size / 4;	/* in dwords */
>> +	u32 head = desc->head;
>> +	u32 tail = desc->tail;
>> +	u32 size = desc->size;
>>   	u32 *cmds = ctb->cmds;
>> -	s32 available;			/* in dwords */
>> +	s32 available;
>>   	unsigned int len;
>>   	unsigned int i;
>>   -	GEM_BUG_ON(desc->size % 4);
>> -	GEM_BUG_ON(desc->head % 4);
>> -	GEM_BUG_ON(desc->tail % 4);
>> -	GEM_BUG_ON(tail >= size);
>> -	GEM_BUG_ON(head >= size);
>> +	if (unlikely(desc->is_in_error))
>> +		return -EPIPE;
>
> How do we recover from this situation? before we marked the buffer as  
> in_error but didn't stop processing of G2H, but with this return here we  
> do. Do we need to reset the CTB desc to recover?

Before, we would hit BUG_ON followed by a PANIC (since we read in irq).
Now (or soon) we should be able to detect a stalled CTB and then wedge.
We can't reset the CTB alone as, IIRC, GuC keeps its own head/tail copies.

>
>> +
>> +	if (unlikely(!IS_ALIGNED(head, 4) ||
>> +		     !IS_ALIGNED(tail, 4) ||
>> +		     !IS_ALIGNED(size, 4) ||
>> +		     (tail >= size) || (head >= size))) {
>> +		DRM_ERROR("CT: Invalid data in descriptor\n");
>
> nit: this log is redundant since we have a better message after the jump  
> which includes the values

yeah, looking again and agree that's redundant, will remove

Initially this "better message" was here; it was then reduced after being
copied below the jump label, so that the error below can also report desc details:

	DRM_ERROR("CT: incomplete message %*ph %*ph %*ph\n",

>
> Daniele
>
>> +		goto corrupted;
>> +	}
>> +
>> +	/* later calculations will be done in dwords */
>> +	head /= 4;
>> +	tail /= 4;
>> +	size /= 4;
>>     	/* tail == head condition indicates empty */
>>   	available = tail - head;
>> @@ -615,7 +625,7 @@ static int ctb_read(struct intel_guc_ct_buffer  
>> *ctb, u32 *data)
>>   			       size - head : available - 1), &cmds[head],
>>   			  4 * (head + available - 1 > size ?
>>   			       available - 1 - size + head : 0), &cmds[0]);
>> -		return -EPROTO;
>> +		goto corrupted;
>>   	}
>>     	for (i = 1; i < len; i++) {
>> @@ -626,6 +636,12 @@ static int ctb_read(struct intel_guc_ct_buffer  
>> *ctb, u32 *data)
>>     	desc->head = head * 4;
>>   	return 0;
>> +
>> +corrupted:
>> +	DRM_ERROR("CT: Corrupted descriptor addr=%#x head=%u tail=%u  
>> size=%u\n",
>> +		  desc->addr, desc->head, desc->tail, desc->size);
>> +	desc->is_in_error = 1;
>> +	return -EPIPE;
>>   }
>>     /**
>> @@ -836,10 +852,4 @@ void intel_guc_ct_event_handler(struct  
>> intel_guc_ct *ct)
>>   		else
>>   			err = ct_handle_request(ct, msg);
>>   	} while (!err);
>> -
>> -	if (GEM_WARN_ON(err == -EPROTO)) {
>> -		CT_ERROR(ct, "Corrupted message: %#x\n", msg[0]);
>> -		ctb->desc->is_in_error = 1;
>> -	}
>>   }
>> -

Daniele Ceraolo Spurio Jan. 16, 2020, 7:24 p.m. UTC | #3

On 1/16/20 11:13 AM, Michal Wajdeczko wrote:
> On Thu, 16 Jan 2020 19:46:35 +0100, Daniele Ceraolo Spurio 
> <daniele.ceraolospurio@intel.com> wrote:
> 
>>
>>
>> On 1/15/20 6:08 AM, Michal Wajdeczko wrote:
>>> We should never BUG_ON on any corruption in CTB descriptor as
>>> data there can be also modified by the GuC. Instead we can
>>> use flag "is_in_error" to indicate that we will not process
>>> any further messages over this CTB (until reset). While here
>>> move descriptor error reporting to the function that actually
>>> touches that descriptor.
>>>  Note that unexpected content of the specific CT messages, that
>>> still complies with generic CT message format, shall not trigger
>>> disabling whole CTB, as that might just indicate new unsupported
>>> message types.
>>>  Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 42 ++++++++++++++---------
>>>   1 file changed, 26 insertions(+), 16 deletions(-)
>>>  diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> index a55c336cc5ef..0d3556a820a3 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> @@ -578,19 +578,29 @@ static inline bool ct_header_is_response(u32 
>>> header)
>>>   static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
>>>   {
>>>       struct guc_ct_buffer_desc *desc = ctb->desc;
>>> -    u32 head = desc->head / 4;    /* in dwords */
>>> -    u32 tail = desc->tail / 4;    /* in dwords */
>>> -    u32 size = desc->size / 4;    /* in dwords */
>>> +    u32 head = desc->head;
>>> +    u32 tail = desc->tail;
>>> +    u32 size = desc->size;
>>>       u32 *cmds = ctb->cmds;
>>> -    s32 available;            /* in dwords */
>>> +    s32 available;
>>>       unsigned int len;
>>>       unsigned int i;
>>>   -    GEM_BUG_ON(desc->size % 4);
>>> -    GEM_BUG_ON(desc->head % 4);
>>> -    GEM_BUG_ON(desc->tail % 4);
>>> -    GEM_BUG_ON(tail >= size);
>>> -    GEM_BUG_ON(head >= size);
>>> +    if (unlikely(desc->is_in_error))
>>> +        return -EPIPE;
>>
>> How do we recover from this situation? before we marked the buffer as 
>> in_error but didn't stop processing of G2H, but with this return here 
>> we do. Do we need to reset the CTB desc to recover?
> 
> before we should hit BUG_ON followed by PANIC (since we read in irq)
> now (or soon) we should be able to detect stalled CTB and then wedge
> we can't reset CTB alone as IIRC GuC keeps its own head/tail copies
> 

Ok, this is definitely better than a panic. Anyway AFAICS the only G2H 
message handled at the moment is the log flush, which is only enabled 
when we're using rolling debug logs, so there is basically 0 chance of 
hitting this in the wild. We do need to get a recovery method sorted out 
though before we start relying on having more messages. Maybe 
re-registering the buffers with GuC could work?

>>
>>> +
>>> +    if (unlikely(!IS_ALIGNED(head, 4) ||
>>> +             !IS_ALIGNED(tail, 4) ||
>>> +             !IS_ALIGNED(size, 4) ||
>>> +             (tail >= size) || (head >= size))) {
>>> +        DRM_ERROR("CT: Invalid data in descriptor\n");
>>
>> nit: this log is redundant since we have a better message after the 
>> jump which includes the values
> 
> yeah, looking again and agree that's redundant, will remove
> 
> Initially this "better message" was here, then it was reduced after copying
> it after jump to allow below error also to have desc details:
> 
>      DRM_ERROR("CT: incomplete message %*ph %*ph %*ph\n",
> 

With the logs fixed:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

>>
>> Daniele
>>
>>> +        goto corrupted;
>>> +    }
>>> +
>>> +    /* later calculations will be done in dwords */
>>> +    head /= 4;
>>> +    tail /= 4;
>>> +    size /= 4;
>>>         /* tail == head condition indicates empty */
>>>       available = tail - head;
>>> @@ -615,7 +625,7 @@ static int ctb_read(struct intel_guc_ct_buffer 
>>> *ctb, u32 *data)
>>>                      size - head : available - 1), &cmds[head],
>>>                 4 * (head + available - 1 > size ?
>>>                      available - 1 - size + head : 0), &cmds[0]);
>>> -        return -EPROTO;
>>> +        goto corrupted;
>>>       }
>>>         for (i = 1; i < len; i++) {
>>> @@ -626,6 +636,12 @@ static int ctb_read(struct intel_guc_ct_buffer 
>>> *ctb, u32 *data)
>>>         desc->head = head * 4;
>>>       return 0;
>>> +
>>> +corrupted:
>>> +    DRM_ERROR("CT: Corrupted descriptor addr=%#x head=%u tail=%u 
>>> size=%u\n",
>>> +          desc->addr, desc->head, desc->tail, desc->size);
>>> +    desc->is_in_error = 1;
>>> +    return -EPIPE;
>>>   }
>>>     /**
>>> @@ -836,10 +852,4 @@ void intel_guc_ct_event_handler(struct 
>>> intel_guc_ct *ct)
>>>           else
>>>               err = ct_handle_request(ct, msg);
>>>       } while (!err);
>>> -
>>> -    if (GEM_WARN_ON(err == -EPROTO)) {
>>> -        CT_ERROR(ct, "Corrupted message: %#x\n", msg[0]);
>>> -        ctb->desc->is_in_error = 1;
>>> -    }
>>>   }
>>> -

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index a55c336cc5ef..0d3556a820a3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -578,19 +578,29 @@  static inline bool ct_header_is_response(u32 header)
 static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
 {
 	struct guc_ct_buffer_desc *desc = ctb->desc;
-	u32 head = desc->head / 4;	/* in dwords */
-	u32 tail = desc->tail / 4;	/* in dwords */
-	u32 size = desc->size / 4;	/* in dwords */
+	u32 head = desc->head;
+	u32 tail = desc->tail;
+	u32 size = desc->size;
 	u32 *cmds = ctb->cmds;
-	s32 available;			/* in dwords */
+	s32 available;
 	unsigned int len;
 	unsigned int i;
 
-	GEM_BUG_ON(desc->size % 4);
-	GEM_BUG_ON(desc->head % 4);
-	GEM_BUG_ON(desc->tail % 4);
-	GEM_BUG_ON(tail >= size);
-	GEM_BUG_ON(head >= size);
+	if (unlikely(desc->is_in_error))
+		return -EPIPE;
+
+	if (unlikely(!IS_ALIGNED(head, 4) ||
+		     !IS_ALIGNED(tail, 4) ||
+		     !IS_ALIGNED(size, 4) ||
+		     (tail >= size) || (head >= size))) {
+		DRM_ERROR("CT: Invalid data in descriptor\n");
+		goto corrupted;
+	}
+
+	/* later calculations will be done in dwords */
+	head /= 4;
+	tail /= 4;
+	size /= 4;
 
 	/* tail == head condition indicates empty */
 	available = tail - head;
@@ -615,7 +625,7 @@  static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
 			       size - head : available - 1), &cmds[head],
 			  4 * (head + available - 1 > size ?
 			       available - 1 - size + head : 0), &cmds[0]);
-		return -EPROTO;
+		goto corrupted;
 	}
 
 	for (i = 1; i < len; i++) {
@@ -626,6 +636,12 @@  static int ctb_read(struct intel_guc_ct_buffer *ctb, u32 *data)
 
 	desc->head = head * 4;
 	return 0;
+
+corrupted:
+	DRM_ERROR("CT: Corrupted descriptor addr=%#x head=%u tail=%u size=%u\n",
+		  desc->addr, desc->head, desc->tail, desc->size);
+	desc->is_in_error = 1;
+	return -EPIPE;
 }
 
 /**
@@ -836,10 +852,4 @@  void intel_guc_ct_event_handler(struct intel_guc_ct *ct)
 		else
 			err = ct_handle_request(ct, msg);
 	} while (!err);
-
-	if (GEM_WARN_ON(err == -EPROTO)) {
-		CT_ERROR(ct, "Corrupted message: %#x\n", msg[0]);
-		ctb->desc->is_in_error = 1;
-	}
 }
-

[1/5] drm/i915/guc: Don't GEM_BUG_ON on corrupted G2H CTB

Commit Message

Comments

Patch