diff mbox series

[1/4] drm/i915: Allow error capture without a request

Message ID 20230112025311.2577084-2-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Allow error capture without a request / on reset failure | expand

Commit Message

John Harrison Jan. 12, 2023, 2:53 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

There was a report of error captures occurring without any hung
context being indicated despite the capture being initiated by a 'hung
context notification' from GuC. The problem was not reproducible.
However, it can happen if the context in question has no
active requests. For example, if the hang was in the context switch
itself then the breadcrumb write would have occurred and the KMD would
see an idle context.

In the interests of attempting to provide as much information as
possible about a hang, it seems wise to include the engine info
regardless of whether a request was found or not, as opposed to just
pretending there was no hang at all.

So update the error capture code to always record engine information
if an engine is given. This means updating record_context() to take a
context instead of a request (which it only ever used to find the
context anyway), and splitting the request-agnostic parts of
intel_engine_coredump_add_request() out into a separate function.

v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
pointer.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 61 +++++++++++++++++++--------
 1 file changed, 43 insertions(+), 18 deletions(-)
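
In outline, the new request-agnostic helper and the resulting selection
logic in capture_engine() look roughly like this (condensed from the
hunks quoted in the comments below; error paths and surrounding context
trimmed):

	static struct intel_engine_capture_vma *
	engine_coredump_add_context(struct intel_engine_coredump *ee,
				    struct intel_context *ce, gfp_t gfp)
	{
		struct intel_engine_capture_vma *vma = NULL;

		ee->simulated |= record_context(&ee->context, ce);
		if (ee->simulated)
			return NULL;

		/* Snapshot the ring and HW context so userspace cannot overwrite them */
		vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
		vma = capture_vma(vma, ce->state, "HW context", gfp);

		return vma;
	}

	/* In capture_engine(): capture the request if one was found, else just the context */
	if (rq) {
		rq = i915_request_get_rcu(rq);
		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
	} else if (ce) {
		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
	}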

Comments

Tvrtko Ursulin Jan. 12, 2023, 10:01 a.m. UTC | #1
On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> There was a report of error captures occurring without any hung
> context being indicated despite the capture being initiated by a 'hung
> context notification' from GuC. The problem was not reproducible.
> However, it is possible to happen if the context in question has no
> active requests. For example, if the hang was in the context switch
> itself then the breadcrumb write would have occurred and the KMD would
> see an idle context.
> 
> In the interests of attempting to provide as much information as
> possible about a hang, it seems wise to include the engine info
> regardless of whether a request was found or not. As opposed to just
> prentending there was no hang at all.
> 
> So update the error capture code to always record engine information
> if an engine is given. Which means updating record_context() to take a
> context instead of a request (which it only ever used to find the
> context anyway). And split the request agnostic parts of
> intel_engine_coredump_add_request() out into a seaprate function.
> 
> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
> pointer.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 61 +++++++++++++++++++--------
>   1 file changed, 43 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9d5d5a397b64e..bd2cf7d235df0 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
>   }
>   
>   static bool record_context(struct i915_gem_context_coredump *e,
> -			   const struct i915_request *rq)
> +			   struct intel_context *ce)
>   {
>   	struct i915_gem_context *ctx;
>   	struct task_struct *task;
>   	bool simulated;
>   
>   	rcu_read_lock();
> -	ctx = rcu_dereference(rq->context->gem_context);
> +	ctx = rcu_dereference(ce->gem_context);
>   	if (ctx && !kref_get_unless_zero(&ctx->ref))
>   		ctx = NULL;
>   	rcu_read_unlock();
> @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
>   	e->guilty = atomic_read(&ctx->guilty_count);
>   	e->active = atomic_read(&ctx->active_count);
>   
> -	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
> -	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
> +	e->total_runtime = intel_context_get_total_runtime_ns(ce);
> +	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>   
>   	simulated = i915_gem_context_no_error_capture(ctx);
>   
> @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
>   	return ee;
>   }
>   
> +static struct intel_engine_capture_vma *
> +engine_coredump_add_context(struct intel_engine_coredump *ee,
> +			    struct intel_context *ce,
> +			    gfp_t gfp)
> +{
> +	struct intel_engine_capture_vma *vma = NULL;
> +
> +	ee->simulated |= record_context(&ee->context, ce);
> +	if (ee->simulated)
> +		return NULL;
> +
> +	/*
> +	 * We need to copy these to an anonymous buffer
> +	 * as the simplest method to avoid being overwritten
> +	 * by userspace.
> +	 */
> +	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
> +	vma = capture_vma(vma, ce->state, "HW context", gfp);
> +
> +	return vma;
> +}
> +
>   struct intel_engine_capture_vma *
>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   				  struct i915_request *rq,
>   				  gfp_t gfp)
>   {
> -	struct intel_engine_capture_vma *vma = NULL;
> +	struct intel_engine_capture_vma *vma;
>   
> -	ee->simulated |= record_context(&ee->context, rq);
> -	if (ee->simulated)
> +	vma = engine_coredump_add_context(ee, rq->context, gfp);
> +	if (!vma)
>   		return NULL;
>   
>   	/*
> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   	 */
>   	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>   	vma = capture_user(vma, rq, gfp);
> -	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
> -	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>   
>   	ee->rq_head = rq->head;
>   	ee->rq_post = rq->postfix;
> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine,
>   	if (ce) {
>   		intel_engine_clear_hung_context(engine);
>   		rq = intel_context_find_active_request(ce);
> -		if (!rq || !i915_request_started(rq))
> -			goto no_request_capture;
> +		if (rq && !i915_request_started(rq)) {
> +			drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",

Suggest s/active/started/ since we have both i915_request_active and 
i915_request_started, so as to align the terminology.

> +				 engine->name);
> +			rq = NULL;
> +		}
>   	} else {
>   		/*
>   		 * Getting here with GuC enabled means it is a forced error capture
> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs *engine,
>   					       flags);
>   		}
>   	}
> -	if (rq)
> +	if (rq) {
>   		rq = i915_request_get_rcu(rq);
> +		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> +	} else if (ce) {
> +		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
> +	}
>   
> -	if (!rq)
> -		goto no_request_capture;
> -
> -	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
>   	if (!capture) {
> -		i915_request_put(rq);
> +		if (rq)
> +			i915_request_put(rq);
>   		goto no_request_capture;
>   	}
>   	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>   		intel_guc_capture_get_matching_node(engine->gt, ee, ce);

This step requires non-NULL ce, so if you move it under the "else if 
(ce)" above then I *think* exit from the function can be consolidated to 
just:

if (capture) {
	intel_engine_coredump_add_vma(ee, capture, compress);
	if (rq)
		i915_request_put(rq);
} else {
	kfree(ee);
	ee = NULL;
}

return ee;

No "if (rq) i915_request_put()" twice, and the goto label can be 
completely removed.

Regards,

Tvrtko

>   
>   	intel_engine_coredump_add_vma(ee, capture, compress);
> -	i915_request_put(rq);
> +	if (rq)
> +		i915_request_put(rq);
>   
>   	return ee;
>
John Harrison Jan. 12, 2023, 8:40 p.m. UTC | #2
On 1/12/2023 02:01, Tvrtko Ursulin wrote:
> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> There was a report of error captures occurring without any hung
>> context being indicated despite the capture being initiated by a 'hung
>> context notification' from GuC. The problem was not reproducible.
>> However, it is possible to happen if the context in question has no
>> active requests. For example, if the hang was in the context switch
>> itself then the breadcrumb write would have occurred and the KMD would
>> see an idle context.
>>
>> In the interests of attempting to provide as much information as
>> possible about a hang, it seems wise to include the engine info
>> regardless of whether a request was found or not. As opposed to just
>> prentending there was no hang at all.
>>
>> So update the error capture code to always record engine information
>> if an engine is given. Which means updating record_context() to take a
>> context instead of a request (which it only ever used to find the
>> context anyway). And split the request agnostic parts of
>> intel_engine_coredump_add_request() out into a seaprate function.
>>
>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
>> pointer.
>>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 61 +++++++++++++++++++--------
>>   1 file changed, 43 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index 9d5d5a397b64e..bd2cf7d235df0 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct 
>> intel_engine_coredump *ee)
>>   }
>>     static bool record_context(struct i915_gem_context_coredump *e,
>> -               const struct i915_request *rq)
>> +               struct intel_context *ce)
>>   {
>>       struct i915_gem_context *ctx;
>>       struct task_struct *task;
>>       bool simulated;
>>         rcu_read_lock();
>> -    ctx = rcu_dereference(rq->context->gem_context);
>> +    ctx = rcu_dereference(ce->gem_context);
>>       if (ctx && !kref_get_unless_zero(&ctx->ref))
>>           ctx = NULL;
>>       rcu_read_unlock();
>> @@ -1396,8 +1396,8 @@ static bool record_context(struct 
>> i915_gem_context_coredump *e,
>>       e->guilty = atomic_read(&ctx->guilty_count);
>>       e->active = atomic_read(&ctx->active_count);
>>   -    e->total_runtime = 
>> intel_context_get_total_runtime_ns(rq->context);
>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>         simulated = i915_gem_context_no_error_capture(ctx);
>>   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>       return ee;
>>   }
>>   +static struct intel_engine_capture_vma *
>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>> +                struct intel_context *ce,
>> +                gfp_t gfp)
>> +{
>> +    struct intel_engine_capture_vma *vma = NULL;
>> +
>> +    ee->simulated |= record_context(&ee->context, ce);
>> +    if (ee->simulated)
>> +        return NULL;
>> +
>> +    /*
>> +     * We need to copy these to an anonymous buffer
>> +     * as the simplest method to avoid being overwritten
>> +     * by userspace.
>> +     */
>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>> +
>> +    return vma;
>> +}
>> +
>>   struct intel_engine_capture_vma *
>>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>>                     struct i915_request *rq,
>>                     gfp_t gfp)
>>   {
>> -    struct intel_engine_capture_vma *vma = NULL;
>> +    struct intel_engine_capture_vma *vma;
>>   -    ee->simulated |= record_context(&ee->context, rq);
>> -    if (ee->simulated)
>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>> +    if (!vma)
>>           return NULL;
>>         /*
>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
>> intel_engine_coredump *ee,
>>        */
>>       vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>>       vma = capture_user(vma, rq, gfp);
>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>> -    vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>>         ee->rq_head = rq->head;
>>       ee->rq_post = rq->postfix;
>> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine,
>>       if (ce) {
>>           intel_engine_clear_hung_context(engine);
>>           rq = intel_context_find_active_request(ce);
>> -        if (!rq || !i915_request_started(rq))
>> -            goto no_request_capture;
>> +        if (rq && !i915_request_started(rq)) {
>> +            drm_info(&engine->gt->i915->drm, "Got hung context on %s 
>> with no active request!\n",
>
> Suggest s/active/started/ since we have both i915_request_active and 
> i915_request_started, so to align the terminology.
The message text was based on the intent of the activity, not the naming 
of some internal helper function. I can change it if you really want, but 
"with no started request" just reads like bad English to me. Plus it 
gets removed in the next patch anyway...


>
>> +                 engine->name);
>> +            rq = NULL;
>> +        }
>>       } else {
>>           /*
>>            * Getting here with GuC enabled means it is a forced error 
>> capture
>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs *engine,
>>                              flags);
>>           }
>>       }
>> -    if (rq)
>> +    if (rq) {
>>           rq = i915_request_get_rcu(rq);
>> +        capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>> +    } else if (ce) {
>> +        capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>> +    }
>>   -    if (!rq)
>> -        goto no_request_capture;
>> -
>> -    capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>>       if (!capture) {
>> -        i915_request_put(rq);
>> +        if (rq)
>> +            i915_request_put(rq);
>>           goto no_request_capture;
>>       }
>>       if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>           intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>
> This step requires non-NULL ce, so if you move it under the "else if 
> (ce)" above then I *think* exit from the function can be consolidated 
> to just:
>
> if (capture) {
>     intel_engine_coredump_add_vma(ee, capture, compress);
>     if (rq)
>         i915_request_put(rq);
Is there any reason the rq ref needs to be held during the add_vma call? 
Can it now just be moved earlier to be:
     if (rq) {
         rq = i915_request_get_rcu(rq);
         capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
         i915_request_put(rq);
     }

The internals of the request object are only touched in the above 
_add_request() code. The later _add_vma() call fiddles around with vmas 
pulled from the request, but the capture_vma code inside _add_request() 
has already copied everything, hasn't it? Or rather, it has grabbed its 
own private vma resource locks. So there is no requirement to keep the 
request itself around still?
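
I.e. combined with your consolidation the tail of capture_engine() would 
end up roughly like this (untested sketch, assuming nothing after 
_add_request() needs the request itself):

	if (rq) {
		rq = i915_request_get_rcu(rq);
		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
		/* everything needed from the request has been copied/referenced by now */
		i915_request_put(rq);
	} else if (ce) {
		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
	}

	if (!capture) {
		kfree(ee);
		return NULL;
	}

	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
		intel_guc_capture_get_matching_node(engine->gt, ee, ce);

	intel_engine_coredump_add_vma(ee, capture, compress);

	return ee;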

John.


> } else {
>     kfree(ee);
>     ee = NULL;
> }
>
> return ee;
>
> No "if (rq) i915_request_put()" twice, and goto label can be 
> completely removed.
>
> Regards,
>
> Tvrtko
>
>>         intel_engine_coredump_add_vma(ee, capture, compress);
>> -    i915_request_put(rq);
>> +    if (rq)
>> +        i915_request_put(rq);
>>         return ee;
Tvrtko Ursulin Jan. 13, 2023, 9:51 a.m. UTC | #3
On 12/01/2023 20:40, John Harrison wrote:
> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> There was a report of error captures occurring without any hung
>>> context being indicated despite the capture being initiated by a 'hung
>>> context notification' from GuC. The problem was not reproducible.
>>> However, it is possible to happen if the context in question has no
>>> active requests. For example, if the hang was in the context switch
>>> itself then the breadcrumb write would have occurred and the KMD would
>>> see an idle context.
>>>
>>> In the interests of attempting to provide as much information as
>>> possible about a hang, it seems wise to include the engine info
>>> regardless of whether a request was found or not. As opposed to just
>>> prentending there was no hang at all.
>>>
>>> So update the error capture code to always record engine information
>>> if an engine is given. Which means updating record_context() to take a
>>> context instead of a request (which it only ever used to find the
>>> context anyway). And split the request agnostic parts of
>>> intel_engine_coredump_add_request() out into a seaprate function.
>>>
>>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
>>> pointer.
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gpu_error.c | 61 +++++++++++++++++++--------
>>>   1 file changed, 43 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> index 9d5d5a397b64e..bd2cf7d235df0 100644
>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct 
>>> intel_engine_coredump *ee)
>>>   }
>>>     static bool record_context(struct i915_gem_context_coredump *e,
>>> -               const struct i915_request *rq)
>>> +               struct intel_context *ce)
>>>   {
>>>       struct i915_gem_context *ctx;
>>>       struct task_struct *task;
>>>       bool simulated;
>>>         rcu_read_lock();
>>> -    ctx = rcu_dereference(rq->context->gem_context);
>>> +    ctx = rcu_dereference(ce->gem_context);
>>>       if (ctx && !kref_get_unless_zero(&ctx->ref))
>>>           ctx = NULL;
>>>       rcu_read_unlock();
>>> @@ -1396,8 +1396,8 @@ static bool record_context(struct 
>>> i915_gem_context_coredump *e,
>>>       e->guilty = atomic_read(&ctx->guilty_count);
>>>       e->active = atomic_read(&ctx->active_count);
>>>   -    e->total_runtime = 
>>> intel_context_get_total_runtime_ns(rq->context);
>>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>>         simulated = i915_gem_context_no_error_capture(ctx);
>>>   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
>>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>>       return ee;
>>>   }
>>>   +static struct intel_engine_capture_vma *
>>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>>> +                struct intel_context *ce,
>>> +                gfp_t gfp)
>>> +{
>>> +    struct intel_engine_capture_vma *vma = NULL;
>>> +
>>> +    ee->simulated |= record_context(&ee->context, ce);
>>> +    if (ee->simulated)
>>> +        return NULL;
>>> +
>>> +    /*
>>> +     * We need to copy these to an anonymous buffer
>>> +     * as the simplest method to avoid being overwritten
>>> +     * by userspace.
>>> +     */
>>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>>> +
>>> +    return vma;
>>> +}
>>> +
>>>   struct intel_engine_capture_vma *
>>>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>>>                     struct i915_request *rq,
>>>                     gfp_t gfp)
>>>   {
>>> -    struct intel_engine_capture_vma *vma = NULL;
>>> +    struct intel_engine_capture_vma *vma;
>>>   -    ee->simulated |= record_context(&ee->context, rq);
>>> -    if (ee->simulated)
>>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>>> +    if (!vma)
>>>           return NULL;
>>>         /*
>>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
>>> intel_engine_coredump *ee,
>>>        */
>>>       vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>>>       vma = capture_user(vma, rq, gfp);
>>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>>> -    vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>>>         ee->rq_head = rq->head;
>>>       ee->rq_post = rq->postfix;
>>> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine,
>>>       if (ce) {
>>>           intel_engine_clear_hung_context(engine);
>>>           rq = intel_context_find_active_request(ce);
>>> -        if (!rq || !i915_request_started(rq))
>>> -            goto no_request_capture;
>>> +        if (rq && !i915_request_started(rq)) {
>>> +            drm_info(&engine->gt->i915->drm, "Got hung context on %s 
>>> with no active request!\n",
>>
>> Suggest s/active/started/ since we have both i915_request_active and 
>> i915_request_started, so to align the terminology.
> The message text was based on the intent of the activity not the naming 
> of some internal helper function. Can change it if you really want but 
> "with no started request" just reads like bad English to me. Plus it 
> gets removed in the next patch anyway...
> 
> 
>>
>>> +                 engine->name);
>>> +            rq = NULL;
>>> +        }
>>>       } else {
>>>           /*
>>>            * Getting here with GuC enabled means it is a forced error 
>>> capture
>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs *engine,
>>>                              flags);
>>>           }
>>>       }
>>> -    if (rq)
>>> +    if (rq) {
>>>           rq = i915_request_get_rcu(rq);
>>> +        capture = intel_engine_coredump_add_request(ee, rq, 
>>> ATOMIC_MAYFAIL);
>>> +    } else if (ce) {
>>> +        capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>>> +    }
>>>   -    if (!rq)
>>> -        goto no_request_capture;
>>> -
>>> -    capture = intel_engine_coredump_add_request(ee, rq, 
>>> ATOMIC_MAYFAIL);
>>>       if (!capture) {
>>> -        i915_request_put(rq);
>>> +        if (rq)
>>> +            i915_request_put(rq);
>>>           goto no_request_capture;
>>>       }
>>>       if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>           intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>>
>> This step requires non-NULL ce, so if you move it under the "else if 
>> (ce)" above then I *think* exit from the function can be consolidated 
>> to just:
>>
>> if (capture) {
>>     intel_engine_coredump_add_vma(ee, capture, compress);
>>     if (rq)
>>         i915_request_put(rq);
> Is there any reason the rq ref needs to be held during the add_vma call? 
> Can it now just be moved earlier to be:
>      if (rq) {
>          rq = i915_request_get_rcu(rq);
>          capture = intel_engine_coredump_add_request(ee, rq, 
> ATOMIC_MAYFAIL);
>          i915_request_put(rq);
>      }
> 
> The internals of the request object are only touched in the above 
> _add_request() code. The later _add_vma() call fiddles around with vmas 
> that pulled from the request but the capture_vma code inside 
> _add_request() has already copied everything, hasn't it? Or rather, it 
> has grabbed its own private vma resource locks. So there is no 
> requirement to keep the request itself around still?

Don't know.. it is a question of whether the changes from 60dc43d1190d 
("drm/i915: Use struct vma_resource instead of struct vma_snapshot") 
removed the need for holding the rq reference that "long", I guess? 
Adding Thomas and Matt to perhaps comment.

Regards,

Tvrtko


> John.
> 
> 
>> } else {
>>     kfree(ee);
>>     ee = NULL;
>> }
>>
>> return ee;
>>
>> No "if (rq) i915_request_put()" twice, and goto label can be 
>> completely removed.
>>
>> Regards,
>>
>> Tvrtko
>>
>>>         intel_engine_coredump_add_vma(ee, capture, compress);
>>> -    i915_request_put(rq);
>>> +    if (rq)
>>> +        i915_request_put(rq);
>>>         return ee;
>
Hellstrom, Thomas Jan. 13, 2023, 5:46 p.m. UTC | #4
On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
> 
> On 12/01/2023 20:40, John Harrison wrote:
> > On 1/12/2023 02:01, Tvrtko Ursulin wrote:
> > > On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
> > > > From: John Harrison <John.C.Harrison@Intel.com>
> > > > 
> > > > There was a report of error captures occurring without any hung
> > > > context being indicated despite the capture being initiated by
> > > > a 'hung
> > > > context notification' from GuC. The problem was not
> > > > reproducible.
> > > > However, it is possible to happen if the context in question
> > > > has no
> > > > active requests. For example, if the hang was in the context
> > > > switch
> > > > itself then the breadcrumb write would have occurred and the
> > > > KMD would
> > > > see an idle context.
> > > > 
> > > > In the interests of attempting to provide as much information
> > > > as
> > > > possible about a hang, it seems wise to include the engine info
> > > > regardless of whether a request was found or not. As opposed to
> > > > just
> > > > prentending there was no hang at all.
> > > > 
> > > > So update the error capture code to always record engine
> > > > information
> > > > if an engine is given. Which means updating record_context() to
> > > > take a
> > > > context instead of a request (which it only ever used to find
> > > > the
> > > > context anyway). And split the request agnostic parts of
> > > > intel_engine_coredump_add_request() out into a seaprate
> > > > function.
> > > > 
> > > > v2: Remove a duplicate 'if' statement (Umesh) and fix a put of
> > > > a null
> > > > pointer.
> > > > 
> > > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > > > Reviewed-by: Umesh Nerlige Ramappa
> > > > <umesh.nerlige.ramappa@intel.com>
> > > > ---
> > > >   drivers/gpu/drm/i915/i915_gpu_error.c | 61
> > > > +++++++++++++++++++--------
> > > >   1 file changed, 43 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> > > > b/drivers/gpu/drm/i915/i915_gpu_error.c
> > > > index 9d5d5a397b64e..bd2cf7d235df0 100644
> > > > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> > > > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> > > > @@ -1370,14 +1370,14 @@ static void
> > > > engine_record_execlists(struct 
> > > > intel_engine_coredump *ee)
> > > >   }
> > > >     static bool record_context(struct i915_gem_context_coredump
> > > > *e,
> > > > -               const struct i915_request *rq)
> > > > +               struct intel_context *ce)
> > > >   {
> > > >       struct i915_gem_context *ctx;
> > > >       struct task_struct *task;
> > > >       bool simulated;
> > > >         rcu_read_lock();
> > > > -    ctx = rcu_dereference(rq->context->gem_context);
> > > > +    ctx = rcu_dereference(ce->gem_context);
> > > >       if (ctx && !kref_get_unless_zero(&ctx->ref))
> > > >           ctx = NULL;
> > > >       rcu_read_unlock();
> > > > @@ -1396,8 +1396,8 @@ static bool record_context(struct 
> > > > i915_gem_context_coredump *e,
> > > >       e->guilty = atomic_read(&ctx->guilty_count);
> > > >       e->active = atomic_read(&ctx->active_count);
> > > >   -    e->total_runtime = 
> > > > intel_context_get_total_runtime_ns(rq->context);
> > > > -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq-
> > > > >context);
> > > > +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
> > > > +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
> > > >         simulated = i915_gem_context_no_error_capture(ctx);
> > > >   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
> > > > intel_engine_cs *engine, gfp_t gfp, u32 dump_
> > > >       return ee;
> > > >   }
> > > >   +static struct intel_engine_capture_vma *
> > > > +engine_coredump_add_context(struct intel_engine_coredump *ee,
> > > > +                struct intel_context *ce,
> > > > +                gfp_t gfp)
> > > > +{
> > > > +    struct intel_engine_capture_vma *vma = NULL;
> > > > +
> > > > +    ee->simulated |= record_context(&ee->context, ce);
> > > > +    if (ee->simulated)
> > > > +        return NULL;
> > > > +
> > > > +    /*
> > > > +     * We need to copy these to an anonymous buffer
> > > > +     * as the simplest method to avoid being overwritten
> > > > +     * by userspace.
> > > > +     */
> > > > +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
> > > > +    vma = capture_vma(vma, ce->state, "HW context", gfp);
> > > > +
> > > > +    return vma;
> > > > +}
> > > > +
> > > >   struct intel_engine_capture_vma *
> > > >   intel_engine_coredump_add_request(struct
> > > > intel_engine_coredump *ee,
> > > >                     struct i915_request *rq,
> > > >                     gfp_t gfp)
> > > >   {
> > > > -    struct intel_engine_capture_vma *vma = NULL;
> > > > +    struct intel_engine_capture_vma *vma;
> > > >   -    ee->simulated |= record_context(&ee->context, rq);
> > > > -    if (ee->simulated)
> > > > +    vma = engine_coredump_add_context(ee, rq->context, gfp);
> > > > +    if (!vma)
> > > >           return NULL;
> > > >         /*
> > > > @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
> > > > intel_engine_coredump *ee,
> > > >        */
> > > >       vma = capture_vma_snapshot(vma, rq->batch_res, gfp,
> > > > "batch");
> > > >       vma = capture_user(vma, rq, gfp);
> > > > -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
> > > > -    vma = capture_vma(vma, rq->context->state, "HW context",
> > > > gfp);
> > > >         ee->rq_head = rq->head;
> > > >       ee->rq_post = rq->postfix;
> > > > @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs
> > > > *engine,
> > > >       if (ce) {
> > > >           intel_engine_clear_hung_context(engine);
> > > >           rq = intel_context_find_active_request(ce);
> > > > -        if (!rq || !i915_request_started(rq))
> > > > -            goto no_request_capture;
> > > > +        if (rq && !i915_request_started(rq)) {
> > > > +            drm_info(&engine->gt->i915->drm, "Got hung context
> > > > on %s 
> > > > with no active request!\n",
> > > 
> > > Suggest s/active/started/ since we have both i915_request_active
> > > and 
> > > i915_request_started, so to align the terminology.
> > The message text was based on the intent of the activity not the
> > naming 
> > of some internal helper function. Can change it if you really want
> > but 
> > "with no started request" just reads like bad English to me. Plus
> > it 
> > gets removed in the next patch anyway...
> > 
> > 
> > > 
> > > > +                 engine->name);
> > > > +            rq = NULL;
> > > > +        }
> > > >       } else {
> > > >           /*
> > > >            * Getting here with GuC enabled means it is a forced
> > > > error 
> > > > capture
> > > > @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
> > > > *engine,
> > > >                              flags);
> > > >           }
> > > >       }
> > > > -    if (rq)
> > > > +    if (rq) {
> > > >           rq = i915_request_get_rcu(rq);
> > > > +        capture = intel_engine_coredump_add_request(ee, rq, 
> > > > ATOMIC_MAYFAIL);
> > > > +    } else if (ce) {
> > > > +        capture = engine_coredump_add_context(ee, ce,
> > > > ATOMIC_MAYFAIL);
> > > > +    }
> > > >   -    if (!rq)
> > > > -        goto no_request_capture;
> > > > -
> > > > -    capture = intel_engine_coredump_add_request(ee, rq, 
> > > > ATOMIC_MAYFAIL);
> > > >       if (!capture) {
> > > > -        i915_request_put(rq);
> > > > +        if (rq)
> > > > +            i915_request_put(rq);
> > > >           goto no_request_capture;
> > > >       }
> > > >       if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
> > > >           intel_guc_capture_get_matching_node(engine->gt, ee,
> > > > ce);
> > > 
> > > This step requires non-NULL ce, so if you move it under the "else
> > > if 
> > > (ce)" above then I *think* exit from the function can be
> > > consolidated 
> > > to just:
> > > 
> > > if (capture) {
> > >     intel_engine_coredump_add_vma(ee, capture, compress);
> > >     if (rq)
> > >         i915_request_put(rq);
> > Is there any reason the rq ref needs to be held during the add_vma
> > call? 
> > Can it now just be moved earlier to be:
> >      if (rq) {
> >          rq = i915_request_get_rcu(rq);
> >          capture = intel_engine_coredump_add_request(ee, rq, 
> > ATOMIC_MAYFAIL);
> >          i915_request_put(rq);
> >      }
> > 
> > The internals of the request object are only touched in the above 
> > _add_request() code. The later _add_vma() call fiddles around with
> > vmas 
> > that pulled from the request but the capture_vma code inside 
> > _add_request() has already copied everything, hasn't it? Or rather,
> > it 
> > has grabbed its own private vma resource locks. So there is no 
> > requirement to keep the request itself around still?

That sounds correct. It was some time ago that I worked with this code,
but when I started, IIRC KASAN told me the request, along with the whole
capture list, could disappear under us due to a parallel capture.

So the request reference added then might cover a bit too much now that
we also hold references on vma resources, which it looks like we do in
intel_engine_coredump_add_vma().

Another thing which is crappy with the current error capture code is
that the request capture list needs to be freed with the request and
not when the request signals (We can't block request signalling in the
capture code to keep the capture list around). There might be many
signaled requests hanging around in non-pruned dma_resv objects and
thus many unused capture lists with many unused vma resources. :/

/Thomas


> 
> Don't know.. it is a question if changes from 60dc43d1190d
> ("drm/i915: 
> Use struct vma_resource instead of struct vma_snapshot") removed the 
> need for holding the rq reference that "long" I guess? Adding Thomas
> and 
> Matt to perhaps comment.
> 
> Regards,
> 
> Tvrtko
> 
> 
> > John.
> > 
> > 
> > > } else {
> > >     kfree(ee);
> > >     ee = NULL;
> > > }
> > > 
> > > return ee;
> > > 
> > > No "if (rq) i915_request_put()" twice, and goto label can be 
> > > completely removed.
> > > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > >         intel_engine_coredump_add_vma(ee, capture, compress);
> > > > -    i915_request_put(rq);
> > > > +    if (rq)
> > > > +        i915_request_put(rq);
> > > >         return ee;
> >
John Harrison Jan. 13, 2023, 9:29 p.m. UTC | #5
On 1/13/2023 09:46, Hellstrom, Thomas wrote:
> On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
>> On 12/01/2023 20:40, John Harrison wrote:
>>> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>>>> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> There was a report of error captures occurring without any hung
>>>>> context being indicated despite the capture being initiated by
>>>>> a 'hung
>>>>> context notification' from GuC. The problem was not
>>>>> reproducible.
>>>>> However, it is possible to happen if the context in question
>>>>> has no
>>>>> active requests. For example, if the hang was in the context
>>>>> switch
>>>>> itself then the breadcrumb write would have occurred and the
>>>>> KMD would
>>>>> see an idle context.
>>>>>
>>>>> In the interests of attempting to provide as much information
>>>>> as
>>>>> possible about a hang, it seems wise to include the engine info
>>>>> regardless of whether a request was found or not. As opposed to
>>>>> just
>>>>> prentending there was no hang at all.
>>>>>
>>>>> So update the error capture code to always record engine
>>>>> information
>>>>> if an engine is given. Which means updating record_context() to
>>>>> take a
>>>>> context instead of a request (which it only ever used to find
>>>>> the
>>>>> context anyway). And split the request agnostic parts of
>>>>> intel_engine_coredump_add_request() out into a seaprate
>>>>> function.
>>>>>
>>>>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of
>>>>> a null
>>>>> pointer.
>>>>>
>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>> Reviewed-by: Umesh Nerlige Ramappa
>>>>> <umesh.nerlige.ramappa@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/i915_gpu_error.c | 61
>>>>> +++++++++++++++++++--------
>>>>>    1 file changed, 43 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> index 9d5d5a397b64e..bd2cf7d235df0 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> @@ -1370,14 +1370,14 @@ static void
>>>>> engine_record_execlists(struct
>>>>> intel_engine_coredump *ee)
>>>>>    }
>>>>>      static bool record_context(struct i915_gem_context_coredump
>>>>> *e,
>>>>> -               const struct i915_request *rq)
>>>>> +               struct intel_context *ce)
>>>>>    {
>>>>>        struct i915_gem_context *ctx;
>>>>>        struct task_struct *task;
>>>>>        bool simulated;
>>>>>          rcu_read_lock();
>>>>> -    ctx = rcu_dereference(rq->context->gem_context);
>>>>> +    ctx = rcu_dereference(ce->gem_context);
>>>>>        if (ctx && !kref_get_unless_zero(&ctx->ref))
>>>>>            ctx = NULL;
>>>>>        rcu_read_unlock();
>>>>> @@ -1396,8 +1396,8 @@ static bool record_context(struct
>>>>> i915_gem_context_coredump *e,
>>>>>        e->guilty = atomic_read(&ctx->guilty_count);
>>>>>        e->active = atomic_read(&ctx->active_count);
>>>>>    -    e->total_runtime =
>>>>> intel_context_get_total_runtime_ns(rq->context);
>>>>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq-
>>>>>> context);
>>>>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>>>>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>>>>          simulated = i915_gem_context_no_error_capture(ctx);
>>>>>    @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct
>>>>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>>>>        return ee;
>>>>>    }
>>>>>    +static struct intel_engine_capture_vma *
>>>>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>>>>> +                struct intel_context *ce,
>>>>> +                gfp_t gfp)
>>>>> +{
>>>>> +    struct intel_engine_capture_vma *vma = NULL;
>>>>> +
>>>>> +    ee->simulated |= record_context(&ee->context, ce);
>>>>> +    if (ee->simulated)
>>>>> +        return NULL;
>>>>> +
>>>>> +    /*
>>>>> +     * We need to copy these to an anonymous buffer
>>>>> +     * as the simplest method to avoid being overwritten
>>>>> +     * by userspace.
>>>>> +     */
>>>>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>>>>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>>>>> +
>>>>> +    return vma;
>>>>> +}
>>>>> +
>>>>>    struct intel_engine_capture_vma *
>>>>>    intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>                      struct i915_request *rq,
>>>>>                      gfp_t gfp)
>>>>>    {
>>>>> -    struct intel_engine_capture_vma *vma = NULL;
>>>>> +    struct intel_engine_capture_vma *vma;
>>>>>    -    ee->simulated |= record_context(&ee->context, rq);
>>>>> -    if (ee->simulated)
>>>>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>>>>> +    if (!vma)
>>>>>            return NULL;
>>>>>          /*
>>>>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>         */
>>>>>        vma = capture_vma_snapshot(vma, rq->batch_res, gfp,
>>>>> "batch");
>>>>>        vma = capture_user(vma, rq, gfp);
>>>>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>>>>> -    vma = capture_vma(vma, rq->context->state, "HW context",
>>>>> gfp);
>>>>>          ee->rq_head = rq->head;
>>>>>        ee->rq_post = rq->postfix;
>>>>> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>        if (ce) {
>>>>>            intel_engine_clear_hung_context(engine);
>>>>>            rq = intel_context_find_active_request(ce);
>>>>> -        if (!rq || !i915_request_started(rq))
>>>>> -            goto no_request_capture;
>>>>> +        if (rq && !i915_request_started(rq)) {
>>>>> +            drm_info(&engine->gt->i915->drm, "Got hung context
>>>>> on %s
>>>>> with no active request!\n",
>>>> Suggest s/active/started/ since we have both i915_request_active
>>>> and
>>>> i915_request_started, so to align the terminology.
>>> The message text was based on the intent of the activity not the
>>> naming
>>> of some internal helper function. Can change it if you really want
>>> but
>>> "with no started request" just reads like bad English to me. Plus
>>> it
>>> gets removed in the next patch anyway...
>>>
>>>
>>>>> +                 engine->name);
>>>>> +            rq = NULL;
>>>>> +        }
>>>>>        } else {
>>>>>            /*
>>>>>             * Getting here with GuC enabled means it is a forced
>>>>> error
>>>>> capture
>>>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>                               flags);
>>>>>            }
>>>>>        }
>>>>> -    if (rq)
>>>>> +    if (rq) {
>>>>>            rq = i915_request_get_rcu(rq);
>>>>> +        capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    } else if (ce) {
>>>>> +        capture = engine_coredump_add_context(ee, ce,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    }
>>>>>    -    if (!rq)
>>>>> -        goto no_request_capture;
>>>>> -
>>>>> -    capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>>        if (!capture) {
>>>>> -        i915_request_put(rq);
>>>>> +        if (rq)
>>>>> +            i915_request_put(rq);
>>>>>            goto no_request_capture;
>>>>>        }
>>>>>        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>>>            intel_guc_capture_get_matching_node(engine->gt, ee,
>>>>> ce);
>>>> This step requires non-NULL ce, so if you move it under the "else
>>>> if
>>>> (ce)" above then I *think* exit from the function can be
>>>> consolidated
>>>> to just:
>>>>
>>>> if (capture) {
>>>>      intel_engine_coredump_add_vma(ee, capture, compress);
>>>>      if (rq)
>>>>          i915_request_put(rq);
>>> Is there any reason the rq ref needs to be held during the add_vma
>>> call?
>>> Can it now just be moved earlier to be:
>>>       if (rq) {
>>>           rq = i915_request_get_rcu(rq);
>>>           capture = intel_engine_coredump_add_request(ee, rq,
>>> ATOMIC_MAYFAIL);
>>>           i915_request_put(rq);
>>>       }
>>>
>>> The internals of the request object are only touched in the above
>>> _add_request() code. The later _add_vma() call fiddles around with
>>> vmas
>>> that pulled from the request but the capture_vma code inside
>>> _add_request() has already copied everything, hasn't it? Or rather,
>>> it
>>> has grabbed its own private vma resource locks. So there is no
>>> requirement to keep the request itself around still?
> That sounds correct. It was some time ago since I worked with this code
> but when i started IIRC KASAN told me the request along with the whole
> capture list could disappear under us due to a parallel capture.
>
> So the request reference added then might cover a bit too much now that
> we also hold references on vma resources, which it looks like we do in
> intel_engine_coredump_add_vma().
So that means we end up with:
     rq = intel_context_find_active_request(ce);
     ...
     [test stuff like i915_request_started(rq)]
     ...
     if (rq) {
         rq = i915_request_get_rcu(rq);
         capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
         i915_request_put(rq);
     }

What is special about coredump_add_request() that it needs the request 
to be extra locked for that call and only that call? If the request can 
magically vanish after being found then what protects the _started() 
query? For that matter, what stops the request_get_rcu() itself being 
called on a pointer that is no longer valid? And if we do actually have 
sufficient locking in place to prevent that, why doesn't that cover the 
coredump_add_request() usage?

John.

>
> Another thing which is crappy with the current error capture code is
> that the request capture list needs to be freed with the request and
> not when the request signals (We can't block request signalling in the
> capture code to keep the capture list around). There might be many
> signaled requests hanging around in non-pruned dma_resv objects and
> thus many unused capture lists with many unused vma resources. :/
>
> /Thomas
>
>
>> Don't know.. it is a question if changes from 60dc43d1190d
>> ("drm/i915:
>> Use struct vma_resource instead of struct vma_snapshot") removed the
>> need for holding the rq reference that "long" I guess? Adding Thomas
>> and
>> Matt to perhaps comment.
>>
>> Regards,
>>
>> Tvrtko
>>
>>
>>> John.
>>>
>>>
>>>> } else {
>>>>      kfree(ee);
>>>>      ee = NULL;
>>>> }
>>>>
>>>> return ee;
>>>>
>>>> No "if (rq) i915_request_put()" twice, and goto label can be
>>>> completely removed.
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>>          intel_engine_coredump_add_vma(ee, capture, compress);
>>>>> -    i915_request_put(rq);
>>>>> +    if (rq)
>>>>> +        i915_request_put(rq);
>>>>>          return ee;
Tvrtko Ursulin Jan. 16, 2023, 12:13 p.m. UTC | #6
On 13/01/2023 17:46, Hellstrom, Thomas wrote:
> On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
>>
>> On 12/01/2023 20:40, John Harrison wrote:
>>> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>>>> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> There was a report of error captures occurring without any hung
>>>>> context being indicated despite the capture being initiated by
>>>>> a 'hung
>>>>> context notification' from GuC. The problem was not
>>>>> reproducible.
>>>>> However, it is possible to happen if the context in question
>>>>> has no
>>>>> active requests. For example, if the hang was in the context
>>>>> switch
>>>>> itself then the breadcrumb write would have occurred and the
>>>>> KMD would
>>>>> see an idle context.
>>>>>
>>>>> In the interests of attempting to provide as much information
>>>>> as
>>>>> possible about a hang, it seems wise to include the engine info
>>>>> regardless of whether a request was found or not. As opposed to
>>>>> just
>>>>> prentending there was no hang at all.
>>>>>
>>>>> So update the error capture code to always record engine
>>>>> information
>>>>> if an engine is given. Which means updating record_context() to
>>>>> take a
>>>>> context instead of a request (which it only ever used to find
>>>>> the
>>>>> context anyway). And split the request agnostic parts of
>>>>> intel_engine_coredump_add_request() out into a seaprate
>>>>> function.
>>>>>
>>>>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of
>>>>> a null
>>>>> pointer.
>>>>>
>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>> Reviewed-by: Umesh Nerlige Ramappa
>>>>> <umesh.nerlige.ramappa@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/i915_gpu_error.c | 61
>>>>> +++++++++++++++++++--------
>>>>>    1 file changed, 43 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> index 9d5d5a397b64e..bd2cf7d235df0 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> @@ -1370,14 +1370,14 @@ static void
>>>>> engine_record_execlists(struct
>>>>> intel_engine_coredump *ee)
>>>>>    }
>>>>>      static bool record_context(struct i915_gem_context_coredump
>>>>> *e,
>>>>> -               const struct i915_request *rq)
>>>>> +               struct intel_context *ce)
>>>>>    {
>>>>>        struct i915_gem_context *ctx;
>>>>>        struct task_struct *task;
>>>>>        bool simulated;
>>>>>          rcu_read_lock();
>>>>> -    ctx = rcu_dereference(rq->context->gem_context);
>>>>> +    ctx = rcu_dereference(ce->gem_context);
>>>>>        if (ctx && !kref_get_unless_zero(&ctx->ref))
>>>>>            ctx = NULL;
>>>>>        rcu_read_unlock();
>>>>> @@ -1396,8 +1396,8 @@ static bool record_context(struct
>>>>> i915_gem_context_coredump *e,
>>>>>        e->guilty = atomic_read(&ctx->guilty_count);
>>>>>        e->active = atomic_read(&ctx->active_count);
>>>>>    -    e->total_runtime =
>>>>> intel_context_get_total_runtime_ns(rq->context);
>>>>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq-
>>>>>> context);
>>>>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>>>>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>>>>          simulated = i915_gem_context_no_error_capture(ctx);
>>>>>    @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct
>>>>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>>>>        return ee;
>>>>>    }
>>>>>    +static struct intel_engine_capture_vma *
>>>>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>>>>> +                struct intel_context *ce,
>>>>> +                gfp_t gfp)
>>>>> +{
>>>>> +    struct intel_engine_capture_vma *vma = NULL;
>>>>> +
>>>>> +    ee->simulated |= record_context(&ee->context, ce);
>>>>> +    if (ee->simulated)
>>>>> +        return NULL;
>>>>> +
>>>>> +    /*
>>>>> +     * We need to copy these to an anonymous buffer
>>>>> +     * as the simplest method to avoid being overwritten
>>>>> +     * by userspace.
>>>>> +     */
>>>>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>>>>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>>>>> +
>>>>> +    return vma;
>>>>> +}
>>>>> +
>>>>>    struct intel_engine_capture_vma *
>>>>>    intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>                      struct i915_request *rq,
>>>>>                      gfp_t gfp)
>>>>>    {
>>>>> -    struct intel_engine_capture_vma *vma = NULL;
>>>>> +    struct intel_engine_capture_vma *vma;
>>>>>    -    ee->simulated |= record_context(&ee->context, rq);
>>>>> -    if (ee->simulated)
>>>>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>>>>> +    if (!vma)
>>>>>            return NULL;
>>>>>          /*
>>>>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>         */
>>>>>        vma = capture_vma_snapshot(vma, rq->batch_res, gfp,
>>>>> "batch");
>>>>>        vma = capture_user(vma, rq, gfp);
>>>>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>>>>> -    vma = capture_vma(vma, rq->context->state, "HW context",
>>>>> gfp);
>>>>>          ee->rq_head = rq->head;
>>>>>        ee->rq_post = rq->postfix;
>>>>> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>        if (ce) {
>>>>>            intel_engine_clear_hung_context(engine);
>>>>>            rq = intel_context_find_active_request(ce);
>>>>> -        if (!rq || !i915_request_started(rq))
>>>>> -            goto no_request_capture;
>>>>> +        if (rq && !i915_request_started(rq)) {
>>>>> +            drm_info(&engine->gt->i915->drm, "Got hung context
>>>>> on %s
>>>>> with no active request!\n",
>>>>
>>>> Suggest s/active/started/ since we have both i915_request_active
>>>> and
>>>> i915_request_started, so to align the terminology.
>>> The message text was based on the intent of the activity not the
>>> naming
>>> of some internal helper function. Can change it if you really want
>>> but
>>> "with no started request" just reads like bad English to me. Plus
>>> it
>>> gets removed in the next patch anyway...
>>>
>>>
>>>>
>>>>> +                 engine->name);
>>>>> +            rq = NULL;
>>>>> +        }
>>>>>        } else {
>>>>>            /*
>>>>>             * Getting here with GuC enabled means it is a forced
>>>>> error
>>>>> capture
>>>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>                               flags);
>>>>>            }
>>>>>        }
>>>>> -    if (rq)
>>>>> +    if (rq) {
>>>>>            rq = i915_request_get_rcu(rq);
>>>>> +        capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    } else if (ce) {
>>>>> +        capture = engine_coredump_add_context(ee, ce,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    }
>>>>>    -    if (!rq)
>>>>> -        goto no_request_capture;
>>>>> -
>>>>> -    capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>>        if (!capture) {
>>>>> -        i915_request_put(rq);
>>>>> +        if (rq)
>>>>> +            i915_request_put(rq);
>>>>>            goto no_request_capture;
>>>>>        }
>>>>>        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>>>            intel_guc_capture_get_matching_node(engine->gt, ee,
>>>>> ce);
>>>>
>>>> This step requires non-NULL ce, so if you move it under the "else
>>>> if
>>>> (ce)" above then I *think* exit from the function can be
>>>> consolidated
>>>> to just:
>>>>
>>>> if (capture) {
>>>>      intel_engine_coredump_add_vma(ee, capture, compress);
>>>>      if (rq)
>>>>          i915_request_put(rq);
>>> Is there any reason the rq ref needs to be held during the add_vma
>>> call?
>>> Can it now just be moved earlier to be:
>>>       if (rq) {
>>>           rq = i915_request_get_rcu(rq);
>>>           capture = intel_engine_coredump_add_request(ee, rq,
>>> ATOMIC_MAYFAIL);
>>>           i915_request_put(rq);
>>>       }
>>>
>>> The internals of the request object are only touched in the above
>>> _add_request() code. The later _add_vma() call fiddles around with
>>> vmas
>>> that pulled from the request but the capture_vma code inside
>>> _add_request() has already copied everything, hasn't it? Or rather,
>>> it
>>> has grabbed its own private vma resource locks. So there is no
>>> requirement to keep the request itself around still?
> 
> That sounds correct. It was some time ago since I worked with this code
> but when i started IIRC KASAN told me the request along with the whole
> capture list could disappear under us due to a parallel capture.
> 
> So the request reference added then might cover a bit too much now that
> we also hold references on vma resources, which it looks like we do in
> intel_engine_coredump_add_vma().

If you are not sure, maybe we should just leave the reference covering 
what it currently does? I don't think there is any harm in covering too 
much. I mean whether it is released immediately after the call to 
intel_engine_coredump_add_request() or at the exit of capture_engine(); 
we can skip that clean up if in doubt.

> Another thing which is crappy with the current error capture code is
> that the request capture list needs to be freed with the request and
> not when the request signals (We can't block request signalling in the
> capture code to keep the capture list around). There might be many
> signaled requests hanging around in non-pruned dma_resv objects and
> thus many unused capture lists with many unused vma resources. :/

This last part sounds vaguely familiar - is it really a problem with the 
error capture code, or was it some other refactoring which removed the 
pruning of signaled fences from dma_resv?

Regards,

Tvrtko

> 
> /Thomas
> 
> 
>>
>> Don't know.. it is a question if changes from 60dc43d1190d
>> ("drm/i915:
>> Use struct vma_resource instead of struct vma_snapshot") removed the
>> need for holding the rq reference that "long" I guess? Adding Thomas
>> and
>> Matt to perhaps comment.
>>
>> Regards,
>>
>> Tvrtko
>>
>>
>>> John.
>>>
>>>
>>>> } else {
>>>>      kfree(ee);
>>>>      ee = NULL;
>>>> }
>>>>
>>>> return ee;
>>>>
>>>> No "if (rq) i915_request_put()" twice, and goto label can be
>>>> completely removed.
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>>          intel_engine_coredump_add_vma(ee, capture, compress);
>>>>> -    i915_request_put(rq);
>>>>> +    if (rq)
>>>>> +        i915_request_put(rq);
>>>>>          return ee;
>>>
>
Tvrtko Ursulin Jan. 16, 2023, 12:38 p.m. UTC | #7
On 13/01/2023 21:29, John Harrison wrote:
> On 1/13/2023 09:46, Hellstrom, Thomas wrote:
>> On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
>>> On 12/01/2023 20:40, John Harrison wrote:
>>>> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>>>>> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:

[snip]

>>>>>> +                 engine->name);
>>>>>> +            rq = NULL;
>>>>>> +        }
>>>>>>        } else {
>>>>>>            /*
>>>>>>             * Getting here with GuC enabled means it is a forced
>>>>>> error
>>>>>> capture
>>>>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
>>>>>> *engine,
>>>>>>                               flags);
>>>>>>            }
>>>>>>        }
>>>>>> -    if (rq)
>>>>>> +    if (rq) {
>>>>>>            rq = i915_request_get_rcu(rq);
>>>>>> +        capture = intel_engine_coredump_add_request(ee, rq,
>>>>>> ATOMIC_MAYFAIL);
>>>>>> +    } else if (ce) {
>>>>>> +        capture = engine_coredump_add_context(ee, ce,
>>>>>> ATOMIC_MAYFAIL);
>>>>>> +    }
>>>>>>    -    if (!rq)
>>>>>> -        goto no_request_capture;
>>>>>> -
>>>>>> -    capture = intel_engine_coredump_add_request(ee, rq,
>>>>>> ATOMIC_MAYFAIL);
>>>>>>        if (!capture) {
>>>>>> -        i915_request_put(rq);
>>>>>> +        if (rq)
>>>>>> +            i915_request_put(rq);
>>>>>>            goto no_request_capture;
>>>>>>        }
>>>>>>        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>>>>            intel_guc_capture_get_matching_node(engine->gt, ee,
>>>>>> ce);
>>>>> This step requires non-NULL ce, so if you move it under the "else
>>>>> if
>>>>> (ce)" above then I *think* exit from the function can be
>>>>> consolidated
>>>>> to just:
>>>>>
>>>>> if (capture) {
>>>>>      intel_engine_coredump_add_vma(ee, capture, compress);
>>>>>      if (rq)
>>>>>          i915_request_put(rq);
>>>> Is there any reason the rq ref needs to be held during the add_vma
>>>> call?
>>>> Can it now just be moved earlier to be:
>>>>       if (rq) {
>>>>           rq = i915_request_get_rcu(rq);
>>>>           capture = intel_engine_coredump_add_request(ee, rq,
>>>> ATOMIC_MAYFAIL);
>>>>           i915_request_put(rq);
>>>>       }
>>>>
>>>> The internals of the request object are only touched in the above
>>>> _add_request() code. The later _add_vma() call fiddles around with
>>>> vmas
>>>> that pulled from the request but the capture_vma code inside
>>>> _add_request() has already copied everything, hasn't it? Or rather,
>>>> it
>>>> has grabbed its own private vma resource locks. So there is no
>>>> requirement to keep the request itself around still?
>> That sounds correct. It was some time ago since I worked with this code
>> but when i started IIRC KASAN told me the request along with the whole
>> capture list could disappear under us due to a parallel capture.
>>
>> So the request reference added then might cover a bit too much now that
>> we also hold references on vma resources, which it looks like we do in
>> intel_engine_coredump_add_vma().
> So that means we end up with:
>      rq = intel_context_find_active_request(ce);
>      ...
>      [test stuff like i915_request_started(rq)]
>      ...
>       if (rq) {
>          rq = i915_request_get_rcu(rq);
>          capture = intel_engine_coredump_add_request(ee, rq, 
> ATOMIC_MAYFAIL);
>          i915_request_put(rq);
>      }
> 
> What is special about coredump_add_request() that it needs the request 
> to be extra locked for that call and only that call? If the request can 
> magically vanish after being found then what protects the _started() 
> query? For that matter, what stops the request_get_rcu() itself being 
> called on a pointer that is no longer valid? And if we do actually have 
> sufficient locking in place to prevent that, why doesn't that cover the 
> coredump_add_request() usage?

There is definitely a red flag there with the difference between the if 
and else blocks at the top of capture_engine(). And funnily enough, the 
first block appears to be GuC only. That is not obvious from the code 
and should probably have a comment, or function names made self-documenting.

I guess the special thing about intel_engine_coredump_add_request() is 
that it dereferences the rq. So it is possibly 573ba126aef3 
("drm/i915/guc: Capture error state on context reset") which added a bug 
where rq can be dereferenced with a reference held. Or perhaps with the 
GuC backend there is a guarantee request cannot be retired from 
elsewhere while error capture is examining it.

To unravel the error entry points into error capture, from execlists, 
debugfs, ringbuffer, I don't have the time to remind myself how all that 
works right now. Quite possibly at least some of those run async to the 
GPU so must be safe against parallel request retirement. So I don't know 
if the i915_request_get_rcu is safe in all those cases without spending 
some time to refresh my knowledge a bit.

Sounds like the best plan is not to change this too much - just leave 
the scope of reference held as is and ideally eliminate the unnecessary 
goto labels. AFAIR that should be doable without changing anything real 
and unblock these improvements.
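
For completeness, roughly the tail-end shape I have in mind - just a 
sketch against the names used in the patch, not compile tested, and the 
explicit ce check in front of the GuC node lookup is my assumption:

	if (rq) {
		rq = i915_request_get_rcu(rq);
		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
	} else if (ce) {
		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
	}

	if (capture) {
		if (ce && (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE))
			intel_guc_capture_get_matching_node(engine->gt, ee, ce);

		/* Reference (if any) still held across add_vma, as today. */
		intel_engine_coredump_add_vma(ee, capture, compress);
		if (rq)
			i915_request_put(rq);
	} else {
		if (rq)
			i915_request_put(rq);
		kfree(ee);
		ee = NULL;
	}

	return ee;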

Regards,

Tvrtko
John Harrison Jan. 17, 2023, 7:40 p.m. UTC | #8
On 1/16/2023 04:38, Tvrtko Ursulin wrote:
> On 13/01/2023 21:29, John Harrison wrote:
>> On 1/13/2023 09:46, Hellstrom, Thomas wrote:
>>> On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
>>>> On 12/01/2023 20:40, John Harrison wrote:
>>>>> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>>>>>> On 12/01/2023 02:53, John.C.Harrison@Intel.com wrote:
>
> [snip]
>
>>>>>>> + engine->name);
>>>>>>> +            rq = NULL;
>>>>>>> +        }
>>>>>>>        } else {
>>>>>>>            /*
>>>>>>>             * Getting here with GuC enabled means it is a forced
>>>>>>> error
>>>>>>> capture
>>>>>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
>>>>>>> *engine,
>>>>>>>                               flags);
>>>>>>>            }
>>>>>>>        }
>>>>>>> -    if (rq)
>>>>>>> +    if (rq) {
>>>>>>>            rq = i915_request_get_rcu(rq);
>>>>>>> +        capture = intel_engine_coredump_add_request(ee, rq,
>>>>>>> ATOMIC_MAYFAIL);
>>>>>>> +    } else if (ce) {
>>>>>>> +        capture = engine_coredump_add_context(ee, ce,
>>>>>>> ATOMIC_MAYFAIL);
>>>>>>> +    }
>>>>>>>    -    if (!rq)
>>>>>>> -        goto no_request_capture;
>>>>>>> -
>>>>>>> -    capture = intel_engine_coredump_add_request(ee, rq,
>>>>>>> ATOMIC_MAYFAIL);
>>>>>>>        if (!capture) {
>>>>>>> -        i915_request_put(rq);
>>>>>>> +        if (rq)
>>>>>>> +            i915_request_put(rq);
>>>>>>>            goto no_request_capture;
>>>>>>>        }
>>>>>>>        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>>>>> intel_guc_capture_get_matching_node(engine->gt, ee,
>>>>>>> ce);
>>>>>> This step requires non-NULL ce, so if you move it under the "else
>>>>>> if
>>>>>> (ce)" above then I *think* exit from the function can be
>>>>>> consolidated
>>>>>> to just:
>>>>>>
>>>>>> if (capture) {
>>>>>>      intel_engine_coredump_add_vma(ee, capture, compress);
>>>>>>      if (rq)
>>>>>>          i915_request_put(rq);
>>>>> Is there any reason the rq ref needs to be held during the add_vma
>>>>> call?
>>>>> Can it now just be moved earlier to be:
>>>>>       if (rq) {
>>>>>           rq = i915_request_get_rcu(rq);
>>>>>           capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>>           i915_request_put(rq);
>>>>>       }
>>>>>
>>>>> The internals of the request object are only touched in the above
>>>>> _add_request() code. The later _add_vma() call fiddles around with
>>>>> vmas
>>>>> that pulled from the request but the capture_vma code inside
>>>>> _add_request() has already copied everything, hasn't it? Or rather,
>>>>> it
>>>>> has grabbed its own private vma resource locks. So there is no
>>>>> requirement to keep the request itself around still?
>>> That sounds correct. It was some time ago since I worked with this code
>>> but when i started IIRC KASAN told me the request along with the whole
>>> capture list could disappear under us due to a parallel capture.
>>>
>>> So the request reference added then might cover a bit too much now that
>>> we also hold references on vma resources, which it looks like we do in
>>> intel_engine_coredump_add_vma().
>> So that means we end up with:
>>      rq = intel_context_find_active_request(ce);
>>      ...
>>      [test stuff like i915_request_started(rq)]
>>      ...
>>       if (rq) {
>>          rq = i915_request_get_rcu(rq);
>>          capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>>          i915_request_put(rq);
>>      }
>>
>> What is special about coredump_add_request() that it needs the 
>> request to be extra locked for that call and only that call? If the 
>> request can magically vanish after being found then what protects the 
>> _started() query? For that matter, what stops the request_get_rcu() 
>> itself being called on a pointer that is no longer valid? And if we 
>> do actually have sufficient locking in place to prevent that, why 
>> doesn't that cover the coredump_add_request() usage?
>
> There is definitely a red flag there with the difference between the 
> if and else blocks at the top of capture_engine(). And funnily enough, 
> the first block appears to be GuC only. That is not obvious from the 
> code and should probably have a comment, or function names made 
> self-documenting.
In terms of 'red flag', you mean the apparent difference in locking in 
this section?
	ce = intel_engine_get_hung_context(engine);
	if (ce) {
		intel_engine_clear_hung_context(engine);
		rq = intel_context_find_active_request(ce);
		if (!rq || !i915_request_started(rq))
			goto no_request_capture;
	} else {
		/*
		 * Getting here with GuC enabled means it is a forced error capture
		 * with no actual hang. So, no need to attempt the execlist search.
		 */
		if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
			spin_lock_irqsave(&engine->sched_engine->lock, flags);
			rq = intel_engine_execlist_find_hung_request(engine);
			spin_unlock_irqrestore(&engine->sched_engine->lock,
					       flags);
		}
	}

There is no actual locking difference. The first thing 
intel_context_find_active_request() does is to acquire the relevant 
spinlock for the list it is about to traverse. I assume 
intel_engine_execlist_find_hung_request() must be called from other 
places that already have the appropriate lock and hence the locking must 
be done externally for all callers.
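
To illustrate - this is only a from-memory sketch of the shape of that 
helper, the exact lock and list names may differ, but the point is that 
the lock is taken inside the function:

	struct i915_request *intel_context_find_active_request(struct intel_context *ce)
	{
		struct i915_request *rq, *active = NULL;
		unsigned long flags;

		/* List lock is taken internally - callers need no lock. */
		spin_lock_irqsave(&ce->guc_state.lock, flags);
		list_for_each_entry_reverse(rq, &ce->guc_state.requests, sched.link) {
			if (i915_request_completed(rq))
				break;
			active = rq;
		}
		spin_unlock_irqrestore(&ce->guc_state.lock, flags);

		return active;
	}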

Technically, the first part does not have to be GuC only. It is entirely 
possible that a future improvement to execlists would add support for 
tagging the hanging context up front without requiring a fresh search 
here (based on pre-emption timeouts, heartbeat timeouts, or whatever it 
was that decided to call the error capture code in the first place). So 
there is no reason to add unnecessary enforcement of backend 
implementation details to this higher level by marking that as GuC only. 
Whereas, the search for an individual request after the hang has 
happened is an execlist only implementation detail. Hence the 
enforcement that we don't do that for GuC (with comment to explain).

>
> I guess the special thing about intel_engine_coredump_add_request() is 
> that it dereferences the rq. So it is possibly 573ba126aef3 
> ("drm/i915/guc: Capture error state on context reset") which added a 
> bug where rq can be dereferenced with a reference held. Or perhaps 
> with the GuC backend there is a guarantee request cannot be retired 
> from elsewhere while error capture is examining it.
"added a bug where rq can be dereferenced with a reference held." <-- 
did you mean 'with' or 'without'? Dereferencing with a reference held 
sounds correct to me.

You mean the loop in intel_context_find_active_request()? It gets 
the lock on the list of requests that it is scanning through. Presumably 
requests can't vanish while they are on that list, they must have been 
reference counted when added. So dereferencing within the list has to 
be safe, yes? The issue is that there needs to be an extra get on the 
reference that is returned before dropping the list lock. And that 
reference would have to be released by the caller. Yes? And likewise, 
the request_get that currently exists in capture_engine() needs to be 
inside the execlist spinlock. Which is how it used to be before the GuC 
support was added in the above patch. Or rather, there was no explicit 
request_get at all but all the request processing was done inside the 
execlist spinlock (which seems bad given that it would be allocating 
memory and such within the spinlock?!).
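
I.e. purely as a sketch of that part of the idea (not a tested change), 
the execlist branch in capture_engine() would become something like:

	if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
		spin_lock_irqsave(&engine->sched_engine->lock, flags);
		rq = intel_engine_execlist_find_hung_request(engine);
		if (rq)
			/* Take the reference before the lock is dropped. */
			rq = i915_request_get_rcu(rq);
		spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
	}

with the caller then owning a reference in both paths and doing a single 
put at the end.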

Presumably engine_dump_active_requests() is also broken. It asserts that 
the execlist spinlock is held but does nothing about the GuC's 
equivalent spinlock despite pulling a request off the GuC state's list 
and then doing lots of dereferencing on that. It will also need to have 
intel_context_find_active_request() return an extra reference count on 
the request (before dropping the GuC lock as described above) and then 
manually put the request iff it got it via the hung context route rather 
than the hung request route! Or more simply, also get a local reference 
in the hung request path and then just unconditionally put at the end.

Sound plausible?

>
> To unravel the error entry points into error capture, from execlists, 
> debugfs, ringbuffer, I don't have the time to remind myself how all 
> that works right now. Quite possibly at least some of those run async 
> to the GPU so must be safe against parallel request retirement. So I 
> don't know if the i915_request_get_rcu is safe in all those cases without 
> spending some time to refresh my knowledge a bit.
>
> Sounds like the best plan is not to change this too much - just leave 
> the scope of reference held as is and ideally eliminate the unnecessary 
> goto labels. AFAIR that should be doable without changing anything 
> real and unblock these improvements.
Hmm. If you want it all left unchanged, I don't think you can eliminate 
the goto. But it seems like the fix is to move the existing get inside the 
execlist-only spinlock and add a new get to 
intel_context_find_active_request(). Then everything should be 
guaranteed to be protected at all times.
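
Something like this inside intel_context_find_active_request(), just 
before the lock is dropped - again a sketch only, building on the shape 
sketched earlier in this mail:

	if (active)
		/* Return the request with a reference already held. */
		active = i915_request_get_rcu(active);
	spin_unlock_irqrestore(&ce->guc_state.lock, flags);

	return active;

Callers would then simply drop that reference once they are done with 
the request.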

John.

>
> Regards,
>
> Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9d5d5a397b64e..bd2cf7d235df0 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1370,14 +1370,14 @@  static void engine_record_execlists(struct intel_engine_coredump *ee)
 }
 
 static bool record_context(struct i915_gem_context_coredump *e,
-			   const struct i915_request *rq)
+			   struct intel_context *ce)
 {
 	struct i915_gem_context *ctx;
 	struct task_struct *task;
 	bool simulated;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(rq->context->gem_context);
+	ctx = rcu_dereference(ce->gem_context);
 	if (ctx && !kref_get_unless_zero(&ctx->ref))
 		ctx = NULL;
 	rcu_read_unlock();
@@ -1396,8 +1396,8 @@  static bool record_context(struct i915_gem_context_coredump *e,
 	e->guilty = atomic_read(&ctx->guilty_count);
 	e->active = atomic_read(&ctx->active_count);
 
-	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
-	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
+	e->total_runtime = intel_context_get_total_runtime_ns(ce);
+	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
 
 	simulated = i915_gem_context_no_error_capture(ctx);
 
@@ -1532,15 +1532,37 @@  intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
 	return ee;
 }
 
+static struct intel_engine_capture_vma *
+engine_coredump_add_context(struct intel_engine_coredump *ee,
+			    struct intel_context *ce,
+			    gfp_t gfp)
+{
+	struct intel_engine_capture_vma *vma = NULL;
+
+	ee->simulated |= record_context(&ee->context, ce);
+	if (ee->simulated)
+		return NULL;
+
+	/*
+	 * We need to copy these to an anonymous buffer
+	 * as the simplest method to avoid being overwritten
+	 * by userspace.
+	 */
+	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
+	vma = capture_vma(vma, ce->state, "HW context", gfp);
+
+	return vma;
+}
+
 struct intel_engine_capture_vma *
 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 				  struct i915_request *rq,
 				  gfp_t gfp)
 {
-	struct intel_engine_capture_vma *vma = NULL;
+	struct intel_engine_capture_vma *vma;
 
-	ee->simulated |= record_context(&ee->context, rq);
-	if (ee->simulated)
+	vma = engine_coredump_add_context(ee, rq->context, gfp);
+	if (!vma)
 		return NULL;
 
 	/*
@@ -1550,8 +1572,6 @@  intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 	 */
 	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
 	vma = capture_user(vma, rq, gfp);
-	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
-	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
 
 	ee->rq_head = rq->head;
 	ee->rq_post = rq->postfix;
@@ -1608,8 +1628,11 @@  capture_engine(struct intel_engine_cs *engine,
 	if (ce) {
 		intel_engine_clear_hung_context(engine);
 		rq = intel_context_find_active_request(ce);
-		if (!rq || !i915_request_started(rq))
-			goto no_request_capture;
+		if (rq && !i915_request_started(rq)) {
+			drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
+				 engine->name);
+			rq = NULL;
+		}
 	} else {
 		/*
 		 * Getting here with GuC enabled means it is a forced error capture
@@ -1622,22 +1645,24 @@  capture_engine(struct intel_engine_cs *engine,
 					       flags);
 		}
 	}
-	if (rq)
+	if (rq) {
 		rq = i915_request_get_rcu(rq);
+		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
+	} else if (ce) {
+		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
+	}
 
-	if (!rq)
-		goto no_request_capture;
-
-	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
 	if (!capture) {
-		i915_request_put(rq);
+		if (rq)
+			i915_request_put(rq);
 		goto no_request_capture;
 	}
 	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
 		intel_guc_capture_get_matching_node(engine->gt, ee, ce);
 
 	intel_engine_coredump_add_vma(ee, capture, compress);
-	i915_request_put(rq);
+	if (rq)
+		i915_request_put(rq);
 
 	return ee;