diff mbox series

[v3,3/6] drm/i915: Allow error capture without a request

Message ID 20230119065000.1661857-4-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Allow error capture without a request & fix locking issues | expand

Commit Message

John Harrison Jan. 19, 2023, 6:49 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

There was a report of error captures occurring without any hung
context being indicated despite the capture being initiated by a 'hung
context notification' from GuC. The problem was not reproducible.
However, it is possible to happen if the context in question has no
active requests. For example, if the hang was in the context switch
itself then the breadcrumb write would have occurred and the KMD would
see an idle context.

In the interests of attempting to provide as much information as
possible about a hang, it seems wise to include the engine info
regardless of whether a request was found or not. As opposed to just
prentending there was no hang at all.

So update the error capture code to always record engine information
if a context is given. Which means updating record_context() to take a
context instead of a request (which it only ever used to find the
context anyway). And split the request agnostic parts of
intel_engine_coredump_add_request() out into a seaprate function.

v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
pointer.
v3: Tidy up request locking code flow (Tvrtko)

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 70 ++++++++++++++++++---------
 1 file changed, 48 insertions(+), 22 deletions(-)

Comments

Daniele Ceraolo Spurio Jan. 20, 2023, 1:54 a.m. UTC | #1
On 1/18/2023 10:49 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> There was a report of error captures occurring without any hung
> context being indicated despite the capture being initiated by a 'hung
> context notification' from GuC. The problem was not reproducible.
> However, it is possible to happen if the context in question has no
> active requests. For example, if the hang was in the context switch
> itself then the breadcrumb write would have occurred and the KMD would
> see an idle context.
>
> In the interests of attempting to provide as much information as
> possible about a hang, it seems wise to include the engine info
> regardless of whether a request was found or not. As opposed to just
> prentending there was no hang at all.
>
> So update the error capture code to always record engine information
> if a context is given. Which means updating record_context() to take a
> context instead of a request (which it only ever used to find the
> context anyway). And split the request agnostic parts of
> intel_engine_coredump_add_request() out into a seaprate function.
>
> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
> pointer.
> v3: Tidy up request locking code flow (Tvrtko)
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 70 ++++++++++++++++++---------
>   1 file changed, 48 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 78cf95e4dd230..743614fff5472 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
>   }
>   
>   static bool record_context(struct i915_gem_context_coredump *e,
> -			   const struct i915_request *rq)
> +			   struct intel_context *ce)
>   {
>   	struct i915_gem_context *ctx;
>   	struct task_struct *task;
>   	bool simulated;
>   
>   	rcu_read_lock();
> -	ctx = rcu_dereference(rq->context->gem_context);
> +	ctx = rcu_dereference(ce->gem_context);
>   	if (ctx && !kref_get_unless_zero(&ctx->ref))
>   		ctx = NULL;
>   	rcu_read_unlock();
> @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
>   	e->guilty = atomic_read(&ctx->guilty_count);
>   	e->active = atomic_read(&ctx->active_count);
>   
> -	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
> -	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
> +	e->total_runtime = intel_context_get_total_runtime_ns(ce);
> +	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>   
>   	simulated = i915_gem_context_no_error_capture(ctx);
>   
> @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
>   	return ee;
>   }
>   
> +static struct intel_engine_capture_vma *
> +engine_coredump_add_context(struct intel_engine_coredump *ee,
> +			    struct intel_context *ce,
> +			    gfp_t gfp)
> +{
> +	struct intel_engine_capture_vma *vma = NULL;
> +
> +	ee->simulated |= record_context(&ee->context, ce);
> +	if (ee->simulated)
> +		return NULL;
> +
> +	/*
> +	 * We need to copy these to an anonymous buffer
> +	 * as the simplest method to avoid being overwritten
> +	 * by userspace.
> +	 */
> +	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
> +	vma = capture_vma(vma, ce->state, "HW context", gfp);
> +
> +	return vma;
> +}
> +
>   struct intel_engine_capture_vma *
>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   				  struct i915_request *rq,
>   				  gfp_t gfp)
>   {
> -	struct intel_engine_capture_vma *vma = NULL;
> +	struct intel_engine_capture_vma *vma;
>   
> -	ee->simulated |= record_context(&ee->context, rq);
> -	if (ee->simulated)
> +	vma = engine_coredump_add_context(ee, rq->context, gfp);
> +	if (!vma)
>   		return NULL;
>   
>   	/*
> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>   	 */
>   	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>   	vma = capture_user(vma, rq, gfp);
> -	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
> -	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>   
>   	ee->rq_head = rq->head;
>   	ee->rq_post = rq->postfix;
> @@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
>   		return NULL;
>   
>   	intel_get_hung_entity(engine, &ce, &rq);
> -	if (!rq || !i915_request_started(rq))
> -		goto no_request_capture;
> +	if (rq && !i915_request_started(rq)) {
> +		drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
> +			 engine->name);

Shouldn't this print be inside the "else if" case below? otherwise if we 
don't have a rq at all we won't see it.

> +		i915_request_put(rq);
> +		rq = NULL;
> +	}
> +
> +	if (rq) {
> +		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> +		i915_request_put(rq);
> +	} else if (ce) {
> +		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
> +	}
>   
> -	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> -	if (!capture)
> -		goto no_request_capture;
>   	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>   		intel_guc_capture_get_matching_node(engine->gt, ee, ce);

Are you keeping this outside the "if (capture)" below to make sure we 
consume the GuC engine capture even if we fail to produce the error 
state? if so, a comment might be useful.

Daniele

>   
> -	intel_engine_coredump_add_vma(ee, capture, compress);
> -	i915_request_put(rq);
> +	if (capture) {
> +		intel_engine_coredump_add_vma(ee, capture, compress);
> +	} else {
> +		kfree(ee);
> +		ee = NULL;
> +	}
>   
>   	return ee;
> -
> -no_request_capture:
> -	if (rq)
> -		i915_request_put(rq);
> -	kfree(ee);
> -	return NULL;
>   }
>   
>   static void
John Harrison Jan. 20, 2023, 5:36 p.m. UTC | #2
On 1/19/2023 17:54, Ceraolo Spurio, Daniele wrote:
> On 1/18/2023 10:49 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> There was a report of error captures occurring without any hung
>> context being indicated despite the capture being initiated by a 'hung
>> context notification' from GuC. The problem was not reproducible.
>> However, it is possible to happen if the context in question has no
>> active requests. For example, if the hang was in the context switch
>> itself then the breadcrumb write would have occurred and the KMD would
>> see an idle context.
>>
>> In the interests of attempting to provide as much information as
>> possible about a hang, it seems wise to include the engine info
>> regardless of whether a request was found or not. As opposed to just
>> prentending there was no hang at all.
>>
>> So update the error capture code to always record engine information
>> if a context is given. Which means updating record_context() to take a
>> context instead of a request (which it only ever used to find the
>> context anyway). And split the request agnostic parts of
>> intel_engine_coredump_add_request() out into a seaprate function.
>>
>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
>> pointer.
>> v3: Tidy up request locking code flow (Tvrtko)
>>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 70 ++++++++++++++++++---------
>>   1 file changed, 48 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index 78cf95e4dd230..743614fff5472 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct 
>> intel_engine_coredump *ee)
>>   }
>>     static bool record_context(struct i915_gem_context_coredump *e,
>> -               const struct i915_request *rq)
>> +               struct intel_context *ce)
>>   {
>>       struct i915_gem_context *ctx;
>>       struct task_struct *task;
>>       bool simulated;
>>         rcu_read_lock();
>> -    ctx = rcu_dereference(rq->context->gem_context);
>> +    ctx = rcu_dereference(ce->gem_context);
>>       if (ctx && !kref_get_unless_zero(&ctx->ref))
>>           ctx = NULL;
>>       rcu_read_unlock();
>> @@ -1396,8 +1396,8 @@ static bool record_context(struct 
>> i915_gem_context_coredump *e,
>>       e->guilty = atomic_read(&ctx->guilty_count);
>>       e->active = atomic_read(&ctx->active_count);
>>   -    e->total_runtime = 
>> intel_context_get_total_runtime_ns(rq->context);
>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>         simulated = i915_gem_context_no_error_capture(ctx);
>>   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>       return ee;
>>   }
>>   +static struct intel_engine_capture_vma *
>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>> +                struct intel_context *ce,
>> +                gfp_t gfp)
>> +{
>> +    struct intel_engine_capture_vma *vma = NULL;
>> +
>> +    ee->simulated |= record_context(&ee->context, ce);
>> +    if (ee->simulated)
>> +        return NULL;
>> +
>> +    /*
>> +     * We need to copy these to an anonymous buffer
>> +     * as the simplest method to avoid being overwritten
>> +     * by userspace.
>> +     */
>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>> +
>> +    return vma;
>> +}
>> +
>>   struct intel_engine_capture_vma *
>>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>>                     struct i915_request *rq,
>>                     gfp_t gfp)
>>   {
>> -    struct intel_engine_capture_vma *vma = NULL;
>> +    struct intel_engine_capture_vma *vma;
>>   -    ee->simulated |= record_context(&ee->context, rq);
>> -    if (ee->simulated)
>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>> +    if (!vma)
>>           return NULL;
>>         /*
>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
>> intel_engine_coredump *ee,
>>        */
>>       vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>>       vma = capture_user(vma, rq, gfp);
>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>> -    vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>>         ee->rq_head = rq->head;
>>       ee->rq_post = rq->postfix;
>> @@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
>>           return NULL;
>>         intel_get_hung_entity(engine, &ce, &rq);
>> -    if (!rq || !i915_request_started(rq))
>> -        goto no_request_capture;
>> +    if (rq && !i915_request_started(rq)) {
>> +        drm_info(&engine->gt->i915->drm, "Got hung context on %s 
>> with no active request!\n",
>> +             engine->name);
>
> Shouldn't this print be inside the "else if" case below? otherwise if 
> we don't have a rq at all we won't see it.
The intention was that the message is specifically for the case where a 
request exists but is being ignored. It is obvious when looking at the 
capture if no request was found - there just isn't one in there. But 
there is no way to distinguish that case from the situation where a 
request was found but just was not considered 'active'. Hence wanting to 
add a dmesg log instead.

Could pull the better message in from the next patch to this one to make 
it more clear - "Got hung context on %s with active request %lld:%lld 
[0x%04X] not yet started".

>
>> +        i915_request_put(rq);
>> +        rq = NULL;
>> +    }
>> +
>> +    if (rq) {
>> +        capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>> +        i915_request_put(rq);
>> +    } else if (ce) {
>> +        capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>> +    }
>>   -    capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>> -    if (!capture)
>> -        goto no_request_capture;
>>       if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>           intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>
> Are you keeping this outside the "if (capture)" below to make sure we 
> consume the GuC engine capture even if we fail to produce the error 
> state? if so, a comment might be useful.
Yeah, hadn't really noticed that the original version was effectively 
leaking the GuC capture in the case of a failure. The buffer would 
overflow and discard the oldest entries eventually. But it seems better 
to deliberately free the buffer if the related capture is done whether 
the capture worked or not.

Can add a comment to the patch description about it.

John.

>
> Daniele
>
>>   -    intel_engine_coredump_add_vma(ee, capture, compress);
>> -    i915_request_put(rq);
>> +    if (capture) {
>> +        intel_engine_coredump_add_vma(ee, capture, compress);
>> +    } else {
>> +        kfree(ee);
>> +        ee = NULL;
>> +    }
>>         return ee;
>> -
>> -no_request_capture:
>> -    if (rq)
>> -        i915_request_put(rq);
>> -    kfree(ee);
>> -    return NULL;
>>   }
>>     static void
>
John Harrison Jan. 20, 2023, 5:55 p.m. UTC | #3
On 1/20/2023 09:36, John Harrison wrote:
> On 1/19/2023 17:54, Ceraolo Spurio, Daniele wrote:
>> On 1/18/2023 10:49 PM, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> There was a report of error captures occurring without any hung
>>> context being indicated despite the capture being initiated by a 'hung
>>> context notification' from GuC. The problem was not reproducible.
>>> However, it is possible to happen if the context in question has no
>>> active requests. For example, if the hang was in the context switch
>>> itself then the breadcrumb write would have occurred and the KMD would
>>> see an idle context.
>>>
>>> In the interests of attempting to provide as much information as
>>> possible about a hang, it seems wise to include the engine info
>>> regardless of whether a request was found or not. As opposed to just
>>> prentending there was no hang at all.
>>>
>>> So update the error capture code to always record engine information
>>> if a context is given. Which means updating record_context() to take a
>>> context instead of a request (which it only ever used to find the
>>> context anyway). And split the request agnostic parts of
>>> intel_engine_coredump_add_request() out into a seaprate function.
>>>
>>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
>>> pointer.
>>> v3: Tidy up request locking code flow (Tvrtko)
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gpu_error.c | 70 
>>> ++++++++++++++++++---------
>>>   1 file changed, 48 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> index 78cf95e4dd230..743614fff5472 100644
>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct 
>>> intel_engine_coredump *ee)
>>>   }
>>>     static bool record_context(struct i915_gem_context_coredump *e,
>>> -               const struct i915_request *rq)
>>> +               struct intel_context *ce)
>>>   {
>>>       struct i915_gem_context *ctx;
>>>       struct task_struct *task;
>>>       bool simulated;
>>>         rcu_read_lock();
>>> -    ctx = rcu_dereference(rq->context->gem_context);
>>> +    ctx = rcu_dereference(ce->gem_context);
>>>       if (ctx && !kref_get_unless_zero(&ctx->ref))
>>>           ctx = NULL;
>>>       rcu_read_unlock();
>>> @@ -1396,8 +1396,8 @@ static bool record_context(struct 
>>> i915_gem_context_coredump *e,
>>>       e->guilty = atomic_read(&ctx->guilty_count);
>>>       e->active = atomic_read(&ctx->active_count);
>>>   -    e->total_runtime = 
>>> intel_context_get_total_runtime_ns(rq->context);
>>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>>         simulated = i915_gem_context_no_error_capture(ctx);
>>>   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
>>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>>       return ee;
>>>   }
>>>   +static struct intel_engine_capture_vma *
>>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>>> +                struct intel_context *ce,
>>> +                gfp_t gfp)
>>> +{
>>> +    struct intel_engine_capture_vma *vma = NULL;
>>> +
>>> +    ee->simulated |= record_context(&ee->context, ce);
>>> +    if (ee->simulated)
>>> +        return NULL;
>>> +
>>> +    /*
>>> +     * We need to copy these to an anonymous buffer
>>> +     * as the simplest method to avoid being overwritten
>>> +     * by userspace.
>>> +     */
>>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>>> +
>>> +    return vma;
>>> +}
>>> +
>>>   struct intel_engine_capture_vma *
>>>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>>>                     struct i915_request *rq,
>>>                     gfp_t gfp)
>>>   {
>>> -    struct intel_engine_capture_vma *vma = NULL;
>>> +    struct intel_engine_capture_vma *vma;
>>>   -    ee->simulated |= record_context(&ee->context, rq);
>>> -    if (ee->simulated)
>>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>>> +    if (!vma)
>>>           return NULL;
>>>         /*
>>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
>>> intel_engine_coredump *ee,
>>>        */
>>>       vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>>>       vma = capture_user(vma, rq, gfp);
>>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>>> -    vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>>>         ee->rq_head = rq->head;
>>>       ee->rq_post = rq->postfix;
>>> @@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
>>>           return NULL;
>>>         intel_get_hung_entity(engine, &ce, &rq);
>>> -    if (!rq || !i915_request_started(rq))
>>> -        goto no_request_capture;
>>> +    if (rq && !i915_request_started(rq)) {
>>> +        drm_info(&engine->gt->i915->drm, "Got hung context on %s 
>>> with no active request!\n",
>>> +             engine->name);
>>
>> Shouldn't this print be inside the "else if" case below? otherwise if 
>> we don't have a rq at all we won't see it.
> The intention was that the message is specifically for the case where 
> a request exists but is being ignored. It is obvious when looking at 
> the capture if no request was found - there just isn't one in there. 
> But there is no way to distinguish that case from the situation where 
> a request was found but just was not considered 'active'. Hence 
> wanting to add a dmesg log instead.
>
> Could pull the better message in from the next patch to this one to 
> make it more clear - "Got hung context on %s with active request 
> %lld:%lld [0x%04X] not yet started".
>
>>
>>> +        i915_request_put(rq);
>>> +        rq = NULL;
>>> +    }
>>> +
>>> +    if (rq) {
>>> +        capture = intel_engine_coredump_add_request(ee, rq, 
>>> ATOMIC_MAYFAIL);
>>> +        i915_request_put(rq);
>>> +    } else if (ce) {
>>> +        capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>>> +    }
>>>   -    capture = intel_engine_coredump_add_request(ee, rq, 
>>> ATOMIC_MAYFAIL);
>>> -    if (!capture)
>>> -        goto no_request_capture;
>>>       if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>           intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>>
>> Are you keeping this outside the "if (capture)" below to make sure we 
>> consume the GuC engine capture even if we fail to produce the error 
>> state? if so, a comment might be useful.
> Yeah, hadn't really noticed that the original version was effectively 
> leaking the GuC capture in the case of a failure. The buffer would 
> overflow and discard the oldest entries eventually. But it seems 
> better to deliberately free the buffer if the related capture is done 
> whether the capture worked or not.
>
> Can add a comment to the patch description about it.
Hmm. Actually, this will now be genuine leaking the capture state if the 
vma stuff fails. I think. It looks like the state object ownership gets 
transferred to the ee but then the ee gets freed below without freeing 
up any owned resources. So need to either not pull out the register 
state if !capture or explicitly free it on failure.

John.


>
> John.
>
>>
>> Daniele
>>
>>>   -    intel_engine_coredump_add_vma(ee, capture, compress);
>>> -    i915_request_put(rq);
>>> +    if (capture) {
>>> +        intel_engine_coredump_add_vma(ee, capture, compress);
>>> +    } else {
>>> +        kfree(ee);
>>> +        ee = NULL;
>>> +    }
>>>         return ee;
>>> -
>>> -no_request_capture:
>>> -    if (rq)
>>> -        i915_request_put(rq);
>>> -    kfree(ee);
>>> -    return NULL;
>>>   }
>>>     static void
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 78cf95e4dd230..743614fff5472 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1370,14 +1370,14 @@  static void engine_record_execlists(struct intel_engine_coredump *ee)
 }
 
 static bool record_context(struct i915_gem_context_coredump *e,
-			   const struct i915_request *rq)
+			   struct intel_context *ce)
 {
 	struct i915_gem_context *ctx;
 	struct task_struct *task;
 	bool simulated;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(rq->context->gem_context);
+	ctx = rcu_dereference(ce->gem_context);
 	if (ctx && !kref_get_unless_zero(&ctx->ref))
 		ctx = NULL;
 	rcu_read_unlock();
@@ -1396,8 +1396,8 @@  static bool record_context(struct i915_gem_context_coredump *e,
 	e->guilty = atomic_read(&ctx->guilty_count);
 	e->active = atomic_read(&ctx->active_count);
 
-	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
-	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
+	e->total_runtime = intel_context_get_total_runtime_ns(ce);
+	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
 
 	simulated = i915_gem_context_no_error_capture(ctx);
 
@@ -1532,15 +1532,37 @@  intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
 	return ee;
 }
 
+static struct intel_engine_capture_vma *
+engine_coredump_add_context(struct intel_engine_coredump *ee,
+			    struct intel_context *ce,
+			    gfp_t gfp)
+{
+	struct intel_engine_capture_vma *vma = NULL;
+
+	ee->simulated |= record_context(&ee->context, ce);
+	if (ee->simulated)
+		return NULL;
+
+	/*
+	 * We need to copy these to an anonymous buffer
+	 * as the simplest method to avoid being overwritten
+	 * by userspace.
+	 */
+	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
+	vma = capture_vma(vma, ce->state, "HW context", gfp);
+
+	return vma;
+}
+
 struct intel_engine_capture_vma *
 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 				  struct i915_request *rq,
 				  gfp_t gfp)
 {
-	struct intel_engine_capture_vma *vma = NULL;
+	struct intel_engine_capture_vma *vma;
 
-	ee->simulated |= record_context(&ee->context, rq);
-	if (ee->simulated)
+	vma = engine_coredump_add_context(ee, rq->context, gfp);
+	if (!vma)
 		return NULL;
 
 	/*
@@ -1550,8 +1572,6 @@  intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
 	 */
 	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
 	vma = capture_user(vma, rq, gfp);
-	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
-	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
 
 	ee->rq_head = rq->head;
 	ee->rq_post = rq->postfix;
@@ -1604,25 +1624,31 @@  capture_engine(struct intel_engine_cs *engine,
 		return NULL;
 
 	intel_get_hung_entity(engine, &ce, &rq);
-	if (!rq || !i915_request_started(rq))
-		goto no_request_capture;
+	if (rq && !i915_request_started(rq)) {
+		drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
+			 engine->name);
+		i915_request_put(rq);
+		rq = NULL;
+	}
+
+	if (rq) {
+		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
+		i915_request_put(rq);
+	} else if (ce) {
+		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
+	}
 
-	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
-	if (!capture)
-		goto no_request_capture;
 	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
 		intel_guc_capture_get_matching_node(engine->gt, ee, ce);
 
-	intel_engine_coredump_add_vma(ee, capture, compress);
-	i915_request_put(rq);
+	if (capture) {
+		intel_engine_coredump_add_vma(ee, capture, compress);
+	} else {
+		kfree(ee);
+		ee = NULL;
+	}
 
 	return ee;
-
-no_request_capture:
-	if (rq)
-		i915_request_put(rq);
-	kfree(ee);
-	return NULL;
 }
 
 static void