diff mbox series

[13/27] drm/i915/guc: Take context ref when cancelling request

Message ID 20210819061639.21051-14-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Clean up GuC CI failures, simplify locking, and kernel DOC | expand

Commit Message

Matthew Brost Aug. 19, 2021, 6:16 a.m. UTC
A context can get destroyed after cancelling a request so take a
reference to context when cancelling a request.

Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Comments

Daniele Ceraolo Spurio Aug. 21, 2021, 12:07 a.m. UTC | #1
On 8/18/2021 11:16 PM, Matthew Brost wrote:
> A context can get destroyed after cancelling a request so take a
> reference to context when cancelling a request.

What's the exact race? AFAICS __i915_request_skip does not have a 
context_put().

Daniele

>
> Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation")
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++-
>   1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index e0e85e4ad512..85f96d325048 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1620,8 +1620,10 @@ static void guc_context_cancel_request(struct intel_context *ce,
>   				       struct i915_request *rq)
>   {
>   	if (i915_sw_fence_signaled(&rq->submit)) {
> -		struct i915_sw_fence *fence = guc_context_block(ce);
> +		struct i915_sw_fence *fence;
>   
> +		intel_context_get(ce);
> +		fence = guc_context_block(ce);
>   		i915_sw_fence_wait(fence);
>   		if (!i915_request_completed(rq)) {
>   			__i915_request_skip(rq);
> @@ -1636,6 +1638,7 @@ static void guc_context_cancel_request(struct intel_context *ce,
>   		flush_work(&ce_to_guc(ce)->ct.requests.worker);
>   
>   		guc_context_unblock(ce);
> +		intel_context_put(ce);
>   	}
>   }
>
Matthew Brost Aug. 24, 2021, 3:42 p.m. UTC | #2
On Fri, Aug 20, 2021 at 05:07:27PM -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> On 8/18/2021 11:16 PM, Matthew Brost wrote:
> > A context can get destroyed after cancelling a request so take a
> > reference to context when cancelling a request.
> 
> What's the exact race? AFAICS __i915_request_skip does not have a
> context_put().

This commit message isn't quite right, it is really a context reset or a
GT reset which could result in the context getting destroyed. I haven't
actually seen this happen but this just being paranoid about ref
counting. Can fix up the commit message.

Matt

> 
> Daniele
> 
> > 
> > Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation")
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++-
> >   1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index e0e85e4ad512..85f96d325048 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -1620,8 +1620,10 @@ static void guc_context_cancel_request(struct intel_context *ce,
> >   				       struct i915_request *rq)
> >   {
> >   	if (i915_sw_fence_signaled(&rq->submit)) {
> > -		struct i915_sw_fence *fence = guc_context_block(ce);
> > +		struct i915_sw_fence *fence;
> > +		intel_context_get(ce);
> > +		fence = guc_context_block(ce);
> >   		i915_sw_fence_wait(fence);
> >   		if (!i915_request_completed(rq)) {
> >   			__i915_request_skip(rq);
> > @@ -1636,6 +1638,7 @@ static void guc_context_cancel_request(struct intel_context *ce,
> >   		flush_work(&ce_to_guc(ce)->ct.requests.worker);
> >   		guc_context_unblock(ce);
> > +		intel_context_put(ce);
> >   	}
> >   }
>
Daniele Ceraolo Spurio Aug. 25, 2021, 1:21 a.m. UTC | #3
On 8/24/2021 8:42 AM, Matthew Brost wrote:
> On Fri, Aug 20, 2021 at 05:07:27PM -0700, Daniele Ceraolo Spurio wrote:
>>
>> On 8/18/2021 11:16 PM, Matthew Brost wrote:
>>> A context can get destroyed after cancelling a request so take a
>>> reference to context when cancelling a request.
>> What's the exact race? AFAICS __i915_request_skip does not have a
>> context_put().
> This commit message isn't quite right, it is really a context reset or a
> GT reset which could result in the context getting destroyed. I haven't
> actually seen this happen but this just being paranoid about ref
> counting. Can fix up the commit message.

ok, with an updated commit message:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

>
> Matt
>
>> Daniele
>>
>>> Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation")
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++-
>>>    1 file changed, 4 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index e0e85e4ad512..85f96d325048 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -1620,8 +1620,10 @@ static void guc_context_cancel_request(struct intel_context *ce,
>>>    				       struct i915_request *rq)
>>>    {
>>>    	if (i915_sw_fence_signaled(&rq->submit)) {
>>> -		struct i915_sw_fence *fence = guc_context_block(ce);
>>> +		struct i915_sw_fence *fence;
>>> +		intel_context_get(ce);
>>> +		fence = guc_context_block(ce);
>>>    		i915_sw_fence_wait(fence);
>>>    		if (!i915_request_completed(rq)) {
>>>    			__i915_request_skip(rq);
>>> @@ -1636,6 +1638,7 @@ static void guc_context_cancel_request(struct intel_context *ce,
>>>    		flush_work(&ce_to_guc(ce)->ct.requests.worker);
>>>    		guc_context_unblock(ce);
>>> +		intel_context_put(ce);
>>>    	}
>>>    }
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index e0e85e4ad512..85f96d325048 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1620,8 +1620,10 @@  static void guc_context_cancel_request(struct intel_context *ce,
 				       struct i915_request *rq)
 {
 	if (i915_sw_fence_signaled(&rq->submit)) {
-		struct i915_sw_fence *fence = guc_context_block(ce);
+		struct i915_sw_fence *fence;
 
+		intel_context_get(ce);
+		fence = guc_context_block(ce);
 		i915_sw_fence_wait(fence);
 		if (!i915_request_completed(rq)) {
 			__i915_request_skip(rq);
@@ -1636,6 +1638,7 @@  static void guc_context_cancel_request(struct intel_context *ce,
 		flush_work(&ce_to_guc(ce)->ct.requests.worker);
 
 		guc_context_unblock(ce);
+		intel_context_put(ce);
 	}
 }