diff mbox

[v12] drm/i915: Extend LRC pinning to cover GPU context writeback

Message ID 1453472727-34622-1-git-send-email-nicholas.hoath@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nick Hoath Jan. 22, 2016, 2:25 p.m. UTC
Use the first retired request on a new context to unpin
the old context. This ensures that the hw context remains
bound until it has been written back to by the GPU.
Now that the context is pinned until later in the request/context
lifecycle, it no longer needs to be pinned from context_queue to
retire_requests.
This fixes an issue with GuC submission where the GPU might not
have finished writing back the context before it is unpinned. This
results in a GPU hang.

v2: Moved the new pin to cover GuC submission (Alex Dai)
    Moved the new unpin to request_retire to fix coverage leak
v3: Added switch to default context if freeing a still pinned
    context just in case the hw was actually still using it
v4: Unwrapped context unpin to allow calling without a request
v5: Only create a switch to idle context if the ring doesn't
    already have a request pending on it (Alex Dai)
    Rename unsaved to dirty to avoid double negatives (Dave Gordon)
    Changed _no_req postfix to __ prefix for consistency (Dave Gordon)
    Split out per engine cleanup from context_free as it
    was getting unwieldy
    Corrected locking (Dave Gordon)
v6: Removed some bikeshedding (Mika Kuoppala)
    Added explanation of the GuC hang that this fixes (Daniel Vetter)
v7: Removed extra per request pinning from ring reset code (Alex Dai)
    Added forced ring unpin/clean in error case in context free (Alex Dai)
v8: Renamed lrc specific last_context to lrc_last_context as there
    were some reset cases where the codepaths leaked (Mika Kuoppala)
    NULL'd last_context in reset case - there was a pointer leak
    if someone did reset->close context.
v9: Rebase over "Fix context/engine cleanup order"
v10: Rebase over nightly, remove WARN_ON which caused the
    dependency on dev.
v11: Kick BAT rerun
v12: Rebase

Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
Issue: VIZ-4277
---
 drivers/gpu/drm/i915/intel_lrc.c | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

Comments

Daniel Vetter Jan. 25, 2016, 6:19 p.m. UTC | #1
On Fri, Jan 22, 2016 at 02:25:27PM +0000, Nick Hoath wrote:
> Use the first retired request on a new context to unpin
> the old context. This ensures that the hw context remains
> bound until it has been written back to by the GPU.
> Now that the context is pinned until later in the request/context
> lifecycle, it no longer needs to be pinned from context_queue to
> retire_requests.
> This fixes an issue with GuC submission where the GPU might not
> have finished writing back the context before it is unpinned. This
> results in a GPU hang.
> 
> v2: Moved the new pin to cover GuC submission (Alex Dai)
>     Moved the new unpin to request_retire to fix coverage leak
> v3: Added switch to default context if freeing a still pinned
>     context just in case the hw was actually still using it
> v4: Unwrapped context unpin to allow calling without a request
> v5: Only create a switch to idle context if the ring doesn't
>     already have a request pending on it (Alex Dai)
>     Rename unsaved to dirty to avoid double negatives (Dave Gordon)
>     Changed _no_req postfix to __ prefix for consistency (Dave Gordon)
>     Split out per engine cleanup from context_free as it
>     was getting unwieldy
>     Corrected locking (Dave Gordon)
> v6: Removed some bikeshedding (Mika Kuoppala)
>     Added explanation of the GuC hang that this fixes (Daniel Vetter)
> v7: Removed extra per request pinning from ring reset code (Alex Dai)
>     Added forced ring unpin/clean in error case in context free (Alex Dai)
> v8: Renamed lrc specific last_context to lrc_last_context as there
>     were some reset cases where the codepaths leaked (Mika Kuoppala)
>     NULL'd last_context in reset case - there was a pointer leak
>     if someone did reset->close context.
> v9: Rebase over "Fix context/engine cleanup order"
> v10: Rebase over nightly, remove WARN_ON which caused the
>     dependency on dev.
> v11: Kick BAT rerun
> v12: Rebase
> 
> Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> Issue: VIZ-4277

When resending patches, please include everyone who ever commented on this
in Cc: lines here. It's for the record and helps in assigning blame when
things inevitably blow up again ;-)
-Daniel

> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 37 +++++++++++++++----------------------
>  1 file changed, 15 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dbf3729..b469817 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -779,10 +779,10 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
>  	if (intel_ring_stopped(request->ring))
>  		return 0;
>  
> -	if (request->ctx != ring->default_context) {
> -		if (!request->ctx->engine[ring->id].dirty) {
> +	if (request->ctx != request->ctx->i915->kernel_context) {
> +		if (!request->ctx->engine[request->ring->id].dirty) {
>  			intel_lr_context_pin(request);
> -			request->ctx->engine[ring->id].dirty = true;
> +			request->ctx->engine[request->ring->id].dirty = true;
>  		}
>  	}
>  
> @@ -2447,9 +2447,7 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>  			    struct drm_i915_gem_object *ctx_obj,
>  			    struct intel_ringbuffer *ringbuf)
>  {
> -	int ret;
> -
> -	if (ctx == ring->default_context) {
> +	if (ctx == ctx->i915->kernel_context) {
>  		intel_unpin_ringbuffer_obj(ringbuf);
>  		i915_gem_object_ggtt_unpin(ctx_obj);
>  	}
> @@ -2463,13 +2461,10 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>  		 * otherwise create a switch to idle request
>  		 */
>  		if (list_empty(&ring->request_list)) {
> -			int ret;
> -
> -			ret = i915_gem_request_alloc(
> +			req = i915_gem_request_alloc(
>  					ring,
> -					ring->default_context,
> -					&req);
> -			if (!ret)
> +					NULL);
> +			if (!IS_ERR(req))
>  				i915_add_request(req);
>  			else
>  				DRM_DEBUG("Failed to ensure context saved");
> @@ -2479,6 +2474,8 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>  					typeof(*req), list);
>  		}
>  		if (req) {
> +			int ret;
> +
>  			ret = i915_wait_request(req);
>  			if (ret != 0) {
>  				/**
> @@ -2515,17 +2512,13 @@ void intel_lr_context_free(struct intel_context *ctx)
>  		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
>  		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
>  
> -		if (!ctx_obj)
> -			continue;
> -
> -		if (ctx == ctx->i915->kernel_context) {
> -			intel_unpin_ringbuffer_obj(ringbuf);
> -			i915_gem_object_ggtt_unpin(ctx_obj);
> -		}
> +		if (ctx_obj)
> +			intel_lr_context_clean_ring(
> +						ctx,
> +						ringbuf->ring,
> +						ctx_obj,
> +						ringbuf);
>  
> -		WARN_ON(ctx->engine[i].pin_count);
> -		intel_ringbuffer_free(ringbuf);
> -		drm_gem_object_unreference(&ctx_obj->base);
>  	}
>  }
>  
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Nick Hoath Jan. 26, 2016, 9:43 a.m. UTC | #2
On 25/01/2016 18:19, Daniel Vetter wrote:
> On Fri, Jan 22, 2016 at 02:25:27PM +0000, Nick Hoath wrote:
>> Use the first retired request on a new context to unpin
>> the old context. This ensures that the hw context remains
>> bound until it has been written back to by the GPU.
>> Now that the context is pinned until later in the request/context
>> lifecycle, it no longer needs to be pinned from context_queue to
>> retire_requests.
>> This fixes an issue with GuC submission where the GPU might not
>> have finished writing back the context before it is unpinned. This
>> results in a GPU hang.
>>
>> v2: Moved the new pin to cover GuC submission (Alex Dai)
>>      Moved the new unpin to request_retire to fix coverage leak
>> v3: Added switch to default context if freeing a still pinned
>>      context just in case the hw was actually still using it
>> v4: Unwrapped context unpin to allow calling without a request
>> v5: Only create a switch to idle context if the ring doesn't
>>      already have a request pending on it (Alex Dai)
>>      Rename unsaved to dirty to avoid double negatives (Dave Gordon)
>>      Changed _no_req postfix to __ prefix for consistency (Dave Gordon)
>>      Split out per engine cleanup from context_free as it
>>      was getting unwieldy
>>      Corrected locking (Dave Gordon)
>> v6: Removed some bikeshedding (Mika Kuoppala)
>>      Added explanation of the GuC hang that this fixes (Daniel Vetter)
>> v7: Removed extra per request pinning from ring reset code (Alex Dai)
>>      Added forced ring unpin/clean in error case in context free (Alex Dai)
>> v8: Renamed lrc specific last_context to lrc_last_context as there
>>      were some reset cases where the codepaths leaked (Mika Kuoppala)
>>      NULL'd last_context in reset case - there was a pointer leak
>>      if someone did reset->close context.
>> v9: Rebase over "Fix context/engine cleanup order"
>> v10: Rebase over nightly, remove WARN_ON which caused the
>>      dependency on dev.
>> v11: Kick BAT rerun
>> v12: Rebase
>>
>> Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
>> Issue: VIZ-4277
>
> When resending patches, please include everyone who ever commented on this
> in Cc: lines here. It's for the record and helps in assigning blame when
> things inevitably blow up again ;-)

Even when it's just a resend to cause a BAT run for coverage?

> -Daniel
>
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c | 37 +++++++++++++++----------------------
>>   1 file changed, 15 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index dbf3729..b469817 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -779,10 +779,10 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
>>   	if (intel_ring_stopped(request->ring))
>>   		return 0;
>>
>> -	if (request->ctx != ring->default_context) {
>> -		if (!request->ctx->engine[ring->id].dirty) {
>> +	if (request->ctx != request->ctx->i915->kernel_context) {
>> +		if (!request->ctx->engine[request->ring->id].dirty) {
>>   			intel_lr_context_pin(request);
>> -			request->ctx->engine[ring->id].dirty = true;
>> +			request->ctx->engine[request->ring->id].dirty = true;
>>   		}
>>   	}
>>
>> @@ -2447,9 +2447,7 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>>   			    struct drm_i915_gem_object *ctx_obj,
>>   			    struct intel_ringbuffer *ringbuf)
>>   {
>> -	int ret;
>> -
>> -	if (ctx == ring->default_context) {
>> +	if (ctx == ctx->i915->kernel_context) {
>>   		intel_unpin_ringbuffer_obj(ringbuf);
>>   		i915_gem_object_ggtt_unpin(ctx_obj);
>>   	}
>> @@ -2463,13 +2461,10 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>>   		 * otherwise create a switch to idle request
>>   		 */
>>   		if (list_empty(&ring->request_list)) {
>> -			int ret;
>> -
>> -			ret = i915_gem_request_alloc(
>> +			req = i915_gem_request_alloc(
>>   					ring,
>> -					ring->default_context,
>> -					&req);
>> -			if (!ret)
>> +					NULL);
>> +			if (!IS_ERR(req))
>>   				i915_add_request(req);
>>   			else
>>   				DRM_DEBUG("Failed to ensure context saved");
>> @@ -2479,6 +2474,8 @@ intel_lr_context_clean_ring(struct intel_context *ctx,
>>   					typeof(*req), list);
>>   		}
>>   		if (req) {
>> +			int ret;
>> +
>>   			ret = i915_wait_request(req);
>>   			if (ret != 0) {
>>   				/**
>> @@ -2515,17 +2512,13 @@ void intel_lr_context_free(struct intel_context *ctx)
>>   		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
>>   		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
>>
>> -		if (!ctx_obj)
>> -			continue;
>> -
>> -		if (ctx == ctx->i915->kernel_context) {
>> -			intel_unpin_ringbuffer_obj(ringbuf);
>> -			i915_gem_object_ggtt_unpin(ctx_obj);
>> -		}
>> +		if (ctx_obj)
>> +			intel_lr_context_clean_ring(
>> +						ctx,
>> +						ringbuf->ring,
>> +						ctx_obj,
>> +						ringbuf);
>>
>> -		WARN_ON(ctx->engine[i].pin_count);
>> -		intel_ringbuffer_free(ringbuf);
>> -		drm_gem_object_unreference(&ctx_obj->base);
>>   	}
>>   }
>>
>> --
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
Daniel Vetter Jan. 26, 2016, 10:08 a.m. UTC | #3
On Tue, Jan 26, 2016 at 09:43:42AM +0000, Nick Hoath wrote:
> On 25/01/2016 18:19, Daniel Vetter wrote:
> >On Fri, Jan 22, 2016 at 02:25:27PM +0000, Nick Hoath wrote:
> >>Use the first retired request on a new context to unpin
> >>the old context. This ensures that the hw context remains
> >>bound until it has been written back to by the GPU.
> >>Now that the context is pinned until later in the request/context
> >>lifecycle, it no longer needs to be pinned from context_queue to
> >>retire_requests.
> >>This fixes an issue with GuC submission where the GPU might not
> >>have finished writing back the context before it is unpinned. This
> >>results in a GPU hang.
> >>
> >>v2: Moved the new pin to cover GuC submission (Alex Dai)
> >>     Moved the new unpin to request_retire to fix coverage leak
> >>v3: Added switch to default context if freeing a still pinned
> >>     context just in case the hw was actually still using it
> >>v4: Unwrapped context unpin to allow calling without a request
> >>v5: Only create a switch to idle context if the ring doesn't
> >>     already have a request pending on it (Alex Dai)
> >>     Rename unsaved to dirty to avoid double negatives (Dave Gordon)
> >>     Changed _no_req postfix to __ prefix for consistency (Dave Gordon)
> >>     Split out per engine cleanup from context_free as it
> >>     was getting unwieldy
> >>     Corrected locking (Dave Gordon)
> >>v6: Removed some bikeshedding (Mika Kuoppala)
> >>     Added explanation of the GuC hang that this fixes (Daniel Vetter)
> >>v7: Removed extra per request pinning from ring reset code (Alex Dai)
> >>     Added forced ring unpin/clean in error case in context free (Alex Dai)
> >>v8: Renamed lrc specific last_context to lrc_last_context as there
> >>     were some reset cases where the codepaths leaked (Mika Kuoppala)
> >>     NULL'd last_context in reset case - there was a pointer leak
> >>     if someone did reset->close context.
> >>v9: Rebase over "Fix context/engine cleanup order"
> >>v10: Rebase over nightly, remove WARN_ON which caused the
> >>     dependency on dev.
> >>v11: Kick BAT rerun
> >>v12: Rebase
> >>
> >>Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> >>Issue: VIZ-4277
> >
> >When resending patches, please include everyone who ever commented on this
> >in Cc: lines here. It's for the record and helps in assigning blame when
> >things inevitably blow up again ;-)
> 
> Even when it's just a resend to cause a BAT run for coverage?

Yes, it's to have a record of folks who participated in a patch's
discussion in git. Since when it blows up you really want to know whom are
the folks you should Cc:, and for that it's best to have a Cc: list
ready-made in the offending patch.

I even go as far as adding Cc: lines for testers/reviwers on top of the
r-b/t-b tags I'm adding when resending, for extra laziness when the
inevitable revert comes around. But that's a bit over the top ;-)

Cheers, Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dbf3729..b469817 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -779,10 +779,10 @@  intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 	if (intel_ring_stopped(request->ring))
 		return 0;
 
-	if (request->ctx != ring->default_context) {
-		if (!request->ctx->engine[ring->id].dirty) {
+	if (request->ctx != request->ctx->i915->kernel_context) {
+		if (!request->ctx->engine[request->ring->id].dirty) {
 			intel_lr_context_pin(request);
-			request->ctx->engine[ring->id].dirty = true;
+			request->ctx->engine[request->ring->id].dirty = true;
 		}
 	}
 
@@ -2447,9 +2447,7 @@  intel_lr_context_clean_ring(struct intel_context *ctx,
 			    struct drm_i915_gem_object *ctx_obj,
 			    struct intel_ringbuffer *ringbuf)
 {
-	int ret;
-
-	if (ctx == ring->default_context) {
+	if (ctx == ctx->i915->kernel_context) {
 		intel_unpin_ringbuffer_obj(ringbuf);
 		i915_gem_object_ggtt_unpin(ctx_obj);
 	}
@@ -2463,13 +2461,10 @@  intel_lr_context_clean_ring(struct intel_context *ctx,
 		 * otherwise create a switch to idle request
 		 */
 		if (list_empty(&ring->request_list)) {
-			int ret;
-
-			ret = i915_gem_request_alloc(
+			req = i915_gem_request_alloc(
 					ring,
-					ring->default_context,
-					&req);
-			if (!ret)
+					NULL);
+			if (!IS_ERR(req))
 				i915_add_request(req);
 			else
 				DRM_DEBUG("Failed to ensure context saved");
@@ -2479,6 +2474,8 @@  intel_lr_context_clean_ring(struct intel_context *ctx,
 					typeof(*req), list);
 		}
 		if (req) {
+			int ret;
+
 			ret = i915_wait_request(req);
 			if (ret != 0) {
 				/**
@@ -2515,17 +2512,13 @@  void intel_lr_context_free(struct intel_context *ctx)
 		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
 		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
 
-		if (!ctx_obj)
-			continue;
-
-		if (ctx == ctx->i915->kernel_context) {
-			intel_unpin_ringbuffer_obj(ringbuf);
-			i915_gem_object_ggtt_unpin(ctx_obj);
-		}
+		if (ctx_obj)
+			intel_lr_context_clean_ring(
+						ctx,
+						ringbuf->ring,
+						ctx_obj,
+						ringbuf);
 
-		WARN_ON(ctx->engine[i].pin_count);
-		intel_ringbuffer_free(ringbuf);
-		drm_gem_object_unreference(&ctx_obj->base);
 	}
 }