diff mbox

[RFC,03/21] drm/i915: Ensure OLS & PLR are always in sync

Message ID 1412604925-11290-4-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison Oct. 6, 2014, 2:15 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The new seqno allocation code pre-allocates a 'lazy' request structure and then
tries to allocate the 'lazy' seqno. The seqno allocation can potentially wrap
around zero and when doing so, tries to idle the ring by waiting for all
outstanding work to complete. With a scheduler in place, this can mean first
submitting extra work to the ring. However, at this point in time, the lazy
request is valid but the lazy seqno is not. Some existing code was getting
confused by this state and Bad Things would happen.

The safest solution is to still allocate the lazy request in advance (to avoid
having to roll back in an out-of-memory situation) but to save the pointer in a
local variable rather than immediately updating the lazy pointer. Only after a
valid seqno has been acquired is the lazy request pointer actually updated.

This guarantees that both lazy values are either invalid or both valid. There
can no longer be an inconsistent state.

For: VIZ-4377
Signed-off-by: John.C.Harrison@Intel.com
---
 drivers/gpu/drm/i915/intel_lrc.c        |   42 ++++++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_ringbuffer.c |   29 +++++++++++++++------
 2 files changed, 48 insertions(+), 23 deletions(-)

Comments

Daniel Vetter Oct. 19, 2014, 12:32 p.m. UTC | #1
On Mon, Oct 06, 2014 at 03:15:07PM +0100, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> The new seqno alloction code pre-allocates a 'lazy' request structure and then
> tries to allocate the 'lazy' seqno. The seqno allocation can potential wrap
> around zero and when doing so, tries to idle the ring by waiting for all
> oustanding work to complete. With a scheduler in place, this can mean first
> submitting extra work to the ring. However, at this point in time, the lazy
> request is valid but the lazy seqno is not. Some existing code was getting
> confused by this state and Bad Things would happen.

I'm confused about this description. I think you need to go into more
detail about what exactly you mean with "new seqno allocation code" (does
this reference some future patches), "confused state" and "Bad Things".

I rather get the impression that this is just a bad interaction between the
scheduler and the seqno wrapping, which might be due to allocating the
seqno too early when the scheduler is active. So if that's the case then
imo this shouldn't be in the s/seqno/request/ prep work series.

> The safest solution is to still allocate the lazy request in advance (to avoid
> having to roll back in an out of memory sitation) but to save the pointer in a
> local variable rather than immediately updating the lazy pointer. Only after a
> valid seqno has been acquired is the lazy request pointer actually updated.
> 
> This guarantees that both lazy values are either invalid or both valid. There
> can no longer be an inconsistent state.
> 
> For: VIZ-4377
> Signed-off-by: John.C.Harrison@Intel.com
> ---
>  drivers/gpu/drm/i915/intel_lrc.c        |   42 ++++++++++++++++++++-----------
>  drivers/gpu/drm/i915/intel_ringbuffer.c |   29 +++++++++++++++------
>  2 files changed, 48 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index bafd38b..3ac2622 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -796,27 +796,39 @@ void intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf)
>  static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
>  				    struct intel_context *ctx)
>  {
> -	if (ring->outstanding_lazy_seqno)
> -		return 0;
> +	struct drm_i915_gem_request *request;
> +	int ret;
>  
> -	if (ring->preallocated_lazy_request == NULL) {
> -		struct drm_i915_gem_request *request;
> +	/* The aim is to replace seqno values with request structures. A step
> +	 * along the way is to switch to using the PLR in preference to the
> +	 * OLS. That requires the PLR to only be valid when the OLS is also
> +	 * valid. I.e., the two must be kept in step. */
>  
> -		request = kmalloc(sizeof(*request), GFP_KERNEL);
> -		if (request == NULL)
> -			return -ENOMEM;
> +	if (ring->outstanding_lazy_seqno) {
> +		BUG_ON(ring->preallocated_lazy_request == NULL);

Too many BUG_ON in this patch. This one here can easily be avoided with an
if (WARN_ON) return -EIO; or so; the one below would just indicate a leak,
so a plain WARN_ON is good enough.

BUG_ON makes it a major pain to debug anything out there in the field, so
the barrier to acceptance is really high.

> +		return 0;
> +	}
> +	BUG_ON(ring->preallocated_lazy_request != NULL);
>  
> -		/* Hold a reference to the context this request belongs to
> -		 * (we will need it when the time comes to emit/retire the
> -		 * request).
> -		 */
> -		request->ctx = ctx;
> -		i915_gem_context_reference(request->ctx);
> +	request = kmalloc(sizeof(*request), GFP_KERNEL);
> +	if (request == NULL)
> +		return -ENOMEM;
>  
> -		ring->preallocated_lazy_request = request;
> +	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
> +	if (ret) {
> +		kfree(request);
> +		return ret;
>  	}
>  
> -	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
> +	/* Hold a reference to the context this request belongs to
> +	 * (we will need it when the time comes to emit/retire the
> +	 * request).
> +	 */
> +	request->ctx = ctx;
> +	i915_gem_context_reference(request->ctx);
> +
> +	ring->preallocated_lazy_request = request;
> +	return 0;
>  }
>  
>  static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf,
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 25795f2..cceac67 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2000,20 +2000,33 @@ int intel_ring_idle(struct intel_engine_cs *ring)
>  static int
>  intel_ring_alloc_seqno(struct intel_engine_cs *ring)
>  {
> -	if (ring->outstanding_lazy_seqno)
> +	int ret;
> +	struct drm_i915_gem_request *request;
> +
> +	/* The aim is to replace seqno values with request structures. A step
> +	 * along the way is to switch to using the PLR in preference to the
> +	 * OLS. That requires the PLR to only be valid when the OLS is also
> +	 * valid. I.e., the two must be kept in step. */
> +
> +	if (ring->outstanding_lazy_seqno) {
> +		BUG_ON(ring->preallocated_lazy_request == NULL);
>  		return 0;
> +	}
>  
> -	if (ring->preallocated_lazy_request == NULL) {
> -		struct drm_i915_gem_request *request;
> +	BUG_ON(ring->preallocated_lazy_request != NULL);
>  
> -		request = kmalloc(sizeof(*request), GFP_KERNEL);
> -		if (request == NULL)
> -			return -ENOMEM;
> +	request = kmalloc(sizeof(*request), GFP_KERNEL);
> +	if (request == NULL)
> +		return -ENOMEM;
>  
> -		ring->preallocated_lazy_request = request;
> +	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
> +	if (ret) {
> +		kfree(request);
> +		return ret;
>  	}
>  
> -	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
> +	ring->preallocated_lazy_request = request;
> +	return 0;
>  }
>  
>  static int __intel_ring_prepare(struct intel_engine_cs *ring,
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
John Harrison Oct. 20, 2014, 2:39 p.m. UTC | #2
On 19/10/2014 13:32, Daniel Vetter wrote:
> On Mon, Oct 06, 2014 at 03:15:07PM +0100, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The new seqno alloction code pre-allocates a 'lazy' request structure and then
>> tries to allocate the 'lazy' seqno. The seqno allocation can potential wrap
>> around zero and when doing so, tries to idle the ring by waiting for all
>> oustanding work to complete. With a scheduler in place, this can mean first
>> submitting extra work to the ring. However, at this point in time, the lazy
>> request is valid but the lazy seqno is not. Some existing code was getting
>> confused by this state and Bad Things would happen.
> I'm confused about this description. I think you need to go into more
> detail about what exactly you mean with "new seqno allocation code" (does
> this reference some future patches), "confused state" and "Bad Things".
>
> I get a bit the impression that this is just a bad interaction between the
> scheduler and the seqno wrapping. Which might be due to allocating the
> seqno too early when the scheduler it active. So if that's the case then
> imo this shouldn't be in the s/seqno/request/ prep work series.

This patch was originally part of the scheduler set. However, it made 
sense to pull this in earlier as part of this patch set as a safer step 
in the shift of focus from seqno to request. I didn't get around to 
updating the description for the new location. However, it might be 
simpler to not bother with this change and just jump straight to 
removing the seqno completely.

The Bad Things were either null pointer dereferences when code used a null
lazy request having only checked for the presence of a lazy seqno, or
possibly attempting to process a zero seqno having only checked for the
presence of a lazy request. The former was a kernel panic, the latter a
silent failure to synchronise properly.

>> The safest solution is to still allocate the lazy request in advance (to avoid
>> having to roll back in an out of memory sitation) but to save the pointer in a
>> local variable rather than immediately updating the lazy pointer. Only after a
>> valid seqno has been acquired is the lazy request pointer actually updated.
>>
>> This guarantees that both lazy values are either invalid or both valid. There
>> can no longer be an inconsistent state.
>>
>> For: VIZ-4377
>> Signed-off-by: John.C.Harrison@Intel.com
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c        |   42 ++++++++++++++++++++-----------
>>   drivers/gpu/drm/i915/intel_ringbuffer.c |   29 +++++++++++++++------
>>   2 files changed, 48 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index bafd38b..3ac2622 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -796,27 +796,39 @@ void intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf)
>>   static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
>>   				    struct intel_context *ctx)
>>   {
>> -	if (ring->outstanding_lazy_seqno)
>> -		return 0;
>> +	struct drm_i915_gem_request *request;
>> +	int ret;
>>   
>> -	if (ring->preallocated_lazy_request == NULL) {
>> -		struct drm_i915_gem_request *request;
>> +	/* The aim is to replace seqno values with request structures. A step
>> +	 * along the way is to switch to using the PLR in preference to the
>> +	 * OLS. That requires the PLR to only be valid when the OLS is also
>> +	 * valid. I.e., the two must be kept in step. */
>>   
>> -		request = kmalloc(sizeof(*request), GFP_KERNEL);
>> -		if (request == NULL)
>> -			return -ENOMEM;
>> +	if (ring->outstanding_lazy_seqno) {
>> +		BUG_ON(ring->preallocated_lazy_request == NULL);
> Too many BUG_ON in this patch. This one here can easily be avoided with a
> if (WARN_ON) retrun -EIO; or so, the below one would just indicate a leak
> so a plain WARN_ON is good enough.
>
> BUG_ON makes it a major pain to debug anything out there in the field, so
> the barrier to acceptance is really high.
These BUG_ONs are purely temporary. They disappear when the seqno itself 
is removed and the function becomes '..._alloc_request()'.
>> +		return 0;
>> +	}
>> +	BUG_ON(ring->preallocated_lazy_request != NULL);
>>   
>> -		/* Hold a reference to the context this request belongs to
>> -		 * (we will need it when the time comes to emit/retire the
>> -		 * request).
>> -		 */
>> -		request->ctx = ctx;
>> -		i915_gem_context_reference(request->ctx);
>> +	request = kmalloc(sizeof(*request), GFP_KERNEL);
>> +	if (request == NULL)
>> +		return -ENOMEM;
>>   
>> -		ring->preallocated_lazy_request = request;
>> +	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
>> +	if (ret) {
>> +		kfree(request);
>> +		return ret;
>>   	}
>>   
>> -	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
>> +	/* Hold a reference to the context this request belongs to
>> +	 * (we will need it when the time comes to emit/retire the
>> +	 * request).
>> +	 */
>> +	request->ctx = ctx;
>> +	i915_gem_context_reference(request->ctx);
>> +
>> +	ring->preallocated_lazy_request = request;
>> +	return 0;
>>   }
>>   
>>   static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf,
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 25795f2..cceac67 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -2000,20 +2000,33 @@ int intel_ring_idle(struct intel_engine_cs *ring)
>>   static int
>>   intel_ring_alloc_seqno(struct intel_engine_cs *ring)
>>   {
>> -	if (ring->outstanding_lazy_seqno)
>> +	int ret;
>> +	struct drm_i915_gem_request *request;
>> +
>> +	/* The aim is to replace seqno values with request structures. A step
>> +	 * along the way is to switch to using the PLR in preference to the
>> +	 * OLS. That requires the PLR to only be valid when the OLS is also
>> +	 * valid. I.e., the two must be kept in step. */
>> +
>> +	if (ring->outstanding_lazy_seqno) {
>> +		BUG_ON(ring->preallocated_lazy_request == NULL);
>>   		return 0;
>> +	}
>>   
>> -	if (ring->preallocated_lazy_request == NULL) {
>> -		struct drm_i915_gem_request *request;
>> +	BUG_ON(ring->preallocated_lazy_request != NULL);
>>   
>> -		request = kmalloc(sizeof(*request), GFP_KERNEL);
>> -		if (request == NULL)
>> -			return -ENOMEM;
>> +	request = kmalloc(sizeof(*request), GFP_KERNEL);
>> +	if (request == NULL)
>> +		return -ENOMEM;
>>   
>> -		ring->preallocated_lazy_request = request;
>> +	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
>> +	if (ret) {
>> +		kfree(request);
>> +		return ret;
>>   	}
>>   
>> -	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
>> +	ring->preallocated_lazy_request = request;
>> +	return 0;
>>   }
>>   
>>   static int __intel_ring_prepare(struct intel_engine_cs *ring,
>> -- 
>> 1.7.9.5
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index bafd38b..3ac2622 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -796,27 +796,39 @@  void intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf)
 static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
 				    struct intel_context *ctx)
 {
-	if (ring->outstanding_lazy_seqno)
-		return 0;
+	struct drm_i915_gem_request *request;
+	int ret;
 
-	if (ring->preallocated_lazy_request == NULL) {
-		struct drm_i915_gem_request *request;
+	/* The aim is to replace seqno values with request structures. A step
+	 * along the way is to switch to using the PLR in preference to the
+	 * OLS. That requires the PLR to only be valid when the OLS is also
+	 * valid. I.e., the two must be kept in step. */
 
-		request = kmalloc(sizeof(*request), GFP_KERNEL);
-		if (request == NULL)
-			return -ENOMEM;
+	if (ring->outstanding_lazy_seqno) {
+		BUG_ON(ring->preallocated_lazy_request == NULL);
+		return 0;
+	}
+	BUG_ON(ring->preallocated_lazy_request != NULL);
 
-		/* Hold a reference to the context this request belongs to
-		 * (we will need it when the time comes to emit/retire the
-		 * request).
-		 */
-		request->ctx = ctx;
-		i915_gem_context_reference(request->ctx);
+	request = kmalloc(sizeof(*request), GFP_KERNEL);
+	if (request == NULL)
+		return -ENOMEM;
 
-		ring->preallocated_lazy_request = request;
+	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
+	if (ret) {
+		kfree(request);
+		return ret;
 	}
 
-	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
+	/* Hold a reference to the context this request belongs to
+	 * (we will need it when the time comes to emit/retire the
+	 * request).
+	 */
+	request->ctx = ctx;
+	i915_gem_context_reference(request->ctx);
+
+	ring->preallocated_lazy_request = request;
+	return 0;
 }
 
 static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 25795f2..cceac67 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2000,20 +2000,33 @@  int intel_ring_idle(struct intel_engine_cs *ring)
 static int
 intel_ring_alloc_seqno(struct intel_engine_cs *ring)
 {
-	if (ring->outstanding_lazy_seqno)
+	int ret;
+	struct drm_i915_gem_request *request;
+
+	/* The aim is to replace seqno values with request structures. A step
+	 * along the way is to switch to using the PLR in preference to the
+	 * OLS. That requires the PLR to only be valid when the OLS is also
+	 * valid. I.e., the two must be kept in step. */
+
+	if (ring->outstanding_lazy_seqno) {
+		BUG_ON(ring->preallocated_lazy_request == NULL);
 		return 0;
+	}
 
-	if (ring->preallocated_lazy_request == NULL) {
-		struct drm_i915_gem_request *request;
+	BUG_ON(ring->preallocated_lazy_request != NULL);
 
-		request = kmalloc(sizeof(*request), GFP_KERNEL);
-		if (request == NULL)
-			return -ENOMEM;
+	request = kmalloc(sizeof(*request), GFP_KERNEL);
+	if (request == NULL)
+		return -ENOMEM;
 
-		ring->preallocated_lazy_request = request;
+	ret = i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
+	if (ret) {
+		kfree(request);
+		return ret;
 	}
 
-	return i915_gem_get_seqno(ring->dev, &ring->outstanding_lazy_seqno);
+	ring->preallocated_lazy_request = request;
+	return 0;
 }
 
 static int __intel_ring_prepare(struct intel_engine_cs *ring,