
drm/i915: Prevent recursion by retiring requests when the ring is full

Message ID 1390862587-26669-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Jan. 27, 2014, 10:43 p.m. UTC
As the VMs do not track activity of objects and instead use a large
hammer to forcibly idle and evict all of their associated objects when
one is released, it is possible for that to cause a recursion when we
need to wait for free space on a ring and call retire requests.
(intel_ring_begin -> intel_ring_wait_request ->
i915_gem_retire_requests_ring -> i915_gem_context_free ->
i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)

In order to remove the requirement for calling retire-requests from
intel_ring_wait_request, we have to inline a couple of steps from
retiring requests, notably we have to record the position of the request
we wait for and use that to update the available ring space.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)
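
As an aside on the mechanism described above: once the tail of the request we
wait for has been recorded, the available ring space can be recomputed
directly, wrap-around included, without retiring anything. A minimal sketch of
the idea (illustrative only -- the names, signature and reserve handling are
not the driver's actual ring_space()):

#include <stdint.h>

/*
 * Illustrative sketch, not drivers/gpu/drm/i915 code.  Free space in a
 * circular ring runs from the read pointer (head) back around to the
 * write pointer (tail).  After waiting for a request, the tail recorded
 * in that request becomes the new effective head.
 */
static int sketch_ring_space(uint32_t head, uint32_t tail, uint32_t size)
{
	int space = (int)head - (int)tail;

	if (space <= 0)
		space += size;	/* head has wrapped behind tail */

	/* the real driver also keeps a small reserve so that a completely
	 * full ring is never mistaken for an empty one */
	return space;
}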

Comments

Daniel Vetter Jan. 28, 2014, 8:11 a.m. UTC | #1
On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> As the VMs do not track activity of objects and instead use a large
> hammer to forcibly idle and evict all of their associated objects when
> one is released, it is possible for that to cause a recursion when we
> need to wait for free space on a ring and call retire requests.
> (intel_ring_begin -> intel_ring_wait_request ->
> i915_gem_retire_requests_ring -> i915_gem_context_free ->
> i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)

Is there no way to get rid of the gpu_idle? Imo doing a gpu idle as part
of the cleanup of a retired request is just not great and has a good
chance to trip us up all over the place since it's _really_ unexpected.

So if we can do it (and iirc we've agreed that all the ppgtt cleanup is
mostly fluff) then I prefer we remove the root-cause instead of just
chopping off another head of the hydra.
-Daniel

> 
> In order to remove the requirement for calling retire-requests from
> intel_ring_wait_request, we have to inline a couple of steps from
> retiring requests, notably we have to record the position of the request
> we wait for and use that to update the available ring space.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 25 +++++--------------------
>  1 file changed, 5 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 10ff32d09c14..0da7c257159a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1431,28 +1431,16 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
>  	cleanup_status_page(ring);
>  }
>  
> -static int intel_ring_wait_seqno(struct intel_ring_buffer *ring, u32 seqno)
> -{
> -	int ret;
> -
> -	ret = i915_wait_seqno(ring, seqno);
> -	if (!ret)
> -		i915_gem_retire_requests_ring(ring);
> -
> -	return ret;
> -}
> -
>  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  {
>  	struct drm_i915_gem_request *request;
> -	u32 seqno = 0;
> +	u32 seqno = 0, tail;
>  	int ret;
>  
> -	i915_gem_retire_requests_ring(ring);
> -
>  	if (ring->last_retired_head != -1) {
>  		ring->head = ring->last_retired_head;
>  		ring->last_retired_head = -1;
> +
>  		ring->space = ring_space(ring);
>  		if (ring->space >= n)
>  			return 0;
> @@ -1469,6 +1457,7 @@ static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  			space += ring->size;
>  		if (space >= n) {
>  			seqno = request->seqno;
> +			tail = request->tail;
>  			break;
>  		}
>  
> @@ -1483,15 +1472,11 @@ static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  	if (seqno == 0)
>  		return -ENOSPC;
>  
> -	ret = intel_ring_wait_seqno(ring, seqno);
> +	ret = i915_wait_seqno(ring, seqno);
>  	if (ret)
>  		return ret;
>  
> -	if (WARN_ON(ring->last_retired_head == -1))
> -		return -ENOSPC;
> -
> -	ring->head = ring->last_retired_head;
> -	ring->last_retired_head = -1;
> +	ring->head = tail;
>  	ring->space = ring_space(ring);
>  	if (WARN_ON(ring->space < n))
>  		return -ENOSPC;
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 28, 2014, 8:22 a.m. UTC | #2
On Tue, Jan 28, 2014 at 09:11:17AM +0100, Daniel Vetter wrote:
> On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> > As the VMs do not track activity of objects and instead use a large
> > hammer to forcibly idle and evict all of their associated objects when
> > one is released, it is possible for that to cause a recursion when we
> > need to wait for free space on a ring and call retire requests.
> > (intel_ring_begin -> intel_ring_wait_request ->
> > i915_gem_retire_requests_ring -> i915_gem_context_free ->
> > i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)
> 
> Is there no way to get rid of the gpu_idle? Imo doing a gpu idle as part
> of the cleanup of a retired request is just not great and has a good
> chance to trip us up all over the place since it's _really_ unexpected.

Yes, that gpu-idle is only an artifact of lacking activity counting on
the vm.
 
> So if we can do it (and iirc we've agreed that all the ppgtt cleanup is
> mostly fluff) then I prefer we remove the root-cause instead of just
> chopping off another head of the hydra.

Look again, I think this is a welcome simplification of that piece of code.
We lose the optimisation of having the most accurate ring->space
following the wait in exchange for having a straightforward wait&update.
Plus retiring requests at random points in time has turned out to be a
bad idea - wait long enough and I can probably find several other code
paths that could explode here.
-Chris
Daniel Vetter Jan. 28, 2014, 8:27 a.m. UTC | #3
On Tue, Jan 28, 2014 at 08:22:32AM +0000, Chris Wilson wrote:
> On Tue, Jan 28, 2014 at 09:11:17AM +0100, Daniel Vetter wrote:
> > On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> > > As the VMs do not track activity of objects and instead use a large
> > > hammer to forcibly idle and evict all of their associated objects when
> > > one is released, it is possible for that to cause a recursion when we
> > > need to wait for free space on a ring and call retire requests.
> > > (intel_ring_begin -> intel_ring_wait_request ->
> > > i915_gem_retire_requests_ring -> i915_gem_context_free ->
> > > i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)
> > 
> > Is there no way to get rid of the gpu_idle? Imo doing a gpu idle as part
> > of the cleanup of a retired request is just not great and has a good
> > chance to trip us up all over the place since it's _really_ unexpected.
> 
> Yes, that gpu-idle is only an artifact of lacking activity counting on
> the vm.
>  
> > So if we can do it (and iirc we've agreed that all the ppgtt cleanup is
> > mostly fluff) then I prefer we remove the root-cause instead of just
> > chopping off another head of the hydra.
> 
> Look again, I think this is a welcome simplification of that piece of code.
> We lose the optimisation of having the most accurate ring->space
> following the wait in exchange for having a straightforward wait&update.
> Plus retiring requests at random points in time has turned out to be a
> bad idea - wait long enough and I can probably find several other code
> paths that could explode here.

Hm, good point, I'll agree this patch has value in itself. I'll quote our
discussion here in the commit message when merging it and will look for a
review victim now.
-Daniel
Ville Syrjälä Jan. 28, 2014, 11:15 a.m. UTC | #4
On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> As the VMs do not track activity of objects and instead use a large
> hammer to forcibly idle and evict all of their associated objects when
> one is released, it is possible for that to cause a recursion when we
> need to wait for free space on a ring and call retire requests.
> (intel_ring_begin -> intel_ring_wait_request ->
> i915_gem_retire_requests_ring -> i915_gem_context_free ->
> i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)
> 
> In order to remove the requirement for calling retire-requests from
> intel_ring_wait_request, we have to inline a couple of steps from
> retiring requests, notably we have to record the position of the request
> we wait for and use that to update the available ring space.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Looks good to me.
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

I do have a couple of questions about request->tail though.

We set it to -1 in intel_ring_wait_request(). Isn't that going to cause
problems for i915_request_guilty()?

When not -1, request->tail points to just before the commands that
.add_request() adds to the ring. So that means intel_ring_wait_request()
might have to wait for one extra request, and I guess more importantly
if the GPU hangs inside the .add_request() commands, we won't attribute
the hang to the request in question. Was it designed to be that way, or
is there a bug here?
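
To illustrate the concern with a simplified, hypothetical check (this is not
the driver's actual i915_request_guilty(), and it ignores ring wrap-around):
if the guilty window for a request ends at request->tail, and that tail is
recorded before the .add_request() commands, then a hang inside those
commands falls outside the window and gets blamed on the following request.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical sketch of guilt attribution: is the hung head (ACTHD)
 * inside the ring range covered by this request? */
static bool sketch_head_inside_request(uint32_t acthd,
				       uint32_t req_head, uint32_t req_tail)
{
	return acthd >= req_head && acthd < req_tail;
}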

> ---
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 25 +++++--------------------
>  1 file changed, 5 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 10ff32d09c14..0da7c257159a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1431,28 +1431,16 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
>  	cleanup_status_page(ring);
>  }
>  
> -static int intel_ring_wait_seqno(struct intel_ring_buffer *ring, u32 seqno)
> -{
> -	int ret;
> -
> -	ret = i915_wait_seqno(ring, seqno);
> -	if (!ret)
> -		i915_gem_retire_requests_ring(ring);
> -
> -	return ret;
> -}
> -
>  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  {
>  	struct drm_i915_gem_request *request;
> -	u32 seqno = 0;
> +	u32 seqno = 0, tail;
>  	int ret;
>  
> -	i915_gem_retire_requests_ring(ring);
> -
>  	if (ring->last_retired_head != -1) {
>  		ring->head = ring->last_retired_head;
>  		ring->last_retired_head = -1;
> +
>  		ring->space = ring_space(ring);
>  		if (ring->space >= n)
>  			return 0;
> @@ -1469,6 +1457,7 @@ static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  			space += ring->size;
>  		if (space >= n) {
>  			seqno = request->seqno;
> +			tail = request->tail;
>  			break;
>  		}
>  
> @@ -1483,15 +1472,11 @@ static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
>  	if (seqno == 0)
>  		return -ENOSPC;
>  
> -	ret = intel_ring_wait_seqno(ring, seqno);
> +	ret = i915_wait_seqno(ring, seqno);
>  	if (ret)
>  		return ret;
>  
> -	if (WARN_ON(ring->last_retired_head == -1))
> -		return -ENOSPC;
> -
> -	ring->head = ring->last_retired_head;
> -	ring->last_retired_head = -1;
> +	ring->head = tail;
>  	ring->space = ring_space(ring);
>  	if (WARN_ON(ring->space < n))
>  		return -ENOSPC;
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 28, 2014, 11:25 a.m. UTC | #5
On Tue, Jan 28, 2014 at 01:15:35PM +0200, Ville Syrjälä wrote:
> On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> > As the VMs do not track activity of objects and instead use a large
> > hammer to forcibly idle and evict all of their associated objects when
> > one is released, it is possible for that to cause a recursion when we
> > need to wait for free space on a ring and call retire requests.
> > (intel_ring_begin -> intel_ring_wait_request ->
> > i915_gem_retire_requests_ring -> i915_gem_context_free ->
> > i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)
> > 
> > In order to remove the requirement for calling retire-requests from
> > intel_ring_wait_request, we have to inline a couple of steps from
> > retiring requests, notably we have to record the position of the request
> > we wait for and use that to update the available ring space.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Looks good to me.
> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> 
> I do have a couple of questions about request->tail though.
> 
> We set it to -1 in intel_ring_wait_request(). Isn't that going to cause
> problems for i915_request_guilty()?
> 
> When not -1, request->tail points to just before the commands that
> .add_request() adds to the ring. So that means intel_ring_wait_request()
> might have to wait for one extra request, and I guess more importantly
> if the GPU hangs inside the .add_request() commands, we won't attribute
> the hang to the request in question. Was it designed to be that way, or
> is there a bug here?

Ah good questions. I completely forgot about the behaviour here when we
adjusted this for hangstats...

Setting it to -1 should not confuse the guilty search since that should
be done (or at least changed so that it is) based on a completion search.
After making that change, we should be able to set request->tail back to
being just after the request. Also I think we are quite safe to drop the
manipulation of request->tail inside intel_ring_wait_request().
-Chris
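
A completion search of the kind Chris describes needs only a wrap-safe seqno
comparison rather than any request->tail bookkeeping. A sketch along the lines
of the driver's i915_seqno_passed() helper (naming and signature here are
illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Seqnos are compared with signed arithmetic so the check stays correct
 * across 32-bit wrap-around: a request is complete once the ring's last
 * completed seqno has passed the request's seqno. */
static bool sketch_seqno_passed(uint32_t completed, uint32_t seqno)
{
	return (int32_t)(completed - seqno) >= 0;
}
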
Daniel Vetter Feb. 6, 2014, 4:44 p.m. UTC | #6
On Tue, Jan 28, 2014 at 11:25:41AM +0000, Chris Wilson wrote:
> On Tue, Jan 28, 2014 at 01:15:35PM +0200, Ville Syrjälä wrote:
> > On Mon, Jan 27, 2014 at 10:43:07PM +0000, Chris Wilson wrote:
> > > As the VMs do not track activity of objects and instead use a large
> > > hammer to forcibly idle and evict all of their associated objects when
> > > one is released, it is possible for that to cause a recursion when we
> > > need to wait for free space on a ring and call retire requests.
> > > (intel_ring_begin -> intel_ring_wait_request ->
> > > i915_gem_retire_requests_ring -> i915_gem_context_free ->
> > > i915_gem_evict_vm -> i915_gpu_idle -> intel_ring_begin etc)
> > > 
> > > In order to remove the requirement for calling retire-requests from
> > > intel_ring_wait_request, we have to inline a couple of steps from
> > > retiring requests, notably we have to record the position of the request
> > > we wait for and use that to update the available ring space.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Looks good to me.
> > Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > 
> > I do have a couple of questions about request->tail though.
> > 
> > We set it to -1 in intel_ring_wait_request(). Isn't that going to cause
> > problems for i915_request_guilty()?
> > 
> > When not -1, request->tail points to just before the commands that
> > .add_request() adds to the ring. So that means intel_ring_wait_request()
> > might have to wait for one extra request, and I guess more importantly
> > if the GPU hangs inside the .add_request() commands, we won't attribute
> > the hang to the request in question. Was it designed to be that way, or
> > is there a bug here?
> 
> Ah good questions. I completely forgot about the behaviour here when we
> adjusted this for hangstats...
> 
> Setting it to -1 should not confuse the guilty search since that should
> be done (or at least changed so that it is) based on a completion search.
> After making that change, we should be able to set request->tail back to
> being just after the request. Also I think we are quite safe to drop the
> manipulation of request->tail inside intel_ring_wait_request().

Queued for -next, thanks for the patch.
-Daniel

Patch

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 10ff32d09c14..0da7c257159a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1431,28 +1431,16 @@  void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
 	cleanup_status_page(ring);
 }
 
-static int intel_ring_wait_seqno(struct intel_ring_buffer *ring, u32 seqno)
-{
-	int ret;
-
-	ret = i915_wait_seqno(ring, seqno);
-	if (!ret)
-		i915_gem_retire_requests_ring(ring);
-
-	return ret;
-}
-
 static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
 {
 	struct drm_i915_gem_request *request;
-	u32 seqno = 0;
+	u32 seqno = 0, tail;
 	int ret;
 
-	i915_gem_retire_requests_ring(ring);
-
 	if (ring->last_retired_head != -1) {
 		ring->head = ring->last_retired_head;
 		ring->last_retired_head = -1;
+
 		ring->space = ring_space(ring);
 		if (ring->space >= n)
 			return 0;
@@ -1469,6 +1457,7 @@  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
 			space += ring->size;
 		if (space >= n) {
 			seqno = request->seqno;
+			tail = request->tail;
 			break;
 		}
 
@@ -1483,15 +1472,11 @@  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
 	if (seqno == 0)
 		return -ENOSPC;
 
-	ret = intel_ring_wait_seqno(ring, seqno);
+	ret = i915_wait_seqno(ring, seqno);
 	if (ret)
 		return ret;
 
-	if (WARN_ON(ring->last_retired_head == -1))
-		return -ENOSPC;
-
-	ring->head = ring->last_retired_head;
-	ring->last_retired_head = -1;
+	ring->head = tail;
 	ring->space = ring_space(ring);
 	if (WARN_ON(ring->space < n))
 		return -ENOSPC;