diff mbox

[05/51] drm/i915: Add return code check to i915_gem_execbuffer_retire_commands()

Message ID 1423828140-10653-6-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison Feb. 13, 2015, 11:48 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

For some reason, the i915_add_request() call in
i915_gem_execbuffer_retire_commands() was explicitly having its return code
ignored. The _retire_commands() function itself was 'void'. Given that
_add_request() can fail without dispatching the batch buffer, this seems odd.

Also shrunk the parameter list to a single structure as everything it requires
is available in the execbuff_params object.

For: VIZ-5115
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |    5 +----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   16 +++++++---------
 drivers/gpu/drm/i915/intel_lrc.c           |    3 +--
 3 files changed, 9 insertions(+), 15 deletions(-)

Comments

Daniel Vetter Feb. 25, 2015, 10:17 p.m. UTC | #1
On Fri, Feb 13, 2015 at 11:48:14AM +0000, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> For some reason, the i915_add_request() call in
> i915_gem_execbuffer_retire_commands() was explicitly having its return code
> ignored. The _retire_commands() function itself was 'void'. Given that
> _add_request() can fail without dispatching the batch buffer, this seems odd.

I was so convinced we've had a commit somewhere explaining this, but
apparently not.

The deal is that after the dispatch call we have the batch commit and
there's no going back any more, which also means we can't return an error
code from the ioctl. So if you return -EIO or -ENOMEM that's a pretty bad
lie and you really have to ignore that error code.

Again I've tried to dig up the commit for that but that was lost in the
maze of the past 5 years of changes. We've had piles of older approaches
to deal with this issue:
- Don't even emit a request, just mark objects as gpu dirty. Only when
  waiting did we emit flushes and requests, which again gave us a
  context to return the error. This resulted in horrible latency since
  flushes were way too late and also all that book-keeping was not worth
  it at all. Don't ask ;-)
- Emit flushes right away, but if we fail to alloc the request set the
  outstanding lazy request bit. The job of the check_olr function used in
  waits was to notice that and retry the allocation.
- Preallocate the request, but that still leaves the possibility that the
  gpu dies. But since we've committed hangcheck will clean this up and we
  can just ignore the -EIO.

Given all that backstory: Why does add_request/retire_commands suddenly
need to fail?

Cheers, Daniel

> Also shrunk the parameter list to a single structure as everything it requires
> is available in the execbuff_params object.
> 
> For: VIZ-5115
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |    5 +----
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   16 +++++++---------
>  drivers/gpu/drm/i915/intel_lrc.c           |    3 +--
>  3 files changed, 9 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e6d616b..143bc63 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2624,10 +2624,7 @@ int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
>  			     struct drm_file *file_priv);
>  void i915_gem_execbuffer_move_to_active(struct list_head *vmas,
>  					struct intel_engine_cs *ring);
> -void i915_gem_execbuffer_retire_commands(struct drm_device *dev,
> -					 struct drm_file *file,
> -					 struct intel_engine_cs *ring,
> -					 struct drm_i915_gem_object *obj);
> +int i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params);
>  int i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
>  				   struct drm_i915_gem_execbuffer2 *args,
>  				   struct list_head *vmas);
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 93b0ef0..ca85803 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -989,17 +989,15 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
>  	}
>  }
>  
> -void
> -i915_gem_execbuffer_retire_commands(struct drm_device *dev,
> -				    struct drm_file *file,
> -				    struct intel_engine_cs *ring,
> -				    struct drm_i915_gem_object *obj)
> +int
> +i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params)
>  {
>  	/* Unconditionally force add_request to emit a full flush. */
> -	ring->gpu_caches_dirty = true;
> +	params->ring->gpu_caches_dirty = true;
>  
>  	/* Add a breadcrumb for the completion of the batch buffer */
> -	(void)__i915_add_request(ring, file, obj);
> +	return __i915_add_request(params->ring, params->file,
> +				  params->batch_obj);
>  }
>  
>  static int
> @@ -1282,8 +1280,8 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
>  	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), params->dispatch_flags);
>  
>  	i915_gem_execbuffer_move_to_active(vmas, ring);
> -	i915_gem_execbuffer_retire_commands(params->dev, params->file, ring,
> -					    params->batch_obj);
> +
> +	ret = i915_gem_execbuffer_retire_commands(params);
>  
>  error:
>  	kfree(cliprects);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ca29290..90400d0d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -706,9 +706,8 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>  	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), params->dispatch_flags);
>  
>  	i915_gem_execbuffer_move_to_active(vmas, ring);
> -	i915_gem_execbuffer_retire_commands(params->dev, params->file, ring, params->batch_obj);
>  
> -	return 0;
> +	return i915_gem_execbuffer_retire_commands(params);
>  }
>  
>  void intel_execlists_retire_requests(struct intel_engine_cs *ring)
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniel Vetter Feb. 26, 2015, 2:26 a.m. UTC | #2
On Wed, Feb 25, 2015 at 11:17:00PM +0100, Daniel Vetter wrote:
> On Fri, Feb 13, 2015 at 11:48:14AM +0000, John.C.Harrison@Intel.com wrote:
> > From: John Harrison <John.C.Harrison@Intel.com>
> > 
> > For some reason, the i915_add_request() call in
> > i915_gem_execbuffer_retire_commands() was explicitly having its return code
> > ignored. The _retire_commands() function itself was 'void'. Given that
> > _add_request() can fail without dispatching the batch buffer, this seems odd.
> 
> I was so convinced we've had a commit somewhere explaining this, but
> apparently not.
> 
> The deal is that after the dispatch call we have the batch commit and
> there's no going back any more, which also means we can't return an error
> code from the ioctl. So if you return -EIO or -ENOMEM that's a pretty bad
> lie and you really have to ignore that error code.
> 
> Again I've tried to dig up the commit for that but that was lost in the
> maze of the past 5 years of changes. We've had piles of older approaches
> to deal with this issue:
> - Don't even emit a request, just mark objects as gpu dirty. Only when
>   waiting did we emit flushes and requests, which again again gave us a
>   context to return the error. This resulted in horrible latency since
>   flushes where wait too late and also all that book-keeping was not worth
>   it at all. Don't ask ;-)
> - Emit flushes right away, but if we fail to alloc the request set the
>   outstanding lazy request bit. The job of the check_olr function used in
>   waits was to notice that and retry the allocation.
> - Preallocate the request, but that still leaves the possibility that the
>   gpu dies. But since we've committed hangcheck will clean this up and we
>   can just ignore the -EIO.

It's actually worse since it's not just -EIO but also -EINTR, returned by
intel_ring_begin when we're thrashing the gpu a bit too badly with
requests. Which means we really need to guarantee that the request is
completed properly, eventually since it's not just for fatal gpu hangs.

Atm that's done by only clearing outstanding_lazy_request after we've
really emitted the request fully. That guarantees that even when parts of
the request emission to the ringbuf fails we'll retry on the next wait if
needed.

A possible fix to make this infallible would be to reserve some fixed
amount of ringbuf credit at request creation time and then consume it
here. Of course we'd need checks to make sure we never use more ringspace
than what we reserve. To avoid massive churn we could convert
I915_RING_FREE_SPACE into a variable and increase it enough when
allocating the request. And then reduce it again at the start of
add_request.
-Daniel
John Harrison March 5, 2015, 1:06 p.m. UTC | #3
On 26/02/2015 02:26, Daniel Vetter wrote:
> On Wed, Feb 25, 2015 at 11:17:00PM +0100, Daniel Vetter wrote:
>> On Fri, Feb 13, 2015 at 11:48:14AM +0000, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> For some reason, the i915_add_request() call in
>>> i915_gem_execbuffer_retire_commands() was explicitly having its return code
>>> ignored. The _retire_commands() function itself was 'void'. Given that
>>> _add_request() can fail without dispatching the batch buffer, this seems odd.
>> I was so convinced we've had a commit somewhere explaining this, but
>> apparently not.
>>
>> The deal is that after the dispatch call we have the batch commit and
>> there's no going back any more, which also means we can't return an error
>> code from the ioctl. So if you return -EIO or -ENOMEM that's a pretty bad
>> lie and you really have to ignore that error code.
>>
>> Again I've tried to dig up the commit for that but that was lost in the
>> maze of the past 5 years of changes. We've had piles of older approaches
>> to deal with this issue:
>> - Don't even emit a request, just mark objects as gpu dirty. Only when
>>    waiting did we emit flushes and requests, which again again gave us a
>>    context to return the error. This resulted in horrible latency since
>>    flushes where wait too late and also all that book-keeping was not worth
>>    it at all. Don't ask ;-)
>> - Emit flushes right away, but if we fail to alloc the request set the
>>    outstanding lazy request bit. The job of the check_olr function used in
>>    waits was to notice that and retry the allocation.
>> - Preallocate the request, but that still leaves the possibility that the
>>    gpu dies. But since we've committed hangcheck will clean this up and we
>>    can just ignore the -EIO.
>>
>> Given all that backstory: Why does add_request/retire_commands suddenly
>> need to fail?
The problem is that if add_request() fails and the request is not added 
to ring->request_list then it will be lost. As soon as the execbuff code 
returns, there is no longer a request pointer floating around so it 
can't have add_request() called on it later. Thus the request will never 
be retired, the objects, context, etc never dereferenced, and basically 
lots of stuff will be leaked. Without the OLR to hoover up the failures, 
the add_request() call really must not be allowed to give up.

> It's actually worse since it's not just -EIO but also -EINTR, returned by
> intel_ring_begin when we're thrashing the gpu a bit too badly with
> requests. Which means we really need to guarantee that the request is
> completed properly, eventually since it's not just for fatal gpu hangs.
>
> Atm that's done by only clearing outstanding_lazy_request after we've
> really emitted the request fully. That guarantees that even when parts of
> the request emission to the ringbuf fails we'll retry on the next wait if
> needed.
>
> A possible fix to make this infallible would be to reserve some fixed
> amount of ringbuf credit at request creation time and then consume it
> here. Of course we'd need checks to make sure we never use more ringspace
> than what we reserve. To avoid massive churn we could convert
> I915_RING_FREE_SPACE into a variable and increase it enough when
> allocating the request. And then reduce it again at the start of
> add_request.
> -Daniel

I don't think you can guarantee to reserve enough space at request 
creation time. You have no idea how much space will be required by what 
ever piece of code is wanting the request. It could be a few words or it 
might be reams and reams of workaround goo. One of the scheduler patches 
does improve this and do a 'large enough' ring_begin() at the start of 
the execbuffer submission path in order to prevent out of space issues 
and other such problems half way through that could lead to a partial 
submission. However, even that is not absolutely guaranteed 100% 
failure proof.

How about changing add_request() so that it can't fail. As in, the cache 
flush call and the emit request call can still fail due to running 
out of ring space, but add_request() just ignores that and keeps going 
anyway. That way the request is still correctly tracked and will be 
retired eventually. The only issues are unflushed caches and no seqno 
interrupt being generated. However, if the assumption is that another 
request will be submitted shortly (which is extremely likely if the 
system is busy enough to cause a failure during add_request!) then this 
will be fine. The following request will flush the caches and write the 
next seqno along to the ringbuffer. When that pops out, both the broken 
request and the new one will be considered complete and can be retired. 
The only issue is if the broken request is that last one to be submitted 
and is then waited on. In that case, you will get a timeout/hang as the 
request will never complete. Although that could be worked around by 
setting a 'failed request' flag in the ring and having the wait code (or 
even the currently redundant check_olr function) look at that and 
attempt a brand new (but empty) request submission.

Or maybe a simpler solution is to just keep a 'last failed request' 
pointer in the ring. Sort of a not-quite-OLR. If add_request() fails, it 
saves the request pointer here instead of adding it to the request list. 
A subsequent request allocation call starts by checking the 'last 
failed' value and retries the add_request() call if present. At that 
point it is allowed to fail. I guess it still needs to be done by 
check_olr as well to prevent a wait from stalling if no other requests 
are submitted.

John.
Daniel Vetter March 5, 2015, 2:44 p.m. UTC | #4
On Thu, Mar 05, 2015 at 01:06:10PM +0000, John Harrison wrote:
> On 26/02/2015 02:26, Daniel Vetter wrote:
> >On Wed, Feb 25, 2015 at 11:17:00PM +0100, Daniel Vetter wrote:
> >>On Fri, Feb 13, 2015 at 11:48:14AM +0000, John.C.Harrison@Intel.com wrote:
> >>>From: John Harrison <John.C.Harrison@Intel.com>
> >>>
> >>>For some reason, the i915_add_request() call in
> >>>i915_gem_execbuffer_retire_commands() was explicitly having its return code
> >>>ignored. The _retire_commands() function itself was 'void'. Given that
> >>>_add_request() can fail without dispatching the batch buffer, this seems odd.
> >>I was so convinced we've had a commit somewhere explaining this, but
> >>apparently not.
> >>
> >>The deal is that after the dispatch call we have the batch commit and
> >>there's no going back any more, which also means we can't return an error
> >>code from the ioctl. So if you return -EIO or -ENOMEM that's a pretty bad
> >>lie and you really have to ignore that error code.
> >>
> >>Again I've tried to dig up the commit for that but that was lost in the
> >>maze of the past 5 years of changes. We've had piles of older approaches
> >>to deal with this issue:
> >>- Don't even emit a request, just mark objects as gpu dirty. Only when
> >>   waiting did we emit flushes and requests, which again again gave us a
> >>   context to return the error. This resulted in horrible latency since
> >>   flushes where wait too late and also all that book-keeping was not worth
> >>   it at all. Don't ask ;-)
> >>- Emit flushes right away, but if we fail to alloc the request set the
> >>   outstanding lazy request bit. The job of the check_olr function used in
> >>   waits was to notice that and retry the allocation.
> >>- Preallocate the request, but that still leaves the possibility that the
> >>   gpu dies. But since we've committed hangcheck will clean this up and we
> >>   can just ignore the -EIO.
> >>
> >>Given all that backstory: Why does add_request/retire_commands suddenly
> >>need to fail?
> The problem is that if add_request() fails and the request is not added to
> ring->request_list then it will be lost. As soon as the execbuff code
> returns, there is no longer a request pointer floating around so it can
> can't have add_request() called on it later. Thus the request will never be
> retired, the objects, context, etc never dereferenced, and basically lots of
> stuff will be leaked. Without the OLR to hoover up the failures, the
> add_request() call really must not be allowed to give up.

That's exactly what I mean, add_request can't fail. The other issue is
that you can't fail execbuf at the point you call add_request either any
more since we've already (at least potentially) started executing the
batch.

> >It's actually worse since it's not just -EIO but also -EINTR, returned by
> >intel_ring_begin when we're thrashing the gpu a bit too badly with
> >requests. Which means we really need to guarantee that the request is
> >completed properly, eventually since it's not just for fatal gpu hangs.
> >
> >Atm that's done by only clearing outstanding_lazy_request after we've
> >really emitted the request fully. That guarantees that even when parts of
> >the request emission to the ringbuf fails we'll retry on the next wait if
> >needed.
> >
> >A possible fix to make this infallible would be to reserve some fixed
> >amount of ringbuf credit at request creation time and then consume it
> >here. Of course we'd need checks to make sure we never use more ringspace
> >than what we reserve. To avoid massive churn we could convert
> >I915_RING_FREE_SPACE into a variable and increase it enough when
> >allocating the request. And then reduce it again at the start of
> >add_request.
> >-Daniel
> 
> I don't think you can guarantee to reserve enough space at request creation
> time. You have no idea how much space will be required by what ever piece of
> code is wanting the request. It could be a few words or it might be reams
> and reams of workaround goo. One of the scheduler patches does improve this
> and do a 'large enough' ring_begin() at the start of the execbuffer
> submission path in order to prevent out of space issues and other such
> problems half way through that could lead to a partial submission. However,
> even that is not absoluetely guaranteed 100% failure proof.
> 
> How about changing add_request() so that it can't fail. As in, the cache
> flush call and the emit request call can still failure due to running out of
> ring space, but add_request() just ignores that and keeps going anyway. That
> way the request is still correctly tracked and will be retired eventually.
> The only issues are unflushed caches and no seqno interrupt being generated.
> However, if the assumption is that another request will be submitted shortly
> (which is extremely likely if the system is busy enough to cause a failure
> during add_request!) then this will be fine. The following request will
> flush the caches and write the next seqno along to the ringbuffer. When that
> pops out, both the broken request and the new one will be considered
> complete and can be retired. The only issue is if the broken request is that
> last one to be submitted and is then waited on. In that case, you will get a
> timeout/hang as the request will never complete. Although that could be
> worked around by setting a 'failed request' flag in the ring and having the
> wait code (or even the currently redundant check_olr function) look at that
> and attempt a brand new (but empty) request submission.
> 
> Or maybe a simpler solution is to just keep a 'last failed request' pointer
> in the ring. Sort of a not-quite-OLR. If add_request() fails, it saves the
> request pointer here instead of adding it to the request list. A subsequent
> request allocation call starts by checking the 'last failed' value and
> retries the add_request() call if present. At that point it is allowed to
> fail. I guess it still needs to be done by check_olr as well to prevent a
> wait from stalling if no other requests are submitted.

Imo reserving a bit of ring space for each add_request should be solid.
Userspace uses the exact same reservation logic for adding end-of-batch
workarounds. The only thing needed to make this solid is to WARN if
add_request ends up using more ring space than what we've reserved (not
just when it actually runs out, that obviously doesn't happen often
enough for testing).

Everything else just readds olr through the backdoor, which is kinda what
we wanted to avoid from an accounting pov. Because then you have again
some random request outstanding which scoops up everything it encounters.

For the ringbuf interface itself I think we only need 3 pieces:
- ringbuf_reserve(space): Ensures there's @space available in the ring and
  then sets that as ring->reserved_space. ring_free_space needs to
  subtract that ofc. This would be in the alloc_request function.

- ringbuf_use_reserve(): sets ring->reserve_space to 0 so that
  intel_ring_begin can start eating into reserves. This would be at the
  top of add_request.

- ringbuf_check_reserve(): This would be called at the end of add_request
  and WARNs if we've used more ring space than what we've promised.
  Obviously needs some book-keeping between use and check but that's just a
  detail. If you go with a flag instead of clearing ->reserve_space you
  can even enforce that these three functions are always called in this
  order and don't end up being nested wrongly.

Cheers, Daniel
John Harrison March 5, 2015, 3:06 p.m. UTC | #5
On 05/03/2015 14:44, Daniel Vetter wrote:
> On Thu, Mar 05, 2015 at 01:06:10PM +0000, John Harrison wrote:
>> On 26/02/2015 02:26, Daniel Vetter wrote:
>>> On Wed, Feb 25, 2015 at 11:17:00PM +0100, Daniel Vetter wrote:
>>>> On Fri, Feb 13, 2015 at 11:48:14AM +0000, John.C.Harrison@Intel.com wrote:
>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> For some reason, the i915_add_request() call in
>>>>> i915_gem_execbuffer_retire_commands() was explicitly having its return code
>>>>> ignored. The _retire_commands() function itself was 'void'. Given that
>>>>> _add_request() can fail without dispatching the batch buffer, this seems odd.
>>>> I was so convinced we've had a commit somewhere explaining this, but
>>>> apparently not.
>>>>
>>>> The deal is that after the dispatch call we have the batch commit and
>>>> there's no going back any more, which also means we can't return an error
>>>> code from the ioctl. So if you return -EIO or -ENOMEM that's a pretty bad
>>>> lie and you really have to ignore that error code.
>>>>
>>>> Again I've tried to dig up the commit for that but that was lost in the
>>>> maze of the past 5 years of changes. We've had piles of older approaches
>>>> to deal with this issue:
>>>> - Don't even emit a request, just mark objects as gpu dirty. Only when
>>>>    waiting did we emit flushes and requests, which again again gave us a
>>>>    context to return the error. This resulted in horrible latency since
>>>>    flushes where wait too late and also all that book-keeping was not worth
>>>>    it at all. Don't ask ;-)
>>>> - Emit flushes right away, but if we fail to alloc the request set the
>>>>    outstanding lazy request bit. The job of the check_olr function used in
>>>>    waits was to notice that and retry the allocation.
>>>> - Preallocate the request, but that still leaves the possibility that the
>>>>    gpu dies. But since we've committed hangcheck will clean this up and we
>>>>    can just ignore the -EIO.
>>>>
>>>> Given all that backstory: Why does add_request/retire_commands suddenly
>>>> need to fail?
>> The problem is that if add_request() fails and the request is not added to
>> ring->request_list then it will be lost. As soon as the execbuff code
>> returns, there is no longer a request pointer floating around so it can
>> can't have add_request() called on it later. Thus the request will never be
>> retired, the objects, context, etc never dereferenced, and basically lots of
>> stuff will be leaked. Without the OLR to hoover up the failures, the
>> add_request() call really must not be allowed to give up.
> That's exactly what I mean, add_request can't fail. The other issue is
> that you can't fail execbuf at the point you call add_request either any
> more since we've already (at least potentially) started executing the
> batch.
>
>>> It's actually worse since it's not just -EIO but also -EINTR, returned by
>>> intel_ring_begin when we're thrashing the gpu a bit too badly with
>>> requests. Which means we really need to guarantee that the request is
>>> completed properly, eventually since it's not just for fatal gpu hangs.
>>>
>>> Atm that's done by only clearing outstanding_lazy_request after we've
>>> really emitted the request fully. That guarantees that even when parts of
>>> the request emission to the ringbuf fails we'll retry on the next wait if
>>> needed.
>>>
>>> A possible fix to make this infallible would be to reserve some fixed
>>> amount of ringbuf credit at request creation time and then consume it
>>> here. Of course we'd need checks to make sure we never use more ringspace
>>> than what we reserve. To avoid massive churn we could convert
>>> I915_RING_FREE_SPACE into a variable and increase it enough when
>>> allocating the request. And then reduce it again at the start of
>>> add_request.
>>> -Daniel
>> I don't think you can guarantee to reserve enough space at request creation
>> time. You have no idea how much space will be required by what ever piece of
>> code is wanting the request. It could be a few words or it might be reams
>> and reams of workaround goo. One of the scheduler patches does improve this
>> and do a 'large enough' ring_begin() at the start of the execbuffer
>> submission path in order to prevent out of space issues and other such
>> problems half way through that could lead to a partial submission. However,
>> even that is not absoluetely guaranteed 100% failure proof.
>>
>> How about changing add_request() so that it can't fail. As in, the cache
>> flush call and the emit request call can still failure due to running out of
>> ring space, but add_request() just ignores that and keeps going anyway. That
>> way the request is still correctly tracked and will be retired eventually.
>> The only issues are unflushed caches and no seqno interrupt being generated.
>> However, if the assumption is that another request will be submitted shortly
>> (which is extremely likely if the system is busy enough to cause a failure
>> during add_request!) then this will be fine. The following request will
>> flush the caches and write the next seqno along to the ringbuffer. When that
>> pops out, both the broken request and the new one will be considered
>> complete and can be retired. The only issue is if the broken request is that
>> last one to be submitted and is then waited on. In that case, you will get a
>> timeout/hang as the request will never complete. Although that could be
>> worked around by setting a 'failed request' flag in the ring and having the
>> wait code (or even the currently redundant check_olr function) look at that
>> and attempt a brand new (but empty) request submission.
>>
>> Or maybe a simpler solution is to just keep a 'last failed request' pointer
>> in the ring. Sort of a not-quite-OLR. If add_request() fails, it saves the
>> request pointer here instead of adding it to the request list. A subsequent
>> request allocation call starts by checking the 'last failed' value and
>> retries the add_request() call if present. At that point it is allowed to
>> fail. I guess it still needs to be done by check_olr as well to prevent a
>> wait from stalling if no other requests are submitted.
> Imo reserving a bit of ring space for each add_request should be solid.
> Userspace uses the exact same reservation logic for adding end-of-batch
> workarounds. The only thing needed to make this solid is to WARN if
> add_request ends up using more ring space than what we've reserved (not
> just when it actually runs out, that obviously doesn't happen often
> enough for testing).
The problem is that there could be multiple requests being processed in 
parallel. This is especially true with the scheduler. Userland could 
submit a whole stream of batches that all get queued up in the 
scheduler. Only later do they get submitted to the hardware. The request 
must be allocated up front because there is no other means of tracking 
them. But reserving space at that point won't work because you either 
end up reserving massive amounts of space if the reserve is cumulative, 
or not enough if only one slot is reserved.

> Everything else just readds olr through the backdoor, which is kinda what
> we wanted to avoid from an accounting pov. Because then you have again
> some random request outstanding which scoops up everything it encounters.
Not quite.  The difference is that with something like an outstanding 
failed request rather than a lazy one, there is still the segregation of 
work. The failed request will be posted and added to the request list in 
its entirety before a new request is allocated and used for the new work.


> For the ringbuf interface itself I think we only need 3 pieces:
> - ringbuf_reserve(space): Ensures there's @space available in the ring and
>    then sets that as ring->reserved_space. ring_free_space needs to
>    subtract that ofc. This would be in the alloc_request function.
>
> - rinbuf_use_reserve(): sets ring->reserve_space to 0 so that
>    intel_ring_begin can start eating into reserves. This would be at the
>    top of add_request.
>
> - ringbuf_check_reserve(): This would be called at the end of add_request
>    and WARNs if we've used more ring space than what we've promised.
>    Obviously needs some boo-keeping between use and check but that's just a
>    detail. If you go with a flag instead of clearing ->reserve_space you
>    can even enforce that these three functions are always called in this
>    order and don't end up being nested wrongly.
>
> Cheers, Daniel
Daniel Vetter March 5, 2015, 4:14 p.m. UTC | #6
On Thu, Mar 05, 2015 at 03:06:42PM +0000, John Harrison wrote:
> On 05/03/2015 14:44, Daniel Vetter wrote:
> >Imo reserving a bit of ring space for each add_request should be solid.
> >Userspace uses the exact same reservation logic for adding end-of-batch
> >workarounds. The only thing needed to make this solid is to WARN if
> >add_request ends up using more ring space than what we've reserved (not
> >just when it actually runs out, that obviously doesn't happen often
> >enough for testing).
> The problem is that there could be multiple requests being processed in
> parallel. This is especially true with the scheduler. Userland could submit
> a whole stream of batches that all get queued up in the scheduler. Only
> later do they get submitted to the hardware. The request must be allocated
> up front because there is no other means of tracking them. But reserving
> space at that point won't work because you either end up reserving massive
> amounts of space if the reserve is cumulative, or not enough if only one
> slot is reserved.

At least with execlist we don't have that problem really since writing the
ringbuffer is done synchronously and directly.

For the legacy scheduler I expect that we won't do any of the ringbuf
writes directly and instead that's all done by the scheduler
asynchronously.

So this should just be an issue while we are converting to the scheduler
or on platforms that will never have one. And imo the request ringbuf
reservation is the solution with the simplest impact on the design.

> >Everything else just readds olr through the backdoor, which is kinda what
> >we wanted to avoid from an accounting pov. Because then you have again
> >some random request outstanding which scoops up everything it encounters.
> Not quite.  The difference is that with something like an outstanding failed
> request rather than a lazy one, there is still the segregation of work. The
> failed request will be posted and added to the request list in its entirety
> before a new request is allocated and used for the new work.

Well history lesson, but that's exactly how olr started out. Then the
hydra grew heads to no end. That's why I don't want to go down this road
again, since I've been on that trip the past 3 years ;-) And since your
motivation for olr light is exactly the one I provided 3 years ago to get
my patch in I think history repeating is likely.
-Daniel
John Harrison March 6, 2015, 11:38 a.m. UTC | #7
On 05/03/2015 16:14, Daniel Vetter wrote:
> On Thu, Mar 05, 2015 at 03:06:42PM +0000, John Harrison wrote:
>> On 05/03/2015 14:44, Daniel Vetter wrote:
>>> Imo reserving a bit of ring space for each add_request should be solid.
>>> Userspace uses the exact same reservation logic for adding end-of-batch
>>> workarounds. The only thing needed to make this solid is to WARN if
>>> add_request ends up using more ring space than what we've reserved (not
>>> just when it actually runs out, that obviously doesn't happen often
>>> enough for testing).
>> The problem is that there could be multiple requests being processed in
>> parallel. This is especially true with the scheduler. Userland could submit
>> a whole stream of batches that all get queued up in the scheduler. Only
>> later do they get submitted to the hardware. The request must be allocated
>> up front because there is no other means of tracking them. But reserving
>> space at that point won't work because you either end up reserving massive
>> amounts of space if the reserve is cumulative, or not enough if only one
>> slot is reserved.
> At least with execlist we don't have that problem really since writing the
> ringbuffer is done synchronously and directly.
>
> For the legacy scheduler I expect that we won't do any of the ringbuf
> writes directly and instead that's all done by the scheduler
> asynchronously.
>
> So this should just be an issue while we are converting to the scheduler
> or on platforms that will never have one. And imo the request ringbuf
> reservation is the solution with the simplest impact on the design.
I don't understand what you mean here. The scheduler decouples the 
submission of a batch buffer to the driver with the submission of that 
batch buffer to the hardware. The submission code path itself is 
identical. The scheduler does not do any hardware writes - it merely 
adds an extra layer between the user interface code and the hardware 
access code. The request creation must be done at driver submission 
time. The hardware submission could be an arbitrary amount of time 
later. In the intervening time, any number of new batch buffers may be 
submitted to the driver and thus new requests be created. This is true 
for both legacy and execlist mode.

The space reserve would have to be done at the start of the hardware 
submission process not at driver submission time. I.e. it would need to 
be a separate and new i915_gem_request_reserve_space() call at the start 
of the back half of the exec buffer code. It could not be done during 
request creation.


>>> Everything else just readds olr through the backdoor, which is kinda what
>>> we wanted to avoid from an accounting pov. Because then you have again
>>> some random request outstanding which scoops up everything it encounters.
>> Not quite.  The difference is that with something like an outstanding failed
>> request rather than a lazy one, there is still the segregation of work. The
>> failed request will be posted and added to the request list in its entirety
>> before a new request is allocated and used for the new work.
> Well history lesson, but that's exactly how olr started out. Then the
> hydra grew heads to no end. That's why I don't want to go down this road
> again, since I've been on that trip the past 3 years ;-) And since your
> motivation for olr light is exactly the one I provided 3 years ago to get
> my patch in I think history repeating is likely.
> -Daniel
Fair enough. Let's stay away from the evil hydra.

John.
Daniel Vetter March 6, 2015, 3:57 p.m. UTC | #8
On Fri, Mar 06, 2015 at 11:38:44AM +0000, John Harrison wrote:
> On 05/03/2015 16:14, Daniel Vetter wrote:
> >On Thu, Mar 05, 2015 at 03:06:42PM +0000, John Harrison wrote:
> >>On 05/03/2015 14:44, Daniel Vetter wrote:
> >>>Imo reserving a bit of ring space for each add_request should be solid.
> >>>Userspace uses the exact same reservation logic for adding end-of-batch
> >>>workarounds. The only thing needed to make this solid is to WARN if
> >>>add_request ends up using more ring space than what we've reserved (not
> >>>just when it actually runs out, that obviously doesn't happen often
> >>>enough for testing).
> >>The problem is that there could be multiple requests being processed in
> >>parallel. This is especially true with the scheduler. Userland could submit
> >>a whole stream of batches that all get queued up in the scheduler. Only
> >>later do they get submitted to the hardware. The request must be allocated
> >>up front because there is no other means of tracking them. But reserving
> >>space at that point won't work because you either end up reserving massive
> >>amounts of space if the reserve is cumulative, or not enough if only one
> >>slot is reserved.
> >At least with execlist we don't have that problem really since writing the
> >ringbuffer is done synchronously and directly.
> >
> >For the legacy scheduler I expect that we won't do any of the ringbuf
> >writes directly and instead that's all done by the scheduler
> >asynchronously.
> >
> >So this should just be an issue while we are converting to the scheduler
> >or on platforms that will never have one. And imo the request ringbuf
> >reservation is the solution with the simplest impact on the design.
> I don't understand what you mean here. The scheduler decouples the
> submission of a batch buffer to the driver with the submission of that batch
> buffer to the hardware. The submission code path itself is identical. The
> scheduler does not do any hardware writes - it merely adds an extra layer
> between the user interface code and the hardware access code. The request
> creation must be done at driver submission time. The hardware submission
> could be an arbitrary amount of time later. In the intervening time, any
> number of new batch buffers may be submitted to the driver and thus new
> requests be created. This is true for both legacy and execlist mode.
> 
> The space reserve would have to be done at the start of the hardware
> submission process not at driver submission time. I.e. it would need to be a
> separate and new i915_gem_request_reserve_space() call at the start of the
> back half of the exec buffer code. It could not be done during request
> creation.

Yeah with the scheduler we need to take out the reservation again since it
doesn't make a lot of sense - the scheduler is the safety net that
guarantees execution (barring gpu death ofc as always) when the execbuf
ioctl returns 0. Maybe in the end we need a special alloc_request function
for execbuf to only reserve when the scheduler isn't around.

That additional work is just part of the price we have to pay to never
break things while refactoring code. And since we'll get the execlist
scheduler in first that reservation code will probably stick around for a
while (or maybe forever even if we decide against upstreaming the gen7
scheduler).

Does this make sense?

Thanks, Daniel
Dave Gordon March 6, 2015, 5:40 p.m. UTC | #9
On 06/03/15 15:57, Daniel Vetter wrote:
> On Fri, Mar 06, 2015 at 11:38:44AM +0000, John Harrison wrote:
>> On 05/03/2015 16:14, Daniel Vetter wrote:
>>> On Thu, Mar 05, 2015 at 03:06:42PM +0000, John Harrison wrote:
>>>> On 05/03/2015 14:44, Daniel Vetter wrote:
>>>>> Imo reserving a bit of ring space for each add_request should be solid.
>>>>> Userspace uses the exact same reservation logic for adding end-of-batch
>>>>> workarounds. The only thing needed to make this solid is to WARN if
>>>>> add_request ends up using more ring space than what we've reserved (not
>>>>> just when it actually runs out, that obviously doesn't happen often
>>>>> enough for testing).
>>>> The problem is that there could be multiple requests being processed in
>>>> parallel. This is especially true with the scheduler. Userland could submit
>>>> a whole stream of batches that all get queued up in the scheduler. Only
>>>> later do they get submitted to the hardware. The request must be allocated
>>>> up front because there is no other means of tracking them. But reserving
>>>> space at that point won't work because you either end up reserving massive
>>>> amounts of space if the reserve is cumulative, or not enough if only one
>>>> slot is reserved.
>>> At least with execlist we don't have that problem really since writing the
>>> ringbuffer is done synchronously and directly.
>>>
>>> For the legacy scheduler I expect that we won't do any of the ringbuf
>>> writes directly and instead that's all done by the scheduler
>>> asynchronously.
>>>
>>> So this should just be an issue while we are converting to the scheduler
>>> or on platforms that will never have one. And imo the request ringbuf
>>> reservation is the solution with the simplest impact on the design.
>> I don't understand what you mean here. The scheduler decouples the
>> submission of a batch buffer to the driver with the submission of that batch
>> buffer to the hardware. The submission code path itself is identical. The
>> scheduler does not do any hardware writes - it merely adds an extra layer
>> between the user interface code and the hardware access code. The request
>> creation must be done at driver submission time. The hardware submission
>> could be an arbitrary amount of time later. In the intervening time, any
>> number of new batch buffers may be submitted to the driver and thus new
>> requests be created. This is true for both legacy and execlist mode.
>>
>> The space reserve would have to be done at the start of the hardware
>> submission process not at driver submission time. I.e. it would need to be a
>> separate and new i915_gem_request_reserve_space() call at the start of the
>> back half of the exec buffer code. It could not be done during request
>> creation.
> 
> Yeah with the scheduler we need to take out the reservation again since it
> doesn't make a lot of sense - the scheduler is the safety net that
> guarantees execution (barring gpu death ofc as always) when the execbuf
> ioctl returns 0. Maybe in the end we need a special alloc_request function
> for execbuf to only reserve when the scheduler isn't around.
> 
> That additional work is just part of the prize we have to pay to never
> break things while refactoring code. And since we'll get the execlist
> scheduler in first that reservation code will probably stick around for a
> while (or maybe forever even if we decide against upstreaming the gen7
> scheduler).
> 
> Does this make sense?
> 
> Thanks, Daniel

I already put a pre-reservation in the preemptive version of the
scheduler. It won't start to write anything in the ringbuffer unless
it's guaranteed there's enough space to write the whole set of commands
to run a batch, from preamble to finalisation. That means that none of
the ring_begin() commands can fail, and we won't leave a partial set of
commands in the ring to confuse any subsequent (successful) submissions.

And yes, I added warnings if the commands emitted to the ring overran
the amount of reserved space, so that developers would know if they'd
increased the total ringspace requirement.

If there are any other failure modes while writing to the ring, we
should simply back up to the start of the sequence (i.e. rewind the
software copy of TAIL to where it was after we checked for space -- easy
because we haven't updated h/w TAIL yet), and we can leave the request
queued for subsequent retry (or fail it as though it had been submitted
and then killed by TDR, if the failure mode suggests it's not retryable).

[aside] One version of this was posted as
http://lists.freedesktop.org/archives/intel-gfx/2014-December/057247.html but
that re-used intel_ring_begin() as a reserve-space call, whereas another
version has a separate nonblocking function intel_ring_test_space() for
prechecking available space [/aside]

.Dave.
Daniel Vetter March 9, 2015, 8:01 a.m. UTC | #10
On Fri, Mar 06, 2015 at 05:40:50PM +0000, Dave Gordon wrote:
> On 06/03/15 15:57, Daniel Vetter wrote:
> > On Fri, Mar 06, 2015 at 11:38:44AM +0000, John Harrison wrote:
> >> On 05/03/2015 16:14, Daniel Vetter wrote:
> >>> On Thu, Mar 05, 2015 at 03:06:42PM +0000, John Harrison wrote:
> >>>> On 05/03/2015 14:44, Daniel Vetter wrote:
> >>>>> Imo reserving a bit of ring space for each add_request should be solid.
> >>>>> Userspace uses the exact same reservation logic for adding end-of-batch
> >>>>> workarounds. The only thing needed to make this solid is to WARN if
> >>>>> add_request ends up using more ring space than what we've reserved (not
> >>>>> just when it actually runs out, that obviously doesn't happen often
> >>>>> enough for testing).
> >>>> The problem is that there could be multiple requests being processed in
> >>>> parallel. This is especially true with the scheduler. Userland could submit
> >>>> a whole stream of batches that all get queued up in the scheduler. Only
> >>>> later do they get submitted to the hardware. The request must be allocated
> >>>> up front because there is no other means of tracking them. But reserving
> >>>> space at that point won't work because you either end up reserving massive
> >>>> amounts of space if the reserve is cumulative, or not enough if only one
> >>>> slot is reserved.
> >>> At least with execlist we don't have that problem really since writing the
> >>> ringbuffer is done synchronously and directly.
> >>>
> >>> For the legacy scheduler I expect that we won't do any of the ringbuf
> >>> writes directly and instead that's all done by the scheduler
> >>> asynchronously.
> >>>
> >>> So this should just be an issue while we are converting to the scheduler
> >>> or on platforms that will never have one. And imo the request ringbuf
> >>> reservation is the solution with the simplest impact on the design.
> >> I don't understand what you mean here. The scheduler decouples the
> >> submission of a batch buffer to the driver with the submission of that batch
> >> buffer to the hardware. The submission code path itself is identical. The
> >> scheduler does not do any hardware writes - it merely adds an extra layer
> >> between the user interface code and the hardware access code. The request
> >> creation must be done at driver submission time. The hardware submission
> >> could be an arbitrary amount of time later. In the intervening time, any
> >> number of new batch buffers may be submitted to the driver and thus new
> >> requests be created. This is true for both legacy and execlist mode.
> >>
> >> The space reserve would have to be done at the start of the hardware
> >> submission process not at driver submission time. I.e. it would need to be a
> >> separate and new i915_gem_request_reserve_space() call at the start of the
> >> back half of the exec buffer code. It could not be done during request
> >> creation.
> > 
> > Yeah with the scheduler we need to take out the reservation again since it
> > doesn't make a lot of sense - the scheduler is the safety net that
> > guarantees execution (barring gpu death ofc as always) when the execbuf
> > ioctl returns 0. Maybe in the end we need a special alloc_request function
> > for execbuf to only reserve when the scheduler isn't around.
> > 
> > That additional work is just part of the price we have to pay to never
> > break things while refactoring code. And since we'll get the execlist
> > scheduler in first that reservation code will probably stick around for a
> > while (or maybe forever even if we decide against upstreaming the gen7
> > scheduler).
> > 
> > Does this make sense?
> > 
> > Thanks, Daniel
> 
> I already put a pre-reservation in the preemptive version of the
> scheduler. It won't start to write anything in the ringbuffer unless
> it's guaranteed there's enough space to write the whole set of commands
> to run a batch, from preamble to finalisation. That means that none of
> the ring_begin() commands can fail, and we won't leave a partial set of
> commands in the ring to confuse any subsequent (successful) submissions.
> 
> And yes, I added warnings if the commands emitted to the ring overran
> the amount of reserved space, so that developers would know if they'd
> increased the total ringspace requirement.
> 
> If there are any other failure modes while writing to the ring, we
> should simply back up to the start of the sequence (i.e. rewind the
> software copy of TAIL to where it was after we checked for space -- easy
> because we haven't updated h/w TAIL yet), and we can leave the request
> queued for subsequent retry (or fail it as though it had been submitted
> and then killed by TDR, if the failure mode suggests it's not retryable).

Sounds good.

> [aside] One version of this was posted as
> http://lists.freedesktop.org/archives/intel-gfx/2014-December/057247.html but
> that re-used intel_ring_begin() as a reserve-space call, whereas another
> version has a separate nonblocking function intel_ring_test_space() for
> prechecking available space [/aside]

I prefer the explicit reservation over allowing nesting. Allowing nesting
gives us a lot more rope than we need I think, so much room for "creative
abuse" (i.e. really hard to understand bugs).
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e6d616b..143bc63 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2624,10 +2624,7 @@  int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 			     struct drm_file *file_priv);
 void i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 					struct intel_engine_cs *ring);
-void i915_gem_execbuffer_retire_commands(struct drm_device *dev,
-					 struct drm_file *file,
-					 struct intel_engine_cs *ring,
-					 struct drm_i915_gem_object *obj);
+int i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params);
 int i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
 				   struct drm_i915_gem_execbuffer2 *args,
 				   struct list_head *vmas);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 93b0ef0..ca85803 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -989,17 +989,15 @@  i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 	}
 }
 
-void
-i915_gem_execbuffer_retire_commands(struct drm_device *dev,
-				    struct drm_file *file,
-				    struct intel_engine_cs *ring,
-				    struct drm_i915_gem_object *obj)
+int
+i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params)
 {
 	/* Unconditionally force add_request to emit a full flush. */
-	ring->gpu_caches_dirty = true;
+	params->ring->gpu_caches_dirty = true;
 
 	/* Add a breadcrumb for the completion of the batch buffer */
-	(void)__i915_add_request(ring, file, obj);
+	return __i915_add_request(params->ring, params->file,
+				  params->batch_obj);
 }
 
 static int
@@ -1282,8 +1280,8 @@  i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, ring);
-	i915_gem_execbuffer_retire_commands(params->dev, params->file, ring,
-					    params->batch_obj);
+
+	ret = i915_gem_execbuffer_retire_commands(params);
 
 error:
 	kfree(cliprects);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ca29290..90400d0d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -706,9 +706,8 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, ring);
-	i915_gem_execbuffer_retire_commands(params->dev, params->file, ring, params->batch_obj);
 
-	return 0;
+	return i915_gem_execbuffer_retire_commands(params);
 }
 
 void intel_execlists_retire_requests(struct intel_engine_cs *ring)