
[07/32] drm/i915: Store the reset counter when constructing a request

Message ID 1449833608-22125-8-git-send-email-chris@chris-wilson.co.uk
State New, archived

Commit Message

Chris Wilson Dec. 11, 2015, 11:33 a.m. UTC
As the request is only valid during the same global reset epoch, we can
record the current reset_counter when constructing the request and reuse
it when waiting upon that request in future. This removes a very hairy
atomic check serialised by the struct_mutex at the time of waiting and
allows us to transfer those waits to a central dispatcher for all
waiters and all requests.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h         |  2 +-
 drivers/gpu/drm/i915/i915_gem.c         | 40 +++++++++++----------------------
 drivers/gpu/drm/i915/intel_display.c    |  7 +-----
 drivers/gpu/drm/i915/intel_lrc.c        |  7 ------
 drivers/gpu/drm/i915/intel_ringbuffer.c |  6 -----
 5 files changed, 15 insertions(+), 47 deletions(-)

Comments

Daniel Vetter Dec. 16, 2015, 9:44 a.m. UTC | #1
On Fri, Dec 11, 2015 at 11:33:03AM +0000, Chris Wilson wrote:
> As the request is only valid during the same global reset epoch, we can
> record the current reset_counter when constructing the request and reuse
> it when waiting upon that request in future. This removes a very hairy
> atomic check serialised by the struct_mutex at the time of waiting and
> allows us to transfer those waits to a central dispatcher for all
> waiters and all requests.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>

"global reset epoch" got me thinking about what the implications for TDR
are. Now the current TDR patches iirc are still somewhat tacked on the
side of the existing reset handling, and so don't end up incrementing the
reset counter.

But I don't agree with that design decision, and I think any reset must
increment the counter (and do the reset-in-progress flag dance), since
that's the only way to guarantee that waiters will get off locks. Even
though we have fewer of those with every kernel release.

But then there's the problem that at least some requests will still be
valid after the reset, and those would break with this design. That can
easily be fixed by incrementing the reset epoch for every request we
decide should keep running (or which gets rescheduled/re-emitted for
another attempt), and doing that explicitly seems like a good idea.
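
Something like this, say (pure sketch with a hypothetical helper,
handwaving all the locking):

	/* Hypothetical TDR helper: re-stamp the requests we decide
	 * should keep running with the post-reset counter, so the epoch
	 * check in __i915_wait_request() keeps passing for them. The
	 * guilty request stays in the old epoch and sees -EAGAIN/-EIO
	 * as before.
	 */
	static void revalidate_requests(struct intel_engine_cs *ring,
					struct drm_i915_gem_request *guilty)
	{
		struct drm_i915_private *dev_priv = to_i915(ring->dev);
		struct drm_i915_gem_request *req;

		list_for_each_entry(req, &ring->request_list, list) {
			if (req == guilty)
				continue;
			req->reset_counter =
				i915_reset_counter(&dev_priv->gpu_error);
		}
	}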

The only implication I see is that for per-ring reset we might want to go
to a per-engine reset epoch counter.

So given all that I think this is a solid idea. But one comment below as a
fallout.

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c         | 40 +++++++++++----------------------
>  drivers/gpu/drm/i915/intel_display.c    |  7 +-----
>  drivers/gpu/drm/i915/intel_lrc.c        |  7 ------
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  6 -----
>  5 files changed, 15 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 1043ddd670a5..f30c305a6889 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2178,6 +2178,7 @@ struct drm_i915_gem_request {
>  	/** On Which ring this request was generated */
>  	struct drm_i915_private *i915;
>  	struct intel_engine_cs *ring;
> +	unsigned reset_counter;
>  
>  	 /** GEM sequence number associated with the previous request,
>  	  * when the HWS breadcrumb is equal to this the GPU is processing
> @@ -3059,7 +3060,6 @@ void __i915_add_request(struct drm_i915_gem_request *req,
>  #define i915_add_request_no_flush(req) \
>  	__i915_add_request(req, NULL, false)
>  int __i915_wait_request(struct drm_i915_gem_request *req,
> -			unsigned reset_counter,
>  			bool interruptible,
>  			s64 *timeout,
>  			struct intel_rps_client *rps);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 27e617b76418..b17cc0e42a4f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1214,7 +1214,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>  /**
>   * __i915_wait_request - wait until execution of request has finished
>   * @req: duh!
> - * @reset_counter: reset sequence associated with the given request
>   * @interruptible: do an interruptible wait (normally yes)
>   * @timeout: in - how long to wait (NULL forever); out - how much time remaining
>   *
> @@ -1229,7 +1228,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   * errno with remaining time filled in timeout argument.
>   */
>  int __i915_wait_request(struct drm_i915_gem_request *req,
> -			unsigned reset_counter,
>  			bool interruptible,
>  			s64 *timeout,
>  			struct intel_rps_client *rps)
> @@ -1288,7 +1286,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  
>  		/* We need to check whether any gpu reset happened in between
>  		 * the caller grabbing the seqno and now ... */
> -		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> +		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {

READ_ONCE(req->reset) as a future proofing maybe for TDR? Or punt that to
TDR? Could be confusing to READ_ONCE something that's (with the current
code at least) invariant.
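
I.e., sketching it (the field here being req->reset_counter):

		if (READ_ONCE(req->reset_counter) !=
		    i915_reset_counter(&dev_priv->gpu_error)) {

so that, once TDR starts rewriting the counter on resubmission, the
waiter is guaranteed to reload it on each pass through the wait loop.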

So either way: Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>


>  			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
>  			 * is truely gone. */
>  			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> @@ -1461,13 +1459,7 @@ i915_wait_request(struct drm_i915_gem_request *req)
>  
>  	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -	if (ret)
> -		return ret;
> -
> -	ret = __i915_wait_request(req,
> -				  i915_reset_counter(&dev_priv->gpu_error),
> -				  interruptible, NULL, NULL);
> +	ret = __i915_wait_request(req, interruptible, NULL, NULL);
>  	if (ret)
>  		return ret;
>  
> @@ -1542,7 +1534,6 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
>  	struct drm_device *dev = obj->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct drm_i915_gem_request *requests[I915_NUM_RINGS];
> -	unsigned reset_counter;
>  	int ret, i, n = 0;
>  
>  	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
> @@ -1551,12 +1542,6 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
>  	if (!obj->active)
>  		return 0;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
> -	if (ret)
> -		return ret;
> -
> -	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> -
>  	if (readonly) {
>  		struct drm_i915_gem_request *req;
>  
> @@ -1578,9 +1563,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
>  	}
>  
>  	mutex_unlock(&dev->struct_mutex);
> +	ret = 0;
>  	for (i = 0; ret == 0 && i < n; i++)
> -		ret = __i915_wait_request(requests[i], reset_counter, true,
> -					  NULL, rps);
> +		ret = __i915_wait_request(requests[i], true, NULL, rps);
>  	mutex_lock(&dev->struct_mutex);
>  
>  	for (i = 0; i < n; i++) {
> @@ -2684,6 +2669,7 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>  			   struct drm_i915_gem_request **req_out)
>  {
>  	struct drm_i915_private *dev_priv = to_i915(ring->dev);
> +	unsigned reset_counter = i915_reset_counter(&dev_priv->gpu_error);
>  	struct drm_i915_gem_request *req;
>  	int ret;
>  
> @@ -2692,6 +2678,11 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>  
>  	*req_out = NULL;
>  
> +	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> +				   dev_priv->mm.interruptible);
> +	if (ret)
> +		return ret;
> +
>  	req = kmem_cache_zalloc(dev_priv->requests, GFP_KERNEL);
>  	if (req == NULL)
>  		return -ENOMEM;
> @@ -2703,6 +2694,7 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>  	kref_init(&req->ref);
>  	req->i915 = dev_priv;
>  	req->ring = ring;
> +	req->reset_counter = reset_counter;
>  	req->ctx  = ctx;
>  	i915_gem_context_reference(req->ctx);
>  
> @@ -3067,11 +3059,9 @@ retire:
>  int
>  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  {
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct drm_i915_gem_wait *args = data;
>  	struct drm_i915_gem_object *obj;
>  	struct drm_i915_gem_request *req[I915_NUM_RINGS];
> -	unsigned reset_counter;
>  	int i, n = 0;
>  	int ret;
>  
> @@ -3105,7 +3095,6 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	}
>  
>  	drm_gem_object_unreference(&obj->base);
> -	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
>  
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		if (obj->last_read_req[i] == NULL)
> @@ -3118,7 +3107,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  
>  	for (i = 0; i < n; i++) {
>  		if (ret == 0)
> -			ret = __i915_wait_request(req[i], reset_counter, true,
> +			ret = __i915_wait_request(req[i], true,
>  						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
>  						  to_rps_client(file));
>  		i915_gem_request_unreference__unlocked(req[i]);
> @@ -3150,7 +3139,6 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj,
>  	if (!i915_semaphore_is_enabled(obj->base.dev)) {
>  		struct drm_i915_private *i915 = to_i915(obj->base.dev);
>  		ret = __i915_wait_request(from_req,
> -					  i915_reset_counter(&i915->gpu_error),
>  					  i915->mm.interruptible,
>  					  NULL,
>  					  &i915->rps.semaphores);
> @@ -4099,7 +4087,6 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  	struct drm_i915_file_private *file_priv = file->driver_priv;
>  	unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
>  	struct drm_i915_gem_request *request, *target = NULL;
> -	unsigned reset_counter;
>  	int ret;
>  
>  	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
> @@ -4124,7 +4111,6 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  
>  		target = request;
>  	}
> -	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
>  	if (target)
>  		i915_gem_request_reference(target);
>  	spin_unlock(&file_priv->mm.lock);
> @@ -4132,7 +4118,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  	if (target == NULL)
>  		return 0;
>  
> -	ret = __i915_wait_request(target, reset_counter, true, NULL, NULL);
> +	ret = __i915_wait_request(target, true, NULL, NULL);
>  	if (ret == 0)
>  		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
>  
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 8b6028cd619f..d59beca928b7 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -11394,7 +11394,6 @@ static void intel_mmio_flip_work_func(struct work_struct *work)
>  
>  	if (mmio_flip->req) {
>  		WARN_ON(__i915_wait_request(mmio_flip->req,
> -					    mmio_flip->crtc->reset_counter,
>  					    false, NULL,
>  					    &mmio_flip->i915->rps.mmioflips));
>  		i915_gem_request_unreference__unlocked(mmio_flip->req);
> @@ -13395,9 +13394,6 @@ static int intel_atomic_prepare_commit(struct drm_device *dev,
>  
>  	ret = drm_atomic_helper_prepare_planes(dev, state);
>  	if (!ret && !async && !i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
> -		u32 reset_counter;
> -
> -		reset_counter = i915_reset_counter(&dev_priv->gpu_error);
>  		mutex_unlock(&dev->struct_mutex);
>  
>  		for_each_plane_in_state(state, plane, plane_state, i) {
> @@ -13408,8 +13404,7 @@ static int intel_atomic_prepare_commit(struct drm_device *dev,
>  				continue;
>  
>  			ret = __i915_wait_request(intel_plane_state->wait_req,
> -						  reset_counter, true,
> -						  NULL, NULL);
> +						  true, NULL, NULL);
>  
>  			/* Swallow -EIO errors to allow updates during hw lockup. */
>  			if (ret == -EIO)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 86404b03dc91..69b298eed69d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -819,16 +819,9 @@ static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
>   */
>  int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
>  {
> -	struct drm_i915_private *dev_priv;
>  	int ret;
>  
>  	WARN_ON(req == NULL);
> -	dev_priv = req->ring->dev->dev_private;
> -
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> -				   dev_priv->mm.interruptible);
> -	if (ret)
> -		return ret;
>  
>  	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index d71fa77055f9..942f86b316c2 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2274,7 +2274,6 @@ int intel_ring_idle(struct intel_engine_cs *ring)
>  
>  	/* Make sure we do not trigger any retires */
>  	return __i915_wait_request(req,
> -				   i915_reset_counter(&to_i915(ring->dev)->gpu_error),
>  				   to_i915(ring->dev)->mm.interruptible,
>  				   NULL, NULL);
>  }
> @@ -2405,11 +2404,6 @@ int intel_ring_begin(struct drm_i915_gem_request *req,
>  	ring = req->ring;
>  	dev_priv = ring->dev->dev_private;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> -				   dev_priv->mm.interruptible);
> -	if (ret)
> -		return ret;
> -
>  	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
>  	if (ret)
>  		return ret;
> -- 
> 2.6.3
>
Chris Wilson Dec. 16, 2015, 10:19 a.m. UTC | #2
On Wed, Dec 16, 2015 at 10:44:19AM +0100, Daniel Vetter wrote:
> On Fri, Dec 11, 2015 at 11:33:03AM +0000, Chris Wilson wrote:
> > As the request is only valid during the same global reset epoch, we can
> > record the current reset_counter when constructing the request and reuse
> > it when waiting upon that request in future. This removes a very hairy
> > atomic check serialised by the struct_mutex at the time of waiting and
> > allows us to transfer those waits to a central dispatcher for all
> > waiters and all requests.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> "global reset epoch" got me thinking about what the implications for TDR
> are. Now the current TDR patches iirc are still somewhat tacked on the
> side of the existing reset handling, and so don't end up incrementing the
> reset counter.
> 
> But I don't agree with that design decision, and I think any reset must
> increment the counter (and do the reset-in-progress flag dance), since
> that's the only way to guarantee that waiters will get off locks. Even
> though we have fewer of those with every kernel release.
> 
> But then there's the problem that at least some requests will still be
> valid after the reset, and those would break with this design. That can
> easily be fixed by incrementing the reset epoch for every request we
> decide should keep running (or which gets rescheduled/re-emitted for
> another attempt), and doing that explicitly seems like a good idea.
> 
> The only implication I see is that for per-ring reset we might want to go
> to a per-engine reset epoch counter.

Yes, I think per-engine epochs will be required. (I've been trying to
choose my words carefully so that it is clear that it only applies to
today and not in a TDR future!)
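
Something like this, perhaps (pure sketch for a hypothetical TDR
future, none of this exists today):

	struct intel_engine_cs {
		/* ... existing members ... */
		unsigned reset_counter; /* bumped on per-engine reset */
	};

	/* on request construction */
	req->reset_counter = req->ring->reset_counter;

	/* in __i915_wait_request(), compare against the engine rather
	 * than the device */
	if (req->reset_counter != READ_ONCE(req->ring->reset_counter))
		return -EAGAIN;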

> So given all that I think this is a solid idea. But one comment below as a
> fallout.
> 
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |  2 +-
> >  drivers/gpu/drm/i915/i915_gem.c         | 40 +++++++++++----------------------
> >  drivers/gpu/drm/i915/intel_display.c    |  7 +-----
> >  drivers/gpu/drm/i915/intel_lrc.c        |  7 ------
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |  6 -----
> >  5 files changed, 15 insertions(+), 47 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 1043ddd670a5..f30c305a6889 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2178,6 +2178,7 @@ struct drm_i915_gem_request {
> >  	/** On Which ring this request was generated */
> >  	struct drm_i915_private *i915;
> >  	struct intel_engine_cs *ring;
> > +	unsigned reset_counter;
> >  
> >  	 /** GEM sequence number associated with the previous request,
> >  	  * when the HWS breadcrumb is equal to this the GPU is processing
> > @@ -3059,7 +3060,6 @@ void __i915_add_request(struct drm_i915_gem_request *req,
> >  #define i915_add_request_no_flush(req) \
> >  	__i915_add_request(req, NULL, false)
> >  int __i915_wait_request(struct drm_i915_gem_request *req,
> > -			unsigned reset_counter,
> >  			bool interruptible,
> >  			s64 *timeout,
> >  			struct intel_rps_client *rps);
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 27e617b76418..b17cc0e42a4f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1214,7 +1214,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >  /**
> >   * __i915_wait_request - wait until execution of request has finished
> >   * @req: duh!
> > - * @reset_counter: reset sequence associated with the given request
> >   * @interruptible: do an interruptible wait (normally yes)
> >   * @timeout: in - how long to wait (NULL forever); out - how much time remaining
> >   *
> > @@ -1229,7 +1228,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >   * errno with remaining time filled in timeout argument.
> >   */
> >  int __i915_wait_request(struct drm_i915_gem_request *req,
> > -			unsigned reset_counter,
> >  			bool interruptible,
> >  			s64 *timeout,
> >  			struct intel_rps_client *rps)
> > @@ -1288,7 +1286,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
> >  
> >  		/* We need to check whether any gpu reset happened in between
> >  		 * the caller grabbing the seqno and now ... */
> > -		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> > +		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> 
> READ_ONCE(req->reset) as a future proofing maybe for TDR? Or punt that to
> TDR? Could be confusing to READ_ONCE something that's (with the current
> code at least) invariant.

Right, my plan was to solve this in TDR! At the moment, I am thinking
that we regenerate a new request for each one resubmitted (that requires
patching the seqno embedded into the ring before restarting the ring),
but otherwise we can just duplicate the contents of the request.
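
Roughly (a sketch with lots of handwaving, reusing the
i915_gem_request_alloc() from this patch):

	static struct drm_i915_gem_request *
	resubmit_request(struct drm_i915_gem_request *old)
	{
		struct drm_i915_gem_request *req;
		int ret;

		/* The new request picks up the current (post-reset)
		 * reset_counter inside i915_gem_request_alloc().
		 */
		ret = i915_gem_request_alloc(old->ring, old->ctx, &req);
		if (ret)
			return ERR_PTR(ret);

		/* Duplicate the old payload; the seqno embedded in the
		 * ring still has to be patched before restarting it.
		 */
		req->batch_obj = old->batch_obj;
		return req;
	}
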
-Chris
Dave Gordon Jan. 4, 2016, 3:58 p.m. UTC | #3
On 16/12/15 10:19, Chris Wilson wrote:
> On Wed, Dec 16, 2015 at 10:44:19AM +0100, Daniel Vetter wrote:
>> On Fri, Dec 11, 2015 at 11:33:03AM +0000, Chris Wilson wrote:
>>> As the request is only valid during the same global reset epoch, we can
>>> record the current reset_counter when constructing the request and reuse
>>> it when waiting upon that request in future. This removes a very hairy
>>> atomic check serialised by the struct_mutex at the time of waiting and
>>> allows us to transfer those waits to a central dispatcher for all
>>> waiters and all requests.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>
>> "global reset epoch" got me thinking about what the implications for TDR
>> are. Now the current TDR patches iirc are still somewhat tacked on the
>> side of the existing reset handling, and so don't end up incrementing the
>> reset counter.
>>
>> But I don't agree with that design decision, and I think any reset must
>> increment the counter (and do the reset-in-progress flag dance), since
>> that's the only way to guarantee that waiters will get off locks. Even
>> though we have fewer of those with every kernel release.
>>
>> But then there's the problem that at least some requests will still be
>> valid after the reset, and those would break with this design. That can
>> easily be fixed by incrementing the reset epoch for every request we
>> decide should keep running (or which gets rescheduled/re-emitted for
>> another attempt), and doing that explicitly seems like a good idea.
>>
>> The only implication I see is that for per-ring reset we might want to go
>> to a per-engine reset epoch counter.
>
> Yes, I think per-engine epochs will be required. (I've been trying to
> choose my words carefully so that it is clear that it only applies to
> today and not in a TDR future!)
>
>> So given all that I think this is a solid idea. But one comment below as a
>> fallout.
>>
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h         |  2 +-
>>>   drivers/gpu/drm/i915/i915_gem.c         | 40 +++++++++++----------------------
>>>   drivers/gpu/drm/i915/intel_display.c    |  7 +-----
>>>   drivers/gpu/drm/i915/intel_lrc.c        |  7 ------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c |  6 -----
>>>   5 files changed, 15 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 1043ddd670a5..f30c305a6889 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -2178,6 +2178,7 @@ struct drm_i915_gem_request {
>>>   	/** On Which ring this request was generated */
>>>   	struct drm_i915_private *i915;
>>>   	struct intel_engine_cs *ring;
>>> +	unsigned reset_counter;
>>>
>>>   	 /** GEM sequence number associated with the previous request,
>>>   	  * when the HWS breadcrumb is equal to this the GPU is processing
>>> @@ -3059,7 +3060,6 @@ void __i915_add_request(struct drm_i915_gem_request *req,
>>>   #define i915_add_request_no_flush(req) \
>>>   	__i915_add_request(req, NULL, false)
>>>   int __i915_wait_request(struct drm_i915_gem_request *req,
>>> -			unsigned reset_counter,
>>>   			bool interruptible,
>>>   			s64 *timeout,
>>>   			struct intel_rps_client *rps);
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 27e617b76418..b17cc0e42a4f 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -1214,7 +1214,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>   /**
>>>    * __i915_wait_request - wait until execution of request has finished
>>>    * @req: duh!
>>> - * @reset_counter: reset sequence associated with the given request
>>>    * @interruptible: do an interruptible wait (normally yes)
>>>    * @timeout: in - how long to wait (NULL forever); out - how much time remaining
>>>    *
>>> @@ -1229,7 +1228,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>    * errno with remaining time filled in timeout argument.
>>>    */
>>>   int __i915_wait_request(struct drm_i915_gem_request *req,
>>> -			unsigned reset_counter,
>>>   			bool interruptible,
>>>   			s64 *timeout,
>>>   			struct intel_rps_client *rps)
>>> @@ -1288,7 +1286,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>>>
>>>   		/* We need to check whether any gpu reset happened in between
>>>   		 * the caller grabbing the seqno and now ... */
>>> -		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
>>> +		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
>>
>> READ_ONCE(req->reset) as a future proofing maybe for TDR? Or punt that to
>> TDR? Could be confusing to READ_ONCE something that's (with the current
>> code at least) invariant.
>
> Right, my plan was to solve this in TDR! At the moment, I am thinking
> that we regenerate a new request for each one resubmitted (that requires
> patching the seqno embedded into the ring before restarting the ring),
> but otherwise we can just duplicate the contents of the request.
> -Chris

With the scheduler, there's no need to patch the contents of the 
ringbuffer; we just reset it (so it's empty) and then have the scheduler 
re-emit the requests, starting after the one that hung.
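
Sketching it (scheduler_reemit() standing in for whatever hook the
scheduler actually exposes):

	/* Hypothetical recovery: the ring has been reset to empty, so
	 * replay everything that was queued behind the hang.
	 */
	list_for_each_entry(req, &ring->request_list, list) {
		if (i915_gem_request_completed(req, true))
			continue;	/* already finished on the GPU */
		if (req == hung_req)
			continue;	/* don't replay the guilty one */
		scheduler_reemit(req);	/* hypothetical scheduler hook */
	}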

.Dave.
Chris Wilson Jan. 4, 2016, 4:10 p.m. UTC | #4
On Mon, Jan 04, 2016 at 03:58:21PM +0000, Dave Gordon wrote:
> On 16/12/15 10:19, Chris Wilson wrote:
> >On Wed, Dec 16, 2015 at 10:44:19AM +0100, Daniel Vetter wrote:
> >>On Fri, Dec 11, 2015 at 11:33:03AM +0000, Chris Wilson wrote:
> >>>@@ -1288,7 +1286,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
> >>>
> >>>  		/* We need to check whether any gpu reset happened in between
> >>>  		 * the caller grabbing the seqno and now ... */
> >>>-		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> >>>+		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> >>
> >>READ_ONCE(req->reset) as a future proofing maybe for TDR? Or punt that to
> >>TDR? Could be confusing to READ_ONCE something that's (with the current
> >>code at least) invariant.
> >
> >Right, my plan was to solve this in TDR! At the moment, I am thinking
> >that we regenerate a new request for each one resubmitted (that requires
> >patching the seqno embedded into the ring before restarting the ring),
> >but otherwise we can just duplicate the contents of the request.
> 
> With the scheduler, there's no need to patch the contents of the
> ringbuffer; we just reset it (so it's empty) and then have the
> scheduler re-emit the requests, starting after the one that hung.

Which breaks __i915_wait_request, since it has already noticed that the
reset counter has changed. Hence the discussion.
-Chris
Dave Gordon Jan. 4, 2016, 5:57 p.m. UTC | #5
On 04/01/16 16:10, Chris Wilson wrote:
> On Mon, Jan 04, 2016 at 03:58:21PM +0000, Dave Gordon wrote:
>> On 16/12/15 10:19, Chris Wilson wrote:
>>> On Wed, Dec 16, 2015 at 10:44:19AM +0100, Daniel Vetter wrote:
>>>> On Fri, Dec 11, 2015 at 11:33:03AM +0000, Chris Wilson wrote:
>>>>> @@ -1288,7 +1286,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>>>>>
>>>>>   		/* We need to check whether any gpu reset happened in between
>>>>>   		 * the caller grabbing the seqno and now ... */
>>>>> -		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
>>>>> +		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
>>>>
>>>> READ_ONCE(req->reset) as a future proofing maybe for TDR? Or punt that to
>>>> TDR? Could be confusing to READ_ONCE something that's (with the current
>>>> code at least) invariant.
>>>
>>> Right, my plan was to solve this in TDR! At the moment, I am thinking
>>> that we regenerate a new request for each one resubmitted (that requires
>>> patching the seqno embedded into the ring before restarting the ring),
>>> but otherwise we can just duplicate the contents of the request.
>>
>> With the scheduler, there's no need to patch the contents of the
>> ringbuffer; we just reset it (so it's empty) and then have the
>> scheduler re-emit the requests, starting after the one that hung.
>
> Which breaks __i915_wait_request, since it has already noticed that the
> reset counter has changed. Hence the discussion.
> -Chris

No, I'm saying that it's simpler for TDR to update the requests that are 
going to be re-emitted into an empty ring, rather than patch up the 
ringbuffer contents. Then by the time those requests reach 
__i915_wait_request() they will be seen as within the "current reset 
epoch" and proceed normally. I don't think there's any need to generate 
any new (duplicate) requests.
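
In code terms that's just one extra line ahead of the re-emission
(same hypothetical hook as before):

	/* Move the request into the current reset epoch before it is
	 * re-emitted, so a concurrent __i915_wait_request() stops
	 * treating it as a casualty of the reset.
	 */
	req->reset_counter = i915_reset_counter(&dev_priv->gpu_error);
	scheduler_reemit(req);	/* hypothetical scheduler hook */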

But maybe you can't take this approach without the scheduler ...

.Dave.

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1043ddd670a5..f30c305a6889 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2178,6 +2178,7 @@  struct drm_i915_gem_request {
 	/** On Which ring this request was generated */
 	struct drm_i915_private *i915;
 	struct intel_engine_cs *ring;
+	unsigned reset_counter;
 
 	 /** GEM sequence number associated with the previous request,
 	  * when the HWS breadcrumb is equal to this the GPU is processing
@@ -3059,7 +3060,6 @@  void __i915_add_request(struct drm_i915_gem_request *req,
 #define i915_add_request_no_flush(req) \
 	__i915_add_request(req, NULL, false)
 int __i915_wait_request(struct drm_i915_gem_request *req,
-			unsigned reset_counter,
 			bool interruptible,
 			s64 *timeout,
 			struct intel_rps_client *rps);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 27e617b76418..b17cc0e42a4f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1214,7 +1214,6 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 /**
  * __i915_wait_request - wait until execution of request has finished
  * @req: duh!
- * @reset_counter: reset sequence associated with the given request
  * @interruptible: do an interruptible wait (normally yes)
  * @timeout: in - how long to wait (NULL forever); out - how much time remaining
  *
@@ -1229,7 +1228,6 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
  * errno with remaining time filled in timeout argument.
  */
 int __i915_wait_request(struct drm_i915_gem_request *req,
-			unsigned reset_counter,
 			bool interruptible,
 			s64 *timeout,
 			struct intel_rps_client *rps)
@@ -1288,7 +1286,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
-		if (reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
+		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
 			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
 			 * is truely gone. */
 			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
@@ -1461,13 +1459,7 @@  i915_wait_request(struct drm_i915_gem_request *req)
 
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-	if (ret)
-		return ret;
-
-	ret = __i915_wait_request(req,
-				  i915_reset_counter(&dev_priv->gpu_error),
-				  interruptible, NULL, NULL);
+	ret = __i915_wait_request(req, interruptible, NULL, NULL);
 	if (ret)
 		return ret;
 
@@ -1542,7 +1534,6 @@  i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_request *requests[I915_NUM_RINGS];
-	unsigned reset_counter;
 	int ret, i, n = 0;
 
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
@@ -1551,12 +1542,6 @@  i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	if (!obj->active)
 		return 0;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
-	if (ret)
-		return ret;
-
-	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
-
 	if (readonly) {
 		struct drm_i915_gem_request *req;
 
@@ -1578,9 +1563,9 @@  i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	}
 
 	mutex_unlock(&dev->struct_mutex);
+	ret = 0;
 	for (i = 0; ret == 0 && i < n; i++)
-		ret = __i915_wait_request(requests[i], reset_counter, true,
-					  NULL, rps);
+		ret = __i915_wait_request(requests[i], true, NULL, rps);
 	mutex_lock(&dev->struct_mutex);
 
 	for (i = 0; i < n; i++) {
@@ -2684,6 +2669,7 @@  int i915_gem_request_alloc(struct intel_engine_cs *ring,
 			   struct drm_i915_gem_request **req_out)
 {
 	struct drm_i915_private *dev_priv = to_i915(ring->dev);
+	unsigned reset_counter = i915_reset_counter(&dev_priv->gpu_error);
 	struct drm_i915_gem_request *req;
 	int ret;
 
@@ -2692,6 +2678,11 @@  int i915_gem_request_alloc(struct intel_engine_cs *ring,
 
 	*req_out = NULL;
 
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+				   dev_priv->mm.interruptible);
+	if (ret)
+		return ret;
+
 	req = kmem_cache_zalloc(dev_priv->requests, GFP_KERNEL);
 	if (req == NULL)
 		return -ENOMEM;
@@ -2703,6 +2694,7 @@  int i915_gem_request_alloc(struct intel_engine_cs *ring,
 	kref_init(&req->ref);
 	req->i915 = dev_priv;
 	req->ring = ring;
+	req->reset_counter = reset_counter;
 	req->ctx  = ctx;
 	i915_gem_context_reference(req->ctx);
 
@@ -3067,11 +3059,9 @@  retire:
 int
 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_wait *args = data;
 	struct drm_i915_gem_object *obj;
 	struct drm_i915_gem_request *req[I915_NUM_RINGS];
-	unsigned reset_counter;
 	int i, n = 0;
 	int ret;
 
@@ -3105,7 +3095,6 @@  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	}
 
 	drm_gem_object_unreference(&obj->base);
-	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		if (obj->last_read_req[i] == NULL)
@@ -3118,7 +3107,7 @@  i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	for (i = 0; i < n; i++) {
 		if (ret == 0)
-			ret = __i915_wait_request(req[i], reset_counter, true,
+			ret = __i915_wait_request(req[i], true,
 						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
 						  to_rps_client(file));
 		i915_gem_request_unreference__unlocked(req[i]);
@@ -3150,7 +3139,6 @@  __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 	if (!i915_semaphore_is_enabled(obj->base.dev)) {
 		struct drm_i915_private *i915 = to_i915(obj->base.dev);
 		ret = __i915_wait_request(from_req,
-					  i915_reset_counter(&i915->gpu_error),
 					  i915->mm.interruptible,
 					  NULL,
 					  &i915->rps.semaphores);
@@ -4099,7 +4087,6 @@  i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
 	struct drm_i915_gem_request *request, *target = NULL;
-	unsigned reset_counter;
 	int ret;
 
 	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
@@ -4124,7 +4111,6 @@  i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 
 		target = request;
 	}
-	reset_counter = i915_reset_counter(&dev_priv->gpu_error);
 	if (target)
 		i915_gem_request_reference(target);
 	spin_unlock(&file_priv->mm.lock);
@@ -4132,7 +4118,7 @@  i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (target == NULL)
 		return 0;
 
-	ret = __i915_wait_request(target, reset_counter, true, NULL, NULL);
+	ret = __i915_wait_request(target, true, NULL, NULL);
 	if (ret == 0)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 8b6028cd619f..d59beca928b7 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11394,7 +11394,6 @@  static void intel_mmio_flip_work_func(struct work_struct *work)
 
 	if (mmio_flip->req) {
 		WARN_ON(__i915_wait_request(mmio_flip->req,
-					    mmio_flip->crtc->reset_counter,
 					    false, NULL,
 					    &mmio_flip->i915->rps.mmioflips));
 		i915_gem_request_unreference__unlocked(mmio_flip->req);
@@ -13395,9 +13394,6 @@  static int intel_atomic_prepare_commit(struct drm_device *dev,
 
 	ret = drm_atomic_helper_prepare_planes(dev, state);
 	if (!ret && !async && !i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
-		u32 reset_counter;
-
-		reset_counter = i915_reset_counter(&dev_priv->gpu_error);
 		mutex_unlock(&dev->struct_mutex);
 
 		for_each_plane_in_state(state, plane, plane_state, i) {
@@ -13408,8 +13404,7 @@  static int intel_atomic_prepare_commit(struct drm_device *dev,
 				continue;
 
 			ret = __i915_wait_request(intel_plane_state->wait_req,
-						  reset_counter, true,
-						  NULL, NULL);
+						  true, NULL, NULL);
 
 			/* Swallow -EIO errors to allow updates during hw lockup. */
 			if (ret == -EIO)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 86404b03dc91..69b298eed69d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -819,16 +819,9 @@  static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
  */
 int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
-	struct drm_i915_private *dev_priv;
 	int ret;
 
 	WARN_ON(req == NULL);
-	dev_priv = req->ring->dev->dev_private;
-
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-				   dev_priv->mm.interruptible);
-	if (ret)
-		return ret;
 
 	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
 	if (ret)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d71fa77055f9..942f86b316c2 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2274,7 +2274,6 @@  int intel_ring_idle(struct intel_engine_cs *ring)
 
 	/* Make sure we do not trigger any retires */
 	return __i915_wait_request(req,
-				   i915_reset_counter(&to_i915(ring->dev)->gpu_error),
 				   to_i915(ring->dev)->mm.interruptible,
 				   NULL, NULL);
 }
@@ -2405,11 +2404,6 @@  int intel_ring_begin(struct drm_i915_gem_request *req,
 	ring = req->ring;
 	dev_priv = ring->dev->dev_private;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-				   dev_priv->mm.interruptible);
-	if (ret)
-		return ret;
-
 	ret = __intel_ring_prepare(ring, num_dwords * sizeof(uint32_t));
 	if (ret)
 		return ret;