
[CI-ping,15/15] drm/i915: Late request cancellations are harmful

Message ID 1460491389-8602-15-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson April 12, 2016, 8:03 p.m. UTC
Conceptually, each request is a record of a hardware transaction - we
build up a list of pending commands and then either commit them to
hardware, or cancel them. However, whilst building up the list of
pending commands, we may modify state outside of the request and make
references to the pending request. If we do so and then cancel that
request, external objects then point to the deleted request leading to
both graphical and memory corruption.

The easiest example is to consider object/VMA tracking. When we mark an
object as active in a request, we store a pointer to this, the most
recent request, in the object. When we want to free that object, we wait
for the most recent request to be idle before proceeding (otherwise the
hardware will write to pages now owned by the system, or we will attempt
to read from those pages before the hardware is finished writing). If
the request was cancelled instead, that wait completes immediately. As a
result, all requests must be committed and not cancelled if the external
state is unknown.
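
To make the hazard concrete, here is a simplified sketch (the field and
function names are illustrative rather than the exact driver API):

        /* 1. While building up the request, external state is updated
         *    to point at it: the object is marked active on req. */
        obj->last_write_req = req;

        /* 2. A later failure path cancels the request instead of
         *    committing it, so the request is torn down. */
        i915_gem_request_cancel(req);

        /* 3. Freeing the object waits on obj->last_write_req, but a
         *    wait on a cancelled request completes immediately and the
         *    stored pointer now refers to a deleted request, so the
         *    object is released as if it were idle - leading to the
         *    graphical and memory corruption described above. */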

The only remaining users of i915_gem_request_cancel() are a couple of
extremely unlikely allocation-failure paths, so remove the API entirely.

A consequence of committing all incomplete requests is that we generate
excess breadcrumbs and fill the ring much more often with dummy work. We
have completely undone the outstanding_last_seqno optimisation.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/i915/i915_drv.h            |  2 --
 drivers/gpu/drm/i915/i915_gem.c            | 50 ++++++++++++------------------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 15 +++------
 drivers/gpu/drm/i915/intel_display.c       |  2 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  4 +--
 drivers/gpu/drm/i915/intel_overlay.c       |  8 ++---
 6 files changed, 30 insertions(+), 51 deletions(-)

Comments

Daniel Vetter April 13, 2016, 9:57 a.m. UTC | #1
On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
> Conceptually, each request is a record of a hardware transaction - we
> build up a list of pending commands and then either commit them to
> hardware, or cancel them. However, whilst building up the list of
> pending commands, we may modify state outside of the request and make
> references to the pending request. If we do so and then cancel that
> request, external objects then point to the deleted request leading to
> both graphical and memory corruption.
> 
> The easiest example is to consider object/VMA tracking. When we mark an
> object as active in a request, we store a pointer to this, the most
> recent request, in the object. Then we want to free that object, we wait
> for the most recent request to be idle before proceeding (otherwise the
> hardware will write to pages now owned by the system, or we will attempt
> to read from those pages before the hardware is finished writing). If
> the request was cancelled instead, that wait completes immediately. As a
> result, all requests must be committed and not cancelled if the external
> state is unknown.
> 
> All that remains of i915_gem_request_cancel() users are just a couple of
> extremely unlikely allocation failures, so remove the API entirely.
> 
> A consequence of committing all incomplete requests is that we generate
> excess breadcrumbs and fill the ring much more often with dummy work. We
> have completely undone the outstanding_last_seqno optimisation.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: stable@vger.kernel.org

Cc: John Harrison <John.C.Harrison@Intel.com>

I'd like John's ack on this one too, but the patch itself looks sound. Fast r-b
since we've discussed this a while ago already ...

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
John Harrison April 13, 2016, 2:21 p.m. UTC | #2
On 13/04/2016 10:57, Daniel Vetter wrote:
> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>> Conceptually, each request is a record of a hardware transaction - we
>> build up a list of pending commands and then either commit them to
>> hardware, or cancel them. However, whilst building up the list of
>> pending commands, we may modify state outside of the request and make
>> references to the pending request. If we do so and then cancel that
>> request, external objects then point to the deleted request leading to
>> both graphical and memory corruption.
>>
>> The easiest example is to consider object/VMA tracking. When we mark an
>> object as active in a request, we store a pointer to this, the most
>> recent request, in the object. Then we want to free that object, we wait
>> for the most recent request to be idle before proceeding (otherwise the
>> hardware will write to pages now owned by the system, or we will attempt
>> to read from those pages before the hardware is finished writing). If
>> the request was cancelled instead, that wait completes immediately. As a
>> result, all requests must be committed and not cancelled if the external
>> state is unknown.
>>
>> All that remains of i915_gem_request_cancel() users are just a couple of
>> extremely unlikely allocation failures, so remove the API entirely.
>>
>> A consequence of committing all incomplete requests is that we generate
>> excess breadcrumbs and fill the ring much more often with dummy work. We
>> have completely undone the outstanding_last_seqno optimisation.
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>> Cc: stable@vger.kernel.org
> Cc: John Harrison <John.C.Harrison@Intel.com>
>
> I'd like John's ack on this on too, but patch itself looks sound. Fast r-b
> since we've discussed this a while ago already ...

I think this is going to cause a problem with the scheduler. You are 
effectively saying that the execbuf call cannot fail beyond the point of 
allocating a request. If it gets that far then it must go all the way 
and submit the request to the hardware. With a scheduler, that means 
adding it to the scheduler's queues and tracking it all the way through 
the system to completion. If nothing else, that sounds like a lot of 
extra overhead for no actual work. Or worse, if the failure is at a point
where the request cannot be sent further through the system because
something critical failed, then you are really stuffed.

I'm not sure what the other option would be though, short of being able 
to undo the last read/write object tracking updates.
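
Purely as an illustration of that idea (these rollback helpers do not
exist in the driver; the names are made up):

        /* Hypothetical sketch: snapshot the tracking pointer before the
         * object is marked active, and restore it if a later submission
         * step fails, instead of being forced to commit the request. */
        struct drm_i915_gem_request *old_write = obj->last_write_req;

        mark_object_active_hypothetical(obj, req);
        ret = remaining_submission_steps_hypothetical(req);
        if (ret)
                obj->last_write_req = old_write;        /* undo tracking */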


>
> Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Jani Nikula April 18, 2016, 9:46 a.m. UTC | #3
On Wed, 13 Apr 2016, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>> Conceptually, each request is a record of a hardware transaction - we
>> build up a list of pending commands and then either commit them to
>> hardware, or cancel them. However, whilst building up the list of
>> pending commands, we may modify state outside of the request and make
>> references to the pending request. If we do so and then cancel that
>> request, external objects then point to the deleted request leading to
>> both graphical and memory corruption.
>> 
>> The easiest example is to consider object/VMA tracking. When we mark an
>> object as active in a request, we store a pointer to this, the most
>> recent request, in the object. Then we want to free that object, we wait
>> for the most recent request to be idle before proceeding (otherwise the
>> hardware will write to pages now owned by the system, or we will attempt
>> to read from those pages before the hardware is finished writing). If
>> the request was cancelled instead, that wait completes immediately. As a
>> result, all requests must be committed and not cancelled if the external
>> state is unknown.
>> 
>> All that remains of i915_gem_request_cancel() users are just a couple of
>> extremely unlikely allocation failures, so remove the API entirely.
>> 
>> A consequence of committing all incomplete requests is that we generate
>> excess breadcrumbs and fill the ring much more often with dummy work. We
>> have completely undone the outstanding_last_seqno optimisation.
>> 
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>> Cc: stable@vger.kernel.org
>
> Cc: John Harrison <John.C.Harrison@Intel.com>
>
> I'd like John's ack on this on too, but patch itself looks sound. Fast r-b
> since we've discussed this a while ago already ...
>
> Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

FYI, this (*) does not cherry-pick cleanly to drm-intel-fixes.

BR,
Jani.


(*) Well, not exactly *this* but rather
https://patchwork.freedesktop.org/patch/80961/ which was not posted on
the list so I can't reply to it.


Dave Gordon April 19, 2016, 12:35 p.m. UTC | #4
On 13/04/16 15:21, John Harrison wrote:
> On 13/04/2016 10:57, Daniel Vetter wrote:
>> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>>> Conceptually, each request is a record of a hardware transaction - we
>>> build up a list of pending commands and then either commit them to
>>> hardware, or cancel them. However, whilst building up the list of
>>> pending commands, we may modify state outside of the request and make
>>> references to the pending request. If we do so and then cancel that
>>> request, external objects then point to the deleted request leading to
>>> both graphical and memory corruption.
>>>
>>> The easiest example is to consider object/VMA tracking. When we mark an
>>> object as active in a request, we store a pointer to this, the most
>>> recent request, in the object. Then we want to free that object, we wait
>>> for the most recent request to be idle before proceeding (otherwise the
>>> hardware will write to pages now owned by the system, or we will attempt
>>> to read from those pages before the hardware is finished writing). If
>>> the request was cancelled instead, that wait completes immediately. As a
>>> result, all requests must be committed and not cancelled if the external
>>> state is unknown.
>>>
>>> All that remains of i915_gem_request_cancel() users are just a couple of
>>> extremely unlikely allocation failures, so remove the API entirely.
>>>
>>> A consequence of committing all incomplete requests is that we generate
>>> excess breadcrumbs and fill the ring much more often with dummy work. We
>>> have completely undone the outstanding_last_seqno optimisation.
>>>
>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> Cc: stable@vger.kernel.org
>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>
>> I'd like John's ack on this on too, but patch itself looks sound. Fast
>> r-b
>> since we've discussed this a while ago already ...
>
> I think this is going to cause a problem with the scheduler. You are
> effectively saying that the execbuf call cannot fail beyond the point of
> allocating a request. If it gets that far then it must go all the way
> and submit the request to the hardware. With a scheduler, that means
> adding it to the scheduler's queues and tracking it all the way through
> the system to completion. If nothing else, that sounds like a lot of
> extra overhead for no actual work. Or worse if the failure is at a point
> where the request cannot be sent further through the system because it
> was something critical that failed then you are really stuffed.
>
> I'm not sure what the other option would be though, short of being able
> to undo the last read/write object tracking updates.

With the chained-ownership code we have in the scheduler, it becomes 
perfectly possible to undo the last-read/write tracking changes.

I'd much rather see any failure during submission rewound and undone, so 
we can just return -EAGAIN at any point and let someone retry if required.

This just looks like a hack to work around not having a properly 
transactional model of request submission :(

.Dave.
John Harrison April 21, 2016, 1:04 p.m. UTC | #5
On 19/04/2016 13:35, Dave Gordon wrote:
> On 13/04/16 15:21, John Harrison wrote:
>> On 13/04/2016 10:57, Daniel Vetter wrote:
>>> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>>>> Conceptually, each request is a record of a hardware transaction - we
>>>> build up a list of pending commands and then either commit them to
>>>> hardware, or cancel them. However, whilst building up the list of
>>>> pending commands, we may modify state outside of the request and make
>>>> references to the pending request. If we do so and then cancel that
>>>> request, external objects then point to the deleted request leading to
>>>> both graphical and memory corruption.
>>>>
>>>> The easiest example is to consider object/VMA tracking. When we 
>>>> mark an
>>>> object as active in a request, we store a pointer to this, the most
>>>> recent request, in the object. Then we want to free that object, we 
>>>> wait
>>>> for the most recent request to be idle before proceeding (otherwise 
>>>> the
>>>> hardware will write to pages now owned by the system, or we will 
>>>> attempt
>>>> to read from those pages before the hardware is finished writing). If
>>>> the request was cancelled instead, that wait completes immediately. 
>>>> As a
>>>> result, all requests must be committed and not cancelled if the 
>>>> external
>>>> state is unknown.
>>>>
>>>> All that remains of i915_gem_request_cancel() users are just a 
>>>> couple of
>>>> extremely unlikely allocation failures, so remove the API entirely.
>>>>
>>>> A consequence of committing all incomplete requests is that we 
>>>> generate
>>>> excess breadcrumbs and fill the ring much more often with dummy 
>>>> work. We
>>>> have completely undone the outstanding_last_seqno optimisation.
>>>>
>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>>> Cc: stable@vger.kernel.org
>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> I'd like John's ack on this on too, but patch itself looks sound. Fast
>>> r-b
>>> since we've discussed this a while ago already ...
>>
>> I think this is going to cause a problem with the scheduler. You are
>> effectively saying that the execbuf call cannot fail beyond the point of
>> allocating a request. If it gets that far then it must go all the way
>> and submit the request to the hardware. With a scheduler, that means
>> adding it to the scheduler's queues and tracking it all the way through
>> the system to completion. If nothing else, that sounds like a lot of
>> extra overhead for no actual work. Or worse if the failure is at a point
>> where the request cannot be sent further through the system because it
>> was something critical that failed then you are really stuffed.
>>
>> I'm not sure what the other option would be though, short of being able
>> to undo the last read/write object tracking updates.
>
> With the chained-ownership code we have in the scheduler, it becomes 
> perfectly possible to undo the last-read/write tracking changes.
>
> I'd much rather see any failure during submission rewound and undone, 
> so we can just return -EAGAIN at any point and let someone retry if 
> required.
>
> This just looks like a hack to work around not having a properly 
> transactional model of request submission :(
>
> .Dave.

I was wondering whether it would be possible to delay the tracking updates
until later in the execbuf process, i.e. only do them after all potential
failure points. That would be a much simpler change than putting in
chained ownership.

However, it seems that the patch has already been merged despite this 
discussion and Daniel Vetter wanting an ack first? Is that correct?

John.
John Harrison April 22, 2016, 10:57 p.m. UTC | #6
On 21/04/2016 14:04, John Harrison wrote:
> On 19/04/2016 13:35, Dave Gordon wrote:
>> On 13/04/16 15:21, John Harrison wrote:
>>> On 13/04/2016 10:57, Daniel Vetter wrote:
>>>> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>>>>> Conceptually, each request is a record of a hardware transaction - we
>>>>> build up a list of pending commands and then either commit them to
>>>>> hardware, or cancel them. However, whilst building up the list of
>>>>> pending commands, we may modify state outside of the request and make
>>>>> references to the pending request. If we do so and then cancel that
>>>>> request, external objects then point to the deleted request 
>>>>> leading to
>>>>> both graphical and memory corruption.
>>>>>
>>>>> The easiest example is to consider object/VMA tracking. When we 
>>>>> mark an
>>>>> object as active in a request, we store a pointer to this, the most
>>>>> recent request, in the object. Then we want to free that object, 
>>>>> we wait
>>>>> for the most recent request to be idle before proceeding 
>>>>> (otherwise the
>>>>> hardware will write to pages now owned by the system, or we will 
>>>>> attempt
>>>>> to read from those pages before the hardware is finished writing). If
>>>>> the request was cancelled instead, that wait completes 
>>>>> immediately. As a
>>>>> result, all requests must be committed and not cancelled if the 
>>>>> external
>>>>> state is unknown.
>>>>>
>>>>> All that remains of i915_gem_request_cancel() users are just a 
>>>>> couple of
>>>>> extremely unlikely allocation failures, so remove the API entirely.
>>>>>
>>>>> A consequence of committing all incomplete requests is that we 
>>>>> generate
>>>>> excess breadcrumbs and fill the ring much more often with dummy 
>>>>> work. We
>>>>> have completely undone the outstanding_last_seqno optimisation.
>>>>>
>>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>>>> Cc: stable@vger.kernel.org
>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>> I'd like John's ack on this on too, but patch itself looks sound. Fast
>>>> r-b
>>>> since we've discussed this a while ago already ...
>>>
>>> I think this is going to cause a problem with the scheduler. You are
>>> effectively saying that the execbuf call cannot fail beyond the 
>>> point of
>>> allocating a request. If it gets that far then it must go all the way
>>> and submit the request to the hardware. With a scheduler, that means
>>> adding it to the scheduler's queues and tracking it all the way through
>>> the system to completion. If nothing else, that sounds like a lot of
>>> extra overhead for no actual work. Or worse if the failure is at a 
>>> point
>>> where the request cannot be sent further through the system because it
>>> was something critical that failed then you are really stuffed.
>>>
>>> I'm not sure what the other option would be though, short of being able
>>> to undo the last read/write object tracking updates.
>>
>> With the chained-ownership code we have in the scheduler, it becomes 
>> perfectly possible to undo the last-read/write tracking changes.
>>
>> I'd much rather see any failure during submission rewound and undone, 
>> so we can just return -EAGAIN at any point and let someone retry if 
>> required.
>>
>> This just looks like a hack to work around not having a properly 
>> transactional model of request submission :(
>>
>> .Dave.
>
> I was thinking if it would be possible to delay the tracking updates 
> until later in the execbuf process. I.e. only do it after all 
> potential failure points. That would be a much simpler change than 
> putting in chained ownership.
>
> However, it seems that the patch has already been merged despite this 
> discussion and Daniel Vetter wanting an ack first? Is that correct?
>
> John.
>

Dave Gordon and I had a look at the option of delaying the object
tracking until beyond the point of last possible failure in the execbuf 
call path. As far as we can tell, it already is. The object tracking 
update occurs in i915_gem_execbuffer_move_to_active(). That function 
cannot return a failure code and is immediately followed (in both LRC 
and legacy mode) by a call to i915_gem_execbuffer_retire_commands() 
which is what flushes out the request to the hardware. So it would 
appear that this patch has no effect on object tracking within the 
execbuf code path. If a request cancellation code path was taken then 
the tracking would not have been updated and so the request is 
irrelevant as it has no references to it. If the tracking was updated 
and the request is being referenced then the request was also guaranteed 
to have been submitted and not cancelled.
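
For reference, the tail of the legacy submission path being described
(simplified from the code in this patch, pre-change) looks like:

        /* i915_gem_ringbuffer_submission(), simplified: once the objects
         * have been marked active the request is always flushed out; the
         * tracking update cannot fail and nothing can fail in between. */
        i915_gem_execbuffer_move_to_active(vmas, params->request);
        i915_gem_execbuffer_retire_commands(params);

        return 0;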

Either we are missing something major somewhere or this patch cannot fix 
the stated bug in the stated manner?

I have tried running the failing test myself but when I try to run the 
particular gem_concurrent_blit subtest it tells me that it requires more 
'objects' and/or RAM than I have available. What does one need in order 
to run the test? The bug report also does not say whether it is a 
guaranteed failure every time or a sporadic, once-in-X-many-runs kind of
failure.

Thanks,
John.
Dave Gordon April 27, 2016, 6:52 p.m. UTC | #7
On 22/04/16 23:57, John Harrison wrote:
> On 21/04/2016 14:04, John Harrison wrote:
>> On 19/04/2016 13:35, Dave Gordon wrote:
>>> On 13/04/16 15:21, John Harrison wrote:
>>>> On 13/04/2016 10:57, Daniel Vetter wrote:
>>>>> On Tue, Apr 12, 2016 at 09:03:09PM +0100, Chris Wilson wrote:
>>>>>> Conceptually, each request is a record of a hardware transaction - we
>>>>>> build up a list of pending commands and then either commit them to
>>>>>> hardware, or cancel them. However, whilst building up the list of
>>>>>> pending commands, we may modify state outside of the request and make
>>>>>> references to the pending request. If we do so and then cancel that
>>>>>> request, external objects then point to the deleted request
>>>>>> leading to
>>>>>> both graphical and memory corruption.
>>>>>>
>>>>>> The easiest example is to consider object/VMA tracking. When we
>>>>>> mark an
>>>>>> object as active in a request, we store a pointer to this, the most
>>>>>> recent request, in the object. Then we want to free that object,
>>>>>> we wait
>>>>>> for the most recent request to be idle before proceeding
>>>>>> (otherwise the
>>>>>> hardware will write to pages now owned by the system, or we will
>>>>>> attempt
>>>>>> to read from those pages before the hardware is finished writing). If
>>>>>> the request was cancelled instead, that wait completes
>>>>>> immediately. As a
>>>>>> result, all requests must be committed and not cancelled if the
>>>>>> external
>>>>>> state is unknown.
>>>>>>
>>>>>> All that remains of i915_gem_request_cancel() users are just a
>>>>>> couple of
>>>>>> extremely unlikely allocation failures, so remove the API entirely.
>>>>>>
>>>>>> A consequence of committing all incomplete requests is that we
>>>>>> generate
>>>>>> excess breadcrumbs and fill the ring much more often with dummy
>>>>>> work. We
>>>>>> have completely undone the outstanding_last_seqno optimisation.
>>>>>>
>>>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93907
>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>>>>> Cc: stable@vger.kernel.org
>>>>> Cc: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> I'd like John's ack on this on too, but patch itself looks sound. Fast
>>>>> r-b
>>>>> since we've discussed this a while ago already ...
>>>>
>>>> I think this is going to cause a problem with the scheduler. You are
>>>> effectively saying that the execbuf call cannot fail beyond the
>>>> point of
>>>> allocating a request. If it gets that far then it must go all the way
>>>> and submit the request to the hardware. With a scheduler, that means
>>>> adding it to the scheduler's queues and tracking it all the way through
>>>> the system to completion. If nothing else, that sounds like a lot of
>>>> extra overhead for no actual work. Or worse if the failure is at a
>>>> point
>>>> where the request cannot be sent further through the system because it
>>>> was something critical that failed then you are really stuffed.
>>>>
>>>> I'm not sure what the other option would be though, short of being able
>>>> to undo the last read/write object tracking updates.
>>>
>>> With the chained-ownership code we have in the scheduler, it becomes
>>> perfectly possible to undo the last-read/write tracking changes.
>>>
>>> I'd much rather see any failure during submission rewound and undone,
>>> so we can just return -EAGAIN at any point and let someone retry if
>>> required.
>>>
>>> This just looks like a hack to work around not having a properly
>>> transactional model of request submission :(
>>>
>>> .Dave.
>>
>> I was thinking if it would be possible to delay the tracking updates
>> until later in the execbuf process. I.e. only do it after all
>> potential failure points. That would be a much simpler change than
>> putting in chained ownership.
>>
>> However, it seems that the patch has already been merged despite this
>> discussion and Daniel Vetter wanting an ack first? Is that correct?
>>
>> John.
>
> Dave Gordon and myself had a look at the option of delaying the object
> tracking until beyond the point of last possible failure in the execbuf
> call path. As far as we can tell, it already is. The object tracking
> update occurs in i915_gem_execbuffer_move_to_active(). That function
> cannot return a failure code and is immediately followed (in both LRC
> and legacy mode) by a call to i915_gem_execbuffer_retire_commands()
> which is what flushes out the request to the hardware. So it would
> appear that this patch has no effect on object tracking within the
> execbuf code path. If a request cancellation code path was taken then
> the tracking would not have been updated and so the request is
> irrelevant as it has no references to it. If the tracking was updated
> and the request is being referenced then the request was also guaranteed
> to have been submitted and not cancelled.
>
> Either we are missing something major somewhere or this patch cannot fix
> the stated bug in the stated manner?
>
> I have tried running the failing test myself but when I try to run the
> particular gem_concurrent_blit subtest it tells me that it requires more
> 'objects' and/or RAM than I have available. What does one need in order
> to run the test? The bug report also does not say whether it is a
> guaranteed failure every time or a sporadic, once in X many runs kind of
> failure?
>
> Thanks,
> John.

Maybe it's not a cancelled request at all, but a race where submission 
HAS been successful, and the object tracking HAS been updated, but the 
object is nonetheless not (yet) on the request list. Which would leave 
the object list-head UNINITIALISED. And hence crash on retiring :(

My one-line patch for that would fix the crash, but I'd still wonder how 
it got tracked and completed before getting onto the request list.

We could try initialising the list-head to POISON rather than empty, to 
clearly flag removing-before-adding-to-list cases.
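
A sketch of that, purely illustrative (the list-head field name is made
up; LIST_POISON1/LIST_POISON2 come from <linux/poison.h>):

        /* Instead of INIT_LIST_HEAD() on the object's per-request list
         * head, poison it, so that retiring/deleting an entry that was
         * never added to a request list blows up (or trips
         * CONFIG_DEBUG_LIST) immediately rather than corrupting memory. */
        obj->engine_list.next = LIST_POISON1;
        obj->engine_list.prev = LIST_POISON2;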

.Dave.

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 061ecc43d935..de84dd7be971 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2331,7 +2331,6 @@  struct drm_i915_gem_request {
 struct drm_i915_gem_request * __must_check
 i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       struct intel_context *ctx);
-void i915_gem_request_cancel(struct drm_i915_gem_request *req);
 void i915_gem_request_free(struct kref *req_ref);
 int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
 				   struct drm_file *file);
@@ -2883,7 +2882,6 @@  int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 			     struct drm_file *file_priv);
 void i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 					struct drm_i915_gem_request *req);
-void i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params);
 int i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
 				   struct drm_i915_gem_execbuffer2 *args,
 				   struct list_head *vmas);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b6879d43dd74..c6f09e7839ea 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2785,7 +2785,8 @@  __i915_gem_request_alloc(struct intel_engine_cs *engine,
 		 * fully prepared. Thus it can be cleaned up using the proper
 		 * free code.
 		 */
-		i915_gem_request_cancel(req);
+		intel_ring_reserved_space_cancel(req->ringbuf);
+		i915_gem_request_unreference(req);
 		return ret;
 	}
 
@@ -2822,13 +2823,6 @@  i915_gem_request_alloc(struct intel_engine_cs *engine,
 	return err ? ERR_PTR(err) : req;
 }
 
-void i915_gem_request_cancel(struct drm_i915_gem_request *req)
-{
-	intel_ring_reserved_space_cancel(req->ringbuf);
-
-	i915_gem_request_unreference(req);
-}
-
 struct drm_i915_gem_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine)
 {
@@ -3438,12 +3432,9 @@  int i915_gpu_idle(struct drm_device *dev)
 				return PTR_ERR(req);
 
 			ret = i915_switch_context(req);
-			if (ret) {
-				i915_gem_request_cancel(req);
-				return ret;
-			}
-
 			i915_add_request_no_flush(req);
+			if (ret)
+				return ret;
 		}
 
 		ret = intel_engine_idle(engine);
@@ -4943,34 +4934,33 @@  i915_gem_init_hw(struct drm_device *dev)
 		req = i915_gem_request_alloc(engine, NULL);
 		if (IS_ERR(req)) {
 			ret = PTR_ERR(req);
-			i915_gem_cleanup_engines(dev);
-			goto out;
+			break;
 		}
 
 		if (engine->id == RCS) {
-			for (j = 0; j < NUM_L3_SLICES(dev); j++)
-				i915_gem_l3_remap(req, j);
+			for (j = 0; j < NUM_L3_SLICES(dev); j++) {
+				ret = i915_gem_l3_remap(req, j);
+				if (ret)
+					goto err_request;
+			}
 		}
 
 		ret = i915_ppgtt_init_ring(req);
-		if (ret && ret != -EIO) {
-			DRM_ERROR("PPGTT enable %s failed %d\n",
-				  engine->name, ret);
-			i915_gem_request_cancel(req);
-			i915_gem_cleanup_engines(dev);
-			goto out;
-		}
+		if (ret)
+			goto err_request;
 
 		ret = i915_gem_context_enable(req);
-		if (ret && ret != -EIO) {
-			DRM_ERROR("Context enable %s failed %d\n",
+		if (ret)
+			goto err_request;
+
+err_request:
+		i915_add_request_no_flush(req);
+		if (ret) {
+			DRM_ERROR("Failed to enable %s, error=%d\n",
 				  engine->name, ret);
-			i915_gem_request_cancel(req);
 			i915_gem_cleanup_engines(dev);
-			goto out;
+			break;
 		}
-
-		i915_add_request_no_flush(req);
 	}
 
 out:
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6ee4f00f620c..6f4f2a6cdf93 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1137,7 +1137,7 @@  i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 	}
 }
 
-void
+static void
 i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params)
 {
 	/* Unconditionally force add_request to emit a full flush. */
@@ -1322,7 +1322,6 @@  i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
 	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
-	i915_gem_execbuffer_retire_commands(params);
 
 	return 0;
 }
@@ -1624,7 +1623,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	ret = i915_gem_request_add_to_client(req, file);
 	if (ret)
-		goto err_batch_unpin;
+		goto err_request;
 
 	/*
 	 * Save assorted stuff away to pass through to *_submission().
@@ -1641,6 +1640,8 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	params->request                 = req;
 
 	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
+err_request:
+	i915_gem_execbuffer_retire_commands(params);
 
 err_batch_unpin:
 	/*
@@ -1657,14 +1658,6 @@  err:
 	i915_gem_context_unreference(ctx);
 	eb_destroy(eb);
 
-	/*
-	 * If the request was created but not successfully submitted then it
-	 * must be freed again. If it was submitted then it is being tracked
-	 * on the active request list and no clean up is required here.
-	 */
-	if (ret && !IS_ERR_OR_NULL(req))
-		i915_gem_request_cancel(req);
-
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index b1b457864e17..3cae596d10a3 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11664,7 +11664,7 @@  cleanup_unpin:
 	intel_unpin_fb_obj(fb, crtc->primary->state->rotation);
 cleanup_pending:
 	if (!IS_ERR_OR_NULL(request))
-		i915_gem_request_cancel(request);
+		i915_add_request_no_flush(request);
 	atomic_dec(&intel_crtc->unpin_work_count);
 	mutex_unlock(&dev->struct_mutex);
 cleanup:
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b8f6b96472a6..6fc24deaa16a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1010,7 +1010,6 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
-	i915_gem_execbuffer_retire_commands(params);
 
 	return 0;
 }
@@ -2679,13 +2678,12 @@  int intel_lr_context_deferred_alloc(struct intel_context *ctx,
 		}
 
 		ret = engine->init_context(req);
+		i915_add_request_no_flush(req);
 		if (ret) {
 			DRM_ERROR("ring init context: %d\n",
 				ret);
-			i915_gem_request_cancel(req);
 			goto error_ringbuf;
 		}
-		i915_add_request_no_flush(req);
 	}
 	return 0;
 
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index 6694e9230cd5..bcc3b6a016d8 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -247,7 +247,7 @@  static int intel_overlay_on(struct intel_overlay *overlay)
 
 	ret = intel_ring_begin(req, 4);
 	if (ret) {
-		i915_gem_request_cancel(req);
+		i915_add_request_no_flush(req);
 		return ret;
 	}
 
@@ -290,7 +290,7 @@  static int intel_overlay_continue(struct intel_overlay *overlay,
 
 	ret = intel_ring_begin(req, 2);
 	if (ret) {
-		i915_gem_request_cancel(req);
+		i915_add_request_no_flush(req);
 		return ret;
 	}
 
@@ -356,7 +356,7 @@  static int intel_overlay_off(struct intel_overlay *overlay)
 
 	ret = intel_ring_begin(req, 6);
 	if (ret) {
-		i915_gem_request_cancel(req);
+		i915_add_request_no_flush(req);
 		return ret;
 	}
 
@@ -431,7 +431,7 @@  static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
 
 		ret = intel_ring_begin(req, 2);
 		if (ret) {
-			i915_gem_request_cancel(req);
+			i915_add_request_no_flush(req);
 			return ret;
 		}