
[1/4] drm/i915: Unify execlist and legacy request life-cycles

Message ID 1444143124-16030-2-git-send-email-nicholas.hoath@intel.com (mailing list archive)
State New, archived

Commit Message

Nick Hoath Oct. 6, 2015, 2:52 p.m. UTC
There is a desire to simplify the i915 driver by reducing the number of
different code paths introduced by the LRC / execlists support.  As the
execlists request is now part of the gem request it is possible and
desirable to unify the request life-cycles for execlist and legacy
requests.

Added a context complete flag to a request which gets set during the
context switch interrupt.

Added a function i915_gem_request_retireable().  A request is considered
retireable if its seqno passed (i.e. the request has completed) and either
it was never submitted to the ELSP or its context completed.  This ensures
that context save is carried out before the last request for a context is
considered retireable.  retire_requests_ring() now uses
i915_gem_request_retireable() rather than request_complete() when deciding
which requests to retire. Requests that were not waiting for a context
switch interrupt (either as a result of being merged into a following
request or by being a legacy request) will be considered retireable as
soon as their seqno has passed.

Removed the extra request reference held for the execlist request.

Removed intel_execlists_retire_requests() and all references to
intel_engine_cs.execlist_retired_req_list.

Moved context unpinning into retire_requests_ring() for now.  Further work
is pending for the context pinning - this patch should allow us to use the
active list to track context and ring buffer objects later.

Changed gen8_cs_irq_handler() so that notify_ring() is called when
contexts complete as well as when a user interrupt occurs so that
notification happens when a request is complete and context save has
finished.

v2: Rebase over the read-read optimisation changes

v3: Reworked IRQ handler after removing IRQ handler cleanup patch

v4: Fixed various pin leaks

Issue: VIZ-4277
Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  6 +++
 drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
 drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
 drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
 6 files changed, 118 insertions(+), 82 deletions(-)

Comments

Daniel Vetter Oct. 7, 2015, 4:03 p.m. UTC | #1
On Tue, Oct 06, 2015 at 03:52:01PM +0100, Nick Hoath wrote:
> There is a desire to simplify the i915 driver by reducing the number of
> different code paths introduced by the LRC / execlists support.  As the
> execlists request is now part of the gem request it is possible and
> desirable to unify the request life-cycles for execlist and legacy
> requests.
> 
> Added a context complete flag to a request which gets set during the
> context switch interrupt.
> 
> Added a function i915_gem_request_retireable().  A request is considered
> retireable if its seqno passed (i.e. the request has completed) and either
> it was never submitted to the ELSP or its context completed.  This ensures
> that context save is carried out before the last request for a context is
> considered retireable.  retire_requests_ring() now uses
> i915_gem_request_retireable() rather than request_complete() when deciding
> which requests to retire. Requests that were not waiting for a context
> switch interrupt (either as a result of being merged into a following
> request or by being a legacy request) will be considered retireable as
> soon as their seqno has passed.
> 
> Removed the extra request reference held for the execlist request.
> 
> Removed intel_execlists_retire_requests() and all references to
> intel_engine_cs.execlist_retired_req_list.
> 
> Moved context unpinning into retire_requests_ring() for now.  Further work
> is pending for the context pinning - this patch should allow us to use the
> active list to track context and ring buffer objects later.
> 
> Changed gen8_cs_irq_handler() so that notify_ring() is called when
> contexts complete as well as when a user interrupt occurs so that
> notification happens when a request is complete and context save has
> finished.
> 
> v2: Rebase over the read-read optimisation changes
> 
> v3: Reworked IRQ handler after removing IRQ handler cleanup patch
> 
> v4: Fixed various pin leaks
> 
> Issue: VIZ-4277
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  6 +++
>  drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
>  drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
>  6 files changed, 118 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index fbf0ae9..3d217f9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2262,6 +2262,12 @@ struct drm_i915_gem_request {
>  	/** Execlists no. of times this request has been sent to the ELSP */
>  	int elsp_submitted;
>  
> +	/**
> +	 * Execlists: whether this requests's context has completed after
> +	 * submission to the ELSP
> +	 */
> +	bool ctx_complete;
> +
>  };
>  
>  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 52642af..fc82171 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1386,6 +1386,24 @@ __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
>  				       typeof(*tmp), list);
>  
>  		i915_gem_request_retire(tmp);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = tmp->ctx;
> +			struct drm_i915_private *dev_priv =
> +				engine->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[engine->id].state;
> +
> +			spin_lock_irqsave(&engine->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != engine->default_context))
> +				intel_lr_context_unpin(tmp);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&engine->execlist_lock, flags);
> +		}
> +
>  	} while (tmp != req);
>  
>  	WARN_ON(i915_verify_lists(engine->dev));
> @@ -2359,6 +2377,12 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>  	list_move_tail(&vma->mm_list, &vma->vm->active_list);
>  }
>  
> +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> +{
> +	return (i915_gem_request_completed(req, true) &&
> +		(!req->elsp_submitted || req->ctx_complete));
> +}
> +
>  static void
>  i915_gem_object_retire__write(struct drm_i915_gem_object *obj)
>  {
> @@ -2829,10 +2853,28 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>  					   struct drm_i915_gem_request,
>  					   list);
>  
> -		if (!i915_gem_request_completed(request, true))
> +		if (!i915_gem_request_retireable(request))
>  			break;
>  
>  		i915_gem_request_retire(request);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = request->ctx;
> +			struct drm_i915_private *dev_priv =
> +				ring->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[ring->id].state;
> +
> +			spin_lock_irqsave(&ring->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != ring->default_context))
> +				intel_lr_context_unpin(request);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> +		}
> +
>  	}
>  
>  	/* Move any buffers on the active list that are no longer referenced
> @@ -2848,12 +2890,14 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>  
>  		if (!list_empty(&obj->last_read_req[ring->id]->list))
>  			break;
> +		if (!i915_gem_request_retireable(obj->last_read_req[ring->id]))
> +			break;
>  
>  		i915_gem_object_retire__read(obj, ring->id);
>  	}
>  
>  	if (unlikely(ring->trace_irq_req &&
> -		     i915_gem_request_completed(ring->trace_irq_req, true))) {
> +		     i915_gem_request_retireable(ring->trace_irq_req))) {
>  		ring->irq_put(ring);
>  		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>  	}
> @@ -2872,15 +2916,6 @@ i915_gem_retire_requests(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i) {
>  		i915_gem_retire_requests_ring(ring);
>  		idle &= list_empty(&ring->request_list);
> -		if (i915.enable_execlists) {
> -			unsigned long flags;
> -
> -			spin_lock_irqsave(&ring->execlist_lock, flags);
> -			idle &= list_empty(&ring->execlist_queue);
> -			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> -
> -			intel_execlists_retire_requests(ring);
> -		}
>  	}
>  
>  	if (idle)
> @@ -2956,12 +2991,14 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
>  		if (req == NULL)
>  			continue;
>  
> -		if (list_empty(&req->list))
> -			goto retire;
> +		if (list_empty(&req->list)) {
> +			if (i915_gem_request_retireable(req))
> +				i915_gem_object_retire__read(obj, i);
> +			continue;
> +		}
>  
> -		if (i915_gem_request_completed(req, true)) {
> +		if (i915_gem_request_retireable(req)) {
>  			__i915_gem_request_retire__upto(req);
> -retire:
>  			i915_gem_object_retire__read(obj, i);
>  		}
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 8ca772d..ebbafac 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1291,66 +1291,91 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
>  				       u32 master_ctl)
>  {
>  	irqreturn_t ret = IRQ_NONE;
> +	bool need_notify;
>  
>  	if (master_ctl & (GEN8_GT_RCS_IRQ | GEN8_GT_BCS_IRQ)) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(0));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(0), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(0));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(0), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_RCS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[RCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_RCS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[RCS]);
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_BCS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[BCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_BCS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[BCS]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT0)!\n");
>  	}
>  
>  	if (master_ctl & (GEN8_GT_VCS1_IRQ | GEN8_GT_VCS2_IRQ)) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(1));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(1), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(1));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(1), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VCS1_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VCS1_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VCS]);
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VCS2_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VCS2]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VCS2_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VCS2]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT1)!\n");
>  	}
>  
>  	if (master_ctl & GEN8_GT_VECS_IRQ) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(3));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(3), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(3));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(3), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VECS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VECS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VECS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VECS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VECS]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT3)!\n");
>  	}
>  
>  	if (master_ctl & GEN8_GT_PM_IRQ) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(2));
> -		if (tmp & dev_priv->pm_rps_events) {
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(2));
> +
> +		if (iir & dev_priv->pm_rps_events) {
>  			I915_WRITE_FW(GEN8_GT_IIR(2),
> -				      tmp & dev_priv->pm_rps_events);
> +				      iir & dev_priv->pm_rps_events);
>  			ret = IRQ_HANDLED;
> -			gen6_rps_irq_handler(dev_priv, tmp);
> +			gen6_rps_irq_handler(dev_priv, iir);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (PM)!\n");
>  	}

I had a really hard time reading this patch until I realized what you've
done. And this is definitely getting too complicated. I think it'd be good
to first extract a little helper which takes the iir and the shift and
then calls both intel_lrc_irq_handler() and notify_ring().
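
For illustration, a minimal sketch of such a helper (the name and exact
placement are illustrative only, not from the posted patch):

static void gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir,
				int test_shift)
{
	bool need_notify = false;

	/* Context-switch event: let the execlist code process the CSB and
	 * report whether a context completed. */
	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
		need_notify = intel_lrc_irq_handler(ring);

	/* Wake waiters on a user interrupt or a completed context save,
	 * matching what the patch currently open-codes per engine. */
	if ((iir & (GT_RENDER_USER_INTERRUPT << test_shift)) || need_notify)
		notify_ring(ring);
}

/* gen8_gt_irq_handler() would then just do, per engine, e.g.: */
gen8_cs_irq_handler(&dev_priv->ring[RCS], iir, GEN8_RCS_IRQ_SHIFT);
gen8_cs_irq_handler(&dev_priv->ring[BCS], iir, GEN8_BCS_IRQ_SHIFT);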

Also if you do large-scale replacement like s/tmp/iir/ that needs to be a
separate patch.

With that technicality out of the way let's look at the larger picture.
Let me first describe in more detail how the legacy ctx retiring works,
since it doesn't work like what you've done here really. Let's look at the
case where we have some batch running on ctx A and then switch to a batch
running on ctx B:

| currently ctx A active on the hw
| MI_BB_START/flushes/...
| seqno write + interrupt for the batch just submitted, tracked in request 1
<- ctx A is still used by the hardware and not flushed out
| MI_SET_CTX to switch to B
| batch/flush
| seqno write/irq for request 2
v

What you seem to be doing is changing the logical retire point for request
1 to include the MI_SET_CTX (but in execlist terms, i.e. when the hw has
confirmed the ctx switch to the next lrc in the elsp).

But what the legacy stuff does is wait until request 2 with retiring the
ctx object for ctx B. So what happens in that context switch function
(which is called synchronously from execbuf ioctl code) is that we
directly unpin ctx A and pin ctx B. And then we throw the ctx A object
onto the active list with request 2 attached as a write fence to make sure
the eviction code waits for request 2 to complete, which necessarily means
the ctx switch has completed.

Of course this isn't entirely perfect since we could evict ctx A even
before request 2 has completed, but since throwing out active objects is
really a last resort thing that doesn't really matter.
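
For reference, the legacy mechanism described above looks roughly like this
in the gen6+ do_switch() of that era (sketched from memory and simplified;
"from"/"to"/"req" are the local names in that function, and error handling
plus the MI_SET_CONTEXT emission are omitted):

	if (from != NULL) {
		/* Throw the outgoing context object onto the active list
		 * against the request that carries the switch away
		 * (request 2 above), so eviction waits for that request. */
		from->legacy_hw_ctx.rcs_state->base.read_domains =
			I915_GEM_DOMAIN_INSTRUCTION;
		i915_vma_move_to_active(
			i915_gem_obj_to_ggtt(from->legacy_hw_ctx.rcs_state),
			req);

		/* The object stays bound via the active reference, so the
		 * pin taken when it was switched in can now be dropped. */
		i915_gem_object_ggtt_unpin(from->legacy_hw_ctx.rcs_state);
		i915_gem_context_unreference(from);
	}
	i915_gem_context_reference(to);
	ring->last_context = to;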

Now we could do the exact same design for execlist since we never reorder
execlist ctx in upstream. And that's what I originally suggested. But that
would seriously piss off John with his scheduler patches.

Instead I think the proper design would be to create a new, separate
request object every time we submit a new batch for a different context
(we don't want to do this unconditionally for overheads) which will
complete exactly when the ctx switch completes. Since we have full-blown
request abstraction nowadays that should be possible without touching
generic code or adding special-cases to request or active object tracking
code.
-Daniel
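
A very rough sketch of that idea, purely for illustration (the function and
the pending_ctx_switch_req field are made up, where this hooks into the
submission path is exactly the open question, and it assumes the
i915_gem_request_alloc(ring, ctx, &req) signature of that era):

static int queue_ctx_switch_request(struct intel_engine_cs *ring,
				    struct intel_context *to)
{
	struct intel_context *from = ring->last_context;
	struct drm_i915_gem_request *switch_req;
	int ret;

	/* Only pay the cost when we actually change contexts. */
	if (from == NULL || from == to)
		return 0;

	/* One possible interpretation: allocate the extra request on the
	 * outgoing context, so its context object can be tracked against a
	 * request that completes together with the context switch. */
	ret = i915_gem_request_alloc(ring, from, &switch_req);
	if (ret)
		return ret;

	/* This request would only be completed from intel_lrc_irq_handler()
	 * once the CSB reports "from" switched out, instead of at the usual
	 * seqno write. */
	ring->pending_ctx_switch_req = switch_req;	/* hypothetical field */
	return 0;
}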


> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 825fa7a..e8f5b6c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -426,9 +426,8 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			/* Same ctx: ignore first request, as second request
>  			 * will update tail past first request's workload */
>  			cursor->elsp_submitted = req0->elsp_submitted;
> +			req0->elsp_submitted = 0;
>  			list_del(&req0->execlist_link);
> -			list_add_tail(&req0->execlist_link,
> -				&ring->execlist_retired_req_list);
>  			req0 = cursor;
>  		} else {
>  			req1 = cursor;
> @@ -478,11 +477,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>  		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
>  			WARN(head_req->elsp_submitted == 0,
>  			     "Never submitted head request\n");
> -
>  			if (--head_req->elsp_submitted <= 0) {
> +				head_req->ctx_complete = 1;
>  				list_del(&head_req->execlist_link);
> -				list_add_tail(&head_req->execlist_link,
> -					&ring->execlist_retired_req_list);
>  				return true;
>  			}
>  		}
> @@ -497,8 +494,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>   *
>   * Check the unread Context Status Buffers and manage the submission of new
>   * contexts to the ELSP accordingly.
> + * @return whether a context completed
>   */
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	u32 status_pointer;
> @@ -558,6 +556,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>  		   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
>  				 ((u32)ring->next_context_status_buffer &
>  				  GEN8_CSB_PTR_MASK) << 8));
> +
> +	return (submit_contexts != 0);
>  }
>  
>  static int execlists_context_queue(struct drm_i915_gem_request *request)
> @@ -569,8 +569,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>  	if (request->ctx != ring->default_context)
>  		intel_lr_context_pin(request);
>  
> -	i915_gem_request_reference(request);
> -
>  	spin_lock_irq(&ring->execlist_lock);
>  
>  	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
> @@ -588,8 +586,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>  			WARN(tail_req->elsp_submitted != 0,
>  				"More than 2 already-submitted reqs queued\n");
>  			list_del(&tail_req->execlist_link);
> -			list_add_tail(&tail_req->execlist_link,
> -				&ring->execlist_retired_req_list);
>  		}
>  	}
>  
> @@ -958,32 +954,6 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>  	return 0;
>  }
>  
> -void intel_execlists_retire_requests(struct intel_engine_cs *ring)
> -{
> -	struct drm_i915_gem_request *req, *tmp;
> -	struct list_head retired_list;
> -
> -	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
> -	if (list_empty(&ring->execlist_retired_req_list))
> -		return;
> -
> -	INIT_LIST_HEAD(&retired_list);
> -	spin_lock_irq(&ring->execlist_lock);
> -	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
> -	spin_unlock_irq(&ring->execlist_lock);
> -
> -	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
> -		struct intel_context *ctx = req->ctx;
> -		struct drm_i915_gem_object *ctx_obj =
> -				ctx->engine[ring->id].state;
> -
> -		if (ctx_obj && (ctx != ring->default_context))
> -			intel_lr_context_unpin(req);
> -		list_del(&req->execlist_link);
> -		i915_gem_request_unreference(req);
> -	}
> -}
> -
>  void intel_logical_ring_stop(struct intel_engine_cs *ring)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> @@ -1938,7 +1908,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>  	init_waitqueue_head(&ring->irq_queue);
>  
>  	INIT_LIST_HEAD(&ring->execlist_queue);
> -	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
>  	spin_lock_init(&ring->execlist_lock);
>  
>  	ret = i915_cmd_parser_init_ring(ring);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 4e60d54..e6a4900 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -95,7 +95,7 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>  			       struct list_head *vmas);
>  u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
>  
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring);
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring);
>  void intel_execlists_retire_requests(struct intel_engine_cs *ring);
>  
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 49fa41d..d99b167 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -264,7 +264,6 @@ struct  intel_engine_cs {
>  	/* Execlists */
>  	spinlock_t execlist_lock;
>  	struct list_head execlist_queue;
> -	struct list_head execlist_retired_req_list;
>  	u8 next_context_status_buffer;
>  	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>  	int		(*emit_request)(struct drm_i915_gem_request *request);
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Oct. 7, 2015, 4:05 p.m. UTC | #2
On Wed, Oct 07, 2015 at 06:03:48PM +0200, Daniel Vetter wrote:
> On Tue, Oct 06, 2015 at 03:52:01PM +0100, Nick Hoath wrote:
> > There is a desire to simplify the i915 driver by reducing the number of
> > different code paths introduced by the LRC / execlists support.  As the
> > execlists request is now part of the gem request it is possible and
> > desirable to unify the request life-cycles for execlist and legacy
> > requests.
> > 
> > Added a context complete flag to a request which gets set during the
> > context switch interrupt.
> > 
> > Added a function i915_gem_request_retireable().  A request is considered
> > retireable if its seqno passed (i.e. the request has completed) and either
> > it was never submitted to the ELSP or its context completed.  This ensures
> > that context save is carried out before the last request for a context is
> > considered retireable.  retire_requests_ring() now uses
> > i915_gem_request_retireable() rather than request_complete() when deciding
> > which requests to retire. Requests that were not waiting for a context
> > switch interrupt (either as a result of being merged into a following
> > request or by being a legacy request) will be considered retireable as
> > soon as their seqno has passed.
> > 
> > Removed the extra request reference held for the execlist request.
> > 
> > Removed intel_execlists_retire_requests() and all references to
> > intel_engine_cs.execlist_retired_req_list.
> > 
> > Moved context unpinning into retire_requests_ring() for now.  Further work
> > is pending for the context pinning - this patch should allow us to use the
> > active list to track context and ring buffer objects later.
> > 
> > Changed gen8_cs_irq_handler() so that notify_ring() is called when
> > contexts complete as well as when a user interrupt occurs so that
> > notification happens when a request is complete and context save has
> > finished.
> > 
> > v2: Rebase over the read-read optimisation changes
> > 
> > v3: Reworked IRQ handler after removing IRQ handler cleanup patch
> > 
> > v4: Fixed various pin leaks
> > 
> > Issue: VIZ-4277
> > Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> > Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |  6 +++
> >  drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
> >  drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
> >  drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
> >  drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
> >  6 files changed, 118 insertions(+), 82 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index fbf0ae9..3d217f9 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2262,6 +2262,12 @@ struct drm_i915_gem_request {
> >  	/** Execlists no. of times this request has been sent to the ELSP */
> >  	int elsp_submitted;
> >  
> > +	/**
> > +	 * Execlists: whether this requests's context has completed after
> > +	 * submission to the ELSP
> > +	 */
> > +	bool ctx_complete;
> > +
> >  };
> >  
> >  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 52642af..fc82171 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1386,6 +1386,24 @@ __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
> >  				       typeof(*tmp), list);
> >  
> >  		i915_gem_request_retire(tmp);
> > +
> > +		if (i915.enable_execlists) {
> > +			struct intel_context *ctx = tmp->ctx;
> > +			struct drm_i915_private *dev_priv =
> > +				engine->dev->dev_private;
> > +			unsigned long flags;
> > +			struct drm_i915_gem_object *ctx_obj =
> > +				ctx->engine[engine->id].state;
> > +
> > +			spin_lock_irqsave(&engine->execlist_lock, flags);
> > +
> > +			if (ctx_obj && (ctx != engine->default_context))
> > +				intel_lr_context_unpin(tmp);
> > +
> > +			intel_runtime_pm_put(dev_priv);
> > +			spin_unlock_irqrestore(&engine->execlist_lock, flags);
> > +		}
> > +
> >  	} while (tmp != req);

And NAK anyway.
-Chris
Chris Wilson Oct. 8, 2015, 12:32 p.m. UTC | #3
On Tue, Oct 06, 2015 at 03:52:01PM +0100, Nick Hoath wrote:
> There is a desire to simplify the i915 driver by reducing the number of
> different code paths introduced by the LRC / execlists support.  As the
> execlists request is now part of the gem request it is possible and
> desirable to unify the request life-cycles for execlist and legacy
> requests.
> 
> Added a context complete flag to a request which gets set during the
> context switch interrupt.

Wrong approach. Legacy uses the request itself as the indicator for when
the context is complete, if you were to take the same approach for LRC
we would not need to add yet more execlist specific state.

Lines of code are reduced by keeping the GEM semantics the same and just
having execlists hold an independent ref on the request for its async
submission approach.
 
> Added a function i915_gem_request_retireable().  A request is considered
> retireable if its seqno passed (i.e. the request has completed) and either
> it was never submitted to the ELSP or its context completed.  This ensures
> that context save is carried out before the last request for a context is
> considered retireable.  retire_requests_ring() now uses
> i915_gem_request_retireable() rather than request_complete() when deciding
> which requests to retire. Requests that were not waiting for a context
> switch interrupt (either as a result of being merged into a following
> request or by being a legacy request) will be considered retireable as
> soon as their seqno has passed.
> 
> Removed the extra request reference held for the execlist request.
> 
> Removed intel_execlists_retire_requests() and all references to
> intel_engine_cs.execlist_retired_req_list.
> 
> Moved context unpinning into retire_requests_ring() for now.  Further work
> is pending for the context pinning - this patch should allow us to use the
> active list to track context and ring buffer objects later.
> 
> Changed gen8_cs_irq_handler() so that notify_ring() is called when
> contexts complete as well as when a user interrupt occurs so that
> notification happens when a request is complete and context save has
> finished.
> 
> v2: Rebase over the read-read optimisation changes
> 
> v3: Reworked IRQ handler after removing IRQ handler cleanup patch
> 
> v4: Fixed various pin leaks
> 
> Issue: VIZ-4277
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  6 +++
>  drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
>  drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
>  6 files changed, 118 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index fbf0ae9..3d217f9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2262,6 +2262,12 @@ struct drm_i915_gem_request {
>  	/** Execlists no. of times this request has been sent to the ELSP */
>  	int elsp_submitted;
>  
> +	/**
> +	 * Execlists: whether this requests's context has completed after
> +	 * submission to the ELSP
> +	 */
> +	bool ctx_complete;
> +
>  };
>  
>  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 52642af..fc82171 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1386,6 +1386,24 @@ __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
>  				       typeof(*tmp), list);
>  
>  		i915_gem_request_retire(tmp);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = tmp->ctx;
> +			struct drm_i915_private *dev_priv =
> +				engine->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[engine->id].state;
> +
> +			spin_lock_irqsave(&engine->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != engine->default_context))
> +				intel_lr_context_unpin(tmp);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&engine->execlist_lock, flags);
> +		}

Why here? Surely you intended this to be called by
i915_gem_request_retire(). The runtime pm interaction is wrong, that is
handled by GPU busyness tracking. But by doing this at this juncture you
now increase the frequency at which we have to recreate the iomapping,
most noticeably on bsw+, and take more spinlocks unnecessarily.

Also since you no longer do reference tracking for the
execlists_queue, tmp has just been freed.

>  	} while (tmp != req);
>  
>  	WARN_ON(i915_verify_lists(engine->dev));
> @@ -2359,6 +2377,12 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>  	list_move_tail(&vma->mm_list, &vma->vm->active_list);
>  }
>  
> +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> +{
> +	return (i915_gem_request_completed(req, true) &&
> +		(!req->elsp_submitted || req->ctx_complete));

I disagree with this completely. A request must only be considered complete
when its seqno has passed. The context should be tracking the request
and not the other way around (like legacy does).

The approach is just wrong. The lifecycle of the request is not the
bloat in execlists, and keeping an extra reference on the request struct
itself until execlists is able to reap them amortizes the cost of
the spinlock and atomic operations.
-Chris
Daniel Vetter Oct. 9, 2015, 7:58 a.m. UTC | #4
On Thu, Oct 08, 2015 at 01:32:07PM +0100, Chris Wilson wrote:
> On Tue, Oct 06, 2015 at 03:52:01PM +0100, Nick Hoath wrote:
> > There is a desire to simplify the i915 driver by reducing the number of
> > different code paths introduced by the LRC / execlists support.  As the
> > execlists request is now part of the gem request it is possible and
> > desirable to unify the request life-cycles for execlist and legacy
> > requests.
> > 
> > Added a context complete flag to a request which gets set during the
> > context switch interrupt.
> 
> Wrong approach. Legacy uses the request itself as the indicator for when
> the context is complete, if you were to take the same approach for LRC
> we would not need to add yet more execlist specific state.
> 
> Lines of code is reduced by keeping the GEM semantics the same and just
> having execlists hold an indepedent ref on the request for its async
> submission approach.
>  
> > Added a function i915_gem_request_retireable().  A request is considered
> > retireable if its seqno passed (i.e. the request has completed) and either
> > it was never submitted to the ELSP or its context completed.  This ensures
> > that context save is carried out before the last request for a context is
> > considered retireable.  retire_requests_ring() now uses
> > i915_gem_request_retireable() rather than request_complete() when deciding
> > which requests to retire. Requests that were not waiting for a context
> > switch interrupt (either as a result of being merged into a following
> > request or by being a legacy request) will be considered retireable as
> > soon as their seqno has passed.
> > 
> > Removed the extra request reference held for the execlist request.
> > 
> > Removed intel_execlists_retire_requests() and all references to
> > intel_engine_cs.execlist_retired_req_list.
> > 
> > Moved context unpinning into retire_requests_ring() for now.  Further work
> > is pending for the context pinning - this patch should allow us to use the
> > active list to track context and ring buffer objects later.
> > 
> > Changed gen8_cs_irq_handler() so that notify_ring() is called when
> > contexts complete as well as when a user interrupt occurs so that
> > notification happens when a request is complete and context save has
> > finished.
> > 
> > v2: Rebase over the read-read optimisation changes
> > 
> > v3: Reworked IRQ handler after removing IRQ handler cleanup patch
> > 
> > v4: Fixed various pin leaks
> > 
> > Issue: VIZ-4277
> > Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> > Signed-off-by: Nick Hoath <nicholas.hoath@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         |  6 +++
> >  drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
> >  drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
> >  drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
> >  drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
> >  6 files changed, 118 insertions(+), 82 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index fbf0ae9..3d217f9 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2262,6 +2262,12 @@ struct drm_i915_gem_request {
> >  	/** Execlists no. of times this request has been sent to the ELSP */
> >  	int elsp_submitted;
> >  
> > +	/**
> > +	 * Execlists: whether this requests's context has completed after
> > +	 * submission to the ELSP
> > +	 */
> > +	bool ctx_complete;
> > +
> >  };
> >  
> >  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 52642af..fc82171 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1386,6 +1386,24 @@ __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
> >  				       typeof(*tmp), list);
> >  
> >  		i915_gem_request_retire(tmp);
> > +
> > +		if (i915.enable_execlists) {
> > +			struct intel_context *ctx = tmp->ctx;
> > +			struct drm_i915_private *dev_priv =
> > +				engine->dev->dev_private;
> > +			unsigned long flags;
> > +			struct drm_i915_gem_object *ctx_obj =
> > +				ctx->engine[engine->id].state;
> > +
> > +			spin_lock_irqsave(&engine->execlist_lock, flags);
> > +
> > +			if (ctx_obj && (ctx != engine->default_context))
> > +				intel_lr_context_unpin(tmp);
> > +
> > +			intel_runtime_pm_put(dev_priv);
> > +			spin_unlock_irqrestore(&engine->execlist_lock, flags);
> > +		}
> 
> Why here? Surely you intended this to be called by
> i915_gem_request_retire(). The runtime pm interaction is wrong, that is
> handled by GPU busyness tracking. But by doing this at this juncture you
> now increase the frequency at which we have to recreate the iomapping,
> most noticeable on bsw+ and take more spinlocks unnecessarily.
> 
> Also since you no longer do reference tracking for the
> execlists_queue, tmp has just been freed.
> 
> >  	} while (tmp != req);
> >  
> >  	WARN_ON(i915_verify_lists(engine->dev));
> > @@ -2359,6 +2377,12 @@ void i915_vma_move_to_active(struct i915_vma *vma,
> >  	list_move_tail(&vma->mm_list, &vma->vm->active_list);
> >  }
> >  
> > +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> > +{
> > +	return (i915_gem_request_completed(req, true) &&
> > +		(!req->elsp_submitted || req->ctx_complete));
> 
> I disagree with this completely. A request must only be considered complete
> when it's seqno is passed. The context should be tracking the request
> and not the other way around (like legacy does).
> 
> The approach is just wrong. The lifecycle of the request is not the
> bloat in execlists, and keeping an extra reference on the request struct
> itself them until execlists is able to reap them amoritizes the cost of
> the spinlock and atomic operations.

Hm, that's a slightly different approach than what I suggested with just
creating special ctx-switch requests (since with the scheduler we'll
reorder, so we can't do the same trick as with legacy ctx switching any more).
This trick here otoh extends the lifetime of a request until it's kicked
out, so that the shrinker doesn't reap the request object too early.

How does holding an additional ref on the request prevent this? I mean I
don't see how that would prevent special casing in the eviction code ...
Or is your plan to put the final wait for the request (for the ctx to idle)
into the ->vma_unbind hook? That again seems a bit of a quirk ...
-Daniel
Chris Wilson Oct. 9, 2015, 8:36 a.m. UTC | #5
On Fri, Oct 09, 2015 at 09:58:51AM +0200, Daniel Vetter wrote:
> > > +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> > > +{
> > > +	return (i915_gem_request_completed(req, true) &&
> > > +		(!req->elsp_submitted || req->ctx_complete));
> > 
> > I disagree with this completely. A request must only be considered complete
> > when it's seqno is passed. The context should be tracking the request
> > and not the other way around (like legacy does).
> > 
> > The approach is just wrong. The lifecycle of the request is not the
> > bloat in execlists, and keeping an extra reference on the request struct
> > itself them until execlists is able to reap them amoritizes the cost of
> > the spinlock and atomic operations.
> 
> Hm, that's slightly different approach than what I suggested with just
> creating special ctx-switch requests (since with the scheduler we'll
> reorder so can't do the same trick as with legacy ctx switching any more).
> This trick here otoh extends the lifetime of a request until it's kicked
> out, so that the shrinker doesn't it the request object too early.

Note that what you are considering here is how to use requests to manage
contexts. Execlists is different enough from legacy, in that contexts are
only active for as long as the request is, that we don't need anything
more than the current requests to manage their active cycles + lifetimes.

> How does holding an additional ref on the request prevent this? I mean I
> don't see how that would prevent special casing in the eviction code ...
> Or is your plan to put the final waiting for the request the ctx to idle
> into the ->vma_unbind hook? That seems again a bit a quirk ...

The reference is simply that the submission is asynchronous and not
under the purview of GEM, i.e. execlists has a list of requests with
which to submit and so needs a reference - especially as the ordering
between request completion and the interrupt is not guaranteed, and the
requests can be completed asynchronously. The reference is not to
prevent retiring, but simply to keep the request struct alive until
removed from the execlists.  Similarly for anything more exotic than
the FIFO scheduler.

The basic idea I have here is simply that requests are complete when the
GPU finishes executing them, that is when we retire them from the GEM lists.
I want to keep it that simple as then we can build upon requests for
tracking GPU activity at an abstract GEM level. So context lifecycles
depend on requests and not the other way around (which is how we do it
for legacy: pin until switch (whilst the hw references the ctx), then active
until the switch away is complete). I think underpinning this is that we do
have to keep the idea of GPU activity (requests) and object life cycles
as two separate but interrelated ideas/mechanisms.
-Chris
Daniel Vetter Oct. 9, 2015, 9:15 a.m. UTC | #6
On Fri, Oct 09, 2015 at 09:36:58AM +0100, Chris Wilson wrote:
> On Fri, Oct 09, 2015 at 09:58:51AM +0200, Daniel Vetter wrote:
> > > > +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> > > > +{
> > > > +	return (i915_gem_request_completed(req, true) &&
> > > > +		(!req->elsp_submitted || req->ctx_complete));
> > > 
> > > I disagree with this completely. A request must only be considered complete
> > > when it's seqno is passed. The context should be tracking the request
> > > and not the other way around (like legacy does).
> > > 
> > > The approach is just wrong. The lifecycle of the request is not the
> > > bloat in execlists, and keeping an extra reference on the request struct
> > > itself them until execlists is able to reap them amoritizes the cost of
> > > the spinlock and atomic operations.
> > 
> > Hm, that's slightly different approach than what I suggested with just
> > creating special ctx-switch requests (since with the scheduler we'll
> > reorder so can't do the same trick as with legacy ctx switching any more).
> > This trick here otoh extends the lifetime of a request until it's kicked
> > out, so that the shrinker doesn't it the request object too early.
> 
> Note what you are considering here is how to use requests to manage
> contexts. execlists is different enough from legacy in how contexts are
> only active for as long as the request is, that we don't need anything
> more than the current requests to manage their active cycles + lifetimes.
> 
> > How does holding an additional ref on the request prevent this? I mean I
> > don't see how that would prevent special casing in the eviction code ...
> > Or is your plan to put the final waiting for the request the ctx to idle
> > into the ->vma_unbind hook? That seems again a bit a quirk ...
> 
> The reference is simply that the submission is asynchronous and not
> under the purview of GEM, i.e. execlists has a list of requests with
> which to submit and so needs a reference - especially as the ordering
> between request completion and the interrupt is not guaranteed, and the
> requests can be completed asynchronously. The reference is not to
> prevent retiring, but simply to keep the request struct alive until
> removed from the execlists.  Similarly for anything more exotic than
> the FIFO scheduler.

Hm yeah I thought we had that already ... at least there's an
i915_gem_request_unreference in intel_execlists_retire_requests, separate
from the core request retiring.

> The basic idea I have here is simply that requests are complete when the
> GPU finishes executing them, that is when we retire them from the GEM lists.
> I want to keep it that simple as then we can build upon requests for
> tracking GPU activity at an abstract GEM level. So context lifecycles
> depend on requests and not the other way around (which is how we do it
> for legacy, pin until switch (whilst the hw references the ctx) then active
> until the switch away is complete.)) I think underpining this is we do
> have to keep the idea of GPU activity (requests) and object life cycles
> as two seperate but interelated ideas/mechanisms.

Yup agreed on that concept. The difference is in how exactly we track when
the context is no longer in use by the hardware. With execlist we have

1. MI_BB_START
2. seqno write/irq, which completes the requests that we currently have.
3. hw/guc retires the hw context and signals that with another, dedicated
irq (CTX_SWITCH_IRQ).

Nick's approach is to delay the request completion with ctx_complete from
2 to 3, but open-coded in places (which breaks the shrinker).

You seem to suggest something similar, but I'm not entirely clear.

My idea was to create a new request for step 3, which gets signalled by the
scheduler in intel_lrc_irq_handler, and to only create these when a ctx
switch might occur, to avoid overhead. But I guess if we just outright delay
all requests a notch if needed that might work too. But I'm really not sure
about the implications of that (i.e. does the hardware really unload the ctx
if it's idle?), and whether that would still fly with the scheduler.

But figuring this one out seems to be the cornerstone of this reorg.
Without it we can't just throw contexts onto the active list.
-Daniel
Chris Wilson Oct. 9, 2015, 9:45 a.m. UTC | #7
On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
> My idea was to create a new request for 3. which gets signalled by the
> scheduler in intel_lrc_irq_handler. My idea was that we'd only create
> these when a ctx switch might occur to avoid overhead, but I guess if we
> just outright delay all requests a notch if need that might work too. But
> I'm really not sure on the implications of that (i.e. does the hardware
> really unlod the ctx if it's idle?), and whether that would fly still with
> the scheduler.
>
> But figuring this one out here seems to be the cornestone of this reorg.
> Without it we can't just throw contexts onto the active list.

(Let me see if I understand it correctly)

Basically the problem is that we can't trust the context object to be
synchronized until after the status interrupt. The way we handled that
for legacy is to track the currently bound context and keep the
vma->pin_count asserted until the request containing the switch away.
Doing the same for execlists would trivially fix the issue and if done
smartly allows us to share more code (been there, done that).

That satisfies me for keeping requests as a basic fence in the GPU
timeline and should keep everyone happy that the context can't vanish
until after it is complete. The only caveat is that we cannot evict the
most recent context. For legacy, we do a switch back to the always
pinned default context. For execlists we don't, but it still means we
should only have one context which cannot be evicted (like legacy). But
it does leave us with the issue that i915_gpu_idle() returns early and
i915_gem_context_fini() must keep the explicit gpu reset to be
absolutely sure that the pending context writes are completed before the
final context is unbound.
-Chris
Daniel Vetter Oct. 9, 2015, 5:18 p.m. UTC | #8
On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
> On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
> > My idea was to create a new request for 3. which gets signalled by the
> > scheduler in intel_lrc_irq_handler. My idea was that we'd only create
> > these when a ctx switch might occur to avoid overhead, but I guess if we
> > just outright delay all requests a notch if need that might work too. But
> > I'm really not sure on the implications of that (i.e. does the hardware
> > really unlod the ctx if it's idle?), and whether that would fly still with
> > the scheduler.
> >
> > But figuring this one out here seems to be the cornestone of this reorg.
> > Without it we can't just throw contexts onto the active list.
> 
> (Let me see if I understand it correctly)
> 
> Basically the problem is that we can't trust the context object to be
> synchronized until after the status interrupt. The way we handled that
> for legacy is to track the currently bound context and keep the
> vma->pin_count asserted until the request containing the switch away.
> Doing the same for execlists would trivially fix the issue and if done
> smartly allows us to share more code (been there, done that).
> 
> That satisfies me for keeping requests as a basic fence in the GPU
> timeline and should keep everyone happy that the context can't vanish
> until after it is complete. The only caveat is that we cannot evict the
> most recent context. For legacy, we do a switch back to the always
> pinned default context. For execlists we don't, but it still means we
> should only have one context which cannot be evicted (like legacy). But
> it does leave us with the issue that i915_gpu_idle() returns early and
> i915_gem_context_fini() must keep the explicit gpu reset to be
> absolutely sure that the pending context writes are completed before the
> final context is unbound.

Yes, and that was what I originally had in mind. Meanwhile the scheduler
(will) happen and that means we won't have FIFO ordering. Which means when
we switch contexts (as opposed to just adding more to the ringbuffer of
the current one) we won't have any idea which context will be the next
one. Which also means we don't know which request to pick to retire the
old context. Hence why I think we need to be better.

Of course we can first implement the legacy ctx scheme and then let John
Harrison deal with the mess again, but he might not like that too much ;-)

The other upside of tracking the real ctx-no-longer-in-use with the ctx
itself is that we don't need to pin anything ever (I think), at least
conceptually. But decidedly less sure about that ...
-Daniel
Chris Wilson Oct. 9, 2015, 5:23 p.m. UTC | #9
On Fri, Oct 09, 2015 at 07:18:21PM +0200, Daniel Vetter wrote:
> On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
> > On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
> > > My idea was to create a new request for 3. which gets signalled by the
> > > scheduler in intel_lrc_irq_handler. My idea was that we'd only create
> > > these when a ctx switch might occur to avoid overhead, but I guess if we
> > > just outright delay all requests a notch if need that might work too. But
> > > I'm really not sure on the implications of that (i.e. does the hardware
> > > really unlod the ctx if it's idle?), and whether that would fly still with
> > > the scheduler.
> > >
> > > But figuring this one out here seems to be the cornestone of this reorg.
> > > Without it we can't just throw contexts onto the active list.
> > 
> > (Let me see if I understand it correctly)
> > 
> > Basically the problem is that we can't trust the context object to be
> > synchronized until after the status interrupt. The way we handled that
> > for legacy is to track the currently bound context and keep the
> > vma->pin_count asserted until the request containing the switch away.
> > Doing the same for execlists would trivially fix the issue and if done
> > smartly allows us to share more code (been there, done that).
> > 
> > That satisfies me for keeping requests as a basic fence in the GPU
> > timeline and should keep everyone happy that the context can't vanish
> > until after it is complete. The only caveat is that we cannot evict the
> > most recent context. For legacy, we do a switch back to the always
> > pinned default context. For execlists we don't, but it still means we
> > should only have one context which cannot be evicted (like legacy). But
> > it does leave us with the issue that i915_gpu_idle() returns early and
> > i915_gem_context_fini() must keep the explicit gpu reset to be
> > absolutely sure that the pending context writes are completed before the
> > final context is unbound.
> 
> Yes, and that was what I originally had in mind. Meanwhile the scheduler
> (will) happen and that means we won't have FIFO ordering. Which means when
> we switch contexts (as opposed to just adding more to the ringbuffer of
> the current one) we won't have any idea which context will be the next
> one. Which also means we don't know which request to pick to retire the
> old context. Hence why I think we need to be better.

But the scheduler does - it is also in charge of making sure the
retirement queue is in order. The essence is that we only actually pin
engine->last_context, which is chosen as we submit stuff to the hw.
 
> Of course we can first implement the legacy ctx scheme and then let John
> Harrison deal with the mess again, but he might not like that too much ;-)
> 
> The other upside of tracking the real ctx-no-longer-in-use with the ctx
> itself is that we don't need to pin anything ever (I think), at least
> conceptually. But decidedly less sure about that ...

Right. There's still the reservation phase, but after that the pin just
tracks the hw.
-Chris
Daniel Vetter Oct. 13, 2015, 11:29 a.m. UTC | #10
On Fri, Oct 09, 2015 at 06:23:50PM +0100, Chris Wilson wrote:
> On Fri, Oct 09, 2015 at 07:18:21PM +0200, Daniel Vetter wrote:
> > On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
> > > On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
> > > > My idea was to create a new request for 3. which gets signalled by the
> > > > scheduler in intel_lrc_irq_handler. My idea was that we'd only create
> > > > these when a ctx switch might occur to avoid overhead, but I guess if we
> > > > just outright delay all requests a notch if need that might work too. But
> > > > I'm really not sure on the implications of that (i.e. does the hardware
> > > > really unlod the ctx if it's idle?), and whether that would fly still with
> > > > the scheduler.
> > > >
> > > > But figuring this one out here seems to be the cornestone of this reorg.
> > > > Without it we can't just throw contexts onto the active list.
> > > 
> > > (Let me see if I understand it correctly)
> > > 
> > > Basically the problem is that we can't trust the context object to be
> > > synchronized until after the status interrupt. The way we handled that
> > > for legacy is to track the currently bound context and keep the
> > > vma->pin_count asserted until the request containing the switch away.
> > > Doing the same for execlists would trivially fix the issue and if done
> > > smartly allows us to share more code (been there, done that).
> > > 
> > > That satisfies me for keeping requests as a basic fence in the GPU
> > > timeline and should keep everyone happy that the context can't vanish
> > > until after it is complete. The only caveat is that we cannot evict the
> > > most recent context. For legacy, we do a switch back to the always
> > > pinned default context. For execlists we don't, but it still means we
> > > should only have one context which cannot be evicted (like legacy). But
> > > it does leave us with the issue that i915_gpu_idle() returns early and
> > > i915_gem_context_fini() must keep the explicit gpu reset to be
> > > absolutely sure that the pending context writes are completed before the
> > > final context is unbound.
> > 
> > Yes, and that was what I originally had in mind. Meanwhile the scheduler
> > (will) happen and that means we won't have FIFO ordering. Which means when
> > we switch contexts (as opposed to just adding more to the ringbuffer of
> > the current one) we won't have any idea which context will be the next
> > one. Which also means we don't know which request to pick to retire the
> > old context. Hence why I think we need to be better.
> 
> But the scheduler does - it is also in charge of making sure the
> retirement queue is in order. The essence is that we only actually pin
> engine->last_context, which is chosen as we submit stuff to the hw.

Well I'm not sure how much it will reorder, but I'd expect it wants to
reorder stuff pretty freely. And as soon as it reorders contexts (ofc they
can't depend on each other) then the legacy hw ctx tracking won't work.

I think at least ...
-Daniel
Chris Wilson Oct. 13, 2015, 11:36 a.m. UTC | #11
On Tue, Oct 13, 2015 at 01:29:56PM +0200, Daniel Vetter wrote:
> On Fri, Oct 09, 2015 at 06:23:50PM +0100, Chris Wilson wrote:
> > On Fri, Oct 09, 2015 at 07:18:21PM +0200, Daniel Vetter wrote:
> > > On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
> > > > On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
> > > > > My idea was to create a new request for 3. which gets signalled by the
> > > > > scheduler in intel_lrc_irq_handler. My idea was that we'd only create
> > > > > these when a ctx switch might occur to avoid overhead, but I guess if we
> > > > > just outright delay all requests a notch if needed, that might work too. But
> > > > > I'm really not sure on the implications of that (i.e. does the hardware
> > > > > really unload the ctx if it's idle?), and whether that would still fly with
> > > > > the scheduler.
> > > > >
> > > > > But figuring this one out here seems to be the cornerstone of this reorg.
> > > > > Without it we can't just throw contexts onto the active list.
> > > > 
> > > > (Let me see if I understand it correctly)
> > > > 
> > > > Basically the problem is that we can't trust the context object to be
> > > > synchronized until after the status interrupt. The way we handled that
> > > > for legacy is to track the currently bound context and keep the
> > > > vma->pin_count asserted until the request containing the switch away.
> > > > Doing the same for execlists would trivially fix the issue and if done
> > > > smartly allows us to share more code (been there, done that).
> > > > 
> > > > That satisfies me for keeping requests as a basic fence in the GPU
> > > > timeline and should keep everyone happy that the context can't vanish
> > > > until after it is complete. The only caveat is that we cannot evict the
> > > > most recent context. For legacy, we do a switch back to the always
> > > > pinned default context. For execlists we don't, but it still means we
> > > > should only have one context which cannot be evicted (like legacy). But
> > > > it does leave us with the issue that i915_gpu_idle() returns early and
> > > > i915_gem_context_fini() must keep the explicit gpu reset to be
> > > > absolutely sure that the pending context writes are completed before the
> > > > final context is unbound.
> > > 
> > > Yes, and that was what I originally had in mind. Meanwhile the scheduler
> > > (will) happen and that means we won't have FIFO ordering. Which means when
> > > we switch contexts (as opposed to just adding more to the ringbuffer of
> > > the current one) we won't have any idea which context will be the next
> > > one. Which also means we don't know which request to pick to retire the
> > > old context. Hence why I think we need to be better.
> > 
> > But the scheduler does - it is also in charge of making sure the
> > retirement queue is in order. The essence is that we only actually pin
> > engine->last_context, which is chosen as we submit stuff to the hw.
> 
> Well I'm not sure how much it will reorder, but I'd expect it wants to
> reorder stuff pretty freely. And as soon as it reorders contexts (ofc they
> can't depend on one another) then the legacy hw ctx tracking won't work.
> 
> I think at least ...

Not the way it is written today, but the principle behind it still
stands. The last_context submitted to the hardware is pinned until a new
one is submitted (such that it remains bound in the GGTT until after the
context switch is complete due to the active reference). Instead of
doing the context tracking at the start of the execbuffer, the context
tracking needs to be pushed down to the submission backend/middleman.
-Chris
Dave Gordon Oct. 14, 2015, 2:42 p.m. UTC | #12
On 13/10/15 12:36, Chris Wilson wrote:
> On Tue, Oct 13, 2015 at 01:29:56PM +0200, Daniel Vetter wrote:
>> On Fri, Oct 09, 2015 at 06:23:50PM +0100, Chris Wilson wrote:
>>> On Fri, Oct 09, 2015 at 07:18:21PM +0200, Daniel Vetter wrote:
>>>> On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
>>>>> On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
>>>>>> My idea was to create a new request for 3. which gets signalled by the
>>>>>> scheduler in intel_lrc_irq_handler. My idea was that we'd only create
>>>>>> these when a ctx switch might occur to avoid overhead, but I guess if we
>>>>>> just outright delay all requests a notch if needed, that might work too. But
>>>>>> I'm really not sure on the implications of that (i.e. does the hardware
>>>>>> really unload the ctx if it's idle?), and whether that would still fly with
>>>>>> the scheduler.
>>>>>>
>>>>>> But figuring this one out here seems to be the cornerstone of this reorg.
>>>>>> Without it we can't just throw contexts onto the active list.
>>>>>
>>>>> (Let me see if I understand it correctly)
>>>>>
>>>>> Basically the problem is that we can't trust the context object to be
>>>>> synchronized until after the status interrupt. The way we handled that
>>>>> for legacy is to track the currently bound context and keep the
>>>>> vma->pin_count asserted until the request containing the switch away.
>>>>> Doing the same for execlists would trivially fix the issue and if done
>>>>> smartly allows us to share more code (been there, done that).
>>>>>
>>>>> That satisfies me for keeping requests as a basic fence in the GPU
>>>>> timeline and should keep everyone happy that the context can't vanish
>>>>> until after it is complete. The only caveat is that we cannot evict the
>>>>> most recent context. For legacy, we do a switch back to the always
>>>>> pinned default context. For execlists we don't, but it still means we
>>>>> should only have one context which cannot be evicted (like legacy). But
>>>>> it does leave us with the issue that i915_gpu_idle() returns early and
>>>>> i915_gem_context_fini() must keep the explicit gpu reset to be
>>>>> absolutely sure that the pending context writes are completed before the
>>>>> final context is unbound.
>>>>
>>>> Yes, and that was what I originally had in mind. Meanwhile the scheduler
>>>> (will) happen and that means we won't have FIFO ordering. Which means when
>>>> we switch contexts (as opposed to just adding more to the ringbuffer of
>>>> the current one) we won't have any idea which context will be the next
>>>> one. Which also means we don't know which request to pick to retire the
>>>> old context. Hence why I think we need to be better.
>>>
>>> But the scheduler does - it is also in charge of making sure the
>>> retirement queue is in order. The essence is that we only actually pin
>>> engine->last_context, which is chosen as we submit stuff to the hw.
>>
>> Well I'm not sure how much it will reorder, but I'd expect it wants to
>> reorder stuff pretty freely. And as soon as it reorders contexts (ofc they
>> can't depend on one another) then the legacy hw ctx tracking won't work.
>>
>> I think at least ...
>
> Not the way it is written today, but the principle behind it still
> stands. The last_context submitted to the hardware is pinned until a new
> one is submitted (such that it remains bound in the GGTT until after the
> context switch is complete due to the active reference). Instead of
> doing the context tracking at the start of the execbuffer, the context
> tracking needs to be pushed down to the submission backend/middleman.
> -Chris

Does anyone actually know what guarantees (if any) the GPU provides 
w.r.t. access to context images vs. USER_INTERRUPTs and CSB-update 
interrupts? Does 'active->idle' really mean that the context has been 
fully updated in memory (and can therefore be unmapped), or just that
the engine has stopped processing (but the context might not be saved 
until it's known that it isn't going to be reactivated)?

For example, it could implement this:

(End of last batch in current context)
	1.	Update seqno
	2.	Generate USER_INTERRUPT
	3.	Engine finishes work
		(HEAD == TAIL and no further contexts queued in ELSP)
	4.	Save all per-context registers to context image
	5.	Flush to memory and invalidate
	6.	Update CSB
	7.	Flush to memory
	8.	Generate CSB-update interrupt.

(New batch in same context submitted via ELSP)
	9.	Reload entire context image from memory
	10.	Update CSB
	11.	Generate CSB-update interrupt

Or this:
	1. Update seqno
	2. Generate USER_INTERRUPT
	3. Engine finishes work
		(HEAD == TAIL and no further contexts queued in ELSP)
	4. Update CSB
	5. Generate CSB-update interrupt.

(New batch in DIFFERENT context submitted via ELSP)
	6. Save all per-context registers to old context image
	7. Load entire context image from new image
	8. Update CSB
	9. Generate CSB-update interrupt

The former is synchronous and relatively easy to model; the latter is 
more like the way legacy mode works. Various other permutations are 
possible (sync save vs async save vs deferred save, full reload vs 
lite-restore, etc). So I think we either need to know what really 
happens (and assume future chips will work the same way), or make only 
minimal assumptions and code something that will work no matter how the 
hardware actually behaves. That probably precludes any attempt at 
tracking individual context-switches at the CSB level, which in any case 
aren't passed to the CPU in GuC submission mode.
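
As a concrete illustration of the minimal-assumptions route, the
retireability test can be written so that it holds under either of the
orderings above. This is essentially i915_gem_request_retireable() from
the patch below, restated as a sketch (only the helper name here is made
up):

static bool request_ctx_saved(struct drm_i915_gem_request *req)
{
	/* Seqno has passed: the engine has executed beyond the request. */
	if (!i915_gem_request_completed(req, true))
		return false;

	/* Never sent to the ELSP (legacy path, or merged into a later
	 * submission), so there is no context save to wait for. */
	if (!req->elsp_submitted)
		return true;

	/* Only trust the context image once the context-switch interrupt
	 * has reported the context as switched out. */
	return req->ctx_complete;
}

Whether the save turns out to be synchronous or deferred, nothing touches
the context image before the CSB has said the hardware is done with it.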

.Dave.
Nick Hoath Oct. 14, 2015, 4:19 p.m. UTC | #13
On 14/10/2015 15:42, Dave Gordon wrote:
> On 13/10/15 12:36, Chris Wilson wrote:
>> On Tue, Oct 13, 2015 at 01:29:56PM +0200, Daniel Vetter wrote:
>>> On Fri, Oct 09, 2015 at 06:23:50PM +0100, Chris Wilson wrote:
>>>> On Fri, Oct 09, 2015 at 07:18:21PM +0200, Daniel Vetter wrote:
>>>>> On Fri, Oct 09, 2015 at 10:45:35AM +0100, Chris Wilson wrote:
>>>>>> On Fri, Oct 09, 2015 at 11:15:08AM +0200, Daniel Vetter wrote:
>>>>>>> My idea was to create a new request for 3. which gets signalled by the
>>>>>>> scheduler in intel_lrc_irq_handler. My idea was that we'd only create
>>>>>>> these when a ctx switch might occur to avoid overhead, but I guess if we
>>>>>>> just outright delay all requests a notch if needed, that might work too. But
>>>>>>> I'm really not sure on the implications of that (i.e. does the hardware
>>>>>>> really unload the ctx if it's idle?), and whether that would still fly with
>>>>>>> the scheduler.
>>>>>>>
>>>>>>> But figuring this one out here seems to be the cornerstone of this reorg.
>>>>>>> Without it we can't just throw contexts onto the active list.
>>>>>>
>>>>>> (Let me see if I understand it correctly)
>>>>>>
>>>>>> Basically the problem is that we can't trust the context object to be
>>>>>> synchronized until after the status interrupt. The way we handled that
>>>>>> for legacy is to track the currently bound context and keep the
>>>>>> vma->pin_count asserted until the request containing the switch away.
>>>>>> Doing the same for execlists would trivially fix the issue and if done
>>>>>> smartly allows us to share more code (been there, done that).
>>>>>>
>>>>>> That satisfies me for keeping requests as a basic fence in the GPU
>>>>>> timeline and should keep everyone happy that the context can't vanish
>>>>>> until after it is complete. The only caveat is that we cannot evict the
>>>>>> most recent context. For legacy, we do a switch back to the always
>>>>>> pinned default context. For execlists we don't, but it still means we
>>>>>> should only have one context which cannot be evicted (like legacy). But
>>>>>> it does leave us with the issue that i915_gpu_idle() returns early and
>>>>>> i915_gem_context_fini() must keep the explicit gpu reset to be
>>>>>> absolutely sure that the pending context writes are completed before the
>>>>>> final context is unbound.
>>>>>
>>>>> Yes, and that was what I originally had in mind. Meanwhile the scheduler
>>>>> (will) happen and that means we won't have FIFO ordering. Which means when
>>>>> we switch contexts (as opposed to just adding more to the ringbuffer of
>>>>> the current one) we won't have any idea which context will be the next
>>>>> one. Which also means we don't know which request to pick to retire the
>>>>> old context. Hence why I think we need to be better.
>>>>
>>>> But the scheduler does - it is also in charge of making sure the
>>>> retirement queue is in order. The essence is that we only actually pin
>>>> engine->last_context, which is chosen as we submit stuff to the hw.
>>>
>>> Well I'm not sure how much it will reorder, but I'd expect it wants to
>>> reorder stuff pretty freely. And as soon as it reorders contexts (ofc they
>>> can't depend on one another) then the legacy hw ctx tracking won't work.
>>>
>>> I think at least ...
>>
>> Not the way it is written today, but the principle behind it still
>> stands. The last_context submitted to the hardware is pinned until a new
>> one is submitted (such that it remains bound in the GGTT until after the
>> context switch is complete due to the active reference). Instead of
>> doing the context tracking at the start of the execbuffer, the context
>> tracking needs to be pushed down to the submission backend/middleman.
>> -Chris
>
> Does anyone actually know what guarantees (if any) the GPU provides
> w.r.t. access to context images vs. USER_INTERRUPTs and CSB-update
> interrupts? Does 'active->idle' really mean that the context has been
> fully updated in memory (and can therefore be unmapped), or just that
> the engine has stopped processing (but the context might not be saved
> until it's known that it isn't going to be reactivated)?
>
> For example, it could implement this:
>
> (End of last batch in current context)
> 	1.	Update seqno
> 	2.	Generate USER_INTERRUPT
> 	3.	Engine finishes work
> 		(HEAD == TAIL and no further contexts queued in ELSP)
> 	4.	Save all per-context registers to context image
> 	5.	Flush to memory and invalidate
> 	6.	Update CSB
> 	7.	Flush to memory
> 	8.	Generate CSB-update interrupt.
>
> (New batch in same context submitted via ELSP)
> 	9.	Reload entire context image from memory
> 	10.	Update CSB
> 	11.	Generate CSB-update interrupt
>
> Or this:
> 	1. Update seqno
> 	2. Generate USER_INTERRUPT
> 	3. Engine finishes work
> 		(HEAD == TAIL and no further contexts queued in ELSP)
> 	4. Update CSB
> 	5. Generate CSB-update interrupt.
>
> (New batch in DIFFERENT context submitted via ELSP)
> 	6. Save all per-context registers to old context image
> 	7. Load entire context image from new image
> 	8. Update CSB
> 	9. Generate CSB-update interrupt
>
> The former is synchronous and relatively easy to model; the latter is
> more like the way legacy mode works. Various other permutations are
> possible (sync save vs async save vs deferred save, full reload vs
> lite-restore, etc). So I think we either need to know what really
> happens (and assume future chips will work the same way), or make only
> minimal assumptions and code something that will work no matter how the
> hardware actually behaves. That probably precludes any attempt at
> tracking individual context-switches at the CSB level, which in any case
> aren't passed to the CPU in GuC submission mode.
>
> .Dave.

Tracking contexts via last_context at request retirement (rough C sketch below).

In LRC/ELSP mode:
	At startup:
		- Double refcount default context
		- Set last_context to default context

	When a request is complete
		- If last_context == current_context
			- queue request for cleanup
		- If last_context != current_context
			- unref last_context
			- update last_context to current_context
			- queue request for cleanup

What this achieves:
	Makes the code path closer to legacy submission
	Allows active_list tracking to be used for contexts & ringbufs

Additional work 1:
	- When there is no work pending on an engine, at some point:
		- Send a nop request on the default context
			This moves last_context to the default context,
			allowing previous last_context to be unref'd

Additional work 2:
	- Change legacy mode to use last_context after request completion
		This will allow us to unify the code paths.
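
A rough C sketch of the retirement-time tracking outlined above. The
helper name is made up, taking a reference on the new last_context here
stands in for the startup double-reference on the default context, and
the "queue request for cleanup" step is left out; ring->last_context is
the field the legacy path already uses:

static void lr_retire_update_last_context(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *ring = req->ring;
	struct intel_context *ctx = req->ctx;

	/* Same context as last time: the hw may still be using it, so
	 * nothing can be released yet. */
	if (ring->last_context == ctx)
		return;

	/* A request on a different context has completed, so the hw has
	 * switched away from (and saved) the previous last_context and
	 * the reference keeping it around can be dropped. */
	if (ring->last_context)
		i915_gem_context_unreference(ring->last_context);

	i915_gem_context_reference(ctx);
	ring->last_context = ctx;
}

This would sit roughly where the per-request intel_lr_context_unpin()
call is in i915_gem_retire_requests_ring() in the patch below.
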
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fbf0ae9..3d217f9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2262,6 +2262,12 @@  struct drm_i915_gem_request {
 	/** Execlists no. of times this request has been sent to the ELSP */
 	int elsp_submitted;
 
+	/**
+	 * Execlists: whether this request's context has completed after
+	 * submission to the ELSP
+	 */
+	bool ctx_complete;
+
 };
 
 int i915_gem_request_alloc(struct intel_engine_cs *ring,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 52642af..fc82171 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1386,6 +1386,24 @@  __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
 				       typeof(*tmp), list);
 
 		i915_gem_request_retire(tmp);
+
+		if (i915.enable_execlists) {
+			struct intel_context *ctx = tmp->ctx;
+			struct drm_i915_private *dev_priv =
+				engine->dev->dev_private;
+			unsigned long flags;
+			struct drm_i915_gem_object *ctx_obj =
+				ctx->engine[engine->id].state;
+
+			spin_lock_irqsave(&engine->execlist_lock, flags);
+
+			if (ctx_obj && (ctx != engine->default_context))
+				intel_lr_context_unpin(tmp);
+
+			intel_runtime_pm_put(dev_priv);
+			spin_unlock_irqrestore(&engine->execlist_lock, flags);
+		}
+
 	} while (tmp != req);
 
 	WARN_ON(i915_verify_lists(engine->dev));
@@ -2359,6 +2377,12 @@  void i915_vma_move_to_active(struct i915_vma *vma,
 	list_move_tail(&vma->mm_list, &vma->vm->active_list);
 }
 
+static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
+{
+	return (i915_gem_request_completed(req, true) &&
+		(!req->elsp_submitted || req->ctx_complete));
+}
+
 static void
 i915_gem_object_retire__write(struct drm_i915_gem_object *obj)
 {
@@ -2829,10 +2853,28 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 					   struct drm_i915_gem_request,
 					   list);
 
-		if (!i915_gem_request_completed(request, true))
+		if (!i915_gem_request_retireable(request))
 			break;
 
 		i915_gem_request_retire(request);
+
+		if (i915.enable_execlists) {
+			struct intel_context *ctx = request->ctx;
+			struct drm_i915_private *dev_priv =
+				ring->dev->dev_private;
+			unsigned long flags;
+			struct drm_i915_gem_object *ctx_obj =
+				ctx->engine[ring->id].state;
+
+			spin_lock_irqsave(&ring->execlist_lock, flags);
+
+			if (ctx_obj && (ctx != ring->default_context))
+				intel_lr_context_unpin(request);
+
+			intel_runtime_pm_put(dev_priv);
+			spin_unlock_irqrestore(&ring->execlist_lock, flags);
+		}
+
 	}
 
 	/* Move any buffers on the active list that are no longer referenced
@@ -2848,12 +2890,14 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 
 		if (!list_empty(&obj->last_read_req[ring->id]->list))
 			break;
+		if (!i915_gem_request_retireable(obj->last_read_req[ring->id]))
+			break;
 
 		i915_gem_object_retire__read(obj, ring->id);
 	}
 
 	if (unlikely(ring->trace_irq_req &&
-		     i915_gem_request_completed(ring->trace_irq_req, true))) {
+		     i915_gem_request_retireable(ring->trace_irq_req))) {
 		ring->irq_put(ring);
 		i915_gem_request_assign(&ring->trace_irq_req, NULL);
 	}
@@ -2872,15 +2916,6 @@  i915_gem_retire_requests(struct drm_device *dev)
 	for_each_ring(ring, dev_priv, i) {
 		i915_gem_retire_requests_ring(ring);
 		idle &= list_empty(&ring->request_list);
-		if (i915.enable_execlists) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&ring->execlist_lock, flags);
-			idle &= list_empty(&ring->execlist_queue);
-			spin_unlock_irqrestore(&ring->execlist_lock, flags);
-
-			intel_execlists_retire_requests(ring);
-		}
 	}
 
 	if (idle)
@@ -2956,12 +2991,14 @@  i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 		if (req == NULL)
 			continue;
 
-		if (list_empty(&req->list))
-			goto retire;
+		if (list_empty(&req->list)) {
+			if (i915_gem_request_retireable(req))
+				i915_gem_object_retire__read(obj, i);
+			continue;
+		}
 
-		if (i915_gem_request_completed(req, true)) {
+		if (i915_gem_request_retireable(req)) {
 			__i915_gem_request_retire__upto(req);
-retire:
 			i915_gem_object_retire__read(obj, i);
 		}
 	}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8ca772d..ebbafac 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1291,66 +1291,91 @@  static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 				       u32 master_ctl)
 {
 	irqreturn_t ret = IRQ_NONE;
+	bool need_notify;
 
 	if (master_ctl & (GEN8_GT_RCS_IRQ | GEN8_GT_BCS_IRQ)) {
-		u32 tmp = I915_READ_FW(GEN8_GT_IIR(0));
-		if (tmp) {
-			I915_WRITE_FW(GEN8_GT_IIR(0), tmp);
+		u32 iir = I915_READ_FW(GEN8_GT_IIR(0));
+
+		if (iir) {
+			I915_WRITE_FW(GEN8_GT_IIR(0), iir);
 			ret = IRQ_HANDLED;
 
-			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
-			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
+			need_notify = false;
+			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
+					GEN8_RCS_IRQ_SHIFT))
+				need_notify = intel_lrc_irq_handler(
+						&dev_priv->ring[RCS]);
+			if (iir & (GT_RENDER_USER_INTERRUPT <<
+					GEN8_RCS_IRQ_SHIFT) || need_notify)
 				notify_ring(&dev_priv->ring[RCS]);
 
-			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
-			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
+			need_notify = false;
+			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
+					GEN8_BCS_IRQ_SHIFT))
+				need_notify = intel_lrc_irq_handler(
+						&dev_priv->ring[BCS]);
+			if (iir & (GT_RENDER_USER_INTERRUPT <<
+					GEN8_BCS_IRQ_SHIFT) || need_notify)
 				notify_ring(&dev_priv->ring[BCS]);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT0)!\n");
 	}
 
 	if (master_ctl & (GEN8_GT_VCS1_IRQ | GEN8_GT_VCS2_IRQ)) {
-		u32 tmp = I915_READ_FW(GEN8_GT_IIR(1));
-		if (tmp) {
-			I915_WRITE_FW(GEN8_GT_IIR(1), tmp);
+		u32 iir = I915_READ_FW(GEN8_GT_IIR(1));
+
+		if (iir) {
+			I915_WRITE_FW(GEN8_GT_IIR(1), iir);
 			ret = IRQ_HANDLED;
 
-			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
-			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
+			need_notify = false;
+			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
+					GEN8_VCS1_IRQ_SHIFT))
+				need_notify = intel_lrc_irq_handler(
+						&dev_priv->ring[VCS]);
+			if (iir & (GT_RENDER_USER_INTERRUPT <<
+					GEN8_VCS1_IRQ_SHIFT) || need_notify)
 				notify_ring(&dev_priv->ring[VCS]);
 
-			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
-			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
+			need_notify = false;
+			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
+					GEN8_VCS2_IRQ_SHIFT))
+				need_notify = intel_lrc_irq_handler(
+						&dev_priv->ring[VCS2]);
+			if (iir & (GT_RENDER_USER_INTERRUPT <<
+					GEN8_VCS2_IRQ_SHIFT) || need_notify)
 				notify_ring(&dev_priv->ring[VCS2]);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT1)!\n");
 	}
 
 	if (master_ctl & GEN8_GT_VECS_IRQ) {
-		u32 tmp = I915_READ_FW(GEN8_GT_IIR(3));
-		if (tmp) {
-			I915_WRITE_FW(GEN8_GT_IIR(3), tmp);
+		u32 iir = I915_READ_FW(GEN8_GT_IIR(3));
+
+		if (iir) {
+			I915_WRITE_FW(GEN8_GT_IIR(3), iir);
 			ret = IRQ_HANDLED;
 
-			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VECS]);
-			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
+			need_notify = false;
+			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
+					GEN8_VECS_IRQ_SHIFT))
+				need_notify = intel_lrc_irq_handler(
+						&dev_priv->ring[VECS]);
+			if (iir & (GT_RENDER_USER_INTERRUPT <<
+					GEN8_VECS_IRQ_SHIFT) || need_notify)
 				notify_ring(&dev_priv->ring[VECS]);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT3)!\n");
 	}
 
 	if (master_ctl & GEN8_GT_PM_IRQ) {
-		u32 tmp = I915_READ_FW(GEN8_GT_IIR(2));
-		if (tmp & dev_priv->pm_rps_events) {
+		u32 iir = I915_READ_FW(GEN8_GT_IIR(2));
+
+		if (iir & dev_priv->pm_rps_events) {
 			I915_WRITE_FW(GEN8_GT_IIR(2),
-				      tmp & dev_priv->pm_rps_events);
+				      iir & dev_priv->pm_rps_events);
 			ret = IRQ_HANDLED;
-			gen6_rps_irq_handler(dev_priv, tmp);
+			gen6_rps_irq_handler(dev_priv, iir);
 		} else
 			DRM_ERROR("The master control interrupt lied (PM)!\n");
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 825fa7a..e8f5b6c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -426,9 +426,8 @@  static void execlists_context_unqueue(struct intel_engine_cs *ring)
 			/* Same ctx: ignore first request, as second request
 			 * will update tail past first request's workload */
 			cursor->elsp_submitted = req0->elsp_submitted;
+			req0->elsp_submitted = 0;
 			list_del(&req0->execlist_link);
-			list_add_tail(&req0->execlist_link,
-				&ring->execlist_retired_req_list);
 			req0 = cursor;
 		} else {
 			req1 = cursor;
@@ -478,11 +477,9 @@  static bool execlists_check_remove_request(struct intel_engine_cs *ring,
 		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
 			WARN(head_req->elsp_submitted == 0,
 			     "Never submitted head request\n");
-
 			if (--head_req->elsp_submitted <= 0) {
+				head_req->ctx_complete = 1;
 				list_del(&head_req->execlist_link);
-				list_add_tail(&head_req->execlist_link,
-					&ring->execlist_retired_req_list);
 				return true;
 			}
 		}
@@ -497,8 +494,9 @@  static bool execlists_check_remove_request(struct intel_engine_cs *ring,
  *
  * Check the unread Context Status Buffers and manage the submission of new
  * contexts to the ELSP accordingly.
+ * @return whether a context completed
  */
-void intel_lrc_irq_handler(struct intel_engine_cs *ring)
+bool intel_lrc_irq_handler(struct intel_engine_cs *ring)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	u32 status_pointer;
@@ -558,6 +556,8 @@  void intel_lrc_irq_handler(struct intel_engine_cs *ring)
 		   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
 				 ((u32)ring->next_context_status_buffer &
 				  GEN8_CSB_PTR_MASK) << 8));
+
+	return (submit_contexts != 0);
 }
 
 static int execlists_context_queue(struct drm_i915_gem_request *request)
@@ -569,8 +569,6 @@  static int execlists_context_queue(struct drm_i915_gem_request *request)
 	if (request->ctx != ring->default_context)
 		intel_lr_context_pin(request);
 
-	i915_gem_request_reference(request);
-
 	spin_lock_irq(&ring->execlist_lock);
 
 	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
@@ -588,8 +586,6 @@  static int execlists_context_queue(struct drm_i915_gem_request *request)
 			WARN(tail_req->elsp_submitted != 0,
 				"More than 2 already-submitted reqs queued\n");
 			list_del(&tail_req->execlist_link);
-			list_add_tail(&tail_req->execlist_link,
-				&ring->execlist_retired_req_list);
 		}
 	}
 
@@ -958,32 +954,6 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 	return 0;
 }
 
-void intel_execlists_retire_requests(struct intel_engine_cs *ring)
-{
-	struct drm_i915_gem_request *req, *tmp;
-	struct list_head retired_list;
-
-	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-	if (list_empty(&ring->execlist_retired_req_list))
-		return;
-
-	INIT_LIST_HEAD(&retired_list);
-	spin_lock_irq(&ring->execlist_lock);
-	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
-	spin_unlock_irq(&ring->execlist_lock);
-
-	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
-		struct intel_context *ctx = req->ctx;
-		struct drm_i915_gem_object *ctx_obj =
-				ctx->engine[ring->id].state;
-
-		if (ctx_obj && (ctx != ring->default_context))
-			intel_lr_context_unpin(req);
-		list_del(&req->execlist_link);
-		i915_gem_request_unreference(req);
-	}
-}
-
 void intel_logical_ring_stop(struct intel_engine_cs *ring)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
@@ -1938,7 +1908,6 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	init_waitqueue_head(&ring->irq_queue);
 
 	INIT_LIST_HEAD(&ring->execlist_queue);
-	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
 	spin_lock_init(&ring->execlist_lock);
 
 	ret = i915_cmd_parser_init_ring(ring);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 4e60d54..e6a4900 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -95,7 +95,7 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 			       struct list_head *vmas);
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
-void intel_lrc_irq_handler(struct intel_engine_cs *ring);
+bool intel_lrc_irq_handler(struct intel_engine_cs *ring);
 void intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 49fa41d..d99b167 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -264,7 +264,6 @@  struct  intel_engine_cs {
 	/* Execlists */
 	spinlock_t execlist_lock;
 	struct list_head execlist_queue;
-	struct list_head execlist_retired_req_list;
 	u8 next_context_status_buffer;
 	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
 	int		(*emit_request)(struct drm_i915_gem_request *request);