[v2] drm/i915: Only enqueue already completed requests

Message ID 20190923103231.27025-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Sept. 23, 2019, 10:32 a.m. UTC
If we are asked to submit a completed request, just move it onto the
active-list without modifying its payload. If we try to emit the
modified payload of a completed request, we risk racing with the
ring->head update during retirement, which may advance the head past our
breadcrumb, and so we generate a warning for the emission being behind
the RING_HEAD.

v2: Commentary for the sneaky, shared responsibility between functions.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 60 +++++++++++++++++------------
 drivers/gpu/drm/i915/i915_request.c | 44 +++++++++++++++------
 drivers/gpu/drm/i915/i915_request.h |  2 +-
 3 files changed, 68 insertions(+), 38 deletions(-)
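
Stripped of the driver specifics, the pattern this patch introduces looks
roughly like the self-contained C sketch below. The names and types are
illustrative stand-ins, not the driver's own (the real logic lives in
__i915_request_submit() and execlists_dequeue() in the diff that follows):
submitting an already completed request skips the payload emission
entirely, performs only an idempotent transfer onto the active list, and
reports to the caller that there is nothing new to track.

#include <stdbool.h>

struct sketch_request {
	bool completed;	/* stands in for i915_request_completed() */
	bool active;	/* stands in for I915_FENCE_FLAG_ACTIVE */
};

static void emit_fini_breadcrumb(struct sketch_request *rq)
{
	/* placeholder for writing the breadcrumb into the ring */
}

/*
 * Returns true if the request was freshly submitted and must be
 * tracked by the caller (e.g. in the ELSP); false if it had already
 * completed, in which case the ring payload is never touched.
 */
static bool sketch_submit(struct sketch_request *rq)
{
	bool result = false;

	if (rq->completed)
		goto xfer;	/* never write behind RING_HEAD */

	emit_fini_breadcrumb(rq);
	result = true;

xfer:
	if (!rq->active) {	/* no-op if already on the active list */
		rq->active = true;
		/* i.e. list_move_tail(&rq->link, &engine->active_list) */
	}

	return result;
}

int main(void)
{
	struct sketch_request rq = { .completed = true, .active = false };

	/* A completed request transfers, but reports no new submission. */
	return sketch_submit(&rq) ? 1 : 0;
}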

Comments

Tvrtko Ursulin Sept. 23, 2019, 10:44 a.m. UTC | #1
On 23/09/2019 11:32, Chris Wilson wrote:
> If we are asked to submit a completed request, just move it onto the
> active-list without modifying its payload. If we try to emit the
> modified payload of a completed request, we risk racing with the
> ring->head update during retirement, which may advance the head past our
> breadcrumb, and so we generate a warning for the emission being behind
> the RING_HEAD.
> 
> v2: Commentary for the sneaky, shared responsibility between functions.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c | 60 +++++++++++++++++------------
>   drivers/gpu/drm/i915/i915_request.c | 44 +++++++++++++++------
>   drivers/gpu/drm/i915/i915_request.h |  2 +-
>   3 files changed, 68 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 0a4812ebd184..8c1ea5c315ac 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -799,6 +799,17 @@ static bool can_merge_rq(const struct i915_request *prev,
>   	GEM_BUG_ON(prev == next);
>   	GEM_BUG_ON(!assert_priority_queue(prev, next));
>   
> +	/*
> +	 * We not submit known completed requests. Therefore if the next

"We _do_ not submit"?

> +	 * request is already completed, we can pretend to merge it in
> +	 * with the previous context (and we will skip updating the ELSP
> +	 * and tracking). Thus hopefully keeping the ELSP full with active
> +	 * contexts, despite the best efforts of preempt-to-busy to confuse
> +	 * us.
> +	 */
> +	if (i915_request_completed(next))
> +		return true;

It works with the current use of can_merge_rq but leaves a bit of a 
concern for the future. I did not come up with any interesting 
GEM_BUG_ONs to add. I was thinking along the lines of making sure we 
never end up coalescing different contexts to the same port. But no 
ideas how to do that.

> +
>   	if (!can_merge_ctx(prev->hw_context, next->hw_context))
>   		return false;
>   
> @@ -1181,21 +1192,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				continue;
>   			}
>   
> -			if (i915_request_completed(rq)) {
> -				ve->request = NULL;
> -				ve->base.execlists.queue_priority_hint = INT_MIN;
> -				rb_erase_cached(rb, &execlists->virtual);
> -				RB_CLEAR_NODE(rb);
> -
> -				rq->engine = engine;
> -				__i915_request_submit(rq);
> -
> -				spin_unlock(&ve->base.active.lock);
> -
> -				rb = rb_first_cached(&execlists->virtual);
> -				continue;
> -			}
> -
>   			if (last && !can_merge_rq(last, rq)) {
>   				spin_unlock(&ve->base.active.lock);
>   				return; /* leave this for another */
> @@ -1249,11 +1245,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				GEM_BUG_ON(ve->siblings[0] != engine);
>   			}
>   
> -			__i915_request_submit(rq);
> -			if (!i915_request_completed(rq)) {
> +			if (__i915_request_submit(rq)) {
>   				submit = true;
>   				last = rq;
>   			}
> +
> +			/*
> +			 * Hmm, we have a bunch of virtual engine requests,
> +			 * but the first one was already complete (thanks

Complete or completed? Not sure myself..

> +			 * preempt-to-busy!). Keep looking at the veng queue
> +			 * until we have no more relevent requests (i.e.

relevant

> +			 * the normal submit queue has higher priority).
> +			 */
> +			if (!submit) {
> +				spin_unlock(&ve->base.active.lock);
> +				rb = rb_first_cached(&execlists->virtual);
> +				continue;
> +			}
>   		}
>   
>   		spin_unlock(&ve->base.active.lock);
> @@ -1266,8 +1274,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		int i;
>   
>   		priolist_for_each_request_consume(rq, rn, p, i) {
> -			if (i915_request_completed(rq))
> -				goto skip;
> +			bool merge = true;
>   
>   			/*
>   			 * Can we combine this request with the current port?
> @@ -1308,14 +1315,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				    ctx_single_port_submission(rq->hw_context))
>   					goto done;
>   
> -				*port = execlists_schedule_in(last, port - execlists->pending);
> -				port++;
> +				merge = false;
>   			}
>   
> -			last = rq;
> -			submit = true;
> -skip:
> -			__i915_request_submit(rq);
> +			if (__i915_request_submit(rq)) {
> +				if (!merge) {
> +					*port = execlists_schedule_in(last, port - execlists->pending);
> +					port++;
> +				}
> +				submit = true;
> +				last = rq;
> +			}
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 9bd8538b1907..0ca43ca15ca0 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -377,9 +377,10 @@ __i915_request_await_execution(struct i915_request *rq,
>   	return 0;
>   }
>   
> -void __i915_request_submit(struct i915_request *request)
> +bool __i915_request_submit(struct i915_request *request)
>   {
>   	struct intel_engine_cs *engine = request->engine;
> +	bool result = false;
>   
>   	GEM_TRACE("%s fence %llx:%lld, current %d\n",
>   		  engine->name,
> @@ -389,6 +390,25 @@ void __i915_request_submit(struct i915_request *request)
>   	GEM_BUG_ON(!irqs_disabled());
>   	lockdep_assert_held(&engine->active.lock);
>   
> +	/*
> +	 * With the advent of preempt-to-busy, we frequently encounter
> +	 * requests that we have unsubmitted from HW, but left running
> +	 * until the next ack and have completed in the meantime. On
> +	 * resubmission of that completed request, we can skip
> +	 * updating the payload (and execlists can even skip submitting
> +	 * the request).
> +	 *
> +	 * We must remove the request from the caller's priority queue,
> +	 * and the caller must only call us when the request is in their
> +	 * priority queue, under the active.lock. This ensures that the
> +	 * request has *not* yet been retired and we can safely move
> +	 * the request into the engine->active.list where it will be
> +	 * dropped upon retiring. (Otherwise, if we resubmit a *retired*
> +	 * request, this would be a horrible use-after-free.)
> +	 */
> +	if (i915_request_completed(request))
> +		goto xfer;
> +
>   	if (i915_gem_context_is_banned(request->gem_context))
>   		i915_request_skip(request, -EIO);
>   
> @@ -412,13 +432,18 @@ void __i915_request_submit(struct i915_request *request)
>   	    i915_sw_fence_signaled(&request->semaphore))
>   		engine->saturated |= request->sched.semaphores;
>   
> -	/* We may be recursing from the signal callback of another i915 fence */
> -	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
> +	engine->emit_fini_breadcrumb(request,
> +				     request->ring->vaddr + request->postfix);
> +
> +	trace_i915_request_execute(request);
> +	engine->serial++;
> +	result = true;
>   
> -	list_move_tail(&request->sched.link, &engine->active.requests);
> +xfer:	/* We may be recursing from the signal callback of another i915 fence */
> +	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
>   
> -	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
> -	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
> +	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags))
> +		list_move_tail(&request->sched.link, &engine->active.requests);
>   
>   	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
>   	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
> @@ -429,12 +454,7 @@ void __i915_request_submit(struct i915_request *request)
>   
>   	spin_unlock(&request->lock);
>   
> -	engine->emit_fini_breadcrumb(request,
> -				     request->ring->vaddr + request->postfix);
> -
> -	engine->serial++;
> -
> -	trace_i915_request_execute(request);
> +	return result;
>   }
>   
>   void i915_request_submit(struct i915_request *request)
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index b18f49528ded..ec5bb4c2e5ae 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -292,7 +292,7 @@ int i915_request_await_execution(struct i915_request *rq,
>   
>   void i915_request_add(struct i915_request *rq);
>   
> -void __i915_request_submit(struct i915_request *request);
> +bool __i915_request_submit(struct i915_request *request);
>   void i915_request_submit(struct i915_request *request);
>   
>   void i915_request_skip(struct i915_request *request, int error);
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson Sept. 23, 2019, 10:56 a.m. UTC | #2
Quoting Tvrtko Ursulin (2019-09-23 11:44:01)
> 
> On 23/09/2019 11:32, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > index 0a4812ebd184..8c1ea5c315ac 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > @@ -799,6 +799,17 @@ static bool can_merge_rq(const struct i915_request *prev,
> >       GEM_BUG_ON(prev == next);
> >       GEM_BUG_ON(!assert_priority_queue(prev, next));
> >   
> > +     /*
> > +      * We not submit known completed requests. Therefore if the next
> 
> "We _do_ not submit"?

Verbs are for the weak.

> > +      * request is already completed, we can pretend to merge it in
> > +      * with the previous context (and we will skip updating the ELSP
> > +      * and tracking). Thus hopefully keeping the ELSP full with active
> > +      * contexts, despite the best efforts of preempt-to-busy to confuse
> > +      * us.
> > +      */
> > +     if (i915_request_completed(next))
> > +             return true;
> 
> It works with the current use of can_merge_rq but leaves a bit of a 
> concern for the future. I did not come up with any interesting 
> GEM_BUG_ONs to add. I was thinking along the lines of making sure we 
> never end up coalescing different contexts to the same port. But no 
> ideas how to do that.

That should just be

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 96dd95d8252c..924cd51e1140 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1322,7 +1322,13 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                                if (!merge) {
                                        *port = execlists_schedule_in(last, port - execlists->pending);
                                        port++;
+                                       last = NULL;
                                }
+
+                               GEM_BUG_ON(last &&
+                                          !can_merge_ctx(last->hw_context,
+                                                         rq->hw_context));
+
                                submit = true;
                                last = rq;
                        }

> > @@ -1249,11 +1245,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >                               GEM_BUG_ON(ve->siblings[0] != engine);
> >                       }
> >   
> > -                     __i915_request_submit(rq);
> > -                     if (!i915_request_completed(rq)) {
> > +                     if (__i915_request_submit(rq)) {
> >                               submit = true;
> >                               last = rq;
> >                       }
> > +
> > +                     /*
> > +                      * Hmm, we have a bunch of virtual engine requests,
> > +                      * but the first one was already complete (thanks
> 
> Complete or completed? Not sure myself..

Hmm. I suppose you can treat it as complete == "whole", completed ==
"finished". +d for consistency.
-Chris
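
For reference, folding Chris's follow-up GEM_BUG_ON into the v2 hunk gives
an inner loop body along these lines (a sketch assembled from the two
diffs in this thread, not necessarily the code as finally merged):

			if (__i915_request_submit(rq)) {
				if (!merge) {
					*port = execlists_schedule_in(last, port - execlists->pending);
					port++;
					last = NULL;
				}

				GEM_BUG_ON(last &&
					   !can_merge_ctx(last->hw_context,
							  rq->hw_context));

				submit = true;
				last = rq;
			}

Clearing last before advancing the port means the assertion can only trip
if two requests with incompatible contexts would be coalesced into the
same port, which is the invariant Tvrtko was looking to guard.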

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 0a4812ebd184..8c1ea5c315ac 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -799,6 +799,17 @@  static bool can_merge_rq(const struct i915_request *prev,
 	GEM_BUG_ON(prev == next);
 	GEM_BUG_ON(!assert_priority_queue(prev, next));
 
+	/*
+	 * We do not submit known completed requests. Therefore if the next
+	 * request is already completed, we can pretend to merge it in
+	 * with the previous context (and we will skip updating the ELSP
+	 * and tracking). Thus hopefully keeping the ELSP full with active
+	 * contexts, despite the best efforts of preempt-to-busy to confuse
+	 * us.
+	 */
+	if (i915_request_completed(next))
+		return true;
+
 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
 		return false;
 
@@ -1181,21 +1192,6 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 				continue;
 			}
 
-			if (i915_request_completed(rq)) {
-				ve->request = NULL;
-				ve->base.execlists.queue_priority_hint = INT_MIN;
-				rb_erase_cached(rb, &execlists->virtual);
-				RB_CLEAR_NODE(rb);
-
-				rq->engine = engine;
-				__i915_request_submit(rq);
-
-				spin_unlock(&ve->base.active.lock);
-
-				rb = rb_first_cached(&execlists->virtual);
-				continue;
-			}
-
 			if (last && !can_merge_rq(last, rq)) {
 				spin_unlock(&ve->base.active.lock);
 				return; /* leave this for another */
@@ -1249,11 +1245,23 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 				GEM_BUG_ON(ve->siblings[0] != engine);
 			}
 
-			__i915_request_submit(rq);
-			if (!i915_request_completed(rq)) {
+			if (__i915_request_submit(rq)) {
 				submit = true;
 				last = rq;
 			}
+
+			/*
+			 * Hmm, we have a bunch of virtual engine requests,
+			 * but the first one was already completed (thanks
+			 * preempt-to-busy!). Keep looking at the veng queue
+			 * until we have no more relevant requests (i.e.
+			 * the normal submit queue has higher priority).
+			 */
+			if (!submit) {
+				spin_unlock(&ve->base.active.lock);
+				rb = rb_first_cached(&execlists->virtual);
+				continue;
+			}
 		}
 
 		spin_unlock(&ve->base.active.lock);
@@ -1266,8 +1274,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		int i;
 
 		priolist_for_each_request_consume(rq, rn, p, i) {
-			if (i915_request_completed(rq))
-				goto skip;
+			bool merge = true;
 
 			/*
 			 * Can we combine this request with the current port?
@@ -1308,14 +1315,17 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 				    ctx_single_port_submission(rq->hw_context))
 					goto done;
 
-				*port = execlists_schedule_in(last, port - execlists->pending);
-				port++;
+				merge = false;
 			}
 
-			last = rq;
-			submit = true;
-skip:
-			__i915_request_submit(rq);
+			if (__i915_request_submit(rq)) {
+				if (!merge) {
+					*port = execlists_schedule_in(last, port - execlists->pending);
+					port++;
+				}
+				submit = true;
+				last = rq;
+			}
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9bd8538b1907..0ca43ca15ca0 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -377,9 +377,10 @@  __i915_request_await_execution(struct i915_request *rq,
 	return 0;
 }
 
-void __i915_request_submit(struct i915_request *request)
+bool __i915_request_submit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
+	bool result = false;
 
 	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  engine->name,
@@ -389,6 +390,25 @@  void __i915_request_submit(struct i915_request *request)
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->active.lock);
 
+	/*
+	 * With the advent of preempt-to-busy, we frequently encounter
+	 * requests that we have unsubmitted from HW, but left running
+	 * until the next ack and have completed in the meantime. On
+	 * resubmission of that completed request, we can skip
+	 * updating the payload (and execlists can even skip submitting
+	 * the request).
+	 *
+	 * We must remove the request from the caller's priority queue,
+	 * and the caller must only call us when the request is in their
+	 * priority queue, under the active.lock. This ensures that the
+	 * request has *not* yet been retired and we can safely move
+	 * the request into the engine->active.list where it will be
+	 * dropped upon retiring. (Otherwise, if we resubmit a *retired*
+	 * request, this would be a horrible use-after-free.)
+	 */
+	if (i915_request_completed(request))
+		goto xfer;
+
 	if (i915_gem_context_is_banned(request->gem_context))
 		i915_request_skip(request, -EIO);
 
@@ -412,13 +432,18 @@  void __i915_request_submit(struct i915_request *request)
 	    i915_sw_fence_signaled(&request->semaphore))
 		engine->saturated |= request->sched.semaphores;
 
-	/* We may be recursing from the signal callback of another i915 fence */
-	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+	engine->emit_fini_breadcrumb(request,
+				     request->ring->vaddr + request->postfix);
+
+	trace_i915_request_execute(request);
+	engine->serial++;
+	result = true;
 
-	list_move_tail(&request->sched.link, &engine->active.requests);
+xfer:	/* We may be recursing from the signal callback of another i915 fence */
+	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
 
-	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
-	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
+	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags))
+		list_move_tail(&request->sched.link, &engine->active.requests);
 
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
 	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
@@ -429,12 +454,7 @@  void __i915_request_submit(struct i915_request *request)
 
 	spin_unlock(&request->lock);
 
-	engine->emit_fini_breadcrumb(request,
-				     request->ring->vaddr + request->postfix);
-
-	engine->serial++;
-
-	trace_i915_request_execute(request);
+	return result;
 }
 
 void i915_request_submit(struct i915_request *request)
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index b18f49528ded..ec5bb4c2e5ae 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -292,7 +292,7 @@  int i915_request_await_execution(struct i915_request *rq,
 
 void i915_request_add(struct i915_request *rq);
 
-void __i915_request_submit(struct i915_request *request);
+bool __i915_request_submit(struct i915_request *request);
 void i915_request_submit(struct i915_request *request);
 
 void i915_request_skip(struct i915_request *request, int error);