[v2,10/15] drm/i915: Remove the preempted request from the execution queue

Message ID 20170222114610.5819-11-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Feb. 22, 2017, 11:46 a.m. UTC
After the request is cancelled, we need to remove it from the global
execution timeline and return it to the context timeline, the inverse
of submit_request().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c            | 58 +++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_request.h            |  3 ++
 drivers/gpu/drm/i915/intel_breadcrumbs.c           | 19 ++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h            |  6 ---
 drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c |  6 +++
 5 files changed, 83 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Feb. 22, 2017, 1:33 p.m. UTC | #1
On 22/02/2017 11:46, Chris Wilson wrote:
> After the request is cancelled, we need to remove it from the global
> execution timeline and return it to the context timeline, the inverse
> of submit_request().
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c            | 58 +++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_request.h            |  3 ++
>  drivers/gpu/drm/i915/intel_breadcrumbs.c           | 19 ++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h            |  6 ---
>  drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c |  6 +++
>  5 files changed, 83 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index d18f450977e0..97116e492d01 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -441,6 +441,55 @@ void i915_gem_request_submit(struct drm_i915_gem_request *request)
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  }
>
> +void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
> +{
> +	struct intel_engine_cs *engine = request->engine;
> +	struct intel_timeline *timeline;
> +
> +	assert_spin_locked(&engine->timeline->lock);
> +
> +	/* Only unwind in reverse order, required so that the per-context list
> +	 * is kept in seqno/ring order.
> +	 */
> +	GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);
> +	engine->timeline->seqno--;
> +
> +	/* We may be recursing from the signal callback of another i915 fence */

Copy-paste of the comment, or will there really be preemption triggered
from the signal callback?

> +	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
> +	request->global_seqno = 0;
> +	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> +		intel_engine_cancel_signaling(request);
> +	spin_unlock(&request->lock);
> +
> +	/* Transfer back from the global per-engine timeline to per-context */
> +	timeline = request->timeline;
> +	GEM_BUG_ON(timeline == engine->timeline);
> +
> +	spin_lock(&timeline->lock);
> +	list_move(&request->link, &timeline->requests);
> +	spin_unlock(&timeline->lock);
> +
> +	/* We don't need to wake_up any waiters on request->execute, they
> +	 * will get woken by any other event or us re-adding this request
> +	 * to the engine timeline (__i915_gem_request_submit()). The waiters
> +	 * should be quite adept at finding that the request now has a
> +	 * different global_seqno from the one they went to sleep on.
> +	 */
> +}
> +
> +void i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
> +{
> +	struct intel_engine_cs *engine = request->engine;
> +	unsigned long flags;
> +
> +	/* Will be called from irq-context when using foreign fences. */
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +
> +	__i915_gem_request_unsubmit(request);
> +
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +}
> +
>  static int __i915_sw_fence_call
>  submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  {
> @@ -1034,9 +1083,11 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>  	if (flags & I915_WAIT_LOCKED)
>  		add_wait_queue(errq, &reset);
>
> -	intel_wait_init(&wait, i915_gem_request_global_seqno(req));
> +	wait.tsk = current;
>
> +restart:
>  	reset_wait_queue(&req->execute, &exec);
> +	wait.seqno = i915_gem_request_global_seqno(req);

Not sure it is worth dropping intel_wait_init; I presume it is to avoid
assigning the task twice? It will still be the same task, so just moving
intel_wait_init here would be clearer.

>  	if (!wait.seqno) {
>  		do {
>  			set_current_state(state);
> @@ -1135,6 +1186,11 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>  		/* Only spin if we know the GPU is processing this request */
>  		if (i915_spin_request(req, state, 2))
>  			break;
> +
> +		if (i915_gem_request_global_seqno(req) != wait.seqno) {
> +			intel_engine_remove_wait(req->engine, &wait);
> +			goto restart;
> +		}
>  	}
>
>  	intel_engine_remove_wait(req->engine, &wait);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index b81f6709905c..5f73d8c0a38a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -274,6 +274,9 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
>  void __i915_gem_request_submit(struct drm_i915_gem_request *request);
>  void i915_gem_request_submit(struct drm_i915_gem_request *request);
>
> +void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request);
> +void i915_gem_request_unsubmit(struct drm_i915_gem_request *request);
> +
>  struct intel_rps_client;
>  #define NO_WAITBOOST ERR_PTR(-1)
>  #define IS_RPS_CLIENT(p) (!IS_ERR(p))
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 882e601ebb09..5bcad7872c08 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -453,7 +453,14 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
>  	spin_unlock_irq(&b->lock);
>  }
>
> -static bool signal_complete(struct drm_i915_gem_request *request)
> +static bool signal_valid(const struct drm_i915_gem_request *request)
> +{
> +	u32 seqno = READ_ONCE(request->global_seqno);
> +
> +	return seqno == request->signaling.wait.seqno;
> +}
> +
> +static bool signal_complete(const struct drm_i915_gem_request *request)
>  {
>  	if (!request)
>  		return false;
> @@ -462,7 +469,7 @@ static bool signal_complete(struct drm_i915_gem_request *request)
>  	 * signalled that this wait is already completed.
>  	 */
>  	if (intel_wait_complete(&request->signaling.wait))
> -		return true;
> +		return signal_valid(request);
>
>  	/* Carefully check if the request is complete, giving time for the
>  	 * seqno to be visible or if the GPU hung.
> @@ -542,13 +549,21 @@ static int intel_breadcrumbs_signaler(void *arg)
>
>  			i915_gem_request_put(request);
>  		} else {
> +			DEFINE_WAIT(exec);
> +
>  			if (kthread_should_stop()) {
>  				GEM_BUG_ON(request);
>  				break;
>  			}
>
> +			if (request)
> +				add_wait_queue(&request->execute, &exec);
> +
>  			schedule();
>
> +			if (request)
> +				remove_wait_queue(&request->execute, &exec);
> +

Not directly related, but it made me wonder why we are using
TASK_INTERRUPTIBLE in the signallers? Shouldn't it be
TASK_UNINTERRUPTIBLE and io_schedule()? Sounds a bit deja vu though,
maybe we have talked about it before...

>  			if (kthread_should_park())
>  				kthread_parkme();
>  		}
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 45d2c2fa946e..97fde79167a6 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -582,12 +582,6 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
>  /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
>  int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
>
> -static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
> -{
> -	wait->tsk = current;
> -	wait->seqno = seqno;
> -}
> -
>  static inline bool intel_wait_complete(const struct intel_wait *wait)
>  {
>  	return RB_EMPTY_NODE(&wait->node);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
> index 6426acc9fdca..62c020c7ea80 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
> @@ -28,6 +28,12 @@
>  #include "mock_gem_device.h"
>  #include "mock_engine.h"
>
> +static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
> +{
> +	wait->tsk = current;
> +	wait->seqno = seqno;
> +}
> +
>  static int check_rbtree(struct intel_engine_cs *engine,
>  			const unsigned long *bitmap,
>  			const struct intel_wait *waiters,
>

Regards,

Tvrtko
Chris Wilson Feb. 22, 2017, 1:40 p.m. UTC | #2
On Wed, Feb 22, 2017 at 01:33:22PM +0000, Tvrtko Ursulin wrote:
> 
> On 22/02/2017 11:46, Chris Wilson wrote:
> >+void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
> >+{
> >+	struct intel_engine_cs *engine = request->engine;
> >+	struct intel_timeline *timeline;
> >+
> >+	assert_spin_locked(&engine->timeline->lock);
> >+
> >+	/* Only unwind in reverse order, required so that the per-context list
> >+	 * is kept in seqno/ring order.
> >+	 */
> >+	GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);
> >+	engine->timeline->seqno--;
> >+
> >+	/* We may be recursing from the signal callback of another i915 fence */
> 
> Copy-paste of the comment, or will there really be preemption
> triggered from the signal callback?

I believe it may be. Say an RCS request was waiting on a BCS request,
and we decide to preempt, and can do so immediately. I think being
prepared for the same recursion here is prudent.
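
Roughly (a sketch only; bcs_signaled() and rcs_request are hypothetical
stand-ins for how preemption might be wired up, the locking comes from
this patch):

	/* dma_fence_signal() already holds the BCS request->lock when it
	 * runs this callback; unsubmitting the RCS request from here takes
	 * a second lock of the same class, which is why
	 * __i915_gem_request_unsubmit() uses
	 * spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING).
	 */
	static void bcs_signaled(struct dma_fence *fence,
				 struct dma_fence_cb *cb)
	{
		i915_gem_request_unsubmit(rcs_request);
	}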

> > static int __i915_sw_fence_call
> > submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> > {
> >@@ -1034,9 +1083,11 @@ long i915_wait_request(struct drm_i915_gem_request *req,
> > 	if (flags & I915_WAIT_LOCKED)
> > 		add_wait_queue(errq, &reset);
> >
> >-	intel_wait_init(&wait, i915_gem_request_global_seqno(req));
> >+	wait.tsk = current;
> >
> >+restart:
> > 	reset_wait_queue(&req->execute, &exec);
> >+	wait.seqno = i915_gem_request_global_seqno(req);
> 
> Not sure it is worth dropping intel_wait_init; I presume it is to avoid
> assigning the task twice? It will still be the same task, so just
> moving intel_wait_init here would be clearer.

I was thinking the opposite: since we are looking at wait.seqno directly
elsewhere, I wanted that to be clear. And current lives in a special
register, so why pay the cost of reloading it onto the stack :)

> >@@ -542,13 +549,21 @@ static int intel_breadcrumbs_signaler(void *arg)
> >
> > 			i915_gem_request_put(request);
> > 		} else {
> >+			DEFINE_WAIT(exec);
> >+
> > 			if (kthread_should_stop()) {
> > 				GEM_BUG_ON(request);
> > 				break;
> > 			}
> >
> >+			if (request)
> >+				add_wait_queue(&request->execute, &exec);
> >+
> > 			schedule();
> >
> >+			if (request)
> >+				remove_wait_queue(&request->execute, &exec);
> >+
> 
> Not directly related, but it made me wonder why we are using
> TASK_INTERRUPTIBLE in the signallers? Shouldn't it be
> TASK_UNINTERRUPTIBLE and io_schedule()? Sounds a bit deja vu though,
> maybe we have talked about it before...

It doesn't make any difference to the signalers, as they are kthreads
and shouldn't be interrupted, but it does make a difference to the
reported load, as TASK_UNINTERRUPTIBLE contributes to system load.
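
For reference, a condensed sketch of the sleep in question (paraphrasing
the loop in intel_breadcrumbs_signaler(), details elided):

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (signal_complete(request))
			break;
		/* Sleeping TASK_INTERRUPTIBLE keeps an idle signaler out
		 * of loadavg; TASK_UNINTERRUPTIBLE would count towards
		 * the reported system load while we merely wait.
		 */
		schedule();
	}
	__set_current_state(TASK_RUNNING);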
-Chris
Tvrtko Ursulin Feb. 22, 2017, 1:50 p.m. UTC | #3
On 22/02/2017 13:40, Chris Wilson wrote:
> On Wed, Feb 22, 2017 at 01:33:22PM +0000, Tvrtko Ursulin wrote:
>>
>> On 22/02/2017 11:46, Chris Wilson wrote:
>>> +void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
>>> +{
>>> +	struct intel_engine_cs *engine = request->engine;
>>> +	struct intel_timeline *timeline;
>>> +
>>> +	assert_spin_locked(&engine->timeline->lock);
>>> +
>>> +	/* Only unwind in reverse order, required so that the per-context list
>>> +	 * is kept in seqno/ring order.
>>> +	 */
>>> +	GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);
>>> +	engine->timeline->seqno--;
>>> +
>>> +	/* We may be recursing from the signal callback of another i915 fence */
>>
>> Copy-paste of the comment, or will there really be preemption
>> triggered from the signal callback?
>
> I believe it may be. Say an RCS request was waiting on a BCS request,
> and we decide to preempt, and can do so immediately. I think being
> prepared for the same recursion here is prudent.

Yeah, OK, I just wasn't sure at which level we will handle preemption.

>>> static int __i915_sw_fence_call
>>> submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>> {
>>> @@ -1034,9 +1083,11 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>>> 	if (flags & I915_WAIT_LOCKED)
>>> 		add_wait_queue(errq, &reset);
>>>
>>> -	intel_wait_init(&wait, i915_gem_request_global_seqno(req));
>>> +	wait.tsk = current;
>>>
>>> +restart:
>>> 	reset_wait_queue(&req->execute, &exec);
>>> +	wait.seqno = i915_gem_request_global_seqno(req);
>>
>> Not sure it is worth dropping intel_wait_init; I presume it is to avoid
>> assigning the task twice? It will still be the same task, so just
>> moving intel_wait_init here would be clearer.
>
> I was thinking the opposite: since we are looking at wait.seqno directly
> elsewhere, I wanted that to be clear. And current lives in a special
> register, so why pay the cost of reloading it onto the stack :)

I can see that, but intel_wait_init was so nice as a marker when reading
the code.

Maybe leave it and add intel_wait_update_seqno?
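
Something like this, say (a sketch only; the helper name follows the
suggestion above and does not exist in the tree):

	static inline void intel_wait_update_seqno(struct intel_wait *wait,
						   u32 seqno)
	{
		wait->seqno = seqno;
	}

	/* i915_wait_request() would then keep intel_wait_init(&wait, ...)
	 * before the restart label, and on each restart just do:
	 */
	intel_wait_update_seqno(&wait, i915_gem_request_global_seqno(req));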

Regards,

Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d18f450977e0..97116e492d01 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -441,6 +441,55 @@  void i915_gem_request_submit(struct drm_i915_gem_request *request)
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
+void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	struct intel_timeline *timeline;
+
+	assert_spin_locked(&engine->timeline->lock);
+
+	/* Only unwind in reverse order, required so that the per-context list
+	 * is kept in seqno/ring order.
+	 */
+	GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);
+	engine->timeline->seqno--;
+
+	/* We may be recursing from the signal callback of another i915 fence */
+	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+	request->global_seqno = 0;
+	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
+		intel_engine_cancel_signaling(request);
+	spin_unlock(&request->lock);
+
+	/* Transfer back from the global per-engine timeline to per-context */
+	timeline = request->timeline;
+	GEM_BUG_ON(timeline == engine->timeline);
+
+	spin_lock(&timeline->lock);
+	list_move(&request->link, &timeline->requests);
+	spin_unlock(&timeline->lock);
+
+	/* We don't need to wake_up any waiters on request->execute, they
+	 * will get woken by any other event or us re-adding this request
+	 * to the engine timeline (__i915_gem_request_submit()). The waiters
+	 * should be quite adept at finding that the request now has a
+	 * different global_seqno from the one they went to sleep on.
+	 */
+}
+
+void i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	unsigned long flags;
+
+	/* Will be called from irq-context when using foreign fences. */
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+
+	__i915_gem_request_unsubmit(request);
+
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+}
+
 static int __i915_sw_fence_call
 submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 {
@@ -1034,9 +1083,11 @@  long i915_wait_request(struct drm_i915_gem_request *req,
 	if (flags & I915_WAIT_LOCKED)
 		add_wait_queue(errq, &reset);
 
-	intel_wait_init(&wait, i915_gem_request_global_seqno(req));
+	wait.tsk = current;
 
+restart:
 	reset_wait_queue(&req->execute, &exec);
+	wait.seqno = i915_gem_request_global_seqno(req);
 	if (!wait.seqno) {
 		do {
 			set_current_state(state);
@@ -1135,6 +1186,11 @@  long i915_wait_request(struct drm_i915_gem_request *req,
 		/* Only spin if we know the GPU is processing this request */
 		if (i915_spin_request(req, state, 2))
 			break;
+
+		if (i915_gem_request_global_seqno(req) != wait.seqno) {
+			intel_engine_remove_wait(req->engine, &wait);
+			goto restart;
+		}
 	}
 
 	intel_engine_remove_wait(req->engine, &wait);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index b81f6709905c..5f73d8c0a38a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -274,6 +274,9 @@  void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 void __i915_gem_request_submit(struct drm_i915_gem_request *request);
 void i915_gem_request_submit(struct drm_i915_gem_request *request);
 
+void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request);
+void i915_gem_request_unsubmit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 882e601ebb09..5bcad7872c08 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -453,7 +453,14 @@  void intel_engine_remove_wait(struct intel_engine_cs *engine,
 	spin_unlock_irq(&b->lock);
 }
 
-static bool signal_complete(struct drm_i915_gem_request *request)
+static bool signal_valid(const struct drm_i915_gem_request *request)
+{
+	u32 seqno = READ_ONCE(request->global_seqno);
+
+	return seqno == request->signaling.wait.seqno;
+}
+
+static bool signal_complete(const struct drm_i915_gem_request *request)
 {
 	if (!request)
 		return false;
@@ -462,7 +469,7 @@  static bool signal_complete(struct drm_i915_gem_request *request)
 	 * signalled that this wait is already completed.
 	 */
 	if (intel_wait_complete(&request->signaling.wait))
-		return true;
+		return signal_valid(request);
 
 	/* Carefully check if the request is complete, giving time for the
 	 * seqno to be visible or if the GPU hung.
@@ -542,13 +549,21 @@  static int intel_breadcrumbs_signaler(void *arg)
 
 			i915_gem_request_put(request);
 		} else {
+			DEFINE_WAIT(exec);
+
 			if (kthread_should_stop()) {
 				GEM_BUG_ON(request);
 				break;
 			}
 
+			if (request)
+				add_wait_queue(&request->execute, &exec);
+
 			schedule();
 
+			if (request)
+				remove_wait_queue(&request->execute, &exec);
+
 			if (kthread_should_park())
 				kthread_parkme();
 		}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 45d2c2fa946e..97fde79167a6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -582,12 +582,6 @@  static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
 
-static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
-{
-	wait->tsk = current;
-	wait->seqno = seqno;
-}
-
 static inline bool intel_wait_complete(const struct intel_wait *wait)
 {
 	return RB_EMPTY_NODE(&wait->node);
diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
index 6426acc9fdca..62c020c7ea80 100644
--- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
@@ -28,6 +28,12 @@ 
 #include "mock_gem_device.h"
 #include "mock_engine.h"
 
+static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
+{
+	wait->tsk = current;
+	wait->seqno = seqno;
+}
+
 static int check_rbtree(struct intel_engine_cs *engine,
 			const unsigned long *bitmap,
 			const struct intel_wait *waiters,