diff mbox

[3/3] drm/i915/execlists: Direct submit onto idle engines

Message ID 20180506224728.25450-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson May 6, 2018, 10:47 p.m. UTC
Bypass using the tasklet to submit the first request to HW, as the
tasklet may be deferred to ksoftirqd and at a minimum will add in
excess of 10us (and maybe tens of milliseconds) to our execution
latency. This latency reduction is most notable when execution flows
between engines.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_guc_submission.c |  1 +
 drivers/gpu/drm/i915/intel_lrc.c            | 64 +++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
 3 files changed, 60 insertions(+), 12 deletions(-)

Comments

Chris Wilson May 6, 2018, 10:52 p.m. UTC | #1
Quoting Chris Wilson (2018-05-06 23:47:28)
> +static void __submit_queue(struct intel_engine_cs *engine)
> +{
> +       struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> +       GEM_BUG_ON(!engine->i915->gt.awake);
> +
> +       /* Directly submit the first request to reduce the initial latency */
> +       if (!intel_engine_has_guc(engine) &&
> +           !port_isset(execlists->port) &&
> +           tasklet_trylock(&execlists->tasklet)) {
> +               if (__execlists_dequeue(engine))
> +                       execlists_submit_ports(engine);
> +               tasklet_unlock(&execlists->tasklet);
> +               return;
> +       }
> +
> +       __schedule_queue(engine);
> +}

> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..3d13835d4a87 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -569,6 +569,7 @@ struct intel_engine_cs {
>  #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>  #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>  #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> +#define I915_ENGINE_HAS_GUC          BIT(3)

I915_ENGINE_NO_DIRECT_SUBMISSION might be more apt, as I'll want this
bit for the virtual engine as well.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 62828e39ee26..d899a2e6fa7d 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -1255,6 +1255,7 @@  int intel_guc_submission_enable(struct intel_guc *guc)
 		engine->unpark = guc_submission_unpark;
 
 		engine->flags &= ~I915_ENGINE_SUPPORTS_STATS;
+		engine->flags |= I915_ENGINE_HAS_GUC;
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f9f4064dec0e..8a1ed31c4fc3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -554,7 +554,7 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
-static void execlists_dequeue(struct intel_engine_cs *engine)
+static bool __execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
@@ -564,6 +564,8 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	struct rb_node *rb;
 	bool submit = false;
 
+	lockdep_assert_held(&engine->timeline.lock);
+
 	/* Hardware submission is through 2 ports. Conceptually each port
 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
 	 * static for a context, and unique to each, so we only execute
@@ -585,7 +587,6 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
-	spin_lock_irq(&engine->timeline.lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
 
@@ -598,6 +599,8 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		 */
 		GEM_BUG_ON(!execlists_is_active(execlists,
 						EXECLISTS_ACTIVE_USER));
+		GEM_BUG_ON(execlists_is_active(execlists,
+					       EXECLISTS_ACTIVE_PREEMPT));
 		GEM_BUG_ON(!port_count(&port[0]));
 		if (port_count(&port[0]) > 1)
 			goto unlock;
@@ -745,12 +748,23 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	GEM_BUG_ON(execlists->first && !port_isset(execlists->port));
 
 unlock:
+	if (last)
+		execlists_user_begin(execlists, execlists->port);
+
+	return submit;
+}
+
+static void execlists_dequeue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	bool submit;
+
+	spin_lock_irq(&engine->timeline.lock);
+	submit = __execlists_dequeue(engine);
 	spin_unlock_irq(&engine->timeline.lock);
 
-	if (submit) {
-		execlists_user_begin(execlists, execlists->port);
+	if (submit)
 		execlists_submit_ports(engine);
-	}
 
 	GEM_BUG_ON(port_isset(execlists->port) &&
 		   !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
@@ -1147,16 +1161,41 @@  static void queue_request(struct intel_engine_cs *engine,
 		      &lookup_priolist(engine, node, prio)->requests);
 }
 
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
 {
 	engine->execlists.queue_priority = prio;
+}
+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
 	tasklet_hi_schedule(&engine->execlists.tasklet);
 }
 
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+
+	GEM_BUG_ON(!engine->i915->gt.awake);
+
+	/* Directly submit the first request to reduce the initial latency */
+	if (!intel_engine_has_guc(engine) &&
+	    !port_isset(execlists->port) &&
+	    tasklet_trylock(&execlists->tasklet)) {
+		if (__execlists_dequeue(engine))
+			execlists_submit_ports(engine);
+		tasklet_unlock(&execlists->tasklet);
+		return;
+	}
+
+	__schedule_queue(engine);
+}
+
 static void submit_queue(struct intel_engine_cs *engine, int prio)
 {
-	if (prio > engine->execlists.queue_priority)
-		__submit_queue(engine, prio);
+	if (prio > engine->execlists.queue_priority) {
+		__wakeup_queue(engine, prio);
+		__submit_queue(engine);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -1168,10 +1207,9 @@  static void execlists_submit_request(struct i915_request *request)
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	queue_request(engine, &request->sched, rq_prio(request));
-	submit_queue(engine, rq_prio(request));
-
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->sched.link));
+	submit_queue(engine, rq_prio(request));
 
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
@@ -1293,8 +1331,10 @@  static void execlists_schedule(struct i915_request *request,
 		}
 
 		if (prio > engine->execlists.queue_priority &&
-		    i915_sw_fence_done(&sched_to_request(node)->submit))
-			__submit_queue(engine, prio);
+		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
+			__wakeup_queue(engine, prio);
+			__schedule_queue(engine);
+		}
 	}
 
 	spin_unlock_irq(&engine->timeline.lock);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 010750e8ee44..3d13835d4a87 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -569,6 +569,7 @@  struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_HAS_GUC          BIT(3)
 	unsigned int flags;
 
 	/*
@@ -646,6 +647,12 @@  intel_engine_has_preemption(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
 }
 
+static inline bool
+intel_engine_has_guc(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_GUC;
+}
+
 static inline bool __execlists_need_preempt(int prio, int last)
 {
 	return prio > max(0, last);