--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -679,7 +679,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
bool submit = false;
struct rb_node *rb;
- spin_lock_irq(&engine->timeline.lock);
+ if (!intel_engine_direct_submit(engine))
+ spin_lock_irq(&engine->timeline.lock);
+
rb = execlists->first;
GEM_BUG_ON(rb_first(&execlists->queue) != rb);
@@ -750,7 +752,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
GEM_BUG_ON(execlists->first && !port_isset(execlists->port));
unlock:
- spin_unlock_irq(&engine->timeline.lock);
+ if (!intel_engine_direct_submit(engine))
+ spin_unlock_irq(&engine->timeline.lock);
}
static void guc_submission_tasklet(unsigned long data)
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -554,7 +554,7 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
}
-static void execlists_dequeue(struct intel_engine_cs *engine)
+static bool __execlists_dequeue(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
struct execlist_port *port = execlists->port;
@@ -564,6 +564,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
struct rb_node *rb;
bool submit = false;
+ lockdep_assert_held(&engine->timeline.lock);
+
/* Hardware submission is through 2 ports. Conceptually each port
* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
* static for a context, and unique to each, so we only execute
@@ -585,7 +587,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* and context switches) submission.
*/
- spin_lock_irq(&engine->timeline.lock);
rb = execlists->first;
GEM_BUG_ON(rb_first(&execlists->queue) != rb);
@@ -598,6 +599,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
*/
GEM_BUG_ON(!execlists_is_active(execlists,
EXECLISTS_ACTIVE_USER));
+ GEM_BUG_ON(execlists_is_active(execlists,
+ EXECLISTS_ACTIVE_PREEMPT));
GEM_BUG_ON(!port_count(&port[0]));
if (port_count(&port[0]) > 1)
goto unlock;
@@ -745,12 +748,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
GEM_BUG_ON(execlists->first && !port_isset(execlists->port));
unlock:
- spin_unlock_irq(&engine->timeline.lock);
-
- if (submit) {
+ if (last)
execlists_user_begin(execlists, execlists->port);
+
+ return submit;
+}
+
+static void execlists_dequeue(struct intel_engine_cs *engine)
+{
+ struct intel_engine_execlists * const execlists = &engine->execlists;
+ bool submit;
+
+ if (!intel_engine_direct_submit(engine))
+ spin_lock_irq(&engine->timeline.lock);
+
+ submit = __execlists_dequeue(engine);
+
+ if (!intel_engine_direct_submit(engine))
+ spin_unlock_irq(&engine->timeline.lock);
+
+ if (submit)
execlists_submit_ports(engine);
- }
GEM_BUG_ON(port_isset(execlists->port) &&
!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
@@ -1147,16 +1165,41 @@ static void queue_request(struct intel_engine_cs *engine,
&lookup_priolist(engine, node, prio)->requests);
}
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
{
engine->execlists.queue_priority = prio;
+}
+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
tasklet_hi_schedule(&engine->execlists.tasklet);
}
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+ struct intel_engine_execlists * const execlists = &engine->execlists;
+
+ GEM_BUG_ON(!engine->i915->gt.awake);
+
+ /* Directly submit the first request to reduce the initial latency */
+ if (!port_isset(execlists->port) &&
+ tasklet_trylock(&execlists->tasklet)) {
+ engine->flags |= I915_ENGINE_DIRECT_SUBMIT;
+ execlists->tasklet.func(execlists->tasklet.data);
+ engine->flags &= ~I915_ENGINE_DIRECT_SUBMIT;
+ tasklet_unlock(&execlists->tasklet);
+ return;
+ }
+
+ __schedule_queue(engine);
+}
+
static void submit_queue(struct intel_engine_cs *engine, int prio)
{
- if (prio > engine->execlists.queue_priority)
- __submit_queue(engine, prio);
+ if (prio > engine->execlists.queue_priority) {
+ __wakeup_queue(engine, prio);
+ __submit_queue(engine);
+ }
}
static void execlists_submit_request(struct i915_request *request)
@@ -1168,10 +1211,9 @@ static void execlists_submit_request(struct i915_request *request)
spin_lock_irqsave(&engine->timeline.lock, flags);
queue_request(engine, &request->sched, rq_prio(request));
- submit_queue(engine, rq_prio(request));
-
GEM_BUG_ON(!engine->execlists.first);
GEM_BUG_ON(list_empty(&request->sched.link));
+ submit_queue(engine, rq_prio(request));
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
@@ -1293,8 +1335,10 @@ static void execlists_schedule(struct i915_request *request,
}
if (prio > engine->execlists.queue_priority &&
- i915_sw_fence_done(&sched_to_request(node)->submit))
- __submit_queue(engine, prio);
+ i915_sw_fence_done(&sched_to_request(node)->submit)) {
+ __wakeup_queue(engine, prio);
+ __schedule_queue(engine);
+ }
}
spin_unlock_irq(&engine->timeline.lock);
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -569,6 +569,7 @@ struct intel_engine_cs {
#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
#define I915_ENGINE_SUPPORTS_STATS BIT(1)
#define I915_ENGINE_HAS_PREEMPTION BIT(2)
+#define I915_ENGINE_DIRECT_SUBMIT BIT(3)
unsigned int flags;
/*
@@ -646,6 +647,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
return engine->flags & I915_ENGINE_HAS_PREEMPTION;
}
+static inline bool
+intel_engine_direct_submit(const struct intel_engine_cs *engine)
+{
+ return engine->flags & I915_ENGINE_DIRECT_SUBMIT;
+}
+
static inline bool __execlists_need_preempt(int prio, int last)
{
return prio > max(0, last);
Bypass using the tasklet to submit the first request to HW, as the
tasklet may be deferred unto ksoftirqd and at a minimum will add in
excess of 10us (and maybe tens of milliseconds) to our execution
latency. This latency reduction is most notable when execution flows
between engines.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_guc_submission.c |  7 ++-
 drivers/gpu/drm/i915/intel_lrc.c            | 70 +++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
 3 files changed, 69 insertions(+), 15 deletions(-)
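For reference, a minimal sketch of the pattern at the heart of this
change follows; it is illustrative only, not the driver code. The
struct and function names (struct engine, engine_dequeue,
engine_submit, ENGINE_DIRECT_SUBMIT) are hypothetical stand-ins; only
the tasklet_*, spin_*_irq() and BIT() calls are real kernel API, using
the pre-5.9 tasklet callback signature this patch targets. The
submitter runs the tasklet callback synchronously under
tasklet_trylock() when it can, rather than always paying for
tasklet_hi_schedule() and a softirq (possibly ksoftirqd) round trip,
and raises a flag so the callback knows the engine lock is already
held by the caller:

#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>

#define ENGINE_DIRECT_SUBMIT BIT(3)	/* stand-in for I915_ENGINE_DIRECT_SUBMIT */

struct engine {
	struct tasklet_struct tasklet;	/* registered via tasklet_init(&e->tasklet,
					 * engine_dequeue, (unsigned long)e) */
	spinlock_t lock;		/* stand-in for engine->timeline.lock */
	unsigned int flags;
};

/* Tasklet callback: elide the lock when the direct-submit caller holds it. */
static void engine_dequeue(unsigned long data)
{
	struct engine *e = (struct engine *)data;

	if (!(e->flags & ENGINE_DIRECT_SUBMIT))
		spin_lock_irq(&e->lock);

	/* ... move queued requests onto the HW submission ports ... */

	if (!(e->flags & ENGINE_DIRECT_SUBMIT))
		spin_unlock_irq(&e->lock);
}

/* Called with e->lock held and irqs off, as in execlists_submit_request().
 * The non-atomic flag updates are safe only because they happen under the
 * same lock the callback would otherwise take.
 */
static void engine_submit(struct engine *e)
{
	if (tasklet_trylock(&e->tasklet)) {
		e->flags |= ENGINE_DIRECT_SUBMIT;
		e->tasklet.func(e->tasklet.data);	/* run dequeue inline */
		e->flags &= ~ENGINE_DIRECT_SUBMIT;
		tasklet_unlock(&e->tasklet);
		return;
	}

	tasklet_hi_schedule(&e->tasklet);	/* contended: defer to softirq */
}

Note that the patch only takes the direct path for the first request,
while the ports are still empty (!port_isset(execlists->port)), which
is exactly where the tasklet latency shows up; once the hardware is
busy, submission continues to flow through the interrupt-driven
tasklet as before.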