@@ -8,8 +8,11 @@
#define _I915_TASKLET_H_
#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/interrupt.h>
+#include "i915_gem.h"
+
/**
* struct i915_tasklet - wrapper around tasklet_struct
*
@@ -19,6 +22,8 @@
*/
struct i915_tasklet {
struct tasklet_struct base;
+ unsigned long flags;
+#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
};
static inline void i915_tasklet_init(struct i915_tasklet *t,
@@ -43,6 +48,14 @@ static inline bool i915_tasklet_is_enabled(const struct i915_tasklet *t)
return likely(!atomic_read(&t->base.count));
}
+static inline bool i915_tasklet_is_direct_submit(const struct i915_tasklet *t)
+{
+ /* Only legal to be checked from inside the tasklet. */
+ GEM_BUG_ON(!i915_tasklet_is_locked(t));
+
+ return t->flags & I915_TASKLET_DIRECT_SUBMIT;
+}
+
static inline void i915_tasklet_schedule(struct i915_tasklet *t)
{
tasklet_hi_schedule(&t->base);
@@ -89,4 +102,15 @@ static inline void i915_tasklet_set_func(struct i915_tasklet *t,
tasklet_enable(&t->base);
}
+static inline void __i915_tasklet_run(const struct i915_tasklet *t)
+{
+ t->base.func(t->base.data);
+}
+
+static inline void i915_tasklet_run(const struct i915_tasklet *t)
+{
+ GEM_BUG_ON(!i915_tasklet_is_locked(t));
+ __i915_tasklet_run(t);
+}
+
#endif /* _I915_TASKLET_H_ */
@@ -754,14 +754,18 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
static void guc_dequeue(struct intel_engine_cs *engine)
{
- unsigned long flags;
+ unsigned long uninitialized_var(flags);
bool submit;
local_irq_save(flags);
- spin_lock(&engine->timeline.lock);
+ if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+ spin_lock(&engine->timeline.lock);
+
submit = __guc_dequeue(engine);
- spin_unlock(&engine->timeline.lock);
+
+ if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+ spin_unlock(&engine->timeline.lock);
if (submit)
guc_submit(engine);
@@ -356,13 +356,15 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
struct intel_engine_cs *engine =
container_of(execlists, typeof(*engine), execlists);
- unsigned long flags;
+ unsigned long uninitialized_var(flags);
- spin_lock_irqsave(&engine->timeline.lock, flags);
+ if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+ spin_lock_irqsave(&engine->timeline.lock, flags);
__unwind_incomplete_requests(engine);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
+ if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+ spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static inline void
@@ -601,6 +603,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
*/
GEM_BUG_ON(!execlists_is_active(execlists,
EXECLISTS_ACTIVE_USER));
+ GEM_BUG_ON(execlists_is_active(execlists,
+ EXECLISTS_ACTIVE_PREEMPT));
GEM_BUG_ON(!port_count(&port[0]));
if (port_count(&port[0]) > 1)
return false;
@@ -757,12 +761,16 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
static void execlists_dequeue(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
- unsigned long flags;
+ unsigned long uninitialized_var(flags);
bool submit;
- spin_lock_irqsave(&engine->timeline.lock, flags);
+ if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+ spin_lock_irqsave(&engine->timeline.lock, flags);
+
submit = __execlists_dequeue(engine);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+ if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+ spin_unlock_irqrestore(&engine->timeline.lock, flags);
if (submit)
execlists_submit_ports(engine);
@@ -1162,16 +1170,52 @@ static void queue_request(struct intel_engine_cs *engine,
&lookup_priolist(engine, prio)->requests);
}
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
{
engine->execlists.queue_priority = prio;
+}
+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
i915_tasklet_schedule(&engine->execlists.tasklet);
}
+static bool __direct_submit(struct intel_engine_execlists *const execlists)
+{
+ struct i915_tasklet * const t = &execlists->tasklet;
+
+ if (!tasklet_trylock(&t->base))
+ return false;
+
+ t->flags |= I915_TASKLET_DIRECT_SUBMIT;
+ i915_tasklet_run(t);
+ t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
+
+ tasklet_unlock(&t->base);
+ return true;
+}
+
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+ struct intel_engine_execlists * const execlists = &engine->execlists;
+
+ GEM_BUG_ON(!engine->i915->gt.awake);
+
+ /* If inside GPU reset, the tasklet will be queued later. */
+ if (!i915_tasklet_is_enabled(&execlists->tasklet))
+ return;
+
+ /* Directly submit the first request to reduce the initial latency */
+ if (port_isset(execlists->port) || !__direct_submit(execlists))
+ __schedule_queue(engine);
+}
+
static void submit_queue(struct intel_engine_cs *engine, int prio)
{
- if (prio > engine->execlists.queue_priority)
- __submit_queue(engine, prio);
+ if (prio > engine->execlists.queue_priority) {
+ __wakeup_queue(engine, prio);
+ __submit_queue(engine);
+ }
}
static void execlists_submit_request(struct i915_request *request)
@@ -1183,10 +1227,9 @@ static void execlists_submit_request(struct i915_request *request)
spin_lock_irqsave(&engine->timeline.lock, flags);
queue_request(engine, &request->sched, rq_prio(request));
- submit_queue(engine, rq_prio(request));
-
GEM_BUG_ON(!engine->execlists.first);
GEM_BUG_ON(list_empty(&request->sched.link));
+ submit_queue(engine, rq_prio(request));
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
@@ -1314,8 +1357,10 @@ static void execlists_schedule(struct i915_request *request,
}
if (prio > engine->execlists.queue_priority &&
- i915_sw_fence_done(&sched_to_request(node)->submit))
- __submit_queue(engine, prio);
+ i915_sw_fence_done(&sched_to_request(node)->submit)) {
+ __wakeup_queue(engine, prio);
+ __schedule_queue(engine);
+ }
}
spin_unlock_irq(&engine->timeline.lock);
Bypass using the tasklet to submit the first request to HW, as the tasklet may be deferred to ksoftirqd and at a minimum will add in excess of 10us (and maybe tens of milliseconds) to our execution latency. This latency reduction is most notable when execution flows between engines. v2: Beware handling preemption completion from the direct submit path as well. v3: Make the abuse clear and track our extra state inside i915_tasklet. Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/i915_tasklet.h | 24 +++++++ drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++- drivers/gpu/drm/i915/intel_lrc.c | 71 +++++++++++++++++---- 3 files changed, 89 insertions(+), 16 deletions(-)