| Message ID | 20180507135731.10587-5-chris@chris-wilson.co.uk (mailing list archive) |
|---|---|
| State | New, archived |
| Headers | show |
On 07/05/2018 14:57, Chris Wilson wrote: > Bypass using the tasklet to submit the first request to HW, as the > tasklet may be deferred unto ksoftirqd and at a minimum will add in > excess of 10us (and maybe tens of milliseconds) to our execution > latency. This latency reduction is most notable when execution flows > between engines. > > v2: Beware handling preemption completion from the direct submit path as > well. > > Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++- > drivers/gpu/drm/i915/intel_lrc.c | 66 +++++++++++++++++---- > drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +++ > 3 files changed, 69 insertions(+), 16 deletions(-) > > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c > index 2feb65096966..6bfe30af7826 100644 > --- a/drivers/gpu/drm/i915/intel_guc_submission.c > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c > @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine) > > static void guc_dequeue(struct intel_engine_cs *engine) > { > - unsigned long flags; > + unsigned long uninitialized_var(flags); > bool submit; > > local_irq_save(flags); > > - spin_lock(&engine->timeline.lock); > + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, > + &engine->execlists.tasklet.state)); Soon it will be time for i915_tasklet. :) > + if (!intel_engine_direct_submit(engine)) > + spin_lock(&engine->timeline.lock); A bit ugly both on the conditional locking and using engine->flags for transient purposes. Since you are locking the tasklet and own it (and open coding the call) completely when calling directly, you could just the same cheat and call a different function? 
> + > submit = __guc_dequeue(engine); > - spin_unlock(&engine->timeline.lock); > + > + if (!intel_engine_direct_submit(engine)) > + spin_unlock(&engine->timeline.lock); > > if (submit) > guc_submit(engine); > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 15c373ea5b7e..ac7c5edee4ee 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) > { > struct intel_engine_cs *engine = > container_of(execlists, typeof(*engine), execlists); > - unsigned long flags; > + unsigned long uninitialized_var(flags); > > - spin_lock_irqsave(&engine->timeline.lock, flags); > + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); > + if (!intel_engine_direct_submit(engine)) > + spin_lock_irqsave(&engine->timeline.lock, flags); > > __unwind_incomplete_requests(engine); > > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > + if (!intel_engine_direct_submit(engine)) > + spin_unlock_irqrestore(&engine->timeline.lock, flags); Hm ok yes, this one would be a problem.. Maybe at least use some bit under execlists state instead of engine flags? 
Regards, Tvrtko > } > > static inline void > @@ -602,6 +605,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine) > */ > GEM_BUG_ON(!execlists_is_active(execlists, > EXECLISTS_ACTIVE_USER)); > + GEM_BUG_ON(execlists_is_active(execlists, > + EXECLISTS_ACTIVE_PREEMPT)); > GEM_BUG_ON(!port_count(&port[0])); > if (port_count(&port[0]) > 1) > return false; > @@ -758,12 +763,17 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine) > static void execlists_dequeue(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > - unsigned long flags; > + unsigned long uninitialized_var(flags); > bool submit; > > - spin_lock_irqsave(&engine->timeline.lock, flags); > + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); > + if (!intel_engine_direct_submit(engine)) > + spin_lock_irqsave(&engine->timeline.lock, flags); > + > submit = __execlists_dequeue(engine); > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > + > + if (!intel_engine_direct_submit(engine)) > + spin_unlock_irqrestore(&engine->timeline.lock, flags); > > if (submit) > execlists_submit_ports(engine); > @@ -1163,16 +1173,45 @@ static void queue_request(struct intel_engine_cs *engine, > &lookup_priolist(engine, node, prio)->requests); > } > > -static void __submit_queue(struct intel_engine_cs *engine, int prio) > +static void __wakeup_queue(struct intel_engine_cs *engine, int prio) > { > engine->execlists.queue_priority = prio; > +} > + > +static void __schedule_queue(struct intel_engine_cs *engine) > +{ > tasklet_hi_schedule(&engine->execlists.tasklet); > } > > +static void __submit_queue(struct intel_engine_cs *engine) > +{ > + struct intel_engine_execlists * const execlists = &engine->execlists; > + struct tasklet_struct * const t = &execlists->tasklet; > + > + GEM_BUG_ON(!engine->i915->gt.awake); > + > + /* If inside GPU reset, the tasklet will be queued later. 
*/ > + if (unlikely(atomic_read(&t->count))) > + return; > + > + /* Directly submit the first request to reduce the initial latency */ > + if (!port_isset(execlists->port) && tasklet_trylock(t)) { > + engine->flags |= I915_ENGINE_DIRECT_SUBMIT; > + t->func(t->data); > + engine->flags &= ~I915_ENGINE_DIRECT_SUBMIT; > + tasklet_unlock(t); > + return; > + } > + > + __schedule_queue(engine); > +} > + > static void submit_queue(struct intel_engine_cs *engine, int prio) > { > - if (prio > engine->execlists.queue_priority) > - __submit_queue(engine, prio); > + if (prio > engine->execlists.queue_priority) { > + __wakeup_queue(engine, prio); > + __submit_queue(engine); > + } > } > > static void execlists_submit_request(struct i915_request *request) > @@ -1184,10 +1223,9 @@ static void execlists_submit_request(struct i915_request *request) > spin_lock_irqsave(&engine->timeline.lock, flags); > > queue_request(engine, &request->sched, rq_prio(request)); > - submit_queue(engine, rq_prio(request)); > - > GEM_BUG_ON(!engine->execlists.first); > GEM_BUG_ON(list_empty(&request->sched.link)); > + submit_queue(engine, rq_prio(request)); > > spin_unlock_irqrestore(&engine->timeline.lock, flags); > } > @@ -1309,8 +1347,10 @@ static void execlists_schedule(struct i915_request *request, > } > > if (prio > engine->execlists.queue_priority && > - i915_sw_fence_done(&sched_to_request(node)->submit)) > - __submit_queue(engine, prio); > + i915_sw_fence_done(&sched_to_request(node)->submit)) { > + __wakeup_queue(engine, prio); > + __schedule_queue(engine); > + } > } > > spin_unlock_irq(&engine->timeline.lock); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 010750e8ee44..f5545391d76a 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -569,6 +569,7 @@ struct intel_engine_cs { > #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) > #define I915_ENGINE_SUPPORTS_STATS BIT(1) > #define 
I915_ENGINE_HAS_PREEMPTION BIT(2) > +#define I915_ENGINE_DIRECT_SUBMIT BIT(3) > unsigned int flags; > > /* > @@ -646,6 +647,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine) > return engine->flags & I915_ENGINE_HAS_PREEMPTION; > } > > +static inline bool > +intel_engine_direct_submit(const struct intel_engine_cs *engine) > +{ > + return engine->flags & I915_ENGINE_DIRECT_SUBMIT; > +} > + > static inline bool __execlists_need_preempt(int prio, int last) > { > return prio > max(0, last); >
Quoting Tvrtko Ursulin (2018-05-08 11:23:09) > > On 07/05/2018 14:57, Chris Wilson wrote: > > Bypass using the tasklet to submit the first request to HW, as the > > tasklet may be deferred unto ksoftirqd and at a minimum will add in > > excess of 10us (and maybe tens of milliseconds) to our execution > > latency. This latency reduction is most notable when execution flows > > between engines. > > > > v2: Beware handling preemption completion from the direct submit path as > > well. > > > > Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > --- > > drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++- > > drivers/gpu/drm/i915/intel_lrc.c | 66 +++++++++++++++++---- > > drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +++ > > 3 files changed, 69 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c > > index 2feb65096966..6bfe30af7826 100644 > > --- a/drivers/gpu/drm/i915/intel_guc_submission.c > > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c > > @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine) > > > > static void guc_dequeue(struct intel_engine_cs *engine) > > { > > - unsigned long flags; > > + unsigned long uninitialized_var(flags); > > bool submit; > > > > local_irq_save(flags); > > > > - spin_lock(&engine->timeline.lock); > > + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, > > + &engine->execlists.tasklet.state)); > > Soon it will be time for i915_tasklet. :) > > > + if (!intel_engine_direct_submit(engine)) > > + spin_lock(&engine->timeline.lock); > > A bit ugly both on the conditional locking and using engine->flags for > transient purposes. > > Since you are locking the tasklet and own it (and open coding the call) > completely when calling directly, you could just the same cheat and call > a different function? 
My first attempt was to call __execlists_dequeue() directly and not tasklet->func(). But that then has this nasty if (tasklet->func == execlists_submission_tasklet) or some such in the middle of otherwise generic code. https://patchwork.freedesktop.org/patch/221105/ I was less happy about that. At least this does have the making of something more generic like i915_tasklet ;) > > submit = __guc_dequeue(engine); > > - spin_unlock(&engine->timeline.lock); > > + > > + if (!intel_engine_direct_submit(engine)) > > + spin_unlock(&engine->timeline.lock); > > > > if (submit) > > guc_submit(engine); > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > > index 15c373ea5b7e..ac7c5edee4ee 100644 > > --- a/drivers/gpu/drm/i915/intel_lrc.c > > +++ b/drivers/gpu/drm/i915/intel_lrc.c > > @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) > > { > > struct intel_engine_cs *engine = > > container_of(execlists, typeof(*engine), execlists); > > - unsigned long flags; > > + unsigned long uninitialized_var(flags); > > > > - spin_lock_irqsave(&engine->timeline.lock, flags); > > + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); > > + if (!intel_engine_direct_submit(engine)) > > + spin_lock_irqsave(&engine->timeline.lock, flags); > > > > __unwind_incomplete_requests(engine); > > > > - spin_unlock_irqrestore(&engine->timeline.lock, flags); > > + if (!intel_engine_direct_submit(engine)) > > + spin_unlock_irqrestore(&engine->timeline.lock, flags); > > Hm ok yes, this one would be a problem.. > > Maybe at least use some bit under execlists state instead of engine flags? But I have engine->flags :-p Could I steal a bit from tasklet.state? I tend to get funny looks everytime I ask for TASKLET_STATE_USER ;) -Chris
On 08/05/2018 11:40, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-08 11:23:09) >> >> On 07/05/2018 14:57, Chris Wilson wrote: >>> Bypass using the tasklet to submit the first request to HW, as the >>> tasklet may be deferred unto ksoftirqd and at a minimum will add in >>> excess of 10us (and maybe tens of milliseconds) to our execution >>> latency. This latency reduction is most notable when execution flows >>> between engines. >>> >>> v2: Beware handling preemption completion from the direct submit path as >>> well. >>> >>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> --- >>> drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++- >>> drivers/gpu/drm/i915/intel_lrc.c | 66 +++++++++++++++++---- >>> drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +++ >>> 3 files changed, 69 insertions(+), 16 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c >>> index 2feb65096966..6bfe30af7826 100644 >>> --- a/drivers/gpu/drm/i915/intel_guc_submission.c >>> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c >>> @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine) >>> >>> static void guc_dequeue(struct intel_engine_cs *engine) >>> { >>> - unsigned long flags; >>> + unsigned long uninitialized_var(flags); >>> bool submit; >>> >>> local_irq_save(flags); >>> >>> - spin_lock(&engine->timeline.lock); >>> + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, >>> + &engine->execlists.tasklet.state)); >> >> Soon it will be time for i915_tasklet. :) >> >>> + if (!intel_engine_direct_submit(engine)) >>> + spin_lock(&engine->timeline.lock); >> >> A bit ugly both on the conditional locking and using engine->flags for >> transient purposes. 
>> >> Since you are locking the tasklet and own it (and open coding the call) >> completely when calling directly, you could just the same cheat and call >> a different function? > > My first attempt was to call __execlists_dequeue() directly and not > tasklet->func(). But that then has this nasty > if (tasklet->func == execlists_submission_tasklet) I thought not call the t->func but func directly, well a special flavour of the func. But the unwind as noticed a bit later is the only one which throws the spanner in those works. Unfortunately I have no ideas at the moment on how to elegantly solve that. > or some such in the middle of otherwise generic code. > https://patchwork.freedesktop.org/patch/221105/ > > I was less happy about that. At least this does have the making of > something more generic like i915_tasklet ;) > >>> submit = __guc_dequeue(engine); >>> - spin_unlock(&engine->timeline.lock); >>> + >>> + if (!intel_engine_direct_submit(engine)) >>> + spin_unlock(&engine->timeline.lock); >>> >>> if (submit) >>> guc_submit(engine); >>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >>> index 15c373ea5b7e..ac7c5edee4ee 100644 >>> --- a/drivers/gpu/drm/i915/intel_lrc.c >>> +++ b/drivers/gpu/drm/i915/intel_lrc.c >>> @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) >>> { >>> struct intel_engine_cs *engine = >>> container_of(execlists, typeof(*engine), execlists); >>> - unsigned long flags; >>> + unsigned long uninitialized_var(flags); >>> >>> - spin_lock_irqsave(&engine->timeline.lock, flags); >>> + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); >>> + if (!intel_engine_direct_submit(engine)) >>> + spin_lock_irqsave(&engine->timeline.lock, flags); >>> >>> __unwind_incomplete_requests(engine); >>> >>> - spin_unlock_irqrestore(&engine->timeline.lock, flags); >>> + if (!intel_engine_direct_submit(engine)) >>> + spin_unlock_irqrestore(&engine->timeline.lock, 
flags); >> >> Hm ok yes, this one would be a problem.. >> >> Maybe at least use some bit under execlists state instead of engine flags? > > But I have engine->flags :-p Could I steal a bit from tasklet.state? I > tend to get funny looks everytime I ask for TASKLET_STATE_USER ;) We intended engine->flags to be stable for engine lifetime (effectively). So I don't like using it for this. Put a new flag/boolean to intel_execlists_state? Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c index 2feb65096966..6bfe30af7826 100644 --- a/drivers/gpu/drm/i915/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/intel_guc_submission.c @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine) static void guc_dequeue(struct intel_engine_cs *engine) { - unsigned long flags; + unsigned long uninitialized_var(flags); bool submit; local_irq_save(flags); - spin_lock(&engine->timeline.lock); + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, + &engine->execlists.tasklet.state)); + if (!intel_engine_direct_submit(engine)) + spin_lock(&engine->timeline.lock); + submit = __guc_dequeue(engine); - spin_unlock(&engine->timeline.lock); + + if (!intel_engine_direct_submit(engine)) + spin_unlock(&engine->timeline.lock); if (submit) guc_submit(engine); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 15c373ea5b7e..ac7c5edee4ee 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) { struct intel_engine_cs *engine = container_of(execlists, typeof(*engine), execlists); - unsigned long flags; + unsigned long uninitialized_var(flags); - spin_lock_irqsave(&engine->timeline.lock, flags); + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); + if (!intel_engine_direct_submit(engine)) + spin_lock_irqsave(&engine->timeline.lock, flags); __unwind_incomplete_requests(engine); - spin_unlock_irqrestore(&engine->timeline.lock, flags); + if (!intel_engine_direct_submit(engine)) + spin_unlock_irqrestore(&engine->timeline.lock, flags); } static inline void @@ -602,6 +605,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine) */ GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER)); + GEM_BUG_ON(execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)); 
GEM_BUG_ON(!port_count(&port[0])); if (port_count(&port[0]) > 1) return false; @@ -758,12 +763,17 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine) static void execlists_dequeue(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; - unsigned long flags; + unsigned long uninitialized_var(flags); bool submit; - spin_lock_irqsave(&engine->timeline.lock, flags); + GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state)); + if (!intel_engine_direct_submit(engine)) + spin_lock_irqsave(&engine->timeline.lock, flags); + submit = __execlists_dequeue(engine); - spin_unlock_irqrestore(&engine->timeline.lock, flags); + + if (!intel_engine_direct_submit(engine)) + spin_unlock_irqrestore(&engine->timeline.lock, flags); if (submit) execlists_submit_ports(engine); @@ -1163,16 +1173,45 @@ static void queue_request(struct intel_engine_cs *engine, &lookup_priolist(engine, node, prio)->requests); } -static void __submit_queue(struct intel_engine_cs *engine, int prio) +static void __wakeup_queue(struct intel_engine_cs *engine, int prio) { engine->execlists.queue_priority = prio; +} + +static void __schedule_queue(struct intel_engine_cs *engine) +{ tasklet_hi_schedule(&engine->execlists.tasklet); } +static void __submit_queue(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct tasklet_struct * const t = &execlists->tasklet; + + GEM_BUG_ON(!engine->i915->gt.awake); + + /* If inside GPU reset, the tasklet will be queued later. 
*/ + if (unlikely(atomic_read(&t->count))) + return; + + /* Directly submit the first request to reduce the initial latency */ + if (!port_isset(execlists->port) && tasklet_trylock(t)) { + engine->flags |= I915_ENGINE_DIRECT_SUBMIT; + t->func(t->data); + engine->flags &= ~I915_ENGINE_DIRECT_SUBMIT; + tasklet_unlock(t); + return; + } + + __schedule_queue(engine); +} + static void submit_queue(struct intel_engine_cs *engine, int prio) { - if (prio > engine->execlists.queue_priority) - __submit_queue(engine, prio); + if (prio > engine->execlists.queue_priority) { + __wakeup_queue(engine, prio); + __submit_queue(engine); + } } static void execlists_submit_request(struct i915_request *request) @@ -1184,10 +1223,9 @@ static void execlists_submit_request(struct i915_request *request) spin_lock_irqsave(&engine->timeline.lock, flags); queue_request(engine, &request->sched, rq_prio(request)); - submit_queue(engine, rq_prio(request)); - GEM_BUG_ON(!engine->execlists.first); GEM_BUG_ON(list_empty(&request->sched.link)); + submit_queue(engine, rq_prio(request)); spin_unlock_irqrestore(&engine->timeline.lock, flags); } @@ -1309,8 +1347,10 @@ static void execlists_schedule(struct i915_request *request, } if (prio > engine->execlists.queue_priority && - i915_sw_fence_done(&sched_to_request(node)->submit)) - __submit_queue(engine, prio); + i915_sw_fence_done(&sched_to_request(node)->submit)) { + __wakeup_queue(engine, prio); + __schedule_queue(engine); + } } spin_unlock_irq(&engine->timeline.lock); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 010750e8ee44..f5545391d76a 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -569,6 +569,7 @@ struct intel_engine_cs { #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) #define I915_ENGINE_SUPPORTS_STATS BIT(1) #define I915_ENGINE_HAS_PREEMPTION BIT(2) +#define I915_ENGINE_DIRECT_SUBMIT BIT(3) unsigned int flags; /* @@ -646,6 +647,12 @@ 
intel_engine_has_preemption(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_HAS_PREEMPTION; } +static inline bool +intel_engine_direct_submit(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_DIRECT_SUBMIT; +} + static inline bool __execlists_need_preempt(int prio, int last) { return prio > max(0, last);
Bypass using the tasklet to submit the first request to HW, as the tasklet may be deferred unto ksoftirqd and at a minimum will add in excess of 10us (and maybe tens of milliseconds) to our execution latency. This latency reduction is most notable when execution flows between engines. v2: Beware handling preemption completion from the direct submit path as well. Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++- drivers/gpu/drm/i915/intel_lrc.c | 66 +++++++++++++++++---- drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +++ 3 files changed, 69 insertions(+), 16 deletions(-)