diff mbox

[v2,5/7] drm/i915/execlists: Direct submit onto idle engines

Message ID 20180507135731.10587-5-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson May 7, 2018, 1:57 p.m. UTC
Bypass using the tasklet to submit the first request to HW, as the
tasklet may be deferred unto ksoftirqd and at a minimum will add in
excess of 10us (and maybe tens of milliseconds) to our execution
latency. This latency reduction is most notable when execution flows
between engines.

v2: Beware handling preemption completion from the direct submit path as
well.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++-
 drivers/gpu/drm/i915/intel_lrc.c            | 66 +++++++++++++++++----
 drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
 3 files changed, 69 insertions(+), 16 deletions(-)

Comments

Tvrtko Ursulin May 8, 2018, 10:23 a.m. UTC | #1
On 07/05/2018 14:57, Chris Wilson wrote:
> Bypass using the tasklet to submit the first request to HW, as the
> tasklet may be deferred unto ksoftirqd and at a minimum will add in
> excess of 10us (and maybe tens of milliseconds) to our execution
> latency. This latency reduction is most notable when execution flows
> between engines.
> 
> v2: Beware handling preemption completion from the direct submit path as
> well.
> 
> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++-
>   drivers/gpu/drm/i915/intel_lrc.c            | 66 +++++++++++++++++----
>   drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
>   3 files changed, 69 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 2feb65096966..6bfe30af7826 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>   
>   static void guc_dequeue(struct intel_engine_cs *engine)
>   {
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   	bool submit;
>   
>   	local_irq_save(flags);
>   
> -	spin_lock(&engine->timeline.lock);
> +	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN,
> +			     &engine->execlists.tasklet.state));

Soon it will be time for i915_tasklet. :)

> +	if (!intel_engine_direct_submit(engine))
> +		spin_lock(&engine->timeline.lock);

A bit ugly both on the conditional locking and using engine->flags for 
transient purposes.

Since you are locking the tasklet and own it (and open coding the call) 
completely when calling directly, you could just the same cheat and call 
a different function?

> +
>   	submit = __guc_dequeue(engine);
> -	spin_unlock(&engine->timeline.lock);
> +
> +	if (!intel_engine_direct_submit(engine))
> +		spin_unlock(&engine->timeline.lock);
>   
>   	if (submit)
>   		guc_submit(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 15c373ea5b7e..ac7c5edee4ee 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
>   {
>   	struct intel_engine_cs *engine =
>   		container_of(execlists, typeof(*engine), execlists);
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
> +	if (!intel_engine_direct_submit(engine))
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
>   
>   	__unwind_incomplete_requests(engine);
>   
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +	if (!intel_engine_direct_submit(engine))
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);

Hm ok yes, this one would be a problem..

Maybe at least use some bit under execlists state instead of engine flags?

Regards,

Tvrtko

>   }
>   
>   static inline void
> @@ -602,6 +605,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
>   		 */
>   		GEM_BUG_ON(!execlists_is_active(execlists,
>   						EXECLISTS_ACTIVE_USER));
> +		GEM_BUG_ON(execlists_is_active(execlists,
> +					       EXECLISTS_ACTIVE_PREEMPT));
>   		GEM_BUG_ON(!port_count(&port[0]));
>   		if (port_count(&port[0]) > 1)
>   			return false;
> @@ -758,12 +763,17 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
>   static void execlists_dequeue(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   	bool submit;
>   
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
> +	if (!intel_engine_direct_submit(engine))
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
> +
>   	submit = __execlists_dequeue(engine);
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +
> +	if (!intel_engine_direct_submit(engine))
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   
>   	if (submit)
>   		execlists_submit_ports(engine);
> @@ -1163,16 +1173,45 @@ static void queue_request(struct intel_engine_cs *engine,
>   		      &lookup_priolist(engine, node, prio)->requests);
>   }
>   
> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
>   {
>   	engine->execlists.queue_priority = prio;
> +}
> +
> +static void __schedule_queue(struct intel_engine_cs *engine)
> +{
>   	tasklet_hi_schedule(&engine->execlists.tasklet);
>   }
>   
> +static void __submit_queue(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
> +	struct tasklet_struct * const t = &execlists->tasklet;
> +
> +	GEM_BUG_ON(!engine->i915->gt.awake);
> +
> +	/* If inside GPU reset, the tasklet will be queued later. */
> +	if (unlikely(atomic_read(&t->count)))
> +		return;
> +
> +	/* Directly submit the first request to reduce the initial latency */
> +	if (!port_isset(execlists->port) && tasklet_trylock(t)) {
> +		engine->flags |= I915_ENGINE_DIRECT_SUBMIT;
> +		t->func(t->data);
> +		engine->flags &= ~I915_ENGINE_DIRECT_SUBMIT;
> +		tasklet_unlock(t);
> +		return;
> +	}
> +
> +	__schedule_queue(engine);
> +}
> +
>   static void submit_queue(struct intel_engine_cs *engine, int prio)
>   {
> -	if (prio > engine->execlists.queue_priority)
> -		__submit_queue(engine, prio);
> +	if (prio > engine->execlists.queue_priority) {
> +		__wakeup_queue(engine, prio);
> +		__submit_queue(engine);
> +	}
>   }
>   
>   static void execlists_submit_request(struct i915_request *request)
> @@ -1184,10 +1223,9 @@ static void execlists_submit_request(struct i915_request *request)
>   	spin_lock_irqsave(&engine->timeline.lock, flags);
>   
>   	queue_request(engine, &request->sched, rq_prio(request));
> -	submit_queue(engine, rq_prio(request));
> -
>   	GEM_BUG_ON(!engine->execlists.first);
>   	GEM_BUG_ON(list_empty(&request->sched.link));
> +	submit_queue(engine, rq_prio(request));
>   
>   	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   }
> @@ -1309,8 +1347,10 @@ static void execlists_schedule(struct i915_request *request,
>   		}
>   
>   		if (prio > engine->execlists.queue_priority &&
> -		    i915_sw_fence_done(&sched_to_request(node)->submit))
> -			__submit_queue(engine, prio);
> +		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
> +			__wakeup_queue(engine, prio);
> +			__schedule_queue(engine);
> +		}
>   	}
>   
>   	spin_unlock_irq(&engine->timeline.lock);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..f5545391d76a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -569,6 +569,7 @@ struct intel_engine_cs {
>   #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>   #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>   #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> +#define I915_ENGINE_DIRECT_SUBMIT    BIT(3)
>   	unsigned int flags;
>   
>   	/*
> @@ -646,6 +647,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
>   	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
>   }
>   
> +static inline bool
> +intel_engine_direct_submit(const struct intel_engine_cs *engine)
> +{
> +	return engine->flags & I915_ENGINE_DIRECT_SUBMIT;
> +}
> +
>   static inline bool __execlists_need_preempt(int prio, int last)
>   {
>   	return prio > max(0, last);
>
Chris Wilson May 8, 2018, 10:40 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-08 11:23:09)
> 
> On 07/05/2018 14:57, Chris Wilson wrote:
> > Bypass using the tasklet to submit the first request to HW, as the
> > tasklet may be deferred unto ksoftirqd and at a minimum will add in
> > excess of 10us (and maybe tens of milliseconds) to our execution
> > latency. This latency reduction is most notable when execution flows
> > between engines.
> > 
> > v2: Beware handling preemption completion from the direct submit path as
> > well.
> > 
> > Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++-
> >   drivers/gpu/drm/i915/intel_lrc.c            | 66 +++++++++++++++++----
> >   drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
> >   3 files changed, 69 insertions(+), 16 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> > index 2feb65096966..6bfe30af7826 100644
> > --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> > @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
> >   
> >   static void guc_dequeue(struct intel_engine_cs *engine)
> >   {
> > -     unsigned long flags;
> > +     unsigned long uninitialized_var(flags);
> >       bool submit;
> >   
> >       local_irq_save(flags);
> >   
> > -     spin_lock(&engine->timeline.lock);
> > +     GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN,
> > +                          &engine->execlists.tasklet.state));
> 
> Soon it will be time for i915_tasklet. :)
> 
> > +     if (!intel_engine_direct_submit(engine))
> > +             spin_lock(&engine->timeline.lock);
> 
> A bit ugly both on the conditional locking and using engine->flags for 
> transient purposes.
> 
> Since you are locking the tasklet and own it (and open coding the call) 
> completely when calling directly, you could just the same cheat and call 
> a different function?

My first attempt was to call __execlists_dequeue() directly and not
tasklet->func(). But that then has this nasty
  if (tasklet->func == execlists_submission_tasklet)
or some such in the middle of otherwise generic code.
https://patchwork.freedesktop.org/patch/221105/

I was less happy about that. At least this does have the making of
something more generic like i915_tasklet ;)

> >       submit = __guc_dequeue(engine);
> > -     spin_unlock(&engine->timeline.lock);
> > +
> > +     if (!intel_engine_direct_submit(engine))
> > +             spin_unlock(&engine->timeline.lock);
> >   
> >       if (submit)
> >               guc_submit(engine);
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 15c373ea5b7e..ac7c5edee4ee 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
> >   {
> >       struct intel_engine_cs *engine =
> >               container_of(execlists, typeof(*engine), execlists);
> > -     unsigned long flags;
> > +     unsigned long uninitialized_var(flags);
> >   
> > -     spin_lock_irqsave(&engine->timeline.lock, flags);
> > +     GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
> > +     if (!intel_engine_direct_submit(engine))
> > +             spin_lock_irqsave(&engine->timeline.lock, flags);
> >   
> >       __unwind_incomplete_requests(engine);
> >   
> > -     spin_unlock_irqrestore(&engine->timeline.lock, flags);
> > +     if (!intel_engine_direct_submit(engine))
> > +             spin_unlock_irqrestore(&engine->timeline.lock, flags);
> 
> Hm ok yes, this one would be a problem..
> 
> Maybe at least use some bit under execlists state instead of engine flags?

But I have engine->flags :-p Could I steal a bit from tasklet.state? I
tend to get funny looks everytime I ask for TASKLET_STATE_USER ;)
-Chris
Tvrtko Ursulin May 8, 2018, 11 a.m. UTC | #3
On 08/05/2018 11:40, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-08 11:23:09)
>>
>> On 07/05/2018 14:57, Chris Wilson wrote:
>>> Bypass using the tasklet to submit the first request to HW, as the
>>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
>>> excess of 10us (and maybe tens of milliseconds) to our execution
>>> latency. This latency reduction is most notable when execution flows
>>> between engines.
>>>
>>> v2: Beware handling preemption completion from the direct submit path as
>>> well.
>>>
>>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/intel_guc_submission.c | 12 +++-
>>>    drivers/gpu/drm/i915/intel_lrc.c            | 66 +++++++++++++++++----
>>>    drivers/gpu/drm/i915/intel_ringbuffer.h     |  7 +++
>>>    3 files changed, 69 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
>>> index 2feb65096966..6bfe30af7826 100644
>>> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
>>> @@ -754,14 +754,20 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>>>    
>>>    static void guc_dequeue(struct intel_engine_cs *engine)
>>>    {
>>> -     unsigned long flags;
>>> +     unsigned long uninitialized_var(flags);
>>>        bool submit;
>>>    
>>>        local_irq_save(flags);
>>>    
>>> -     spin_lock(&engine->timeline.lock);
>>> +     GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN,
>>> +                          &engine->execlists.tasklet.state));
>>
>> Soon it will be time for i915_tasklet. :)
>>
>>> +     if (!intel_engine_direct_submit(engine))
>>> +             spin_lock(&engine->timeline.lock);
>>
>> A bit ugly both on the conditional locking and using engine->flags for
>> transient purposes.
>>
>> Since you are locking the tasklet and own it (and open coding the call)
>> completely when calling directly, you could just the same cheat and call
>> a different function?
> 
> My first attempt was to call __execlists_dequeue() directly and not
> tasklet->func(). But that then has this nasty
>    if (tasklet->func == execlists_submission_tasklet)

I thought not call the t->func but func directly, well a special flavour 
of the func. But the unwind as noticed a bit later is the only one which 
throws the spanner in those works.

Unfortunately I have no ideas at the moment on how to elegantly solve that.

> or some such in the middle of otherwise generic code.
> https://patchwork.freedesktop.org/patch/221105/
> 
> I was less happy about that. At least this does have the making of
> something more generic like i915_tasklet ;)
> 
>>>        submit = __guc_dequeue(engine);
>>> -     spin_unlock(&engine->timeline.lock);
>>> +
>>> +     if (!intel_engine_direct_submit(engine))
>>> +             spin_unlock(&engine->timeline.lock);
>>>    
>>>        if (submit)
>>>                guc_submit(engine);
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 15c373ea5b7e..ac7c5edee4ee 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -357,13 +357,16 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
>>>    {
>>>        struct intel_engine_cs *engine =
>>>                container_of(execlists, typeof(*engine), execlists);
>>> -     unsigned long flags;
>>> +     unsigned long uninitialized_var(flags);
>>>    
>>> -     spin_lock_irqsave(&engine->timeline.lock, flags);
>>> +     GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
>>> +     if (!intel_engine_direct_submit(engine))
>>> +             spin_lock_irqsave(&engine->timeline.lock, flags);
>>>    
>>>        __unwind_incomplete_requests(engine);
>>>    
>>> -     spin_unlock_irqrestore(&engine->timeline.lock, flags);
>>> +     if (!intel_engine_direct_submit(engine))
>>> +             spin_unlock_irqrestore(&engine->timeline.lock, flags);
>>
>> Hm ok yes, this one would be a problem..
>>
>> Maybe at least use some bit under execlists state instead of engine flags?
> 
> But I have engine->flags :-p Could I steal a bit from tasklet.state? I
> tend to get funny looks everytime I ask for TASKLET_STATE_USER ;)

We intended engine->flags to be stable for engine lifetime 
(effectively). So I don't like using it for this. Put a new flag/boolean 
to intel_execlists_state?

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 2feb65096966..6bfe30af7826 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -754,14 +754,20 @@  static bool __guc_dequeue(struct intel_engine_cs *engine)
 
 static void guc_dequeue(struct intel_engine_cs *engine)
 {
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 	bool submit;
 
 	local_irq_save(flags);
 
-	spin_lock(&engine->timeline.lock);
+	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN,
+			     &engine->execlists.tasklet.state));
+	if (!intel_engine_direct_submit(engine))
+		spin_lock(&engine->timeline.lock);
+
 	submit = __guc_dequeue(engine);
-	spin_unlock(&engine->timeline.lock);
+
+	if (!intel_engine_direct_submit(engine))
+		spin_unlock(&engine->timeline.lock);
 
 	if (submit)
 		guc_submit(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 15c373ea5b7e..ac7c5edee4ee 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -357,13 +357,16 @@  execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
 {
 	struct intel_engine_cs *engine =
 		container_of(execlists, typeof(*engine), execlists);
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
+	if (!intel_engine_direct_submit(engine))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	__unwind_incomplete_requests(engine);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	if (!intel_engine_direct_submit(engine))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static inline void
@@ -602,6 +605,8 @@  static bool __execlists_dequeue(struct intel_engine_cs *engine)
 		 */
 		GEM_BUG_ON(!execlists_is_active(execlists,
 						EXECLISTS_ACTIVE_USER));
+		GEM_BUG_ON(execlists_is_active(execlists,
+					       EXECLISTS_ACTIVE_PREEMPT));
 		GEM_BUG_ON(!port_count(&port[0]));
 		if (port_count(&port[0]) > 1)
 			return false;
@@ -758,12 +763,17 @@  static bool __execlists_dequeue(struct intel_engine_cs *engine)
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 	bool submit;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	GEM_BUG_ON(!test_bit(TASKLET_STATE_RUN, &execlists->tasklet.state));
+	if (!intel_engine_direct_submit(engine))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
+
 	submit = __execlists_dequeue(engine);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+	if (!intel_engine_direct_submit(engine))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
@@ -1163,16 +1173,45 @@  static void queue_request(struct intel_engine_cs *engine,
 		      &lookup_priolist(engine, node, prio)->requests);
 }
 
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
 {
 	engine->execlists.queue_priority = prio;
+}
+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
 	tasklet_hi_schedule(&engine->execlists.tasklet);
 }
 
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct tasklet_struct * const t = &execlists->tasklet;
+
+	GEM_BUG_ON(!engine->i915->gt.awake);
+
+	/* If inside GPU reset, the tasklet will be queued later. */
+	if (unlikely(atomic_read(&t->count)))
+		return;
+
+	/* Directly submit the first request to reduce the initial latency */
+	if (!port_isset(execlists->port) && tasklet_trylock(t)) {
+		engine->flags |= I915_ENGINE_DIRECT_SUBMIT;
+		t->func(t->data);
+		engine->flags &= ~I915_ENGINE_DIRECT_SUBMIT;
+		tasklet_unlock(t);
+		return;
+	}
+
+	__schedule_queue(engine);
+}
+
 static void submit_queue(struct intel_engine_cs *engine, int prio)
 {
-	if (prio > engine->execlists.queue_priority)
-		__submit_queue(engine, prio);
+	if (prio > engine->execlists.queue_priority) {
+		__wakeup_queue(engine, prio);
+		__submit_queue(engine);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -1184,10 +1223,9 @@  static void execlists_submit_request(struct i915_request *request)
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	queue_request(engine, &request->sched, rq_prio(request));
-	submit_queue(engine, rq_prio(request));
-
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->sched.link));
+	submit_queue(engine, rq_prio(request));
 
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
@@ -1309,8 +1347,10 @@  static void execlists_schedule(struct i915_request *request,
 		}
 
 		if (prio > engine->execlists.queue_priority &&
-		    i915_sw_fence_done(&sched_to_request(node)->submit))
-			__submit_queue(engine, prio);
+		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
+			__wakeup_queue(engine, prio);
+			__schedule_queue(engine);
+		}
 	}
 
 	spin_unlock_irq(&engine->timeline.lock);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 010750e8ee44..f5545391d76a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -569,6 +569,7 @@  struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_DIRECT_SUBMIT    BIT(3)
 	unsigned int flags;
 
 	/*
@@ -646,6 +647,12 @@  intel_engine_has_preemption(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
 }
 
+static inline bool
+intel_engine_direct_submit(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_DIRECT_SUBMIT;
+}
+
 static inline bool __execlists_need_preempt(int prio, int last)
 {
 	return prio > max(0, last);