[3/5] drm/i915/execlists: Direct submit onto idle engines

Message ID 20180509142801.28130-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson May 9, 2018, 2:27 p.m. UTC
Bypass using the tasklet to submit the first request to HW, as the
tasklet may be deferred unto ksoftirqd and at a minimum will add in
excess of 10us (and maybe tens of milliseconds) to our execution
latency. This latency reduction is most notable when execution flows
between engines.

v2: Beware handling preemption completion from the direct submit path as
well.
v3: Make the abuse clear and track our extra state inside i915_tasklet.

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
 drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
 drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
 3 files changed, 89 insertions(+), 16 deletions(-)

Comments

Tvrtko Ursulin May 10, 2018, 4:09 p.m. UTC | #1
On 09/05/2018 15:27, Chris Wilson wrote:
> Bypass using the tasklet to submit the first request to HW, as the
> tasklet may be deferred unto ksoftirqd and at a minimum will add in
> excess of 10us (and maybe tens of milliseconds) to our execution
> latency. This latency reduction is most notable when execution flows
> between engines.
> 
> v2: Beware handling preemption completion from the direct submit path as
> well.
> v3: Make the abuse clear and track our extra state inside i915_tasklet.
> 
> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
>   drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
>   drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
>   3 files changed, 89 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
> index 42b002b88edb..99e2fa2241ba 100644
> --- a/drivers/gpu/drm/i915/i915_tasklet.h
> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
> @@ -8,8 +8,11 @@
>   #define _I915_TASKLET_H_
>   
>   #include <linux/atomic.h>
> +#include <linux/bitops.h>
>   #include <linux/interrupt.h>
>   
> +#include "i915_gem.h"
> +
>   /**
>    * struct i915_tasklet - wrapper around tasklet_struct
>    *
> @@ -19,6 +22,8 @@
>    */
>   struct i915_tasklet {
>   	struct tasklet_struct base;
> +	unsigned long flags;
> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)

I would suggest a more generic name for the bit since i915_tasklet is 
generic-ish. For instance simply I915_TASKLET_DIRECT would signify the 
callback has been invoked directly and not (necessarily) from softirq 
context. Then it is for each user to know what that means for them 
specifically.
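
For illustration, the renamed bit and accessor might look like this (a
sketch of the naming suggestion only; the posted patch keeps
DIRECT_SUBMIT):

struct i915_tasklet {
	struct tasklet_struct base;
	unsigned long flags;
#define I915_TASKLET_DIRECT BIT(0) /* invoked directly, not via softirq */
};

static inline bool i915_tasklet_is_direct(const struct i915_tasklet *t)
{
	/* Only meaningful while the tasklet callback is executing. */
	GEM_BUG_ON(!i915_tasklet_is_running(t));

	return t->flags & I915_TASKLET_DIRECT;
}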

>   };
>   
>   static inline void i915_tasklet_init(struct i915_tasklet *t,
> @@ -43,6 +48,14 @@ static inline bool i915_tasklet_is_enabled(const struct i915_tasklet *t)
>   	return likely(!atomic_read(&t->base.count));
>   }
>   
> +static inline bool i915_tasklet_is_direct_submit(const struct i915_tasklet *t)
> +{
> +	/* Only legal to be checked from inside the tasklet. */
> +	GEM_BUG_ON(!i915_tasklet_is_running(t));
> +
> +	return t->flags & I915_TASKLET_DIRECT_SUBMIT;
> +}

Or maybe i915_tasklet_direct_invocation?

> +
>   static inline void i915_tasklet_schedule(struct i915_tasklet *t)
>   {
>   	tasklet_hi_schedule(&t->base);
> @@ -75,4 +88,15 @@ static inline void i915_tasklet_set_func(struct i915_tasklet *t,
>   	i915_tasklet_unlock(t);
>   }
>   
> +static inline void __i915_tasklet_run(const struct i915_tasklet *t)
> +{
> +	t->base.func(t->base.data);
> +}
> +
> +static inline void i915_tasklet_run(const struct i915_tasklet *t)
> +{
> +	GEM_BUG_ON(!i915_tasklet_is_running(t));
> +	__i915_tasklet_run(t);
> +}
> +
>   #endif /* _I915_TASKLET_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index a7afc976c3b9..f2ded1796523 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -754,14 +754,18 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>   
>   static void guc_dequeue(struct intel_engine_cs *engine)
>   {
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   	bool submit;
>   
>   	local_irq_save(flags);
>   
> -	spin_lock(&engine->timeline.lock);
> +	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
> +		spin_lock(&engine->timeline.lock);
> +
>   	submit = __guc_dequeue(engine);
> -	spin_unlock(&engine->timeline.lock);
> +
> +	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
> +		spin_unlock(&engine->timeline.lock);
>   
>   	if (submit)
>   		guc_submit(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 539fa03d7600..09fded9d409f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -356,13 +356,15 @@ execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
>   {
>   	struct intel_engine_cs *engine =
>   		container_of(execlists, typeof(*engine), execlists);
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
>   
>   	__unwind_incomplete_requests(engine);
>   
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   }
>   
>   static inline void
> @@ -601,6 +603,8 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
>   		 */
>   		GEM_BUG_ON(!execlists_is_active(execlists,
>   						EXECLISTS_ACTIVE_USER));
> +		GEM_BUG_ON(execlists_is_active(execlists,
> +					       EXECLISTS_ACTIVE_PREEMPT));
>   		GEM_BUG_ON(!port_count(&port[0]));
>   		if (port_count(&port[0]) > 1)
>   			return false;
> @@ -757,12 +761,16 @@ static bool __execlists_dequeue(struct intel_engine_cs *engine)
>   static void execlists_dequeue(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	unsigned long flags;
> +	unsigned long uninitialized_var(flags);
>   	bool submit;
>   
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
> +
>   	submit = __execlists_dequeue(engine);
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +
> +	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   
>   	if (submit)
>   		execlists_submit_ports(engine);
> @@ -1162,16 +1170,52 @@ static void queue_request(struct intel_engine_cs *engine,
>   		      &lookup_priolist(engine, prio)->requests);
>   }
>   
> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
>   {
>   	engine->execlists.queue_priority = prio;
> +}

Why is this called wakeup? Plans to add something in it later?

> +
> +static void __schedule_queue(struct intel_engine_cs *engine)
> +{
>   	i915_tasklet_schedule(&engine->execlists.tasklet);
>   }
>   
> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
> +{
> +	struct i915_tasklet * const t = &execlists->tasklet;
> +
> +	if (!tasklet_trylock(&t->base))
> +		return false;
> +
> +	t->flags |= I915_TASKLET_DIRECT_SUBMIT;
> +	i915_tasklet_run(t);
> +	t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
> +
> +	tasklet_unlock(&t->base);

Feels like this whole sequence belongs to i915_tasklet since it touches 
the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?

> +	return true;
> +}
> +
> +static void __submit_queue(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> +	GEM_BUG_ON(!engine->i915->gt.awake);
> +
> +	/* If inside GPU reset, the tasklet will be queued later. */
> +	if (!i915_tasklet_is_enabled(&execlists->tasklet))
> +		return;
> +
> +	/* Directly submit the first request to reduce the initial latency */
> +	if (port_isset(execlists->port) || !__direct_submit(execlists))
> +		__schedule_queue(engine);

Hmm a bit evil to maybe invoke in the condition. Would it be acceptable to:

if (!port_isset(...))
	i915_tasklet_run_or_schedule(...);
else
	i915_tasklet_schedule(...);

It's not ideal but maybe a bit better.

> +}
> +
>   static void submit_queue(struct intel_engine_cs *engine, int prio)
>   {
> -	if (prio > engine->execlists.queue_priority)
> -		__submit_queue(engine, prio);
> +	if (prio > engine->execlists.queue_priority) {
> +		__wakeup_queue(engine, prio);
> +		__submit_queue(engine);
> +	}
>   }
>   
>   static void execlists_submit_request(struct i915_request *request)
> @@ -1183,10 +1227,9 @@ static void execlists_submit_request(struct i915_request *request)
>   	spin_lock_irqsave(&engine->timeline.lock, flags);
>   
>   	queue_request(engine, &request->sched, rq_prio(request));
> -	submit_queue(engine, rq_prio(request));
> -
>   	GEM_BUG_ON(!engine->execlists.first);
>   	GEM_BUG_ON(list_empty(&request->sched.link));
> +	submit_queue(engine, rq_prio(request));
>   
>   	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>   }
> @@ -1314,8 +1357,10 @@ static void execlists_schedule(struct i915_request *request,
>   		}
>   
>   		if (prio > engine->execlists.queue_priority &&
> -		    i915_sw_fence_done(&sched_to_request(node)->submit))
> -			__submit_queue(engine, prio);
> +		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
> +			__wakeup_queue(engine, prio);
> +			__schedule_queue(engine);
> +		}
>   	}
>   
>   	spin_unlock_irq(&engine->timeline.lock);
> 

Regards,

Tvrtko
Chris Wilson May 10, 2018, 4:25 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
> 
> On 09/05/2018 15:27, Chris Wilson wrote:
> > Bypass using the tasklet to submit the first request to HW, as the
> > tasklet may be deferred unto ksoftirqd and at a minimum will add in
> > excess of 10us (and maybe tens of milliseconds) to our execution
> > latency. This latency reduction is most notable when execution flows
> > between engines.
> > 
> > v2: Beware handling preemption completion from the direct submit path as
> > well.
> > v3: Make the abuse clear and track our extra state inside i915_tasklet.
> > 
> > Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
> >   drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
> >   drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
> >   3 files changed, 89 insertions(+), 16 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
> > index 42b002b88edb..99e2fa2241ba 100644
> > --- a/drivers/gpu/drm/i915/i915_tasklet.h
> > +++ b/drivers/gpu/drm/i915/i915_tasklet.h
> > @@ -8,8 +8,11 @@
> >   #define _I915_TASKLET_H_
> >   
> >   #include <linux/atomic.h>
> > +#include <linux/bitops.h>
> >   #include <linux/interrupt.h>
> >   
> > +#include "i915_gem.h"
> > +
> >   /**
> >    * struct i915_tasklet - wrapper around tasklet_struct
> >    *
> > @@ -19,6 +22,8 @@
> >    */
> >   struct i915_tasklet {
> >       struct tasklet_struct base;
> > +     unsigned long flags;
> > +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
> 
> I would suggest a more generic name for the bit since i915_tasklet is 
> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the 
> callback has been invoked directly and not (necessarily) from softirq 
> context. Then it is for each user to know what that means for them 
> specifically.

Problem is we have two direct invocations, only one is special. It
really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
see why I didn't propose that.

> > -static void __submit_queue(struct intel_engine_cs *engine, int prio)
> > +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
> >   {
> >       engine->execlists.queue_priority = prio;
> > +}
> 
> Why is this called wakeup? Plans to add something in it later?

Yes. It's called wakeup because it's setting the value that the dequeue
wakes up at. First name was kick_queue, but it doesn't kick either.

The later side-effect involves controlling timers.

__restart_queue()?

> > +static void __schedule_queue(struct intel_engine_cs *engine)
> > +{
> >       i915_tasklet_schedule(&engine->execlists.tasklet);
> >   }
> >   
> > +static bool __direct_submit(struct intel_engine_execlists *const execlists)
> > +{
> > +     struct i915_tasklet * const t = &execlists->tasklet;
> > +
> > +     if (!tasklet_trylock(&t->base))
> > +             return false;
> > +
> > +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
> > +     i915_tasklet_run(t);
> > +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
> > +
> > +     tasklet_unlock(&t->base);
> 
> Feels like this whole sequence belongs to i915_tasklet since it touches 
> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?

Keep reading the series and you'll see just why this is so special and
confined to execlists.

> > +     return true;
> > +}
> > +
> > +static void __submit_queue(struct intel_engine_cs *engine)
> > +{
> > +     struct intel_engine_execlists * const execlists = &engine->execlists;
> > +
> > +     GEM_BUG_ON(!engine->i915->gt.awake);
> > +
> > +     /* If inside GPU reset, the tasklet will be queued later. */
> > +     if (!i915_tasklet_is_enabled(&execlists->tasklet))
> > +             return;
> > +
> > +     /* Directly submit the first request to reduce the initial latency */
> > +     if (port_isset(execlists->port) || !__direct_submit(execlists))
> > +             __schedule_queue(engine);
> 
> Hmm a bit evil to maybe invoke in the condition. Would it be acceptable to:
> 
> if (!port_isset(...))
>         i915_tasklet_run_or_schedule(...);
> else
>         i915_tasklet_schedule(...);
> 
> It's not ideal but maybe a bit better.

Beauty is in the eye of the beholder, and that ain't beautiful :)
-Chris
Tvrtko Ursulin May 10, 2018, 5:26 p.m. UTC | #3
On 10/05/2018 17:25, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
>>
>> On 09/05/2018 15:27, Chris Wilson wrote:
>>> Bypass using the tasklet to submit the first request to HW, as the
>>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
>>> excess of 10us (and maybe tens of milliseconds) to our execution
>>> latency. This latency reduction is most notable when execution flows
>>> between engines.
>>>
>>> v2: Beware handling preemption completion from the direct submit path as
>>> well.
>>> v3: Make the abuse clear and track our extra state inside i915_tasklet.
>>>
>>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
>>>    drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
>>>    drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
>>>    3 files changed, 89 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
>>> index 42b002b88edb..99e2fa2241ba 100644
>>> --- a/drivers/gpu/drm/i915/i915_tasklet.h
>>> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
>>> @@ -8,8 +8,11 @@
>>>    #define _I915_TASKLET_H_
>>>    
>>>    #include <linux/atomic.h>
>>> +#include <linux/bitops.h>
>>>    #include <linux/interrupt.h>
>>>    
>>> +#include "i915_gem.h"
>>> +
>>>    /**
>>>     * struct i915_tasklet - wrapper around tasklet_struct
>>>     *
>>> @@ -19,6 +22,8 @@
>>>     */
>>>    struct i915_tasklet {
>>>        struct tasklet_struct base;
>>> +     unsigned long flags;
>>> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
>>
>> I would suggest a more generic name for the bit since i915_tasklet is
>> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the
>> callback has been invoked directly and not (necessarily) from softirq
>> context. Then it is for each user to know what that means for them
>> specifically.
> 
> Problem is we have two direct invocations, only one is special. It
> really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
> see why I didn't propose that.

TBC...

>>> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
>>> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
>>>    {
>>>        engine->execlists.queue_priority = prio;
>>> +}
>>
>> Why is this called wakeup? Plans to add something in it later?
> 
> Yes. It's called wakeup because it's setting the value that the dequeue
> wakes up at. First name was kick_queue, but it doesn't kick either.
> 
> The later side-effect involves controlling timers.
> 
> __restart_queue()?

__update_queue_priority? :)

> 
>>> +static void __schedule_queue(struct intel_engine_cs *engine)
>>> +{
>>>        i915_tasklet_schedule(&engine->execlists.tasklet);
>>>    }
>>>    
>>> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
>>> +{
>>> +     struct i915_tasklet * const t = &execlists->tasklet;
>>> +
>>> +     if (!tasklet_trylock(&t->base))
>>> +             return false;
>>> +
>>> +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
>>> +     i915_tasklet_run(t);
>>> +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
>>> +
>>> +     tasklet_unlock(&t->base);
>>
>> Feels like this whole sequence belongs to i915_tasklet since it touches
>> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?
> 
> Keep reading the series and you'll see just why this is so special and
> confined to execlists.

... TBC here.

Having peeked ahead, it feels not quite generic enough as it is, a bit 
too hacky.

Would it work to pass context together with the invocation? Like:

i915_tasklet_try(..., I915_TASKLET_SUBMIT_IDLE);
i915_tasklet_try(..., I915_TASKLET_SUBMIT_IRQ);

i915_tasklet.flags field namespace would then be owned by the caller 
completely. And the tasklet func itself would have more context on what 
to do.

Following from that, i915_tasklet_run_or_schedule(.., flags).

bool i915_tasklet_try(tasklet, flags)
{
	if (!trylock)
		return false;

	t->flags |= flags;
	i915_tasklet_run(...);
	t->flags &= ~flags;

	tasklet_unlock(...);

	return true;
}


void i915_tasklet_run_or_schedule(..., flags)
{
	if (!i915_tasklet_try(..., flags))
		i915_tasklet_schedule(...);
}

?
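
Fleshed out with the obvious arguments filled in, that might read as
follows (a hypothetical sketch against the i915_tasklet.h above, not
part of the posted patch):

static inline bool i915_tasklet_try(struct i915_tasklet *t,
				    unsigned long flags)
{
	if (!tasklet_trylock(&t->base))
		return false;

	/* Publish the caller-supplied context for the duration of the run. */
	t->flags |= flags;
	i915_tasklet_run(t);
	t->flags &= ~flags;

	tasklet_unlock(&t->base);
	return true;
}

static inline void i915_tasklet_run_or_schedule(struct i915_tasklet *t,
						unsigned long flags)
{
	if (!i915_tasklet_try(t, flags))
		i915_tasklet_schedule(t);
}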

Leaves a question of a tasklet_is_enabled check in your tasklet_try, 
which I don't quite get since that check wasn't there before. So why is 
it needed?

> 
>>> +     return true;
>>> +}
>>> +
>>> +static void __submit_queue(struct intel_engine_cs *engine)
>>> +{
>>> +     struct intel_engine_execlists * const execlists = &engine->execlists;
>>> +
>>> +     GEM_BUG_ON(!engine->i915->gt.awake);
>>> +
>>> +     /* If inside GPU reset, the tasklet will be queued later. */
>>> +     if (!i915_tasklet_is_enabled(&execlists->tasklet))
>>> +             return;
>>> +
>>> +     /* Directly submit the first request to reduce the initial latency */
>>> +     if (port_isset(execlists->port) || !__direct_submit(execlists))
>>> +             __schedule_queue(engine);
>>
>> Hmm a bit evil to maybe invoke in the condition. Would it be acceptable to:
>>
>> if (!port_isset(...))
>>          i915_tasklet_run_or_schedule(...);
>> else
>>          i915_tasklet_schedule(...);
>>
>> It's not ideal but maybe a bit better.
> 
> Beauty is in the eye of the beholder, and that ain't beautiful :)

Did not say it was, just more obvious what's happening.

Regards,

Tvrtko
Chris Wilson May 10, 2018, 5:40 p.m. UTC | #4
Quoting Tvrtko Ursulin (2018-05-10 18:26:31)
> 
> On 10/05/2018 17:25, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
> >>
> >> On 09/05/2018 15:27, Chris Wilson wrote:
> >>> Bypass using the tasklet to submit the first request to HW, as the
> >>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
> >>> excess of 10us (and maybe tens of milliseconds) to our execution
> >>> latency. This latency reduction is most notable when execution flows
> >>> between engines.
> >>>
> >>> v2: Beware handling preemption completion from the direct submit path as
> >>> well.
> >>> v3: Make the abuse clear and track our extra state inside i915_tasklet.
> >>>
> >>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> ---
> >>>    drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
> >>>    drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
> >>>    drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
> >>>    3 files changed, 89 insertions(+), 16 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
> >>> index 42b002b88edb..99e2fa2241ba 100644
> >>> --- a/drivers/gpu/drm/i915/i915_tasklet.h
> >>> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
> >>> @@ -8,8 +8,11 @@
> >>>    #define _I915_TASKLET_H_
> >>>    
> >>>    #include <linux/atomic.h>
> >>> +#include <linux/bitops.h>
> >>>    #include <linux/interrupt.h>
> >>>    
> >>> +#include "i915_gem.h"
> >>> +
> >>>    /**
> >>>     * struct i915_tasklet - wrapper around tasklet_struct
> >>>     *
> >>> @@ -19,6 +22,8 @@
> >>>     */
> >>>    struct i915_tasklet {
> >>>        struct tasklet_struct base;
> >>> +     unsigned long flags;
> >>> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
> >>
> >> I would suggest a more generic name for the bit since i915_tasklet is
> >> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the
> >> callback has been invoked directly and not (necessarily) from softirq
> >> context. Then it is for each user to know what that means for them
> >> specifically.
> > 
> > Problem is we have two direct invocations, only one is special. It
> > really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
> > see why I didn't propose that.
> 
> TBC...
> 
> >>> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
> >>> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
> >>>    {
> >>>        engine->execlists.queue_priority = prio;
> >>> +}
> >>
> >> Why is this called wakeup? Plans to add something in it later?
> > 
> > Yes. It's called wakeup because it's setting the value that the dequeue
> > wakes up at. First name was kick_queue, but it doesn't kick either.
> > 
> > The later side-effect involves controlling timers.
> > 
> > __restart_queue()?
> 
> __update_queue_priority? :)

It doesn't just update the priority...

Now a choice between restart_queue and update_queue.

> >>> +static void __schedule_queue(struct intel_engine_cs *engine)
> >>> +{
> >>>        i915_tasklet_schedule(&engine->execlists.tasklet);
> >>>    }
> >>>    
> >>> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
> >>> +{
> >>> +     struct i915_tasklet * const t = &execlists->tasklet;
> >>> +
> >>> +     if (!tasklet_trylock(&t->base))
> >>> +             return false;
> >>> +
> >>> +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
> >>> +     i915_tasklet_run(t);
> >>> +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
> >>> +
> >>> +     tasklet_unlock(&t->base);
> >>
> >> Feels like this whole sequence belongs to i915_tasklet since it touches
> >> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?
> > 
> > Keep reading the series and you'll see just why this is so special and
> > confined to execlists.
> 
> ... TBC here.
> 
> Having peeked ahead, it feels not quite generic enough as it is, a bit 
> too hacky.
> 
> Would it work to pass context together with the invocation? Like:
> 
> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IDLE);
> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IRQ);
> 
> i915_tasklet.flags field namespace would then be owned by the caller 
> completely. And the tasklet func itself would have more context on what 
> to do.

That doesn't apply very well to the use case either. It's not the
tasklet being called from irq/process that's significant but whether we
are calling it with the engine/data locked.

I keep wanting to use LOCKED, but that has no meaning to the tasklet,
and tasklet_trylock means something entirely different.
 
> Following from that, i915_tasklet_run_or_schedule(.., flags).
> 
> bool i915_tasklet_try(tasklet, flags)
> {
>         if (!trylock)
>                 return false;
> 
>         t->flags |= flags;
>         i915_tasklet_run(...);
>         t->flags &= ~flags;
> 
>         tasklet_unlock(...);
> 
>         return true;
> }
> 
> 
> void i915_tasklet_run_or_schedule(..., flags)
> {
>         if (!i915_tasklet_try(..., flags))
>                 i915_tasklet_schedule(...);
> }
> 
> ?
> 
> Leaves a question of a tasklet_is_enabled check in your tasklet_try, 
> which I don't quite get since that check wasn't there before. So why is 
> it needed?

Concurrent reset can happen at irq time, but we know it cannot happen
from the submit path.
-Chris
Tvrtko Ursulin May 11, 2018, 8:25 a.m. UTC | #5
On 10/05/2018 18:40, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-10 18:26:31)
>>
>> On 10/05/2018 17:25, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
>>>>
>>>> On 09/05/2018 15:27, Chris Wilson wrote:
>>>>> Bypass using the tasklet to submit the first request to HW, as the
>>>>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
>>>>> excess of 10us (and maybe tens of milliseconds) to our execution
>>>>> latency. This latency reduction is most notable when execution flows
>>>>> between engines.
>>>>>
>>>>> v2: Beware handling preemption completion from the direct submit path as
>>>>> well.
>>>>> v3: Make the abuse clear and track our extra state inside i915_tasklet.
>>>>>
>>>>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
>>>>>     drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
>>>>>     drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
>>>>>     3 files changed, 89 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
>>>>> index 42b002b88edb..99e2fa2241ba 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_tasklet.h
>>>>> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
>>>>> @@ -8,8 +8,11 @@
>>>>>     #define _I915_TASKLET_H_
>>>>>     
>>>>>     #include <linux/atomic.h>
>>>>> +#include <linux/bitops.h>
>>>>>     #include <linux/interrupt.h>
>>>>>     
>>>>> +#include "i915_gem.h"
>>>>> +
>>>>>     /**
>>>>>      * struct i915_tasklet - wrapper around tasklet_struct
>>>>>      *
>>>>> @@ -19,6 +22,8 @@
>>>>>      */
>>>>>     struct i915_tasklet {
>>>>>         struct tasklet_struct base;
>>>>> +     unsigned long flags;
>>>>> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
>>>>
>>>> I would suggest a more generic name for the bit since i915_tasklet is
>>>> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the
>>>> callback has been invoked directly and not (necessarily) from softirq
>>>> context. Then it is for each user to know what that means for them
>>>> specifically.
>>>
>>> Problem is we have two direct invocations, only one is special. It
>>> really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
>>> see why I didn't propose that.
>>
>> TBC...
>>
>>>>> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
>>>>> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
>>>>>     {
>>>>>         engine->execlists.queue_priority = prio;
>>>>> +}
>>>>
>>>> Why is this called wakeup? Plans to add something in it later?
>>>
>>> Yes. It's called wakeup because it's setting the value that the dequeue
>>> wakes up at. First name was kick_queue, but it doesn't kick either.
>>>
>>> The later side-effect involves controlling timers.
>>>
>>> __restart_queue()?
>>
>> __update_queue_priority? :)
> 
> It doesn't just update the priority...
> 
> Now a choice between restart_queue and update_queue.

Update sounds like a better match to me.

> 
>>>>> +static void __schedule_queue(struct intel_engine_cs *engine)
>>>>> +{
>>>>>         i915_tasklet_schedule(&engine->execlists.tasklet);
>>>>>     }
>>>>>     
>>>>> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
>>>>> +{
>>>>> +     struct i915_tasklet * const t = &execlists->tasklet;
>>>>> +
>>>>> +     if (!tasklet_trylock(&t->base))
>>>>> +             return false;
>>>>> +
>>>>> +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
>>>>> +     i915_tasklet_run(t);
>>>>> +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
>>>>> +
>>>>> +     tasklet_unlock(&t->base);
>>>>
>>>> Feels like this whole sequence belongs to i915_tasklet since it touches
>>>> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?
>>>
>>> Keep reading the series and you'll see just why this is so special and
>>> confined to execlists.
>>
>> ... TBC here.
>>
>> Having peeked ahead, it feels not quite generic enough as it is, a bit
>> too hacky.
>>
>> Would it work to pass context together with the invocation? Like:
>>
>> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IDLE);
>> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IRQ);
>>
>> i915_tasklet.flags field namespace would then be owned by the caller
>> completely. And the tasklet func itself would have more context on what
>> to do.
> 
> That doesn't apply very well to the use case either. It's not the
> tasklet being called from irq/process that's significant but whether we
> are calling it with the engine/data locked.

That's why I am proposing to allow a generic mechanism to pass in a 
"token" to the API, which the API will pass down to the user when 
invoking the tasklet directly.

The user then decides how to interpret the token.

I915_TASKLET_SUBMIT_IDLE would mean "I know this is the path with 
timeline lock already taken".

I915_TASKLET_SUBMIT_IRQ token would mean "I need to take the lock and I 
need an early abort if tasklet is disabled".
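
As a sketch, the dequeue side could then key off the token instead of
the patch's DIRECT_SUBMIT bit (the flag names and the IDLE/IRQ split are
from this proposal only, purely hypothetical):

/* Hypothetical tokens owned by the execlists code, not by i915_tasklet. */
#define I915_TASKLET_SUBMIT_IDLE BIT(0) /* engine->timeline.lock already held */
#define I915_TASKLET_SUBMIT_IRQ  BIT(1) /* direct call from irq, lock not held */

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	unsigned long uninitialized_var(flags);
	bool submit;

	/* Only the submit-on-idle path enters with the lock already held. */
	if (!(execlists->tasklet.flags & I915_TASKLET_SUBMIT_IDLE))
		spin_lock_irqsave(&engine->timeline.lock, flags);

	submit = __execlists_dequeue(engine);

	if (!(execlists->tasklet.flags & I915_TASKLET_SUBMIT_IDLE))
		spin_unlock_irqrestore(&engine->timeline.lock, flags);

	if (submit)
		execlists_submit_ports(engine);
}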

> 
> I keep wanting to use LOCKED, but that has no meaning to the tasklet,
> and tasklet_trylock means something entirely different.
>   
>> Following from that, i915_tasklet_run_or_schedule(.., flags).
>>
>> bool i915_tasklet_try(tasklet, flags)
>> {
>>          if (!trylock)
>>                  return false;
>>
>>          t->flags |= flags;
>>          i915_tasklet_run(...);
>>          t->flags &= ~flags;
>>
>>          tasklet_unlock(...);
>>
>>          return true;
>> }
>>
>>
>> void i915_tasklet_run_or_schedule(..., flags)
>> {
>>          if (!i915_tasklet_try(..., flags))
>>                  i915_tasklet_schedule(...);
>> }
>>
>> ?
>>
>> Leaves a question of a tasklet_is_enabled check in your tasklet_try,
>> which I don't quite get since that check wasn't there before. So why is
>> it needed?
> 
> Concurrent reset can happen at irq time, but we know it cannot happen
> from the submit path.

Right, yes, it is relevant for direct invocation only.

Regards,

Tvrtko
Chris Wilson May 11, 2018, 8:31 a.m. UTC | #6
Quoting Tvrtko Ursulin (2018-05-11 09:25:00)
> 
> On 10/05/2018 18:40, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-05-10 18:26:31)
> >>
> >> On 10/05/2018 17:25, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
> >>>>
> >>>> On 09/05/2018 15:27, Chris Wilson wrote:
> >>>>> Bypass using the tasklet to submit the first request to HW, as the
> >>>>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
> >>>>> excess of 10us (and maybe tens of milliseconds) to our execution
> >>>>> latency. This latency reduction is most notable when execution flows
> >>>>> between engines.
> >>>>>
> >>>>> v2: Beware handling preemption completion from the direct submit path as
> >>>>> well.
> >>>>> v3: Make the abuse clear and track our extra state inside i915_tasklet.
> >>>>>
> >>>>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>>>> ---
> >>>>>     drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
> >>>>>     drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
> >>>>>     drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
> >>>>>     3 files changed, 89 insertions(+), 16 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
> >>>>> index 42b002b88edb..99e2fa2241ba 100644
> >>>>> --- a/drivers/gpu/drm/i915/i915_tasklet.h
> >>>>> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
> >>>>> @@ -8,8 +8,11 @@
> >>>>>     #define _I915_TASKLET_H_
> >>>>>     
> >>>>>     #include <linux/atomic.h>
> >>>>> +#include <linux/bitops.h>
> >>>>>     #include <linux/interrupt.h>
> >>>>>     
> >>>>> +#include "i915_gem.h"
> >>>>> +
> >>>>>     /**
> >>>>>      * struct i915_tasklet - wrapper around tasklet_struct
> >>>>>      *
> >>>>> @@ -19,6 +22,8 @@
> >>>>>      */
> >>>>>     struct i915_tasklet {
> >>>>>         struct tasklet_struct base;
> >>>>> +     unsigned long flags;
> >>>>> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
> >>>>
> >>>> I would suggest a more generic name for the bit since i915_tasklet is
> >>>> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the
> >>>> callback has been invoked directly and not (necessarily) from softirq
> >>>> context. Then it is for each user to know what that means for them
> >>>> specifically.
> >>>
> >>> Problem is we have two direct invocations, only one is special. It
> >>> really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
> >>> see why I didn't propose that.
> >>
> >> TBC...
> >>
> >>>>> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
> >>>>> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
> >>>>>     {
> >>>>>         engine->execlists.queue_priority = prio;
> >>>>> +}
> >>>>
> >>>> Why is this called wakeup? Plans to add something in it later?
> >>>
> >>> Yes. It's called wakeup because it's setting the value that the dequeue
> >>> wakes up at. First name was kick_queue, but it doesn't kick either.
> >>>
> >>> The later side-effect involves controlling timers.
> >>>
> >>> __restart_queue()?
> >>
> >> __update_queue_priority? :)
> > 
> > It doesn't just update the priority...
> > 
> > Now a choice between restart_queue and update_queue.
> 
> Update sounds like a better match to me.
> 
> > 
> >>>>> +static void __schedule_queue(struct intel_engine_cs *engine)
> >>>>> +{
> >>>>>         i915_tasklet_schedule(&engine->execlists.tasklet);
> >>>>>     }
> >>>>>     
> >>>>> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
> >>>>> +{
> >>>>> +     struct i915_tasklet * const t = &execlists->tasklet;
> >>>>> +
> >>>>> +     if (!tasklet_trylock(&t->base))
> >>>>> +             return false;
> >>>>> +
> >>>>> +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
> >>>>> +     i915_tasklet_run(t);
> >>>>> +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
> >>>>> +
> >>>>> +     tasklet_unlock(&t->base);
> >>>>
> >>>> Feels like this whole sequence belongs to i915_tasklet since it touches
> >>>> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?
> >>>
> >>> Keep reading the series and you'll see just why this is so special and
> >>> confined to execlists.
> >>
> >> ... TBC here.
> >>
> >> Having peeked ahead, it feels not quite generic enough as it is, a bit
> >> too hacky.
> >>
> >> Would it work to pass context together with the invocation? Like:
> >>
> >> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IDLE);
> >> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IRQ);
> >>
> >> i915_tasklet.flags field namespace would then be owned by the caller
> >> completely. And the tasklet func itself would have more context on what
> >> to do.
> > 
> > That doesn't apply very well to the use case either. It's not the
> > tasklet being called from irq/process that's significant but whether we
> > are calling it with the engine/data locked.
> 
> That's why I am proposing to allow a generic mechanism to pass in a 
> "token" to the API, which the API will pass down to the user when 
> invoking the tasklet directly.
> 
> The user then decides how to interpret the token.
> 
> I915_TASKLET_SUBMIT_IDLE would mean "I know this is the path with 
> timeline lock already taken".
> 
> I915_TASKLET_SUBMIT_IRQ token would mean "I need to take the lock and I 
> need an early abort if tasklet is disabled".

I don't see a reason to extend it to a generic mechanism yet.
direct-submit-onto-idle is the special case. I've 3 users for the normal
case (just calling into the tasklet directly), one for wanting to pass
along private state after claiming the tasklet for itself, and one that
does unspeakable things that doesn't match any of the above ;)
-Chris
Tvrtko Ursulin May 11, 2018, 8:48 a.m. UTC | #7
On 11/05/2018 09:31, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-11 09:25:00)
>>
>> On 10/05/2018 18:40, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-05-10 18:26:31)
>>>>
>>>> On 10/05/2018 17:25, Chris Wilson wrote:
>>>>> Quoting Tvrtko Ursulin (2018-05-10 17:09:14)
>>>>>>
>>>>>> On 09/05/2018 15:27, Chris Wilson wrote:
>>>>>>> Bypass using the tasklet to submit the first request to HW, as the
>>>>>>> tasklet may be deferred unto ksoftirqd and at a minimum will add in
>>>>>>> excess of 10us (and maybe tens of milliseconds) to our execution
>>>>>>> latency. This latency reduction is most notable when execution flows
>>>>>>> between engines.
>>>>>>>
>>>>>>> v2: Beware handling preemption completion from the direct submit path as
>>>>>>> well.
>>>>>>> v3: Make the abuse clear and track our extra state inside i915_tasklet.
>>>>>>>
>>>>>>> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/i915/i915_tasklet.h         | 24 +++++++
>>>>>>>      drivers/gpu/drm/i915/intel_guc_submission.c | 10 ++-
>>>>>>>      drivers/gpu/drm/i915/intel_lrc.c            | 71 +++++++++++++++++----
>>>>>>>      3 files changed, 89 insertions(+), 16 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
>>>>>>> index 42b002b88edb..99e2fa2241ba 100644
>>>>>>> --- a/drivers/gpu/drm/i915/i915_tasklet.h
>>>>>>> +++ b/drivers/gpu/drm/i915/i915_tasklet.h
>>>>>>> @@ -8,8 +8,11 @@
>>>>>>>      #define _I915_TASKLET_H_
>>>>>>>      
>>>>>>>      #include <linux/atomic.h>
>>>>>>> +#include <linux/bitops.h>
>>>>>>>      #include <linux/interrupt.h>
>>>>>>>      
>>>>>>> +#include "i915_gem.h"
>>>>>>> +
>>>>>>>      /**
>>>>>>>       * struct i915_tasklet - wrapper around tasklet_struct
>>>>>>>       *
>>>>>>> @@ -19,6 +22,8 @@
>>>>>>>       */
>>>>>>>      struct i915_tasklet {
>>>>>>>          struct tasklet_struct base;
>>>>>>> +     unsigned long flags;
>>>>>>> +#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
>>>>>>
>>>>>> I would suggest a more generic name for the bit since i915_tasklet is
>>>>>> generic-ish. For instance simply I915_TASKLET_DIRECT would signify the
>>>>>> callback has been invoked directly and not (necessarily) from softirq
>>>>>> context. Then it is for each user to know what that means for them
>>>>>> specifically.
>>>>>
>>>>> Problem is we have two direct invocations, only one is special. It
>>>>> really wants to be something like I915_TASKLET_ENGINE_IS_LOCKED - you can
>>>>> see why I didn't propose that.
>>>>
>>>> TBC...
>>>>
>>>>>>> -static void __submit_queue(struct intel_engine_cs *engine, int prio)
>>>>>>> +static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
>>>>>>>      {
>>>>>>>          engine->execlists.queue_priority = prio;
>>>>>>> +}
>>>>>>
>>>>>> Why is this called wakeup? Plans to add something in it later?
>>>>>
>>>>> Yes. It's called wakeup because it's setting the value that the dequeue
>>>>> wakes up at. First name was kick_queue, but it doesn't kick either.
>>>>>
>>>>> The later side-effect involves controlling timers.
>>>>>
>>>>> __restart_queue()?
>>>>
>>>> __update_queue_priority? :)
>>>
>>> It doesn't just update the priority...
>>>
>>> Now a choice between restart_queue and update_queue.
>>
>> Update sounds like a better match to me.
>>
>>>
>>>>>>> +static void __schedule_queue(struct intel_engine_cs *engine)
>>>>>>> +{
>>>>>>>          i915_tasklet_schedule(&engine->execlists.tasklet);
>>>>>>>      }
>>>>>>>      
>>>>>>> +static bool __direct_submit(struct intel_engine_execlists *const execlists)
>>>>>>> +{
>>>>>>> +     struct i915_tasklet * const t = &execlists->tasklet;
>>>>>>> +
>>>>>>> +     if (!tasklet_trylock(&t->base))
>>>>>>> +             return false;
>>>>>>> +
>>>>>>> +     t->flags |= I915_TASKLET_DIRECT_SUBMIT;
>>>>>>> +     i915_tasklet_run(t);
>>>>>>> +     t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
>>>>>>> +
>>>>>>> +     tasklet_unlock(&t->base);
>>>>>>
>>>>>> Feels like this whole sequence belongs to i915_tasklet since it touches
>>>>>> the internals. Maybe i915_tasklet_try_run, or i915_tasklet_run_or_schedule?
>>>>>
>>>>> Keep reading the series and you'll see just why this is so special and
>>>>> confined to execlists.
>>>>
>>>> ... TBC here.
>>>>
>>>> Having peeked ahead, it feels not quite generic enough as it is, a bit
>>>> too hacky.
>>>>
>>>> Would it work to pass context together with the invocation? Like:
>>>>
>>>> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IDLE);
>>>> i915_tasklet_try(..., I915_TASKLET_SUBMIT_IRQ);
>>>>
>>>> i915_tasklet.flags field namespace would then be owned by the caller
>>>> completely. And the tasklet func itself would have more context on what
>>>> to do.
>>>
>>> That doesn't apply very well to the use case either. It's not the
>>> tasklet being called from irq/process that's significant but whether we
>>> are calling it with the engine/data locked.
>>
>> That's why I am proposing to allow a generic mechanism to pass in a
>> "token" to the API, which the API will pass down to the user when
>> invoking the tasklet directly.
>>
>> The user then decides how to interpret the token.
>>
>> I915_TASKLET_SUBMIT_IDLE would mean "I know this is the path with
>> timeline lock already taken".
>>
>> I915_TASKLET_SUBMIT_IRQ token would mean "I need to take the lock and I
>> need an early abort if tasklet is disabled".
> 
> I don't see a reason to extend it to a generic mechanism yet.
> direct-submit-onto-idle is the special case. I've 3 users for the normal
> case (just calling into the tasklet directly), one for wanting to pass
> along private state after claiming the tasklet for itself, and one that
> does unspeakable things that doesn't match any of the above ;)

I don't know what this last one is, but, talking about the three call 
sites from this series, not moving more of the logic into i915_tasklet 
IMO diminishes the reason for it to exist. If it is only wrapping a 
single token, which the callers have to fiddle with manually for the 
majority of (special) cases, it is a harder sell for me.

We could just as well use a data field in struct intel_execlists, as I 
suggested earlier, and write our own i915_execlists_tasklet_try or 
similar wrappers. It would be the same thing and much less code.

So for me, either making it more generic or not having it at all seems 
like it would result in less code, and the functional design would be 
the same.
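
For comparison, that alternative might look roughly like this
(hypothetical names throughout; no flags field or EXECLISTS_DIRECT_SUBMIT
bit exists in the posted patch):

static bool i915_execlists_tasklet_try(struct intel_engine_execlists *execlists)
{
	struct i915_tasklet * const t = &execlists->tasklet;

	if (!tasklet_trylock(&t->base))
		return false;

	/* Hypothetical: state kept in the execlists struct, not the tasklet. */
	execlists->flags |= EXECLISTS_DIRECT_SUBMIT;
	i915_tasklet_run(t);
	execlists->flags &= ~EXECLISTS_DIRECT_SUBMIT;

	tasklet_unlock(&t->base);
	return true;
}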

Regards,

Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/i915_tasklet.h b/drivers/gpu/drm/i915/i915_tasklet.h
index 42b002b88edb..99e2fa2241ba 100644
--- a/drivers/gpu/drm/i915/i915_tasklet.h
+++ b/drivers/gpu/drm/i915/i915_tasklet.h
@@ -8,8 +8,11 @@ 
 #define _I915_TASKLET_H_
 
 #include <linux/atomic.h>
+#include <linux/bitops.h>
 #include <linux/interrupt.h>
 
+#include "i915_gem.h"
+
 /**
  * struct i915_tasklet - wrapper around tasklet_struct
  *
@@ -19,6 +22,8 @@ 
  */
 struct i915_tasklet {
 	struct tasklet_struct base;
+	unsigned long flags;
+#define I915_TASKLET_DIRECT_SUBMIT BIT(0)
 };
 
 static inline void i915_tasklet_init(struct i915_tasklet *t,
@@ -43,6 +48,14 @@  static inline bool i915_tasklet_is_enabled(const struct i915_tasklet *t)
 	return likely(!atomic_read(&t->base.count));
 }
 
+static inline bool i915_tasklet_is_direct_submit(const struct i915_tasklet *t)
+{
+	/* Only legal to be checked from inside the tasklet. */
+	GEM_BUG_ON(!i915_tasklet_is_running(t));
+
+	return t->flags & I915_TASKLET_DIRECT_SUBMIT;
+}
+
 static inline void i915_tasklet_schedule(struct i915_tasklet *t)
 {
 	tasklet_hi_schedule(&t->base);
@@ -75,4 +88,15 @@  static inline void i915_tasklet_set_func(struct i915_tasklet *t,
 	i915_tasklet_unlock(t);
 }
 
+static inline void __i915_tasklet_run(const struct i915_tasklet *t)
+{
+	t->base.func(t->base.data);
+}
+
+static inline void i915_tasklet_run(const struct i915_tasklet *t)
+{
+	GEM_BUG_ON(!i915_tasklet_is_running(t));
+	__i915_tasklet_run(t);
+}
+
 #endif /* _I915_TASKLET_H_ */
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index a7afc976c3b9..f2ded1796523 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -754,14 +754,18 @@  static bool __guc_dequeue(struct intel_engine_cs *engine)
 
 static void guc_dequeue(struct intel_engine_cs *engine)
 {
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 	bool submit;
 
 	local_irq_save(flags);
 
-	spin_lock(&engine->timeline.lock);
+	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+		spin_lock(&engine->timeline.lock);
+
 	submit = __guc_dequeue(engine);
-	spin_unlock(&engine->timeline.lock);
+
+	if (!i915_tasklet_is_direct_submit(&engine->execlists.tasklet))
+		spin_unlock(&engine->timeline.lock);
 
 	if (submit)
 		guc_submit(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 539fa03d7600..09fded9d409f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -356,13 +356,15 @@  execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
 {
 	struct intel_engine_cs *engine =
 		container_of(execlists, typeof(*engine), execlists);
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	__unwind_incomplete_requests(engine);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static inline void
@@ -601,6 +603,8 @@  static bool __execlists_dequeue(struct intel_engine_cs *engine)
 		 */
 		GEM_BUG_ON(!execlists_is_active(execlists,
 						EXECLISTS_ACTIVE_USER));
+		GEM_BUG_ON(execlists_is_active(execlists,
+					       EXECLISTS_ACTIVE_PREEMPT));
 		GEM_BUG_ON(!port_count(&port[0]));
 		if (port_count(&port[0]) > 1)
 			return false;
@@ -757,12 +761,16 @@  static bool __execlists_dequeue(struct intel_engine_cs *engine)
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 	bool submit;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_lock_irqsave(&engine->timeline.lock, flags);
+
 	submit = __execlists_dequeue(engine);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+	if (!i915_tasklet_is_direct_submit(&execlists->tasklet))
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
@@ -1162,16 +1170,52 @@  static void queue_request(struct intel_engine_cs *engine,
 		      &lookup_priolist(engine, prio)->requests);
 }
 
-static void __submit_queue(struct intel_engine_cs *engine, int prio)
+static void __wakeup_queue(struct intel_engine_cs *engine, int prio)
 {
 	engine->execlists.queue_priority = prio;
+}
+
+static void __schedule_queue(struct intel_engine_cs *engine)
+{
 	i915_tasklet_schedule(&engine->execlists.tasklet);
 }
 
+static bool __direct_submit(struct intel_engine_execlists *const execlists)
+{
+	struct i915_tasklet * const t = &execlists->tasklet;
+
+	if (!tasklet_trylock(&t->base))
+		return false;
+
+	t->flags |= I915_TASKLET_DIRECT_SUBMIT;
+	i915_tasklet_run(t);
+	t->flags &= ~I915_TASKLET_DIRECT_SUBMIT;
+
+	tasklet_unlock(&t->base);
+	return true;
+}
+
+static void __submit_queue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+
+	GEM_BUG_ON(!engine->i915->gt.awake);
+
+	/* If inside GPU reset, the tasklet will be queued later. */
+	if (!i915_tasklet_is_enabled(&execlists->tasklet))
+		return;
+
+	/* Directly submit the first request to reduce the initial latency */
+	if (port_isset(execlists->port) || !__direct_submit(execlists))
+		__schedule_queue(engine);
+}
+
 static void submit_queue(struct intel_engine_cs *engine, int prio)
 {
-	if (prio > engine->execlists.queue_priority)
-		__submit_queue(engine, prio);
+	if (prio > engine->execlists.queue_priority) {
+		__wakeup_queue(engine, prio);
+		__submit_queue(engine);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -1183,10 +1227,9 @@  static void execlists_submit_request(struct i915_request *request)
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	queue_request(engine, &request->sched, rq_prio(request));
-	submit_queue(engine, rq_prio(request));
-
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->sched.link));
+	submit_queue(engine, rq_prio(request));
 
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
@@ -1314,8 +1357,10 @@  static void execlists_schedule(struct i915_request *request,
 		}
 
 		if (prio > engine->execlists.queue_priority &&
-		    i915_sw_fence_done(&sched_to_request(node)->submit))
-			__submit_queue(engine, prio);
+		    i915_sw_fence_done(&sched_to_request(node)->submit)) {
+			__wakeup_queue(engine, prio);
+			__schedule_queue(engine);
+		}
 	}
 
 	spin_unlock_irq(&engine->timeline.lock);