[02/10] drm/i915: Adjust PM QoS response frequency based on GPU load.

Message ID 20200310214203.26459-3-currojerez@riseup.net (mailing list archive)
State New, archived
Series [01/10] PM: QoS: Add CPU_RESPONSE_FREQUENCY global PM QoS limit.

Commit Message

Francisco Jerez March 10, 2020, 9:41 p.m. UTC
This allows CPUFREQ governors to realize when the system becomes
non-CPU-bound due to GPU rendering activity, causing them to respond
more conservatively to the workload by limiting their response
frequency: CPU energy usage will be reduced when there isn't a good
chance for system performance to scale with CPU frequency due to the
GPU bottleneck.  This leaves additional TDP budget available for the
GPU to reach higher frequencies, which translates into an improvement
in graphics performance to the extent that the workload remains
TDP-limited (most non-trivial graphics benchmarks out there improve
significantly on the TDP-constrained platforms where this is currently
enabled, see the cover letter for some numbers).  If the workload isn't
(or is no longer) TDP-limited, performance should stay roughly
constant, but energy usage will be reduced by a similar factor.

Signed-off-by: Francisco Jerez <currojerez@riseup.net>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    |   1 +
 drivers/gpu/drm/i915/gt/intel_engine_types.h |   7 ++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c        | 107 +++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_pm.h        |   3 +
 drivers/gpu/drm/i915/gt/intel_gt_types.h     |  12 +++
 drivers/gpu/drm/i915/gt/intel_lrc.c          |  14 +++
 6 files changed, 144 insertions(+)
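
As a rough sketch of the mechanism described above (illustrative only:
the rf_qos naming follows the review discussion below, and the
response-frequency QoS helpers are assumed from patch 01/10 of the
series rather than taken from this patch):

	/* The execlists back-end flags "overload" while two ELSP ports are in
	 * flight; GT PM maps that onto the CPU_RESPONSE_FREQUENCY PM QoS limit
	 * so CPUFREQ governors can relax their response frequency while the
	 * GPU is the bottleneck.  Names other than the intel_gt_pm_active_*()
	 * entry points are assumptions. */
	void intel_gt_pm_active_begin(struct intel_gt *gt)
	{
		if (atomic_inc_return(&gt->rf_qos.active_count) == 1)
			cpu_response_frequency_qos_update_request(&gt->rf_qos.req,
								  gt->rf_qos.target_hz);
	}

	void intel_gt_pm_active_end(struct intel_gt *gt)
	{
		if (atomic_dec_return(&gt->rf_qos.active_count) == 0)
			cpu_response_frequency_qos_update_request(&gt->rf_qos.req,
								  PM_QOS_DEFAULT_VALUE);
	}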

Comments

Chris Wilson March 10, 2020, 10:26 p.m. UTC | #1
Quoting Francisco Jerez (2020-03-10 21:41:55)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index b9b3f78f1324..a5d7a80b826d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>         /* we need to manually load the submit queue */
>         if (execlists->ctrl_reg)
>                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> +
> +       if (execlists_num_ports(execlists) > 1 &&
pending[1] is always defined, the minimum submission is one slot, with
pending[1] as the sentinel NULL.

> +           execlists->pending[1] &&
> +           !atomic_xchg(&execlists->overload, 1))
> +               intel_gt_pm_active_begin(&engine->i915->gt);

engine->gt

>  }
>  
>  static bool ctx_single_port_submission(const struct intel_context *ce)
> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>  
>         WRITE_ONCE(execlists->active, execlists->inflight);
> +
> +       if (atomic_xchg(&execlists->overload, 0)) {
> +               struct intel_engine_cs *engine =
> +                       container_of(execlists, typeof(*engine), execlists);
> +               intel_gt_pm_active_end(&engine->i915->gt);
> +       }
>  }
>  
>  static inline void
> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>                         /* port0 completed, advanced to port1 */
>                         trace_ports(execlists, "completed", execlists->active);
>  
> +                       if (atomic_xchg(&execlists->overload, 0))
> +                               intel_gt_pm_active_end(&engine->i915->gt);

So this loses track if we preempt a dual-ELSP submission with a
single-ELSP submission (and never go back to dual).

If you move this to the end of the loop and check

if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
	intel_gt_pm_active_end(engine->gt);

so that it covers both preemption/promotion and completion.

However, that will fluctuate quite rapidly. (And runs the risk of
exceeding the sentinel.)

An alternative approach would be to couple along
schedule_in/schedule_out

atomic_set(overload, -1);

__execlists_schedule_in:
	if (!atomic_fetch_inc(overload)
		intel_gt_pm_active_begin(engine->gt);
__execlists_schedule_out:
	if (!atomic_dec_return(overload)
		intel_gt_pm_active_end(engine->gt);

which would mean we are overloaded as soon as we try to submit an
overlapping ELSP.
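
Spelled out a little more completely (just a sketch of the idea, exact
hook placement in __execlists_schedule_in/out still to be confirmed):

	/* Bias the count so a single in-flight context keeps it at 0. */
	atomic_set(&execlists->overload, -1);

	/* __execlists_schedule_in(): a context enters the ELSP */
	if (!atomic_fetch_inc(&execlists->overload))
		intel_gt_pm_active_begin(engine->gt); /* second context in flight */

	/* __execlists_schedule_out(): a context leaves the ELSP */
	if (!atomic_dec_return(&execlists->overload))
		intel_gt_pm_active_end(engine->gt); /* back down to one context */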


The metric feels very multiple client (game + display server, or
saturated transcode) centric. In the endless kernel world, we expect
100% engine utilisation from a single context, and never a dual-ELSP
submission. They are also likely to want to avoid being throttled to
conserve TDP for the CPU.

Should we also reduce the overload for the number of clients who are
waiting for interrupts from the GPU, so that their wakeup latency is not
impacted?
-Chris
Francisco Jerez March 11, 2020, 12:34 a.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Francisco Jerez (2020-03-10 21:41:55)
>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> index b9b3f78f1324..a5d7a80b826d 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>         /* we need to manually load the submit queue */
>>         if (execlists->ctrl_reg)
>>                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>> +
>> +       if (execlists_num_ports(execlists) > 1 &&
> pending[1] is always defined, the minimum submission is one slot, with
> pending[1] as the sentinel NULL.
>
>> +           execlists->pending[1] &&
>> +           !atomic_xchg(&execlists->overload, 1))
>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>
> engine->gt
>

Applied your suggestions above locally, will probably wait to have a few
more changes batched up before sending a v2.

>>  }
>>  
>>  static bool ctx_single_port_submission(const struct intel_context *ce)
>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>  
>>         WRITE_ONCE(execlists->active, execlists->inflight);
>> +
>> +       if (atomic_xchg(&execlists->overload, 0)) {
>> +               struct intel_engine_cs *engine =
>> +                       container_of(execlists, typeof(*engine), execlists);
>> +               intel_gt_pm_active_end(&engine->i915->gt);
>> +       }
>>  }
>>  
>>  static inline void
>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>                         /* port0 completed, advanced to port1 */
>>                         trace_ports(execlists, "completed", execlists->active);
>>  
>> +                       if (atomic_xchg(&execlists->overload, 0))
>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>
> So this looses track if we preempt a dual-ELSP submission with a
> single-ELSP submission (and never go back to dual).
>

Yes, good point.  You're right that if a dual-ELSP submission gets
preempted by a single-ELSP submission "overload" will remain signaled
until the first completion interrupt arrives (e.g. from the preempting
submission).

> If you move this to the end of the loop and check
>
> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
> 	intel_gt_pm_active_end(engine->gt);
>
> so that it covers both preemption/promotion and completion.
>

That sounds reasonable.

> However, that will fluctuate quite rapidly. (And runs the risk of
> exceeding the sentinel.)
>
> An alternative approach would be to couple along
> schedule_in/schedule_out
>
> atomic_set(overload, -1);
>
> __execlists_schedule_in:
> 	if (!atomic_fetch_inc(overload)
> 		intel_gt_pm_active_begin(engine->gt);
> __execlists_schedule_out:
> 	if (!atomic_dec_return(overload)
> 		intel_gt_pm_active_end(engine->gt);
>
> which would mean we are overloaded as soon as we try to submit an
> overlapping ELSP.
>

That sounds good to me too, and AFAICT would have roughly the same
behavior as this metric except for the preemption corner case you
mention above.  I'll try this and verify that I get approximately the
same performance numbers.

>
> The metric feels very multiple client (game + display server, or
> saturated transcode) centric. In the endless kernel world, we expect
> 100% engine utilisation from a single context, and never a dual-ELSP
> submission. They are also likely to want to avoid being throttled to
> converse TDP for the CPU.
>
Yes, this metric is fairly conservative: it won't trigger in all cases
that would potentially benefit from the energy efficiency optimization,
only in those where we can be reasonably certain that CPU latency is not
critical in order to keep the GPU busy (e.g. because the CS has an
additional ELSP port pending execution that will immediately kick in as
soon as the current one completes).

My original approach was to call intel_gt_pm_active_begin() directly as
soon as the first ELSP is submitted to the GPU, which was somewhat more
effective at improving the energy efficiency of the system than waiting
for the second port to be in use, but it involved a slight execlists
submission latency cost that led to some regressions.  It would
certainly cover the single-context case you have in mind though.  I'll
get some updated numbers with my previous approach so we can decide
which one provides a better trade-off.

> Should we also reduce the overload for the number of clients who are
> waiting for interrupts from the GPU, so that their wakeup latency is not
> impacted?

A number of clients waiting doesn't necessarily indicate that wake-up
latency is a concern.  It frequently indicates the opposite: that the
GPU has a bottleneck which will only be exacerbated by attempting to
reduce the ramp-up latency of the CPU.  IOW, I think we should only care
about reducing the CPU wake-up latency in cases where the client is
unable to keep the GPU fully utilized within the latency target that
allows the GPU to run at maximum throughput -- if the client is unable
to, that will already cause the GPU utilization to drop, so the PM QoS
request will be removed whether it is waiting or not.

> -Chris

Thanks!
Tvrtko Ursulin March 11, 2020, 10 a.m. UTC | #3
On 10/03/2020 22:26, Chris Wilson wrote:
> Quoting Francisco Jerez (2020-03-10 21:41:55)
>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> index b9b3f78f1324..a5d7a80b826d 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>          /* we need to manually load the submit queue */
>>          if (execlists->ctrl_reg)
>>                  writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>> +
>> +       if (execlists_num_ports(execlists) > 1 &&
> pending[1] is always defined, the minimum submission is one slot, with
> pending[1] as the sentinel NULL.
> 
>> +           execlists->pending[1] &&
>> +           !atomic_xchg(&execlists->overload, 1))
>> +               intel_gt_pm_active_begin(&engine->i915->gt);
> 
> engine->gt
> 
>>   }
>>   
>>   static bool ctx_single_port_submission(const struct intel_context *ce)
>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>          clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>   
>>          WRITE_ONCE(execlists->active, execlists->inflight);
>> +
>> +       if (atomic_xchg(&execlists->overload, 0)) {
>> +               struct intel_engine_cs *engine =
>> +                       container_of(execlists, typeof(*engine), execlists);
>> +               intel_gt_pm_active_end(&engine->i915->gt);
>> +       }
>>   }
>>   
>>   static inline void
>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>                          /* port0 completed, advanced to port1 */
>>                          trace_ports(execlists, "completed", execlists->active);
>>   
>> +                       if (atomic_xchg(&execlists->overload, 0))
>> +                               intel_gt_pm_active_end(&engine->i915->gt);
> 
> So this looses track if we preempt a dual-ELSP submission with a
> single-ELSP submission (and never go back to dual).
> 
> If you move this to the end of the loop and check
> 
> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
> 	intel_gt_pm_active_end(engine->gt);
> 
> so that it covers both preemption/promotion and completion.
> 
> However, that will fluctuate quite rapidly. (And runs the risk of
> exceeding the sentinel.)
> 
> An alternative approach would be to couple along
> schedule_in/schedule_out
> 
> atomic_set(overload, -1);
> 
> __execlists_schedule_in:
> 	if (!atomic_fetch_inc(overload)
> 		intel_gt_pm_active_begin(engine->gt);
> __execlists_schedule_out:
> 	if (!atomic_dec_return(overload)
> 		intel_gt_pm_active_end(engine->gt);
> 
> which would mean we are overloaded as soon as we try to submit an
> overlapping ELSP.

Putting it this low-level into submission code also would not work well 
with GuC.

How about we try to keep some accounting one level higher, as the i915 
scheduler is passing requests on to the backend for execution?

Or number of runnable contexts, if the distinction between contexts and 
requests is better for this purpose.

Problematic bit in going one level higher though is that the exit point 
is less precisely coupled to the actual state. Or maybe with aggressive 
engine retire we have nowadays it wouldn't be a problem.

Regards,

Tvrtko

> 
> 
> The metric feels very multiple client (game + display server, or
> saturated transcode) centric. In the endless kernel world, we expect
> 100% engine utilisation from a single context, and never a dual-ELSP
> submission. They are also likely to want to avoid being throttled to
> converse TDP for the CPU.
> 
> Should we also reduce the overload for the number of clients who are
> waiting for interrupts from the GPU, so that their wakeup latency is not
> impacted?
> -Chris
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
Chris Wilson March 11, 2020, 10:21 a.m. UTC | #4
Quoting Tvrtko Ursulin (2020-03-11 10:00:41)
> 
> On 10/03/2020 22:26, Chris Wilson wrote:
> > Quoting Francisco Jerez (2020-03-10 21:41:55)
> >>   static inline void
> >> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
> >>                          /* port0 completed, advanced to port1 */
> >>                          trace_ports(execlists, "completed", execlists->active);
> >>   
> >> +                       if (atomic_xchg(&execlists->overload, 0))
> >> +                               intel_gt_pm_active_end(&engine->i915->gt);
> > 
> > So this looses track if we preempt a dual-ELSP submission with a
> > single-ELSP submission (and never go back to dual).
> > 
> > If you move this to the end of the loop and check
> > 
> > if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
> >       intel_gt_pm_active_end(engine->gt);
> > 
> > so that it covers both preemption/promotion and completion.
> > 
> > However, that will fluctuate quite rapidly. (And runs the risk of
> > exceeding the sentinel.)
> > 
> > An alternative approach would be to couple along
> > schedule_in/schedule_out
> > 
> > atomic_set(overload, -1);
> > 
> > __execlists_schedule_in:
> >       if (!atomic_fetch_inc(overload)
> >               intel_gt_pm_active_begin(engine->gt);
> > __execlists_schedule_out:
> >       if (!atomic_dec_return(overload)
> >               intel_gt_pm_active_end(engine->gt);
> > 
> > which would mean we are overloaded as soon as we try to submit an
> > overlapping ELSP.
> 
> Putting it this low-level into submission code also would not work well 
> with GuC.

We can cross that bridge when it is built. [The GuC is also likely to
not want to play with us anyway, and just use SLPC.]

Now, I suspect we may want to use an engine utilisation (busy-stats or
equivalent) metric, but honestly if we can finally land this work it
brings huge benefit for GPU bound TDP constrained workloads. (p-state
loves to starve the GPU even when it provides no extra benefit for the
CPU.) We can raise the bar, establish expected behaviour and then work
to maintain and keep on improving.
-Chris
Francisco Jerez March 11, 2020, 7:54 p.m. UTC | #5
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> writes:

> On 10/03/2020 22:26, Chris Wilson wrote:
>> Quoting Francisco Jerez (2020-03-10 21:41:55)
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index b9b3f78f1324..a5d7a80b826d 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>          /* we need to manually load the submit queue */
>>>          if (execlists->ctrl_reg)
>>>                  writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>>> +
>>> +       if (execlists_num_ports(execlists) > 1 &&
>> pending[1] is always defined, the minimum submission is one slot, with
>> pending[1] as the sentinel NULL.
>> 
>>> +           execlists->pending[1] &&
>>> +           !atomic_xchg(&execlists->overload, 1))
>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>> 
>> engine->gt
>> 
>>>   }
>>>   
>>>   static bool ctx_single_port_submission(const struct intel_context *ce)
>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>          clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>>   
>>>          WRITE_ONCE(execlists->active, execlists->inflight);
>>> +
>>> +       if (atomic_xchg(&execlists->overload, 0)) {
>>> +               struct intel_engine_cs *engine =
>>> +                       container_of(execlists, typeof(*engine), execlists);
>>> +               intel_gt_pm_active_end(&engine->i915->gt);
>>> +       }
>>>   }
>>>   
>>>   static inline void
>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>>                          /* port0 completed, advanced to port1 */
>>>                          trace_ports(execlists, "completed", execlists->active);
>>>   
>>> +                       if (atomic_xchg(&execlists->overload, 0))
>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>> 
>> So this looses track if we preempt a dual-ELSP submission with a
>> single-ELSP submission (and never go back to dual).
>> 
>> If you move this to the end of the loop and check
>> 
>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
>> 	intel_gt_pm_active_end(engine->gt);
>> 
>> so that it covers both preemption/promotion and completion.
>> 
>> However, that will fluctuate quite rapidly. (And runs the risk of
>> exceeding the sentinel.)
>> 
>> An alternative approach would be to couple along
>> schedule_in/schedule_out
>> 
>> atomic_set(overload, -1);
>> 
>> __execlists_schedule_in:
>> 	if (!atomic_fetch_inc(overload)
>> 		intel_gt_pm_active_begin(engine->gt);
>> __execlists_schedule_out:
>> 	if (!atomic_dec_return(overload)
>> 		intel_gt_pm_active_end(engine->gt);
>> 
>> which would mean we are overloaded as soon as we try to submit an
>> overlapping ELSP.
>
> Putting it this low-level into submission code also would not work well 
> with GuC.
>

I wrote a patch at some point that added calls to
intel_gt_pm_active_begin() and intel_gt_pm_active_end() to the GuC
submission code in order to obtain a similar effect.  However people
requested me to leave GuC submission alone for the moment in order to
avoid interference with SLPC.  At some point it might make sense to hook
this up in combination with SLPC, because SLPC doesn't provide much of a
CPU energy efficiency advantage in comparison to this series.

> How about we try to keep some accounting one level higher, as the i915 
> scheduler is passing requests on to the backend for execution?
>
> Or number of runnable contexts, if the distinction between contexts and 
> requests is better for this purpose.
>
> Problematic bit in going one level higher though is that the exit point 
> is less precisely coupled to the actual state. Or maybe with aggressive 
> engine retire we have nowadays it wouldn't be a problem.
>

The main advantage of instrumenting the execlists submission code at a
low level is that it gives us visibility over the number of ELSP ports
pending execution, which can cause the performance of the workload to be
substantially more or less latency-sensitive.  GuC submission shouldn't
care about this variable, so it kind of makes sense for its behavior to
be slightly different.

Anyway if we're willing to give up the accuracy of keeping track of this
at a low level (and give GuC submission exactly the same treatment) it
should be possible to move the tracking one level up.

> Regards,
>
> Tvrtko
>

Thank you.
Tvrtko Ursulin March 12, 2020, 11:52 a.m. UTC | #6
On 11/03/2020 19:54, Francisco Jerez wrote:
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> writes:
> 
>> On 10/03/2020 22:26, Chris Wilson wrote:
>>> Quoting Francisco Jerez (2020-03-10 21:41:55)
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> index b9b3f78f1324..a5d7a80b826d 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>>           /* we need to manually load the submit queue */
>>>>           if (execlists->ctrl_reg)
>>>>                   writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>>>> +
>>>> +       if (execlists_num_ports(execlists) > 1 &&
>>> pending[1] is always defined, the minimum submission is one slot, with
>>> pending[1] as the sentinel NULL.
>>>
>>>> +           execlists->pending[1] &&
>>>> +           !atomic_xchg(&execlists->overload, 1))
>>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>>>
>>> engine->gt
>>>
>>>>    }
>>>>    
>>>>    static bool ctx_single_port_submission(const struct intel_context *ce)
>>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>>           clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>>>    
>>>>           WRITE_ONCE(execlists->active, execlists->inflight);
>>>> +
>>>> +       if (atomic_xchg(&execlists->overload, 0)) {
>>>> +               struct intel_engine_cs *engine =
>>>> +                       container_of(execlists, typeof(*engine), execlists);
>>>> +               intel_gt_pm_active_end(&engine->i915->gt);
>>>> +       }
>>>>    }
>>>>    
>>>>    static inline void
>>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>>>                           /* port0 completed, advanced to port1 */
>>>>                           trace_ports(execlists, "completed", execlists->active);
>>>>    
>>>> +                       if (atomic_xchg(&execlists->overload, 0))
>>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>>>
>>> So this looses track if we preempt a dual-ELSP submission with a
>>> single-ELSP submission (and never go back to dual).
>>>
>>> If you move this to the end of the loop and check
>>>
>>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
>>> 	intel_gt_pm_active_end(engine->gt);
>>>
>>> so that it covers both preemption/promotion and completion.
>>>
>>> However, that will fluctuate quite rapidly. (And runs the risk of
>>> exceeding the sentinel.)
>>>
>>> An alternative approach would be to couple along
>>> schedule_in/schedule_out
>>>
>>> atomic_set(overload, -1);
>>>
>>> __execlists_schedule_in:
>>> 	if (!atomic_fetch_inc(overload)
>>> 		intel_gt_pm_active_begin(engine->gt);
>>> __execlists_schedule_out:
>>> 	if (!atomic_dec_return(overload)
>>> 		intel_gt_pm_active_end(engine->gt);
>>>
>>> which would mean we are overloaded as soon as we try to submit an
>>> overlapping ELSP.
>>
>> Putting it this low-level into submission code also would not work well
>> with GuC.
>>
> 
> I wrote a patch at some point that added calls to
> intel_gt_pm_active_begin() and intel_gt_pm_active_end() to the GuC
> submission code in order to obtain a similar effect.  However people
> requested me to leave GuC submission alone for the moment in order to
> avoid interference with SLPC.  At some point it might make sense to hook
> this up in combination with SLPC, because SLPC doesn't provide much of a
> CPU energy efficiency advantage in comparison to this series.
> 
>> How about we try to keep some accounting one level higher, as the i915
>> scheduler is passing requests on to the backend for execution?
>>
>> Or number of runnable contexts, if the distinction between contexts and
>> requests is better for this purpose.
>>
>> Problematic bit in going one level higher though is that the exit point
>> is less precisely coupled to the actual state. Or maybe with aggressive
>> engine retire we have nowadays it wouldn't be a problem.
>>
> 
> The main advantage of instrumenting the execlists submission code at a
> low level is that it gives us visibility over the number of ELSP ports
> pending execution, which can cause the performance of the workload to be
> substantially more or less latency-sensitive.  GuC submission shouldn't
> care about this variable, so it kind of makes sense for its behavior to
> be slightly different.
> 
> Anyway if we're willing to give up the accuracy of keeping track of this
> at a low level (and give GuC submission exactly the same treatment) it
> should be possible to move the tracking one level up.

The results you got are certainly extremely attractive and the approach
and code look tidy and mature - just so you don't get me wrong, I am
not objecting to the idea.

What I'd like to see is an easier to read breakdown of results, at 
minimum with separate perf and perf-per-Watt results. A graph with 
sorted results and error bars would also be nice.

Secondly, in the commit message of this particular patch I'd like to
read some more thought about why ELSP[1] occupancy is thought to be the
desired signal. Why, for instance, a deep ELSP[0] shouldn't benefit from
more TDP budget towards the GPU, and similar.

Also a description of what the "rf_qos" control processing function does
with this signal. What and why.

Some time ago we entertained the idea of a GPU "load average", where
that was defined as a count of runnable requests (so batch buffers). How
that more generic metric would behave here if used as an input signal
really intrigues me. Sadly I don't have a patch ready to give to you and
ask you to please test it.

Or maybe the key is count of runnable contexts as opposed to requests, 
which would more match the ELSP[1] idea.

But this is secondary; I primarily think we need to see a better
presentation of the results, and the theory of operation explained
better in the commit message.

Regards,

Tvrtko
Francisco Jerez March 13, 2020, 7:39 a.m. UTC | #7
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> writes:

> On 11/03/2020 19:54, Francisco Jerez wrote:
>> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> writes:
>> 
>>> On 10/03/2020 22:26, Chris Wilson wrote:
>>>> Quoting Francisco Jerez (2020-03-10 21:41:55)
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>>> index b9b3f78f1324..a5d7a80b826d 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>>>           /* we need to manually load the submit queue */
>>>>>           if (execlists->ctrl_reg)
>>>>>                   writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>>>>> +
>>>>> +       if (execlists_num_ports(execlists) > 1 &&
>>>> pending[1] is always defined, the minimum submission is one slot, with
>>>> pending[1] as the sentinel NULL.
>>>>
>>>>> +           execlists->pending[1] &&
>>>>> +           !atomic_xchg(&execlists->overload, 1))
>>>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>>>>
>>>> engine->gt
>>>>
>>>>>    }
>>>>>    
>>>>>    static bool ctx_single_port_submission(const struct intel_context *ce)
>>>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>>>           clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>>>>    
>>>>>           WRITE_ONCE(execlists->active, execlists->inflight);
>>>>> +
>>>>> +       if (atomic_xchg(&execlists->overload, 0)) {
>>>>> +               struct intel_engine_cs *engine =
>>>>> +                       container_of(execlists, typeof(*engine), execlists);
>>>>> +               intel_gt_pm_active_end(&engine->i915->gt);
>>>>> +       }
>>>>>    }
>>>>>    
>>>>>    static inline void
>>>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>>>>                           /* port0 completed, advanced to port1 */
>>>>>                           trace_ports(execlists, "completed", execlists->active);
>>>>>    
>>>>> +                       if (atomic_xchg(&execlists->overload, 0))
>>>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>>>>
>>>> So this looses track if we preempt a dual-ELSP submission with a
>>>> single-ELSP submission (and never go back to dual).
>>>>
>>>> If you move this to the end of the loop and check
>>>>
>>>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
>>>> 	intel_gt_pm_active_end(engine->gt);
>>>>
>>>> so that it covers both preemption/promotion and completion.
>>>>
>>>> However, that will fluctuate quite rapidly. (And runs the risk of
>>>> exceeding the sentinel.)
>>>>
>>>> An alternative approach would be to couple along
>>>> schedule_in/schedule_out
>>>>
>>>> atomic_set(overload, -1);
>>>>
>>>> __execlists_schedule_in:
>>>> 	if (!atomic_fetch_inc(overload)
>>>> 		intel_gt_pm_active_begin(engine->gt);
>>>> __execlists_schedule_out:
>>>> 	if (!atomic_dec_return(overload)
>>>> 		intel_gt_pm_active_end(engine->gt);
>>>>
>>>> which would mean we are overloaded as soon as we try to submit an
>>>> overlapping ELSP.
>>>
>>> Putting it this low-level into submission code also would not work well
>>> with GuC.
>>>
>> 
>> I wrote a patch at some point that added calls to
>> intel_gt_pm_active_begin() and intel_gt_pm_active_end() to the GuC
>> submission code in order to obtain a similar effect.  However people
>> requested me to leave GuC submission alone for the moment in order to
>> avoid interference with SLPC.  At some point it might make sense to hook
>> this up in combination with SLPC, because SLPC doesn't provide much of a
>> CPU energy efficiency advantage in comparison to this series.
>> 
>>> How about we try to keep some accounting one level higher, as the i915
>>> scheduler is passing requests on to the backend for execution?
>>>
>>> Or number of runnable contexts, if the distinction between contexts and
>>> requests is better for this purpose.
>>>
>>> Problematic bit in going one level higher though is that the exit point
>>> is less precisely coupled to the actual state. Or maybe with aggressive
>>> engine retire we have nowadays it wouldn't be a problem.
>>>
>> 
>> The main advantage of instrumenting the execlists submission code at a
>> low level is that it gives us visibility over the number of ELSP ports
>> pending execution, which can cause the performance of the workload to be
>> substantially more or less latency-sensitive.  GuC submission shouldn't
>> care about this variable, so it kind of makes sense for its behavior to
>> be slightly different.
>> 
>> Anyway if we're willing to give up the accuracy of keeping track of this
>> at a low level (and give GuC submission exactly the same treatment) it
>> should be possible to move the tracking one level up.
>
> The results you got are certainly extremely attractive and the approach 
> and code looks tidy and mature - just so you don't get me wrong that I 
> am not objecting to the idea.
>
> What I'd like to see is an easier to read breakdown of results, at 
> minimum with separate perf and perf-per-Watt results. A graph with 
> sorted results and error bars would also be nice.
>

I just plotted the same results from the cover letter in separate
performance and energy efficiency graphs:

https://people.freedesktop.org/~currojerez/intel_pstate-vlp-v2/benchmark-comparison-ICL-perf.svg
https://people.freedesktop.org/~currojerez/intel_pstate-vlp-v2/benchmark-comparison-ICL-perf-per-watt.svg

> Secondly in in the commit message of this particular patch I'd like to 
> read some more thought about why ELSP[1] occupancy is thought to be the 
> desired signal. Why for instance a deep ELSP[0] shouldn't benefit from 
> more TDP budget towards the GPU and similar.
>
> Also a description of the control processing "rf_qos" function do with 
> this signal. What and why.
>

I'll work on a better commit message for v2.

> Some time ago we entertained the idea of GPU "load average", where that 
> was defined as a count of runnable requests (so batch buffers). How 
> that, more generic metric, would behave here if used as an input signal 
> really intrigues me. Sadly I don't have a patch ready to give to you and 
> ask to please test it.
>
> Or maybe the key is count of runnable contexts as opposed to requests, 
> which would more match the ELSP[1] idea.
>

Ultimately what we're trying to determine here is whether the
performance of the graphics workload is sensitive to the latency of the
CPU -- If it is we don't want to place a response latency constraint.
If the two ELSP ports are in use somewhere close to 100% of the time we
know that for most of the run-time of the workload the completion of one
request leads to the immediate execution of another, which means that
the GPU can be kept busy without the execlists submission code rushing
to submit a new request, so latency isn't a concern.

Looking at the number of runnable contexts is very close but not exactly
equivalent to that, since the workload may still be latency-sensitive if
the multiple contexts are only being submitted to a single port.

In the GuC submission case the CPU doesn't need to get involved in
submitting the next piece of work (unless there is some cyclical
dependency between CPU and GPU work, that is), so there it should be
sufficient to look at whether at least one port is active.  Also, even
while using execlists, there are applications which are able to keep
some GPU engine busy nearly 100% of the time (meaning that their
performance won't increase with decreasing latency, since the engine
can hardly do more work than that), but which are unable to keep the
two ELSP ports busy for any significant fraction of that time.  For
them it would be more accurate to use the single-port utilization as
the heuristic (which, yeah, is also roughly equivalent to the fraction
of time that at least one runnable context or request was pending
execution), at the cost of neglecting the applications that actually
are sensitive to the ELSP submission latency.

This patch takes the rather conservative approach of limiting the
application of the response frequency PM QoS request to the more
restrictive set of cases where we are most certain that CPU latency
shouldn't be an issue, in order to avoid regressions.  But it might be
that you find the additional energy efficiency benefit from the more
aggressive approach to be worth the cost to a few execlists submission
latency-sensitive applications.  I'm trying to get some numbers
comparing the two approaches now, will post them here once I have
results so we can make a more informed trade-off.
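
Concretely, the two trigger conditions being compared at ELSP submission
time are roughly the following (sketch only; the dual-port form is what
this patch implements, the single-port form is the earlier, more
aggressive experiment):

	/* Conservative (this series): only flag overload when a second port
	 * is pending, i.e. the CS can chain into the next request without
	 * waiting for the CPU. */
	if (execlists->pending[1] && !atomic_xchg(&execlists->overload, 1))
		intel_gt_pm_active_begin(engine->gt);

	/* Aggressive (earlier experiment): flag overload as soon as any
	 * request is submitted to the hardware. */
	if (execlists->pending[0] && !atomic_xchg(&execlists->overload, 1))
		intel_gt_pm_active_begin(engine->gt);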

> But this is secondary, I primarily think we need to see a better 
> presentation of the result and the theory of operation explained better 
> in the commit message.
>

Sure, I'll see what I can come up with.

> Regards,
>
> Tvrtko

Thank you.
Francisco Jerez March 16, 2020, 8:54 p.m. UTC | #8
Francisco Jerez <currojerez@riseup.net> writes:

> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> writes:
>[...]
>> Some time ago we entertained the idea of GPU "load average", where that 
>> was defined as a count of runnable requests (so batch buffers). How 
>> that, more generic metric, would behave here if used as an input signal 
>> really intrigues me. Sadly I don't have a patch ready to give to you and 
>> ask to please test it.
>>
>> Or maybe the key is count of runnable contexts as opposed to requests, 
>> which would more match the ELSP[1] idea.
>>
>[..]
> This patch takes the rather conservative approach of limiting the
> application of the response frequency PM QoS request to the more
> restrictive set of cases where we are most certain that CPU latency
> shouldn't be an issue, in order to avoid regressions.  But it might be
> that you find the additional energy efficiency benefit from the more
> aggressive approach to be worth the cost to a few execlists submission
> latency-sensitive applications.  I'm trying to get some numbers
> comparing the two approaches now, will post them here once I have
> results so we can make a more informed trade-off.
>

I got some results from the promised comparison between the dual-ELSP
utilization approach used in this series and the more obvious
alternative of keeping track of the time that any request (or context)
is in flight.  As expected, there are quite a few performance
improvements (numbers relative to this approach); however, most of them
are either synthetic benchmarks or off-screen variants of benchmarks
(the corresponding on-screen variant of each benchmark below doesn't
show a significant improvement):

 synmark/OglCSDof:                                                                      XXX ±0.15% x18 ->   XXX ±0.22% x12          d=1.15% ±0.18%       p=0.00%
 synmark/OglDeferred:                                                                   XXX ±0.31% x18 ->   XXX ±0.15% x12          d=1.16% ±0.26%       p=0.00%
 synmark/OglTexFilterAniso:                                                             XXX ±0.18% x18 ->   XXX ±0.21% x12          d=1.25% ±0.19%       p=0.00%
 synmark/OglPSPhong:                                                                    XXX ±0.43% x18 ->   XXX ±0.29% x12          d=1.28% ±0.38%       p=0.00%
 synmark/OglBatch0:                                                                     XXX ±0.40% x18 ->   XXX ±0.53% x12          d=1.29% ±0.46%       p=0.00%
 synmark/OglVSDiffuse8:                                                                 XXX ±0.49% x17 ->   XXX ±0.25% x12          d=1.30% ±0.41%       p=0.00%
 synmark/OglVSTangent:                                                                  XXX ±0.53% x18 ->   XXX ±0.31% x12          d=1.31% ±0.46%       p=0.00%
 synmark/OglGeomPoint:                                                                  XXX ±0.56% x18 ->   XXX ±0.15% x12          d=1.48% ±0.44%       p=0.00%
 gputest/plot3d:                                                                        XXX ±0.16% x18 ->   XXX ±0.11% x12          d=1.50% ±0.14%       p=0.00%
 gputest/tess_x32:                                                                      XXX ±0.15% x18 ->   XXX ±0.06% x12          d=1.59% ±0.13%       p=0.00%
 synmark/OglTexFilterTri:                                                               XXX ±0.15% x18 ->   XXX ±0.19% x12          d=1.62% ±0.17%       p=0.00%
 synmark/OglBatch3:                                                                     XXX ±0.57% x18 ->   XXX ±0.33% x12          d=1.70% ±0.49%       p=0.00%
 synmark/OglBatch1:                                                                     XXX ±0.41% x18 ->   XXX ±0.34% x12          d=1.81% ±0.38%       p=0.00%
 synmark/OglShMapVsm:                                                                   XXX ±0.53% x18 ->   XXX ±0.38% x12          d=1.81% ±0.48%       p=0.00%
 synmark/OglTexMem128:                                                                  XXX ±0.62% x18 ->   XXX ±0.29% x12          d=1.87% ±0.52%       p=0.00%
 phoronix/x11perf/test=Scrolling 500 x 500 px:                                           XXX ±0.35% x6 ->   XXX ±0.56% x12          d=2.23% ±0.52%       p=0.00%
 phoronix/x11perf/test=500px Copy From Window To Window:                                 XXX ±0.00% x3 ->   XXX ±0.74% x12          d=2.41% ±0.70%       p=0.01%
 gfxbench/gl_trex_off:                                                                   XXX ±0.04% x3 ->   XXX ±0.34% x12          d=2.59% ±0.32%       p=0.00%
 synmark/OglBatch2:                                                                     XXX ±0.85% x18 ->   XXX ±0.21% x12          d=2.87% ±0.67%       p=0.00%
 glbenchmark/GLB27_EgyptHD_inherited_C24Z16_FixedTime_Offscreen:                         XXX ±0.35% x3 ->   XXX ±0.84% x12          d=3.03% ±0.81%       p=0.01%
 glbenchmark/GLB27_TRex_C24Z16_Offscreen:                                                XXX ±0.23% x3 ->   XXX ±0.32% x12          d=3.09% ±0.32%       p=0.00%
 synmark/OglCSCloth:                                                                    XXX ±0.60% x18 ->   XXX ±0.29% x12          d=3.76% ±0.50%       p=0.00%
 phoronix/x11perf/test=Copy 500x500 From Pixmap To Pixmap:                               XXX ±0.44% x3 ->   XXX ±0.70% x12          d=4.31% ±0.69%       p=0.00%

There aren't as many regressions (numbers relative to the upstream
linux-next kernel); they're mostly 2D test cases, but they are
substantially worse in absolute value:

 phoronix/jxrendermark/rendering-test=12pt Text LCD/rendering-size=128x128:              XXX ±0.30% x26 ->  XXX ±5.71% x26        d=-23.15% ±3.11%       p=0.00%
 phoronix/jxrendermark/rendering-test=Linear Gradient Blend/rendering-size=128x128:      XXX ±0.30% x26 ->  XXX ±4.32% x26        d=-21.34% ±2.41%       p=0.00%
 phoronix/x11perf/test=500px Compositing From Pixmap To Window:                         XXX ±15.46% x26 -> XXX ±12.76% x26       d=-19.05% ±13.15%       p=0.00%
 phoronix/jxrendermark/rendering-test=Transformed Blit Bilinear/rendering-size=128x128:  XXX ±0.20% x26 ->  XXX ±3.82% x27         d=-5.07% ±2.57%       p=0.00%
 phoronix/gtkperf/gtk-test=GtkDrawingArea - Pixbufs:                                     XXX ±2.81% x26 ->  XXX ±2.10% x26         d=-3.59% ±2.45%       p=0.00%
 warsow/benchsow:                                                                        XXX ±0.61% x26 ->  XXX ±1.41% x27         d=-2.45% ±1.07%       p=0.00%
 synmark/OglTerrainFlyInst:                                                              XXX ±0.44% x25 ->  XXX ±0.74% x25         d=-1.24% ±0.60%       p=0.00%

There are some things we might be able to do to get some of the
additional improvement we can see above without hurting
latency-sensitive workloads, but that's going to take more effort; the
present approach of using the dual-ELSP utilization seems like a good
compromise to me for starters.

>[...]
Francisco Jerez March 18, 2020, 7:42 p.m. UTC | #9
Francisco Jerez <currojerez@riseup.net> writes:

> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
>> Quoting Francisco Jerez (2020-03-10 21:41:55)
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index b9b3f78f1324..a5d7a80b826d 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>         /* we need to manually load the submit queue */
>>>         if (execlists->ctrl_reg)
>>>                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>>> +
>>> +       if (execlists_num_ports(execlists) > 1 &&
>> pending[1] is always defined, the minimum submission is one slot, with
>> pending[1] as the sentinel NULL.
>>
>>> +           execlists->pending[1] &&
>>> +           !atomic_xchg(&execlists->overload, 1))
>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>>
>> engine->gt
>>
>
> Applied your suggestions above locally, will probably wait to have a few
> more changes batched up before sending a v2.
>
>>>  }
>>>  
>>>  static bool ctx_single_port_submission(const struct intel_context *ce)
>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>>  
>>>         WRITE_ONCE(execlists->active, execlists->inflight);
>>> +
>>> +       if (atomic_xchg(&execlists->overload, 0)) {
>>> +               struct intel_engine_cs *engine =
>>> +                       container_of(execlists, typeof(*engine), execlists);
>>> +               intel_gt_pm_active_end(&engine->i915->gt);
>>> +       }
>>>  }
>>>  
>>>  static inline void
>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>>                         /* port0 completed, advanced to port1 */
>>>                         trace_ports(execlists, "completed", execlists->active);
>>>  
>>> +                       if (atomic_xchg(&execlists->overload, 0))
>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>>
>> So this looses track if we preempt a dual-ELSP submission with a
>> single-ELSP submission (and never go back to dual).
>>
>
> Yes, good point.  You're right that if a dual-ELSP submission gets
> preempted by a single-ELSP submission "overload" will remain signaled
> until the first completion interrupt arrives (e.g. from the preempting
> submission).
>
>> If you move this to the end of the loop and check
>>
>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
>> 	intel_gt_pm_active_end(engine->gt);
>>
>> so that it covers both preemption/promotion and completion.
>>
>
> That sounds reasonable.
>
>> However, that will fluctuate quite rapidly. (And runs the risk of
>> exceeding the sentinel.)
>>
>> An alternative approach would be to couple along
>> schedule_in/schedule_out
>>
>> atomic_set(overload, -1);
>>
>> __execlists_schedule_in:
>> 	if (!atomic_fetch_inc(overload)
>> 		intel_gt_pm_active_begin(engine->gt);
>> __execlists_schedule_out:
>> 	if (!atomic_dec_return(overload)
>> 		intel_gt_pm_active_end(engine->gt);
>>
>> which would mean we are overloaded as soon as we try to submit an
>> overlapping ELSP.
>>
>
> That sounds good to me too, and AFAICT would have roughly the same
> behavior as this metric except for the preemption corner case you
> mention above.  I'll try this and verify that I get approximately the
> same performance numbers.
>

This suggestion seems to lead to some minor regressions; I'm
investigating the issue.  Will send a v2 as soon as I have something
along the lines of what you suggested running with performance
equivalent to v1.
Francisco Jerez March 20, 2020, 2:46 a.m. UTC | #10
Francisco Jerez <currojerez@riseup.net> writes:

> Francisco Jerez <currojerez@riseup.net> writes:
>
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>>
>>> Quoting Francisco Jerez (2020-03-10 21:41:55)
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> index b9b3f78f1324..a5d7a80b826d 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>>         /* we need to manually load the submit queue */
>>>>         if (execlists->ctrl_reg)
>>>>                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
>>>> +
>>>> +       if (execlists_num_ports(execlists) > 1 &&
>>> pending[1] is always defined, the minimum submission is one slot, with
>>> pending[1] as the sentinel NULL.
>>>
>>>> +           execlists->pending[1] &&
>>>> +           !atomic_xchg(&execlists->overload, 1))
>>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
>>>
>>> engine->gt
>>>
>>
>> Applied your suggestions above locally, will probably wait to have a few
>> more changes batched up before sending a v2.
>>
>>>>  }
>>>>  
>>>>  static bool ctx_single_port_submission(const struct intel_context *ce)
>>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>>         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
>>>>  
>>>>         WRITE_ONCE(execlists->active, execlists->inflight);
>>>> +
>>>> +       if (atomic_xchg(&execlists->overload, 0)) {
>>>> +               struct intel_engine_cs *engine =
>>>> +                       container_of(execlists, typeof(*engine), execlists);
>>>> +               intel_gt_pm_active_end(&engine->i915->gt);
>>>> +       }
>>>>  }
>>>>  
>>>>  static inline void
>>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
>>>>                         /* port0 completed, advanced to port1 */
>>>>                         trace_ports(execlists, "completed", execlists->active);
>>>>  
>>>> +                       if (atomic_xchg(&execlists->overload, 0))
>>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
>>>
>>> So this looses track if we preempt a dual-ELSP submission with a
>>> single-ELSP submission (and never go back to dual).
>>>
>>
>> Yes, good point.  You're right that if a dual-ELSP submission gets
>> preempted by a single-ELSP submission "overload" will remain signaled
>> until the first completion interrupt arrives (e.g. from the preempting
>> submission).
>>
>>> If you move this to the end of the loop and check
>>>
>>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
>>> 	intel_gt_pm_active_end(engine->gt);
>>>
>>> so that it covers both preemption/promotion and completion.
>>>
>>
>> That sounds reasonable.
>>
>>> However, that will fluctuate quite rapidly. (And runs the risk of
>>> exceeding the sentinel.)
>>>
>>> An alternative approach would be to couple along
>>> schedule_in/schedule_out
>>>
>>> atomic_set(overload, -1);
>>>
>>> __execlists_schedule_in:
>>> 	if (!atomic_fetch_inc(overload)
>>> 		intel_gt_pm_active_begin(engine->gt);
>>> __execlists_schedule_out:
>>> 	if (!atomic_dec_return(overload)
>>> 		intel_gt_pm_active_end(engine->gt);
>>>
>>> which would mean we are overloaded as soon as we try to submit an
>>> overlapping ELSP.
>>>
>>
>> That sounds good to me too, and AFAICT would have roughly the same
>> behavior as this metric except for the preemption corner case you
>> mention above.  I'll try this and verify that I get approximately the
>> same performance numbers.
>>
>
> This suggestion seems to lead to some minor regressions, I'm
> investigating the issue.  Will send a v2 as soon as I have something
> along the lines of what you suggested running with equivalent
> performance to v1.

I think I've figured out why both of the alternatives we were talking
about above lead to a couple percent regressions in latency-sensitive
workloads: In some scenarios it's possible for execlist_dequeue() to
execute after the GPU has gone idle, but before we've processed the
corresponding CSB entries, particularly when called from the
submit_queue() path.  In that case __execlists_schedule_in() will think
that the next request is overlapping, and tell CPU power management to
relax, even though the GPU is starving intermittently.

How about we do the same:

|       if (atomic_xchg(&execlists->overload, 0))
|               intel_gt_pm_active_end(engine->gt);

as in this patch from process_csb() in response to each completion CSB
entry, which ensures that the system is considered non-GPU-bound as soon
as the first context completes.  Subsequently if another CSB entry
signals a dual-ELSP active-to-idle transition or a dual-ELSP preemption
we call intel_gt_pm_active_begin() directly from process_csb().  If we
hit a single-ELSP preemption CSB entry we call intel_gt_pm_active_end()
instead, in order to avoid the problem you pointed out in your previous
email.
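
In pseudo-code the CSB handling would look roughly like this (sketch of
the idea only; whether execlists->active[1] is the right thing to test
at that point is an assumption I still need to double-check):

	/* process_csb(), per CSB entry, after execlists->active is updated */
	if (promote && execlists->active[1]) {
		/* Dual-ELSP submission running: the GPU can keep itself busy. */
		if (!atomic_xchg(&execlists->overload, 1))
			intel_gt_pm_active_begin(engine->gt);
	} else {
		/* Completion or single-ELSP preemption: CPU latency matters. */
		if (atomic_xchg(&execlists->overload, 0))
			intel_gt_pm_active_end(engine->gt);
	}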

How does that sound to you?  [Still need to verify that it has
comparable performance to this patch overall.]

Thanks!
Chris Wilson March 20, 2020, 10:06 a.m. UTC | #11
Quoting Francisco Jerez (2020-03-20 02:46:19)
> Francisco Jerez <currojerez@riseup.net> writes:
> 
> > Francisco Jerez <currojerez@riseup.net> writes:
> >
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >>
> >>> Quoting Francisco Jerez (2020-03-10 21:41:55)
> >>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>>> index b9b3f78f1324..a5d7a80b826d 100644
> >>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>>> @@ -1577,6 +1577,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
> >>>>         /* we need to manually load the submit queue */
> >>>>         if (execlists->ctrl_reg)
> >>>>                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> >>>> +
> >>>> +       if (execlists_num_ports(execlists) > 1 &&
> >>> pending[1] is always defined, the minimum submission is one slot, with
> >>> pending[1] as the sentinel NULL.
> >>>
> >>>> +           execlists->pending[1] &&
> >>>> +           !atomic_xchg(&execlists->overload, 1))
> >>>> +               intel_gt_pm_active_begin(&engine->i915->gt);
> >>>
> >>> engine->gt
> >>>
> >>
> >> Applied your suggestions above locally, will probably wait to have a few
> >> more changes batched up before sending a v2.
> >>
> >>>>  }
> >>>>  
> >>>>  static bool ctx_single_port_submission(const struct intel_context *ce)
> >>>> @@ -2213,6 +2218,12 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
> >>>>         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
> >>>>  
> >>>>         WRITE_ONCE(execlists->active, execlists->inflight);
> >>>> +
> >>>> +       if (atomic_xchg(&execlists->overload, 0)) {
> >>>> +               struct intel_engine_cs *engine =
> >>>> +                       container_of(execlists, typeof(*engine), execlists);
> >>>> +               intel_gt_pm_active_end(&engine->i915->gt);
> >>>> +       }
> >>>>  }
> >>>>  
> >>>>  static inline void
> >>>> @@ -2386,6 +2397,9 @@ static void process_csb(struct intel_engine_cs *engine)
> >>>>                         /* port0 completed, advanced to port1 */
> >>>>                         trace_ports(execlists, "completed", execlists->active);
> >>>>  
> >>>> +                       if (atomic_xchg(&execlists->overload, 0))
> >>>> +                               intel_gt_pm_active_end(&engine->i915->gt);
> >>>
> >>> So this looses track if we preempt a dual-ELSP submission with a
> >>> single-ELSP submission (and never go back to dual).
> >>>
> >>
> >> Yes, good point.  You're right that if a dual-ELSP submission gets
> >> preempted by a single-ELSP submission "overload" will remain signaled
> >> until the first completion interrupt arrives (e.g. from the preempting
> >> submission).
> >>
> >>> If you move this to the end of the loop and check
> >>>
> >>> if (!execlists->active[1] && atomic_xchg(&execlists->overload, 0))
> >>>     intel_gt_pm_active_end(engine->gt);
> >>>
> >>> so that it covers both preemption/promotion and completion.
> >>>
> >>
> >> That sounds reasonable.
> >>
> >>> However, that will fluctuate quite rapidly. (And runs the risk of
> >>> exceeding the sentinel.)
> >>>
> >>> An alternative approach would be to couple along
> >>> schedule_in/schedule_out
> >>>
> >>> atomic_set(overload, -1);
> >>>
> >>> __execlists_schedule_in:
> >>>     if (!atomic_fetch_inc(overload)
> >>>             intel_gt_pm_active_begin(engine->gt);
> >>> __execlists_schedule_out:
> >>>     if (!atomic_dec_return(overload)
> >>>             intel_gt_pm_active_end(engine->gt);
> >>>
> >>> which would mean we are overloaded as soon as we try to submit an
> >>> overlapping ELSP.
> >>>
> >>
> >> That sounds good to me too, and AFAICT would have roughly the same
> >> behavior as this metric except for the preemption corner case you
> >> mention above.  I'll try this and verify that I get approximately the
> >> same performance numbers.
> >>
> >
> > This suggestion seems to lead to some minor regressions, I'm
> > investigating the issue.  Will send a v2 as soon as I have something
> > along the lines of what you suggested running with equivalent
> > performance to v1.
> 
> I think I've figured out why both of the alternatives we were talking
> about above lead to a couple percent regressions in latency-sensitive
> workloads: In some scenarios it's possible for execlist_dequeue() to
> execute after the GPU has gone idle, but before we've processed the
> corresponding CSB entries, particularly when called from the
> submit_queue() path.  In that case __execlists_schedule_in() will think
> that the next request is overlapping, and tell CPU power management to
> relax, even though the GPU is starving intermittently.
> 
> How about we do the same:
> 
> |       if (atomic_xchg(&execlists->overload, 0))
> |               intel_gt_pm_active_end(engine->gt);
> 
> as in this patch from process_csb() in response to each completion CSB
> entry, which ensures that the system is considered non-GPU-bound as soon
> as the first context completes.  Subsequently if another CSB entry
> signals a dual-ELSP active-to-idle transition or a dual-ELSP preemption
> we call intel_gt_pm_active_begin() directly from process_csb().  If we
> hit a single-ELSP preemption CSB entry we call intel_gt_pm_active_end()
> instead, in order to avoid the problem you pointed out in your previous
> email.
> 
> How does that sound to you?  [Still need to verify that it has
> comparable performance to this patch overall.]

Sounds like we're trying to compensate for ksoftirqd latency, which is a
killer overall. How about something as simple as

execlists_submit_ports:
	tasklet_hi_schedule(&execlists->tasklet);

which will then be run immediately from local context at the end of the
direct submission... Unless it's already queued on another CPU. Instead
of waiting for that, we may manually try to kick it locally.

As your latency governor is kicked from a worker, iirc, we should still
be executing before it has a chance to process a partial update. I hope.

Anyway, if it is the ksoftirqd latency hurting here, it's not a problem
unique to the governor and I would like to improve it :)
-Chris
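
For illustration, a rough sketch of the local kick described above
(kick_submission_tasklet() is a made-up helper; a duplicate run from
ksoftirqd after the direct call is assumed to be harmless):

static void kick_submission_tasklet(struct intel_engine_execlists *execlists)
{
	struct tasklet_struct *t = &execlists->tasklet;

	/* Queue the tasklet so it runs as soon as softirqs are re-enabled. */
	tasklet_hi_schedule(t);

	/* If nobody else owns it, run it right now instead of waiting. */
	if (tasklet_trylock(t)) {
		if (!atomic_read(&t->count)) /* tasklet still enabled? */
			t->func(t->data);
		tasklet_unlock(t);
	}
}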

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 53ac3f00909a..16ebdfa1dfc9 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -504,6 +504,7 @@  void intel_engine_init_execlists(struct intel_engine_cs *engine)
 
 	execlists->queue_priority_hint = INT_MIN;
 	execlists->queue = RB_ROOT_CACHED;
+	atomic_set(&execlists->overload, 0);
 }
 
 static void cleanup_status_page(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 80cdde712842..1b17b2f0c7a3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -266,6 +266,13 @@  struct intel_engine_execlists {
 	 */
 	u8 csb_head;
 
+	/**
+	 * @overload: whether at least two execlist ports are
+	 * currently submitted to the hardware, indicating that CPU
+	 * latency isn't critical in order to maintain the GPU busy.
+	 */
+	atomic_t overload;
+
 	I915_SELFTEST_DECLARE(struct st_preempt_hang preempt_hang;)
 };
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 8b653c0f5e5f..f1f859e89a8f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -107,6 +107,102 @@  void intel_gt_pm_init_early(struct intel_gt *gt)
 	intel_wakeref_init(&gt->wakeref, gt->uncore->rpm, &wf_ops);
 }
 
+/**
+ * Time increment until the most immediate PM QoS response frequency
+ * update.
+ *
+ * May be in the future (return value > 0) if the GPU is currently
+ * active but we haven't updated the PM QoS request to reflect a
+ * bottleneck yet.  May be in the past (return value < 0) if the GPU
+ * isn't fully utilized and we've already reset the PM QoS request to
+ * the default value.  May be zero if a PM QoS request update is due.
+ *
+ * The time increment returned by this function decreases linearly
+ * with time until it reaches either zero or a configurable limit.
+ */
+static int32_t time_to_rf_qos_update_ns(struct intel_gt *gt)
+{
+	const uint64_t t1 = ktime_get_ns();
+	const uint64_t dt1 = gt->rf_qos.delay_max_ns;
+
+	if (atomic_read_acquire(&gt->rf_qos.active_count)) {
+		const uint64_t t0 = atomic64_read(&gt->rf_qos.time_set_ns);
+
+		return min(dt1, t0 <= t1 ? 0 : t0 - t1);
+	} else {
+		const uint64_t t0 = atomic64_read(&gt->rf_qos.time_clear_ns);
+		const unsigned int shift = gt->rf_qos.delay_slope_shift;
+
+		return -(int32_t)(t1 <= t0 ? 1 :
+				  min(dt1, (t1 - t0) << shift));
+	}
+}
+
+/**
+ * Perform a delayed PM QoS response frequency update.
+ */
+static void intel_gt_rf_qos_update(struct intel_gt *gt)
+{
+	const uint32_t dt = max(0, time_to_rf_qos_update_ns(gt));
+
+	timer_reduce(&gt->rf_qos.timer, jiffies + nsecs_to_jiffies(dt));
+}
+
+/**
+ * Timer that fires once the delay used to switch the PM QoS response
+ * frequency request has elapsed.
+ */
+static void intel_gt_rf_qos_timeout(struct timer_list *timer)
+{
+	struct intel_gt *gt = container_of(timer, struct intel_gt,
+					   rf_qos.timer);
+	const int32_t dt = time_to_rf_qos_update_ns(gt);
+
+	if (dt == 0)
+		cpu_response_frequency_qos_update_request(
+			&gt->rf_qos.req, gt->rf_qos.target_hz);
+	else
+		cpu_response_frequency_qos_update_request(
+			&gt->rf_qos.req, PM_QOS_DEFAULT_VALUE);
+
+	if (dt > 0)
+		intel_gt_rf_qos_update(gt);
+}
+
+/**
+ * Report the beginning of a period of GPU utilization to PM.
+ *
+ * May trigger a more energy-efficient response mode in CPU PM, but
+ * only after a certain delay has elapsed so we don't have a negative
+ * impact on the CPU ramp-up latency except after the GPU has been
+ * continuously utilized for a long enough period of time.
+ */
+void intel_gt_pm_active_begin(struct intel_gt *gt)
+{
+	const uint32_t dt = abs(time_to_rf_qos_update_ns(gt));
+
+	atomic64_set(&gt->rf_qos.time_set_ns, ktime_get_ns() + dt);
+
+	if (!atomic_fetch_inc_release(&gt->rf_qos.active_count))
+		intel_gt_rf_qos_update(gt);
+}
+
+/**
+ * Report the end of a period of GPU utilization to PM.
+ *
+ * Must be called once after each call to intel_gt_pm_active_begin().
+ */
+void intel_gt_pm_active_end(struct intel_gt *gt)
+{
+	const uint32_t dt = abs(time_to_rf_qos_update_ns(gt));
+	const unsigned int shift = gt->rf_qos.delay_slope_shift;
+
+	atomic64_set(&gt->rf_qos.time_clear_ns, ktime_get_ns() - (dt >> shift));
+
+	if (!atomic_dec_return_release(&gt->rf_qos.active_count))
+		intel_gt_rf_qos_update(gt);
+}
+
 void intel_gt_pm_init(struct intel_gt *gt)
 {
 	/*
@@ -116,6 +212,14 @@  void intel_gt_pm_init(struct intel_gt *gt)
 	 */
 	intel_rc6_init(&gt->rc6);
 	intel_rps_init(&gt->rps);
+
+	cpu_response_frequency_qos_add_request(&gt->rf_qos.req,
+					       PM_QOS_DEFAULT_VALUE);
+
+	gt->rf_qos.delay_max_ns = 250000;
+	gt->rf_qos.delay_slope_shift = 0;
+	gt->rf_qos.target_hz = 2;
+	timer_setup(&gt->rf_qos.timer, intel_gt_rf_qos_timeout, 0);
 }
 
 static bool reset_engines(struct intel_gt *gt)
@@ -170,6 +274,9 @@  static void gt_sanitize(struct intel_gt *gt, bool force)
 
 void intel_gt_pm_fini(struct intel_gt *gt)
 {
+	del_timer_sync(&gt->rf_qos.timer);
+	cpu_response_frequency_qos_remove_request(&gt->rf_qos.req);
+
 	intel_rc6_fini(&gt->rc6);
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index 60f0e2fbe55c..43f1d45fb0db 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -58,6 +58,9 @@  int intel_gt_resume(struct intel_gt *gt);
 void intel_gt_runtime_suspend(struct intel_gt *gt);
 int intel_gt_runtime_resume(struct intel_gt *gt);
 
+void intel_gt_pm_active_begin(struct intel_gt *gt);
+void intel_gt_pm_active_end(struct intel_gt *gt);
+
 static inline bool is_mock_gt(const struct intel_gt *gt)
 {
 	return I915_SELFTEST_ONLY(gt->awake == -ENODEV);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index 96890dd12b5f..4bc80c55e6f0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -10,6 +10,7 @@ 
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
+#include <linux/pm_qos.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 
@@ -97,6 +98,17 @@  struct intel_gt {
 	 * Reserved for exclusive use by the kernel.
 	 */
 	struct i915_address_space *vm;
+
+	struct {
+		struct pm_qos_request req;
+		struct timer_list timer;
+		uint32_t target_hz;
+		uint32_t delay_max_ns;
+		uint32_t delay_slope_shift;
+		atomic64_t time_set_ns;
+		atomic64_t time_clear_ns;
+		atomic_t active_count;
+	} rf_qos;
 };
 
 enum intel_gt_scratch_field {
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index b9b3f78f1324..a5d7a80b826d 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1577,6 +1577,11 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 	/* we need to manually load the submit queue */
 	if (execlists->ctrl_reg)
 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
+
+	if (execlists_num_ports(execlists) > 1 &&
+	    execlists->pending[1] &&
+	    !atomic_xchg(&execlists->overload, 1))
+		intel_gt_pm_active_begin(&engine->i915->gt);
 }
 
 static bool ctx_single_port_submission(const struct intel_context *ce)
@@ -2213,6 +2218,12 @@  cancel_port_requests(struct intel_engine_execlists * const execlists)
 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
 
 	WRITE_ONCE(execlists->active, execlists->inflight);
+
+	if (atomic_xchg(&execlists->overload, 0)) {
+		struct intel_engine_cs *engine =
+			container_of(execlists, typeof(*engine), execlists);
+		intel_gt_pm_active_end(&engine->i915->gt);
+	}
 }
 
 static inline void
@@ -2386,6 +2397,9 @@  static void process_csb(struct intel_engine_cs *engine)
 			/* port0 completed, advanced to port1 */
 			trace_ports(execlists, "completed", execlists->active);
 
+			if (atomic_xchg(&execlists->overload, 0))
+				intel_gt_pm_active_end(&engine->i915->gt);
+
 			/*
 			 * We rely on the hardware being strongly
 			 * ordered, that the breadcrumb write is