[06/19] drm/i915/gt: Schedule request retirement when submission idles
diff mbox series

Message ID 20191118230254.2615942-7-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/19] drm/i915/selftests: Force bonded submission to overlap
Related show

Commit Message

Chris Wilson Nov. 18, 2019, 11:02 p.m. UTC
The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
corruption WA") is that it disables RC6 while Skylake (and friends) is
active, and we do not consider the GPU idle until all outstanding
requests have been retired and the engine switched over to the kernel
context. If userspace is idle, this task falls onto our background idle
worker, which only runs roughly once a second, meaning that userspace has
to have been idle for a couple of seconds before we enable RC6 again.
Naturally, this causes us to consume considerably more energy than
before as powersaving is effectively disabled while a display server
(here's looking at you Xorg) is running.

As execlists will get a completion event as the last context is
completed and the GPU goes idle, we can use our submission tasklet to
notice when the GPU is idle and kick the retire worker. Thus during
light workloads, we will do much more work to idle the GPU faster...
Hopefully with commensurate power saving!

References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_requests.h |  9 ++++++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c         | 13 +++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin Nov. 19, 2019, 3:04 p.m. UTC | #1
On 18/11/2019 23:02, Chris Wilson wrote:
> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> corruption WA") is that it disables RC6 while Skylake (and friends) is
> active, and we do not consider the GPU idle until all outstanding
> requests have been retired and the engine switched over to the kernel
> context. If userspace is idle, this task falls onto our background idle
> worker, which only runs roughly once a second, meaning that userspace has
> to have been idle for a couple of seconds before we enable RC6 again.
> Naturally, this causes us to consume considerably more energy than
> before as powersaving is effectively disabled while a display server
> (here's looking at you Xorg) is running.
> 
> As execlists will get a completion event as the last context is
> completed and the GPU goes idle, we can use our submission tasklet to
> notice when the GPU is idle and kick the retire worker. Thus during
> light workloads, we will do much more work to idle the GPU faster...
> Hopefully with commensurate power saving!
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_gt_requests.h |  9 ++++++++-
>   drivers/gpu/drm/i915/gt/intel_lrc.c         | 13 +++++++++++++
>   2 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> index fde546424c63..94b8758a45d9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> @@ -7,7 +7,9 @@
>   #ifndef INTEL_GT_REQUESTS_H
>   #define INTEL_GT_REQUESTS_H
>   
> -struct intel_gt;
> +#include <linux/workqueue.h>
> +
> +#include "intel_gt_types.h"
>   
>   long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
>   static inline void intel_gt_retire_requests(struct intel_gt *gt)
> @@ -15,6 +17,11 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
>   	intel_gt_retire_requests_timeout(gt, 0);
>   }
>   
> +static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
> +{
> +	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
> +}
> +
>   int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
>   
>   void intel_gt_init_requests(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 33ce258d484f..f7c8fec436a9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -142,6 +142,7 @@
>   #include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   #include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
>   #include "intel_lrc_reg.h"
>   #include "intel_mocs.h"
>   #include "intel_reset.h"
> @@ -2278,6 +2279,18 @@ static void execlists_submission_tasklet(unsigned long data)
>   		if (timeout && preempt_timeout(engine))
>   			preempt_reset(engine);
>   	}
> +
> +	/*
> +	 * If the GPU is currently idle, retire the outstanding completed
> +	 * requests. This will allow us to enter soft-rc6 as soon as possible,
> +	 * albeit at the cost of running the retire worker much more frequently
> +	 * (over the entire GT not just this engine) and emitting more idle
> +	 * barriers (i.e. kernel context switches unpinning all that went
> +	 * before) which may add some extra latency.
> +	 */
> +	if (intel_engine_pm_is_awake(engine) &&
> +	    !execlists_active(&engine->execlists))
> +		intel_gt_schedule_retire_requests(engine->gt);

I am still not a fan of doing this for all platforms.

It's not just the cost of retirement but there is 
intel_engine_flush_submission on all engines in there as well which we 
cannot avoid triggering from this path.

Would it be worth experimenting with additional per-engine retire 
workers? Most of the code could be shared, just a little bit of 
specialization to filter on engine.

Regards,

Tvrtko

>   }
>   
>   static void __execlists_kick(struct intel_engine_execlists *execlists)
>
Chris Wilson Nov. 19, 2019, 4:20 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-11-19 15:04:46)
> 
> On 18/11/2019 23:02, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > index 33ce258d484f..f7c8fec436a9 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > @@ -142,6 +142,7 @@
> >   #include "intel_engine_pm.h"
> >   #include "intel_gt.h"
> >   #include "intel_gt_pm.h"
> > +#include "intel_gt_requests.h"
> >   #include "intel_lrc_reg.h"
> >   #include "intel_mocs.h"
> >   #include "intel_reset.h"
> > @@ -2278,6 +2279,18 @@ static void execlists_submission_tasklet(unsigned long data)
> >               if (timeout && preempt_timeout(engine))
> >                       preempt_reset(engine);
> >       }
> > +
> > +     /*
> > +      * If the GPU is currently idle, retire the outstanding completed
> > +      * requests. This will allow us to enter soft-rc6 as soon as possible,
> > +      * albeit at the cost of running the retire worker much more frequently
> > +      * (over the entire GT not just this engine) and emitting more idle
> > +      * barriers (i.e. kernel context switches unpinning all that went
> > +      * before) which may add some extra latency.
> > +      */
> > +     if (intel_engine_pm_is_awake(engine) &&
> > +         !execlists_active(&engine->execlists))
> > +             intel_gt_schedule_retire_requests(engine->gt);
> 
> I am still not a fan of doing this for all platforms.

I understand. I think it makes a fair amount of sense to do early
retires, and wish to pursue that if I can show there is no harm.
 
> Its not just the cost of retirement but there is 
> intel_engine_flush_submission on all engines in there as well which we 
> cannot avoid triggering from this path.
> 
> Would it be worth experimenting with additional per-engine retire 
> workers? Most of the code could be shared, just a little bit of 
> specialization to filter on engine.

I haven't sketched out anything more than peeking at the last request on
the timeline and doing a rq->engine == engine filter. Walking the global
timeline.active_list in that case is also a nuisance.

There's definitely scope here for us using some more information from
process_csb() about which context completed and limit work to that
timeline. Hmm, something along those lines maybe...
-Chris
Tvrtko Ursulin Nov. 19, 2019, 4:33 p.m. UTC | #3
On 19/11/2019 16:20, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-19 15:04:46)
>>
>> On 18/11/2019 23:02, Chris Wilson wrote:
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index 33ce258d484f..f7c8fec436a9 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -142,6 +142,7 @@
>>>    #include "intel_engine_pm.h"
>>>    #include "intel_gt.h"
>>>    #include "intel_gt_pm.h"
>>> +#include "intel_gt_requests.h"
>>>    #include "intel_lrc_reg.h"
>>>    #include "intel_mocs.h"
>>>    #include "intel_reset.h"
>>> @@ -2278,6 +2279,18 @@ static void execlists_submission_tasklet(unsigned long data)
>>>                if (timeout && preempt_timeout(engine))
>>>                        preempt_reset(engine);
>>>        }
>>> +
>>> +     /*
>>> +      * If the GPU is currently idle, retire the outstanding completed
>>> +      * requests. This will allow us to enter soft-rc6 as soon as possible,
>>> +      * albeit at the cost of running the retire worker much more frequently
>>> +      * (over the entire GT not just this engine) and emitting more idle
>>> +      * barriers (i.e. kernel context switches unpinning all that went
>>> +      * before) which may add some extra latency.
>>> +      */
>>> +     if (intel_engine_pm_is_awake(engine) &&
>>> +         !execlists_active(&engine->execlists))
>>> +             intel_gt_schedule_retire_requests(engine->gt);
>>
>> I am still not a fan of doing this for all platforms.
> 
> I understand. I think it makes a fair amount of sense to do early
> retires, and wish to pursue that if I can show there is no harm.

It's also a bit of a layering problem.

>> Its not just the cost of retirement but there is
>> intel_engine_flush_submission on all engines in there as well which we
>> cannot avoid triggering from this path.
>>
>> Would it be worth experimenting with additional per-engine retire
>> workers? Most of the code could be shared, just a little bit of
>> specialization to filter on engine.
> 
> I haven't sketched out anything more than peeking at the last request on
> the timeline and doing a rq->engine == engine filter. Walking the global
> timeline.active_list in that case is also a nuisance.

That together with:

	flush_submission(gt, engine ? engine->mask : ALL_ENGINES);

Might be enough? At least to satisfy my concern.

Apart from that, the layering is still bad.. And I'd still limit it to 
when the RC6 WA is active unless it can be shown there is no perf/power 
impact across GPU/CPU to do this everywhere.

At which point it becomes easier to just limit it because we have to 
have it there.

I also wonder if the current flush_submission wasn't the reason for the 
performance regression you were seeing with this? It makes this tasklet 
wait for all other engines, if they are busy. But not sure.. perhaps it 
is work which would be done anyway.

> There's definitely scope here for us using some more information from
> process_csb() about which context completed and limit work to that
> timeline. Hmm, something along those lines maybe...

But you want to retire all timelines which have work on this particular 
physical engine. Otherwise it doesn't get parked, no?

Regards,

Tvrtko
Chris Wilson Nov. 19, 2019, 4:42 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-11-19 16:33:18)
> 
> On 19/11/2019 16:20, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-19 15:04:46)
> >>
> >> On 18/11/2019 23:02, Chris Wilson wrote:
> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>> index 33ce258d484f..f7c8fec436a9 100644
> >>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> >>> @@ -142,6 +142,7 @@
> >>>    #include "intel_engine_pm.h"
> >>>    #include "intel_gt.h"
> >>>    #include "intel_gt_pm.h"
> >>> +#include "intel_gt_requests.h"
> >>>    #include "intel_lrc_reg.h"
> >>>    #include "intel_mocs.h"
> >>>    #include "intel_reset.h"
> >>> @@ -2278,6 +2279,18 @@ static void execlists_submission_tasklet(unsigned long data)
> >>>                if (timeout && preempt_timeout(engine))
> >>>                        preempt_reset(engine);
> >>>        }
> >>> +
> >>> +     /*
> >>> +      * If the GPU is currently idle, retire the outstanding completed
> >>> +      * requests. This will allow us to enter soft-rc6 as soon as possible,
> >>> +      * albeit at the cost of running the retire worker much more frequently
> >>> +      * (over the entire GT not just this engine) and emitting more idle
> >>> +      * barriers (i.e. kernel context switches unpinning all that went
> >>> +      * before) which may add some extra latency.
> >>> +      */
> >>> +     if (intel_engine_pm_is_awake(engine) &&
> >>> +         !execlists_active(&engine->execlists))
> >>> +             intel_gt_schedule_retire_requests(engine->gt);
> >>
> >> I am still not a fan of doing this for all platforms.
> > 
> > I understand. I think it makes a fair amount of sense to do early
> > retires, and wish to pursue that if I can show there is no harm.
> 
> It's also a bit of a layering problem.

Them's fighting words! :)
 
> >> Its not just the cost of retirement but there is
> >> intel_engine_flush_submission on all engines in there as well which we
> >> cannot avoid triggering from this path.
> >>
> >> Would it be worth experimenting with additional per-engine retire
> >> workers? Most of the code could be shared, just a little bit of
> >> specialization to filter on engine.
> > 
> > I haven't sketched out anything more than peeking at the last request on
> > the timeline and doing a rq->engine == engine filter. Walking the global
> > timeline.active_list in that case is also a nuisance.
> 
> That together with:
> 
>         flush_submission(gt, engine ? engine->mask : ALL_ENGINES);
> 
> Might be enough? At least to satisfy my concern.

Aye, flushing all other when we know we only care about being idle is
definitely a weak point of the current scheme.

> Apart layering is still bad.. And I'd still limit it to when RC6 WA is 
> active unless it can be shown there is no perf/power impact across 
> GPU/CPU to do this everywhere.

Bah, keep tuning until it's a win for everyone!
 
> At which point it becomes easier to just limit it because we have to 
> have it there.
> 
> I also wonder if the current flush_submission wasn't the reason for 
> performance regression you were seeing with this? It makes this tasklet 
> wait for all other engines, if they are busy. But not sure.. perhaps it 
> is work which would be done anyway.

I haven't finished yet; but the baseline took a big nose dive so it
might be enough to hide a lot of evil.

Too bad I don't have an Icelake to cross check with an unaffected
platform.

> > There's definitely scope here for us using some more information from
> > process_csb() about which context completed and limit work to that
> > timeline. Hmm, something along those lines maybe...
> 
> But you want to retire all timelines which have work on this particular 
> physical engine. Otherwise it doesn't get parked, no?

There I was suggesting being even more proactive, and say keeping an
llist of completed timelines. Nothing concrete yet, plenty of existing
races found already that need fixing.
-Chris
Chris Wilson Nov. 19, 2019, 6:58 p.m. UTC | #5
Quoting Chris Wilson (2019-11-19 16:42:28)
> Quoting Tvrtko Ursulin (2019-11-19 16:33:18)
> > I also wonder if the current flush_submission wasn't the reason for 
> > performance regression you were seeing with this? It makes this tasklet 
> > wait for all other engines, if they are busy. But not sure.. perhaps it 
> > is work which would be done anyway.
> 
> I haven't finished yet; but the baseline took a big nose dive so it
> might be enough to hide a lot of evil.

Early results yet, but the extra work is exacerbating the regressions in
gem_wsim, enough that this cannot land as is and we do need to be
smarter.
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
index fde546424c63..94b8758a45d9 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
@@ -7,7 +7,9 @@ 
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
-struct intel_gt;
+#include <linux/workqueue.h>
+
+#include "intel_gt_types.h"
 
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
 static inline void intel_gt_retire_requests(struct intel_gt *gt)
@@ -15,6 +17,11 @@  static inline void intel_gt_retire_requests(struct intel_gt *gt)
 	intel_gt_retire_requests_timeout(gt, 0);
 }
 
+static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
+{
+	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
+}
+
 int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
 
 void intel_gt_init_requests(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 33ce258d484f..f7c8fec436a9 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -142,6 +142,7 @@ 
 #include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_reset.h"
@@ -2278,6 +2279,18 @@  static void execlists_submission_tasklet(unsigned long data)
 		if (timeout && preempt_timeout(engine))
 			preempt_reset(engine);
 	}
+
+	/*
+	 * If the GPU is currently idle, retire the outstanding completed
+	 * requests. This will allow us to enter soft-rc6 as soon as possible,
+	 * albeit at the cost of running the retire worker much more frequently
+	 * (over the entire GT not just this engine) and emitting more idle
+	 * barriers (i.e. kernel context switches unpinning all that went
+	 * before) which may add some extra latency.
+	 */
+	if (intel_engine_pm_is_awake(engine) &&
+	    !execlists_active(&engine->execlists))
+		intel_gt_schedule_retire_requests(engine->gt);
 }
 
 static void __execlists_kick(struct intel_engine_execlists *execlists)