
[9/9] drm/i915/gt: Schedule request retirement when timeline idles

Message ID 20191120093302.3723715-9-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [1/9] drm/i915/selftests: Take a ref to the request we wait upon

Commit Message

Chris Wilson Nov. 20, 2019, 9:33 a.m. UTC
The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
corruption WA") is that it disables RC6 while Skylake (and friends) is
active, and we do not consider the GPU idle until all outstanding
requests have been retired and the engine switched over to the kernel
context. If userspace is idle, this task falls onto our background idle
worker, which only runs roughly once a second, meaning that userspace has
to have been idle for a couple of seconds before we enable RC6 again.
Naturally, this causes us to consume considerably more energy than
before as powersaving is effectively disabled while a display server
(here's looking at you Xorg) is running.

As execlists will get a completion event as each context is completed,
we can use this interrupt to queue a retire worker bound to this engine
to clean up idle timelines. We will then immediately notice the idle
engine (without userspace intervention or the aid of the background
retire worker) and start parking the GPU. Thus during light workloads,
we will do much more work to idle the GPU faster...  Hopefully with
commensurate power saving!

v2: Watch context completions and only look at those local to the engine
when retiring to reduce the amount of excess work we perform.

References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
 drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
 .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
 7 files changed, 116 insertions(+), 4 deletions(-)

Comments

Tvrtko Ursulin Nov. 20, 2019, 1:16 p.m. UTC | #1
On 20/11/2019 09:33, Chris Wilson wrote:
> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> corruption WA") is that it disables RC6 while Skylake (and friends) is
> active, and we do not consider the GPU idle until all outstanding
> requests have been retired and the engine switched over to the kernel
> context. If userspace is idle, this task falls onto our background idle
> worker, which only runs roughly once a second, meaning that userspace has
> to have been idle for a couple of seconds before we enable RC6 again.
> Naturally, this causes us to consume considerably more energy than
> before as powersaving is effectively disabled while a display server
> (here's looking at you Xorg) is running.
> 
> As execlists will get a completion event as each context is completed,
> we can use this interrupt to queue a retire worker bound to this engine
> to clean up idle timelines. We will then immediately notice the idle
> engine (without userspace intervention or the aid of the background
> retire worker) and start parking the GPU. Thus during light workloads,
> we will do much more work to idle the GPU faster...  Hopefully with
> commensurate power saving!
> 
> v2: Watch context completions and only look at those local to the engine
> when retiring to reduce the amount of excess work we perform.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
>   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
>   drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
>   drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
>   drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
>   .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
>   7 files changed, 116 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index b9613d044393..8f6e353caa66 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -28,13 +28,13 @@
>   
>   #include "i915_drv.h"
>   
> -#include "gt/intel_gt.h"
> -
> +#include "intel_context.h"
>   #include "intel_engine.h"
>   #include "intel_engine_pm.h"
>   #include "intel_engine_pool.h"
>   #include "intel_engine_user.h"
> -#include "intel_context.h"
> +#include "intel_gt.h"
> +#include "intel_gt_requests.h"
>   #include "intel_lrc.h"
>   #include "intel_reset.h"
>   #include "intel_ring.h"
> @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
>   	intel_engine_init_execlists(engine);
>   	intel_engine_init_cmd_parser(engine);
>   	intel_engine_init__pm(engine);
> +	intel_engine_init_retire(engine);
>   
>   	intel_engine_pool_init(&engine->pool);
>   
> @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>   
>   	cleanup_status_page(engine);
>   
> +	intel_engine_fini_retire(engine);
>   	intel_engine_pool_fini(&engine->pool);
>   	intel_engine_fini_breadcrumbs(engine);
>   	intel_engine_cleanup_cmd_parser(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 758f0e8ec672..17f1f1441efc 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -451,6 +451,14 @@ struct intel_engine_cs {
>   
>   	struct intel_engine_execlists execlists;
>   
> +	/*
> +	 * Keep track of completed timelines on this engine for early
> +	 * retirement with the goal of quickly enabling powersaving as
> +	 * soon as the engine is idle.
> +	 */
> +	struct intel_timeline *retire;
> +	struct work_struct retire_work;
> +
>   	/* status_notifier: list of callbacks for context-switch changes */
>   	struct atomic_notifier_head context_status_notifier;
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 4dc3cbeb1b36..4a98fefdf915 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
>   		intel_engine_flush_submission(engine);
>   }
>   
> +static void engine_retire(struct work_struct *work)
> +{
> +	struct intel_engine_cs *engine =
> +		container_of(work, typeof(*engine), retire_work);
> +	struct intel_timeline *tl = xchg(&engine->retire, NULL);

Shouldn't this be atomic_xchg to avoid racing with add_retire?

> +
> +	do {
> +		struct intel_timeline *next = xchg(&tl->retire, NULL);

Here as well?

> +
> +		/*
> +		 * Our goal here is to retire _idle_ timelines as soon as
> +		 * possible (as they are idle, we do not expect userspace
> +		 * to be cleaning up anytime soon).
> +		 *
> +		 * If the tl->active_count is already zero, someone else
> +		 * should have retired the timeline. Equally if the timeline
> +		 * is currently locked, either it is being retired elsewhere
> +		 * or about to be!
> +		 */
> +		if (atomic_read(&tl->active_count) &&
> +		    mutex_trylock(&tl->mutex)) {
> +			retire_requests(tl);
> +			mutex_unlock(&tl->mutex);
> +		}
> +		intel_timeline_put(tl);
> +
> +		GEM_BUG_ON(!next);
> +		tl = ptr_mask_bits(next, 1);

You sometimes expect engine->retire to contain 0x1?

> +	} while (tl);
> +}
> +
> +static bool add_retire(struct intel_engine_cs *engine,
> +		       struct intel_timeline *tl)
> +{
> +	struct intel_timeline *first = READ_ONCE(engine->retire);
> +
> +	/*
> +	 * We open-code a llist here to include the additional tag [BIT(0)]
> +	 * so that we know when the timeline is already on a
> +	 * retirement queue: either this engine or another.
> +	 *
> +	 * However, we rely on that a timeline can only be active on a single
> +	 * engine at any one time and that add_retire() is called before the
> +	 * engine releases the timeline and transferred to another to retire.
> +	 */
> +
> +	if (READ_ONCE(tl->retire)) /* already queued */
> +		return false;

Can't this go first in the function?

> +
> +	intel_timeline_get(tl);
> +	do
> +		tl->retire = ptr_pack_bits(first, 1, 1);

Here you rely on assignment being atomic right?

> +	while (!try_cmpxchg(&engine->retire, &first, tl));

So the loop is effectively creating a chain of timelines to retire on 
this engine.

What happens with virtual engines when a timeline goes to different 
engine before (well or any single timeline context) the retire worker 
runs? Ah okay, it gets re-assigned to the most recent engine.

I am not sure about the BIT(0) business. It's always set on write so I 
am not getting why it is useful.

Regards,

Tvrtko

> +
> +	return !first;
> +}
> +
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> +			     struct intel_timeline *tl)
> +{
> +	if (add_retire(engine, tl))
> +		schedule_work(&engine->retire_work);
> +}
> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine)
> +{
> +	INIT_WORK(&engine->retire_work, engine_retire);
> +}
> +
> +void intel_engine_fini_retire(struct intel_engine_cs *engine)
> +{
> +	flush_work(&engine->retire_work);
> +}
> +
>   long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   {
>   	struct intel_gt_timelines *timelines = &gt->timelines;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> index fde546424c63..8de559b5a033 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> @@ -7,7 +7,12 @@
>   #ifndef INTEL_GT_REQUESTS_H
>   #define INTEL_GT_REQUESTS_H
>   
> -struct intel_gt;
> +#include <linux/workqueue.h>
> +
> +#include "intel_gt_types.h"
> +
> +struct intel_engine_cs;
> +struct intel_timeline;
>   
>   long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
>   static inline void intel_gt_retire_requests(struct intel_gt *gt)
> @@ -15,6 +20,16 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
>   	intel_gt_retire_requests_timeout(gt, 0);
>   }
>   
> +static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
> +{
> +	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
> +}

This is unused in v2.

> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine);
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> +			     struct intel_timeline *tl);
> +void intel_engine_fini_retire(struct intel_engine_cs *engine);
> +
>   int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
>   
>   void intel_gt_init_requests(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index b65bc06855b0..2ceaa2f22996 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -142,6 +142,7 @@
>   #include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   #include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
>   #include "intel_lrc_reg.h"
>   #include "intel_mocs.h"
>   #include "intel_reset.h"
> @@ -1170,6 +1171,14 @@ __execlists_schedule_out(struct i915_request *rq,
>   	 * refrain from doing non-trivial work here.
>   	 */
>   
> +	/*
> +	 * If we have just completed this context, the engine may now be
> +	 * idle and we want to re-enter powersaving.
> +	 */
> +	if (list_is_last(&rq->link, &ce->timeline->requests) &&
> +	    i915_request_completed(rq))
> +		intel_engine_add_retire(engine, ce->timeline);
> +
>   	intel_engine_context_out(engine);
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
>   	intel_gt_pm_put_async(engine->gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index b190a5d9ab02..c1d2419444f8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -277,6 +277,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
>   {
>   	GEM_BUG_ON(atomic_read(&timeline->pin_count));
>   	GEM_BUG_ON(!list_empty(&timeline->requests));
> +	GEM_BUG_ON(timeline->retire);
>   
>   	if (timeline->hwsp_cacheline)
>   		cacheline_free(timeline->hwsp_cacheline);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> index 5244615ed1cb..aaf15cbe1ce1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> @@ -66,6 +66,9 @@ struct intel_timeline {
>   	 */
>   	struct i915_active_fence last_request;
>   
> +	/** A chain of completed timelines ready for early retirement. */
> +	struct intel_timeline *retire;
> +
>   	/**
>   	 * We track the most recent seqno that we wait on in every context so
>   	 * that we only have to emit a new await and dependency on a more
>
Chris Wilson Nov. 20, 2019, 1:39 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
> 
> On 20/11/2019 09:33, Chris Wilson wrote:
> > The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> > corruption WA") is that it disables RC6 while Skylake (and friends) is
> > active, and we do not consider the GPU idle until all outstanding
> > requests have been retired and the engine switched over to the kernel
> > context. If userspace is idle, this task falls onto our background idle
> > worker, which only runs roughly once a second, meaning that userspace has
> > to have been idle for a couple of seconds before we enable RC6 again.
> > Naturally, this causes us to consume considerably more energy than
> > before as powersaving is effectively disabled while a display server
> > (here's looking at you Xorg) is running.
> > 
> > As execlists will get a completion event as each context is completed,
> > we can use this interrupt to queue a retire worker bound to this engine
> > to clean up idle timelines. We will then immediately notice the idle
> > engine (without userspace intervention or the aid of the background
> > retire worker) and start parking the GPU. Thus during light workloads,
> > we will do much more work to idle the GPU faster...  Hopefully with
> > commensurate power saving!
> > 
> > v2: Watch context completions and only look at those local to the engine
> > when retiring to reduce the amount of excess work we perform.
> > 
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> > References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
> >   drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
> >   drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
> >   .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
> >   7 files changed, 116 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > index b9613d044393..8f6e353caa66 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > @@ -28,13 +28,13 @@
> >   
> >   #include "i915_drv.h"
> >   
> > -#include "gt/intel_gt.h"
> > -
> > +#include "intel_context.h"
> >   #include "intel_engine.h"
> >   #include "intel_engine_pm.h"
> >   #include "intel_engine_pool.h"
> >   #include "intel_engine_user.h"
> > -#include "intel_context.h"
> > +#include "intel_gt.h"
> > +#include "intel_gt_requests.h"
> >   #include "intel_lrc.h"
> >   #include "intel_reset.h"
> >   #include "intel_ring.h"
> > @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
> >       intel_engine_init_execlists(engine);
> >       intel_engine_init_cmd_parser(engine);
> >       intel_engine_init__pm(engine);
> > +     intel_engine_init_retire(engine);
> >   
> >       intel_engine_pool_init(&engine->pool);
> >   
> > @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
> >   
> >       cleanup_status_page(engine);
> >   
> > +     intel_engine_fini_retire(engine);
> >       intel_engine_pool_fini(&engine->pool);
> >       intel_engine_fini_breadcrumbs(engine);
> >       intel_engine_cleanup_cmd_parser(engine);
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 758f0e8ec672..17f1f1441efc 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -451,6 +451,14 @@ struct intel_engine_cs {
> >   
> >       struct intel_engine_execlists execlists;
> >   
> > +     /*
> > +      * Keep track of completed timelines on this engine for early
> > +      * retirement with the goal of quickly enabling powersaving as
> > +      * soon as the engine is idle.
> > +      */
> > +     struct intel_timeline *retire;
> > +     struct work_struct retire_work;
> > +
> >       /* status_notifier: list of callbacks for context-switch changes */
> >       struct atomic_notifier_head context_status_notifier;
> >   
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > index 4dc3cbeb1b36..4a98fefdf915 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
> >               intel_engine_flush_submission(engine);
> >   }
> >   
> > +static void engine_retire(struct work_struct *work)
> > +{
> > +     struct intel_engine_cs *engine =
> > +             container_of(work, typeof(*engine), retire_work);
> > +     struct intel_timeline *tl = xchg(&engine->retire, NULL);
> 
> Shouldn't this be atomic_xchg to avoid racing with add_retire?
> 
> > +
> > +     do {
> > +             struct intel_timeline *next = xchg(&tl->retire, NULL);
> 
> Here as well?

xchg() is always locked.

atomic_xchg() operates on atomic_t; xchg() works on any variable, like
cmpxchg().
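
To illustrate the distinction, a minimal sketch (the variable names here
are made up):

	static atomic_t count;
	static struct intel_timeline *head;

	static void example(void)
	{
		/* atomic_xchg() only accepts an atomic_t... */
		int old = atomic_xchg(&count, 0);

		/* ...while xchg() accepts any word-sized lvalue and is
		 * equally atomic (a locked instruction on x86). */
		struct intel_timeline *tl = xchg(&head, NULL);

		(void)old;
		(void)tl;
	}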

> > +
> > +             /*
> > +              * Our goal here is to retire _idle_ timelines as soon as
> > +              * possible (as they are idle, we do not expect userspace
> > +              * to be cleaning up anytime soon).
> > +              *
> > +              * If the tl->active_count is already zero, someone else
> > +              * should have retired the timeline. Equally if the timeline
> > +              * is currently locked, either it is being retired elsewhere
> > +              * or about to be!
> > +              */
> > +             if (atomic_read(&tl->active_count) &&
> > +                 mutex_trylock(&tl->mutex)) {
> > +                     retire_requests(tl);
> > +                     mutex_unlock(&tl->mutex);
> > +             }
> > +             intel_timeline_put(tl);
> > +
> > +             GEM_BUG_ON(!next);
> > +             tl = ptr_mask_bits(next, 1);
> 
> You sometimes expect engine->retire to contain 0x1?

Yes, imagine that we are submitting very fast such that we schedule_out
the same context before the worker has run; we would then try to
add_retire() the same timeline again. So I was using BIT(0) to tag an
active element in the retirement list.
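
As a sketch of the tag mechanics, relying on the patch's
ptr_pack_bits()/ptr_mask_bits() helpers to set and clear the low bit
(illustrative only):

	/* add_retire(): push with BIT(0) set, so even the first element
	 * (whose next pointer is NULL) stores a non-NULL 0x1 in ->retire */
	tl->retire = ptr_pack_bits(first, 1, 1);

	/* engine_retire(): strip the tag to recover the next element,
	 * which is NULL only once the chain is exhausted */
	tl = ptr_mask_bits(next, 1);

So tl->retire != NULL reliably means "already queued".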

> > +     } while (tl);
> > +}
> > +
> > +static bool add_retire(struct intel_engine_cs *engine,
> > +                    struct intel_timeline *tl)
> > +{
> > +     struct intel_timeline *first = READ_ONCE(engine->retire);
> > +
> > +     /*
> > +      * We open-code a llist here to include the additional tag [BIT(0)]
> > +      * so that we know when the timeline is already on a
> > +      * retirement queue: either this engine or another.
> > +      *
> > +      * However, we rely on that a timeline can only be active on a single
> > +      * engine at any one time and that add_retire() is called before the
> > +      * engine releases the timeline and transferred to another to retire.
> > +      */
> > +
> > +     if (READ_ONCE(tl->retire)) /* already queued */
> > +             return false;
> 
> Can't this go first in the function?

Conceptually it is. And I made it so because I also decided against
having the READ_ONCE() at the top.

> > +
> > +     intel_timeline_get(tl);
> > +     do
> > +             tl->retire = ptr_pack_bits(first, 1, 1);
> 
> Here you rely on assignment being atomic right?

Ish. Here we rely on the timeline being owned by the engine so it cannot
be submitted by another (and so schedule_out called) until this engine
has released it.

It is a weak point for generality, but the ordering is strong in
execlists.

> > +     while (!try_cmpxchg(&engine->retire, &first, tl));
> 
> So the loop is effectively creating a chain of timelines to retire on 
> this engine.
> 
> What happens with virtual engines when a timeline goes to different 
> engine before (well or any single timeline context) the retire worker 
> runs? Ah okay, it gets re-assigned to the most recent engine.

Right. The engine_retire() doesn't care which engine the timelines were
run on, it's just a list of suspected idle timelines.

> I am not sure about the BIT(0) business. It's always set on write so I 
> am not getting why it is useful.

It's also set to 0 on consumption :)
-Chris
Tvrtko Ursulin Nov. 20, 2019, 2:16 p.m. UTC | #3
On 20/11/2019 13:39, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
>>
>> On 20/11/2019 09:33, Chris Wilson wrote:
>>> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
>>> corruption WA") is that it disables RC6 while Skylake (and friends) is
>>> active, and we do not consider the GPU idle until all outstanding
>>> requests have been retired and the engine switched over to the kernel
>>> context. If userspace is idle, this task falls onto our background idle
>>> worker, which only runs roughly once a second, meaning that userspace has
>>> to have been idle for a couple of seconds before we enable RC6 again.
>>> Naturally, this causes us to consume considerably more energy than
>>> before as powersaving is effectively disabled while a display server
>>> (here's looking at you Xorg) is running.
>>>
>>> As execlists will get a completion event as each context is completed,
>>> we can use this interrupt to queue a retire worker bound to this engine
>>> to clean up idle timelines. We will then immediately notice the idle
>>> engine (without userspace intervention or the aid of the background
>>> retire worker) and start parking the GPU. Thus during light workloads,
>>> we will do much more work to idle the GPU faster...  Hopefully with
>>> commensurate power saving!
>>>
>>> v2: Watch context completions and only look at those local to the engine
>>> when retiring to reduce the amount of excess work we perform.
>>>
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
>>> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
>>> References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
>>>    drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
>>>    drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
>>>    drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
>>>    drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
>>>    .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
>>>    7 files changed, 116 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> index b9613d044393..8f6e353caa66 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> @@ -28,13 +28,13 @@
>>>    
>>>    #include "i915_drv.h"
>>>    
>>> -#include "gt/intel_gt.h"
>>> -
>>> +#include "intel_context.h"
>>>    #include "intel_engine.h"
>>>    #include "intel_engine_pm.h"
>>>    #include "intel_engine_pool.h"
>>>    #include "intel_engine_user.h"
>>> -#include "intel_context.h"
>>> +#include "intel_gt.h"
>>> +#include "intel_gt_requests.h"
>>>    #include "intel_lrc.h"
>>>    #include "intel_reset.h"
>>>    #include "intel_ring.h"
>>> @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
>>>        intel_engine_init_execlists(engine);
>>>        intel_engine_init_cmd_parser(engine);
>>>        intel_engine_init__pm(engine);
>>> +     intel_engine_init_retire(engine);
>>>    
>>>        intel_engine_pool_init(&engine->pool);
>>>    
>>> @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>>>    
>>>        cleanup_status_page(engine);
>>>    
>>> +     intel_engine_fini_retire(engine);
>>>        intel_engine_pool_fini(&engine->pool);
>>>        intel_engine_fini_breadcrumbs(engine);
>>>        intel_engine_cleanup_cmd_parser(engine);
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> index 758f0e8ec672..17f1f1441efc 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> @@ -451,6 +451,14 @@ struct intel_engine_cs {
>>>    
>>>        struct intel_engine_execlists execlists;
>>>    
>>> +     /*
>>> +      * Keep track of completed timelines on this engine for early
>>> +      * retirement with the goal of quickly enabling powersaving as
>>> +      * soon as the engine is idle.
>>> +      */
>>> +     struct intel_timeline *retire;
>>> +     struct work_struct retire_work;
>>> +
>>>        /* status_notifier: list of callbacks for context-switch changes */
>>>        struct atomic_notifier_head context_status_notifier;
>>>    
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> index 4dc3cbeb1b36..4a98fefdf915 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
>>>                intel_engine_flush_submission(engine);
>>>    }
>>>    
>>> +static void engine_retire(struct work_struct *work)
>>> +{
>>> +     struct intel_engine_cs *engine =
>>> +             container_of(work, typeof(*engine), retire_work);
>>> +     struct intel_timeline *tl = xchg(&engine->retire, NULL);
>>
>> Shouldn't this be atomic_xchg to avoid racing with add_retire?
>>
>>> +
>>> +     do {
>>> +             struct intel_timeline *next = xchg(&tl->retire, NULL);
>>
>> Here as well?
> 
> xchg() is always locked.
> 
> atomic_xchg() operates on atomic_t; xchg() works on any variable, like
> cmpxchg().
> 
>>> +
>>> +             /*
>>> +              * Our goal here is to retire _idle_ timelines as soon as
>>> +              * possible (as they are idle, we do not expect userspace
>>> +              * to be cleaning up anytime soon).
>>> +              *
>>> +              * If the tl->active_count is already zero, someone else
>>> +              * should have retired the timeline. Equally if the timeline
>>> +              * is currently locked, either it is being retired elsewhere
>>> +              * or about to be!
>>> +              */
>>> +             if (atomic_read(&tl->active_count) &&
>>> +                 mutex_trylock(&tl->mutex)) {
>>> +                     retire_requests(tl);
>>> +                     mutex_unlock(&tl->mutex);
>>> +             }
>>> +             intel_timeline_put(tl);
>>> +
>>> +             GEM_BUG_ON(!next);
>>> +             tl = ptr_mask_bits(next, 1);
>>
>> You sometimes expect engine->retire to contain 0x1?
> 
> Yes, imagine that we are submitting very fast such that we schedule_out
> the same context before the worker has run; we would then try to
> add_retire() the same timeline again. So I was using BIT(0) to tag an
> active element in the retirement list.

If we have schedule_out on the same context before the retire worker 
ran, then add_retire is a no-op due to tl->retire being set.

It could be a race between engine_retire and add_retire where 
engine->retire is set to NULL, so the latter would set tl->retire to NULL | 
BIT(0).

But BIT(0) is only filtered out and not acted upon anywhere so I am 
still lost.

So maybe from the opposite angle, what goes wrong if you drop all BIT(0) 
business?

> 
>>> +     } while (tl);
>>> +}
>>> +
>>> +static bool add_retire(struct intel_engine_cs *engine,
>>> +                    struct intel_timeline *tl)
>>> +{
>>> +     struct intel_timeline *first = READ_ONCE(engine->retire);
>>> +
>>> +     /*
>>> +      * We open-code a llist here to include the additional tag [BIT(0)]
>>> +      * so that we know when the timeline is already on a
>>> +      * retirement queue: either this engine or another.
>>> +      *
>>> +      * However, we rely on that a timeline can only be active on a single
>>> +      * engine at any one time and that add_retire() is called before the
>>> +      * engine releases the timeline and transferred to another to retire.
>>> +      */
>>> +
>>> +     if (READ_ONCE(tl->retire)) /* already queued */
>>> +             return false;
>>
>> Can't this go first in the function?
> 
> Conceptually it is. And I made it so because I also decided against
> having the READ_ONCE() at the top.

But READ_ONCE is at the top, just a different one which is discarded if 
this one is NULL.

> 
>>> +
>>> +     intel_timeline_get(tl);
>>> +     do
>>> +             tl->retire = ptr_pack_bits(first, 1, 1);
>>
>> Here you rely on assignment being atomic right?
> 
> Ish. Here we rely on the timeline being owned by the engine so it cannot
> be submitted by another (and so schedule_out called) until this engine
> has released it.
> 
> It is a weak point for generality, but the ordering is strong in
> execlists.
> 
>>> +     while (!try_cmpxchg(&engine->retire, &first, tl));
>>
>> So the loop is effectively creating a chain of timelines to retire on
>> this engine.
>>
>> What happens with virtual engines when a timeline goes to different
>> engine before (well or any single timeline context) the retire worker
>> runs? Ah okay, it gets re-assigned to the most recent engine.
> 
> Right. The engine_retire() doesn't care which engine the timelines were
> run on, it's just a list of suspected idle timelines.
> 
>> I am not sure about the BIT(0) business. It's always set on write so I
>> am not getting why it is useful.
> 
> It's also set to 0 on consumption :)

Another thing - assert that engine->retire is NULL after flush_work in 
intel_engine_fini_retire could be useful.

Regards,

Tvrtko
Chris Wilson Nov. 20, 2019, 2:25 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-11-20 14:16:17)
> 
> On 20/11/2019 13:39, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
> >>
> >> On 20/11/2019 09:33, Chris Wilson wrote:
> >>> +
> >>> +             /*
> >>> +              * Our goal here is to retire _idle_ timelines as soon as
> >>> +              * possible (as they are idle, we do not expect userspace
> >>> +              * to be cleaning up anytime soon).
> >>> +              *
> >>> +              * If the tl->active_count is already zero, someone else
> >>> +              * should have retired the timeline. Equally if the timeline
> >>> +              * is currently locked, either it is being retired elsewhere
> >>> +              * or about to be!
> >>> +              */
> >>> +             if (atomic_read(&tl->active_count) &&
> >>> +                 mutex_trylock(&tl->mutex)) {
> >>> +                     retire_requests(tl);
> >>> +                     mutex_unlock(&tl->mutex);
> >>> +             }
> >>> +             intel_timeline_put(tl);
> >>> +
> >>> +             GEM_BUG_ON(!next);
> >>> +             tl = ptr_mask_bits(next, 1);
> >>
> >> You sometimes expect engine->retire to contain 0x1?
> > 
> > Yes, imagine that we are submitting very fast such that we schedule_out
> > the same context before the worker has run; we would then try to
> > add_retire() the same timeline again. So I was using BIT(0) to tag an
> > active element in the retirement list.
> 
> If we have schedule_out on the same context before the retire worker 
> ran, then add_retire is a no-op due to tl->retire being set.

The first element in the list would have timeline->retire = NULL. We can
only tell it is queued because we make it timeline->retire = 1 instead.

> It could be a race between engine_retire and add_retire where 
> engine->retire is set to NULL, so the latter would set tl->retire to NULL | 
> BIT(0).
> 
> But BIT(0) is only filtered out and not acted upon anywhere so I am 
> still lost.
> 
> So maybe from the opposite angle, what goes wrong if you drop all BIT(0) 
> business?

You insert the element into the list twice, causing a loop. Now that loop
is short-lived, as we clear timeline->retire on processing, but we may
still end up dropping a reference twice.
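
A rough sketch of that failure mode, i.e. the same push with the tag
dropped (hypothetical variant, not the posted code):

	static bool add_retire_untagged(struct intel_engine_cs *engine,
					struct intel_timeline *tl)
	{
		struct intel_timeline *first = READ_ONCE(engine->retire);

		/* NULL for the first element pushed, so it looks unqueued! */
		if (READ_ONCE(tl->retire))
			return false;

		intel_timeline_get(tl); /* a repeat call takes a second ref */
		do
			tl->retire = first; /* re-pushing the head: tl->retire = tl */
		while (!try_cmpxchg(&engine->retire, &first, tl));

		return !first;
	}

A second schedule_out before the worker runs then re-inserts tl with an
extra reference that engine_retire() ends up putting twice.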

> 
> > 
> >>> +     } while (tl);
> >>> +}
> >>> +
> >>> +static bool add_retire(struct intel_engine_cs *engine,
> >>> +                    struct intel_timeline *tl)
> >>> +{
> >>> +     struct intel_timeline *first = READ_ONCE(engine->retire);
> >>> +
> >>> +     /*
> >>> +      * We open-code a llist here to include the additional tag [BIT(0)]
> >>> +      * so that we know when the timeline is already on a
> >>> +      * retirement queue: either this engine or another.
> >>> +      *
> >>> +      * However, we rely on that a timeline can only be active on a single
> >>> +      * engine at any one time and that add_retire() is called before the
> >>> +      * engine releases the timeline and transferred to another to retire.
> >>> +      */
> >>> +
> >>> +     if (READ_ONCE(tl->retire)) /* already queued */
> >>> +             return false;
> >>
> >> Can't this go first in the function?
> > 
> > Conceptually it is. And I made it so because I also decided against
> > having the READ_ONCE() at the top.
> 
> But READ_ONCE is at the top, just a different one which is discarded if 
> this one is NULL.

static bool add_retire(struct intel_engine_cs *engine,
                       struct intel_timeline *tl)
{
        struct intel_timeline *first;

        /*
         * We open-code a llist here to include the additional tag [BIT(0)]
         * so that we know when the timeline is already on a
         * retirement queue: either this engine or another.
         *
         * However, we rely on that a timeline can only be active on a single
         * engine at any one time and that add_retire() is called before the
         * engine releases the timeline and transferred to another to retire.
         */

        if (READ_ONCE(tl->retire)) /* already queued */
                return false;

        if (!atomic_read(&tl->active_count)) /* already retired */
                return false;

        intel_timeline_get(tl);
        first = READ_ONCE(engine->retire);
        do
                tl->retire = ptr_pack_bits(first, 1, 1);
        while (!try_cmpxchg(&engine->retire, &first, tl));

        return !first;
}

is the current incarnation.

> >>> +
> >>> +     intel_timeline_get(tl);
> >>> +     do
> >>> +             tl->retire = ptr_pack_bits(first, 1, 1);
> >>
> >> Here you rely on assignment being atomic right?
> > 
> > Ish. Here we rely on the timeline being owned by the engine so it cannot
> > be submitted by another (and so schedule_out called) until this engine
> > has released it.
> > 
> > It is a weak point for generality, but the ordering is strong in
> > execlists.
> > 
> >>> +     while (!try_cmpxchg(&engine->retire, &first, tl));
> >>
> >> So the loop is effectively creating a chain of timelines to retire on
> >> this engine.
> >>
> >> What happens with virtual engines when a timeline goes to different
> >> engine before (well or any single timeline context) the retire worker
> >> runs? Ah okay, it gets re-assigned to the most recent engine.
> > 
> > Right. The engine_retire() doesn't care which engine the timelines were
> > run on, it's just a list of suspected idle timelines.
> > 
> >> I am not sure about the BIT(0) business. It's always set on write so I
> >> am not getting why it is useful.
> > 
> > It's also set to 0 on consumption :)
> 
> Another thing - assert that engine->retire is NULL after flush_work in 
> intel_engine_fini_retire could be useful.

Yup, wilco.
-Chris
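
For reference, a minimal sketch of the suggested assertion, assuming it
lands unchanged in a respin:

	void intel_engine_fini_retire(struct intel_engine_cs *engine)
	{
		flush_work(&engine->retire_work);

		/* engine_retire() must have emptied the list by now */
		GEM_BUG_ON(engine->retire);
	}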

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index b9613d044393..8f6e353caa66 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -28,13 +28,13 @@ 
 
 #include "i915_drv.h"
 
-#include "gt/intel_gt.h"
-
+#include "intel_context.h"
 #include "intel_engine.h"
 #include "intel_engine_pm.h"
 #include "intel_engine_pool.h"
 #include "intel_engine_user.h"
-#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc.h"
 #include "intel_reset.h"
 #include "intel_ring.h"
@@ -617,6 +617,7 @@  static int intel_engine_setup_common(struct intel_engine_cs *engine)
 	intel_engine_init_execlists(engine);
 	intel_engine_init_cmd_parser(engine);
 	intel_engine_init__pm(engine);
+	intel_engine_init_retire(engine);
 
 	intel_engine_pool_init(&engine->pool);
 
@@ -839,6 +840,7 @@  void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 
 	cleanup_status_page(engine);
 
+	intel_engine_fini_retire(engine);
 	intel_engine_pool_fini(&engine->pool);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 758f0e8ec672..17f1f1441efc 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -451,6 +451,14 @@  struct intel_engine_cs {
 
 	struct intel_engine_execlists execlists;
 
+	/*
+	 * Keep track of completed timelines on this engine for early
+	 * retirement with the goal of quickly enabling powersaving as
+	 * soon as the engine is idle.
+	 */
+	struct intel_timeline *retire;
+	struct work_struct retire_work;
+
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 4dc3cbeb1b36..4a98fefdf915 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -29,6 +29,80 @@  static void flush_submission(struct intel_gt *gt)
 		intel_engine_flush_submission(engine);
 }
 
+static void engine_retire(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), retire_work);
+	struct intel_timeline *tl = xchg(&engine->retire, NULL);
+
+	do {
+		struct intel_timeline *next = xchg(&tl->retire, NULL);
+
+		/*
+		 * Our goal here is to retire _idle_ timelines as soon as
+		 * possible (as they are idle, we do not expect userspace
+		 * to be cleaning up anytime soon).
+		 *
+		 * If the tl->active_count is already zero, someone else
+		 * should have retired the timeline. Equally if the timeline
+		 * is currently locked, either it is being retired elsewhere
+		 * or about to be!
+		 */
+		if (atomic_read(&tl->active_count) &&
+		    mutex_trylock(&tl->mutex)) {
+			retire_requests(tl);
+			mutex_unlock(&tl->mutex);
+		}
+		intel_timeline_put(tl);
+
+		GEM_BUG_ON(!next);
+		tl = ptr_mask_bits(next, 1);
+	} while (tl);
+}
+
+static bool add_retire(struct intel_engine_cs *engine,
+		       struct intel_timeline *tl)
+{
+	struct intel_timeline *first = READ_ONCE(engine->retire);
+
+	/*
+	 * We open-code a llist here to include the additional tag [BIT(0)]
+	 * so that we know when the timeline is already on a
+	 * retirement queue: either this engine or another.
+	 *
+	 * However, we rely on that a timeline can only be active on a single
+	 * engine at any one time and that add_retire() is called before the
+	 * engine releases the timeline and transferred to another to retire.
+	 */
+
+	if (READ_ONCE(tl->retire)) /* already queued */
+		return false;
+
+	intel_timeline_get(tl);
+	do
+		tl->retire = ptr_pack_bits(first, 1, 1);
+	while (!try_cmpxchg(&engine->retire, &first, tl));
+
+	return !first;
+}
+
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl)
+{
+	if (add_retire(engine, tl))
+		schedule_work(&engine->retire_work);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine)
+{
+	INIT_WORK(&engine->retire_work, engine_retire);
+}
+
+void intel_engine_fini_retire(struct intel_engine_cs *engine)
+{
+	flush_work(&engine->retire_work);
+}
+
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
index fde546424c63..8de559b5a033 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
@@ -7,7 +7,12 @@ 
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
-struct intel_gt;
+#include <linux/workqueue.h>
+
+#include "intel_gt_types.h"
+
+struct intel_engine_cs;
+struct intel_timeline;
 
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
 static inline void intel_gt_retire_requests(struct intel_gt *gt)
@@ -15,6 +20,16 @@  static inline void intel_gt_retire_requests(struct intel_gt *gt)
 	intel_gt_retire_requests_timeout(gt, 0);
 }
 
+static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
+{
+	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine);
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl);
+void intel_engine_fini_retire(struct intel_engine_cs *engine);
+
 int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
 
 void intel_gt_init_requests(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index b65bc06855b0..2ceaa2f22996 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -142,6 +142,7 @@ 
 #include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_reset.h"
@@ -1170,6 +1171,14 @@  __execlists_schedule_out(struct i915_request *rq,
 	 * refrain from doing non-trivial work here.
 	 */
 
+	/*
+	 * If we have just completed this context, the engine may now be
+	 * idle and we want to re-enter powersaving.
+	 */
+	if (list_is_last(&rq->link, &ce->timeline->requests) &&
+	    i915_request_completed(rq))
+		intel_engine_add_retire(engine, ce->timeline);
+
 	intel_engine_context_out(engine);
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
 	intel_gt_pm_put_async(engine->gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index b190a5d9ab02..c1d2419444f8 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -277,6 +277,7 @@  void intel_timeline_fini(struct intel_timeline *timeline)
 {
 	GEM_BUG_ON(atomic_read(&timeline->pin_count));
 	GEM_BUG_ON(!list_empty(&timeline->requests));
+	GEM_BUG_ON(timeline->retire);
 
 	if (timeline->hwsp_cacheline)
 		cacheline_free(timeline->hwsp_cacheline);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 5244615ed1cb..aaf15cbe1ce1 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
@@ -66,6 +66,9 @@  struct intel_timeline {
 	 */
 	struct i915_active_fence last_request;
 
+	/** A chain of completed timelines ready for early retirement. */
+	struct intel_timeline *retire;
+
 	/**
 	 * We track the most recent seqno that we wait on in every context so
 	 * that we only have to emit a new await and dependency on a more