diff mbox series

[38/40] drm/i915/execlists: Flush the CS events before unpinning

Message ID 20180919195544.1511-38-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/40] drm: Use default dma_fence hooks where possible for null syncobj | expand

Commit Message

Chris Wilson Sept. 19, 2018, 7:55 p.m. UTC
Inside the execlists submission tasklet, we often make the mistake of
assuming that everything beneath the request is available for use.
However, the submission and the request live on two separate timelines,
and the request contents may be freed from an early retirement before we
have had a chance to run the submission tasklet (think ksoftirqd). To
safeguard ourselves against any mistakes, flush the tasklet before we
unpin the context if execlists still has a reference to this context.

References: 60367132a214 ("drm/i915: Avoid use-after-free of ctx in request tracepoints")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.h |  1 +
 drivers/gpu/drm/i915/intel_lrc.c        | 32 ++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin Oct. 1, 2018, 10:51 a.m. UTC | #1
On 19/09/2018 20:55, Chris Wilson wrote:
> Inside the execlists submission tasklet, we often make the mistake of
> assuming that everything beneath the request is available for use.
> However, the submission and the request live on two separate timelines,
> and the request contents may be freed from an early retirement before we
> have had a chance to run the submission tasklet (think ksoftirqd). To
> safeguard ourselves against any mistakes, flush the tasklet before we
> unpin the context if execlists still has a reference to this context.
> 
> References: 60367132a214 ("drm/i915: Avoid use-after-free of ctx in request tracepoints")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.h |  1 +
>   drivers/gpu/drm/i915/intel_lrc.c        | 32 ++++++++++++++++++++++++-
>   2 files changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 9f89119a6566..1fd71dfdfa62 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -170,6 +170,7 @@ struct i915_gem_context {
>   	/** engine: per-engine logical HW state */
>   	struct intel_context {
>   		struct i915_gem_context *gem_context;
> +		struct intel_engine_cs *active;
>   		struct i915_vma *state;
>   		struct intel_ring *ring;
>   		u32 *lrc_reg_state;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 48a2bca7fec3..be7dbdd7fc2c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -282,6 +282,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		__i915_request_unsubmit(rq);
>   		unwind_wa_tail(rq);
>   
> +		GEM_BUG_ON(rq->hw_context->active);
> +
>   		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
>   		if (rq_prio(rq) != prio) {
>   			prio = rq_prio(rq);
> @@ -427,8 +429,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>   		rq = port_unpack(&port[n], &count);
>   		if (rq) {
>   			GEM_BUG_ON(count > !n);
> -			if (!count++)
> +			if (!count++) {
> +				GEM_BUG_ON(rq->hw_context->active);
>   				execlists_context_schedule_in(rq);
> +				rq->hw_context->active = engine;

Put it in execlists_context_schedule_in/out?

Why does it have to be the engine pointer and not just a boolean? 
Because we don't have an engine backpointer in hw_context? Should we add 
it? I think I had occasionally wished we had it.. maybe too much work to 
evaluate what function prototypes we could clean up with it and whether 
it would be an overall gain.

> +			}
>   			port_set(&port[n], port_pack(rq, count));
>   			desc = execlists_update_context(rq);
>   			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> @@ -734,6 +739,8 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
>   			  intel_engine_get_seqno(rq->engine));
>   
>   		GEM_BUG_ON(!execlists->active);
> +
> +		rq->hw_context->active = NULL;
>   		execlists_context_schedule_out(rq,
>   					       i915_request_completed(rq) ?
>   					       INTEL_CONTEXT_SCHEDULE_OUT :
> @@ -971,6 +978,7 @@ static void process_csb(struct intel_engine_cs *engine)
>   			 */
>   			GEM_BUG_ON(!i915_request_completed(rq));
>   
> +			rq->hw_context->active = NULL;
>   			execlists_context_schedule_out(rq,
>   						       INTEL_CONTEXT_SCHEDULE_OUT);
>   			i915_request_put(rq);
> @@ -1080,6 +1088,28 @@ static void execlists_context_destroy(struct intel_context *ce)
>   
>   static void execlists_context_unpin(struct intel_context *ce)
>   {
> +	struct intel_engine_cs *engine;
> +
> +	/*
> +	 * The tasklet may still be using a pointer to our state, via an
> +	 * old request. However, since we know we only unpin the context
> +	 * on retirement of the following request, we know that the last
> +	 * request referencing us will have had a completion CS interrupt.

Hm hm hm... my initial thought was that interrupts could be more delayed 
than breadcrumb writes (more than one context ahead), in which case the 
process_csb below could be premature and the assert would trigger. But I 
must be forgetting something since that would also mean we would 
prematurely unpin the context. So I must be forgetting something..

Regards,

Tvrtko

> +	 * If we see that it is still active, it means that the tasklet hasn't
> +	 * had the chance to run yet; let it run before we teardown the
> +	 * reference it may use.
> +	 */
> +	engine = READ_ONCE(ce->active);
> +	if (unlikely(engine)) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
> +		process_csb(engine);
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +
> +		GEM_BUG_ON(READ_ONCE(ce->active));
> +	}
> +
>   	i915_gem_context_unpin_hw_id(ce->gem_context);
>   
>   	intel_ring_unpin(ce->ring);
>
Chris Wilson Oct. 1, 2018, 11:06 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-10-01 11:51:23)
> 
> On 19/09/2018 20:55, Chris Wilson wrote:
> > Inside the execlists submission tasklet, we often make the mistake of
> > assuming that everything beneath the request is available for use.
> > However, the submission and the request live on two separate timelines,
> > and the request contents may be freed from an early retirement before we
> > have had a chance to run the submission tasklet (think ksoftirqd). To
> > safeguard ourselves against any mistakes, flush the tasklet before we
> > unpin the context if execlists still has a reference to this context.
> > 
> > References: 60367132a214 ("drm/i915: Avoid use-after-free of ctx in request tracepoints")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_context.h |  1 +
> >   drivers/gpu/drm/i915/intel_lrc.c        | 32 ++++++++++++++++++++++++-
> >   2 files changed, 32 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> > index 9f89119a6566..1fd71dfdfa62 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> > @@ -170,6 +170,7 @@ struct i915_gem_context {
> >       /** engine: per-engine logical HW state */
> >       struct intel_context {
> >               struct i915_gem_context *gem_context;
> > +             struct intel_engine_cs *active;
> >               struct i915_vma *state;
> >               struct intel_ring *ring;
> >               u32 *lrc_reg_state;
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 48a2bca7fec3..be7dbdd7fc2c 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -282,6 +282,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
> >               __i915_request_unsubmit(rq);
> >               unwind_wa_tail(rq);
> >   
> > +             GEM_BUG_ON(rq->hw_context->active);
> > +
> >               GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> >               if (rq_prio(rq) != prio) {
> >                       prio = rq_prio(rq);
> > @@ -427,8 +429,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
> >               rq = port_unpack(&port[n], &count);
> >               if (rq) {
> >                       GEM_BUG_ON(count > !n);
> > -                     if (!count++)
> > +                     if (!count++) {
> > +                             GEM_BUG_ON(rq->hw_context->active);
> >                               execlists_context_schedule_in(rq);
> > +                             rq->hw_context->active = engine;
> 
> Put it in execlists_context_schedule_in/out?
> 
> Why does it have to be the engine pointer and not just a boolean? 
> Because we don't have an engine backpointer in hw_context? Should we add 
> it? I think I had occasionally wished we had it.. maybe too much work to 
> evaluate what function prototypes we could clean up with it and whether 
> it would be an overall gain.

It's a backpointer in hw_context for the purposes of being a backpointer
in hw_context. Its purpose is for:

	active = READ_ONCE(ve->context.active);
        if (active && active != engine) {
        	rb = rb_next(rb);
        	continue;
        }

> > +                     }
> >                       port_set(&port[n], port_pack(rq, count));
> >                       desc = execlists_update_context(rq);
> >                       GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> > @@ -734,6 +739,8 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
> >                         intel_engine_get_seqno(rq->engine));
> >   
> >               GEM_BUG_ON(!execlists->active);
> > +
> > +             rq->hw_context->active = NULL;
> >               execlists_context_schedule_out(rq,
> >                                              i915_request_completed(rq) ?
> >                                              INTEL_CONTEXT_SCHEDULE_OUT :
> > @@ -971,6 +978,7 @@ static void process_csb(struct intel_engine_cs *engine)
> >                        */
> >                       GEM_BUG_ON(!i915_request_completed(rq));
> >   
> > +                     rq->hw_context->active = NULL;
> >                       execlists_context_schedule_out(rq,
> >                                                      INTEL_CONTEXT_SCHEDULE_OUT);
> >                       i915_request_put(rq);
> > @@ -1080,6 +1088,28 @@ static void execlists_context_destroy(struct intel_context *ce)
> >   
> >   static void execlists_context_unpin(struct intel_context *ce)
> >   {
> > +     struct intel_engine_cs *engine;
> > +
> > +     /*
> > +      * The tasklet may still be using a pointer to our state, via an
> > +      * old request. However, since we know we only unpin the context
> > +      * on retirement of the following request, we know that the last
> > +      * request referencing us will have had a completion CS interrupt.
> 
> Hm hm hm... my initial thought was that interrupts could be more delayed 
> than breadcrumb writes (more than one context ahead), in which case the 
> process_csb below could be premature and the assert would trigger. But I 
> must be forgetting something since that would also mean we would 
> prematurely unpin the context. So I must be forgetting something..

There will have been at least one CS event written because we have
switched contexts due to the unpin being one seqno behind. I have not
(yet) observed CS events being out of order with breadcrumb writes (and
we have very strict checking) so confident that a single process_csb()
is required rather than a loop.
-Chris
Tvrtko Ursulin Oct. 1, 2018, 1:15 p.m. UTC | #3
On 01/10/2018 12:06, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-10-01 11:51:23)
>>
>> On 19/09/2018 20:55, Chris Wilson wrote:
>>> Inside the execlists submission tasklet, we often make the mistake of
>>> assuming that everything beneath the request is available for use.
>>> However, the submission and the request live on two separate timelines,
>>> and the request contents may be freed from an early retirement before we
>>> have had a chance to run the submission tasklet (think ksoftirqd). To
>>> safeguard ourselves against any mistakes, flush the tasklet before we
>>> unpin the context if execlists still has a reference to this context.
>>>
>>> References: 60367132a214 ("drm/i915: Avoid use-after-free of ctx in request tracepoints")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem_context.h |  1 +
>>>    drivers/gpu/drm/i915/intel_lrc.c        | 32 ++++++++++++++++++++++++-
>>>    2 files changed, 32 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
>>> index 9f89119a6566..1fd71dfdfa62 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_context.h
>>> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
>>> @@ -170,6 +170,7 @@ struct i915_gem_context {
>>>        /** engine: per-engine logical HW state */
>>>        struct intel_context {
>>>                struct i915_gem_context *gem_context;
>>> +             struct intel_engine_cs *active;
>>>                struct i915_vma *state;
>>>                struct intel_ring *ring;
>>>                u32 *lrc_reg_state;
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 48a2bca7fec3..be7dbdd7fc2c 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -282,6 +282,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>>>                __i915_request_unsubmit(rq);
>>>                unwind_wa_tail(rq);
>>>    
>>> +             GEM_BUG_ON(rq->hw_context->active);
>>> +
>>>                GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
>>>                if (rq_prio(rq) != prio) {
>>>                        prio = rq_prio(rq);
>>> @@ -427,8 +429,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>                rq = port_unpack(&port[n], &count);
>>>                if (rq) {
>>>                        GEM_BUG_ON(count > !n);
>>> -                     if (!count++)
>>> +                     if (!count++) {
>>> +                             GEM_BUG_ON(rq->hw_context->active);
>>>                                execlists_context_schedule_in(rq);
>>> +                             rq->hw_context->active = engine;
>>
>> Put it in execlists_context_schedule_in/out?
>>
>> Why does it have to be the engine pointer and not just a boolean?
>> Because we don't have an engine backpointer in hw_context? Should we add
>> it? I think I had occasionally wished we had it.. maybe too much work to
>> evaluate what function prototypes we could clean up with it and whether
>> it would be an overall gain.
> 
> It's a backpointer in hw_context for the purposes of being a backpointer
> in hw_context. Its purpose is for:
> 
> 	active = READ_ONCE(ve->context.active);
>          if (active && active != engine) {
>          	rb = rb_next(rb);
>          	continue;
>          }

Yeah, I was asking why not call it 'engine' and have the active flag as 
boolean. I guess because ce->engine will not be permanent soon.

> 
>>> +                     }
>>>                        port_set(&port[n], port_pack(rq, count));
>>>                        desc = execlists_update_context(rq);
>>>                        GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>>> @@ -734,6 +739,8 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
>>>                          intel_engine_get_seqno(rq->engine));
>>>    
>>>                GEM_BUG_ON(!execlists->active);
>>> +
>>> +             rq->hw_context->active = NULL;
>>>                execlists_context_schedule_out(rq,
>>>                                               i915_request_completed(rq) ?
>>>                                               INTEL_CONTEXT_SCHEDULE_OUT :
>>> @@ -971,6 +978,7 @@ static void process_csb(struct intel_engine_cs *engine)
>>>                         */
>>>                        GEM_BUG_ON(!i915_request_completed(rq));
>>>    
>>> +                     rq->hw_context->active = NULL;
>>>                        execlists_context_schedule_out(rq,
>>>                                                       INTEL_CONTEXT_SCHEDULE_OUT);
>>>                        i915_request_put(rq);
>>> @@ -1080,6 +1088,28 @@ static void execlists_context_destroy(struct intel_context *ce)
>>>    
>>>    static void execlists_context_unpin(struct intel_context *ce)
>>>    {
>>> +     struct intel_engine_cs *engine;
>>> +
>>> +     /*
>>> +      * The tasklet may still be using a pointer to our state, via an
>>> +      * old request. However, since we know we only unpin the context
>>> +      * on retirement of the following request, we know that the last
>>> +      * request referencing us will have had a completion CS interrupt.
>>
>> Hm hm hm... my initial thought was that interrupts could be more delayed
>> than breadcrumb writes (more than one context ahead), in which case the
>> process_csb below could be premature and the assert would trigger. But I
>> must be forgetting something since that would also mean we would
>> prematurely unpin the context. So I must be forgetting something..
> 
> There will have been at least one CS event written because we have
> switched contexts due to the unpin being one seqno behind. I have not
> (yet) observed CS events being out of order with breadcrumb writes (and
> we have very strict checking) so confident that a single process_csb()
> is required rather than a loop.

I did not mean out of order but just delayed.

Say port 0 & 1 are both submitted, we observe seqno 1 & 2 as complete, 
but the ctx complete irq/handler has been delayed. We go to unpin ctx0 
(port0) but ce->active hasn't been cleared due no ctx complete yet so 
the assert triggers. Impossible in your experience?

Regards,

Tvrtko
Chris Wilson Oct. 1, 2018, 1:26 p.m. UTC | #4
Quoting Tvrtko Ursulin (2018-10-01 14:15:49)
> 
> On 01/10/2018 12:06, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-10-01 11:51:23)
> >>
> >> Hm hm hm... my initial thought was that interrupts could be more delayed
> >> than breadcrumb writes (more than one context ahead), in which case the
> >> process_csb below could be premature and the assert would trigger. But I
> >> must be forgetting something since that would also mean we would
> >> prematurely unpin the context. So I must be forgetting something..
> > 
> > There will have been at least one CS event written because we have
> > switched contexts due to the unpin being one seqno behind. I have not
> > (yet) observed CS events being out of order with breadcrumb writes (and
> > we have very strict checking) so confident that a single process_csb()
> > is required rather than a loop.
> 
> I did not mean out of order but just delayed.
> 
> Say port 0 & 1 are both submitted, we observe seqno 1 & 2 as complete, 
> but the ctx complete irq/handler has been delayed. We go to unpin ctx0 
> (port0) but ce->active hasn't been cleared due no ctx complete yet so 
> the assert triggers. Impossible in your experience?

I have not seen anything to doubt that the CS interrupts, and more
importantly here, the CS events are delayed. It's the CS event itself
that we care about, and I think we are very safe in our assertion that
it is written on the context switch prior to the breadcrumb in the
second context.
-Chris
Tvrtko Ursulin Oct. 1, 2018, 2:03 p.m. UTC | #5
On 01/10/2018 14:26, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-10-01 14:15:49)
>>
>> On 01/10/2018 12:06, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-10-01 11:51:23)
>>>>
>>>> Hm hm hm... my initial thought was that interrupts could be more delayed
>>>> than breadcrumb writes (more than one context ahead), in which case the
>>>> process_csb below could be premature and the assert would trigger. But I
>>>> must be forgetting something since that would also mean we would
>>>> prematurely unpin the context. So I must be forgetting something..
>>>
>>> There will have been at least one CS event written because we have
>>> switched contexts due to the unpin being one seqno behind. I have not
>>> (yet) observed CS events being out of order with breadcrumb writes (and
>>> we have very strict checking) so confident that a single process_csb()
>>> is required rather than a loop.
>>
>> I did not mean out of order but just delayed.
>>
>> Say port 0 & 1 are both submitted, we observe seqno 1 & 2 as complete,
>> but the ctx complete irq/handler has been delayed. We go to unpin ctx0
>> (port0) but ce->active hasn't been cleared due no ctx complete yet so
>> the assert triggers. Impossible in your experience?
> 
> I have not seen anything to doubt that the CS interrupts, and more
> importantly here, the CS events are delayed. It's the CS event itself
> that we care about, and I think we are very safe in our assertion that
> it is written on the context switch prior to the breadcrumb in the
> second context.

Ugh.. yes.. direct call of the tasklet exactly has nothing to do with 
irq delivery. My confusion.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 9f89119a6566..1fd71dfdfa62 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -170,6 +170,7 @@  struct i915_gem_context {
 	/** engine: per-engine logical HW state */
 	struct intel_context {
 		struct i915_gem_context *gem_context;
+		struct intel_engine_cs *active;
 		struct i915_vma *state;
 		struct intel_ring *ring;
 		u32 *lrc_reg_state;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 48a2bca7fec3..be7dbdd7fc2c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -282,6 +282,8 @@  static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
 
+		GEM_BUG_ON(rq->hw_context->active);
+
 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 		if (rq_prio(rq) != prio) {
 			prio = rq_prio(rq);
@@ -427,8 +429,11 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 		rq = port_unpack(&port[n], &count);
 		if (rq) {
 			GEM_BUG_ON(count > !n);
-			if (!count++)
+			if (!count++) {
+				GEM_BUG_ON(rq->hw_context->active);
 				execlists_context_schedule_in(rq);
+				rq->hw_context->active = engine;
+			}
 			port_set(&port[n], port_pack(rq, count));
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
@@ -734,6 +739,8 @@  execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 			  intel_engine_get_seqno(rq->engine));
 
 		GEM_BUG_ON(!execlists->active);
+
+		rq->hw_context->active = NULL;
 		execlists_context_schedule_out(rq,
 					       i915_request_completed(rq) ?
 					       INTEL_CONTEXT_SCHEDULE_OUT :
@@ -971,6 +978,7 @@  static void process_csb(struct intel_engine_cs *engine)
 			 */
 			GEM_BUG_ON(!i915_request_completed(rq));
 
+			rq->hw_context->active = NULL;
 			execlists_context_schedule_out(rq,
 						       INTEL_CONTEXT_SCHEDULE_OUT);
 			i915_request_put(rq);
@@ -1080,6 +1088,28 @@  static void execlists_context_destroy(struct intel_context *ce)
 
 static void execlists_context_unpin(struct intel_context *ce)
 {
+	struct intel_engine_cs *engine;
+
+	/*
+	 * The tasklet may still be using a pointer to our state, via an
+	 * old request. However, since we know we only unpin the context
+	 * on retirement of the following request, we know that the last
+	 * request referencing us will have had a completion CS interrupt.
+	 * If we see that it is still active, it means that the tasklet hasn't
+	 * had the chance to run yet; let it run before we teardown the
+	 * reference it may use.
+	 */
+	engine = READ_ONCE(ce->active);
+	if (unlikely(engine)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&engine->timeline.lock, flags);
+		process_csb(engine);
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+		GEM_BUG_ON(READ_ONCE(ce->active));
+	}
+
 	i915_gem_context_unpin_hw_id(ce->gem_context);
 
 	intel_ring_unpin(ce->ring);