[5/5] drm/i915/execlists: Flush pending preemption events during reset

Message ID 20180320001848.4405-5-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson March 20, 2018, 12:18 a.m. UTC
Catch up with the inflight CSB events, after disabling the tasklet
before deciding which request was truly guilty of hanging the GPU.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 355 ++++++++++++++++++++++-----------------
 1 file changed, 197 insertions(+), 158 deletions(-)

Comments

jeff.mcgee@intel.com March 21, 2018, 12:17 a.m. UTC | #1
On Tue, Mar 20, 2018 at 12:18:48AM +0000, Chris Wilson wrote:
> Catch up with the inflight CSB events, after disabling the tasklet
> before deciding which request was truly guilty of hanging the GPU.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 355 ++++++++++++++++++++++-----------------
>  1 file changed, 197 insertions(+), 158 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e5a3ffbc273a..beb81f13a3cc 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -828,186 +828,192 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	local_irq_restore(flags);
>  }
>  
> -/*
> - * Check the unread Context Status Buffers and manage the submission of new
> - * contexts to the ELSP accordingly.
> - */
> -static void execlists_submission_tasklet(unsigned long data)
> +static void process_csb(struct intel_engine_cs *engine)
>  {
> -	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
>  	struct execlist_port * const port = execlists->port;
> -	struct drm_i915_private *dev_priv = engine->i915;
> +	struct drm_i915_private *i915 = engine->i915;
> +	unsigned int head, tail;
> +	const u32 *buf;
>  	bool fw = false;
>  
> -	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
> -	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
> -	 * not be relinquished until the device is idle (see
> -	 * i915_gem_idle_work_handler()). As a precaution, we make sure
> -	 * that all ELSP are drained i.e. we have processed the CSB,
> -	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
> -	 */
> -	GEM_BUG_ON(!dev_priv->gt.awake);
> +	if (unlikely(execlists->csb_use_mmio)) {
> +		buf = (u32 * __force)
> +			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> +		execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> +	} else {
> +		/* The HWSP contains a (cacheable) mirror of the CSB */
> +		buf = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> +	}
>  
> -	/* Prefer doing test_and_clear_bit() as a two stage operation to avoid
> -	 * imposing the cost of a locked atomic transaction when submitting a
> -	 * new request (outside of the context-switch interrupt).
> +	/*
> +	 * The write will be ordered by the uncached read (itself
> +	 * a memory barrier), so we do not need another in the form
> +	 * of a locked instruction. The race between the interrupt
> +	 * handler and the split test/clear is harmless as we order
> +	 * our clear before the CSB read. If the interrupt arrived
> +	 * first between the test and the clear, we read the updated
> +	 * CSB and clear the bit. If the interrupt arrives as we read
> +	 * the CSB or later (i.e. after we had cleared the bit) the bit
> +	 * is set and we do a new loop.
>  	 */
> -	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
> -		/* The HWSP contains a (cacheable) mirror of the CSB */
> -		const u32 *buf =
> -			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> -		unsigned int head, tail;
> -
> -		if (unlikely(execlists->csb_use_mmio)) {
> -			buf = (u32 * __force)
> -				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> -			execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> -		}
> +	__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> +	if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> +		intel_uncore_forcewake_get(i915, execlists->fw_domains);
> +		fw = true;
> +
> +		head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> +		tail = GEN8_CSB_WRITE_PTR(head);
> +		head = GEN8_CSB_READ_PTR(head);
> +		execlists->csb_head = head;
> +	} else {
> +		const int write_idx =
> +			intel_hws_csb_write_index(i915) -
> +			I915_HWS_CSB_BUF0_INDEX;
> +
> +		head = execlists->csb_head;
> +		tail = READ_ONCE(buf[write_idx]);
> +	}
> +	GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> +		  engine->name,
> +		  head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> +		  tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> +
> +	while (head != tail) {
> +		struct i915_request *rq;
> +		unsigned int status;
> +		unsigned int count;
> +
> +		if (++head == GEN8_CSB_ENTRIES)
> +			head = 0;
>  
> -		/* The write will be ordered by the uncached read (itself
> -		 * a memory barrier), so we do not need another in the form
> -		 * of a locked instruction. The race between the interrupt
> -		 * handler and the split test/clear is harmless as we order
> -		 * our clear before the CSB read. If the interrupt arrived
> -		 * first between the test and the clear, we read the updated
> -		 * CSB and clear the bit. If the interrupt arrives as we read
> -		 * the CSB or later (i.e. after we had cleared the bit) the bit
> -		 * is set and we do a new loop.
> +		/*
> +		 * We are flying near dragons again.
> +		 *
> +		 * We hold a reference to the request in execlist_port[]
> +		 * but no more than that. We are operating in softirq
> +		 * context and so cannot hold any mutex or sleep. That
> +		 * prevents us stopping the requests we are processing
> +		 * in port[] from being retired simultaneously (the
> +		 * breadcrumb will be complete before we see the
> +		 * context-switch). As we only hold the reference to the
> +		 * request, any pointer chasing underneath the request
> +		 * is subject to a potential use-after-free. Thus we
> +		 * store all of the bookkeeping within port[] as
> +		 * required, and avoid using unguarded pointers beneath
> +		 * request itself. The same applies to the atomic
> +		 * status notifier.
>  		 */
> -		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -		if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> -			if (!fw) {
> -				intel_uncore_forcewake_get(dev_priv,
> -							   execlists->fw_domains);
> -				fw = true;
> -			}
>  
> -			head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> -			tail = GEN8_CSB_WRITE_PTR(head);
> -			head = GEN8_CSB_READ_PTR(head);
> -			execlists->csb_head = head;
> -		} else {
> -			const int write_idx =
> -				intel_hws_csb_write_index(dev_priv) -
> -				I915_HWS_CSB_BUF0_INDEX;
> +		status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> +		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> +			  engine->name, head,
> +			  status, buf[2*head + 1],
> +			  execlists->active);
> +
> +		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> +			      GEN8_CTX_STATUS_PREEMPTED))
> +			execlists_set_active(execlists,
> +					     EXECLISTS_ACTIVE_HWACK);
> +		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> +			execlists_clear_active(execlists,
> +					       EXECLISTS_ACTIVE_HWACK);
> +
> +		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> +			continue;
>  
> -			head = execlists->csb_head;
> -			tail = READ_ONCE(buf[write_idx]);
> -		}
> -		GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> -			  engine->name,
> -			  head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> -			  tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> +		/* We should never get a COMPLETED | IDLE_ACTIVE! */
> +		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
>  
> -		while (head != tail) {
> -			struct i915_request *rq;
> -			unsigned int status;
> -			unsigned int count;
> +		if (status & GEN8_CTX_STATUS_COMPLETE &&
> +		    buf[2*head + 1] == execlists->preempt_complete_status) {
> +			GEM_TRACE("%s preempt-idle\n", engine->name);
> +			complete_preempt_context(execlists);
> +			continue;
> +		}
>  
> -			if (++head == GEN8_CSB_ENTRIES)
> -				head = 0;
> +		if (status & GEN8_CTX_STATUS_PREEMPTED &&
> +		    execlists_is_active(execlists,
> +					EXECLISTS_ACTIVE_PREEMPT))
> +			continue;
>  
> -			/* We are flying near dragons again.
> -			 *
> -			 * We hold a reference to the request in execlist_port[]
> -			 * but no more than that. We are operating in softirq
> -			 * context and so cannot hold any mutex or sleep. That
> -			 * prevents us stopping the requests we are processing
> -			 * in port[] from being retired simultaneously (the
> -			 * breadcrumb will be complete before we see the
> -			 * context-switch). As we only hold the reference to the
> -			 * request, any pointer chasing underneath the request
> -			 * is subject to a potential use-after-free. Thus we
> -			 * store all of the bookkeeping within port[] as
> -			 * required, and avoid using unguarded pointers beneath
> -			 * request itself. The same applies to the atomic
> -			 * status notifier.
> -			 */
> +		GEM_BUG_ON(!execlists_is_active(execlists,
> +						EXECLISTS_ACTIVE_USER));
>  
> -			status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> -			GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> -				  engine->name, head,
> -				  status, buf[2*head + 1],
> -				  execlists->active);
> -
> -			if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> -				      GEN8_CTX_STATUS_PREEMPTED))
> -				execlists_set_active(execlists,
> -						     EXECLISTS_ACTIVE_HWACK);
> -			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> -				execlists_clear_active(execlists,
> -						       EXECLISTS_ACTIVE_HWACK);
> -
> -			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> -				continue;
> +		rq = port_unpack(port, &count);
> +		GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> +			  engine->name,
> +			  port->context_id, count,
> +			  rq ? rq->global_seqno : 0,
> +			  rq ? rq_prio(rq) : 0);
> +
> +		/* Check the context/desc id for this event matches */
> +		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> +
> +		GEM_BUG_ON(count == 0);
> +		if (--count == 0) {
> +			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> +			GEM_BUG_ON(port_isset(&port[1]) &&
> +				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> +			GEM_BUG_ON(!i915_request_completed(rq));
> +			execlists_context_schedule_out(rq);
> +			trace_i915_request_out(rq);
> +			i915_request_put(rq);
> +
> +			GEM_TRACE("%s completed ctx=%d\n",
> +				  engine->name, port->context_id);
> +
> +			execlists_port_complete(execlists, port);
> +		} else {
> +			port_set(port, port_pack(rq, count));
> +		}
>  
> -			/* We should never get a COMPLETED | IDLE_ACTIVE! */
> -			GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> +		/* After the final element, the hw should be idle */
> +		GEM_BUG_ON(port_count(port) == 0 &&
> +			   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> +		if (port_count(port) == 0)
> +			execlists_clear_active(execlists,
> +					       EXECLISTS_ACTIVE_USER);
> +	}
>  
> -			if (status & GEN8_CTX_STATUS_COMPLETE &&
> -			    buf[2*head + 1] == execlists->preempt_complete_status) {
> -				GEM_TRACE("%s preempt-idle\n", engine->name);
> -				complete_preempt_context(execlists);
> -				continue;
> -			}
> +	if (head != execlists->csb_head) {
> +		execlists->csb_head = head;
> +		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> +		       i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> +	}
>  
> -			if (status & GEN8_CTX_STATUS_PREEMPTED &&
> -			    execlists_is_active(execlists,
> -						EXECLISTS_ACTIVE_PREEMPT))
> -				continue;
> +	if (unlikely(fw))
> +		intel_uncore_forcewake_put(i915, execlists->fw_domains);
> +}
>  
> -			GEM_BUG_ON(!execlists_is_active(execlists,
> -							EXECLISTS_ACTIVE_USER));
> -
> -			rq = port_unpack(port, &count);
> -			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> -				  engine->name,
> -				  port->context_id, count,
> -				  rq ? rq->global_seqno : 0,
> -				  rq ? rq_prio(rq) : 0);
> -
> -			/* Check the context/desc id for this event matches */
> -			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> -
> -			GEM_BUG_ON(count == 0);
> -			if (--count == 0) {
> -				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> -				GEM_BUG_ON(port_isset(&port[1]) &&
> -					   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> -				GEM_BUG_ON(!i915_request_completed(rq));
> -				execlists_context_schedule_out(rq);
> -				trace_i915_request_out(rq);
> -				i915_request_put(rq);
> -
> -				GEM_TRACE("%s completed ctx=%d\n",
> -					  engine->name, port->context_id);
> -
> -				execlists_port_complete(execlists, port);
> -			} else {
> -				port_set(port, port_pack(rq, count));
> -			}
> +/*
> + * Check the unread Context Status Buffers and manage the submission of new
> + * contexts to the ELSP accordingly.
> + */
> +static void execlists_submission_tasklet(unsigned long data)
> +{
> +	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>  
> -			/* After the final element, the hw should be idle */
> -			GEM_BUG_ON(port_count(port) == 0 &&
> -				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> -			if (port_count(port) == 0)
> -				execlists_clear_active(execlists,
> -						       EXECLISTS_ACTIVE_USER);
> -		}
> +	/*
> +	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
> +	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
> +	 * not be relinquished until the device is idle (see
> +	 * i915_gem_idle_work_handler()). As a precaution, we make sure
> +	 * that all ELSP are drained i.e. we have processed the CSB,
> +	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
> +	 */
> +	GEM_BUG_ON(!engine->i915->gt.awake);
>  
> -		if (head != execlists->csb_head) {
> -			execlists->csb_head = head;
> -			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> -			       dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> -		}
> -	}
> +	/*
> +	 * Prefer doing test_and_clear_bit() as a two stage operation to avoid
> +	 * imposing the cost of a locked atomic transaction when submitting a
> +	 * new request (outside of the context-switch interrupt).
> +	 */
> +	if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
This was a 'while' loop before the refactoring. Shouldn't it still be one, in order
to catch a new irq_posted set during processing?

BTW, I have revised and rebased the force-preemption patches on drm-tip with
this and the other related patches you have proposed. I just started running
my IGT test that covers the innocent-context-on-port[1] scenario that we
discussed on IRC. I'm getting a sporadic failure but need more time to
root-cause it. Will update soon.
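For reference, a minimal sketch of the variant the 'while' question above refers to:
the refactored tasklet from this patch, but with the loop retained so that an
irq_posted set while process_csb() runs is handled before the tasklet returns.
Everything below is taken from the patch itself; only the 'if' has been turned back
into a 'while'.

static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;

	/* Runtime pm is held on our behalf by the request, see above */
	GEM_BUG_ON(!engine->i915->gt.awake);

	/*
	 * 'while' instead of 'if': any ENGINE_IRQ_EXECLIST posted while we
	 * were inside process_csb() triggers another pass rather than
	 * waiting for the next interrupt/tasklet.
	 */
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
		process_csb(engine);

	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
		execlists_dequeue(engine);
}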

> +		process_csb(engine);
>  
> -	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
> +	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
>  		execlists_dequeue(engine);
> -
> -	if (fw)
> -		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
>  }
>  
>  static void queue_request(struct intel_engine_cs *engine,
> @@ -1667,6 +1673,7 @@ static struct i915_request *
>  execlists_reset_prepare(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> +	struct i915_request *request, *active;
>  
>  	/*
>  	 * Prevent request submission to the hardware until we have
> @@ -1688,7 +1695,39 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
>  		tasklet_kill(&execlists->tasklet);
>  	tasklet_disable(&execlists->tasklet);
>  
> -	return i915_gem_find_active_request(engine);
> +	/*
> +	 * We want to flush the pending context switches. Having disabled
> +	 * the tasklet above, we can assume exclusive access to the execlists,
> +	 * which allows us to catch up with an inflight preemption event,
> +	 * and avoid blaming an innocent request if the stall was due to the
> +	 * preemption itself.
> +	 */
> +	process_csb(engine);
> +
> +	/*
> +	 * The last active request can then be no later than the last request
> +	 * now in ELSP[0]. So search backwards from there, so that if the GPU
> +	 * has advanced beyond the last CSB update, it will be pardoned.
> +	 */
> +	active = NULL;
> +	request = port_request(execlists->port);
> +	if (request) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&engine->timeline->lock, flags);
> +		list_for_each_entry_from_reverse(request,
> +						 &engine->timeline->requests,
> +						 link) {
> +			if (__i915_request_completed(request,
> +						     request->global_seqno))
> +				break;
> +
> +			active = request;
> +		}
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +	}
> +
> +	return active;
>  }
>  
>  static void reset_irq(struct intel_engine_cs *engine)
> -- 
> 2.16.2
>
jeff.mcgee@intel.com March 22, 2018, 3:14 p.m. UTC | #2
On Tue, Mar 20, 2018 at 05:17:34PM -0700, Jeff McGee wrote:
> On Tue, Mar 20, 2018 at 12:18:48AM +0000, Chris Wilson wrote:
> > Catch up with the inflight CSB events, after disabling the tasklet
> > before deciding which request was truly guilty of hanging the GPU.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > CC: Michel Thierry <michel.thierry@intel.com>
> > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_lrc.c | 355 ++++++++++++++++++++++-----------------
> >  1 file changed, 197 insertions(+), 158 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index e5a3ffbc273a..beb81f13a3cc 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -828,186 +828,192 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> >  	local_irq_restore(flags);
> >  }
> >  
> > -/*
> > - * Check the unread Context Status Buffers and manage the submission of new
> > - * contexts to the ELSP accordingly.
> > - */
> > -static void execlists_submission_tasklet(unsigned long data)
> > +static void process_csb(struct intel_engine_cs *engine)
> >  {
> > -	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >  	struct intel_engine_execlists * const execlists = &engine->execlists;
> >  	struct execlist_port * const port = execlists->port;
> > -	struct drm_i915_private *dev_priv = engine->i915;
> > +	struct drm_i915_private *i915 = engine->i915;
> > +	unsigned int head, tail;
> > +	const u32 *buf;
> >  	bool fw = false;
> >  
> > -	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
> > -	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
> > -	 * not be relinquished until the device is idle (see
> > -	 * i915_gem_idle_work_handler()). As a precaution, we make sure
> > -	 * that all ELSP are drained i.e. we have processed the CSB,
> > -	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
> > -	 */
> > -	GEM_BUG_ON(!dev_priv->gt.awake);
> > +	if (unlikely(execlists->csb_use_mmio)) {
> > +		buf = (u32 * __force)
> > +			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> > +		execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> > +	} else {
> > +		/* The HWSP contains a (cacheable) mirror of the CSB */
> > +		buf = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> > +	}
> >  
> > -	/* Prefer doing test_and_clear_bit() as a two stage operation to avoid
> > -	 * imposing the cost of a locked atomic transaction when submitting a
> > -	 * new request (outside of the context-switch interrupt).
> > +	/*
> > +	 * The write will be ordered by the uncached read (itself
> > +	 * a memory barrier), so we do not need another in the form
> > +	 * of a locked instruction. The race between the interrupt
> > +	 * handler and the split test/clear is harmless as we order
> > +	 * our clear before the CSB read. If the interrupt arrived
> > +	 * first between the test and the clear, we read the updated
> > +	 * CSB and clear the bit. If the interrupt arrives as we read
> > +	 * the CSB or later (i.e. after we had cleared the bit) the bit
> > +	 * is set and we do a new loop.
> >  	 */
> > -	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
> > -		/* The HWSP contains a (cacheable) mirror of the CSB */
> > -		const u32 *buf =
> > -			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> > -		unsigned int head, tail;
> > -
> > -		if (unlikely(execlists->csb_use_mmio)) {
> > -			buf = (u32 * __force)
> > -				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> > -			execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> > -		}
> > +	__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > +	if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> > +		intel_uncore_forcewake_get(i915, execlists->fw_domains);
> > +		fw = true;
> > +
> > +		head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > +		tail = GEN8_CSB_WRITE_PTR(head);
> > +		head = GEN8_CSB_READ_PTR(head);
> > +		execlists->csb_head = head;
> > +	} else {
> > +		const int write_idx =
> > +			intel_hws_csb_write_index(i915) -
> > +			I915_HWS_CSB_BUF0_INDEX;
> > +
> > +		head = execlists->csb_head;
> > +		tail = READ_ONCE(buf[write_idx]);
> > +	}
> > +	GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> > +		  engine->name,
> > +		  head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> > +		  tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> > +
> > +	while (head != tail) {
> > +		struct i915_request *rq;
> > +		unsigned int status;
> > +		unsigned int count;
> > +
> > +		if (++head == GEN8_CSB_ENTRIES)
> > +			head = 0;
> >  
> > -		/* The write will be ordered by the uncached read (itself
> > -		 * a memory barrier), so we do not need another in the form
> > -		 * of a locked instruction. The race between the interrupt
> > -		 * handler and the split test/clear is harmless as we order
> > -		 * our clear before the CSB read. If the interrupt arrived
> > -		 * first between the test and the clear, we read the updated
> > -		 * CSB and clear the bit. If the interrupt arrives as we read
> > -		 * the CSB or later (i.e. after we had cleared the bit) the bit
> > -		 * is set and we do a new loop.
> > +		/*
> > +		 * We are flying near dragons again.
> > +		 *
> > +		 * We hold a reference to the request in execlist_port[]
> > +		 * but no more than that. We are operating in softirq
> > +		 * context and so cannot hold any mutex or sleep. That
> > +		 * prevents us stopping the requests we are processing
> > +		 * in port[] from being retired simultaneously (the
> > +		 * breadcrumb will be complete before we see the
> > +		 * context-switch). As we only hold the reference to the
> > +		 * request, any pointer chasing underneath the request
> > +		 * is subject to a potential use-after-free. Thus we
> > +		 * store all of the bookkeeping within port[] as
> > +		 * required, and avoid using unguarded pointers beneath
> > +		 * request itself. The same applies to the atomic
> > +		 * status notifier.
> >  		 */
> > -		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > -		if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> > -			if (!fw) {
> > -				intel_uncore_forcewake_get(dev_priv,
> > -							   execlists->fw_domains);
> > -				fw = true;
> > -			}
> >  
> > -			head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > -			tail = GEN8_CSB_WRITE_PTR(head);
> > -			head = GEN8_CSB_READ_PTR(head);
> > -			execlists->csb_head = head;
> > -		} else {
> > -			const int write_idx =
> > -				intel_hws_csb_write_index(dev_priv) -
> > -				I915_HWS_CSB_BUF0_INDEX;
> > +		status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> > +		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> > +			  engine->name, head,
> > +			  status, buf[2*head + 1],
> > +			  execlists->active);
> > +
> > +		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> > +			      GEN8_CTX_STATUS_PREEMPTED))
> > +			execlists_set_active(execlists,
> > +					     EXECLISTS_ACTIVE_HWACK);
> > +		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> > +			execlists_clear_active(execlists,
> > +					       EXECLISTS_ACTIVE_HWACK);
> > +
> > +		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> > +			continue;
> >  
> > -			head = execlists->csb_head;
> > -			tail = READ_ONCE(buf[write_idx]);
> > -		}
> > -		GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> > -			  engine->name,
> > -			  head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> > -			  tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> > +		/* We should never get a COMPLETED | IDLE_ACTIVE! */
> > +		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> >  
> > -		while (head != tail) {
> > -			struct i915_request *rq;
> > -			unsigned int status;
> > -			unsigned int count;
> > +		if (status & GEN8_CTX_STATUS_COMPLETE &&
> > +		    buf[2*head + 1] == execlists->preempt_complete_status) {
> > +			GEM_TRACE("%s preempt-idle\n", engine->name);
> > +			complete_preempt_context(execlists);
> > +			continue;
> > +		}
> >  
> > -			if (++head == GEN8_CSB_ENTRIES)
> > -				head = 0;
> > +		if (status & GEN8_CTX_STATUS_PREEMPTED &&
> > +		    execlists_is_active(execlists,
> > +					EXECLISTS_ACTIVE_PREEMPT))
> > +			continue;
> >  
> > -			/* We are flying near dragons again.
> > -			 *
> > -			 * We hold a reference to the request in execlist_port[]
> > -			 * but no more than that. We are operating in softirq
> > -			 * context and so cannot hold any mutex or sleep. That
> > -			 * prevents us stopping the requests we are processing
> > -			 * in port[] from being retired simultaneously (the
> > -			 * breadcrumb will be complete before we see the
> > -			 * context-switch). As we only hold the reference to the
> > -			 * request, any pointer chasing underneath the request
> > -			 * is subject to a potential use-after-free. Thus we
> > -			 * store all of the bookkeeping within port[] as
> > -			 * required, and avoid using unguarded pointers beneath
> > -			 * request itself. The same applies to the atomic
> > -			 * status notifier.
> > -			 */
> > +		GEM_BUG_ON(!execlists_is_active(execlists,
> > +						EXECLISTS_ACTIVE_USER));
> >  
> > -			status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> > -			GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> > -				  engine->name, head,
> > -				  status, buf[2*head + 1],
> > -				  execlists->active);
> > -
> > -			if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> > -				      GEN8_CTX_STATUS_PREEMPTED))
> > -				execlists_set_active(execlists,
> > -						     EXECLISTS_ACTIVE_HWACK);
> > -			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> > -				execlists_clear_active(execlists,
> > -						       EXECLISTS_ACTIVE_HWACK);
> > -
> > -			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> > -				continue;
> > +		rq = port_unpack(port, &count);
> > +		GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> > +			  engine->name,
> > +			  port->context_id, count,
> > +			  rq ? rq->global_seqno : 0,
> > +			  rq ? rq_prio(rq) : 0);
> > +
> > +		/* Check the context/desc id for this event matches */
> > +		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> > +
> > +		GEM_BUG_ON(count == 0);
> > +		if (--count == 0) {
> > +			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> > +			GEM_BUG_ON(port_isset(&port[1]) &&
> > +				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> > +			GEM_BUG_ON(!i915_request_completed(rq));
> > +			execlists_context_schedule_out(rq);
> > +			trace_i915_request_out(rq);
> > +			i915_request_put(rq);
> > +
> > +			GEM_TRACE("%s completed ctx=%d\n",
> > +				  engine->name, port->context_id);
> > +
> > +			execlists_port_complete(execlists, port);
> > +		} else {
> > +			port_set(port, port_pack(rq, count));
> > +		}
> >  
> > -			/* We should never get a COMPLETED | IDLE_ACTIVE! */
> > -			GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> > +		/* After the final element, the hw should be idle */
> > +		GEM_BUG_ON(port_count(port) == 0 &&
> > +			   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> > +		if (port_count(port) == 0)
> > +			execlists_clear_active(execlists,
> > +					       EXECLISTS_ACTIVE_USER);
> > +	}
> >  
> > -			if (status & GEN8_CTX_STATUS_COMPLETE &&
> > -			    buf[2*head + 1] == execlists->preempt_complete_status) {
> > -				GEM_TRACE("%s preempt-idle\n", engine->name);
> > -				complete_preempt_context(execlists);
> > -				continue;
> > -			}
> > +	if (head != execlists->csb_head) {
> > +		execlists->csb_head = head;
> > +		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > +		       i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > +	}
> >  
> > -			if (status & GEN8_CTX_STATUS_PREEMPTED &&
> > -			    execlists_is_active(execlists,
> > -						EXECLISTS_ACTIVE_PREEMPT))
> > -				continue;
> > +	if (unlikely(fw))
> > +		intel_uncore_forcewake_put(i915, execlists->fw_domains);
> > +}
> >  
> > -			GEM_BUG_ON(!execlists_is_active(execlists,
> > -							EXECLISTS_ACTIVE_USER));
> > -
> > -			rq = port_unpack(port, &count);
> > -			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> > -				  engine->name,
> > -				  port->context_id, count,
> > -				  rq ? rq->global_seqno : 0,
> > -				  rq ? rq_prio(rq) : 0);
> > -
> > -			/* Check the context/desc id for this event matches */
> > -			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> > -
> > -			GEM_BUG_ON(count == 0);
> > -			if (--count == 0) {
> > -				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> > -				GEM_BUG_ON(port_isset(&port[1]) &&
> > -					   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> > -				GEM_BUG_ON(!i915_request_completed(rq));
> > -				execlists_context_schedule_out(rq);
> > -				trace_i915_request_out(rq);
> > -				i915_request_put(rq);
> > -
> > -				GEM_TRACE("%s completed ctx=%d\n",
> > -					  engine->name, port->context_id);
> > -
> > -				execlists_port_complete(execlists, port);
> > -			} else {
> > -				port_set(port, port_pack(rq, count));
> > -			}
> > +/*
> > + * Check the unread Context Status Buffers and manage the submission of new
> > + * contexts to the ELSP accordingly.
> > + */
> > +static void execlists_submission_tasklet(unsigned long data)
> > +{
> > +	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >  
> > -			/* After the final element, the hw should be idle */
> > -			GEM_BUG_ON(port_count(port) == 0 &&
> > -				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> > -			if (port_count(port) == 0)
> > -				execlists_clear_active(execlists,
> > -						       EXECLISTS_ACTIVE_USER);
> > -		}
> > +	/*
> > +	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
> > +	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
> > +	 * not be relinquished until the device is idle (see
> > +	 * i915_gem_idle_work_handler()). As a precaution, we make sure
> > +	 * that all ELSP are drained i.e. we have processed the CSB,
> > +	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
> > +	 */
> > +	GEM_BUG_ON(!engine->i915->gt.awake);
> >  
> > -		if (head != execlists->csb_head) {
> > -			execlists->csb_head = head;
> > -			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > -			       dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > -		}
> > -	}
> > +	/*
> > +	 * Prefer doing test_and_clear_bit() as a two stage operation to avoid
> > +	 * imposing the cost of a locked atomic transaction when submitting a
> > +	 * new request (outside of the context-switch interrupt).
> > +	 */
> > +	if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
> This was a 'while' before refactoring. Shouldn't it still be in order to catch
> new irq_posted during processing?
> 
> BTW, I have revised and rebased the force preemption patches on drm-tip with
> this and the other related patches you have proposed. I just started running
> my IGT test that covers the innocent context on port[1] scenario that we
> discussed on IRC. Getting a sporadic failure but need more time to root cause.
> Will update soon.
> 
> > +		process_csb(engine);
> >  
> > -	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
> > +	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
> >  		execlists_dequeue(engine);
> > -
> > -	if (fw)
> > -		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
> >  }
> >  
> >  static void queue_request(struct intel_engine_cs *engine,
> > @@ -1667,6 +1673,7 @@ static struct i915_request *
> >  execlists_reset_prepare(struct intel_engine_cs *engine)
> >  {
> >  	struct intel_engine_execlists * const execlists = &engine->execlists;
> > +	struct i915_request *request, *active;
> >  
> >  	/*
> >  	 * Prevent request submission to the hardware until we have
> > @@ -1688,7 +1695,39 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
> >  		tasklet_kill(&execlists->tasklet);
> >  	tasklet_disable(&execlists->tasklet);
> >  
> > -	return i915_gem_find_active_request(engine);
> > +	/*
> > +	 * We want to flush the pending context switches, having disabled
> > +	 * the tasklet above, we can assume exclusive access to the execlists.
> > +	 * For this allows us to catch up with an inflight preemption event,
> > +	 * and avoid blaming an innocent request if the stall was due to the
> > +	 * preemption itself.
> > +	 */
> > +	process_csb(engine);
> > +
> > +	/*
> > +	 * The last active request can then be no later than the last request
> > +	 * now in ELSP[0]. So search backwards from there, so that is the GPU
> > +	 * has advanced beyond the last CSB update, it will be pardoned.
> > +	 */
> > +	active = NULL;
> > +	request = port_request(execlists->port);
> > +	if (request) {
> > +		unsigned long flags;
> > +
> > +		spin_lock_irqsave(&engine->timeline->lock, flags);
> > +		list_for_each_entry_from_reverse(request,
> > +						 &engine->timeline->requests,
> > +						 link) {
> > +			if (__i915_request_completed(request,
> > +						     request->global_seqno))
> > +				break;
> > +
> > +			active = request;
> > +		}
> > +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> > +	}
> > +
> > +	return active;

Now we can see the reset being aborted if process_csb() processes a completed
preemption. So we need to kick the tasklet to get the waiting request
dequeued. Currently we only kick, if needed, in gen8_init_common_ring(),
which is skipped if we abort the reset.

Otherwise this is working well in my force-preemption tests.
-Jeff
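A hypothetical illustration of the missing kick described above, assuming a
reset_finish-style counterpart to execlists_reset_prepare(); the name
execlists_reset_finish() is an assumption here, not something this series
necessarily provides, while tasklet_enable() and tasklet_hi_schedule() are the
standard kernel tasklet API.

static void execlists_reset_finish(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	/* Re-enable the tasklet disabled in execlists_reset_prepare() */
	tasklet_enable(&execlists->tasklet);

	/*
	 * Kick unconditionally: if the reset was aborted because
	 * process_csb() completed an inflight preemption,
	 * gen8_init_common_ring() never runs, so this is the only
	 * remaining point that will dequeue the waiting request.
	 */
	tasklet_hi_schedule(&execlists->tasklet);
}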
> >  }
> >  
> >  static void reset_irq(struct intel_engine_cs *engine)
> > -- 
> > 2.16.2
> > 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson March 26, 2018, 11:28 a.m. UTC | #3
Quoting Jeff McGee (2018-03-22 15:14:01)
> On Tue, Mar 20, 2018 at 05:17:34PM -0700, Jeff McGee wrote:
> > On Tue, Mar 20, 2018 at 12:18:48AM +0000, Chris Wilson wrote:
> > > Catch up with the inflight CSB events, after disabling the tasklet
> > > before deciding which request was truly guilty of hanging the GPU.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > > CC: Michel Thierry <michel.thierry@intel.com>
> > > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/intel_lrc.c | 355 ++++++++++++++++++++++-----------------
> > >  1 file changed, 197 insertions(+), 158 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > > index e5a3ffbc273a..beb81f13a3cc 100644
> > > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > > @@ -828,186 +828,192 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> > >     local_irq_restore(flags);
> > >  }
> > >  
> > > -/*
> > > - * Check the unread Context Status Buffers and manage the submission of new
> > > - * contexts to the ELSP accordingly.
> > > - */
> > > -static void execlists_submission_tasklet(unsigned long data)
> > > +static void process_csb(struct intel_engine_cs *engine)
> > >  {
> > > -   struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> > >     struct intel_engine_execlists * const execlists = &engine->execlists;
> > >     struct execlist_port * const port = execlists->port;
> > > -   struct drm_i915_private *dev_priv = engine->i915;
> > > +   struct drm_i915_private *i915 = engine->i915;
> > > +   unsigned int head, tail;
> > > +   const u32 *buf;
> > >     bool fw = false;
> > >  
> > > -   /* We can skip acquiring intel_runtime_pm_get() here as it was taken
> > > -    * on our behalf by the request (see i915_gem_mark_busy()) and it will
> > > -    * not be relinquished until the device is idle (see
> > > -    * i915_gem_idle_work_handler()). As a precaution, we make sure
> > > -    * that all ELSP are drained i.e. we have processed the CSB,
> > > -    * before allowing ourselves to idle and calling intel_runtime_pm_put().
> > > -    */
> > > -   GEM_BUG_ON(!dev_priv->gt.awake);
> > > +   if (unlikely(execlists->csb_use_mmio)) {
> > > +           buf = (u32 * __force)
> > > +                   (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> > > +           execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> > > +   } else {
> > > +           /* The HWSP contains a (cacheable) mirror of the CSB */
> > > +           buf = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> > > +   }
> > >  
> > > -   /* Prefer doing test_and_clear_bit() as a two stage operation to avoid
> > > -    * imposing the cost of a locked atomic transaction when submitting a
> > > -    * new request (outside of the context-switch interrupt).
> > > +   /*
> > > +    * The write will be ordered by the uncached read (itself
> > > +    * a memory barrier), so we do not need another in the form
> > > +    * of a locked instruction. The race between the interrupt
> > > +    * handler and the split test/clear is harmless as we order
> > > +    * our clear before the CSB read. If the interrupt arrived
> > > +    * first between the test and the clear, we read the updated
> > > +    * CSB and clear the bit. If the interrupt arrives as we read
> > > +    * the CSB or later (i.e. after we had cleared the bit) the bit
> > > +    * is set and we do a new loop.
> > >      */
> > > -   while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
> > > -           /* The HWSP contains a (cacheable) mirror of the CSB */
> > > -           const u32 *buf =
> > > -                   &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> > > -           unsigned int head, tail;
> > > -
> > > -           if (unlikely(execlists->csb_use_mmio)) {
> > > -                   buf = (u32 * __force)
> > > -                           (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> > > -                   execlists->csb_head = -1; /* force mmio read of CSB ptrs */
> > > -           }
> > > +   __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > > +   if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> > > +           intel_uncore_forcewake_get(i915, execlists->fw_domains);
> > > +           fw = true;
> > > +
> > > +           head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > > +           tail = GEN8_CSB_WRITE_PTR(head);
> > > +           head = GEN8_CSB_READ_PTR(head);
> > > +           execlists->csb_head = head;
> > > +   } else {
> > > +           const int write_idx =
> > > +                   intel_hws_csb_write_index(i915) -
> > > +                   I915_HWS_CSB_BUF0_INDEX;
> > > +
> > > +           head = execlists->csb_head;
> > > +           tail = READ_ONCE(buf[write_idx]);
> > > +   }
> > > +   GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> > > +             engine->name,
> > > +             head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> > > +             tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> > > +
> > > +   while (head != tail) {
> > > +           struct i915_request *rq;
> > > +           unsigned int status;
> > > +           unsigned int count;
> > > +
> > > +           if (++head == GEN8_CSB_ENTRIES)
> > > +                   head = 0;
> > >  
> > > -           /* The write will be ordered by the uncached read (itself
> > > -            * a memory barrier), so we do not need another in the form
> > > -            * of a locked instruction. The race between the interrupt
> > > -            * handler and the split test/clear is harmless as we order
> > > -            * our clear before the CSB read. If the interrupt arrived
> > > -            * first between the test and the clear, we read the updated
> > > -            * CSB and clear the bit. If the interrupt arrives as we read
> > > -            * the CSB or later (i.e. after we had cleared the bit) the bit
> > > -            * is set and we do a new loop.
> > > +           /*
> > > +            * We are flying near dragons again.
> > > +            *
> > > +            * We hold a reference to the request in execlist_port[]
> > > +            * but no more than that. We are operating in softirq
> > > +            * context and so cannot hold any mutex or sleep. That
> > > +            * prevents us stopping the requests we are processing
> > > +            * in port[] from being retired simultaneously (the
> > > +            * breadcrumb will be complete before we see the
> > > +            * context-switch). As we only hold the reference to the
> > > +            * request, any pointer chasing underneath the request
> > > +            * is subject to a potential use-after-free. Thus we
> > > +            * store all of the bookkeeping within port[] as
> > > +            * required, and avoid using unguarded pointers beneath
> > > +            * request itself. The same applies to the atomic
> > > +            * status notifier.
> > >              */
> > > -           __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > > -           if (unlikely(execlists->csb_head == -1)) { /* following a reset */
> > > -                   if (!fw) {
> > > -                           intel_uncore_forcewake_get(dev_priv,
> > > -                                                      execlists->fw_domains);
> > > -                           fw = true;
> > > -                   }
> > >  
> > > -                   head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > > -                   tail = GEN8_CSB_WRITE_PTR(head);
> > > -                   head = GEN8_CSB_READ_PTR(head);
> > > -                   execlists->csb_head = head;
> > > -           } else {
> > > -                   const int write_idx =
> > > -                           intel_hws_csb_write_index(dev_priv) -
> > > -                           I915_HWS_CSB_BUF0_INDEX;
> > > +           status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> > > +           GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> > > +                     engine->name, head,
> > > +                     status, buf[2*head + 1],
> > > +                     execlists->active);
> > > +
> > > +           if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> > > +                         GEN8_CTX_STATUS_PREEMPTED))
> > > +                   execlists_set_active(execlists,
> > > +                                        EXECLISTS_ACTIVE_HWACK);
> > > +           if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> > > +                   execlists_clear_active(execlists,
> > > +                                          EXECLISTS_ACTIVE_HWACK);
> > > +
> > > +           if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> > > +                   continue;
> > >  
> > > -                   head = execlists->csb_head;
> > > -                   tail = READ_ONCE(buf[write_idx]);
> > > -           }
> > > -           GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> > > -                     engine->name,
> > > -                     head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> > > -                     tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> > > +           /* We should never get a COMPLETED | IDLE_ACTIVE! */
> > > +           GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> > >  
> > > -           while (head != tail) {
> > > -                   struct i915_request *rq;
> > > -                   unsigned int status;
> > > -                   unsigned int count;
> > > +           if (status & GEN8_CTX_STATUS_COMPLETE &&
> > > +               buf[2*head + 1] == execlists->preempt_complete_status) {
> > > +                   GEM_TRACE("%s preempt-idle\n", engine->name);
> > > +                   complete_preempt_context(execlists);
> > > +                   continue;
> > > +           }
> > >  
> > > -                   if (++head == GEN8_CSB_ENTRIES)
> > > -                           head = 0;
> > > +           if (status & GEN8_CTX_STATUS_PREEMPTED &&
> > > +               execlists_is_active(execlists,
> > > +                                   EXECLISTS_ACTIVE_PREEMPT))
> > > +                   continue;
> > >  
> > > -                   /* We are flying near dragons again.
> > > -                    *
> > > -                    * We hold a reference to the request in execlist_port[]
> > > -                    * but no more than that. We are operating in softirq
> > > -                    * context and so cannot hold any mutex or sleep. That
> > > -                    * prevents us stopping the requests we are processing
> > > -                    * in port[] from being retired simultaneously (the
> > > -                    * breadcrumb will be complete before we see the
> > > -                    * context-switch). As we only hold the reference to the
> > > -                    * request, any pointer chasing underneath the request
> > > -                    * is subject to a potential use-after-free. Thus we
> > > -                    * store all of the bookkeeping within port[] as
> > > -                    * required, and avoid using unguarded pointers beneath
> > > -                    * request itself. The same applies to the atomic
> > > -                    * status notifier.
> > > -                    */
> > > +           GEM_BUG_ON(!execlists_is_active(execlists,
> > > +                                           EXECLISTS_ACTIVE_USER));
> > >  
> > > -                   status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> > > -                   GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> > > -                             engine->name, head,
> > > -                             status, buf[2*head + 1],
> > > -                             execlists->active);
> > > -
> > > -                   if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> > > -                                 GEN8_CTX_STATUS_PREEMPTED))
> > > -                           execlists_set_active(execlists,
> > > -                                                EXECLISTS_ACTIVE_HWACK);
> > > -                   if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> > > -                           execlists_clear_active(execlists,
> > > -                                                  EXECLISTS_ACTIVE_HWACK);
> > > -
> > > -                   if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> > > -                           continue;
> > > +           rq = port_unpack(port, &count);
> > > +           GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> > > +                     engine->name,
> > > +                     port->context_id, count,
> > > +                     rq ? rq->global_seqno : 0,
> > > +                     rq ? rq_prio(rq) : 0);
> > > +
> > > +           /* Check the context/desc id for this event matches */
> > > +           GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> > > +
> > > +           GEM_BUG_ON(count == 0);
> > > +           if (--count == 0) {
> > > +                   GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> > > +                   GEM_BUG_ON(port_isset(&port[1]) &&
> > > +                              !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> > > +                   GEM_BUG_ON(!i915_request_completed(rq));
> > > +                   execlists_context_schedule_out(rq);
> > > +                   trace_i915_request_out(rq);
> > > +                   i915_request_put(rq);
> > > +
> > > +                   GEM_TRACE("%s completed ctx=%d\n",
> > > +                             engine->name, port->context_id);
> > > +
> > > +                   execlists_port_complete(execlists, port);
> > > +           } else {
> > > +                   port_set(port, port_pack(rq, count));
> > > +           }
> > >  
> > > -                   /* We should never get a COMPLETED | IDLE_ACTIVE! */
> > > -                   GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> > > +           /* After the final element, the hw should be idle */
> > > +           GEM_BUG_ON(port_count(port) == 0 &&
> > > +                      !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> > > +           if (port_count(port) == 0)
> > > +                   execlists_clear_active(execlists,
> > > +                                          EXECLISTS_ACTIVE_USER);
> > > +   }
> > >  
> > > -                   if (status & GEN8_CTX_STATUS_COMPLETE &&
> > > -                       buf[2*head + 1] == execlists->preempt_complete_status) {
> > > -                           GEM_TRACE("%s preempt-idle\n", engine->name);
> > > -                           complete_preempt_context(execlists);
> > > -                           continue;
> > > -                   }
> > > +   if (head != execlists->csb_head) {
> > > +           execlists->csb_head = head;
> > > +           writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > > +                  i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > > +   }
> > >  
> > > -                   if (status & GEN8_CTX_STATUS_PREEMPTED &&
> > > -                       execlists_is_active(execlists,
> > > -                                           EXECLISTS_ACTIVE_PREEMPT))
> > > -                           continue;
> > > +   if (unlikely(fw))
> > > +           intel_uncore_forcewake_put(i915, execlists->fw_domains);
> > > +}
> > >  
> > > -                   GEM_BUG_ON(!execlists_is_active(execlists,
> > > -                                                   EXECLISTS_ACTIVE_USER));
> > > -
> > > -                   rq = port_unpack(port, &count);
> > > -                   GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
> > > -                             engine->name,
> > > -                             port->context_id, count,
> > > -                             rq ? rq->global_seqno : 0,
> > > -                             rq ? rq_prio(rq) : 0);
> > > -
> > > -                   /* Check the context/desc id for this event matches */
> > > -                   GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> > > -
> > > -                   GEM_BUG_ON(count == 0);
> > > -                   if (--count == 0) {
> > > -                           GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> > > -                           GEM_BUG_ON(port_isset(&port[1]) &&
> > > -                                      !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
> > > -                           GEM_BUG_ON(!i915_request_completed(rq));
> > > -                           execlists_context_schedule_out(rq);
> > > -                           trace_i915_request_out(rq);
> > > -                           i915_request_put(rq);
> > > -
> > > -                           GEM_TRACE("%s completed ctx=%d\n",
> > > -                                     engine->name, port->context_id);
> > > -
> > > -                           execlists_port_complete(execlists, port);
> > > -                   } else {
> > > -                           port_set(port, port_pack(rq, count));
> > > -                   }
> > > +/*
> > > + * Check the unread Context Status Buffers and manage the submission of new
> > > + * contexts to the ELSP accordingly.
> > > + */
> > > +static void execlists_submission_tasklet(unsigned long data)
> > > +{
> > > +   struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> > >  
> > > -                   /* After the final element, the hw should be idle */
> > > -                   GEM_BUG_ON(port_count(port) == 0 &&
> > > -                              !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> > > -                   if (port_count(port) == 0)
> > > -                           execlists_clear_active(execlists,
> > > -                                                  EXECLISTS_ACTIVE_USER);
> > > -           }
> > > +   /*
> > > +    * We can skip acquiring intel_runtime_pm_get() here as it was taken
> > > +    * on our behalf by the request (see i915_gem_mark_busy()) and it will
> > > +    * not be relinquished until the device is idle (see
> > > +    * i915_gem_idle_work_handler()). As a precaution, we make sure
> > > +    * that all ELSP are drained i.e. we have processed the CSB,
> > > +    * before allowing ourselves to idle and calling intel_runtime_pm_put().
> > > +    */
> > > +   GEM_BUG_ON(!engine->i915->gt.awake);
> > >  
> > > -           if (head != execlists->csb_head) {
> > > -                   execlists->csb_head = head;
> > > -                   writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > > -                          dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > > -           }
> > > -   }
> > > +   /*
> > > +    * Prefer doing test_and_clear_bit() as a two stage operation to avoid
> > > +    * imposing the cost of a locked atomic transaction when submitting a
> > > +    * new request (outside of the context-switch interrupt).
> > > +    */
> > > +   if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
> > This was a 'while' before refactoring. Shouldn't it still be in order to catch
> > new irq_posted during processing?
> > 
> > BTW, I have revised and rebased the force preemption patches on drm-tip with
> > this and the other related patches you have proposed. I just started running
> > my IGT test that covers the innocent context on port[1] scenario that we
> > discussed on IRC. Getting a sporadic failure but need more time to root cause.
> > Will update soon.
> > 
> > > +           process_csb(engine);
> > >  
> > > -   if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
> > > +   if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
> > >             execlists_dequeue(engine);
> > > -
> > > -   if (fw)
> > > -           intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
> > >  }
> > >  
> > >  static void queue_request(struct intel_engine_cs *engine,
> > > @@ -1667,6 +1673,7 @@ static struct i915_request *
> > >  execlists_reset_prepare(struct intel_engine_cs *engine)
> > >  {
> > >     struct intel_engine_execlists * const execlists = &engine->execlists;
> > > +   struct i915_request *request, *active;
> > >  
> > >     /*
> > >      * Prevent request submission to the hardware until we have
> > > @@ -1688,7 +1695,39 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
> > >             tasklet_kill(&execlists->tasklet);
> > >     tasklet_disable(&execlists->tasklet);
> > >  
> > > -   return i915_gem_find_active_request(engine);
> > > +   /*
> > > +    * We want to flush the pending context switches; having disabled
> > > +    * the tasklet above, we can assume exclusive access to the execlists.
> > > +    * This allows us to catch up with an in-flight preemption event,
> > > +    * and avoid blaming an innocent request if the stall was due to the
> > > +    * preemption itself.
> > > +    */
> > > +   process_csb(engine);
> > > +
> > > +   /*
> > > +    * The last active request can then be no later than the last request
> > > +    * now in ELSP[0]. Search backwards from there, so that if the GPU
> > > +    * has advanced beyond the last CSB update, it will be pardoned.
> > > +    */
> > > +   active = NULL;
> > > +   request = port_request(execlists->port);
> > > +   if (request) {
> > > +           unsigned long flags;
> > > +
> > > +           spin_lock_irqsave(&engine->timeline->lock, flags);
> > > +           list_for_each_entry_from_reverse(request,
> > > +                                            &engine->timeline->requests,
> > > +                                            link) {
> > > +                   if (__i915_request_completed(request,
> > > +                                                request->global_seqno))
> > > +                           break;
> > > +
> > > +                   active = request;
> > > +           }
> > > +           spin_unlock_irqrestore(&engine->timeline->lock, flags);
> > > +   }
> > > +
> > > +   return active;
> 
> Now we can see an abort of the reset if process_csb() processes a completed
> preemption. So we need to kick the tasklet to get a dequeue of the waiting
> request to happen. Currently we only kick if needed in gen8_init_common_ring
> which is skipped if we abort the reset.
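
One possible shape for that kick, sketched against the helpers in this series
(a hypothetical reset-finish counterpart to execlists_reset_prepare(); the
function name and placement are illustrative, not something this patch adds):

	static void execlists_reset_finish(struct intel_engine_cs *engine)
	{
		struct intel_engine_execlists * const execlists = &engine->execlists;

		/* Re-enable the tasklet disabled in execlists_reset_prepare() */
		tasklet_enable(&execlists->tasklet);

		/*
		 * If process_csb() completed a preemption and the reset was
		 * then aborted, nothing else will dequeue the waiting request,
		 * so schedule the tasklet to run once more.
		 */
		tasklet_hi_schedule(&execlists->tasklet);
	}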

Yes, it is imperative that the tasklet be disabled and process_csb() be
manually flushed in the preemption timeout (because of ksoftirqd we may
not have run the submission tasklet at all before the timer expires).
Hence the desire to split out process_csb().
-Chris
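
For reference, a rough sketch of what such a preemption-timeout path could
look like once process_csb() is split out (names and placement are
illustrative only; the timer plumbing belongs to the force-preemption series,
not this patch):

	static void preempt_timeout(struct intel_engine_cs *engine)
	{
		struct intel_engine_execlists * const execlists = &engine->execlists;

		/* ksoftirqd may not have run the submission tasklet yet */
		tasklet_disable(&execlists->tasklet);

		/* Manually catch up with any in-flight CSB events */
		process_csb(engine);

		if (execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)) {
			/*
			 * The preemption really is stuck: escalate to a
			 * per-engine reset here (e.g. via i915_handle_error()).
			 */
		}

		tasklet_enable(&execlists->tasklet);
	}
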
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e5a3ffbc273a..beb81f13a3cc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -828,186 +828,192 @@  static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	local_irq_restore(flags);
 }
 
-/*
- * Check the unread Context Status Buffers and manage the submission of new
- * contexts to the ELSP accordingly.
- */
-static void execlists_submission_tasklet(unsigned long data)
+static void process_csb(struct intel_engine_cs *engine)
 {
-	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port * const port = execlists->port;
-	struct drm_i915_private *dev_priv = engine->i915;
+	struct drm_i915_private *i915 = engine->i915;
+	unsigned int head, tail;
+	const u32 *buf;
 	bool fw = false;
 
-	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
-	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
-	 * not be relinquished until the device is idle (see
-	 * i915_gem_idle_work_handler()). As a precaution, we make sure
-	 * that all ELSP are drained i.e. we have processed the CSB,
-	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
-	 */
-	GEM_BUG_ON(!dev_priv->gt.awake);
+	if (unlikely(execlists->csb_use_mmio)) {
+		buf = (u32 * __force)
+			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
+		execlists->csb_head = -1; /* force mmio read of CSB ptrs */
+	} else {
+		/* The HWSP contains a (cacheable) mirror of the CSB */
+		buf = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
+	}
 
-	/* Prefer doing test_and_clear_bit() as a two stage operation to avoid
-	 * imposing the cost of a locked atomic transaction when submitting a
-	 * new request (outside of the context-switch interrupt).
+	/*
+	 * The write will be ordered by the uncached read (itself
+	 * a memory barrier), so we do not need another in the form
+	 * of a locked instruction. The race between the interrupt
+	 * handler and the split test/clear is harmless as we order
+	 * our clear before the CSB read. If the interrupt arrived
+	 * first between the test and the clear, we read the updated
+	 * CSB and clear the bit. If the interrupt arrives as we read
+	 * the CSB or later (i.e. after we had cleared the bit) the bit
+	 * is set and we do a new loop.
 	 */
-	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
-		/* The HWSP contains a (cacheable) mirror of the CSB */
-		const u32 *buf =
-			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
-		unsigned int head, tail;
-
-		if (unlikely(execlists->csb_use_mmio)) {
-			buf = (u32 * __force)
-				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
-			execlists->csb_head = -1; /* force mmio read of CSB ptrs */
-		}
+	__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+	if (unlikely(execlists->csb_head == -1)) { /* following a reset */
+		intel_uncore_forcewake_get(i915, execlists->fw_domains);
+		fw = true;
+
+		head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
+		tail = GEN8_CSB_WRITE_PTR(head);
+		head = GEN8_CSB_READ_PTR(head);
+		execlists->csb_head = head;
+	} else {
+		const int write_idx =
+			intel_hws_csb_write_index(i915) -
+			I915_HWS_CSB_BUF0_INDEX;
+
+		head = execlists->csb_head;
+		tail = READ_ONCE(buf[write_idx]);
+	}
+	GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
+		  engine->name,
+		  head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
+		  tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
+
+	while (head != tail) {
+		struct i915_request *rq;
+		unsigned int status;
+		unsigned int count;
+
+		if (++head == GEN8_CSB_ENTRIES)
+			head = 0;
 
-		/* The write will be ordered by the uncached read (itself
-		 * a memory barrier), so we do not need another in the form
-		 * of a locked instruction. The race between the interrupt
-		 * handler and the split test/clear is harmless as we order
-		 * our clear before the CSB read. If the interrupt arrived
-		 * first between the test and the clear, we read the updated
-		 * CSB and clear the bit. If the interrupt arrives as we read
-		 * the CSB or later (i.e. after we had cleared the bit) the bit
-		 * is set and we do a new loop.
+		/*
+		 * We are flying near dragons again.
+		 *
+		 * We hold a reference to the request in execlist_port[]
+		 * but no more than that. We are operating in softirq
+		 * context and so cannot hold any mutex or sleep. That
+		 * prevents us stopping the requests we are processing
+		 * in port[] from being retired simultaneously (the
+		 * breadcrumb will be complete before we see the
+		 * context-switch). As we only hold the reference to the
+		 * request, any pointer chasing underneath the request
+		 * is subject to a potential use-after-free. Thus we
+		 * store all of the bookkeeping within port[] as
+		 * required, and avoid using unguarded pointers beneath
+		 * request itself. The same applies to the atomic
+		 * status notifier.
 		 */
-		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		if (unlikely(execlists->csb_head == -1)) { /* following a reset */
-			if (!fw) {
-				intel_uncore_forcewake_get(dev_priv,
-							   execlists->fw_domains);
-				fw = true;
-			}
 
-			head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-			tail = GEN8_CSB_WRITE_PTR(head);
-			head = GEN8_CSB_READ_PTR(head);
-			execlists->csb_head = head;
-		} else {
-			const int write_idx =
-				intel_hws_csb_write_index(dev_priv) -
-				I915_HWS_CSB_BUF0_INDEX;
+		status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
+		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
+			  engine->name, head,
+			  status, buf[2*head + 1],
+			  execlists->active);
+
+		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
+			      GEN8_CTX_STATUS_PREEMPTED))
+			execlists_set_active(execlists,
+					     EXECLISTS_ACTIVE_HWACK);
+		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
+			execlists_clear_active(execlists,
+					       EXECLISTS_ACTIVE_HWACK);
+
+		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
+			continue;
 
-			head = execlists->csb_head;
-			tail = READ_ONCE(buf[write_idx]);
-		}
-		GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
-			  engine->name,
-			  head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
-			  tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
+		/* We should never get a COMPLETED | IDLE_ACTIVE! */
+		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
 
-		while (head != tail) {
-			struct i915_request *rq;
-			unsigned int status;
-			unsigned int count;
+		if (status & GEN8_CTX_STATUS_COMPLETE &&
+		    buf[2*head + 1] == execlists->preempt_complete_status) {
+			GEM_TRACE("%s preempt-idle\n", engine->name);
+			complete_preempt_context(execlists);
+			continue;
+		}
 
-			if (++head == GEN8_CSB_ENTRIES)
-				head = 0;
+		if (status & GEN8_CTX_STATUS_PREEMPTED &&
+		    execlists_is_active(execlists,
+					EXECLISTS_ACTIVE_PREEMPT))
+			continue;
 
-			/* We are flying near dragons again.
-			 *
-			 * We hold a reference to the request in execlist_port[]
-			 * but no more than that. We are operating in softirq
-			 * context and so cannot hold any mutex or sleep. That
-			 * prevents us stopping the requests we are processing
-			 * in port[] from being retired simultaneously (the
-			 * breadcrumb will be complete before we see the
-			 * context-switch). As we only hold the reference to the
-			 * request, any pointer chasing underneath the request
-			 * is subject to a potential use-after-free. Thus we
-			 * store all of the bookkeeping within port[] as
-			 * required, and avoid using unguarded pointers beneath
-			 * request itself. The same applies to the atomic
-			 * status notifier.
-			 */
+		GEM_BUG_ON(!execlists_is_active(execlists,
+						EXECLISTS_ACTIVE_USER));
 
-			status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
-			GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
-				  engine->name, head,
-				  status, buf[2*head + 1],
-				  execlists->active);
-
-			if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
-				      GEN8_CTX_STATUS_PREEMPTED))
-				execlists_set_active(execlists,
-						     EXECLISTS_ACTIVE_HWACK);
-			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
-				execlists_clear_active(execlists,
-						       EXECLISTS_ACTIVE_HWACK);
-
-			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
-				continue;
+		rq = port_unpack(port, &count);
+		GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
+			  engine->name,
+			  port->context_id, count,
+			  rq ? rq->global_seqno : 0,
+			  rq ? rq_prio(rq) : 0);
+
+		/* Check the context/desc id for this event matches */
+		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
+
+		GEM_BUG_ON(count == 0);
+		if (--count == 0) {
+			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+			GEM_BUG_ON(port_isset(&port[1]) &&
+				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
+			GEM_BUG_ON(!i915_request_completed(rq));
+			execlists_context_schedule_out(rq);
+			trace_i915_request_out(rq);
+			i915_request_put(rq);
+
+			GEM_TRACE("%s completed ctx=%d\n",
+				  engine->name, port->context_id);
+
+			execlists_port_complete(execlists, port);
+		} else {
+			port_set(port, port_pack(rq, count));
+		}
 
-			/* We should never get a COMPLETED | IDLE_ACTIVE! */
-			GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
+		/* After the final element, the hw should be idle */
+		GEM_BUG_ON(port_count(port) == 0 &&
+			   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
+		if (port_count(port) == 0)
+			execlists_clear_active(execlists,
+					       EXECLISTS_ACTIVE_USER);
+	}
 
-			if (status & GEN8_CTX_STATUS_COMPLETE &&
-			    buf[2*head + 1] == execlists->preempt_complete_status) {
-				GEM_TRACE("%s preempt-idle\n", engine->name);
-				complete_preempt_context(execlists);
-				continue;
-			}
+	if (head != execlists->csb_head) {
+		execlists->csb_head = head;
+		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
+		       i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
+	}
 
-			if (status & GEN8_CTX_STATUS_PREEMPTED &&
-			    execlists_is_active(execlists,
-						EXECLISTS_ACTIVE_PREEMPT))
-				continue;
+	if (unlikely(fw))
+		intel_uncore_forcewake_put(i915, execlists->fw_domains);
+}
 
-			GEM_BUG_ON(!execlists_is_active(execlists,
-							EXECLISTS_ACTIVE_USER));
-
-			rq = port_unpack(port, &count);
-			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
-				  engine->name,
-				  port->context_id, count,
-				  rq ? rq->global_seqno : 0,
-				  rq ? rq_prio(rq) : 0);
-
-			/* Check the context/desc id for this event matches */
-			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
-
-			GEM_BUG_ON(count == 0);
-			if (--count == 0) {
-				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
-				GEM_BUG_ON(port_isset(&port[1]) &&
-					   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
-				GEM_BUG_ON(!i915_request_completed(rq));
-				execlists_context_schedule_out(rq);
-				trace_i915_request_out(rq);
-				i915_request_put(rq);
-
-				GEM_TRACE("%s completed ctx=%d\n",
-					  engine->name, port->context_id);
-
-				execlists_port_complete(execlists, port);
-			} else {
-				port_set(port, port_pack(rq, count));
-			}
+/*
+ * Check the unread Context Status Buffers and manage the submission of new
+ * contexts to the ELSP accordingly.
+ */
+static void execlists_submission_tasklet(unsigned long data)
+{
+	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 
-			/* After the final element, the hw should be idle */
-			GEM_BUG_ON(port_count(port) == 0 &&
-				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
-			if (port_count(port) == 0)
-				execlists_clear_active(execlists,
-						       EXECLISTS_ACTIVE_USER);
-		}
+	/*
+	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
+	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
+	 * not be relinquished until the device is idle (see
+	 * i915_gem_idle_work_handler()). As a precaution, we make sure
+	 * that all ELSP are drained i.e. we have processed the CSB,
+	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
+	 */
+	GEM_BUG_ON(!engine->i915->gt.awake);
 
-		if (head != execlists->csb_head) {
-			execlists->csb_head = head;
-			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
-			       dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-		}
-	}
+	/*
+	 * Prefer doing test_and_clear_bit() as a two stage operation to avoid
+	 * imposing the cost of a locked atomic transaction when submitting a
+	 * new request (outside of the context-switch interrupt).
+	 */
+	if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
+		process_csb(engine);
 
-	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
+	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
 		execlists_dequeue(engine);
-
-	if (fw)
-		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
 }
 
 static void queue_request(struct intel_engine_cs *engine,
@@ -1667,6 +1673,7 @@  static struct i915_request *
 execlists_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *request, *active;
 
 	/*
 	 * Prevent request submission to the hardware until we have
@@ -1688,7 +1695,39 @@  execlists_reset_prepare(struct intel_engine_cs *engine)
 		tasklet_kill(&execlists->tasklet);
 	tasklet_disable(&execlists->tasklet);
 
-	return i915_gem_find_active_request(engine);
+	/*
+	 * We want to flush the pending context switches; having disabled
+	 * the tasklet above, we can assume exclusive access to the execlists.
+	 * This allows us to catch up with an in-flight preemption event,
+	 * and avoid blaming an innocent request if the stall was due to the
+	 * preemption itself.
+	 */
+	process_csb(engine);
+
+	/*
+	 * The last active request can then be no later than the last request
+	 * now in ELSP[0]. Search backwards from there, so that if the GPU
+	 * has advanced beyond the last CSB update, it will be pardoned.
+	 */
+	active = NULL;
+	request = port_request(execlists->port);
+	if (request) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&engine->timeline->lock, flags);
+		list_for_each_entry_from_reverse(request,
+						 &engine->timeline->requests,
+						 link) {
+			if (__i915_request_completed(request,
+						     request->global_seqno))
+				break;
+
+			active = request;
+		}
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
+	}
+
+	return active;
 }
 
 static void reset_irq(struct intel_engine_cs *engine)