diff mbox series

drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT

Message ID 20200128150429.3809387-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT | expand

Commit Message

Chris Wilson Jan. 28, 2020, 3:04 p.m. UTC
Now that we have offline error capture and can reset an engine from
inside an atomic context while also preserving the GPU state for
post-mortem analysis, it is time to handle error interrupts thrown by
the command parser.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    |   8 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  10 ++
 drivers/gpu/drm/i915/gt/intel_gt.c           |   5 +
 drivers/gpu/drm/i915/gt/intel_gt_irq.c       |  27 ++-
 drivers/gpu/drm/i915/gt/intel_lrc.c          |  57 +++++--
 drivers/gpu/drm/i915/gt/selftest_lrc.c       | 166 +++++++++++++++++--
 drivers/gpu/drm/i915/i915_gpu_error.c        |   2 +
 drivers/gpu/drm/i915/i915_gpu_error.h        |   1 +
 drivers/gpu/drm/i915/i915_reg.h              |   5 +-
 9 files changed, 246 insertions(+), 35 deletions(-)

Comments

Mika Kuoppala Jan. 29, 2020, 3:07 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Now that we have offline error capture and can reset an engine from
> inside an atomic context while also preserving the GPU state for
> post-mortem analysis, it is time to handle error interrupts thrown by
> the command parser.

You might want to add an advertisement here that this way the
detection/recovery speed of cs errors is greatly improved.

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c    |   8 +-
>  drivers/gpu/drm/i915/gt/intel_engine_types.h |  10 ++
>  drivers/gpu/drm/i915/gt/intel_gt.c           |   5 +
>  drivers/gpu/drm/i915/gt/intel_gt_irq.c       |  27 ++-
>  drivers/gpu/drm/i915/gt/intel_lrc.c          |  57 +++++--
>  drivers/gpu/drm/i915/gt/selftest_lrc.c       | 166 +++++++++++++++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c        |   2 +
>  drivers/gpu/drm/i915/i915_gpu_error.h        |   1 +
>  drivers/gpu/drm/i915/i915_reg.h              |   5 +-
>  9 files changed, 246 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 9b965d1f811d..39fe9a5b4820 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1293,8 +1293,14 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
>  	}
>  
>  	if (INTEL_GEN(dev_priv) >= 6) {
> -		drm_printf(m, "\tRING_IMR: %08x\n",
> +		drm_printf(m, "\tRING_IMR:   0x%08x\n",
>  			   ENGINE_READ(engine, RING_IMR));
> +		drm_printf(m, "\tRING_ESR:   0x%08x\n",
> +			   ENGINE_READ(engine, RING_ESR));
> +		drm_printf(m, "\tRING_EMR:   0x%08x\n",
> +			   ENGINE_READ(engine, RING_EMR));
> +		drm_printf(m, "\tRING_EIR:   0x%08x\n",
> +			   ENGINE_READ(engine, RING_EIR));
>  	}
>  
>  	addr = intel_engine_get_active_head(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 92be41a6903c..abd1de3b83a8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -156,6 +156,16 @@ struct intel_engine_execlists {
>  	 */
>  	struct i915_priolist default_priolist;
>  
> +	/**
> +	 * @error_interrupt: CS Master EIR
> +	 *
> +	 * The CS generates an interrupt when it detects an error. We capture
> +	 * the first error interrupt, record the EIR and schedule the tasklet.
> +	 * In the tasklet, we process the pending CS events to ensure we have
> +	 * the guilty request, and then reset the engine.
> +	 */
> +	u32 error_interrupt;
> +
>  	/**
>  	 * @no_priolist: priority lists disabled
>  	 */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 88b6c904607c..143268083135 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -455,6 +455,11 @@ static int __engines_record_defaults(struct intel_gt *gt)
>  		if (!rq)
>  			continue;
>  
> +		if (rq->fence.error) {
> +			err = -EIO;
> +			goto out;
> +		}
> +
>  		GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags));
>  		state = rq->context->state;
>  		if (!state)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> index 7278b10e1a03..864efaf1eb37 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> @@ -24,6 +24,21 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>  {
>  	bool tasklet = false;
>  
> +	if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
> +		u32 eir;
> +
> +		eir = ENGINE_READ(engine, RING_EIR);
> +		ENGINE_TRACE(engine, "CS error: %x\n", eir);
> +
> +		/* Disable the error interrupt until after the reset */
> +		if (likely(eir)) {
> +			ENGINE_WRITE(engine, RING_EMR, ~0u);
> +			ENGINE_WRITE(engine, RING_EIR, eir);
> +			engine->execlists.error_interrupt = eir;

WRITE_ONCE

> +			tasklet = true;
> +		}
> +	}
> +
>  	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
>  		tasklet = true;
>  
> @@ -210,7 +225,10 @@ void gen11_gt_irq_reset(struct intel_gt *gt)
>  
>  void gen11_gt_irq_postinstall(struct intel_gt *gt)
>  {
> -	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> +	const u32 irqs =
> +		GT_CS_MASTER_ERROR_INTERRUPT |
> +		GT_RENDER_USER_INTERRUPT |
> +		GT_CONTEXT_SWITCH_INTERRUPT;
>  	struct intel_uncore *uncore = gt->uncore;
>  	const u32 dmask = irqs << 16 | irqs;
>  	const u32 smask = irqs << 16;
> @@ -279,7 +297,7 @@ void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir)
>  
>  	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
>  		      GT_BSD_CS_ERROR_INTERRUPT |
> -		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
> +		      GT_CS_MASTER_ERROR_INTERRUPT))
>  		DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir);
>  
>  	if (gt_iir & GT_PARITY_ERROR(gt->i915))
> @@ -345,7 +363,10 @@ void gen8_gt_irq_reset(struct intel_gt *gt)
>  void gen8_gt_irq_postinstall(struct intel_gt *gt)
>  {
>  	/* These are interrupts we'll toggle with the ring mask register */
> -	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> +	const u32 irqs =
> +		GT_CS_MASTER_ERROR_INTERRUPT |
> +		GT_RENDER_USER_INTERRUPT |
> +		GT_CONTEXT_SWITCH_INTERRUPT;
>  	const u32 gt_interrupts[] = {
>  		irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
>  		irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index cf6c43bd540a..1b6aab05fde1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
>  	if (!cap)
>  		return true;
>  
> +	spin_lock_irq(&engine->active.lock);
>  	cap->rq = execlists_active(&engine->execlists);
> -	GEM_BUG_ON(!cap->rq);
> -
> -	rcu_read_lock();
> -	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> -	cap->rq = i915_request_get_rcu(cap->rq);
> -	rcu_read_unlock();
> +	if (cap->rq) {
> +		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> +		cap->rq = i915_request_get_rcu(cap->rq);
> +	}
> +	spin_unlock_irq(&engine->active.lock);
>  	if (!cap->rq)
>  		goto err_free;
>  
> @@ -2658,27 +2658,25 @@ static bool execlists_capture(struct intel_engine_cs *engine)
>  	return false;
>  }
>  
> -static noinline void preempt_reset(struct intel_engine_cs *engine)
> +static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
>  {
>  	const unsigned int bit = I915_RESET_ENGINE + engine->id;
>  	unsigned long *lock = &engine->gt->reset.flags;
>  
> -	if (i915_modparams.reset < 3)
> +	if (!intel_has_reset_engine(engine->gt))
>  		return;
>  
>  	if (test_and_set_bit(bit, lock))
>  		return;
>  
> +	ENGINE_TRACE(engine, "reset for %s\n", msg);
> +
>  	/* Mark this tasklet as disabled to avoid waiting for it to complete */
>  	tasklet_disable_nosync(&engine->execlists.tasklet);
>  
> -	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
> -		     READ_ONCE(engine->props.preempt_timeout_ms),
> -		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
> -
>  	ring_set_paused(engine, 1); /* Freeze the current request in place */
>  	if (execlists_capture(engine))
> -		intel_engine_reset(engine, "preemption time out");
> +		intel_engine_reset(engine, msg);
>  	else
>  		ring_set_paused(engine, 0);
>  
> @@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data)
>  	bool timeout = preempt_timeout(engine);
>  
>  	process_csb(engine);
> +
> +	if (unlikely(engine->execlists.error_interrupt)) {

READ_ONCE

> +		engine->execlists.error_interrupt = 0;

The race looks bening as this is only for us.

> +		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
> +			execlists_reset(engine, "CS error");
> +	}
> +
>  	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
>  		unsigned long flags;
>  
> @@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data)
>  		spin_unlock_irqrestore(&engine->active.lock, flags);
>  
>  		/* Recheck after serialising with direct-submission */
> -		if (timeout && preempt_timeout(engine))
> -			preempt_reset(engine);
> +		if (unlikely(timeout && preempt_timeout(engine)))
> +			execlists_reset(engine, "preemption time out");
>  	}
>  }
>  
> @@ -3343,6 +3348,25 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
>  	return ret;
>  }
>  
> +static void enable_error_interrupt(struct intel_engine_cs *engine)
> +{
> +	u32 status;
> +
> +	engine->execlists.error_interrupt = 0;
> +	ENGINE_WRITE(engine, RING_EMR, ~0u);
> +	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
> +
> +	status = ENGINE_READ(engine, RING_ESR);
> +	if (unlikely(status)) {
> +		dev_err(engine->i915->drm.dev,
> +			"engine '%s' resumed still in error: %08x\n",
> +			engine->name, status);
> +		__intel_gt_reset(engine->gt, engine->mask);
> +	}
> +
> +	ENGINE_WRITE(engine, RING_EMR, ~REG_BIT(0));

Please do define this bit.

> +}
> +
>  static void enable_execlists(struct intel_engine_cs *engine)
>  {
>  	u32 mode;
> @@ -3364,6 +3388,8 @@ static void enable_execlists(struct intel_engine_cs *engine)
>  			i915_ggtt_offset(engine->status_page.vma));
>  	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
>  
> +	enable_error_interrupt(engine);
> +
>  	engine->context_tag = 0;
>  }
>  
> @@ -4290,6 +4316,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
>  
>  	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
>  	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
> +	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
>  }
>  
>  static void rcs_submission_override(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 65718ca2326e..84bac953d1b7 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -68,6 +68,21 @@ static void engine_heartbeat_enable(struct intel_engine_cs *engine,
>  	engine->props.heartbeat_interval_ms = saved;
>  }
>  
> +static int wait_for_submit(struct intel_engine_cs *engine,
> +			   struct i915_request *rq,
> +			   unsigned long timeout)
> +{
> +	timeout += jiffies;
> +	do {
> +		cond_resched();
> +		intel_engine_flush_submission(engine);
> +		if (i915_request_is_active(rq))

If this would not be just moving, I would ask what
if the request whizzed by to inactivity between
our sample points.

> +			return 0;
> +	} while (time_before(jiffies, timeout));
> +
> +	return -ETIME;
> +}
> +
>  static int live_sanitycheck(void *arg)
>  {
>  	struct intel_gt *gt = arg;
> @@ -386,6 +401,141 @@ static int live_hold_reset(void *arg)
>  	return err;
>  }
>  
> +static const char *error_repr(int err)
> +{
> +	return err ? "bad" : "good";
> +}
> +
> +static int live_error_interrupt(void *arg)
> +{
> +	static const struct error_phase {
> +		enum { GOOD = 0, BAD = -EIO } error[2];
> +	} phases[] = {
> +		{ { BAD,  GOOD } },
> +		{ { BAD,  BAD  } },
> +		{ { BAD,  GOOD } },
> +		{ { GOOD, GOOD } }, /* sentinel */
> +	};
> +	struct intel_gt *gt = arg;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	/*
> +	 * We hook up the CS_MASTER_ERROR_INTERRUPT to have forewarning
> +	 * of invalid commands in user batches that will cause a GPU hang.
> +	 * This is a faster mechanism than using hangcheck/heartbeats, but
> +	 * only detects problems the HW knows about -- it will not warn when
> +	 * we kill the HW!
> +	 *
> +	 * To verify our detection and reset, we throw some invalid commands
> +	 * at the HW and wait for the interrupt.
> +	 */
> +
> +	if (!intel_has_reset_engine(gt))
> +		return 0;
> +
> +	for_each_engine(engine, gt, id) {
> +		const struct error_phase *p;
> +		unsigned long heartbeat;
> +
> +		engine_heartbeat_disable(engine, &heartbeat);
> +
> +		for (p = phases; p->error[0] != GOOD; p++) {
> +			struct i915_request *client[ARRAY_SIZE(phases->error)];
> +			int err = 0, i;
> +			u32 *cs;
> +
> +			memset(client, 0, sizeof(*client));
> +			for (i = 0; i < ARRAY_SIZE(client); i++) {
> +				struct intel_context *ce;
> +				struct i915_request *rq;
> +
> +				ce = intel_context_create(engine);
> +				if (IS_ERR(ce)) {
> +					err = PTR_ERR(ce);
> +					goto out;
> +				}
> +
> +				rq = intel_context_create_request(ce);
> +				intel_context_put(ce);
> +				if (IS_ERR(rq)) {
> +					err = PTR_ERR(rq);
> +					goto out;
> +				}
> +
> +				if (rq->engine->emit_init_breadcrumb) {
> +					err = rq->engine->emit_init_breadcrumb(rq);
> +					if (err) {
> +						i915_request_add(rq);
> +						goto out;
> +					}
> +				}
> +
> +				cs = intel_ring_begin(rq, 2);
> +				if (IS_ERR(cs)) {
> +					i915_request_add(rq);
> +					err = PTR_ERR(cs);
> +					goto out;
> +				}
> +
> +				if (p->error[i]) {

Was going to nag about enums but on second read,
it reads just fine.

Defining the REG(0) and some WRITE_ONCE/READ_ONCE salt
added on top,

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +					*cs++ = 0xdeadbeef;
> +					*cs++ = 0xdeadbeef;
> +				} else {
> +					*cs++ = MI_NOOP;
> +					*cs++ = MI_NOOP;
> +				}
> +
> +				client[i] = i915_request_get(rq);
> +				i915_request_add(rq);
> +			}
> +
> +			err = wait_for_submit(engine, client[0], HZ / 2);
> +			if (err) {
> +				pr_err("%s: first request did not start within time!\n",
> +				       engine->name);
> +				err = -ETIME;
> +				goto out;
> +			}
> +
> +			for (i = 0; i < ARRAY_SIZE(client); i++) {
> +				if (i915_request_wait(client[i], 0, HZ / 5) < 0) {
> +					pr_err("%s: %s request still executing!\n",
> +					       engine->name,
> +					       error_repr(p->error[i]));
> +					err = -ETIME;
> +					goto out;
> +				}
> +
> +				if (client[i]->fence.error != p->error[i]) {
> +					pr_err("%s: %s request completed with wrong error code: %d\n",
> +					       engine->name,
> +					       error_repr(p->error[i]),
> +					       client[i]->fence.error);
> +					err = -EINVAL;
> +					goto out;
> +				}
> +			}
> +
> +out:
> +			for (i = 0; i < ARRAY_SIZE(client); i++)
> +				if (client[i])
> +					i915_request_put(client[i]);
> +			if (err) {
> +				pr_err("%s: failed at phase[%zd] { %d, %d }\n",
> +				       engine->name, p - phases,
> +				       p->error[0], p->error[1]);
> +				intel_gt_set_wedged(gt);
> +				return err;
> +			}
> +		}
> +
> +		engine_heartbeat_enable(engine, heartbeat);
> +	}
> +
> +	return 0;
> +}
> +
>  static int
>  emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx)
>  {
> @@ -628,21 +778,6 @@ static struct i915_request *nop_request(struct intel_engine_cs *engine)
>  	return rq;
>  }
>  
> -static int wait_for_submit(struct intel_engine_cs *engine,
> -			   struct i915_request *rq,
> -			   unsigned long timeout)
> -{
> -	timeout += jiffies;
> -	do {
> -		cond_resched();
> -		intel_engine_flush_submission(engine);
> -		if (i915_request_is_active(rq))
> -			return 0;
> -	} while (time_before(jiffies, timeout));
> -
> -	return -ETIME;
> -}
> -
>  static long timeslice_threshold(const struct intel_engine_cs *engine)
>  {
>  	return 2 * msecs_to_jiffies_timeout(timeslice(engine)) + 1;
> @@ -3572,6 +3707,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(live_unlite_switch),
>  		SUBTEST(live_unlite_preempt),
>  		SUBTEST(live_hold_reset),
> +		SUBTEST(live_error_interrupt),
>  		SUBTEST(live_timeslice_preempt),
>  		SUBTEST(live_timeslice_queue),
>  		SUBTEST(live_busywait_preempt),
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 0f67bef83106..dcab4723b17d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -515,6 +515,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  		   (u32)(ee->acthd>>32), (u32)ee->acthd);
>  	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
>  	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);
> +	err_printf(m, "  ESR:   0x%08x\n", ee->esr);
>  
>  	error_print_instdone(m, ee);
>  
> @@ -1102,6 +1103,7 @@ static void engine_record_registers(struct intel_engine_coredump *ee)
>  	}
>  
>  	if (INTEL_GEN(i915) >= 4) {
> +		ee->esr = ENGINE_READ(engine, RING_ESR);
>  		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
>  		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
>  		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index e4a6afed3bbf..b35bc9edd733 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -75,6 +75,7 @@ struct intel_engine_coredump {
>  	u32 hws;
>  	u32 ipeir;
>  	u32 ipehr;
> +	u32 esr;
>  	u32 bbstate;
>  	u32 instpm;
>  	u32 instps;
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index b93c4c18f05c..4c72b8ac0f2e 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2639,6 +2639,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>  #define   GEN11_MCR_SUBSLICE_MASK	GEN11_MCR_SUBSLICE(0x7)
>  #define RING_IPEIR(base)	_MMIO((base) + 0x64)
>  #define RING_IPEHR(base)	_MMIO((base) + 0x68)
> +#define RING_EIR(base)		_MMIO((base) + 0xb0)
> +#define RING_EMR(base)		_MMIO((base) + 0xb4)
> +#define RING_ESR(base)		_MMIO((base) + 0xb8)
>  /*
>   * On GEN4, only the render ring INSTDONE exists and has a different
>   * layout than the GEN7+ version.
> @@ -3088,7 +3091,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>  #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
>  #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
>  #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
> -#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT	(1 <<  3)
> +#define GT_CS_MASTER_ERROR_INTERRUPT		REG_BIT(3)
>  #define GT_RENDER_SYNC_STATUS_INTERRUPT		(1 <<  2)
>  #define GT_RENDER_DEBUG_INTERRUPT		(1 <<  1)
>  #define GT_RENDER_USER_INTERRUPT		(1 <<  0)
> -- 
> 2.25.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 9b965d1f811d..39fe9a5b4820 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1293,8 +1293,14 @@  static void intel_engine_print_registers(struct intel_engine_cs *engine,
 	}
 
 	if (INTEL_GEN(dev_priv) >= 6) {
-		drm_printf(m, "\tRING_IMR: %08x\n",
+		drm_printf(m, "\tRING_IMR:   0x%08x\n",
 			   ENGINE_READ(engine, RING_IMR));
+		drm_printf(m, "\tRING_ESR:   0x%08x\n",
+			   ENGINE_READ(engine, RING_ESR));
+		drm_printf(m, "\tRING_EMR:   0x%08x\n",
+			   ENGINE_READ(engine, RING_EMR));
+		drm_printf(m, "\tRING_EIR:   0x%08x\n",
+			   ENGINE_READ(engine, RING_EIR));
 	}
 
 	addr = intel_engine_get_active_head(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 92be41a6903c..abd1de3b83a8 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -156,6 +156,16 @@  struct intel_engine_execlists {
 	 */
 	struct i915_priolist default_priolist;
 
+	/**
+	 * @error_interrupt: CS Master EIR
+	 *
+	 * The CS generates an interrupt when it detects an error. We capture
+	 * the first error interrupt, record the EIR and schedule the tasklet.
+	 * In the tasklet, we process the pending CS events to ensure we have
+	 * the guilty request, and then reset the engine.
+	 */
+	u32 error_interrupt;
+
 	/**
 	 * @no_priolist: priority lists disabled
 	 */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 88b6c904607c..143268083135 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -455,6 +455,11 @@  static int __engines_record_defaults(struct intel_gt *gt)
 		if (!rq)
 			continue;
 
+		if (rq->fence.error) {
+			err = -EIO;
+			goto out;
+		}
+
 		GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags));
 		state = rq->context->state;
 		if (!state)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
index 7278b10e1a03..864efaf1eb37 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
@@ -24,6 +24,21 @@  cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 {
 	bool tasklet = false;
 
+	if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
+		u32 eir;
+
+		eir = ENGINE_READ(engine, RING_EIR);
+		ENGINE_TRACE(engine, "CS error: %x\n", eir);
+
+		/* Disable the error interrupt until after the reset */
+		if (likely(eir)) {
+			ENGINE_WRITE(engine, RING_EMR, ~0u);
+			ENGINE_WRITE(engine, RING_EIR, eir);
+			engine->execlists.error_interrupt = eir;
+			tasklet = true;
+		}
+	}
+
 	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
 		tasklet = true;
 
@@ -210,7 +225,10 @@  void gen11_gt_irq_reset(struct intel_gt *gt)
 
 void gen11_gt_irq_postinstall(struct intel_gt *gt)
 {
-	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
+	const u32 irqs =
+		GT_CS_MASTER_ERROR_INTERRUPT |
+		GT_RENDER_USER_INTERRUPT |
+		GT_CONTEXT_SWITCH_INTERRUPT;
 	struct intel_uncore *uncore = gt->uncore;
 	const u32 dmask = irqs << 16 | irqs;
 	const u32 smask = irqs << 16;
@@ -279,7 +297,7 @@  void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir)
 
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
-		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
+		      GT_CS_MASTER_ERROR_INTERRUPT))
 		DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir);
 
 	if (gt_iir & GT_PARITY_ERROR(gt->i915))
@@ -345,7 +363,10 @@  void gen8_gt_irq_reset(struct intel_gt *gt)
 void gen8_gt_irq_postinstall(struct intel_gt *gt)
 {
 	/* These are interrupts we'll toggle with the ring mask register */
-	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
+	const u32 irqs =
+		GT_CS_MASTER_ERROR_INTERRUPT |
+		GT_RENDER_USER_INTERRUPT |
+		GT_CONTEXT_SWITCH_INTERRUPT;
 	const u32 gt_interrupts[] = {
 		irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
 		irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index cf6c43bd540a..1b6aab05fde1 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2613,13 +2613,13 @@  static bool execlists_capture(struct intel_engine_cs *engine)
 	if (!cap)
 		return true;
 
+	spin_lock_irq(&engine->active.lock);
 	cap->rq = execlists_active(&engine->execlists);
-	GEM_BUG_ON(!cap->rq);
-
-	rcu_read_lock();
-	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
-	cap->rq = i915_request_get_rcu(cap->rq);
-	rcu_read_unlock();
+	if (cap->rq) {
+		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
+		cap->rq = i915_request_get_rcu(cap->rq);
+	}
+	spin_unlock_irq(&engine->active.lock);
 	if (!cap->rq)
 		goto err_free;
 
@@ -2658,27 +2658,25 @@  static bool execlists_capture(struct intel_engine_cs *engine)
 	return false;
 }
 
-static noinline void preempt_reset(struct intel_engine_cs *engine)
+static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
 {
 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
 	unsigned long *lock = &engine->gt->reset.flags;
 
-	if (i915_modparams.reset < 3)
+	if (!intel_has_reset_engine(engine->gt))
 		return;
 
 	if (test_and_set_bit(bit, lock))
 		return;
 
+	ENGINE_TRACE(engine, "reset for %s\n", msg);
+
 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
 	tasklet_disable_nosync(&engine->execlists.tasklet);
 
-	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
-		     READ_ONCE(engine->props.preempt_timeout_ms),
-		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
-
 	ring_set_paused(engine, 1); /* Freeze the current request in place */
 	if (execlists_capture(engine))
-		intel_engine_reset(engine, "preemption time out");
+		intel_engine_reset(engine, msg);
 	else
 		ring_set_paused(engine, 0);
 
@@ -2709,6 +2707,13 @@  static void execlists_submission_tasklet(unsigned long data)
 	bool timeout = preempt_timeout(engine);
 
 	process_csb(engine);
+
+	if (unlikely(engine->execlists.error_interrupt)) {
+		engine->execlists.error_interrupt = 0;
+		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
+			execlists_reset(engine, "CS error");
+	}
+
 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
 		unsigned long flags;
 
@@ -2717,8 +2722,8 @@  static void execlists_submission_tasklet(unsigned long data)
 		spin_unlock_irqrestore(&engine->active.lock, flags);
 
 		/* Recheck after serialising with direct-submission */
-		if (timeout && preempt_timeout(engine))
-			preempt_reset(engine);
+		if (unlikely(timeout && preempt_timeout(engine)))
+			execlists_reset(engine, "preemption time out");
 	}
 }
 
@@ -3343,6 +3348,25 @@  static int intel_init_workaround_bb(struct intel_engine_cs *engine)
 	return ret;
 }
 
+static void enable_error_interrupt(struct intel_engine_cs *engine)
+{
+	u32 status;
+
+	engine->execlists.error_interrupt = 0;
+	ENGINE_WRITE(engine, RING_EMR, ~0u);
+	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
+
+	status = ENGINE_READ(engine, RING_ESR);
+	if (unlikely(status)) {
+		dev_err(engine->i915->drm.dev,
+			"engine '%s' resumed still in error: %08x\n",
+			engine->name, status);
+		__intel_gt_reset(engine->gt, engine->mask);
+	}
+
+	ENGINE_WRITE(engine, RING_EMR, ~REG_BIT(0));
+}
+
 static void enable_execlists(struct intel_engine_cs *engine)
 {
 	u32 mode;
@@ -3364,6 +3388,8 @@  static void enable_execlists(struct intel_engine_cs *engine)
 			i915_ggtt_offset(engine->status_page.vma));
 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
 
+	enable_error_interrupt(engine);
+
 	engine->context_tag = 0;
 }
 
@@ -4290,6 +4316,7 @@  logical_ring_default_irqs(struct intel_engine_cs *engine)
 
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
 }
 
 static void rcs_submission_override(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 65718ca2326e..84bac953d1b7 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -68,6 +68,21 @@  static void engine_heartbeat_enable(struct intel_engine_cs *engine,
 	engine->props.heartbeat_interval_ms = saved;
 }
 
+static int wait_for_submit(struct intel_engine_cs *engine,
+			   struct i915_request *rq,
+			   unsigned long timeout)
+{
+	timeout += jiffies;
+	do {
+		cond_resched();
+		intel_engine_flush_submission(engine);
+		if (i915_request_is_active(rq))
+			return 0;
+	} while (time_before(jiffies, timeout));
+
+	return -ETIME;
+}
+
 static int live_sanitycheck(void *arg)
 {
 	struct intel_gt *gt = arg;
@@ -386,6 +401,141 @@  static int live_hold_reset(void *arg)
 	return err;
 }
 
+static const char *error_repr(int err)
+{
+	return err ? "bad" : "good";
+}
+
+static int live_error_interrupt(void *arg)
+{
+	static const struct error_phase {
+		enum { GOOD = 0, BAD = -EIO } error[2];
+	} phases[] = {
+		{ { BAD,  GOOD } },
+		{ { BAD,  BAD  } },
+		{ { BAD,  GOOD } },
+		{ { GOOD, GOOD } }, /* sentinel */
+	};
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	/*
+	 * We hook up the CS_MASTER_ERROR_INTERRUPT to have forewarning
+	 * of invalid commands in user batches that will cause a GPU hang.
+	 * This is a faster mechanism than using hangcheck/heartbeats, but
+	 * only detects problems the HW knows about -- it will not warn when
+	 * we kill the HW!
+	 *
+	 * To verify our detection and reset, we throw some invalid commands
+	 * at the HW and wait for the interrupt.
+	 */
+
+	if (!intel_has_reset_engine(gt))
+		return 0;
+
+	for_each_engine(engine, gt, id) {
+		const struct error_phase *p;
+		unsigned long heartbeat;
+
+		engine_heartbeat_disable(engine, &heartbeat);
+
+		for (p = phases; p->error[0] != GOOD; p++) {
+			struct i915_request *client[ARRAY_SIZE(phases->error)];
+			int err = 0, i;
+			u32 *cs;
+
+			memset(client, 0, sizeof(*client));
+			for (i = 0; i < ARRAY_SIZE(client); i++) {
+				struct intel_context *ce;
+				struct i915_request *rq;
+
+				ce = intel_context_create(engine);
+				if (IS_ERR(ce)) {
+					err = PTR_ERR(ce);
+					goto out;
+				}
+
+				rq = intel_context_create_request(ce);
+				intel_context_put(ce);
+				if (IS_ERR(rq)) {
+					err = PTR_ERR(rq);
+					goto out;
+				}
+
+				if (rq->engine->emit_init_breadcrumb) {
+					err = rq->engine->emit_init_breadcrumb(rq);
+					if (err) {
+						i915_request_add(rq);
+						goto out;
+					}
+				}
+
+				cs = intel_ring_begin(rq, 2);
+				if (IS_ERR(cs)) {
+					i915_request_add(rq);
+					err = PTR_ERR(cs);
+					goto out;
+				}
+
+				if (p->error[i]) {
+					*cs++ = 0xdeadbeef;
+					*cs++ = 0xdeadbeef;
+				} else {
+					*cs++ = MI_NOOP;
+					*cs++ = MI_NOOP;
+				}
+
+				client[i] = i915_request_get(rq);
+				i915_request_add(rq);
+			}
+
+			err = wait_for_submit(engine, client[0], HZ / 2);
+			if (err) {
+				pr_err("%s: first request did not start within time!\n",
+				       engine->name);
+				err = -ETIME;
+				goto out;
+			}
+
+			for (i = 0; i < ARRAY_SIZE(client); i++) {
+				if (i915_request_wait(client[i], 0, HZ / 5) < 0) {
+					pr_err("%s: %s request still executing!\n",
+					       engine->name,
+					       error_repr(p->error[i]));
+					err = -ETIME;
+					goto out;
+				}
+
+				if (client[i]->fence.error != p->error[i]) {
+					pr_err("%s: %s request completed with wrong error code: %d\n",
+					       engine->name,
+					       error_repr(p->error[i]),
+					       client[i]->fence.error);
+					err = -EINVAL;
+					goto out;
+				}
+			}
+
+out:
+			for (i = 0; i < ARRAY_SIZE(client); i++)
+				if (client[i])
+					i915_request_put(client[i]);
+			if (err) {
+				pr_err("%s: failed at phase[%zd] { %d, %d }\n",
+				       engine->name, p - phases,
+				       p->error[0], p->error[1]);
+				intel_gt_set_wedged(gt);
+				return err;
+			}
+		}
+
+		engine_heartbeat_enable(engine, heartbeat);
+	}
+
+	return 0;
+}
+
 static int
 emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx)
 {
@@ -628,21 +778,6 @@  static struct i915_request *nop_request(struct intel_engine_cs *engine)
 	return rq;
 }
 
-static int wait_for_submit(struct intel_engine_cs *engine,
-			   struct i915_request *rq,
-			   unsigned long timeout)
-{
-	timeout += jiffies;
-	do {
-		cond_resched();
-		intel_engine_flush_submission(engine);
-		if (i915_request_is_active(rq))
-			return 0;
-	} while (time_before(jiffies, timeout));
-
-	return -ETIME;
-}
-
 static long timeslice_threshold(const struct intel_engine_cs *engine)
 {
 	return 2 * msecs_to_jiffies_timeout(timeslice(engine)) + 1;
@@ -3572,6 +3707,7 @@  int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_unlite_switch),
 		SUBTEST(live_unlite_preempt),
 		SUBTEST(live_hold_reset),
+		SUBTEST(live_error_interrupt),
 		SUBTEST(live_timeslice_preempt),
 		SUBTEST(live_timeslice_queue),
 		SUBTEST(live_busywait_preempt),
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0f67bef83106..dcab4723b17d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -515,6 +515,7 @@  static void error_print_engine(struct drm_i915_error_state_buf *m,
 		   (u32)(ee->acthd>>32), (u32)ee->acthd);
 	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
 	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);
+	err_printf(m, "  ESR:   0x%08x\n", ee->esr);
 
 	error_print_instdone(m, ee);
 
@@ -1102,6 +1103,7 @@  static void engine_record_registers(struct intel_engine_coredump *ee)
 	}
 
 	if (INTEL_GEN(i915) >= 4) {
+		ee->esr = ENGINE_READ(engine, RING_ESR);
 		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
 		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
 		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index e4a6afed3bbf..b35bc9edd733 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -75,6 +75,7 @@  struct intel_engine_coredump {
 	u32 hws;
 	u32 ipeir;
 	u32 ipehr;
+	u32 esr;
 	u32 bbstate;
 	u32 instpm;
 	u32 instps;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index b93c4c18f05c..4c72b8ac0f2e 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2639,6 +2639,9 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   GEN11_MCR_SUBSLICE_MASK	GEN11_MCR_SUBSLICE(0x7)
 #define RING_IPEIR(base)	_MMIO((base) + 0x64)
 #define RING_IPEHR(base)	_MMIO((base) + 0x68)
+#define RING_EIR(base)		_MMIO((base) + 0xb0)
+#define RING_EMR(base)		_MMIO((base) + 0xb4)
+#define RING_ESR(base)		_MMIO((base) + 0xb8)
 /*
  * On GEN4, only the render ring INSTDONE exists and has a different
  * layout than the GEN7+ version.
@@ -3088,7 +3091,7 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
-#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT	(1 <<  3)
+#define GT_CS_MASTER_ERROR_INTERRUPT		REG_BIT(3)
 #define GT_RENDER_SYNC_STATUS_INTERRUPT		(1 <<  2)
 #define GT_RENDER_DEBUG_INTERRUPT		(1 <<  1)
 #define GT_RENDER_USER_INTERRUPT		(1 <<  0)