
[1/2] drm/i915/execlists: Peek at the next submission for error interrupts

Message ID: 20200401110435.30389-1-chris@chris-wilson.co.uk
State: New, archived
Series: [1/2] drm/i915/execlists: Peek at the next submission for error interrupts

Commit Message

Chris Wilson April 1, 2020, 11:04 a.m. UTC
If we receive the error interrupt before the CS interrupt, we may find
ourselves without an active request to reset, skipping the GPU reset.
All because the attempt to reset was too early.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 41 ++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)
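
To illustrate the race described above, here is a minimal standalone sketch of why the pre-patch lookup can come up empty when the error interrupt arrives before the first CS event. The types and helper names are simplified stand-ins for illustration, not the driver's actual structures:

/* Simplified model of the pre-patch lookup; illustration only, not driver code. */
#include <stdio.h>

struct request { int id; };

struct execlists {
	/* NULL-terminated; filled by process_csb() once a CS event arrives */
	struct request *active[3];
	/* NULL-terminated; the submission just written to the ELSP */
	struct request *pending[3];
};

/* Pre-patch behaviour: only the CSB-derived active[] array is consulted. */
static struct request *find_active(const struct execlists *el)
{
	return el->active[0];
}

int main(void)
{
	struct request rq = { .id = 1 };
	struct execlists el = { .pending = { &rq, NULL } };

	/*
	 * The error interrupt fires before the first CS event: active[] is
	 * still empty, so no request is found and the reset is skipped,
	 * even though rq has already been submitted to the hardware.
	 */
	if (!find_active(&el))
		printf("no active request found - GPU reset skipped\n");
	return 0;
}

Because process_csb() has not yet populated active[], the old execlists_active()-style lookup returns NULL and the capture/reset path gives up; that is the window the patch below closes by also peeking at the pending submission.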

Comments

Mika Kuoppala April 2, 2020, 8:16 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we receive the error interrupt before the CS interrupt, we may find
> ourselves without an active request to reset, skipping the GPU reset.
> All because the attempt to reset was too early.
>

With the tracing, we will see the out-of-sync situations, so

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


Chris Wilson April 2, 2020, 8:30 p.m. UTC | #2
Quoting Mika Kuoppala (2020-04-02 21:16:52)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > If we receive the error interrupt before the CS interrupt, we may find
> > ourselves without an active request to reset, skipping the GPU reset.
> > All because the attempt to reset was too early.
> >
> 
> With the tracing, we will see the out-of-sync situations, so
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

I think that's the main benefit and it makes sense to have this patch
by itself so that we can get a bit more info perhaps next time. Then if
it keeps on failing, that might justify trying the second patch.
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 3479cda37fdc..f028114714cd 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2804,6 +2804,45 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
 	return NULL;
 }
 
+static struct i915_request *
+active_context(struct intel_engine_cs *engine, u32 ccid)
+{
+	const struct intel_engine_execlists * const el = &engine->execlists;
+	struct i915_request * const *port, *rq;
+
+	/*
+	 * Use the most recent result from process_csb(), but just in case
+	 * we trigger an error (via interrupt) before the first CS event has
+	 * been written, peek at the next submission.
+	 */
+
+	for (port = el->active; (rq = *port); port++) {
+		if (upper_32_bits(rq->context->lrc_desc) == ccid) {
+			ENGINE_TRACE(engine,
+				     "ccid found at active:%zd\n",
+				     port - el->active);
+			return rq;
+		}
+	}
+
+	for (port = el->pending; (rq = *port); port++) {
+		if (upper_32_bits(rq->context->lrc_desc) == ccid) {
+			ENGINE_TRACE(engine,
+				     "ccid found at pending:%zd\n",
+				     port - el->pending);
+			return rq;
+		}
+	}
+
+	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
+	return NULL;
+}
+
+static u32 active_ccid(struct intel_engine_cs *engine)
+{
+	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
+}
+
 static bool execlists_capture(struct intel_engine_cs *engine)
 {
 	struct execlists_capture *cap;
@@ -2821,7 +2860,7 @@ static bool execlists_capture(struct intel_engine_cs *engine)
 		return true;
 
 	spin_lock_irq(&engine->active.lock);
-	cap->rq = execlists_active(&engine->execlists);
+	cap->rq = active_context(engine, active_ccid(engine));
 	if (cap->rq) {
 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
 		cap->rq = i915_request_get_rcu(cap->rq);
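
For reference, a self-contained sketch of the lookup the patch introduces: match the CCID reported by the hardware (the upper 32 bits of the execlist status, as read by active_ccid()) against the upper 32 bits of each context's lrc_desc, scanning active[] first and falling back to pending[]. The types and helper names here are simplified stand-ins, not the real i915 structures:

/* Self-contained sketch of a CCID-based lookup; illustration only. */
#include <stdint.h>
#include <stdio.h>

struct context { uint64_t lrc_desc; };
struct request { struct context *context; };

/* The CCID is carried in the upper 32 bits of the context descriptor. */
static uint32_t upper32(uint64_t v) { return (uint32_t)(v >> 32); }

static struct request *
lookup_by_ccid(struct request **active, struct request **pending, uint32_t ccid)
{
	struct request **port, *rq;

	/* Prefer the most recent process_csb() result... */
	for (port = active; (rq = *port); port++)
		if (upper32(rq->context->lrc_desc) == ccid)
			return rq;

	/* ...but fall back to the next submission if no CS event landed yet. */
	for (port = pending; (rq = *port); port++)
		if (upper32(rq->context->lrc_desc) == ccid)
			return rq;

	return NULL;
}

int main(void)
{
	struct context ctx = { .lrc_desc = 0x00000002deadbeefULL };
	struct request rq = { .context = &ctx };
	struct request *active[] = { NULL };       /* no CS event processed yet */
	struct request *pending[] = { &rq, NULL }; /* next submission */

	/* The CCID as the hardware would report it via EXECLIST_STATUS_HI. */
	uint32_t ccid = upper32(ctx.lrc_desc);

	printf("found request: %s\n",
	       lookup_by_ccid(active, pending, ccid) ? "yes" : "no");
	return 0;
}

The fallback to pending[] is what lets the error path find a request even when the error interrupt beats the first CS event for that submission.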