[01/14] drm/i915/hangcheck: Track context changes

Message ID	20190501114541.10077-1-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Wed, 1 May 2019 12:45:28 +0100 Message-Id: <20190501114541.10077-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH 01/14] drm/i915/hangcheck: Track context changes Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[01/14] drm/i915/hangcheck: Track context changes | expand [01/14] drm/i915/hangcheck: Track context changes [02/14] drm/i915: Include fence signaled bit in print_request() [03/14] drm/i915/execlists: Flush the tasklet on parking [04/14] drm/i915: Leave engine parking to the engines [05/14] drm/i915: Remove delay for idle_work [06/14] drm/i915: Cancel retire_worker on parking [07/14] drm/i915: Stop spinning for DROP_IDLE (debugfs/i915_drop_caches) [08/14] drm/i915: Only reschedule the submission tasklet if preemption is possible [09/14] drm/i915: Delay semaphore submission until the start of the signaler [10/14] drm/i915/execlists: Don't apply priority boost for resets [11/14] drm/i915: Rearrange i915_scheduler.c [12/14] drm/i915: Pass i915_sched_node around internally [13/14] drm/i915: Bump signaler priority on adding a waiter [14/14] drm/i915: Convert inconsistent static engine tables into an init error

Message ID

20190501114541.10077-1-chris@chris-wilson.co.uk (mailing list archive)

State

New, archived

Headers

From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Wed,  1 May 2019 12:45:28 +0100
Message-Id: <20190501114541.10077-1-chris@chris-wilson.co.uk>
MIME-Version: 1.0
Subject: [Intel-gfx] [PATCH 01/14] drm/i915/hangcheck: Track context changes
Precedence: list
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Series

[01/14] drm/i915/hangcheck: Track context changes | expand

Commit Message

Chris Wilson May 1, 2019, 11:45 a.m. UTC

Given sufficient preemption, we may see a busy system that doesn't
advance seqno while performing work across multiple contexts, and given
sufficient pathology not even notice a change in ACTHD. What does change
between the preempting contexts is their RING, so take note of that and
treat a change in the ring address as being an indication of forward
progress.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
 drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin May 3, 2019, 10:36 a.m. UTC | #1

On 01/05/2019 12:45, Chris Wilson wrote:
> Given sufficient preemption, we may see a busy system that doesn't
> advance seqno while performing work across multiple contexts, and given
> sufficient pathology not even notice a change in ACTHD. What does change
> between the preempting contexts is their RING, so take note of that and
> treat a change in the ring address as being an indication of forward
> progress.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
>   drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
>   2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 9d64e33f8427..c0ab11b12e14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -53,6 +53,7 @@ struct intel_instdone {
>   
>   struct intel_engine_hangcheck {
>   	u64 acthd;
> +	u32 last_ring;
>   	u32 last_seqno;
>   	u32 next_seqno;
>   	unsigned long action_timestamp;
> diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> index e5eaa06fe74d..721ab74a382f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> @@ -27,6 +27,7 @@
>   
>   struct hangcheck {
>   	u64 acthd;
> +	u32 ring;
>   	u32 seqno;
>   	enum intel_engine_hangcheck_action action;
>   	unsigned long action_timestamp;
> @@ -134,6 +135,7 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
>   {
>   	hc->acthd = intel_engine_get_active_head(engine);
>   	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
> +	hc->ring = ENGINE_READ(engine, RING_START);
>   }
>   
>   static void hangcheck_store_sample(struct intel_engine_cs *engine,
> @@ -141,18 +143,22 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
>   {
>   	engine->hangcheck.acthd = hc->acthd;
>   	engine->hangcheck.last_seqno = hc->seqno;
> +	engine->hangcheck.last_ring = hc->ring;
>   }
>   
>   static enum intel_engine_hangcheck_action
>   hangcheck_get_action(struct intel_engine_cs *engine,
>   		     const struct hangcheck *hc)
>   {
> -	if (engine->hangcheck.last_seqno != hc->seqno)
> -		return ENGINE_ACTIVE_SEQNO;
> -
>   	if (intel_engine_is_idle(engine))
>   		return ENGINE_IDLE;
>   
> +	if (engine->hangcheck.last_ring != hc->ring)
> +		return ENGINE_ACTIVE_SEQNO;
> +
> +	if (engine->hangcheck.last_seqno != hc->seqno)
> +		return ENGINE_ACTIVE_SEQNO;
> +
>   	return engine_stuck(engine, hc->acthd);
>   }
>   
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

This should be associated with engine seqno removal, right? Not sure if 
it triggers in reality to be really needed.

Regards,

Tvrtko

Chris Wilson May 3, 2019, 10:43 a.m. UTC | #2

Quoting Tvrtko Ursulin (2019-05-03 11:36:55)
> 
> On 01/05/2019 12:45, Chris Wilson wrote:
> > Given sufficient preemption, we may see a busy system that doesn't
> > advance seqno while performing work across multiple contexts, and given
> > sufficient pathology not even notice a change in ACTHD. What does change
> > between the preempting contexts is their RING, so take note of that and
> > treat a change in the ring address as being an indication of forward
> > progress.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
> >   drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
> >   2 files changed, 10 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 9d64e33f8427..c0ab11b12e14 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -53,6 +53,7 @@ struct intel_instdone {
> >   
> >   struct intel_engine_hangcheck {
> >       u64 acthd;
> > +     u32 last_ring;
> >       u32 last_seqno;
> >       u32 next_seqno;
> >       unsigned long action_timestamp;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > index e5eaa06fe74d..721ab74a382f 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > @@ -27,6 +27,7 @@
> >   
> >   struct hangcheck {
> >       u64 acthd;
> > +     u32 ring;
> >       u32 seqno;
> >       enum intel_engine_hangcheck_action action;
> >       unsigned long action_timestamp;
> > @@ -134,6 +135,7 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
> >   {
> >       hc->acthd = intel_engine_get_active_head(engine);
> >       hc->seqno = intel_engine_get_hangcheck_seqno(engine);
> > +     hc->ring = ENGINE_READ(engine, RING_START);
> >   }
> >   
> >   static void hangcheck_store_sample(struct intel_engine_cs *engine,
> > @@ -141,18 +143,22 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
> >   {
> >       engine->hangcheck.acthd = hc->acthd;
> >       engine->hangcheck.last_seqno = hc->seqno;
> > +     engine->hangcheck.last_ring = hc->ring;
> >   }
> >   
> >   static enum intel_engine_hangcheck_action
> >   hangcheck_get_action(struct intel_engine_cs *engine,
> >                    const struct hangcheck *hc)
> >   {
> > -     if (engine->hangcheck.last_seqno != hc->seqno)
> > -             return ENGINE_ACTIVE_SEQNO;
> > -
> >       if (intel_engine_is_idle(engine))
> >               return ENGINE_IDLE;
> >   
> > +     if (engine->hangcheck.last_ring != hc->ring)
> > +             return ENGINE_ACTIVE_SEQNO;
> > +
> > +     if (engine->hangcheck.last_seqno != hc->seqno)
> > +             return ENGINE_ACTIVE_SEQNO;
> > +
> >       return engine_stuck(engine, hc->acthd);
> >   }
> >   
> > 
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> This should be associated with engine seqno removal, right? Not sure if 
> it triggers in reality to be really needed.

Yeah, I'm not convinced we have a pressing need until timeslicing as
userspace can only create 1024 preemption events by itself, and that
should be ok... I can imagine that userspace submits a semaphore at
address 0 in each and waits 1s before submitting the next preemption
event... That would fire the current hangcheck. (Possibly legitimately
but the blame would not be effective at defeating the hostile client.)

It wasn't until timeslicing that I noticed the effect in practice.
-Chris

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 9d64e33f8427..c0ab11b12e14 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -53,6 +53,7 @@  struct intel_instdone {
 
 struct intel_engine_hangcheck {
 	u64 acthd;
+	u32 last_ring;
 	u32 last_seqno;
 	u32 next_seqno;
 	unsigned long action_timestamp;
diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
index e5eaa06fe74d..721ab74a382f 100644
--- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
@@ -27,6 +27,7 @@ 
 
 struct hangcheck {
 	u64 acthd;
+	u32 ring;
 	u32 seqno;
 	enum intel_engine_hangcheck_action action;
 	unsigned long action_timestamp;
@@ -134,6 +135,7 @@  static void hangcheck_load_sample(struct intel_engine_cs *engine,
 {
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
+	hc->ring = ENGINE_READ(engine, RING_START);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -141,18 +143,22 @@  static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.last_seqno = hc->seqno;
+	engine->hangcheck.last_ring = hc->ring;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct hangcheck *hc)
 {
-	if (engine->hangcheck.last_seqno != hc->seqno)
-		return ENGINE_ACTIVE_SEQNO;
-
 	if (intel_engine_is_idle(engine))
 		return ENGINE_IDLE;
 
+	if (engine->hangcheck.last_ring != hc->ring)
+		return ENGINE_ACTIVE_SEQNO;
+
+	if (engine->hangcheck.last_seqno != hc->seqno)
+		return ENGINE_ACTIVE_SEQNO;
+
 	return engine_stuck(engine, hc->acthd);
 }

[01/14] drm/i915/hangcheck: Track context changes

Commit Message

Comments

Patch