[01/14] drm/i915/hangcheck: Track context changes
diff mbox series

Message ID 20190501114541.10077-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/14] drm/i915/hangcheck: Track context changes
Related show

Commit Message

Chris Wilson May 1, 2019, 11:45 a.m. UTC
Given sufficient preemption, we may see a busy system that doesn't
advance seqno while performing work across multiple contexts, and given
sufficient pathology not even notice a change in ACTHD. What does change
between the preempting contexts is their RING, so take note of that and
treat a change in the ring address as being an indication of forward
progress.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
 drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin May 3, 2019, 10:36 a.m. UTC | #1
On 01/05/2019 12:45, Chris Wilson wrote:
> Given sufficient preemption, we may see a busy system that doesn't
> advance seqno while performing work across multiple contexts, and given
> sufficient pathology not even notice a change in ACTHD. What does change
> between the preempting contexts is their RING, so take note of that and
> treat a change in the ring address as being an indication of forward
> progress.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
>   drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
>   2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 9d64e33f8427..c0ab11b12e14 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -53,6 +53,7 @@ struct intel_instdone {
>   
>   struct intel_engine_hangcheck {
>   	u64 acthd;
> +	u32 last_ring;
>   	u32 last_seqno;
>   	u32 next_seqno;
>   	unsigned long action_timestamp;
> diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> index e5eaa06fe74d..721ab74a382f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> @@ -27,6 +27,7 @@
>   
>   struct hangcheck {
>   	u64 acthd;
> +	u32 ring;
>   	u32 seqno;
>   	enum intel_engine_hangcheck_action action;
>   	unsigned long action_timestamp;
> @@ -134,6 +135,7 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
>   {
>   	hc->acthd = intel_engine_get_active_head(engine);
>   	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
> +	hc->ring = ENGINE_READ(engine, RING_START);
>   }
>   
>   static void hangcheck_store_sample(struct intel_engine_cs *engine,
> @@ -141,18 +143,22 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
>   {
>   	engine->hangcheck.acthd = hc->acthd;
>   	engine->hangcheck.last_seqno = hc->seqno;
> +	engine->hangcheck.last_ring = hc->ring;
>   }
>   
>   static enum intel_engine_hangcheck_action
>   hangcheck_get_action(struct intel_engine_cs *engine,
>   		     const struct hangcheck *hc)
>   {
> -	if (engine->hangcheck.last_seqno != hc->seqno)
> -		return ENGINE_ACTIVE_SEQNO;
> -
>   	if (intel_engine_is_idle(engine))
>   		return ENGINE_IDLE;
>   
> +	if (engine->hangcheck.last_ring != hc->ring)
> +		return ENGINE_ACTIVE_SEQNO;
> +
> +	if (engine->hangcheck.last_seqno != hc->seqno)
> +		return ENGINE_ACTIVE_SEQNO;
> +
>   	return engine_stuck(engine, hc->acthd);
>   }
>   
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

This should be associated with engine seqno removal, right? Not sure if 
it triggers in reality to be really needed.

Regards,

Tvrtko
Chris Wilson May 3, 2019, 10:43 a.m. UTC | #2
Quoting Tvrtko Ursulin (2019-05-03 11:36:55)
> 
> On 01/05/2019 12:45, Chris Wilson wrote:
> > Given sufficient preemption, we may see a busy system that doesn't
> > advance seqno while performing work across multiple contexts, and given
> > sufficient pathology not even notice a change in ACTHD. What does change
> > between the preempting contexts is their RING, so take note of that and
> > treat a change in the ring address as being an indication of forward
> > progress.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h |  1 +
> >   drivers/gpu/drm/i915/gt/intel_hangcheck.c    | 12 +++++++++---
> >   2 files changed, 10 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 9d64e33f8427..c0ab11b12e14 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -53,6 +53,7 @@ struct intel_instdone {
> >   
> >   struct intel_engine_hangcheck {
> >       u64 acthd;
> > +     u32 last_ring;
> >       u32 last_seqno;
> >       u32 next_seqno;
> >       unsigned long action_timestamp;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > index e5eaa06fe74d..721ab74a382f 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
> > @@ -27,6 +27,7 @@
> >   
> >   struct hangcheck {
> >       u64 acthd;
> > +     u32 ring;
> >       u32 seqno;
> >       enum intel_engine_hangcheck_action action;
> >       unsigned long action_timestamp;
> > @@ -134,6 +135,7 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
> >   {
> >       hc->acthd = intel_engine_get_active_head(engine);
> >       hc->seqno = intel_engine_get_hangcheck_seqno(engine);
> > +     hc->ring = ENGINE_READ(engine, RING_START);
> >   }
> >   
> >   static void hangcheck_store_sample(struct intel_engine_cs *engine,
> > @@ -141,18 +143,22 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
> >   {
> >       engine->hangcheck.acthd = hc->acthd;
> >       engine->hangcheck.last_seqno = hc->seqno;
> > +     engine->hangcheck.last_ring = hc->ring;
> >   }
> >   
> >   static enum intel_engine_hangcheck_action
> >   hangcheck_get_action(struct intel_engine_cs *engine,
> >                    const struct hangcheck *hc)
> >   {
> > -     if (engine->hangcheck.last_seqno != hc->seqno)
> > -             return ENGINE_ACTIVE_SEQNO;
> > -
> >       if (intel_engine_is_idle(engine))
> >               return ENGINE_IDLE;
> >   
> > +     if (engine->hangcheck.last_ring != hc->ring)
> > +             return ENGINE_ACTIVE_SEQNO;
> > +
> > +     if (engine->hangcheck.last_seqno != hc->seqno)
> > +             return ENGINE_ACTIVE_SEQNO;
> > +
> >       return engine_stuck(engine, hc->acthd);
> >   }
> >   
> > 
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> This should be associated with engine seqno removal, right? Not sure if 
> it triggers in reality to be really needed.

Yeah, I'm not convinced we have a pressing need until timeslicing as
userspace can only create 1024 preemption events by itself, and that
should be ok... I can imagine that userspace submits a semaphore at
address 0 in each and waits 1s before submitting the next preemption
event... That would fire the current hangcheck. (Possibly legitimately
but the blame would not be effective at defeating the hostile client.)

It wasn't until timeslicing that I noticed the effect in practice.
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 9d64e33f8427..c0ab11b12e14 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -53,6 +53,7 @@  struct intel_instdone {
 
 struct intel_engine_hangcheck {
 	u64 acthd;
+	u32 last_ring;
 	u32 last_seqno;
 	u32 next_seqno;
 	unsigned long action_timestamp;
diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
index e5eaa06fe74d..721ab74a382f 100644
--- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c
@@ -27,6 +27,7 @@ 
 
 struct hangcheck {
 	u64 acthd;
+	u32 ring;
 	u32 seqno;
 	enum intel_engine_hangcheck_action action;
 	unsigned long action_timestamp;
@@ -134,6 +135,7 @@  static void hangcheck_load_sample(struct intel_engine_cs *engine,
 {
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
+	hc->ring = ENGINE_READ(engine, RING_START);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -141,18 +143,22 @@  static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.last_seqno = hc->seqno;
+	engine->hangcheck.last_ring = hc->ring;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct hangcheck *hc)
 {
-	if (engine->hangcheck.last_seqno != hc->seqno)
-		return ENGINE_ACTIVE_SEQNO;
-
 	if (intel_engine_is_idle(engine))
 		return ENGINE_IDLE;
 
+	if (engine->hangcheck.last_ring != hc->ring)
+		return ENGINE_ACTIVE_SEQNO;
+
+	if (engine->hangcheck.last_seqno != hc->seqno)
+		return ENGINE_ACTIVE_SEQNO;
+
 	return engine_stuck(engine, hc->acthd);
 }