diff mbox series

[1/4] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno

Message ID 20190225162357.7166-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [1/4] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno | expand

Commit Message

Chris Wilson Feb. 25, 2019, 4:23 p.m. UTC
To determine whether an engine has 'stuck', we simply check whether or
not is still on the same seqno for several seconds. To keep this simple
mechanism intact over the loss of a global seqno, we can simply add a
new global heartbeat seqno instead. As we cannot know the sequence in
which requests will then be completed, we use a primitive random number
generator instead (with a cycle long enough to not matter over an
interval of a few thousand requests between hangcheck samples).

The alternative to using a dedicated seqno on every request is to issue
a heartbeat request and query its progress through the system. Sadly
this requires us to reduce struct_mutex so that we can issue requests
without requiring that bkl.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  7 ++---
 drivers/gpu/drm/i915/intel_engine_cs.c  |  5 ++--
 drivers/gpu/drm/i915/intel_hangcheck.c  |  6 ++---
 drivers/gpu/drm/i915/intel_lrc.c        | 15 +++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c | 36 +++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.h | 19 ++++++++++++-
 6 files changed, 77 insertions(+), 11 deletions(-)

Comments

Tvrtko Ursulin Feb. 25, 2019, 5:59 p.m. UTC | #1
On 25/02/2019 16:23, Chris Wilson wrote:
> To determine whether an engine has 'stuck', we simply check whether or
> not is still on the same seqno for several seconds. To keep this simple
> mechanism intact over the loss of a global seqno, we can simply add a
> new global heartbeat seqno instead. As we cannot know the sequence in
> which requests will then be completed, we use a primitive random number
> generator instead (with a cycle long enough to not matter over an
> interval of a few thousand requests between hangcheck samples).
> 
> The alternative to using a dedicated seqno on every request is to issue
> a heartbeat request and query its progress through the system. Sadly
> this requires us to reduce struct_mutex so that we can issue requests
> without requiring that bkl.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     |  7 ++---
>   drivers/gpu/drm/i915/intel_engine_cs.c  |  5 ++--
>   drivers/gpu/drm/i915/intel_hangcheck.c  |  6 ++---
>   drivers/gpu/drm/i915/intel_lrc.c        | 15 +++++++++++
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 36 +++++++++++++++++++++++--
>   drivers/gpu/drm/i915/intel_ringbuffer.h | 19 ++++++++++++-
>   6 files changed, 77 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 37175414ce89..545091a5180b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1295,7 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>   	with_intel_runtime_pm(dev_priv, wakeref) {
>   		for_each_engine(engine, dev_priv, id) {
>   			acthd[id] = intel_engine_get_active_head(engine);
> -			seqno[id] = intel_engine_get_seqno(engine);
> +			seqno[id] = intel_engine_get_hangcheck_seqno(engine);
>   		}
>   
>   		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
> @@ -1315,8 +1315,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>   	for_each_engine(engine, dev_priv, id) {
>   		seq_printf(m, "%s:\n", engine->name);
>   		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
> -			   engine->hangcheck.seqno, seqno[id],
> -			   intel_engine_last_submit(engine),
> +			   engine->hangcheck.last_seqno,
> +			   seqno[id],
> +			   engine->hangcheck.next_seqno,
>   			   jiffies_to_msecs(jiffies -
>   					    engine->hangcheck.action_timestamp));
>   
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 81b80f8fd9ea..57bc5c4fb3ff 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1497,10 +1497,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	if (i915_reset_failed(engine->i915))
>   		drm_printf(m, "*** WEDGED ***\n");
>   
> -	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
> +	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
>   		   intel_engine_get_seqno(engine),
>   		   intel_engine_last_submit(engine),
> -		   engine->hangcheck.seqno,
> +		   engine->hangcheck.last_seqno,
> +		   engine->hangcheck.next_seqno,
>   		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
>   	drm_printf(m, "\tReset count: %d (global %d)\n",
>   		   i915_reset_engine_count(error, engine),
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 9be033b6f4d2..f1d8dfc58049 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -133,21 +133,21 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
>   				  struct hangcheck *hc)
>   {
>   	hc->acthd = intel_engine_get_active_head(engine);
> -	hc->seqno = intel_engine_get_seqno(engine);
> +	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
>   }
>   
>   static void hangcheck_store_sample(struct intel_engine_cs *engine,
>   				   const struct hangcheck *hc)
>   {
>   	engine->hangcheck.acthd = hc->acthd;
> -	engine->hangcheck.seqno = hc->seqno;
> +	engine->hangcheck.last_seqno = hc->seqno;
>   }
>   
>   static enum intel_engine_hangcheck_action
>   hangcheck_get_action(struct intel_engine_cs *engine,
>   		     const struct hangcheck *hc)
>   {
> -	if (engine->hangcheck.seqno != hc->seqno)
> +	if (engine->hangcheck.last_seqno != hc->seqno)
>   		return ENGINE_ACTIVE_SEQNO;
>   
>   	if (intel_engine_is_idle(engine))
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 34a0866959c5..c134b3ca2df3 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -178,6 +178,12 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
>   		I915_GEM_HWS_INDEX_ADDR);
>   }
>   
> +static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
> +{
> +	return (i915_ggtt_offset(engine->status_page.vma) +
> +		I915_GEM_HWS_HANGCHECK_ADDR);
> +}
> +
>   static inline struct i915_priolist *to_priolist(struct rb_node *rb)
>   {
>   	return rb_entry(rb, struct i915_priolist, node);
> @@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>   				  request->fence.seqno,
>   				  request->timeline->hwsp_offset);
>   
> +	cs = gen8_emit_ggtt_write(cs,
> +				  intel_engine_next_hangcheck_seqno(request->engine),
> +				  intel_hws_hangcheck_address(request->engine));
> +
>   	cs = gen8_emit_ggtt_write(cs,
>   				  request->global_seqno,
>   				  intel_hws_seqno_address(request->engine));
> @@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>   				      PIPE_CONTROL_FLUSH_ENABLE |
>   				      PIPE_CONTROL_CS_STALL);
>   
> +	cs = gen8_emit_ggtt_write_rcs(cs,
> +				      intel_engine_next_hangcheck_seqno(request->engine),
> +				      intel_hws_hangcheck_address(request->engine),
> +				      PIPE_CONTROL_CS_STALL);

Are CS_STALL needed on two writes or only last one would be enough? Or 
even, should all flushes be moved to the last pipe control?

> +
>   	cs = gen8_emit_ggtt_write_rcs(cs,
>   				      request->global_seqno,
>   				      intel_hws_seqno_address(request->engine),
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 7f841dba87b3..870184bbd169 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -43,6 +43,12 @@
>    */
>   #define LEGACY_REQUEST_SIZE 200
>   
> +static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
> +{
> +	return (i915_ggtt_offset(engine->status_page.vma) +
> +		I915_GEM_HWS_HANGCHECK_ADDR);
> +}
> +

You can consolidate by putting the previous copy in a header.

>   static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
>   {
>   	return (i915_ggtt_offset(engine->status_page.vma) +
> @@ -316,6 +322,11 @@ static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_QW_WRITE;
> +	*cs++ = hws_hangcheck_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	*cs++ = GFX_OP_PIPE_CONTROL(4);
>   	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
>   	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
> @@ -422,6 +433,11 @@ static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = rq->timeline->hwsp_offset;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = GFX_OP_PIPE_CONTROL(4);
> +	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB;
> +	*cs++ = hws_hangcheck_address(rq->engine);
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	*cs++ = GFX_OP_PIPE_CONTROL(4);
>   	*cs++ = (PIPE_CONTROL_QW_WRITE |
>   		 PIPE_CONTROL_GLOBAL_GTT_IVB |
> @@ -447,12 +463,15 @@ static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> +	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
>   	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
>   	*cs++ = rq->global_seqno;
>   
>   	*cs++ = MI_USER_INTERRUPT;
> -	*cs++ = MI_NOOP;
>   
>   	rq->tail = intel_ring_offset(rq, cs);
>   	assert_ring_tail_valid(rq->ring, rq->tail);
> @@ -472,6 +491,10 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> +	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
>   	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
>   	*cs++ = rq->global_seqno;
> @@ -487,6 +510,7 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = 0;
>   
>   	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
>   
>   	rq->tail = intel_ring_offset(rq, cs);
>   	assert_ring_tail_valid(rq->ring, rq->tail);
> @@ -930,11 +954,16 @@ static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = MI_STORE_DWORD_INDEX;
> +	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	*cs++ = MI_STORE_DWORD_INDEX;
>   	*cs++ = I915_GEM_HWS_INDEX_ADDR;
>   	*cs++ = rq->global_seqno;
>   
>   	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
>   
>   	rq->tail = intel_ring_offset(rq, cs);
>   	assert_ring_tail_valid(rq->ring, rq->tail);
> @@ -956,6 +985,10 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
>   	*cs++ = rq->fence.seqno;
>   
> +	*cs++ = MI_STORE_DWORD_INDEX;
> +	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
> +	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
> +
>   	BUILD_BUG_ON(GEN5_WA_STORES < 1);
>   	for (i = 0; i < GEN5_WA_STORES; i++) {
>   		*cs++ = MI_STORE_DWORD_INDEX;
> @@ -964,7 +997,6 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
>   	}
>   
>   	*cs++ = MI_USER_INTERRUPT;
> -	*cs++ = MI_NOOP;
>   
>   	rq->tail = intel_ring_offset(rq, cs);
>   	assert_ring_tail_valid(rq->ring, rq->tail);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d45ad4ecca9..2869aaa9d225 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -6,6 +6,7 @@
>   
>   #include <linux/hashtable.h>
>   #include <linux/irq_work.h>
> +#include <linux/random.h>
>   #include <linux/seqlock.h>
>   
>   #include "i915_gem_batch_pool.h"
> @@ -119,7 +120,8 @@ struct intel_instdone {
>   
>   struct intel_engine_hangcheck {
>   	u64 acthd;
> -	u32 seqno;
> +	u32 last_seqno;
> +	u32 next_seqno;

Reading the code I got the impression:

s/last_seqno/hangcheck_seqno/
s/next_seqno/last_seqno/

Could be closer to reality. But your choice.

>   	unsigned long action_timestamp;
>   	struct intel_instdone instdone;
>   };
> @@ -726,6 +728,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>   #define I915_GEM_HWS_INDEX_ADDR		(I915_GEM_HWS_INDEX * sizeof(u32))
>   #define I915_GEM_HWS_PREEMPT		0x32
>   #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
> +#define I915_GEM_HWS_HANGCHECK		0x34
> +#define I915_GEM_HWS_HANGCHECK_ADDR	(I915_GEM_HWS_HANGCHECK * sizeof(u32))
>   #define I915_GEM_HWS_SEQNO		0x40
>   #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
>   #define I915_GEM_HWS_SCRATCH		0x80
> @@ -1086,4 +1090,17 @@ static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists)
>   
>   #endif
>   
> +static inline u32
> +intel_engine_next_hangcheck_seqno(struct intel_engine_cs *engine)
> +{
> +	return engine->hangcheck.next_seqno =
> +		next_pseudo_random32(engine->hangcheck.next_seqno);

Having read the implementation I am now okay with it. (I was originially 
suggestion atomic_inc_return due concerns about computation cost.)

> +}
> +
> +static inline u32
> +intel_engine_get_hangcheck_seqno(struct intel_engine_cs *engine)
> +{
> +	return intel_read_status_page(engine, I915_GEM_HWS_HANGCHECK);
> +}
> +
>   #endif /* _INTEL_RINGBUFFER_H_ */
> 

Regards,

Tvrtko
Chris Wilson Feb. 25, 2019, 6:40 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-02-25 17:59:40)
> 
> On 25/02/2019 16:23, Chris Wilson wrote:
> >   static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> >   {
> >       return rb_entry(rb, struct i915_priolist, node);
> > @@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
> >                                 request->fence.seqno,
> >                                 request->timeline->hwsp_offset);
> >   
> > +     cs = gen8_emit_ggtt_write(cs,
> > +                               intel_engine_next_hangcheck_seqno(request->engine),
> > +                               intel_hws_hangcheck_address(request->engine));
> > +
> >       cs = gen8_emit_ggtt_write(cs,
> >                                 request->global_seqno,
> >                                 intel_hws_seqno_address(request->engine));
> > @@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
> >                                     PIPE_CONTROL_FLUSH_ENABLE |
> >                                     PIPE_CONTROL_CS_STALL);
> >   
> > +     cs = gen8_emit_ggtt_write_rcs(cs,
> > +                                   intel_engine_next_hangcheck_seqno(request->engine),
> > +                                   intel_hws_hangcheck_address(request->engine),
> > +                                   PIPE_CONTROL_CS_STALL);
> 
> Are CS_STALL needed on two writes or only last one would be enough? Or 
> even, should all flushes be moved to the last pipe control?

The CS_STALL is overkill as there's no requirement for it to be before
the global_seqno, but the convenience and ease to reason over win.

> > +
> >       cs = gen8_emit_ggtt_write_rcs(cs,
> >                                     request->global_seqno,
> >                                     intel_hws_seqno_address(request->engine),
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 7f841dba87b3..870184bbd169 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -43,6 +43,12 @@
> >    */
> >   #define LEGACY_REQUEST_SIZE 200
> >   
> > +static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
> > +{
> > +     return (i915_ggtt_offset(engine->status_page.vma) +
> > +             I915_GEM_HWS_HANGCHECK_ADDR);
> > +}
> > +
> 
> You can consolidate by putting the previous copy in a header.

Inline spaghetti means it didn't go where I wanted and I purposely moved
these address computation to their users so that I can kill them off,
one by one. As is the plan even for the new hangcheck seqno.
 
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index 5d45ad4ecca9..2869aaa9d225 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -6,6 +6,7 @@
> >   
> >   #include <linux/hashtable.h>
> >   #include <linux/irq_work.h>
> > +#include <linux/random.h>
> >   #include <linux/seqlock.h>
> >   
> >   #include "i915_gem_batch_pool.h"
> > @@ -119,7 +120,8 @@ struct intel_instdone {
> >   
> >   struct intel_engine_hangcheck {
> >       u64 acthd;
> > -     u32 seqno;
> > +     u32 last_seqno;
> > +     u32 next_seqno;
> 
> Reading the code I got the impression:
> 
> s/last_seqno/hangcheck_seqno/
> s/next_seqno/last_seqno/
> 
> Could be closer to reality. But your choice.

hangcheck.last_seqno,
hangcheck.next_seqno

hangcheck.hangcheck_seqno? Nah.
-Chris
Tvrtko Ursulin Feb. 26, 2019, 7:34 a.m. UTC | #3
On 25/02/2019 18:40, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-02-25 17:59:40)
>>
>> On 25/02/2019 16:23, Chris Wilson wrote:
>>>    static inline struct i915_priolist *to_priolist(struct rb_node *rb)
>>>    {
>>>        return rb_entry(rb, struct i915_priolist, node);
>>> @@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>>>                                  request->fence.seqno,
>>>                                  request->timeline->hwsp_offset);
>>>    
>>> +     cs = gen8_emit_ggtt_write(cs,
>>> +                               intel_engine_next_hangcheck_seqno(request->engine),
>>> +                               intel_hws_hangcheck_address(request->engine));
>>> +
>>>        cs = gen8_emit_ggtt_write(cs,
>>>                                  request->global_seqno,
>>>                                  intel_hws_seqno_address(request->engine));
>>> @@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>>>                                      PIPE_CONTROL_FLUSH_ENABLE |
>>>                                      PIPE_CONTROL_CS_STALL);
>>>    
>>> +     cs = gen8_emit_ggtt_write_rcs(cs,
>>> +                                   intel_engine_next_hangcheck_seqno(request->engine),
>>> +                                   intel_hws_hangcheck_address(request->engine),
>>> +                                   PIPE_CONTROL_CS_STALL);
>>
>> Are CS_STALL needed on two writes or only last one would be enough? Or
>> even, should all flushes be moved to the last pipe control?
> 
> The CS_STALL is overkill as there's no requirement for it to be before
> the global_seqno, but the convenience and ease to reason over win.
> 
>>> +
>>>        cs = gen8_emit_ggtt_write_rcs(cs,
>>>                                      request->global_seqno,
>>>                                      intel_hws_seqno_address(request->engine),
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> index 7f841dba87b3..870184bbd169 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> @@ -43,6 +43,12 @@
>>>     */
>>>    #define LEGACY_REQUEST_SIZE 200
>>>    
>>> +static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
>>> +{
>>> +     return (i915_ggtt_offset(engine->status_page.vma) +
>>> +             I915_GEM_HWS_HANGCHECK_ADDR);
>>> +}
>>> +
>>
>> You can consolidate by putting the previous copy in a header.
> 
> Inline spaghetti means it didn't go where I wanted and I purposely moved
> these address computation to their users so that I can kill them off,
> one by one. As is the plan even for the new hangcheck seqno.
>   
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> index 5d45ad4ecca9..2869aaa9d225 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> @@ -6,6 +6,7 @@
>>>    
>>>    #include <linux/hashtable.h>
>>>    #include <linux/irq_work.h>
>>> +#include <linux/random.h>
>>>    #include <linux/seqlock.h>
>>>    
>>>    #include "i915_gem_batch_pool.h"
>>> @@ -119,7 +120,8 @@ struct intel_instdone {
>>>    
>>>    struct intel_engine_hangcheck {
>>>        u64 acthd;
>>> -     u32 seqno;
>>> +     u32 last_seqno;
>>> +     u32 next_seqno;
>>
>> Reading the code I got the impression:
>>
>> s/last_seqno/hangcheck_seqno/
>> s/next_seqno/last_seqno/
>>
>> Could be closer to reality. But your choice.
> 
> hangcheck.last_seqno,
> hangcheck.next_seqno
> 
> hangcheck.hangcheck_seqno? Nah.

Ok have at it.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson Feb. 26, 2019, 7:46 a.m. UTC | #4
Quoting Tvrtko Ursulin (2019-02-26 07:34:37)
> 
> On 25/02/2019 18:40, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-02-25 17:59:40)
> >>
> >> On 25/02/2019 16:23, Chris Wilson wrote:
> >>>    static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> >>>    {
> >>>        return rb_entry(rb, struct i915_priolist, node);
> >>> @@ -2206,6 +2212,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
> >>>                                  request->fence.seqno,
> >>>                                  request->timeline->hwsp_offset);
> >>>    
> >>> +     cs = gen8_emit_ggtt_write(cs,
> >>> +                               intel_engine_next_hangcheck_seqno(request->engine),
> >>> +                               intel_hws_hangcheck_address(request->engine));
> >>> +
> >>>        cs = gen8_emit_ggtt_write(cs,
> >>>                                  request->global_seqno,
> >>>                                  intel_hws_seqno_address(request->engine));
> >>> @@ -2230,6 +2240,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
> >>>                                      PIPE_CONTROL_FLUSH_ENABLE |
> >>>                                      PIPE_CONTROL_CS_STALL);
> >>>    
> >>> +     cs = gen8_emit_ggtt_write_rcs(cs,
> >>> +                                   intel_engine_next_hangcheck_seqno(request->engine),
> >>> +                                   intel_hws_hangcheck_address(request->engine),
> >>> +                                   PIPE_CONTROL_CS_STALL);
> >>
> >> Are CS_STALL needed on two writes or only last one would be enough? Or
> >> even, should all flushes be moved to the last pipe control?
> > 
> > The CS_STALL is overkill as there's no requirement for it to be before
> > the global_seqno, but the convenience and ease to reason over win.

[snip]

> Ok have at it.
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

I was just about to resend without the CS_STALL...
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 37175414ce89..545091a5180b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1295,7 +1295,7 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	with_intel_runtime_pm(dev_priv, wakeref) {
 		for_each_engine(engine, dev_priv, id) {
 			acthd[id] = intel_engine_get_active_head(engine);
-			seqno[id] = intel_engine_get_seqno(engine);
+			seqno[id] = intel_engine_get_hangcheck_seqno(engine);
 		}
 
 		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
@@ -1315,8 +1315,9 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	for_each_engine(engine, dev_priv, id) {
 		seq_printf(m, "%s:\n", engine->name);
 		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
-			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine),
+			   engine->hangcheck.last_seqno,
+			   seqno[id],
+			   engine->hangcheck.next_seqno,
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 81b80f8fd9ea..57bc5c4fb3ff 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1497,10 +1497,11 @@  void intel_engine_dump(struct intel_engine_cs *engine,
 	if (i915_reset_failed(engine->i915))
 		drm_printf(m, "*** WEDGED ***\n");
 
-	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
+	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
 		   intel_engine_get_seqno(engine),
 		   intel_engine_last_submit(engine),
-		   engine->hangcheck.seqno,
+		   engine->hangcheck.last_seqno,
+		   engine->hangcheck.next_seqno,
 		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 	drm_printf(m, "\tReset count: %d (global %d)\n",
 		   i915_reset_engine_count(error, engine),
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 9be033b6f4d2..f1d8dfc58049 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -133,21 +133,21 @@  static void hangcheck_load_sample(struct intel_engine_cs *engine,
 				  struct hangcheck *hc)
 {
 	hc->acthd = intel_engine_get_active_head(engine);
-	hc->seqno = intel_engine_get_seqno(engine);
+	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
 				   const struct hangcheck *hc)
 {
 	engine->hangcheck.acthd = hc->acthd;
-	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.last_seqno = hc->seqno;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct hangcheck *hc)
 {
-	if (engine->hangcheck.seqno != hc->seqno)
+	if (engine->hangcheck.last_seqno != hc->seqno)
 		return ENGINE_ACTIVE_SEQNO;
 
 	if (intel_engine_is_idle(engine))
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 34a0866959c5..c134b3ca2df3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -178,6 +178,12 @@  static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 		I915_GEM_HWS_INDEX_ADDR);
 }
 
+static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+	return (i915_ggtt_offset(engine->status_page.vma) +
+		I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
 	return rb_entry(rb, struct i915_priolist, node);
@@ -2206,6 +2212,10 @@  static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
 				  request->fence.seqno,
 				  request->timeline->hwsp_offset);
 
+	cs = gen8_emit_ggtt_write(cs,
+				  intel_engine_next_hangcheck_seqno(request->engine),
+				  intel_hws_hangcheck_address(request->engine));
+
 	cs = gen8_emit_ggtt_write(cs,
 				  request->global_seqno,
 				  intel_hws_seqno_address(request->engine));
@@ -2230,6 +2240,11 @@  static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 				      PIPE_CONTROL_FLUSH_ENABLE |
 				      PIPE_CONTROL_CS_STALL);
 
+	cs = gen8_emit_ggtt_write_rcs(cs,
+				      intel_engine_next_hangcheck_seqno(request->engine),
+				      intel_hws_hangcheck_address(request->engine),
+				      PIPE_CONTROL_CS_STALL);
+
 	cs = gen8_emit_ggtt_write_rcs(cs,
 				      request->global_seqno,
 				      intel_hws_seqno_address(request->engine),
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 7f841dba87b3..870184bbd169 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -43,6 +43,12 @@ 
  */
 #define LEGACY_REQUEST_SIZE 200
 
+static inline u32 hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+	return (i915_ggtt_offset(engine->status_page.vma) +
+		I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
 static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 {
 	return (i915_ggtt_offset(engine->status_page.vma) +
@@ -316,6 +322,11 @@  static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_QW_WRITE;
+	*cs++ = hws_hangcheck_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = GFX_OP_PIPE_CONTROL(4);
 	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
@@ -422,6 +433,11 @@  static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = rq->timeline->hwsp_offset;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB;
+	*cs++ = hws_hangcheck_address(rq->engine);
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = GFX_OP_PIPE_CONTROL(4);
 	*cs++ = (PIPE_CONTROL_QW_WRITE |
 		 PIPE_CONTROL_GLOBAL_GTT_IVB |
@@ -447,12 +463,15 @@  static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -472,6 +491,10 @@  static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
@@ -487,6 +510,7 @@  static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = 0;
 
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -930,11 +954,16 @@  static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_STORE_DWORD_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR;
 	*cs++ = rq->global_seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
@@ -956,6 +985,10 @@  static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	BUILD_BUG_ON(GEN5_WA_STORES < 1);
 	for (i = 0; i < GEN5_WA_STORES; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
@@ -964,7 +997,6 @@  static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	}
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d45ad4ecca9..2869aaa9d225 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -6,6 +6,7 @@ 
 
 #include <linux/hashtable.h>
 #include <linux/irq_work.h>
+#include <linux/random.h>
 #include <linux/seqlock.h>
 
 #include "i915_gem_batch_pool.h"
@@ -119,7 +120,8 @@  struct intel_instdone {
 
 struct intel_engine_hangcheck {
 	u64 acthd;
-	u32 seqno;
+	u32 last_seqno;
+	u32 next_seqno;
 	unsigned long action_timestamp;
 	struct intel_instdone instdone;
 };
@@ -726,6 +728,8 @@  intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_INDEX_ADDR		(I915_GEM_HWS_INDEX * sizeof(u32))
 #define I915_GEM_HWS_PREEMPT		0x32
 #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
+#define I915_GEM_HWS_HANGCHECK		0x34
+#define I915_GEM_HWS_HANGCHECK_ADDR	(I915_GEM_HWS_HANGCHECK * sizeof(u32))
 #define I915_GEM_HWS_SEQNO		0x40
 #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
 #define I915_GEM_HWS_SCRATCH		0x80
@@ -1086,4 +1090,17 @@  static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists)
 
 #endif
 
+static inline u32
+intel_engine_next_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+	return engine->hangcheck.next_seqno =
+		next_pseudo_random32(engine->hangcheck.next_seqno);
+}
+
+static inline u32
+intel_engine_get_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+	return intel_read_status_page(engine, I915_GEM_HWS_HANGCHECK);
+}
+
 #endif /* _INTEL_RINGBUFFER_H_ */