diff mbox series

drm/i915/watchdog: move emit_stop_watchdog until the very end of the ring commands

Message ID 20190105024001.37629-8-carlos.santa@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/watchdog: move emit_stop_watchdog until the very end of the ring commands | expand

Commit Message

Santa, Carlos Jan. 5, 2019, 2:40 a.m. UTC
From: Michel Thierry <michel.thierry@intel.com>

On command streams that could potentially hang the GPU after a last
flush command, it's best not to cancel the watchdog
until after all commands have executed.

Patch shared by Michel Thierry through IIRC after reproduction on
my local setup.

Tested-by: Carlos Santa <carlos.santa@intel.com>
CC: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 53 +++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 8 deletions(-)

Comments

Tvrtko Ursulin Jan. 7, 2019, 12:50 p.m. UTC | #1
On 05/01/2019 02:40, Carlos Santa wrote:
> From: Michel Thierry <michel.thierry@intel.com>
> 
> On command streams that could potentially hang the GPU after a last
> flush command, it's best not to cancel the watchdog
> until after all commands have executed.
> 
> Patch shared by Michel Thierry through IIRC after reproduction on

Joonas pointed out on IRC that IRC is called IRC. :)

> my local setup.
> 
> Tested-by: Carlos Santa <carlos.santa@intel.com>
> CC: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 53 +++++++++++++++++++++++++++-----
>   1 file changed, 45 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0afcbeb18329..25ba5fcc9466 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1885,8 +1885,8 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   		GEM_BUG_ON(!engine->emit_start_watchdog ||
>   			   !engine->emit_stop_watchdog);
>   
> -		/* + start_watchdog (6) + stop_watchdog (4) */
> -		num_dwords += 10;
> +		/* + start_watchdog (6) */
> +		num_dwords += 6;
>   		watchdog_running = true;
>           }
>   
> @@ -1927,10 +1927,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>   	*cs++ = MI_NOOP;
>   
> -	if (watchdog_running) {
> -		/* Cancel watchdog timer */
> -		cs = engine->emit_stop_watchdog(rq, cs);
> -	}
> +	// XXX: emit_stop_watchdog happens in gen8_emit_breadcrumb_vcs

No C++ comments please. And what does XXX mean? Doesn't feel like it 
belongs.

>   
>   	intel_ring_advance(rq, cs);
>   	return 0;
> @@ -2189,6 +2186,37 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
>   }
>   static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
>   
> +static void gen8_emit_breadcrumb_vcs(struct i915_request *request, u32 *cs)
> +{
> +	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
> +	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
> +
> +	cs = gen8_emit_ggtt_write(cs, request->global_seqno,
> +				  intel_hws_seqno_address(request->engine));
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +
> +	// stop_watchdog at the very end of the ring commands
> +	if (request->gem_context->__engine[VCS].watchdog_threshold != 0)

VCS is wrong. Whole check needs to be to_intel_context(ctx, 
engine)->watchdog_threshold I think.

> +	{
> +		/* Cancel watchdog timer */
> +		GEM_BUG_ON(!request->engine->emit_stop_watchdog);
> +		cs = request->engine->emit_stop_watchdog(request, cs);
> +	}
> +	else
> +	{

Coding style is wrong (curly braces for if else).

> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +	}
> +
> +	request->tail = intel_ring_offset(request, cs);
> +	assert_ring_tail_valid(request->ring, request->tail);
> +	gen8_emit_wa_tail(request, cs);
> +}
> +static const int gen8_emit_breadcrumb_vcs_sz = 6 + WA_TAIL_DWORDS + 4; //+4 for optional stop_watchdog
> +
>   static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>   {
>   	/* We're using qword write, seqno should be aligned to 8 bytes. */
> @@ -2306,8 +2334,17 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   	engine->request_alloc = execlists_request_alloc;
>   
>   	engine->emit_flush = gen8_emit_flush;
> -	engine->emit_breadcrumb = gen8_emit_breadcrumb;
> -	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> +
> +	if (engine->id == VCS || engine->id == VCS2)

What about VCS3 or 4? Hint use engine class.

And what about RCS and VECS?

> +	{
> +		engine->emit_breadcrumb = gen8_emit_breadcrumb_vcs;
> +		engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_vcs_sz;
> +	}
> +	else
> +	{
> +		engine->emit_breadcrumb = gen8_emit_breadcrumb;
> +		engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> +	}
>   
>   	engine->set_default_submission = intel_execlists_set_default_submission;
>   
> 

Looks like the patch should be squashed with the one which implements 
watchdog emit start/end? I mean if the whole setup has broken edge cases 
without this..

Regards,

Tvrtko
Chris Wilson Jan. 7, 2019, 12:54 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-01-07 12:50:24)
> 
> On 05/01/2019 02:40, Carlos Santa wrote:
> > +static void gen8_emit_breadcrumb_vcs(struct i915_request *request, u32 *cs)
> > +{
> > +     /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
> > +     BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
> > +
> > +     cs = gen8_emit_ggtt_write(cs, request->global_seqno,
> > +                               intel_hws_seqno_address(request->engine));
> > +     *cs++ = MI_USER_INTERRUPT;
> > +     *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +
> > +     // stop_watchdog at the very end of the ring commands
> > +     if (request->gem_context->__engine[VCS].watchdog_threshold != 0)
> 
> VCS is wrong. Whole check needs to be to_intel_context(ctx, 
> engine)->watchdog_threshold I think.

You too! rq->hw_context->watchdog_threshold. It's as if hw_context may
not even be part of gem_context...
-Chris
Tvrtko Ursulin Jan. 7, 2019, 1:01 p.m. UTC | #3
On 07/01/2019 12:54, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-07 12:50:24)
>>
>> On 05/01/2019 02:40, Carlos Santa wrote:
>>> +static void gen8_emit_breadcrumb_vcs(struct i915_request *request, u32 *cs)
>>> +{
>>> +     /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
>>> +     BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
>>> +
>>> +     cs = gen8_emit_ggtt_write(cs, request->global_seqno,
>>> +                               intel_hws_seqno_address(request->engine));
>>> +     *cs++ = MI_USER_INTERRUPT;
>>> +     *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>>> +
>>> +     // stop_watchdog at the very end of the ring commands
>>> +     if (request->gem_context->__engine[VCS].watchdog_threshold != 0)
>>
>> VCS is wrong. Whole check needs to be to_intel_context(ctx,
>> engine)->watchdog_threshold I think.
> 
> You too! rq->hw_context->watchdog_threshold. It's as if hw_context may
> not even be part of gem_context...

Oops.. this is what happens when you just review and review - new stuff 
does not get ingrained in memory unless typing it enough. :)

Regards,

Tvrtko
Santa, Carlos Jan. 11, 2019, 2:25 a.m. UTC | #4
On Mon, 2019-01-07 at 12:50 +0000, Tvrtko Ursulin wrote:
> On 05/01/2019 02:40, Carlos Santa wrote:
> > From: Michel Thierry <michel.thierry@intel.com>
> > 
> > On command streams that could potentially hang the GPU after a last
> > flush command, it's best not to cancel the watchdog
> > until after all commands have executed.
> > 
> > Patch shared by Michel Thierry through IIRC after reproduction on
> 
> Joonas pointed out on IRC that IRC is called IRC. :)
> 
> > my local setup.
> > 
> > Tested-by: Carlos Santa <carlos.santa@intel.com>
> > CC: Antonio Argenziano <antonio.argenziano@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> > Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_lrc.c | 53
> > +++++++++++++++++++++++++++-----
> >   1 file changed, 45 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 0afcbeb18329..25ba5fcc9466 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -1885,8 +1885,8 @@ static int gen8_emit_bb_start(struct
> > i915_request *rq,
> >   		GEM_BUG_ON(!engine->emit_start_watchdog ||
> >   			   !engine->emit_stop_watchdog);
> >   
> > -		/* + start_watchdog (6) + stop_watchdog (4) */
> > -		num_dwords += 10;
> > +		/* + start_watchdog (6) */
> > +		num_dwords += 6;
> >   		watchdog_running = true;
> >           }
> >   
> > @@ -1927,10 +1927,7 @@ static int gen8_emit_bb_start(struct
> > i915_request *rq,
> >   	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> >   	*cs++ = MI_NOOP;
> >   
> > -	if (watchdog_running) {
> > -		/* Cancel watchdog timer */
> > -		cs = engine->emit_stop_watchdog(rq, cs);
> > -	}
> > +	// XXX: emit_stop_watchdog happens in gen8_emit_breadcrumb_vcs
> 
> No C++ comments please. And what does XXX mean? Doesn't feel like it 
> belongs.
> 
> >   
> >   	intel_ring_advance(rq, cs);
> >   	return 0;
> > @@ -2189,6 +2186,37 @@ static void gen8_emit_breadcrumb(struct
> > i915_request *request, u32 *cs)
> >   }
> >   static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
> >   
> > +static void gen8_emit_breadcrumb_vcs(struct i915_request *request,
> > u32 *cs)
> > +{
> > +	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
> > +	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
> > +
> > +	cs = gen8_emit_ggtt_write(cs, request->global_seqno,
> > +				  intel_hws_seqno_address(request-
> > >engine));
> > +	*cs++ = MI_USER_INTERRUPT;
> > +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> > +
> > +	// stop_watchdog at the very end of the ring commands
> > +	if (request->gem_context->__engine[VCS].watchdog_threshold !=
> > 0)
> 
> VCS is wrong. Whole check needs to be to_intel_context(ctx, 
> engine)->watchdog_threshold I think.
> 
> > +	{
> > +		/* Cancel watchdog timer */
> > +		GEM_BUG_ON(!request->engine->emit_stop_watchdog);
> > +		cs = request->engine->emit_stop_watchdog(request, cs);
> > +	}
> > +	else
> > +	{
> 
> Coding style is wrong (curly braces for if else).
> 
> > +		*cs++ = MI_NOOP;
> > +		*cs++ = MI_NOOP;
> > +		*cs++ = MI_NOOP;
> > +		*cs++ = MI_NOOP;
> > +	}
> > +
> > +	request->tail = intel_ring_offset(request, cs);
> > +	assert_ring_tail_valid(request->ring, request->tail);
> > +	gen8_emit_wa_tail(request, cs);
> > +}
> > +static const int gen8_emit_breadcrumb_vcs_sz = 6 + WA_TAIL_DWORDS
> > + 4; //+4 for optional stop_watchdog
> > +
> >   static void gen8_emit_breadcrumb_rcs(struct i915_request
> > *request, u32 *cs)
> >   {
> >   	/* We're using qword write, seqno should be aligned to 8 bytes.
> > */
> > @@ -2306,8 +2334,17 @@ logical_ring_default_vfuncs(struct
> > intel_engine_cs *engine)
> >   	engine->request_alloc = execlists_request_alloc;
> >   
> >   	engine->emit_flush = gen8_emit_flush;
> > -	engine->emit_breadcrumb = gen8_emit_breadcrumb;
> > -	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> > +
> > +	if (engine->id == VCS || engine->id == VCS2)
> 
> What about VCS3 or 4? Hint use engine class.
> 
> And what about RCS and VECS?
> 
> > +	{
> > +		engine->emit_breadcrumb = gen8_emit_breadcrumb_vcs;
> > +		engine->emit_breadcrumb_sz =
> > gen8_emit_breadcrumb_vcs_sz;
> > +	}
> > +	else
> > +	{
> > +		engine->emit_breadcrumb = gen8_emit_breadcrumb;
> > +		engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> > +	}
> >   
> >   	engine->set_default_submission =
> > intel_execlists_set_default_submission;
> >   
> > 
> 
> Looks like the patch should be squashed with the one which
> implements 
> watchdog emit start/end? I mean if the whole setup has broken edge
> cases 
> without this..

Ok, I'll rework the above and squash it with the watchdog emit/start
patch
thx, CS

> 
> Regards,
> 
> Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0afcbeb18329..25ba5fcc9466 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1885,8 +1885,8 @@  static int gen8_emit_bb_start(struct i915_request *rq,
 		GEM_BUG_ON(!engine->emit_start_watchdog ||
 			   !engine->emit_stop_watchdog);
 
-		/* + start_watchdog (6) + stop_watchdog (4) */
-		num_dwords += 10;
+		/* + start_watchdog (6) */
+		num_dwords += 6;
 		watchdog_running = true;
         }
 
@@ -1927,10 +1927,7 @@  static int gen8_emit_bb_start(struct i915_request *rq,
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 	*cs++ = MI_NOOP;
 
-	if (watchdog_running) {
-		/* Cancel watchdog timer */
-		cs = engine->emit_stop_watchdog(rq, cs);
-	}
+	// XXX: emit_stop_watchdog happens in gen8_emit_breadcrumb_vcs
 
 	intel_ring_advance(rq, cs);
 	return 0;
@@ -2189,6 +2186,37 @@  static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 }
 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
+static void gen8_emit_breadcrumb_vcs(struct i915_request *request, u32 *cs)
+{
+	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
+	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
+
+	cs = gen8_emit_ggtt_write(cs, request->global_seqno,
+				  intel_hws_seqno_address(request->engine));
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+	// stop_watchdog at the very end of the ring commands
+	if (request->gem_context->__engine[VCS].watchdog_threshold != 0)
+	{
+		/* Cancel watchdog timer */
+		GEM_BUG_ON(!request->engine->emit_stop_watchdog);
+		cs = request->engine->emit_stop_watchdog(request, cs);
+	}
+	else
+	{
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+	}
+
+	request->tail = intel_ring_offset(request, cs);
+	assert_ring_tail_valid(request->ring, request->tail);
+	gen8_emit_wa_tail(request, cs);
+}
+static const int gen8_emit_breadcrumb_vcs_sz = 6 + WA_TAIL_DWORDS + 4; //+4 for optional stop_watchdog
+
 static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
@@ -2306,8 +2334,17 @@  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->request_alloc = execlists_request_alloc;
 
 	engine->emit_flush = gen8_emit_flush;
-	engine->emit_breadcrumb = gen8_emit_breadcrumb;
-	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
+
+	if (engine->id == VCS || engine->id == VCS2)
+	{
+		engine->emit_breadcrumb = gen8_emit_breadcrumb_vcs;
+		engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_vcs_sz;
+	}
+	else
+	{
+		engine->emit_breadcrumb = gen8_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
+	}
 
 	engine->set_default_submission = intel_execlists_set_default_submission;