diff mbox series

[RFC,60/97] drm/i915: Track 'serial' counts for virtual engines

Message ID 20210506191451.77768-61-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Basic GuC submission support in the i915 | expand

Commit Message

Matthew Brost May 6, 2021, 7:14 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The serial number tracking of engines happens at the backend of
request submission and was expecting to only be given physical
engines. However, in GuC submission mode, the decomposition of virtual
to physical engines does not happen in i915. Instead, requests are
submitted to their virtual engine mask all the way through to the
hardware (i.e. to GuC). This would mean that the heartbeat code
thinks the physical engines are idle due to the serial number not
incrementing.

This patch updates the tracking to decompose virtual engines into
their physical constituents and tracks the request against each. This
is not entirely accurate as the GuC will only be issuing the request
to one physical engine. However, it is the best that i915 can do given
that it has no knowledge of the GuC's scheduling decisions.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
 .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
 drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
 drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_request.c              |  4 +++-
 6 files changed, 39 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin May 25, 2021, 10:16 a.m. UTC | #1
On 06/05/2021 20:14, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> The serial number tracking of engines happens at the backend of
> request submission and was expecting to only be given physical
> engines. However, in GuC submission mode, the decomposition of virtual
> to physical engines does not happen in i915. Instead, requests are
> submitted to their virtual engine mask all the way through to the
> hardware (i.e. to GuC). This would mean that the heart beat code
> thinks the physical engines are idle due to the serial number not
> incrementing.
> 
> This patch updates the tracking to decompose virtual engines into
> their physical constituents and tracks the request against each. This
> is not entirely accurate as the GuC will only be issuing the request
> to one physical engine. However, it is the best that i915 can do given
> that it has no knowledge of the GuC's scheduling decisions.

Commit text sounds a bit defeatist. I think instead of making up the 
serial counts, which has downsides (could you please document in the 
commit what they are), we should think about how to design things properly.

Regards,

Tvrtko

> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>   .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>   drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>   drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 ++++++++++++++++
>   drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>   6 files changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 86302e6d86b2..e2b5cda6dbc4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>   	void		(*park)(struct intel_engine_cs *engine);
>   	void		(*unpark)(struct intel_engine_cs *engine);
>   
> +	void		(*bump_serial)(struct intel_engine_cs *engine);
> +
>   	void		(*set_default_submission)(struct intel_engine_cs *engine);
>   
>   	const struct intel_context_ops *cops;
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index ae12d7f19ecd..02880ea5d693 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3199,6 +3199,11 @@ static void execlists_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static void execlist_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   static void
>   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   {
> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   
>   	engine->cops = &execlists_context_ops;
>   	engine->request_alloc = execlists_request_alloc;
> +	engine->bump_serial = execlist_bump_serial;
>   
>   	engine->reset.prepare = execlists_reset_prepare;
>   	engine->reset.rewind = execlists_reset_rewind;
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 14aa31879a37..39dd7c4ed0a9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs *engine)
>   	}
>   }
>   
> +static void ring_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   static void setup_common(struct intel_engine_cs *engine)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
> @@ -1064,6 +1069,7 @@ static void setup_common(struct intel_engine_cs *engine)
>   
>   	engine->cops = &ring_context_ops;
>   	engine->request_alloc = ring_request_alloc;
> +	engine->bump_serial = ring_bump_serial;
>   
>   	/*
>   	 * Using a global execution timeline; the previous final breadcrumb is
> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
> index bd005c1b6fd5..97b10fd60b55 100644
> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
> @@ -292,6 +292,11 @@ static void mock_engine_release(struct intel_engine_cs *engine)
>   	intel_engine_fini_retire(engine);
>   }
>   
> +static void mock_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   				    const char *name,
>   				    int id)
> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   
>   	engine->base.cops = &mock_context_ops;
>   	engine->base.request_alloc = mock_request_alloc;
> +	engine->base.bump_serial = mock_bump_serial;
>   	engine->base.emit_flush = mock_emit_flush;
>   	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>   	engine->base.submit_request = mock_submit_request;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index dc79d287c50a..f0e5731bcef6 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1500,6 +1500,20 @@ static void guc_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static void guc_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_cs *e;
> +	intel_engine_mask_t tmp, mask = engine->mask;
> +
> +	for_each_engine_masked(e, engine->gt, mask, tmp)
> +		e->serial++;
> +}
> +
>   static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   {
>   	/* Default vfuncs which can be overridden by each engine. */
> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   
>   	engine->cops = &guc_context_ops;
>   	engine->request_alloc = guc_request_alloc;
> +	engine->bump_serial = guc_bump_serial;
>   
>   	engine->sched_engine->schedule = i915_schedule;
>   
> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count)
>   
>   	ve->base.cops = &virtual_guc_context_ops;
>   	ve->base.request_alloc = guc_request_alloc;
> +	ve->base.bump_serial = virtual_guc_bump_serial;
>   
>   	ve->base.submit_request = guc_submit_request;
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 9542a5baa45a..127d60b36422 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request *request)
>   				     request->ring->vaddr + request->postfix);
>   
>   	trace_i915_request_execute(request);
> -	engine->serial++;
> +	if (engine->bump_serial)
> +		engine->bump_serial(engine);
> +
>   	result = true;
>   
>   	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
>
Matthew Brost May 25, 2021, 5:52 p.m. UTC | #2
On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
> 
> On 06/05/2021 20:14, Matthew Brost wrote:
> > From: John Harrison <John.C.Harrison@Intel.com>
> > 
> > The serial number tracking of engines happens at the backend of
> > request submission and was expecting to only be given physical
> > engines. However, in GuC submission mode, the decomposition of virtual
> > to physical engines does not happen in i915. Instead, requests are
> > submitted to their virtual engine mask all the way through to the
> > hardware (i.e. to GuC). This would mean that the heart beat code
> > thinks the physical engines are idle due to the serial number not
> > incrementing.
> > 
> > This patch updates the tracking to decompose virtual engines into
> > their physical constituents and tracks the request against each. This
> > is not entirely accurate as the GuC will only be issuing the request
> > to one physical engine. However, it is the best that i915 can do given
> > that it has no knowledge of the GuC's scheduling decisions.
> 
> Commit text sounds a bit defeatist. I think instead of making up the serial
> counts, which has downsides (could you please document in the commit what
> they are), we should think how to design things properly.
> 

IMO, I don't think fixing serial counts is within the scope of this series.
We should focus on getting GuC submission in, not on cleaning up all the crap
that is in the i915. Let's make a note of this though so we can revisit it
later.

Matt

> Regards,
> 
> Tvrtko
> 
> > Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
> >   .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
> >   drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
> >   drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 ++++++++++++++++
> >   drivers/gpu/drm/i915/i915_request.c              |  4 +++-
> >   6 files changed, 39 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 86302e6d86b2..e2b5cda6dbc4 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -389,6 +389,8 @@ struct intel_engine_cs {
> >   	void		(*park)(struct intel_engine_cs *engine);
> >   	void		(*unpark)(struct intel_engine_cs *engine);
> > +	void		(*bump_serial)(struct intel_engine_cs *engine);
> > +
> >   	void		(*set_default_submission)(struct intel_engine_cs *engine);
> >   	const struct intel_context_ops *cops;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > index ae12d7f19ecd..02880ea5d693 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > @@ -3199,6 +3199,11 @@ static void execlists_release(struct intel_engine_cs *engine)
> >   	lrc_fini_wa_ctx(engine);
> >   }
> > +static void execlist_bump_serial(struct intel_engine_cs *engine)
> > +{
> > +	engine->serial++;
> > +}
> > +
> >   static void
> >   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
> >   {
> > @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
> >   	engine->cops = &execlists_context_ops;
> >   	engine->request_alloc = execlists_request_alloc;
> > +	engine->bump_serial = execlist_bump_serial;
> >   	engine->reset.prepare = execlists_reset_prepare;
> >   	engine->reset.rewind = execlists_reset_rewind;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > index 14aa31879a37..39dd7c4ed0a9 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs *engine)
> >   	}
> >   }
> > +static void ring_bump_serial(struct intel_engine_cs *engine)
> > +{
> > +	engine->serial++;
> > +}
> > +
> >   static void setup_common(struct intel_engine_cs *engine)
> >   {
> >   	struct drm_i915_private *i915 = engine->i915;
> > @@ -1064,6 +1069,7 @@ static void setup_common(struct intel_engine_cs *engine)
> >   	engine->cops = &ring_context_ops;
> >   	engine->request_alloc = ring_request_alloc;
> > +	engine->bump_serial = ring_bump_serial;
> >   	/*
> >   	 * Using a global execution timeline; the previous final breadcrumb is
> > diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
> > index bd005c1b6fd5..97b10fd60b55 100644
> > --- a/drivers/gpu/drm/i915/gt/mock_engine.c
> > +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
> > @@ -292,6 +292,11 @@ static void mock_engine_release(struct intel_engine_cs *engine)
> >   	intel_engine_fini_retire(engine);
> >   }
> > +static void mock_bump_serial(struct intel_engine_cs *engine)
> > +{
> > +	engine->serial++;
> > +}
> > +
> >   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
> >   				    const char *name,
> >   				    int id)
> > @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
> >   	engine->base.cops = &mock_context_ops;
> >   	engine->base.request_alloc = mock_request_alloc;
> > +	engine->base.bump_serial = mock_bump_serial;
> >   	engine->base.emit_flush = mock_emit_flush;
> >   	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
> >   	engine->base.submit_request = mock_submit_request;
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index dc79d287c50a..f0e5731bcef6 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -1500,6 +1500,20 @@ static void guc_release(struct intel_engine_cs *engine)
> >   	lrc_fini_wa_ctx(engine);
> >   }
> > +static void guc_bump_serial(struct intel_engine_cs *engine)
> > +{
> > +	engine->serial++;
> > +}
> > +
> > +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
> > +{
> > +	struct intel_engine_cs *e;
> > +	intel_engine_mask_t tmp, mask = engine->mask;
> > +
> > +	for_each_engine_masked(e, engine->gt, mask, tmp)
> > +		e->serial++;
> > +}
> > +
> >   static void guc_default_vfuncs(struct intel_engine_cs *engine)
> >   {
> >   	/* Default vfuncs which can be overridden by each engine. */
> > @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
> >   	engine->cops = &guc_context_ops;
> >   	engine->request_alloc = guc_request_alloc;
> > +	engine->bump_serial = guc_bump_serial;
> >   	engine->sched_engine->schedule = i915_schedule;
> > @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count)
> >   	ve->base.cops = &virtual_guc_context_ops;
> >   	ve->base.request_alloc = guc_request_alloc;
> > +	ve->base.bump_serial = virtual_guc_bump_serial;
> >   	ve->base.submit_request = guc_submit_request;
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index 9542a5baa45a..127d60b36422 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request *request)
> >   				     request->ring->vaddr + request->postfix);
> >   	trace_i915_request_execute(request);
> > -	engine->serial++;
> > +	if (engine->bump_serial)
> > +		engine->bump_serial(engine);
> > +
> >   	result = true;
> >   	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
> >
Tvrtko Ursulin May 26, 2021, 8:40 a.m. UTC | #3
On 25/05/2021 18:52, Matthew Brost wrote:
> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>
>> On 06/05/2021 20:14, Matthew Brost wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> The serial number tracking of engines happens at the backend of
>>> request submission and was expecting to only be given physical
>>> engines. However, in GuC submission mode, the decomposition of virtual
>>> to physical engines does not happen in i915. Instead, requests are
>>> submitted to their virtual engine mask all the way through to the
>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>> thinks the physical engines are idle due to the serial number not
>>> incrementing.
>>>
>>> This patch updates the tracking to decompose virtual engines into
>>> their physical constituents and tracks the request against each. This
>>> is not entirely accurate as the GuC will only be issuing the request
>>> to one physical engine. However, it is the best that i915 can do given
>>> that it has no knowledge of the GuC's scheduling decisions.
>>
>> Commit text sounds a bit defeatist. I think instead of making up the serial
>> counts, which has downsides (could you please document in the commit what
>> they are), we should think how to design things properly.
>>
> 
> IMO, I don't think fixing serial counts is the scope of this series. We
> should focus on getting GuC submission in not cleaning up all the crap
> that is in the i915. Let's make a note of this though so we can revisit
> later.

I will say again - commit message implies it is introducing an 
unspecified downside by not fully fixing an also unspecified issue. It 
is completely reasonable, and customary even, to ask for both to be 
documented in the commit message.

If we are abandoning the normal review process, someone please say so, so
that I don't waste my time reading it.

Regards,

Tvrtko

> Matt
> 
>> Regards,
>>
>> Tvrtko
>>
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>>>    .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>>>    drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>>>    drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 ++++++++++++++++
>>>    drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>>>    6 files changed, 39 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> index 86302e6d86b2..e2b5cda6dbc4 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>>>    	void		(*park)(struct intel_engine_cs *engine);
>>>    	void		(*unpark)(struct intel_engine_cs *engine);
>>> +	void		(*bump_serial)(struct intel_engine_cs *engine);
>>> +
>>>    	void		(*set_default_submission)(struct intel_engine_cs *engine);
>>>    	const struct intel_context_ops *cops;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index ae12d7f19ecd..02880ea5d693 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -3199,6 +3199,11 @@ static void execlists_release(struct intel_engine_cs *engine)
>>>    	lrc_fini_wa_ctx(engine);
>>>    }
>>> +static void execlist_bump_serial(struct intel_engine_cs *engine)
>>> +{
>>> +	engine->serial++;
>>> +}
>>> +
>>>    static void
>>>    logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>    {
>>> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>    	engine->cops = &execlists_context_ops;
>>>    	engine->request_alloc = execlists_request_alloc;
>>> +	engine->bump_serial = execlist_bump_serial;
>>>    	engine->reset.prepare = execlists_reset_prepare;
>>>    	engine->reset.rewind = execlists_reset_rewind;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> index 14aa31879a37..39dd7c4ed0a9 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs *engine)
>>>    	}
>>>    }
>>> +static void ring_bump_serial(struct intel_engine_cs *engine)
>>> +{
>>> +	engine->serial++;
>>> +}
>>> +
>>>    static void setup_common(struct intel_engine_cs *engine)
>>>    {
>>>    	struct drm_i915_private *i915 = engine->i915;
>>> @@ -1064,6 +1069,7 @@ static void setup_common(struct intel_engine_cs *engine)
>>>    	engine->cops = &ring_context_ops;
>>>    	engine->request_alloc = ring_request_alloc;
>>> +	engine->bump_serial = ring_bump_serial;
>>>    	/*
>>>    	 * Using a global execution timeline; the previous final breadcrumb is
>>> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
>>> index bd005c1b6fd5..97b10fd60b55 100644
>>> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
>>> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
>>> @@ -292,6 +292,11 @@ static void mock_engine_release(struct intel_engine_cs *engine)
>>>    	intel_engine_fini_retire(engine);
>>>    }
>>> +static void mock_bump_serial(struct intel_engine_cs *engine)
>>> +{
>>> +	engine->serial++;
>>> +}
>>> +
>>>    struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>>>    				    const char *name,
>>>    				    int id)
>>> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>>>    	engine->base.cops = &mock_context_ops;
>>>    	engine->base.request_alloc = mock_request_alloc;
>>> +	engine->base.bump_serial = mock_bump_serial;
>>>    	engine->base.emit_flush = mock_emit_flush;
>>>    	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>>>    	engine->base.submit_request = mock_submit_request;
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index dc79d287c50a..f0e5731bcef6 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -1500,6 +1500,20 @@ static void guc_release(struct intel_engine_cs *engine)
>>>    	lrc_fini_wa_ctx(engine);
>>>    }
>>> +static void guc_bump_serial(struct intel_engine_cs *engine)
>>> +{
>>> +	engine->serial++;
>>> +}
>>> +
>>> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
>>> +{
>>> +	struct intel_engine_cs *e;
>>> +	intel_engine_mask_t tmp, mask = engine->mask;
>>> +
>>> +	for_each_engine_masked(e, engine->gt, mask, tmp)
>>> +		e->serial++;
>>> +}
>>> +
>>>    static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>>    {
>>>    	/* Default vfuncs which can be overridden by each engine. */
>>> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>>    	engine->cops = &guc_context_ops;
>>>    	engine->request_alloc = guc_request_alloc;
>>> +	engine->bump_serial = guc_bump_serial;
>>>    	engine->sched_engine->schedule = i915_schedule;
>>> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count)
>>>    	ve->base.cops = &virtual_guc_context_ops;
>>>    	ve->base.request_alloc = guc_request_alloc;
>>> +	ve->base.bump_serial = virtual_guc_bump_serial;
>>>    	ve->base.submit_request = guc_submit_request;
>>> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
>>> index 9542a5baa45a..127d60b36422 100644
>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request *request)
>>>    				     request->ring->vaddr + request->postfix);
>>>    	trace_i915_request_execute(request);
>>> -	engine->serial++;
>>> +	if (engine->bump_serial)
>>> +		engine->bump_serial(engine);
>>> +
>>>    	result = true;
>>>    	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
>>>
John Harrison May 26, 2021, 6:45 p.m. UTC | #4
On 5/26/2021 01:40, Tvrtko Ursulin wrote:
> On 25/05/2021 18:52, Matthew Brost wrote:
>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>
>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>> The serial number tracking of engines happens at the backend of
>>>> request submission and was expecting to only be given physical
>>>> engines. However, in GuC submission mode, the decomposition of virtual
>>>> to physical engines does not happen in i915. Instead, requests are
>>>> submitted to their virtual engine mask all the way through to the
>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>> thinks the physical engines are idle due to the serial number not
>>>> incrementing.
>>>>
>>>> This patch updates the tracking to decompose virtual engines into
>>>> their physical constituents and tracks the request against each. This
>>>> is not entirely accurate as the GuC will only be issuing the request
>>>> to one physical engine. However, it is the best that i915 can do given
>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>
>>> Commit text sounds a bit defeatist. I think instead of making up the 
>>> serial
>>> counts, which has downsides (could you please document in the commit 
>>> what
>>> they are), we should think how to design things properly.
>>>
>>
>> IMO, I don't think fixing serial counts is the scope of this series. We
>> should focus on getting GuC submission in not cleaning up all the crap
>> that is in the i915. Let's make a note of this though so we can revisit
>> later.
>
> I will say again - commit message implies it is introducing an 
> unspecified downside by not fully fixing an also unspecified issue. It 
> is completely reasonable, and customary even, to ask for both to be 
> documented in the commit message.
Not sure what exactly is 'unspecified'. I thought the commit message 
described both the problem (heartbeat not running when using virtual 
engines) and the result (heartbeat running on more engines than strictly 
necessary). But in greater detail...

The serial number tracking is a hack for the heartbeat code to know 
whether an engine is busy or idle, and therefore whether it should be 
pinged for aliveness. Whenever a submission is made to an engine, the 
serial number is incremented. The heartbeat code keeps a copy of the 
value. If the value has changed, the engine is busy and needs to be pinged.

This works fine for execlist mode where virtual engine decomposition is 
done inside i915. It fails miserably for GuC mode where the 
decomposition is done by the hardware. The reason being that the 
heartbeat code only looks at physical engines but the serial count is 
only incremented on the virtual engine. Thus, the heartbeat sees 
everything as idle and does not ping.

This patch decomposes the virtual engines for the sake of incrementing 
the serial count on each sub-engine in order to keep the heartbeat code 
happy. The downside is that now the heartbeat sees all sub-engines as 
busy rather than only the one the submission actually ends up on. There 
really isn't much that can be done about that. The heartbeat code is in 
i915 not GuC, the scheduler is in GuC not i915. The only way to improve 
it is to either move the heartbeat code into GuC as well and completely 
disable the i915 side, or add some way for i915 to interrogate GuC as to 
which engines are or are not active. Technically, we do have both. GuC 
has (or at least had) an option to force a context switch on every 
execution quantum pre-emption. However, that is much, much more
heavyweight than the heartbeat. For the latter, we do (almost) have the
engine usage statistics for PMU and suchlike. I'm not sure how much
effort it would be to wire that up to the heartbeat code instead of 
using the serial count.

In short, the serial count is ever so slightly inefficient in that it 
causes heartbeat pings on engines which are idle. On the other hand, it 
is way more efficient and simpler than the current alternatives.

Does that answer the questions?

John.


>
> If we are abandoning the normal review process someone please say so I 
> don't waste my time reading it.
>
> Regards,
>
> Tvrtko
>
>> Matt
>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>>>>    .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>>>>    drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>>>>    drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 
>>>> ++++++++++++++++
>>>>    drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>>>>    6 files changed, 39 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>>>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>> index 86302e6d86b2..e2b5cda6dbc4 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>>>>        void        (*park)(struct intel_engine_cs *engine);
>>>>        void        (*unpark)(struct intel_engine_cs *engine);
>>>> +    void        (*bump_serial)(struct intel_engine_cs *engine);
>>>> +
>>>>        void        (*set_default_submission)(struct intel_engine_cs 
>>>> *engine);
>>>>        const struct intel_context_ops *cops;
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
>>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>> index ae12d7f19ecd..02880ea5d693 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>> @@ -3199,6 +3199,11 @@ static void execlists_release(struct 
>>>> intel_engine_cs *engine)
>>>>        lrc_fini_wa_ctx(engine);
>>>>    }
>>>> +static void execlist_bump_serial(struct intel_engine_cs *engine)
>>>> +{
>>>> +    engine->serial++;
>>>> +}
>>>> +
>>>>    static void
>>>>    logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>>    {
>>>> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct 
>>>> intel_engine_cs *engine)
>>>>        engine->cops = &execlists_context_ops;
>>>>        engine->request_alloc = execlists_request_alloc;
>>>> +    engine->bump_serial = execlist_bump_serial;
>>>>        engine->reset.prepare = execlists_reset_prepare;
>>>>        engine->reset.rewind = execlists_reset_rewind;
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c 
>>>> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>> index 14aa31879a37..39dd7c4ed0a9 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>> @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs 
>>>> *engine)
>>>>        }
>>>>    }
>>>> +static void ring_bump_serial(struct intel_engine_cs *engine)
>>>> +{
>>>> +    engine->serial++;
>>>> +}
>>>> +
>>>>    static void setup_common(struct intel_engine_cs *engine)
>>>>    {
>>>>        struct drm_i915_private *i915 = engine->i915;
>>>> @@ -1064,6 +1069,7 @@ static void setup_common(struct 
>>>> intel_engine_cs *engine)
>>>>        engine->cops = &ring_context_ops;
>>>>        engine->request_alloc = ring_request_alloc;
>>>> +    engine->bump_serial = ring_bump_serial;
>>>>        /*
>>>>         * Using a global execution timeline; the previous final 
>>>> breadcrumb is
>>>> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c 
>>>> b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>> index bd005c1b6fd5..97b10fd60b55 100644
>>>> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
>>>> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>> @@ -292,6 +292,11 @@ static void mock_engine_release(struct 
>>>> intel_engine_cs *engine)
>>>>        intel_engine_fini_retire(engine);
>>>>    }
>>>> +static void mock_bump_serial(struct intel_engine_cs *engine)
>>>> +{
>>>> +    engine->serial++;
>>>> +}
>>>> +
>>>>    struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>>>>                        const char *name,
>>>>                        int id)
>>>> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct 
>>>> drm_i915_private *i915,
>>>>        engine->base.cops = &mock_context_ops;
>>>>        engine->base.request_alloc = mock_request_alloc;
>>>> +    engine->base.bump_serial = mock_bump_serial;
>>>>        engine->base.emit_flush = mock_emit_flush;
>>>>        engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>>>>        engine->base.submit_request = mock_submit_request;
>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> index dc79d287c50a..f0e5731bcef6 100644
>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> @@ -1500,6 +1500,20 @@ static void guc_release(struct 
>>>> intel_engine_cs *engine)
>>>>        lrc_fini_wa_ctx(engine);
>>>>    }
>>>> +static void guc_bump_serial(struct intel_engine_cs *engine)
>>>> +{
>>>> +    engine->serial++;
>>>> +}
>>>> +
>>>> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
>>>> +{
>>>> +    struct intel_engine_cs *e;
>>>> +    intel_engine_mask_t tmp, mask = engine->mask;
>>>> +
>>>> +    for_each_engine_masked(e, engine->gt, mask, tmp)
>>>> +        e->serial++;
>>>> +}
>>>> +
>>>>    static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>>>    {
>>>>        /* Default vfuncs which can be overridden by each engine. */
>>>> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct 
>>>> intel_engine_cs *engine)
>>>>        engine->cops = &guc_context_ops;
>>>>        engine->request_alloc = guc_request_alloc;
>>>> +    engine->bump_serial = guc_bump_serial;
>>>>        engine->sched_engine->schedule = i915_schedule;
>>>> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs 
>>>> **siblings, unsigned int count)
>>>>        ve->base.cops = &virtual_guc_context_ops;
>>>>        ve->base.request_alloc = guc_request_alloc;
>>>> +    ve->base.bump_serial = virtual_guc_bump_serial;
>>>>        ve->base.submit_request = guc_submit_request;
>>>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>>>> b/drivers/gpu/drm/i915/i915_request.c
>>>> index 9542a5baa45a..127d60b36422 100644
>>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>>> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request 
>>>> *request)
>>>>                         request->ring->vaddr + request->postfix);
>>>>        trace_i915_request_execute(request);
>>>> -    engine->serial++;
>>>> +    if (engine->bump_serial)
>>>> +        engine->bump_serial(engine);
>>>> +
>>>>        result = true;
>>>>        GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, 
>>>> &request->fence.flags));
>>>>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Tvrtko Ursulin May 27, 2021, 8:53 a.m. UTC | #5
On 26/05/2021 19:45, John Harrison wrote:
> On 5/26/2021 01:40, Tvrtko Ursulin wrote:
>> On 25/05/2021 18:52, Matthew Brost wrote:
>>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>
>>>>> The serial number tracking of engines happens at the backend of
>>>>> request submission and was expecting to only be given physical
>>>>> engines. However, in GuC submission mode, the decomposition of virtual
>>>>> to physical engines does not happen in i915. Instead, requests are
>>>>> submitted to their virtual engine mask all the way through to the
>>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>>> thinks the physical engines are idle due to the serial number not
>>>>> incrementing.
>>>>>
>>>>> This patch updates the tracking to decompose virtual engines into
>>>>> their physical constituents and tracks the request against each. This
>>>>> is not entirely accurate as the GuC will only be issuing the request
>>>>> to one physical engine. However, it is the best that i915 can do given
>>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>>
>>>> Commit text sounds a bit defeatist. I think instead of making up the 
>>>> serial
>>>> counts, which has downsides (could you please document in the commit 
>>>> what
>>>> they are), we should think how to design things properly.
>>>>
>>>
>>> IMO, I don't think fixing serial counts is the scope of this series. We
>>> should focus on getting GuC submission in not cleaning up all the crap
>>> that is in the i915. Let's make a note of this though so we can revisit
>>> later.
>>
>> I will say again - commit message implies it is introducing an 
>> unspecified downside by not fully fixing an also unspecified issue. It 
>> is completely reasonable, and customary even, to ask for both to be 
>> documented in the commit message.
> Not sure what exactly is 'unspecified'. I thought the commit message 
> described both the problem (heartbeat not running when using virtual 
> engines) and the result (heartbeat running on more engines than strictly 
> necessary). But in greater detail...
> 
> The serial number tracking is a hack for the heartbeat code to know 
> whether an engine is busy or idle, and therefore whether it should be 
> pinged for aliveness. Whenever a submission is made to an engine, the 
> serial number is incremented. The heartbeat code keeps a copy of the 
> value. If the value has changed, the engine is busy and needs to be pinged.
> 
> This works fine for execlist mode where virtual engine decomposition is 
> done inside i915. It fails miserably for GuC mode where the 
> decomposition is done by the hardware. The reason being that the 
> heartbeat code only looks at physical engines but the serial count is 
> only incremented on the virtual engine. Thus, the heartbeat sees 
> everything as idle and does not ping.

So hangcheck does not work. Or it works because GuC does it anyway. 
Either way, that's one thing to explicitly state in the commit message.

> This patch decomposes the virtual engines for the sake of incrementing 
> the serial count on each sub-engine in order to keep the heartbeat code 
> happy. The downside is that now the heartbeat sees all sub-engines as 
> busy rather than only the one the submission actually ends up on. There 
> really isn't much that can be done about that. The heartbeat code is in 
> i915 not GuC, the scheduler is in GuC not i915. The only way to improve 
> it is to either move the heartbeat code into GuC as well and completely 
> disable the i915 side, or add some way for i915 to interrogate GuC as to 
> which engines are or are not active. Technically, we do have both. GuC 
> has (or at least had) an option to force a context switch on every 
> execution quantum pre-emption. However, that is much, much, more heavy 
> weight than the heartbeat. For the latter, we do (almost) have the 
> engine usage statistics for PMU and such like. I'm not sure how much 
> effort it would be to wire that up to the heartbeat code instead of 
> using the serial count.
> 
> In short, the serial count is ever so slightly inefficient in that it 
> causes heartbeat pings on engines which are idle. On the other hand, it 
> is way more efficient and simpler than the current alternatives.

And the hack to make hangcheck work creates this inefficiency where 
heartbeats are sent to idle engines. Which is probably fine; it just 
needs to be explained.

> Does that answer the questions?

With the two points I re-raise clearly explained, possibly even patch 
title changed, yeah. I just want it to be more easily obvious to the 
patch reader what it is functionally about - not just what 
implementation details have been changed, but why as well.

Regards,

Tvrtko

> John.
> 
> 
>>
>> If we are abandoning the normal review process someone please say so I 
>> don't waste my time reading it.
>>
>> Regards,
>>
>> Tvrtko
>>
>>> Matt
>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>>>>>    .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>>>>>    drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>>>>>    drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>>>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 
>>>>> ++++++++++++++++
>>>>>    drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>>>>>    6 files changed, 39 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>>>>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>> index 86302e6d86b2..e2b5cda6dbc4 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>>>>>        void        (*park)(struct intel_engine_cs *engine);
>>>>>        void        (*unpark)(struct intel_engine_cs *engine);
>>>>> +    void        (*bump_serial)(struct intel_engine_cs *engine);
>>>>> +
>>>>>        void        (*set_default_submission)(struct intel_engine_cs 
>>>>> *engine);
>>>>>        const struct intel_context_ops *cops;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
>>>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>> index ae12d7f19ecd..02880ea5d693 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>> @@ -3199,6 +3199,11 @@ static void execlists_release(struct 
>>>>> intel_engine_cs *engine)
>>>>>        lrc_fini_wa_ctx(engine);
>>>>>    }
>>>>> +static void execlist_bump_serial(struct intel_engine_cs *engine)
>>>>> +{
>>>>> +    engine->serial++;
>>>>> +}
>>>>> +
>>>>>    static void
>>>>>    logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>>>    {
>>>>> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct 
>>>>> intel_engine_cs *engine)
>>>>>        engine->cops = &execlists_context_ops;
>>>>>        engine->request_alloc = execlists_request_alloc;
>>>>> +    engine->bump_serial = execlist_bump_serial;
>>>>>        engine->reset.prepare = execlists_reset_prepare;
>>>>>        engine->reset.rewind = execlists_reset_rewind;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c 
>>>>> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>> index 14aa31879a37..39dd7c4ed0a9 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>> @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs 
>>>>> *engine)
>>>>>        }
>>>>>    }
>>>>> +static void ring_bump_serial(struct intel_engine_cs *engine)
>>>>> +{
>>>>> +    engine->serial++;
>>>>> +}
>>>>> +
>>>>>    static void setup_common(struct intel_engine_cs *engine)
>>>>>    {
>>>>>        struct drm_i915_private *i915 = engine->i915;
>>>>> @@ -1064,6 +1069,7 @@ static void setup_common(struct 
>>>>> intel_engine_cs *engine)
>>>>>        engine->cops = &ring_context_ops;
>>>>>        engine->request_alloc = ring_request_alloc;
>>>>> +    engine->bump_serial = ring_bump_serial;
>>>>>        /*
>>>>>         * Using a global execution timeline; the previous final 
>>>>> breadcrumb is
>>>>> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c 
>>>>> b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>> index bd005c1b6fd5..97b10fd60b55 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>> @@ -292,6 +292,11 @@ static void mock_engine_release(struct 
>>>>> intel_engine_cs *engine)
>>>>>        intel_engine_fini_retire(engine);
>>>>>    }
>>>>> +static void mock_bump_serial(struct intel_engine_cs *engine)
>>>>> +{
>>>>> +    engine->serial++;
>>>>> +}
>>>>> +
>>>>>    struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>>>>>                        const char *name,
>>>>>                        int id)
>>>>> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct 
>>>>> drm_i915_private *i915,
>>>>>        engine->base.cops = &mock_context_ops;
>>>>>        engine->base.request_alloc = mock_request_alloc;
>>>>> +    engine->base.bump_serial = mock_bump_serial;
>>>>>        engine->base.emit_flush = mock_emit_flush;
>>>>>        engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>>>>>        engine->base.submit_request = mock_submit_request;
>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> index dc79d287c50a..f0e5731bcef6 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>> @@ -1500,6 +1500,20 @@ static void guc_release(struct 
>>>>> intel_engine_cs *engine)
>>>>>        lrc_fini_wa_ctx(engine);
>>>>>    }
>>>>> +static void guc_bump_serial(struct intel_engine_cs *engine)
>>>>> +{
>>>>> +    engine->serial++;
>>>>> +}
>>>>> +
>>>>> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
>>>>> +{
>>>>> +    struct intel_engine_cs *e;
>>>>> +    intel_engine_mask_t tmp, mask = engine->mask;
>>>>> +
>>>>> +    for_each_engine_masked(e, engine->gt, mask, tmp)
>>>>> +        e->serial++;
>>>>> +}
>>>>> +
>>>>>    static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>>>>    {
>>>>>        /* Default vfuncs which can be overridden by each engine. */
>>>>> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct 
>>>>> intel_engine_cs *engine)
>>>>>        engine->cops = &guc_context_ops;
>>>>>        engine->request_alloc = guc_request_alloc;
>>>>> +    engine->bump_serial = guc_bump_serial;
>>>>>        engine->sched_engine->schedule = i915_schedule;
>>>>> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs 
>>>>> **siblings, unsigned int count)
>>>>>        ve->base.cops = &virtual_guc_context_ops;
>>>>>        ve->base.request_alloc = guc_request_alloc;
>>>>> +    ve->base.bump_serial = virtual_guc_bump_serial;
>>>>>        ve->base.submit_request = guc_submit_request;
>>>>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>>>>> b/drivers/gpu/drm/i915/i915_request.c
>>>>> index 9542a5baa45a..127d60b36422 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>>>> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request 
>>>>> *request)
>>>>>                         request->ring->vaddr + request->postfix);
>>>>>        trace_i915_request_execute(request);
>>>>> -    engine->serial++;
>>>>> +    if (engine->bump_serial)
>>>>> +        engine->bump_serial(engine);
>>>>> +
>>>>>        result = true;
>>>>>        GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, 
>>>>> &request->fence.flags));
>>>>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
John Harrison May 27, 2021, 5:01 p.m. UTC | #6
On 5/27/2021 01:53, Tvrtko Ursulin wrote:
> On 26/05/2021 19:45, John Harrison wrote:
>> On 5/26/2021 01:40, Tvrtko Ursulin wrote:
>>> On 25/05/2021 18:52, Matthew Brost wrote:
>>>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>
>>>>>> The serial number tracking of engines happens at the backend of
>>>>>> request submission and was expecting to only be given physical
>>>>>> engines. However, in GuC submission mode, the decomposition of 
>>>>>> virtual
>>>>>> to physical engines does not happen in i915. Instead, requests are
>>>>>> submitted to their virtual engine mask all the way through to the
>>>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>>>> thinks the physical engines are idle due to the serial number not
>>>>>> incrementing.
>>>>>>
>>>>>> This patch updates the tracking to decompose virtual engines into
>>>>>> their physical constituents and tracks the request against each. 
>>>>>> This
>>>>>> is not entirely accurate as the GuC will only be issuing the request
>>>>>> to one physical engine. However, it is the best that i915 can do 
>>>>>> given
>>>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>>>
>>>>> Commit text sounds a bit defeatist. I think instead of making up 
>>>>> the serial
>>>>> counts, which has downsides (could you please document in the 
>>>>> commit what
>>>>> they are), we should think how to design things properly.
>>>>>
>>>>
>>>> IMO, I don't think fixing serial counts is the scope of this 
>>>> series. We
>>>> should focus on getting GuC submission in not cleaning up all the crap
>>>> that is in the i915. Let's make a note of this though so we can 
>>>> revisit
>>>> later.
>>>
>>> I will say again - commit message implies it is introducing an 
>>> unspecified downside by not fully fixing an also unspecified issue. 
>>> It is completely reasonable, and customary even, to ask for both to 
>>> be documented in the commit message.
>> Not sure what exactly is 'unspecified'. I thought the commit message 
>> described both the problem (heartbeat not running when using virtual 
>> engines) and the result (heartbeat running on more engines than 
>> strictly necessary). But in greater detail...
>>
>> The serial number tracking is a hack for the heartbeat code to know 
>> whether an engine is busy or idle, and therefore whether it should be 
>> pinged for aliveness. Whenever a submission is made to an engine, the 
>> serial number is incremented. The heartbeat code keeps a copy of the 
>> value. If the value has changed, the engine is busy and needs to be 
>> pinged.
>>
>> This works fine for execlist mode where virtual engine decomposition 
>> is done inside i915. It fails miserably for GuC mode where the 
>> decomposition is done by the hardware. The reason being that the 
>> heartbeat code only looks at physical engines but the serial count is 
>> only incremented on the virtual engine. Thus, the heartbeat sees 
>> everything as idle and does not ping.
>
> So hangcheck does not work. Or it works because GuC does it anyway. 
> Either way, that's one thing to explicitly state in the commit message.
>
>> This patch decomposes the virtual engines for the sake of 
>> incrementing the serial count on each sub-engine in order to keep the 
>> heartbeat code happy. The downside is that now the heartbeat sees all 
>> sub-engines as busy rather than only the one the submission actually 
>> ends up on. There really isn't much that can be done about that. The 
>> heartbeat code is in i915 not GuC, the scheduler is in GuC not i915. 
>> The only way to improve it is to either move the heartbeat code into 
>> GuC as well and completely disable the i915 side, or add some way for 
>> i915 to interrogate GuC as to which engines are or are not active. 
>> Technically, we do have both. GuC has (or at least had) an option to 
>> force a context switch on every execution quantum pre-emption. 
>> However, that is much, much, more heavy weight than the heartbeat. 
>> For the latter, we do (almost) have the engine usage statistics for 
>> PMU and such like. I'm not sure how much effort it would be to wire 
>> that up to the heartbeat code instead of using the serial count.
>>
>> In short, the serial count is ever so slightly inefficient in that it 
>> causes heartbeat pings on engines which are idle. On the other hand, 
>> it is way more efficient and simpler than the current alternatives.
>
> And the hack to make hangcheck work creates this inefficiency where 
> heartbeats are sent to idle engines. Which is probably fine just needs 
> to be explained.
>
>> Does that answer the questions?
>
> With the two points I re-raise clearly explained, possibly even patch 
> title changed, yeah. I am just wanting for it to be more easily 
> obvious to patch reader what it is functionally about - not just what 
> implementation details have been change but why as well.
>
My understanding is that we don't explain every piece of code in minute 
detail in every checkin email that touches it. I thought my description 
was already pretty verbose. I've certainly seen way less informative 
checkins that apparently made it through review without issue.

Regarding the problem statement, I thought it was fairly clear that 
the heartbeat was broken for virtual engines:

    This would mean that the heart beat code
    thinks the physical engines are idle due to the serial number not
    incrementing.


Regarding the inefficiency of heartbeating all physical engines in a 
virtual engine, again, this seems clear to me:

    decompose virtual engines into
    their physical constituents and tracks the request against each. This
    is not entirely accurate as the GuC will only be issuing the request
    to one physical engine.


For the subject, I guess you could say "Track 'heartbeat serial' counts 
for virtual engines". However, the serial tracking count is not 
explicitly named for heartbeats so it seems inaccurate to rename it for 
a checkin email subject.

If you have a suggestion for better wording then feel free to propose 
something.

John.


> Regards,
>
> Tvrtko
>
>> John.
>>
>>
>>>
>>> If we are abandoning the normal review process someone please say so 
>>> I don't waste my time reading it.
>>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>>> Matt
>>>>
>>>>> Regards,
>>>>>
>>>>> Tvrtko
>>>>>
>>>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>>>>>>    .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>>>>>>    drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>>>>>>    drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>>>>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 
>>>>>> ++++++++++++++++
>>>>>>    drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>>>>>>    6 files changed, 39 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>>>>>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>>> index 86302e6d86b2..e2b5cda6dbc4 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>>>>> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>>>>>>        void        (*park)(struct intel_engine_cs *engine);
>>>>>>        void        (*unpark)(struct intel_engine_cs *engine);
>>>>>> +    void        (*bump_serial)(struct intel_engine_cs *engine);
>>>>>> +
>>>>>>        void        (*set_default_submission)(struct 
>>>>>> intel_engine_cs *engine);
>>>>>>        const struct intel_context_ops *cops;
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
>>>>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>>> index ae12d7f19ecd..02880ea5d693 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>>>>> @@ -3199,6 +3199,11 @@ static void execlists_release(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        lrc_fini_wa_ctx(engine);
>>>>>>    }
>>>>>> +static void execlist_bump_serial(struct intel_engine_cs *engine)
>>>>>> +{
>>>>>> +    engine->serial++;
>>>>>> +}
>>>>>> +
>>>>>>    static void
>>>>>>    logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>>>>>>    {
>>>>>> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        engine->cops = &execlists_context_ops;
>>>>>>        engine->request_alloc = execlists_request_alloc;
>>>>>> +    engine->bump_serial = execlist_bump_serial;
>>>>>>        engine->reset.prepare = execlists_reset_prepare;
>>>>>>        engine->reset.rewind = execlists_reset_rewind;
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c 
>>>>>> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>>> index 14aa31879a37..39dd7c4ed0a9 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>>>>> @@ -1045,6 +1045,11 @@ static void setup_irq(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        }
>>>>>>    }
>>>>>> +static void ring_bump_serial(struct intel_engine_cs *engine)
>>>>>> +{
>>>>>> +    engine->serial++;
>>>>>> +}
>>>>>> +
>>>>>>    static void setup_common(struct intel_engine_cs *engine)
>>>>>>    {
>>>>>>        struct drm_i915_private *i915 = engine->i915;
>>>>>> @@ -1064,6 +1069,7 @@ static void setup_common(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        engine->cops = &ring_context_ops;
>>>>>>        engine->request_alloc = ring_request_alloc;
>>>>>> +    engine->bump_serial = ring_bump_serial;
>>>>>>        /*
>>>>>>         * Using a global execution timeline; the previous final 
>>>>>> breadcrumb is
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c 
>>>>>> b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>>> index bd005c1b6fd5..97b10fd60b55 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
>>>>>> @@ -292,6 +292,11 @@ static void mock_engine_release(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        intel_engine_fini_retire(engine);
>>>>>>    }
>>>>>> +static void mock_bump_serial(struct intel_engine_cs *engine)
>>>>>> +{
>>>>>> +    engine->serial++;
>>>>>> +}
>>>>>> +
>>>>>>    struct intel_engine_cs *mock_engine(struct drm_i915_private 
>>>>>> *i915,
>>>>>>                        const char *name,
>>>>>>                        int id)
>>>>>> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct 
>>>>>> drm_i915_private *i915,
>>>>>>        engine->base.cops = &mock_context_ops;
>>>>>>        engine->base.request_alloc = mock_request_alloc;
>>>>>> +    engine->base.bump_serial = mock_bump_serial;
>>>>>>        engine->base.emit_flush = mock_emit_flush;
>>>>>>        engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>>>>>>        engine->base.submit_request = mock_submit_request;
>>>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>> index dc79d287c50a..f0e5731bcef6 100644
>>>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>> @@ -1500,6 +1500,20 @@ static void guc_release(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        lrc_fini_wa_ctx(engine);
>>>>>>    }
>>>>>> +static void guc_bump_serial(struct intel_engine_cs *engine)
>>>>>> +{
>>>>>> +    engine->serial++;
>>>>>> +}
>>>>>> +
>>>>>> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
>>>>>> +{
>>>>>> +    struct intel_engine_cs *e;
>>>>>> +    intel_engine_mask_t tmp, mask = engine->mask;
>>>>>> +
>>>>>> +    for_each_engine_masked(e, engine->gt, mask, tmp)
>>>>>> +        e->serial++;
>>>>>> +}
>>>>>> +
>>>>>>    static void guc_default_vfuncs(struct intel_engine_cs *engine)
>>>>>>    {
>>>>>>        /* Default vfuncs which can be overridden by each engine. */
>>>>>> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct 
>>>>>> intel_engine_cs *engine)
>>>>>>        engine->cops = &guc_context_ops;
>>>>>>        engine->request_alloc = guc_request_alloc;
>>>>>> +    engine->bump_serial = guc_bump_serial;
>>>>>>        engine->sched_engine->schedule = i915_schedule;
>>>>>> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs 
>>>>>> **siblings, unsigned int count)
>>>>>>        ve->base.cops = &virtual_guc_context_ops;
>>>>>>        ve->base.request_alloc = guc_request_alloc;
>>>>>> +    ve->base.bump_serial = virtual_guc_bump_serial;
>>>>>>        ve->base.submit_request = guc_submit_request;
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>>>>>> b/drivers/gpu/drm/i915/i915_request.c
>>>>>> index 9542a5baa45a..127d60b36422 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>>>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>>>>> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct 
>>>>>> i915_request *request)
>>>>>>                         request->ring->vaddr + request->postfix);
>>>>>>        trace_i915_request_execute(request);
>>>>>> -    engine->serial++;
>>>>>> +    if (engine->bump_serial)
>>>>>> +        engine->bump_serial(engine);
>>>>>> +
>>>>>>        result = true;
>>>>>>        GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, 
>>>>>> &request->fence.flags));
>>>>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
>>
Tvrtko Ursulin June 1, 2021, 9:31 a.m. UTC | #7
On 27/05/2021 18:01, John Harrison wrote:
> On 5/27/2021 01:53, Tvrtko Ursulin wrote:
>> On 26/05/2021 19:45, John Harrison wrote:
>>> On 5/26/2021 01:40, Tvrtko Ursulin wrote:
>>>> On 25/05/2021 18:52, Matthew Brost wrote:
>>>>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>>>>
>>>>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>
>>>>>>> The serial number tracking of engines happens at the backend of
>>>>>>> request submission and was expecting to only be given physical
>>>>>>> engines. However, in GuC submission mode, the decomposition of 
>>>>>>> virtual
>>>>>>> to physical engines does not happen in i915. Instead, requests are
>>>>>>> submitted to their virtual engine mask all the way through to the
>>>>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>>>>> thinks the physical engines are idle due to the serial number not
>>>>>>> incrementing.
>>>>>>>
>>>>>>> This patch updates the tracking to decompose virtual engines into
>>>>>>> their physical constituents and tracks the request against each. 
>>>>>>> This
>>>>>>> is not entirely accurate as the GuC will only be issuing the request
>>>>>>> to one physical engine. However, it is the best that i915 can do 
>>>>>>> given
>>>>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>>>>
>>>>>> Commit text sounds a bit defeatist. I think instead of making up 
>>>>>> the serial
>>>>>> counts, which has downsides (could you please document in the 
>>>>>> commit what
>>>>>> they are), we should think how to design things properly.
>>>>>>
>>>>>
>>>>> IMO, I don't think fixing serial counts is the scope of this 
>>>>> series. We
>>>>> should focus on getting GuC submission in not cleaning up all the crap
>>>>> that is in the i915. Let's make a note of this though so we can 
>>>>> revisit
>>>>> later.
>>>>
>>>> I will say again - commit message implies it is introducing an 
>>>> unspecified downside by not fully fixing an also unspecified issue. 
>>>> It is completely reasonable, and customary even, to ask for both to 
>>>> be documented in the commit message.
>>> Not sure what exactly is 'unspecified'. I thought the commit message 
>>> described both the problem (heartbeat not running when using virtual 
>>> engines) and the result (heartbeat running on more engines than 
>>> strictly necessary). But in greater detail...
>>>
>>> The serial number tracking is a hack for the heartbeat code to know 
>>> whether an engine is busy or idle, and therefore whether it should be 
>>> pinged for aliveness. Whenever a submission is made to an engine, the 
>>> serial number is incremented. The heartbeat code keeps a copy of the 
>>> value. If the value has changed, the engine is busy and needs to be 
>>> pinged.
>>>
>>> This works fine for execlist mode where virtual engine decomposition 
>>> is done inside i915. It fails miserably for GuC mode where the 
>>> decomposition is done by the hardware. The reason being that the 
>>> heartbeat code only looks at physical engines but the serial count is 
>>> only incremented on the virtual engine. Thus, the heartbeat sees 
>>> everything as idle and does not ping.
>>
>> So hangcheck does not work. Or it works because GuC does it anyway. 
>> Either way, that's one thing to explicitly state in the commit message.
>>
>>> This patch decomposes the virtual engines for the sake of 
>>> incrementing the serial count on each sub-engine in order to keep the 
>>> heartbeat code happy. The downside is that now the heartbeat sees all 
>>> sub-engines as busy rather than only the one the submission actually 
>>> ends up on. There really isn't much that can be done about that. The 
>>> heartbeat code is in i915 not GuC, the scheduler is in GuC not i915. 
>>> The only way to improve it is to either move the heartbeat code into 
>>> GuC as well and completely disable the i915 side, or add some way for 
>>> i915 to interrogate GuC as to which engines are or are not active. 
>>> Technically, we do have both. GuC has (or at least had) an option to 
>>> force a context switch on every execution quantum pre-emption. 
>>> However, that is much, much, more heavy weight than the heartbeat. 
>>> For the latter, we do (almost) have the engine usage statistics for 
>>> PMU and such like. I'm not sure how much effort it would be to wire 
>>> that up to the heartbeat code instead of using the serial count.
>>>
>>> In short, the serial count is ever so slightly inefficient in that it 
>>> causes heartbeat pings on engines which are idle. On the other hand, 
>>> it is way more efficient and simpler than the current alternatives.
>>
>> And the hack to make hangcheck work creates this inefficiency where 
>> heartbeats are sent to idle engines. Which is probably fine just needs 
>> to be explained.
>>
>>> Does that answer the questions?
>>
>> With the two points I re-raise clearly explained, possibly even patch 
>> title changed, yeah. I am just wanting for it to be more easily 
>> obvious to patch reader what it is functionally about - not just what 
>> implementation details have been change but why as well.
>>
> My understanding is that we don't explain every piece of code in minute 
> detail in every checkin email that touches it. I thought my description 
> was already pretty verbose. I've certainly seen way less informative 
> checkins that apparently made it through review without issue.
> 
> Regarding the problem statement, I thought this was fairly clear that 
> the heartbeat was broken for virtual engines:
> 
>     This would mean that the heart beat code
>     thinks the physical engines are idle due to the serial number not
>     incrementing.
> 
> 
> Regarding the inefficiency about heartbeating all physical engines in a 
> virtual engine, again, this seems clear to me:
> 
>     decompose virtual engines into
>     their physical constituents and tracks the request against each. This
>     is not entirely accurate as the GuC will only be issuing the request
>     to one physical engine.
> 
> 
> For the subject, I guess you could say "Track 'heartbeat serial' counts 
> for virtual engines". However, the serial tracking count is not 
> explicitly named for heartbeats so it seems inaccurate to rename it for 
> a checkin email subject.
> 
> If you have a suggestion for better wording then feel free to propose 
> something.

Sigh, I am not asking for more low-level detail but for more to-the-point
high-level naming and a high-level description.

"drm/i915: Fix hangcheck for guc virtual engines"

"..Blah blah, but hack because it is not ideal due to xyz which needlessly
wakes up all engines which has an effect on power yes/no? Latency?
Throughput when high prio pulse triggers pointless preemption?"

Also, can we fix it properly without introducing inefficiencies? Do we 
even need heartbeats when GuC is in charge of engine resets? And if we 
do can we make them work better?

Regards,

Tvrtko
John Harrison June 2, 2021, 1:20 a.m. UTC | #8
On 6/1/2021 02:31, Tvrtko Ursulin wrote:
> On 27/05/2021 18:01, John Harrison wrote:
>> On 5/27/2021 01:53, Tvrtko Ursulin wrote:
>>> On 26/05/2021 19:45, John Harrison wrote:
>>>> On 5/26/2021 01:40, Tvrtko Ursulin wrote:
>>>>> On 25/05/2021 18:52, Matthew Brost wrote:
>>>>>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>>
>>>>>>>> The serial number tracking of engines happens at the backend of
>>>>>>>> request submission and was expecting to only be given physical
>>>>>>>> engines. However, in GuC submission mode, the decomposition of 
>>>>>>>> virtual
>>>>>>>> to physical engines does not happen in i915. Instead, requests are
>>>>>>>> submitted to their virtual engine mask all the way through to the
>>>>>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>>>>>> thinks the physical engines are idle due to the serial number not
>>>>>>>> incrementing.
>>>>>>>>
>>>>>>>> This patch updates the tracking to decompose virtual engines into
>>>>>>>> their physical constituents and tracks the request against 
>>>>>>>> each. This
>>>>>>>> is not entirely accurate as the GuC will only be issuing the 
>>>>>>>> request
>>>>>>>> to one physical engine. However, it is the best that i915 can 
>>>>>>>> do given
>>>>>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>>>>>
>>>>>>> Commit text sounds a bit defeatist. I think instead of making up 
>>>>>>> the serial
>>>>>>> counts, which has downsides (could you please document in the 
>>>>>>> commit what
>>>>>>> they are), we should think how to design things properly.
>>>>>>>
>>>>>>
>>>>>> IMO, I don't think fixing serial counts is the scope of this 
>>>>>> series. We
>>>>>> should focus on getting GuC submission in not cleaning up all the 
>>>>>> crap
>>>>>> that is in the i915. Let's make a note of this though so we can 
>>>>>> revisit
>>>>>> later.
>>>>>
>>>>> I will say again - commit message implies it is introducing an 
>>>>> unspecified downside by not fully fixing an also unspecified 
>>>>> issue. It is completely reasonable, and customary even, to ask for 
>>>>> both to be documented in the commit message.
>>>> Not sure what exactly is 'unspecified'. I thought the commit 
>>>> message described both the problem (heartbeat not running when 
>>>> using virtual engines) and the result (heartbeat running on more 
>>>> engines than strictly necessary). But in greater detail...
>>>>
>>>> The serial number tracking is a hack for the heartbeat code to know 
>>>> whether an engine is busy or idle, and therefore whether it should 
>>>> be pinged for aliveness. Whenever a submission is made to an 
>>>> engine, the serial number is incremented. The heartbeat code keeps 
>>>> a copy of the value. If the value has changed, the engine is busy 
>>>> and needs to be pinged.
>>>>
>>>> This works fine for execlist mode where virtual engine 
>>>> decomposition is done inside i915. It fails miserably for GuC mode 
>>>> where the decomposition is done by the hardware. The reason being 
>>>> that the heartbeat code only looks at physical engines but the 
>>>> serial count is only incremented on the virtual engine. Thus, the 
>>>> heartbeat sees everything as idle and does not ping.
>>>
>>> So hangcheck does not work. Or it works because GuC does it anyway. 
>>> Either way, that's one thing to explicitly state in the commit message.
>>>
>>>> This patch decomposes the virtual engines for the sake of 
>>>> incrementing the serial count on each sub-engine in order to keep 
>>>> the heartbeat code happy. The downside is that now the heartbeat 
>>>> sees all sub-engines as busy rather than only the one the 
>>>> submission actually ends up on. There really isn't much that can be 
>>>> done about that. The heartbeat code is in i915 not GuC, the 
>>>> scheduler is in GuC not i915. The only way to improve it is to 
>>>> either move the heartbeat code into GuC as well and completely 
>>>> disable the i915 side, or add some way for i915 to interrogate GuC 
>>>> as to which engines are or are not active. Technically, we do have 
>>>> both. GuC has (or at least had) an option to force a context switch 
>>>> on every execution quantum pre-emption. However, that is much, 
>>>> much, more heavy weight than the heartbeat. For the latter, we do 
>>>> (almost) have the engine usage statistics for PMU and such like. 
>>>> I'm not sure how much effort it would be to wire that up to the 
>>>> heartbeat code instead of using the serial count.
>>>>
>>>> In short, the serial count is ever so slightly inefficient in that 
>>>> it causes heartbeat pings on engines which are idle. On the other 
>>>> hand, it is way more efficient and simpler than the current 
>>>> alternatives.
>>>
>>> And the hack to make hangcheck work creates this inefficiency where 
>>> heartbeats are sent to idle engines. Which is probably fine just 
>>> needs to be explained.
>>>
>>>> Does that answer the questions?
>>>
>>> With the two points I re-raise clearly explained, possibly even 
>>> patch title changed, yeah. I am just wanting for it to be more 
>>> easily obvious to patch reader what it is functionally about - not 
>>> just what implementation details have been change but why as well.
>>>
>> My understanding is that we don't explain every piece of code in 
>> minute detail in every checkin email that touches it. I thought my 
>> description was already pretty verbose. I've certainly seen way less 
>> informative checkins that apparently made it through review without 
>> issue.
>>
>> Regarding the problem statement, I thought this was fairly clear that 
>> the heartbeat was broken for virtual engines:
>>
>>     This would mean that the heart beat code
>>     thinks the physical engines are idle due to the serial number not
>>     incrementing.
>>
>>
>> Regarding the inefficiency about heartbeating all physical engines in 
>> a virtual engine, again, this seems clear to me:
>>
>>     decompose virtual engines into
>>     their physical constituents and tracks the request against each. 
>> This
>>     is not entirely accurate as the GuC will only be issuing the request
>>     to one physical engine.
>>
>>
>> For the subject, I guess you could say "Track 'heartbeat serial' 
>> counts for virtual engines". However, the serial tracking count is 
>> not explicitly named for heartbeats so it seems inaccurate to rename 
>> it for a checkin email subject.
>>
>> If you have a suggestion for better wording then feel free to propose 
>> something.
>
> Sigh, I am not asking for more low level detail but for more up to 
> point high level naming and high level description.
>
> "drm/i915: Fix hangchek for guc virtual engines"
I would argue that the bug is not a hangcheck bug and only
tangentially a GuC bug. It is really a bug with the serial number
tracking of virtual engines in general and the lack of support for 
non-execlist backends in the serial number implementation. Hangcheck 
makes use of the serial number. It is not clear from the code whether 
anything else does currently or used to previously use them. Certainly, 
there is no documentation on the serial number declaration in the engine 
structure to explain its purpose. Likewise, there is nothing GuC 
specific about delaying the decomposition of virtual engines. Any 
externally scheduled back end would do similar. E.g. once the execlist
backend moves to using the DRM scheduler then maybe it will have delayed 
decomposition as well, and therefore also fall foul of the missing 
serial number updates.


>
> "..Blah blah, but hack because it is not ideal due xyz which 
> needlessly wakes up all engines which has an effect on power yes/no? 
> Latency? Throughput when high prio pulse triggers pointless preemption?"
Yes to all the above but that is already true of the heartbeat mechanism 
in general and I do not see any documentation in the code as to what the 
effect of the heartbeat mechanism is on power, latency, throughput, etc. 
My assumption is that the heartbeat is considered to have a slow enough
periodicity that any performance impact is negligible. And if the system
is loaded to the point where the heartbeat is having an impact then all 
engines within the virtual set are going to be in use (because if they 
aren't then the system is obviously not heavily loaded), in which case 
the heartbeat would be pinging all engines anyway.

>
> Also, can we fix it properly without introducing inefficiencies? Do we 
> even need heartbeats when GuC is in charge of engine resets? And if we 
> do can we make them work better?
In short, no, not easily.

The GuC's internal hang detection and recovery mechanism relies on 
pre-emption timeouts for the detection part. However, if only one 
context is active on a given engine, there will be no pre-emptions and 
thus the GuC will not be able to detect if that context is making 
forward progress or not. That's where the heartbeat comes in. It sends a 
dummy request on a different context and thus causes a pre-emption to 
occur. So the architecture level decision was to keep the heartbeat 
enabled even with the GuC submission backend. Unless you are running 
OpenCL of course, in which case we turn everything off :(.

As for doing something better, not easily. GuC is not able to generate 
requests itself, so it can't replicate the heartbeat's operation 
internally. There is an option to force a context switch to idle on 
every quantum expiration. However, that is deemed too intrusive and 
costly from a performance viewpoint. It might be possible to add an 
independent heartbeat timer to the GuC firmware and use that to trigger 
less frequent forced pre-emptions. That would be more efficient and more 
targeted. Whether it is worth the effort required is another matter
given how small an impact the heartbeat itself currently has.

It would still be my view that the serial count should be fixed anyway.
It is broken for virtual engines. End of story. Whether that actually 
affects the users of the count is a separate issue that is dependent 
upon those users. But that just changes the severity of the bug, not its 
validity.

John.


>
> Regards,
>
> Tvrtko
Tvrtko Ursulin June 2, 2021, 12:04 p.m. UTC | #9
On 02/06/2021 02:20, John Harrison wrote:
> On 6/1/2021 02:31, Tvrtko Ursulin wrote:
>> On 27/05/2021 18:01, John Harrison wrote:
>>> On 5/27/2021 01:53, Tvrtko Ursulin wrote:
>>>> On 26/05/2021 19:45, John Harrison wrote:
>>>>> On 5/26/2021 01:40, Tvrtko Ursulin wrote:
>>>>>> On 25/05/2021 18:52, Matthew Brost wrote:
>>>>>>> On Tue, May 25, 2021 at 11:16:12AM +0100, Tvrtko Ursulin wrote:
>>>>>>>>
>>>>>>>> On 06/05/2021 20:14, Matthew Brost wrote:
>>>>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>>>>
>>>>>>>>> The serial number tracking of engines happens at the backend of
>>>>>>>>> request submission and was expecting to only be given physical
>>>>>>>>> engines. However, in GuC submission mode, the decomposition of 
>>>>>>>>> virtual
>>>>>>>>> to physical engines does not happen in i915. Instead, requests are
>>>>>>>>> submitted to their virtual engine mask all the way through to the
>>>>>>>>> hardware (i.e. to GuC). This would mean that the heart beat code
>>>>>>>>> thinks the physical engines are idle due to the serial number not
>>>>>>>>> incrementing.
>>>>>>>>>
>>>>>>>>> This patch updates the tracking to decompose virtual engines into
>>>>>>>>> their physical constituents and tracks the request against 
>>>>>>>>> each. This
>>>>>>>>> is not entirely accurate as the GuC will only be issuing the 
>>>>>>>>> request
>>>>>>>>> to one physical engine. However, it is the best that i915 can 
>>>>>>>>> do given
>>>>>>>>> that it has no knowledge of the GuC's scheduling decisions.
>>>>>>>>
>>>>>>>> Commit text sounds a bit defeatist. I think instead of making up 
>>>>>>>> the serial
>>>>>>>> counts, which has downsides (could you please document in the 
>>>>>>>> commit what
>>>>>>>> they are), we should think how to design things properly.
>>>>>>>>
>>>>>>>
>>>>>>> IMO, I don't think fixing serial counts is the scope of this 
>>>>>>> series. We
>>>>>>> should focus on getting GuC submission in not cleaning up all the 
>>>>>>> crap
>>>>>>> that is in the i915. Let's make a note of this though so we can 
>>>>>>> revisit
>>>>>>> later.
>>>>>>
>>>>>> I will say again - commit message implies it is introducing an 
>>>>>> unspecified downside by not fully fixing an also unspecified 
>>>>>> issue. It is completely reasonable, and customary even, to ask for 
>>>>>> both to be documented in the commit message.
>>>>> Not sure what exactly is 'unspecified'. I thought the commit 
>>>>> message described both the problem (heartbeat not running when 
>>>>> using virtual engines) and the result (heartbeat running on more 
>>>>> engines than strictly necessary). But in greater detail...
>>>>>
>>>>> The serial number tracking is a hack for the heartbeat code to know 
>>>>> whether an engine is busy or idle, and therefore whether it should 
>>>>> be pinged for aliveness. Whenever a submission is made to an 
>>>>> engine, the serial number is incremented. The heartbeat code keeps 
>>>>> a copy of the value. If the value has changed, the engine is busy 
>>>>> and needs to be pinged.
>>>>>
>>>>> This works fine for execlist mode where virtual engine 
>>>>> decomposition is done inside i915. It fails miserably for GuC mode 
>>>>> where the decomposition is done by the hardware. The reason being 
>>>>> that the heartbeat code only looks at physical engines but the 
>>>>> serial count is only incremented on the virtual engine. Thus, the 
>>>>> heartbeat sees everything as idle and does not ping.
>>>>
>>>> So hangcheck does not work. Or it works because GuC does it anyway. 
>>>> Either way, that's one thing to explicitly state in the commit message.
>>>>
>>>>> This patch decomposes the virtual engines for the sake of 
>>>>> incrementing the serial count on each sub-engine in order to keep 
>>>>> the heartbeat code happy. The downside is that now the heartbeat 
>>>>> sees all sub-engines as busy rather than only the one the 
>>>>> submission actually ends up on. There really isn't much that can be 
>>>>> done about that. The heartbeat code is in i915 not GuC, the 
>>>>> scheduler is in GuC not i915. The only way to improve it is to 
>>>>> either move the heartbeat code into GuC as well and completely 
>>>>> disable the i915 side, or add some way for i915 to interrogate GuC 
>>>>> as to which engines are or are not active. Technically, we do have 
>>>>> both. GuC has (or at least had) an option to force a context switch 
>>>>> on every execution quantum pre-emption. However, that is much, 
>>>>> much, more heavy weight than the heartbeat. For the latter, we do 
>>>>> (almost) have the engine usage statistics for PMU and such like. 
>>>>> I'm not sure how much effort it would be to wire that up to the 
>>>>> heartbeat code instead of using the serial count.
>>>>>
>>>>> In short, the serial count is ever so slightly inefficient in that 
>>>>> it causes heartbeat pings on engines which are idle. On the other 
>>>>> hand, it is way more efficient and simpler than the current 
>>>>> alternatives.
>>>>
>>>> And the hack to make hangcheck work creates this inefficiency where 
>>>> heartbeats are sent to idle engines. Which is probably fine just 
>>>> needs to be explained.
>>>>
>>>>> Does that answer the questions?
>>>>
>>>> With the two points I re-raise clearly explained, possibly even 
>>>> patch title changed, yeah. I am just wanting for it to be more 
>>>> easily obvious to patch reader what it is functionally about - not 
>>>> just what implementation details have been change but why as well.
>>>>
>>> My understanding is that we don't explain every piece of code in 
>>> minute detail in every checkin email that touches it. I thought my 
>>> description was already pretty verbose. I've certainly seen way less 
>>> informative checkins that apparently made it through review without 
>>> issue.
>>>
>>> Regarding the problem statement, I thought this was fairly clear that 
>>> the heartbeat was broken for virtual engines:
>>>
>>>     This would mean that the heart beat code
>>>     thinks the physical engines are idle due to the serial number not
>>>     incrementing.
>>>
>>>
>>> Regarding the inefficiency about heartbeating all physical engines in 
>>> a virtual engine, again, this seems clear to me:
>>>
>>>     decompose virtual engines into
>>>     their physical constituents and tracks the request against each. 
>>> This
>>>     is not entirely accurate as the GuC will only be issuing the request
>>>     to one physical engine.
>>>
>>>
>>> For the subject, I guess you could say "Track 'heartbeat serial' 
>>> counts for virtual engines". However, the serial tracking count is 
>>> not explicitly named for heartbeats so it seems inaccurate to rename 
>>> it for a checkin email subject.
>>>
>>> If you have a suggestion for better wording then feel free to propose 
>>> something.
>>
>> Sigh, I am not asking for more low level detail but for more up to 
>> point high level naming and high level description.
>>
>> "drm/i915: Fix hangchek for guc virtual engines"
> I would argue that the bug is not a with hangcheck bug and only 
> tangentially a GuC bug. It is really a bug with the serial number 
> tracking of virtual engines in general and the lack of support for 

You argue it is a bug in general but nothing is currently broken apart 
from hangcheck with GuC virtual engines? :) That could mean, say, that 
it is not actually broken but designed for the current code base.

Maybe "drm/i915: Make hangcheck work with GuC virtual engines" then if 
you object to the word fix? Would that make it immediately clear why 
this patch is a must have/desirable?

> non-execlist backends in the serial number implementation. Hangcheck 
> makes use of the serial number. It is not clear from the code whether 
> anything else does currently or used to previously use them. Certainly, 

Engine pm clearly uses it to know when it is safe to park the engine. I 
think I asked earlier in the series have the interactions in that area 
been looked at. I don't know myself, since I think that GuC changes how 
engine parking is done, but not really familiar. Now that I think of it, 
there possibly is a patch which keeps all engines unparked for virtual 
engines, so that's looking okay.

> there is no documentation on the serial number declaration in the engine 
> structure to explain its purpose. Likewise, there is nothing GuC 
> specific about delaying the decomposition of virtual engines. Any 
> externally scheduled back end would do similar. E.g. once the execlist 
> backend moves to using the DRM scheduler then maybe it will have delayed 
> decomposition as well, and therefore also fall foul of the missing 
> serial number updates.

I don't think we know yet how drm/scheduler will be used to go that far.

>> "..Blah blah, but hack because it is not ideal due to xyz which 
>> needlessly wakes up all engines which has an effect on power yes/no? 
>> Latency? Throughput when high prio pulse triggers pointless preemption?"
> Yes to all the above but that is already true of the heartbeat mechanism 
> in general and I do not see any documentation in the code as to what the 
> effect of the heartbeat mechanism is on power, latency, throughput, etc. 

Difference is current code does not emit heartbeats on idle engines. So 
if we have a virtual engine built of say four same class engines, then 
the proposal here is to keep pinging all four in parallel. Even if only 
single context is executing. I am not saying that cost is big but 
honestly I don't understand why it is difficult to mention this in the 
commit message using clear and direct language.

> My assumption is that the heartbeat is considered slow enough 
> periodicity that any performance impact is negligible. And if the system 
> is loaded to the point where the heartbeat is having an impact then all 
> engines within the virtual set are going to be in use (because if they 
> aren't then the system is obviously not heavily loaded), in which case 
> the heartbeat would be pinging all engines anyway.
> 
>>
>> Also, can we fix it properly without introducing inefficiencies? Do we 
>> even need heartbeats when GuC is in charge of engine resets? And if we 
>> do can we make them work better?
> In short, no, not easily.
> 
> The GuC's internal hang detection and recovery mechanism relies on 
> pre-emption timeouts for the detection part. However, if only one 
> context is active on a given engine, there will be no pre-emptions and 
> thus the GuC will not be able to detect if that context is making 
> forward progress or not. That's where the heartbeat comes in. It sends a 
> dummy request on a different context and thus causes a pre-emption to 
> occur. So the architecture level decision was to keep the heartbeat 
> enabled even with the GuC submission backend. Unless you are running 
> OpenCL of course, in which case we turn everything off :(.
> 
> As for doing something better, not easily. GuC is not able to generate 
> requests itself, so it can't replicate the heartbeat's operation 
> internally. There is an option to force a context switch to idle on 
> every quantum expiration. However, that is deemed too intrusive and 
> costly from a performance viewpoint. It might be possible to add an 
> independent heartbeat timer to the GuC firmware and use that to trigger 
> less frequent forced pre-emptions. That would be more efficient and more 
> targeted. Whether it is worth the effort required is another matter 
> given how small an impact the heartbeat itself currently is.

Well GuC could obviously do it in many ways and not all are expensive. 
If it can force a context switch on quantum expiration, it could force 
it on heartbeat expiration as you say. That would actually be more proper 
design than this kludge which leaves a bad taste regardless how little 
cost it has. Or it could perhaps track some sort of serials in a shared 
memory page.

But anyway, all I am asking here is that patch subject and commit 
message are made clear and direct. Here, I add two sentences as what I 
think is minimum:

drm/i915/guc: Make hangcheck work with GuC virtual engines

The serial number tracking of engines happens at the backend of
request submission and was expecting to only be given physical
engines. However, in GuC submission mode, the decomposition of virtual
to physical engines does not happen in i915. Instead, requests are
submitted to their virtual engine mask all the way through to the
hardware (i.e. to GuC). This would mean that the heart beat code
thinks the physical engines are idle due to the serial number not
incrementing. <added>Which in turn means hangcheck does not work for 
GuC virtual engines.</added>

This patch updates the tracking to decompose virtual engines into
their physical constituents and tracks the request against each. This
is not entirely accurate as the GuC will only be issuing the request
to one physical engine. However, it is the best that i915 can do given
that it has no knowledge of the GuC's scheduling decisions.

<added>Downside of this is that all physical engines constituting a GuC 
virtual engine will be periodically unparked (even during just a single 
context executing) in order to be pinged with a heartbeat request. 
However the power and performance cost of this is not expected to be 
measurable (due to the low frequency of heartbeat pulses) and it is considered 
an easier option than trying to make changes to GuC firmware.</added>

> It would still be my view that the serial count should be fixed anyway. 
> It is broken for virtual engines. End of story. Whether that actually 
> affects the users of the count is a separate issue that is dependent 
> upon those users. But that just changes the severity of the bug, not its 
> validity.

It is clearly not broken for the current codebase, otherwise this patch 
would come with virtual_execlists_bump_serial and would be called like 
"drm/i915: Fix hangcheck on virtual engines". :)

Regards,

Tvrtko
Tvrtko Ursulin June 2, 2021, 12:09 p.m. UTC | #10
On 06/05/2021 20:14, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> The serial number tracking of engines happens at the backend of
> request submission and was expecting to only be given physical
> engines. However, in GuC submission mode, the decomposition of virtual
> to physical engines does not happen in i915. Instead, requests are
> submitted to their virtual engine mask all the way through to the
> hardware (i.e. to GuC). This would mean that the heart beat code
> thinks the physical engines are idle due to the serial number not
> incrementing.
> 
> This patch updates the tracking to decompose virtual engines into
> their physical constituents and tracks the request against each. This
> is not entirely accurate as the GuC will only be issuing the request
> to one physical engine. However, it is the best that i915 can do given
> that it has no knowledge of the GuC's scheduling decisions.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h     |  2 ++
>   .../gpu/drm/i915/gt/intel_execlists_submission.c |  6 ++++++
>   drivers/gpu/drm/i915/gt/intel_ring_submission.c  |  6 ++++++
>   drivers/gpu/drm/i915/gt/mock_engine.c            |  6 ++++++
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c    | 16 ++++++++++++++++
>   drivers/gpu/drm/i915/i915_request.c              |  4 +++-
>   6 files changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 86302e6d86b2..e2b5cda6dbc4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -389,6 +389,8 @@ struct intel_engine_cs {
>   	void		(*park)(struct intel_engine_cs *engine);
>   	void		(*unpark)(struct intel_engine_cs *engine);
>   
> +	void		(*bump_serial)(struct intel_engine_cs *engine);
> +
>   	void		(*set_default_submission)(struct intel_engine_cs *engine);
>   
>   	const struct intel_context_ops *cops;
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index ae12d7f19ecd..02880ea5d693 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3199,6 +3199,11 @@ static void execlists_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static void execlist_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   static void
>   logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   {
> @@ -3208,6 +3213,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>   
>   	engine->cops = &execlists_context_ops;
>   	engine->request_alloc = execlists_request_alloc;
> +	engine->bump_serial = execlist_bump_serial;
>   
>   	engine->reset.prepare = execlists_reset_prepare;
>   	engine->reset.rewind = execlists_reset_rewind;
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 14aa31879a37..39dd7c4ed0a9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -1045,6 +1045,11 @@ static void setup_irq(struct intel_engine_cs *engine)
>   	}
>   }
>   
> +static void ring_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   static void setup_common(struct intel_engine_cs *engine)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
> @@ -1064,6 +1069,7 @@ static void setup_common(struct intel_engine_cs *engine)
>   
>   	engine->cops = &ring_context_ops;
>   	engine->request_alloc = ring_request_alloc;
> +	engine->bump_serial = ring_bump_serial;
>   
>   	/*
>   	 * Using a global execution timeline; the previous final breadcrumb is
> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
> index bd005c1b6fd5..97b10fd60b55 100644
> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
> @@ -292,6 +292,11 @@ static void mock_engine_release(struct intel_engine_cs *engine)
>   	intel_engine_fini_retire(engine);
>   }
>   
> +static void mock_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
>   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   				    const char *name,
>   				    int id)
> @@ -318,6 +323,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   
>   	engine->base.cops = &mock_context_ops;
>   	engine->base.request_alloc = mock_request_alloc;
> +	engine->base.bump_serial = mock_bump_serial;
>   	engine->base.emit_flush = mock_emit_flush;
>   	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
>   	engine->base.submit_request = mock_submit_request;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index dc79d287c50a..f0e5731bcef6 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1500,6 +1500,20 @@ static void guc_release(struct intel_engine_cs *engine)
>   	lrc_fini_wa_ctx(engine);
>   }
>   
> +static void guc_bump_serial(struct intel_engine_cs *engine)
> +{
> +	engine->serial++;
> +}
> +
> +static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_cs *e;
> +	intel_engine_mask_t tmp, mask = engine->mask;
> +
> +	for_each_engine_masked(e, engine->gt, mask, tmp)
> +		e->serial++;
> +}
> +
>   static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   {
>   	/* Default vfuncs which can be overridden by each engine. */
> @@ -1508,6 +1522,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
>   
>   	engine->cops = &guc_context_ops;
>   	engine->request_alloc = guc_request_alloc;
> +	engine->bump_serial = guc_bump_serial;
>   
>   	engine->sched_engine->schedule = i915_schedule;
>   
> @@ -1843,6 +1858,7 @@ guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count)
>   
>   	ve->base.cops = &virtual_guc_context_ops;
>   	ve->base.request_alloc = guc_request_alloc;
> +	ve->base.bump_serial = virtual_guc_bump_serial;
>   
>   	ve->base.submit_request = guc_submit_request;
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 9542a5baa45a..127d60b36422 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -692,7 +692,9 @@ bool __i915_request_submit(struct i915_request *request)
>   				     request->ring->vaddr + request->postfix);
>   
>   	trace_i915_request_execute(request);
> -	engine->serial++;
> +	if (engine->bump_serial)
> +		engine->bump_serial(engine);
> +

As long as you have to handle null vfunc, you could make the patch way 
smaller by doing:

   if (engine->bump_serial)
	engine->bump_serial(engine);
   else
	engine->serial++;

Added bonus you avoid a function call with execlists making the patch 
not introduce a double penalty. Or just make bump_serial always point to 
a valid/default function. No need for both a new branch *and* a function 
call I think. I'd prefer the code snippet as above though.

Regards,

Tvrtko

>   	result = true;
>   
>   	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 86302e6d86b2..e2b5cda6dbc4 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -389,6 +389,8 @@  struct intel_engine_cs {
 	void		(*park)(struct intel_engine_cs *engine);
 	void		(*unpark)(struct intel_engine_cs *engine);
 
+	void		(*bump_serial)(struct intel_engine_cs *engine);
+
 	void		(*set_default_submission)(struct intel_engine_cs *engine);
 
 	const struct intel_context_ops *cops;
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index ae12d7f19ecd..02880ea5d693 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3199,6 +3199,11 @@  static void execlists_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static void execlist_bump_serial(struct intel_engine_cs *engine)
+{
+	engine->serial++;
+}
+
 static void
 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 {
@@ -3208,6 +3213,7 @@  logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 
 	engine->cops = &execlists_context_ops;
 	engine->request_alloc = execlists_request_alloc;
+	engine->bump_serial = execlist_bump_serial;
 
 	engine->reset.prepare = execlists_reset_prepare;
 	engine->reset.rewind = execlists_reset_rewind;
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 14aa31879a37..39dd7c4ed0a9 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -1045,6 +1045,11 @@  static void setup_irq(struct intel_engine_cs *engine)
 	}
 }
 
+static void ring_bump_serial(struct intel_engine_cs *engine)
+{
+	engine->serial++;
+}
+
 static void setup_common(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *i915 = engine->i915;
@@ -1064,6 +1069,7 @@  static void setup_common(struct intel_engine_cs *engine)
 
 	engine->cops = &ring_context_ops;
 	engine->request_alloc = ring_request_alloc;
+	engine->bump_serial = ring_bump_serial;
 
 	/*
 	 * Using a global execution timeline; the previous final breadcrumb is
diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
index bd005c1b6fd5..97b10fd60b55 100644
--- a/drivers/gpu/drm/i915/gt/mock_engine.c
+++ b/drivers/gpu/drm/i915/gt/mock_engine.c
@@ -292,6 +292,11 @@  static void mock_engine_release(struct intel_engine_cs *engine)
 	intel_engine_fini_retire(engine);
 }
 
+static void mock_bump_serial(struct intel_engine_cs *engine)
+{
+	engine->serial++;
+}
+
 struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 				    const char *name,
 				    int id)
@@ -318,6 +323,7 @@  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 
 	engine->base.cops = &mock_context_ops;
 	engine->base.request_alloc = mock_request_alloc;
+	engine->base.bump_serial = mock_bump_serial;
 	engine->base.emit_flush = mock_emit_flush;
 	engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb;
 	engine->base.submit_request = mock_submit_request;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index dc79d287c50a..f0e5731bcef6 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1500,6 +1500,20 @@  static void guc_release(struct intel_engine_cs *engine)
 	lrc_fini_wa_ctx(engine);
 }
 
+static void guc_bump_serial(struct intel_engine_cs *engine)
+{
+	engine->serial++;
+}
+
+static void virtual_guc_bump_serial(struct intel_engine_cs *engine)
+{
+	struct intel_engine_cs *e;
+	intel_engine_mask_t tmp, mask = engine->mask;
+
+	for_each_engine_masked(e, engine->gt, mask, tmp)
+		e->serial++;
+}
+
 static void guc_default_vfuncs(struct intel_engine_cs *engine)
 {
 	/* Default vfuncs which can be overridden by each engine. */
@@ -1508,6 +1522,7 @@  static void guc_default_vfuncs(struct intel_engine_cs *engine)
 
 	engine->cops = &guc_context_ops;
 	engine->request_alloc = guc_request_alloc;
+	engine->bump_serial = guc_bump_serial;
 
 	engine->sched_engine->schedule = i915_schedule;
 
@@ -1843,6 +1858,7 @@  guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count)
 
 	ve->base.cops = &virtual_guc_context_ops;
 	ve->base.request_alloc = guc_request_alloc;
+	ve->base.bump_serial = virtual_guc_bump_serial;
 
 	ve->base.submit_request = guc_submit_request;
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9542a5baa45a..127d60b36422 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -692,7 +692,9 @@  bool __i915_request_submit(struct i915_request *request)
 				     request->ring->vaddr + request->postfix);
 
 	trace_i915_request_execute(request);
-	engine->serial++;
+	if (engine->bump_serial)
+		engine->bump_serial(engine);
+
 	result = true;
 
 	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));