diff mbox series

drm/i915/pcode: Wait 10 seconds for pcode to settle

Message ID 20230111104447.338136-1-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/pcode: Wait 10 seconds for pcode to settle | expand

Commit Message

Andi Shyti Jan. 11, 2023, 10:44 a.m. UTC
From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>

During module load not all the punit transaction have completed
and we might end up timing out, as shown by the following
warning:

   i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)

Wait 10 seconds for the punit to settle and complete any
outstanding transactions upon module load.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814

Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

Comments

Jani Nikula Jan. 11, 2023, 1:18 p.m. UTC | #1
On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
> From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>
> During module load not all the punit transaction have completed
> and we might end up timing out, as shown by the following
> warning:

Root cause?

>
>    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
>
> Wait 10 seconds for the punit to settle and complete any
> outstanding transactions upon module load.
>
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
>

No blank lines between the tag lines please.

> Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> index a234d9b4ed14..3db2ba439bb5 100644
> --- a/drivers/gpu/drm/i915/intel_pcode.c
> +++ b/drivers/gpu/drm/i915/intel_pcode.c
> @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>  #undef COND
>  }
>  
> +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> +{
> +	if (__intel_wait_for_register_fw(uncore,
> +					 GEN6_PCODE_MAILBOX,
> +					 GEN6_PCODE_READY, 0,
> +					 500, timeout_ms,
> +					 NULL))
> +		return -EPROBE_DEFER;

This is already done within skl_pcode_request -> skl_pcode_try_request
-> __snb_pcode_rw path, with waits in skl_pcode_request.

Is there anyone who still understands what's being waited for, where,
for how long, and why in the different code paths? I know I don't, and
this isn't helping.

There's also no explanation on the -EPROBE_DEFER return in the commit
message or comments or anywhere.

Again, root cause?

BR,
Jani.


> +
> +	return skl_pcode_request(uncore,
> +				 DG1_PCODE_STATUS,
> +				 DG1_UNCORE_GET_INIT_STATUS,
> +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> +}
> +
>  int intel_pcode_init(struct intel_uncore *uncore)
>  {
> +	int err;
> +
>  	if (!IS_DGFX(uncore->i915))
>  		return 0;
>  
> -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> -				 DG1_UNCORE_GET_INIT_STATUS,
> -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> +	/*
> +	 * Wait 10 seconds so that the punit to settle and complete
> +	 * any outstanding transactions upon module load
> +	 */
> +	err = pcode_init_wait(uncore, 10000);
> +
> +	if (err) {
> +		drm_notice(&uncore->i915->drm,
> +			   "Waiting for HW initialisation...\n");
> +		err = pcode_init_wait(uncore, 180000);
> +	}
> +
> +	return err;
>  }
>  
>  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
Rodrigo Vivi Jan. 11, 2023, 3:25 p.m. UTC | #2
On Wed, Jan 11, 2023 at 11:44:47AM +0100, Andi Shyti wrote:
> From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> 
> During module load not all the punit transaction have completed
> and we might end up timing out, as shown by the following
> warning:
> 
>    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> 
> Wait 10 seconds for the punit to settle and complete any
> outstanding transactions upon module load.

10 *SECONDS* ?!

> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> 
> Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> index a234d9b4ed14..3db2ba439bb5 100644
> --- a/drivers/gpu/drm/i915/intel_pcode.c
> +++ b/drivers/gpu/drm/i915/intel_pcode.c
> @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>  #undef COND
>  }
>  
> +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> +{
> +	if (__intel_wait_for_register_fw(uncore,
> +					 GEN6_PCODE_MAILBOX,
> +					 GEN6_PCODE_READY, 0,
> +					 500, timeout_ms,
> +					 NULL))
> +		return -EPROBE_DEFER;
> +
> +	return skl_pcode_request(uncore,
> +				 DG1_PCODE_STATUS,
> +				 DG1_UNCORE_GET_INIT_STATUS,
> +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> +}
> +
>  int intel_pcode_init(struct intel_uncore *uncore)
>  {
> +	int err;
> +
>  	if (!IS_DGFX(uncore->i915))
>  		return 0;
>  
> -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> -				 DG1_UNCORE_GET_INIT_STATUS,
> -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> +	/*
> +	 * Wait 10 seconds so that the punit to settle and complete
> +	 * any outstanding transactions upon module load
> +	 */
> +	err = pcode_init_wait(uncore, 10000);
> +
> +	if (err) {
> +		drm_notice(&uncore->i915->drm,
> +			   "Waiting for HW initialisation...\n");
> +		err = pcode_init_wait(uncore, 180000);
> +	}
> +
> +	return err;
>  }
>  
>  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
> -- 
> 2.34.1
>
Andi Shyti Jan. 11, 2023, 3:36 p.m. UTC | #3
On Wed, Jan 11, 2023 at 03:18:38PM +0200, Jani Nikula wrote:
> On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
> > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> >
> > During module load not all the punit transaction have completed
> > and we might end up timing out, as shown by the following
> > warning:
> 
> Root cause?
> 
> >
> >    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> >
> > Wait 10 seconds for the punit to settle and complete any
> > outstanding transactions upon module load.
> >
> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> >
> 
> No blank lines between the tag lines please.

I don't consider "Closes:" to be a tag even if someone is using
it as such. AFAIK is not mentioned it in any of the kernel docs
(e.g. Documentation/process/maintainer-tip.rst).

Not to confuse it with a normal tag, I wanted to put it in
parenthesis or rephrase it.

> > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
> >  1 file changed, 31 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> > index a234d9b4ed14..3db2ba439bb5 100644
> > --- a/drivers/gpu/drm/i915/intel_pcode.c
> > +++ b/drivers/gpu/drm/i915/intel_pcode.c
> > @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> >  #undef COND
> >  }
> >  
> > +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> > +{
> > +	if (__intel_wait_for_register_fw(uncore,
> > +					 GEN6_PCODE_MAILBOX,
> > +					 GEN6_PCODE_READY, 0,
> > +					 500, timeout_ms,
> > +					 NULL))
> > +		return -EPROBE_DEFER;
> 
> This is already done within skl_pcode_request -> skl_pcode_try_request
> -> __snb_pcode_rw path, with waits in skl_pcode_request.

the idea is to check for PCODE_READY even before checking if
data are sent/received by pcode. And this is only during boot
time. While skl_pcode_request is called in other contexts as
well.

In other words here I want to start the communication when I
already know that the punit is ready. Otherwise I would hit an
-EAGAIN and fail.

> Is there anyone who still understands what's being waited for, where,
> for how long, and why in the different code paths? I know I don't, and
> this isn't helping.

I think it depends on hardware. There are some documents roaming
around with some boot time and reset time calculation.

> There's also no explanation on the -EPROBE_DEFER return in the commit
> message or comments or anywhere.

we haven't really failed, right? We just need some time for the
punit to be ready and try to probe again (remember we are here in
module probe).

Thanks for the review,
Andi

> Again, root cause?
> 
> BR,
> Jani.
> 
> 
> > +
> > +	return skl_pcode_request(uncore,
> > +				 DG1_PCODE_STATUS,
> > +				 DG1_UNCORE_GET_INIT_STATUS,
> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> > +}
> > +
> >  int intel_pcode_init(struct intel_uncore *uncore)
> >  {
> > +	int err;
> > +
> >  	if (!IS_DGFX(uncore->i915))
> >  		return 0;
> >  
> > -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > -				 DG1_UNCORE_GET_INIT_STATUS,
> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > +	/*
> > +	 * Wait 10 seconds so that the punit to settle and complete
> > +	 * any outstanding transactions upon module load
> > +	 */
> > +	err = pcode_init_wait(uncore, 10000);
> > +
> > +	if (err) {
> > +		drm_notice(&uncore->i915->drm,
> > +			   "Waiting for HW initialisation...\n");
> > +		err = pcode_init_wait(uncore, 180000);
> > +	}
> > +
> > +	return err;
> >  }
> >  
> >  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
> 
> -- 
> Jani Nikula, Intel Open Source Graphics Center
Andi Shyti Jan. 11, 2023, 3:39 p.m. UTC | #4
Hi Rodrigo,

On Wed, Jan 11, 2023 at 10:25:56AM -0500, Rodrigo Vivi wrote:
> On Wed, Jan 11, 2023 at 11:44:47AM +0100, Andi Shyti wrote:
> > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > 
> > During module load not all the punit transaction have completed
> > and we might end up timing out, as shown by the following
> > warning:
> > 
> >    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> > 
> > Wait 10 seconds for the punit to settle and complete any
> > outstanding transactions upon module load.
> 
> 10 *SECONDS* ?!

Don't be alarmed :)

It's up to 10 seconds, otherwise we would end up waiting up to 3
minutes.

And I've seen a version (and you did as well) where those 3
minutes were raised to 6 (for the PVC particular case).

Thanks for checking this,
Andi

> > 
> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> > 
> > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
> >  1 file changed, 31 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> > index a234d9b4ed14..3db2ba439bb5 100644
> > --- a/drivers/gpu/drm/i915/intel_pcode.c
> > +++ b/drivers/gpu/drm/i915/intel_pcode.c
> > @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> >  #undef COND
> >  }
> >  
> > +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> > +{
> > +	if (__intel_wait_for_register_fw(uncore,
> > +					 GEN6_PCODE_MAILBOX,
> > +					 GEN6_PCODE_READY, 0,
> > +					 500, timeout_ms,
> > +					 NULL))
> > +		return -EPROBE_DEFER;
> > +
> > +	return skl_pcode_request(uncore,
> > +				 DG1_PCODE_STATUS,
> > +				 DG1_UNCORE_GET_INIT_STATUS,
> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> > +}
> > +
> >  int intel_pcode_init(struct intel_uncore *uncore)
> >  {
> > +	int err;
> > +
> >  	if (!IS_DGFX(uncore->i915))
> >  		return 0;
> >  
> > -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > -				 DG1_UNCORE_GET_INIT_STATUS,
> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > +	/*
> > +	 * Wait 10 seconds so that the punit to settle and complete
> > +	 * any outstanding transactions upon module load
> > +	 */
> > +	err = pcode_init_wait(uncore, 10000);
> > +
> > +	if (err) {
> > +		drm_notice(&uncore->i915->drm,
> > +			   "Waiting for HW initialisation...\n");
> > +		err = pcode_init_wait(uncore, 180000);
> > +	}
> > +
> > +	return err;
> >  }
> >  
> >  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
> > -- 
> > 2.34.1
> >
Jani Nikula Jan. 11, 2023, 4:40 p.m. UTC | #5
On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
> On Wed, Jan 11, 2023 at 03:18:38PM +0200, Jani Nikula wrote:
>> On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
>> > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>> >
>> > During module load not all the punit transaction have completed
>> > and we might end up timing out, as shown by the following
>> > warning:
>> 
>> Root cause?
>> 
>> >
>> >    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
>> >
>> > Wait 10 seconds for the punit to settle and complete any
>> > outstanding transactions upon module load.
>> >
>> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
>> >
>> 
>> No blank lines between the tag lines please.
>
> I don't consider "Closes:" to be a tag even if someone is using
> it as such. AFAIK is not mentioned it in any of the kernel docs
> (e.g. Documentation/process/maintainer-tip.rst).

Basically all of the Something: lines should be bundled together.

BR,
Jani.

>
> Not to confuse it with a normal tag, I wanted to put it in
> parenthesis or rephrase it.
>
>> > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>> > Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
>> >  1 file changed, 31 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
>> > index a234d9b4ed14..3db2ba439bb5 100644
>> > --- a/drivers/gpu/drm/i915/intel_pcode.c
>> > +++ b/drivers/gpu/drm/i915/intel_pcode.c
>> > @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>> >  #undef COND
>> >  }
>> >  
>> > +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
>> > +{
>> > +	if (__intel_wait_for_register_fw(uncore,
>> > +					 GEN6_PCODE_MAILBOX,
>> > +					 GEN6_PCODE_READY, 0,
>> > +					 500, timeout_ms,
>> > +					 NULL))
>> > +		return -EPROBE_DEFER;
>> 
>> This is already done within skl_pcode_request -> skl_pcode_try_request
>> -> __snb_pcode_rw path, with waits in skl_pcode_request.
>
> the idea is to check for PCODE_READY even before checking if
> data are sent/received by pcode. And this is only during boot
> time. While skl_pcode_request is called in other contexts as
> well.
>
> In other words here I want to start the communication when I
> already know that the punit is ready. Otherwise I would hit an
> -EAGAIN and fail.
>
>> Is there anyone who still understands what's being waited for, where,
>> for how long, and why in the different code paths? I know I don't, and
>> this isn't helping.
>
> I think it depends on hardware. There are some documents roaming
> around with some boot time and reset time calculation.
>
>> There's also no explanation on the -EPROBE_DEFER return in the commit
>> message or comments or anywhere.
>
> we haven't really failed, right? We just need some time for the
> punit to be ready and try to probe again (remember we are here in
> module probe).
>
> Thanks for the review,
> Andi
>
>> Again, root cause?
>> 
>> BR,
>> Jani.
>> 
>> 
>> > +
>> > +	return skl_pcode_request(uncore,
>> > +				 DG1_PCODE_STATUS,
>> > +				 DG1_UNCORE_GET_INIT_STATUS,
>> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
>> > +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
>> > +}
>> > +
>> >  int intel_pcode_init(struct intel_uncore *uncore)
>> >  {
>> > +	int err;
>> > +
>> >  	if (!IS_DGFX(uncore->i915))
>> >  		return 0;
>> >  
>> > -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
>> > -				 DG1_UNCORE_GET_INIT_STATUS,
>> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
>> > -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>> > +	/*
>> > +	 * Wait 10 seconds so that the punit to settle and complete
>> > +	 * any outstanding transactions upon module load
>> > +	 */
>> > +	err = pcode_init_wait(uncore, 10000);
>> > +
>> > +	if (err) {
>> > +		drm_notice(&uncore->i915->drm,
>> > +			   "Waiting for HW initialisation...\n");
>> > +		err = pcode_init_wait(uncore, 180000);
>> > +	}
>> > +
>> > +	return err;
>> >  }
>> >  
>> >  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
>> 
>> -- 
>> Jani Nikula, Intel Open Source Graphics Center
Rodrigo Vivi Jan. 11, 2023, 5:06 p.m. UTC | #6
On Wed, Jan 11, 2023 at 04:39:36PM +0100, Andi Shyti wrote:
> Hi Rodrigo,
> 
> On Wed, Jan 11, 2023 at 10:25:56AM -0500, Rodrigo Vivi wrote:
> > On Wed, Jan 11, 2023 at 11:44:47AM +0100, Andi Shyti wrote:
> > > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > > 
> > > During module load not all the punit transaction have completed
> > > and we might end up timing out, as shown by the following
> > > warning:
> > > 
> > >    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> > > 
> > > Wait 10 seconds for the punit to settle and complete any
> > > outstanding transactions upon module load.
> > 
> > 10 *SECONDS* ?!
> 
> Don't be alarmed :)
> 
> It's up to 10 seconds, otherwise we would end up waiting up to 3
> minutes.
> 
> And I've seen a version (and you did as well) where those 3
> minutes were raised to 6 (for the PVC particular case).

Oh yeap! and that case is funny! Because the indication from PCODE
is that 10 seconds is enough, but there are some rare cases where
it gets stuck and end up taking a very long time. Then they multiplied
the bad rare case to 3, and no idea why how that become 6.

But anyway, thanks for refreshing my memory. When I first noticed this
patch I thought it was in all the platforms, where this wouldn't make
sense. But on discrete where the pcode needs to bring the memory and
gt up before we can really use it, it makes sense.

And then I noticed that your patch is indeed for dgfx only, so it
would be okay. And 10 seconds is okay.

However I also noticed that you do this before the other pcode_init
check that we were told by pcode folks to use. So, I don't understand
how your patch is helping now... you wait for 10 seconds and then you
will wait more 10 seconds on the pcode_ready... why the pcode_ready
check that we have in case already doesn't cover yours?

And why that return?

> 
> Thanks for checking this,
> Andi
> 
> > > 
> > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> > > 
> > > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > > Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
> > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> > > index a234d9b4ed14..3db2ba439bb5 100644
> > > --- a/drivers/gpu/drm/i915/intel_pcode.c
> > > +++ b/drivers/gpu/drm/i915/intel_pcode.c
> > > @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > >  #undef COND
> > >  }
> > >  
> > > +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> > > +{
> > > +	if (__intel_wait_for_register_fw(uncore,
> > > +					 GEN6_PCODE_MAILBOX,
> > > +					 GEN6_PCODE_READY, 0,
> > > +					 500, timeout_ms,
> > > +					 NULL))
> > > +		return -EPROBE_DEFER;
> > > +
> > > +	return skl_pcode_request(uncore,
> > > +				 DG1_PCODE_STATUS,
> > > +				 DG1_UNCORE_GET_INIT_STATUS,
> > > +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> > > +}
> > > +
> > >  int intel_pcode_init(struct intel_uncore *uncore)
> > >  {
> > > +	int err;
> > > +
> > >  	if (!IS_DGFX(uncore->i915))
> > >  		return 0;
> > >  
> > > -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > -				 DG1_UNCORE_GET_INIT_STATUS,
> > > -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > +	/*
> > > +	 * Wait 10 seconds so that the punit to settle and complete
> > > +	 * any outstanding transactions upon module load
> > > +	 */
> > > +	err = pcode_init_wait(uncore, 10000);
> > > +
> > > +	if (err) {
> > > +		drm_notice(&uncore->i915->drm,
> > > +			   "Waiting for HW initialisation...\n");
> > > +		err = pcode_init_wait(uncore, 180000);
> > > +	}
> > > +
> > > +	return err;
> > >  }
> > >  
> > >  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
> > > -- 
> > > 2.34.1
> > >
Andi Shyti Jan. 16, 2023, 7:27 p.m. UTC | #7
Hi Rodrigo,

On Wed, Jan 11, 2023 at 12:06:24PM -0500, Rodrigo Vivi wrote:
> On Wed, Jan 11, 2023 at 04:39:36PM +0100, Andi Shyti wrote:
> > Hi Rodrigo,
> > 
> > On Wed, Jan 11, 2023 at 10:25:56AM -0500, Rodrigo Vivi wrote:
> > > On Wed, Jan 11, 2023 at 11:44:47AM +0100, Andi Shyti wrote:
> > > > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > > > 
> > > > During module load not all the punit transaction have completed
> > > > and we might end up timing out, as shown by the following
> > > > warning:
> > > > 
> > > >    i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> > > > 
> > > > Wait 10 seconds for the punit to settle and complete any
> > > > outstanding transactions upon module load.
> > > 
> > > 10 *SECONDS* ?!
> > 
> > Don't be alarmed :)
> > 
> > It's up to 10 seconds, otherwise we would end up waiting up to 3
> > minutes.
> > 
> > And I've seen a version (and you did as well) where those 3
> > minutes were raised to 6 (for the PVC particular case).
> 
> Oh yeap! and that case is funny! Because the indication from PCODE
> is that 10 seconds is enough, but there are some rare cases where
> it gets stuck and end up taking a very long time. Then they multiplied
> the bad rare case to 3, and no idea why how that become 6.
> 
> But anyway, thanks for refreshing my memory. When I first noticed this
> patch I thought it was in all the platforms, where this wouldn't make
> sense. But on discrete where the pcode needs to bring the memory and
> gt up before we can really use it, it makes sense.
> 
> And then I noticed that your patch is indeed for dgfx only, so it
> would be okay. And 10 seconds is okay.

yes... :)

Thanks for the recap!

> However I also noticed that you do this before the other pcode_init
> check that we were told by pcode folks to use. So, I don't understand
> how your patch is helping now... you wait for 10 seconds and then you
> will wait more 10 seconds on the pcode_ready... why the pcode_ready
> check that we have in case already doesn't cover yours?

The difference is that the first timeout ensures that the punit
is ready during boot time. The need to wait is only in boot,
as we check if it's ready and then start communicating.

The second 10s wait comes after the read/write has actually
happened. I expect in this case to wait just a little because
it's just a test write done in boot to make sure the punit is
really responding and ready for the next writes.

But because skl_pcode_request() is used in other contexts, as
well, I assume that the punit is ready and I don't need to wait
anymore before communicating with it. Thus I wait only after
sending the command: I wait for it to complete and be ready again
for the next command.

In other words each wait makes sure that the punit is ready for
the next command. That's why we need a first 10 seconds wait as
at the very first write we weren't sure 100% the punit was ready.

> And why that return?

Becase the driver is not really failing. Most probably the
hardware is completing the boot routines thus we need to try
probing again later.

I hope I was able to explain myself.

Thanks for the comments and for looking into this,
Andi

> > 
> > Thanks for checking this,
> > Andi
> > 
> > > > 
> > > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> > > > 
> > > > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > > > Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > > ---
> > > >  drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
> > > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
> > > > index a234d9b4ed14..3db2ba439bb5 100644
> > > > --- a/drivers/gpu/drm/i915/intel_pcode.c
> > > > +++ b/drivers/gpu/drm/i915/intel_pcode.c
> > > > @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > >  #undef COND
> > > >  }
> > > >  
> > > > +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
> > > > +{
> > > > +	if (__intel_wait_for_register_fw(uncore,
> > > > +					 GEN6_PCODE_MAILBOX,
> > > > +					 GEN6_PCODE_READY, 0,
> > > > +					 500, timeout_ms,
> > > > +					 NULL))
> > > > +		return -EPROBE_DEFER;
> > > > +
> > > > +	return skl_pcode_request(uncore,
> > > > +				 DG1_PCODE_STATUS,
> > > > +				 DG1_UNCORE_GET_INIT_STATUS,
> > > > +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > > +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
> > > > +}
> > > > +
> > > >  int intel_pcode_init(struct intel_uncore *uncore)
> > > >  {
> > > > +	int err;
> > > > +
> > > >  	if (!IS_DGFX(uncore->i915))
> > > >  		return 0;
> > > >  
> > > > -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > > -				 DG1_UNCORE_GET_INIT_STATUS,
> > > > -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > > -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > > +	/*
> > > > +	 * Wait 10 seconds so that the punit to settle and complete
> > > > +	 * any outstanding transactions upon module load
> > > > +	 */
> > > > +	err = pcode_init_wait(uncore, 10000);
> > > > +
> > > > +	if (err) {
> > > > +		drm_notice(&uncore->i915->drm,
> > > > +			   "Waiting for HW initialisation...\n");
> > > > +		err = pcode_init_wait(uncore, 180000);
> > > > +	}
> > > > +
> > > > +	return err;
> > > >  }
> > > >  
> > > >  int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
> > > > -- 
> > > > 2.34.1
> > > >
Gwan-gyeong Mun Jan. 27, 2023, 6:50 a.m. UTC | #8
On 1/11/23 5:36 PM, Andi Shyti wrote:
> On Wed, Jan 11, 2023 at 03:18:38PM +0200, Jani Nikula wrote:
>> On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
>>> From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>>>
>>> During module load not all the punit transaction have completed
>>> and we might end up timing out, as shown by the following
>>> warning:
>>
>> Root cause?

Hi Andi, looking at the log where this problem is reported,

https://gitlab.freedesktop.org/drm/intel/-/issues/7814

https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324

<4> [268.276908] i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)

In the code above, the call stack is output, and the return value of 
intel_pcode_init() returns -11 (-EAGAIN).

<3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init 
failed -11


If you simplify the function call flow, you can see it as below. (The 
call of _wait_for(COND, timeout_base_ms * 1000, 10, 10) in 
skl_pcode_request() is omitted)

-------------------------------------------------------------------------
intel_pcode_init()
  |
  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
                        DG1_UNCORE_GET_INIT_STATUS,
                        DG1_UNCORE_INIT_STATUS_COMPLETE,
                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
        |
        +-> skl_pcode_try_request()
              |
              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
                                            500, 0, true);

-------------------------------------------------------------------------
static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
		          u32 *val, u32 *val1,
			  int fast_timeout_us, int slow_timeout_ms,
			  bool is_read)
{
...

	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
		return -EAGAIN;

	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
	intel_uncore_write_fw(uncore,
			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);

	if (__intel_wait_for_register_fw(uncore,
					 GEN6_PCODE_MAILBOX,
					 GEN6_PCODE_READY, 0,
					 fast_timeout_us,
					 slow_timeout_ms,
					 &mbox))
		return -ETIMEDOUT;

...
}
-------------------------------------------------------------------------

The case where skl_pcode_request() returns -EAGAIN can be considered as 
the following scenario.
1. 1. When checking the GEN6_PCODE_MAILBOX register status before 
writing the value to the GEN6_PCODE_DATA register in __snb_pcode_rw(), 
it is always BUSY
2. _wait_for(COND, timeout_base_ms * 1000, 10, 10) of 
skl_pcode_request() returns -EAGAIN if the GEN6_PCODE_MAILBOX register 
indicates BUSY even after waiting 500us after writing a value to the 
GEN6_PCODE_DATA register in __snb_pcode_rw()

(Even if skl_pcode_request() gives a timeout of 180 seconds, the time 
used each time __snb_pcode_rw() is called is up to 500us. The rest of 
the time is used for sleep.)

In the situation where the problem is reported, it is not possible to 
confirm exactly which scenario code causes the problem with the current 
log information, and it seems that additional analysis is needed to 
confirm it.
If the hardware takes more than 500us to succeed after PCODE_MAILBOX 
write under certain circumstances, it is thought that the root problem 
causing the problem should be fixed.

br,
G.G.

>>
>>>
>>>     i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
>>>
>>> Wait 10 seconds for the punit to settle and complete any
>>> outstanding transactions upon module load.
>>>
>>> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7814
>>>
>>
>> No blank lines between the tag lines please.
> 
> I don't consider "Closes:" to be a tag even if someone is using
> it as such. AFAIK is not mentioned it in any of the kernel docs
> (e.g. Documentation/process/maintainer-tip.rst).
> 
> Not to confuse it with a normal tag, I wanted to put it in
> parenthesis or rephrase it.
> 
>>> Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>>> Co-developed-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/intel_pcode.c | 35 ++++++++++++++++++++++++++----
>>>   1 file changed, 31 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
>>> index a234d9b4ed14..3db2ba439bb5 100644
>>> --- a/drivers/gpu/drm/i915/intel_pcode.c
>>> +++ b/drivers/gpu/drm/i915/intel_pcode.c
>>> @@ -204,15 +204,42 @@ int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>>>   #undef COND
>>>   }
>>>   
>>> +static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
>>> +{
>>> +	if (__intel_wait_for_register_fw(uncore,
>>> +					 GEN6_PCODE_MAILBOX,
>>> +					 GEN6_PCODE_READY, 0,
>>> +					 500, timeout_ms,
>>> +					 NULL))
>>> +		return -EPROBE_DEFER;
>>
>> This is already done within skl_pcode_request -> skl_pcode_try_request
>> -> __snb_pcode_rw path, with waits in skl_pcode_request.
> 
> the idea is to check for PCODE_READY even before checking if
> data are sent/received by pcode. And this is only during boot
> time. While skl_pcode_request is called in other contexts as
> well.
> 
> In other words here I want to start the communication when I
> already know that the punit is ready. Otherwise I would hit an
> -EAGAIN and fail.
> 
>> Is there anyone who still understands what's being waited for, where,
>> for how long, and why in the different code paths? I know I don't, and
>> this isn't helping.
> 
> I think it depends on hardware. There are some documents roaming
> around with some boot time and reset time calculation.
> 
>> There's also no explanation on the -EPROBE_DEFER return in the commit
>> message or comments or anywhere.
> 
> we haven't really failed, right? We just need some time for the
> punit to be ready and try to probe again (remember we are here in
> module probe).
> 
> Thanks for the review,
> Andi
> 
>> Again, root cause?
>>
>> BR,
>> Jani.
>>
>>
>>> +
>>> +	return skl_pcode_request(uncore,
>>> +				 DG1_PCODE_STATUS,
>>> +				 DG1_UNCORE_GET_INIT_STATUS,
>>> +				 DG1_UNCORE_INIT_STATUS_COMPLETE,
>>> +				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
>>> +}
>>> +
>>>   int intel_pcode_init(struct intel_uncore *uncore)
>>>   {
>>> +	int err;
>>> +
>>>   	if (!IS_DGFX(uncore->i915))
>>>   		return 0;
>>>   
>>> -	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
>>> -				 DG1_UNCORE_GET_INIT_STATUS,
>>> -				 DG1_UNCORE_INIT_STATUS_COMPLETE,
>>> -				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>>> +	/*
>>> +	 * Wait 10 seconds so that the punit to settle and complete
>>> +	 * any outstanding transactions upon module load
>>> +	 */
>>> +	err = pcode_init_wait(uncore, 10000);
>>> +
>>> +	if (err) {
>>> +		drm_notice(&uncore->i915->drm,
>>> +			   "Waiting for HW initialisation...\n");
>>> +		err = pcode_init_wait(uncore, 180000);
>>> +	}
>>> +
>>> +	return err;
>>>   }
>>>   
>>>   int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)
>>
>> -- 
>> Jani Nikula, Intel Open Source Graphics Center
Andi Shyti Jan. 27, 2023, 9 a.m. UTC | #9
Hi Gwan-gyeong,

thanks for the review and the thorough explanation.

On Fri, Jan 27, 2023 at 08:50:26AM +0200, Gwan-gyeong Mun wrote:
> 
> 
> On 1/11/23 5:36 PM, Andi Shyti wrote:
> > On Wed, Jan 11, 2023 at 03:18:38PM +0200, Jani Nikula wrote:
> > > On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
> > > > From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
> > > > 
> > > > During module load not all the punit transaction have completed
> > > > and we might end up timing out, as shown by the following
> > > > warning:
> > > 
> > > Root cause?
> 
> Hi Andi, looking at the log where this problem is reported,
> 
> https://gitlab.freedesktop.org/drm/intel/-/issues/7814
> 
> https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324
> 
> <4> [268.276908] i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
> 
> In the code above, the call stack is output, and the return value of
> intel_pcode_init() returns -11 (-EAGAIN).
> 
> <3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init
> failed -11
> 
> 
> If you simplify the function call flow, you can see it as below. (The call
> of _wait_for(COND, timeout_base_ms * 1000, 10, 10) in skl_pcode_request() is
> omitted)
> 
> -------------------------------------------------------------------------
> intel_pcode_init()
>  |
>  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
>                        DG1_UNCORE_GET_INIT_STATUS,
>                        DG1_UNCORE_INIT_STATUS_COMPLETE,
>                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>        |
>        +-> skl_pcode_try_request()
>              |
>              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
>                                            500, 0, true);
> 
> -------------------------------------------------------------------------
> static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> 		          u32 *val, u32 *val1,
> 			  int fast_timeout_us, int slow_timeout_ms,
> 			  bool is_read)
> {
> ...
> 
> 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> 		return -EAGAIN;
> 
> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> 	intel_uncore_write_fw(uncore,
> 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> 
> 	if (__intel_wait_for_register_fw(uncore,
> 					 GEN6_PCODE_MAILBOX,
> 					 GEN6_PCODE_READY, 0,
> 					 fast_timeout_us,
> 					 slow_timeout_ms,
> 					 &mbox))
> 		return -ETIMEDOUT;
> 
> ...
> }
> -------------------------------------------------------------------------
> 
> The case where skl_pcode_request() returns -EAGAIN can be considered as the
> following scenario.
> 1. 1. When checking the GEN6_PCODE_MAILBOX register status before writing
> the value to the GEN6_PCODE_DATA register in __snb_pcode_rw(), it is always
> BUSY

correct! We fail with EAGAIN because we are not able to
communicate with the punit because the punit is busy.

Talking about this case we are in boot time and the gpu is
performing its boot routine, the punit as well. While the punit
is working we try communicate with it, but unfortunately, being
busy, we fail with -EAGAIN exactly where you pointed.

Adding an extra wait_for_register_fw, i.e. waiting until the
PCODE_READY register tells that the punit is ready, makes sure
that the read/write will succeed.

Thus Chris has added a 10 seconds wait before the very first read
and write. If the punit is not ready we don't fail with -EAGAIN
and give up the driver loading as it happens now. But we give it
another chance trying to probe it again later.

> 2. _wait_for(COND, timeout_base_ms * 1000, 10, 10) of skl_pcode_request()
> returns -EAGAIN if the GEN6_PCODE_MAILBOX register indicates BUSY even after
> waiting 500us after writing a value to the GEN6_PCODE_DATA register in
> __snb_pcode_rw()

Isn't it the same as '1'?

> (Even if skl_pcode_request() gives a timeout of 180 seconds, the time used
> each time __snb_pcode_rw() is called is up to 500us. The rest of the time is
> used for sleep.)

There is one big, massive, huge difference... the timeout in
skl_pcode_request() after the read/write, not before. So that at
the first communication we fail, return -EAGAIN and give up
probing without starting any timer.

Be aware of the fact that the timeout is not for the current
communication, but for the next one. De facto we start the
timeout after communicating, this makes sure that the next
communication will work.

But no one makes sure that the very first communication works
during probe. Thus the extra wait.

> In the situation where the problem is reported, it is not possible to
> confirm exactly which scenario code causes the problem with the current log
> information, and it seems that additional analysis is needed to confirm it.
> If the hardware takes more than 500us to succeed after PCODE_MAILBOX write
> under certain circumstances, it is thought that the root problem causing the
> problem should be fixed.

Actually I have tested it and indeed the extra check on the
PCODE_READY is needed and makes the driver probe correctly.

Thanks,
Andi
Gwan-gyeong Mun Jan. 28, 2023, 5:41 a.m. UTC | #10
On 1/27/23 11:00 AM, Andi Shyti wrote:
> Hi Gwan-gyeong,
> 
> thanks for the review and the thorough explanation.
> 
> On Fri, Jan 27, 2023 at 08:50:26AM +0200, Gwan-gyeong Mun wrote:
>>
>>
>> On 1/11/23 5:36 PM, Andi Shyti wrote:
>>> On Wed, Jan 11, 2023 at 03:18:38PM +0200, Jani Nikula wrote:
>>>> On Wed, 11 Jan 2023, Andi Shyti <andi.shyti@linux.intel.com> wrote:
>>>>> From: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
>>>>>
>>>>> During module load not all the punit transaction have completed
>>>>> and we might end up timing out, as shown by the following
>>>>> warning:
>>>>
>>>> Root cause?
>>
>> Hi Andi, looking at the log where this problem is reported,
>>
>> https://gitlab.freedesktop.org/drm/intel/-/issues/7814
>>
>> https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324
>>
>> <4> [268.276908] i915 0000:4d:00.0: drm_WARN_ON_ONCE(timeout_base_ms > 3)
>>
>> In the code above, the call stack is output, and the return value of
>> intel_pcode_init() returns -11 (-EAGAIN).
>>
>> <3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init
>> failed -11
>>
>>
>> If you simplify the function call flow, you can see it as below. (The call
>> of _wait_for(COND, timeout_base_ms * 1000, 10, 10) in skl_pcode_request() is
>> omitted)
>>
>> -------------------------------------------------------------------------
>> intel_pcode_init()
>>   |
>>   +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
>>                         DG1_UNCORE_GET_INIT_STATUS,
>>                         DG1_UNCORE_INIT_STATUS_COMPLETE,
>>                         DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>>         |
>>         +-> skl_pcode_try_request()
>>               |
>>               +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
>>                                             500, 0, true);
>>
>> -------------------------------------------------------------------------
>> static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
>> 		          u32 *val, u32 *val1,
>> 			  int fast_timeout_us, int slow_timeout_ms,
>> 			  bool is_read)
>> {
>> ...
>>
>> 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>> 		return -EAGAIN;
>>
>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
>> 	intel_uncore_write_fw(uncore,
>> 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
>>
>> 	if (__intel_wait_for_register_fw(uncore,
>> 					 GEN6_PCODE_MAILBOX,
>> 					 GEN6_PCODE_READY, 0,
>> 					 fast_timeout_us,
>> 					 slow_timeout_ms,
>> 					 &mbox))
>> 		return -ETIMEDOUT;
>>
>> ...
>> }
>> -------------------------------------------------------------------------
>>
>> The case where skl_pcode_request() returns -EAGAIN can be considered as the
>> following scenario.
>> 1. 1. When checking the GEN6_PCODE_MAILBOX register status before writing
>> the value to the GEN6_PCODE_DATA register in __snb_pcode_rw(), it is always
>> BUSY
> 
> correct! We fail with EAGAIN because we are not able to
> communicate with the punit because the punit is busy.
> 
> Talking about this case we are in boot time and the gpu is
> performing its boot routine, the punit as well. While the punit
> is working we try communicate with it, but unfortunately, being
> busy, we fail with -EAGAIN exactly where you pointed.
> 
> Adding an extra wait_for_register_fw, i.e. waiting until the
> PCODE_READY register tells that the punit is ready, makes sure
> that the read/write will succeed.
> 
> Thus Chris has added a 10 seconds wait before the very first read
> and write. If the punit is not ready we don't fail with -EAGAIN
> and give up the driver loading as it happens now. But we give it
> another chance trying to probe it again later.
> 
>> 2. _wait_for(COND, timeout_base_ms * 1000, 10, 10) of skl_pcode_request()
>> returns -EAGAIN if the GEN6_PCODE_MAILBOX register indicates BUSY even after
>> waiting 500us after writing a value to the GEN6_PCODE_DATA register in
>> __snb_pcode_rw()
> 
> Isn't it the same as '1'?
> 
>> (Even if skl_pcode_request() gives a timeout of 180 seconds, the time used
>> each time __snb_pcode_rw() is called is up to 500us. The rest of the time is
>> used for sleep.)
> 
> There is one big, massive, huge difference... the timeout in
> skl_pcode_request() after the read/write, not before. So that at
> the first communication we fail, return -EAGAIN and give up
> probing without starting any timer.
> 
> Be aware of the fact that the timeout is not for the current
> communication, but for the next one. De facto we start the
> timeout after communicating, this makes sure that the next
> communication will work.
> 
> But no one makes sure that the very first communication works
> during probe. Thus the extra wait.
>
Hi Andi,
In the call flow invoked by intel_pcode_init(), I've added brief 
comments where further clarification is needed in this scenario, and a 
description of the suspicious scenario at the bottom.

-------------------------------------------------------------------------
intel_pcode_init()
  |
  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
                        DG1_UNCORE_GET_INIT_STATUS,
                        DG1_UNCORE_INIT_STATUS_COMPLETE,
                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
        |
        +-> skl_pcode_try_request()
              |
              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
                                            500, 0, true);

-------------------------------------------------------------------------
static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
		          u32 *val, u32 *val1,
			  int fast_timeout_us, int slow_timeout_ms,
			  bool is_read)
{
...
         /* Before writing a value to the GEN6_PCODE_DATA register,
            check if the bit in the GEN6_PCODE_MAILBOX register indicates
            BUSY. */
	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
		return -EAGAIN;

         /* write value to GEN6_PCODE_DATA register */
	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);

	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);

         /* In this scenario, the value
            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
            is written to the GEN6_PCODE_MAILBOX register,
            so that the Busy status of the GEN6_PCODE_MAILBOX register
            can be checked later.
            (When the value of the GEN6_PCODE_READY bit of the
             GEN6_PCODE_MAILBOX register changes to 0, the operation can
             be considered completed.) */
	intel_uncore_write_fw(uncore,
			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);

         /* In this scenario, verify that the BUSY status bit in the
            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
	if (__intel_wait_for_register_fw(uncore,
					 GEN6_PCODE_MAILBOX,
					 GEN6_PCODE_READY, 0,
					 fast_timeout_us,
					 slow_timeout_ms,
					 &mbox))
		return -ETIMEDOUT;
         /* If there is a failure here, it may be considered that the
            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
            completed within 500us */
...
}

int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
		      u32 reply_mask, u32 reply, int timeout_base_ms)
{
	u32 status;
	int ret;

	mutex_lock(&uncore->i915->sb_lock);

#define COND \
	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)

         /* the first trial for skl_pcode_try_request() can return
            -EAGAIN or -ETIMEDOUT. And the code did not check the error
            code here, so we don't know how far the __snb_pcode_rw()
            function went. It is not known whether the pcode_mailbox
            status was busy before writing the value to the
            GEN6_PCODE_DATA register or after.*/
	if (COND) {
		ret = 0;
		goto out;
	}

         /* In this scenario, skl_pcode_try_request() is invoked every
            10us for 180 seconds. When skl_pcode_try_request() returns
            -EAGAIN and -ETIMEDOUT by _wait_for(),
            -ETIMEDOUT is returned to a variable ret. */

	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);

	if (!ret)
		goto out;

	/*
	 * The above can time out if the number of requests was low (2 in the
	 * worst case) _and_ PCODE was busy for some reason even after a
	 * (queued) request and @timeout_base_ms delay. As a workaround retry
	 * the poll with preemption disabled to maximize the number of
	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
	 * account for interrupts that could reduce the number of these
	 * requests, and for any quirks of the PCODE firmware that delays
	 * the request completion.
	 */
	drm_dbg_kms(&uncore->i915->drm,
		    "PCODE timeout, retrying with preemption disabled\n");
	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
	preempt_disable();
	ret = wait_for_atomic(COND, 50);

	preempt_enable();

out:
	mutex_unlock(&uncore->i915->sb_lock);
	return status ? status : ret;
#undef COND
}

---------------------------------------------------------

If you try skl_pcode_try_request() for 180 seconds in 
skl_pcode_request(), and the first "intel_uncore_read_fw(uncore, 
GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)" call in __snb_pcode_rw() that 
skl_pcode_try_request() invokes always fails. if then it does not make 
sense to me why this patch fixes it by just waiting 10 seconds.This is 
because if it was called with the flow described above, 180 seconds is 
longer than 10 seconds, so the scenario you mentioned is also covered in 
the existing code.

To describe in more detail the second scenario I previously commented on:
skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is 
not BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
  2) skl_pcode_try_request() returns -ETIMEDOUT value because the 
operation of 1) does not complete within 500us.
  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as 
BUSY and returns -EAGAIN in the last call of skl_pcode_try_request() 
invoked by skl_pcode_request()

If the reason why this problem occurred is because of this scenario, 
shouldn't there be an attempt to increase fast_timeout_us used as an 
argument of __snb_pcode_rw() to 500us or more when 
skl_pcode_try_request() returns -ETIMEDOUT?

Br,
G.G.
>> In the situation where the problem is reported, it is not possible to
>> confirm exactly which scenario code causes the problem with the current log
>> information, and it seems that additional analysis is needed to confirm it.
>> If the hardware takes more than 500us to succeed after PCODE_MAILBOX write
>> under certain circumstances, it is thought that the root problem causing the
>> problem should be fixed.
> 
> Actually I have tested it and indeed the extra check on the
> PCODE_READY is needed and makes the driver probe correctly.
> 
> Thanks,
> Andi
Andi Shyti Jan. 30, 2023, 8:48 a.m. UTC | #11
Hi GG,

thanks for the deep analysis!

> Hi Andi,
> In the call flow invoked by intel_pcode_init(), I've added brief comments
> where further clarification is needed in this scenario, and a description of
> the suspicious scenario at the bottom.
> 
> -------------------------------------------------------------------------
> intel_pcode_init()
>  |
>  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
>                        DG1_UNCORE_GET_INIT_STATUS,
>                        DG1_UNCORE_INIT_STATUS_COMPLETE,
>                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>        |
>        +-> skl_pcode_try_request()
>              |
>              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
>                                            500, 0, true);
> 
> -------------------------------------------------------------------------
> static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> 		          u32 *val, u32 *val1,
> 			  int fast_timeout_us, int slow_timeout_ms,
> 			  bool is_read)
> {
> ...
>         /* Before writing a value to the GEN6_PCODE_DATA register,
>            check if the bit in the GEN6_PCODE_MAILBOX register indicates
>            BUSY. */
> 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> 		return -EAGAIN;

what if we fail here because the punit is still initializing and
will be ready, say, in 10 seconds?

GG, without going any further, we fail here! The -EAGAIN we
receive from the test comes from this point. We don't fail with
-ETIMEDOUT, but with -EAGAIN and the reason is because the punit
is not ready to perform the very fist communication and we fail
the probing.

It doesn't mean, though, that there is anything wrong, we just
need to wait a bit before "taking drastic decisions"!

This patch is suggesting to wait up to 10s for the punit to be
ready and eventually try to probe again... and, indeed, it works!

Andi

> 
>         /* write value to GEN6_PCODE_DATA register */
> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> 
> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> 
>         /* In this scenario, the value
>            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
>            is written to the GEN6_PCODE_MAILBOX register,
>            so that the Busy status of the GEN6_PCODE_MAILBOX register
>            can be checked later.
>            (When the value of the GEN6_PCODE_READY bit of the
>             GEN6_PCODE_MAILBOX register changes to 0, the operation can
>             be considered completed.) */
> 	intel_uncore_write_fw(uncore,
> 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> 
>         /* In this scenario, verify that the BUSY status bit in the
>            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> 	if (__intel_wait_for_register_fw(uncore,
> 					 GEN6_PCODE_MAILBOX,
> 					 GEN6_PCODE_READY, 0,
> 					 fast_timeout_us,
> 					 slow_timeout_ms,
> 					 &mbox))
> 		return -ETIMEDOUT;
>         /* If there is a failure here, it may be considered that the
>            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
>            completed within 500us */
> ...
> }
> 
> int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> {
> 	u32 status;
> 	int ret;
> 
> 	mutex_lock(&uncore->i915->sb_lock);
> 
> #define COND \
> 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> 
>         /* the first trial for skl_pcode_try_request() can return
>            -EAGAIN or -ETIMEDOUT. And the code did not check the error
>            code here, so we don't know how far the __snb_pcode_rw()
>            function went. It is not known whether the pcode_mailbox
>            status was busy before writing the value to the
>            GEN6_PCODE_DATA register or after.*/
> 	if (COND) {
> 		ret = 0;
> 		goto out;
> 	}
> 
>         /* In this scenario, skl_pcode_try_request() is invoked every
>            10us for 180 seconds. When skl_pcode_try_request() returns
>            -EAGAIN and -ETIMEDOUT by _wait_for(),
>            -ETIMEDOUT is returned to a variable ret. */
> 
> 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> 
> 	if (!ret)
> 		goto out;
> 
> 	/*
> 	 * The above can time out if the number of requests was low (2 in the
> 	 * worst case) _and_ PCODE was busy for some reason even after a
> 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> 	 * the poll with preemption disabled to maximize the number of
> 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> 	 * account for interrupts that could reduce the number of these
> 	 * requests, and for any quirks of the PCODE firmware that delays
> 	 * the request completion.
> 	 */
> 	drm_dbg_kms(&uncore->i915->drm,
> 		    "PCODE timeout, retrying with preemption disabled\n");
> 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> 	preempt_disable();
> 	ret = wait_for_atomic(COND, 50);
> 
> 	preempt_enable();
> 
> out:
> 	mutex_unlock(&uncore->i915->sb_lock);
> 	return status ? status : ret;
> #undef COND
> }
> 
> ---------------------------------------------------------
> 
> If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> invokes always fails. if then it does not make sense to me why this patch
> fixes it by just waiting 10 seconds.This is because if it was called with
> the flow described above, 180 seconds is longer than 10 seconds, so the
> scenario you mentioned is also covered in the existing code.
> 
> To describe in more detail the second scenario I previously commented on:
> skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
>  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
>  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> of 1) does not complete within 500us.
>  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> skl_pcode_request()
> 
> If the reason why this problem occurred is because of this scenario,
> shouldn't there be an attempt to increase fast_timeout_us used as an
> argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> returns -ETIMEDOUT?
> 
> Br,
> G.G.
Rodrigo Vivi Jan. 30, 2023, 2:41 p.m. UTC | #12
On Mon, Jan 30, 2023 at 09:48:31AM +0100, Andi Shyti wrote:
> Hi GG,
> 
> thanks for the deep analysis!
> 
> > Hi Andi,
> > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > where further clarification is needed in this scenario, and a description of
> > the suspicious scenario at the bottom.
> > 
> > -------------------------------------------------------------------------
> > intel_pcode_init()
> >  |
> >  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> >                        DG1_UNCORE_GET_INIT_STATUS,
> >                        DG1_UNCORE_INIT_STATUS_COMPLETE,
> >                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> >        |
> >        +-> skl_pcode_try_request()
> >              |
> >              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> >                                            500, 0, true);
> > 
> > -------------------------------------------------------------------------
> > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > 		          u32 *val, u32 *val1,
> > 			  int fast_timeout_us, int slow_timeout_ms,
> > 			  bool is_read)
> > {
> > ...
> >         /* Before writing a value to the GEN6_PCODE_DATA register,
> >            check if the bit in the GEN6_PCODE_MAILBOX register indicates
> >            BUSY. */
> > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > 		return -EAGAIN;
> 
> what if we fail here because the punit is still initializing and
> will be ready, say, in 10 seconds?
> 
> GG, without going any further, we fail here! The -EAGAIN we
> receive from the test comes from this point. We don't fail with
> -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> is not ready to perform the very fist communication and we fail
> the probing.
> 
> It doesn't mean, though, that there is anything wrong, we just
> need to wait a bit before "taking drastic decisions"!
> 
> This patch is suggesting to wait up to 10s for the punit to be
> ready and eventually try to probe again... and, indeed, it works!

As GG, what I still don't understand is how this extra 10 seconds
wait helps... have you tried to simple add the 10 to the 180 and
make the code 190 sec instead?

> 
> Andi
> 
> > 
> >         /* write value to GEN6_PCODE_DATA register */
> > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > 
> > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > 
> >         /* In this scenario, the value
> >            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> >            is written to the GEN6_PCODE_MAILBOX register,
> >            so that the Busy status of the GEN6_PCODE_MAILBOX register
> >            can be checked later.
> >            (When the value of the GEN6_PCODE_READY bit of the
> >             GEN6_PCODE_MAILBOX register changes to 0, the operation can
> >             be considered completed.) */
> > 	intel_uncore_write_fw(uncore,
> > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > 
> >         /* In this scenario, verify that the BUSY status bit in the
> >            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > 	if (__intel_wait_for_register_fw(uncore,
> > 					 GEN6_PCODE_MAILBOX,
> > 					 GEN6_PCODE_READY, 0,
> > 					 fast_timeout_us,
> > 					 slow_timeout_ms,
> > 					 &mbox))
> > 		return -ETIMEDOUT;
> >         /* If there is a failure here, it may be considered that the
> >            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> >            completed within 500us */
> > ...
> > }
> > 
> > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > {
> > 	u32 status;
> > 	int ret;
> > 
> > 	mutex_lock(&uncore->i915->sb_lock);
> > 
> > #define COND \
> > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > 
> >         /* the first trial for skl_pcode_try_request() can return
> >            -EAGAIN or -ETIMEDOUT. And the code did not check the error
> >            code here, so we don't know how far the __snb_pcode_rw()
> >            function went. It is not known whether the pcode_mailbox
> >            status was busy before writing the value to the
> >            GEN6_PCODE_DATA register or after.*/
> > 	if (COND) {
> > 		ret = 0;
> > 		goto out;
> > 	}
> > 
> >         /* In this scenario, skl_pcode_try_request() is invoked every
> >            10us for 180 seconds. When skl_pcode_try_request() returns
> >            -EAGAIN and -ETIMEDOUT by _wait_for(),
> >            -ETIMEDOUT is returned to a variable ret. */
> > 
> > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > 
> > 	if (!ret)
> > 		goto out;
> > 
> > 	/*
> > 	 * The above can time out if the number of requests was low (2 in the
> > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > 	 * the poll with preemption disabled to maximize the number of
> > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > 	 * account for interrupts that could reduce the number of these
> > 	 * requests, and for any quirks of the PCODE firmware that delays
> > 	 * the request completion.
> > 	 */
> > 	drm_dbg_kms(&uncore->i915->drm,
> > 		    "PCODE timeout, retrying with preemption disabled\n");
> > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > 	preempt_disable();
> > 	ret = wait_for_atomic(COND, 50);
> > 
> > 	preempt_enable();
> > 
> > out:
> > 	mutex_unlock(&uncore->i915->sb_lock);
> > 	return status ? status : ret;
> > #undef COND
> > }
> > 
> > ---------------------------------------------------------
> > 
> > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > invokes always fails. if then it does not make sense to me why this patch
> > fixes it by just waiting 10 seconds.This is because if it was called with
> > the flow described above, 180 seconds is longer than 10 seconds, so the
> > scenario you mentioned is also covered in the existing code.
> > 
> > To describe in more detail the second scenario I previously commented on:
> > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> >  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> >  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > of 1) does not complete within 500us.
> >  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > skl_pcode_request()
> > 
> > If the reason why this problem occurred is because of this scenario,
> > shouldn't there be an attempt to increase fast_timeout_us used as an
> > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > returns -ETIMEDOUT?
> > 
> > Br,
> > G.G.
Andi Shyti Jan. 30, 2023, 4:12 p.m. UTC | #13
Hi Rodrigo,

> > > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > > where further clarification is needed in this scenario, and a description of
> > > the suspicious scenario at the bottom.
> > > 
> > > -------------------------------------------------------------------------
> > > intel_pcode_init()
> > >  |
> > >  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > >                        DG1_UNCORE_GET_INIT_STATUS,
> > >                        DG1_UNCORE_INIT_STATUS_COMPLETE,
> > >                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > >        |
> > >        +-> skl_pcode_try_request()
> > >              |
> > >              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> > >                                            500, 0, true);
> > > 
> > > -------------------------------------------------------------------------
> > > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > > 		          u32 *val, u32 *val1,
> > > 			  int fast_timeout_us, int slow_timeout_ms,
> > > 			  bool is_read)
> > > {
> > > ...
> > >         /* Before writing a value to the GEN6_PCODE_DATA register,
> > >            check if the bit in the GEN6_PCODE_MAILBOX register indicates
> > >            BUSY. */
> > > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > 		return -EAGAIN;
> > 
> > what if we fail here because the punit is still initializing and
> > will be ready, say, in 10 seconds?
> > 
> > GG, without going any further, we fail here! The -EAGAIN we
> > receive from the test comes from this point. We don't fail with
> > -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> > is not ready to perform the very fist communication and we fail
> > the probing.
> > 
> > It doesn't mean, though, that there is anything wrong, we just
> > need to wait a bit before "taking drastic decisions"!
> > 
> > This patch is suggesting to wait up to 10s for the punit to be
> > ready and eventually try to probe again... and, indeed, it works!
> 
> As GG, what I still don't understand is how this extra 10 seconds
> wait helps... have you tried to simple add the 10 to the 180 and
> make the code 190 sec instead?

maybe I haven't been able to explain the issue properly.

I can even set that timer to 2hrs and a half and nothing changes
because we fail before.

Here it's not a matter of how much do I wait but when do I check
the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
in the GEN6_PCODE_MAILBOX register).

During a normal run we are always sure that communicating with
the punit works, because we made it sure during the previous
transaction.

During probe there is no previous transaction and we start
communicating with the punit without making sure that it is
ready. And indeed some times it is not, so that we suppress the
probing on purpose instead of giving it another chance.

I admit that the commit message is not written properly and
rather misleading, but here it's not at all a matter of how much
do I wait.

Thanks, Rodrigo!
Andi

> > 
> > Andi
> > 
> > > 
> > >         /* write value to GEN6_PCODE_DATA register */
> > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > > 
> > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > > 
> > >         /* In this scenario, the value
> > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> > >            is written to the GEN6_PCODE_MAILBOX register,
> > >            so that the Busy status of the GEN6_PCODE_MAILBOX register
> > >            can be checked later.
> > >            (When the value of the GEN6_PCODE_READY bit of the
> > >             GEN6_PCODE_MAILBOX register changes to 0, the operation can
> > >             be considered completed.) */
> > > 	intel_uncore_write_fw(uncore,
> > > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > > 
> > >         /* In this scenario, verify that the BUSY status bit in the
> > >            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > > 	if (__intel_wait_for_register_fw(uncore,
> > > 					 GEN6_PCODE_MAILBOX,
> > > 					 GEN6_PCODE_READY, 0,
> > > 					 fast_timeout_us,
> > > 					 slow_timeout_ms,
> > > 					 &mbox))
> > > 		return -ETIMEDOUT;
> > >         /* If there is a failure here, it may be considered that the
> > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> > >            completed within 500us */
> > > ...
> > > }
> > > 
> > > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > > {
> > > 	u32 status;
> > > 	int ret;
> > > 
> > > 	mutex_lock(&uncore->i915->sb_lock);
> > > 
> > > #define COND \
> > > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > > 
> > >         /* the first trial for skl_pcode_try_request() can return
> > >            -EAGAIN or -ETIMEDOUT. And the code did not check the error
> > >            code here, so we don't know how far the __snb_pcode_rw()
> > >            function went. It is not known whether the pcode_mailbox
> > >            status was busy before writing the value to the
> > >            GEN6_PCODE_DATA register or after.*/
> > > 	if (COND) {
> > > 		ret = 0;
> > > 		goto out;
> > > 	}
> > > 
> > >         /* In this scenario, skl_pcode_try_request() is invoked every
> > >            10us for 180 seconds. When skl_pcode_try_request() returns
> > >            -EAGAIN and -ETIMEDOUT by _wait_for(),
> > >            -ETIMEDOUT is returned to a variable ret. */
> > > 
> > > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > > 
> > > 	if (!ret)
> > > 		goto out;
> > > 
> > > 	/*
> > > 	 * The above can time out if the number of requests was low (2 in the
> > > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > > 	 * the poll with preemption disabled to maximize the number of
> > > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > > 	 * account for interrupts that could reduce the number of these
> > > 	 * requests, and for any quirks of the PCODE firmware that delays
> > > 	 * the request completion.
> > > 	 */
> > > 	drm_dbg_kms(&uncore->i915->drm,
> > > 		    "PCODE timeout, retrying with preemption disabled\n");
> > > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > > 	preempt_disable();
> > > 	ret = wait_for_atomic(COND, 50);
> > > 
> > > 	preempt_enable();
> > > 
> > > out:
> > > 	mutex_unlock(&uncore->i915->sb_lock);
> > > 	return status ? status : ret;
> > > #undef COND
> > > }
> > > 
> > > ---------------------------------------------------------
> > > 
> > > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > > invokes always fails. if then it does not make sense to me why this patch
> > > fixes it by just waiting 10 seconds.This is because if it was called with
> > > the flow described above, 180 seconds is longer than 10 seconds, so the
> > > scenario you mentioned is also covered in the existing code.
> > > 
> > > To describe in more detail the second scenario I previously commented on:
> > > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> > >  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> > >  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > > of 1) does not complete within 500us.
> > >  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > > skl_pcode_request()
> > > 
> > > If the reason why this problem occurred is because of this scenario,
> > > shouldn't there be an attempt to increase fast_timeout_us used as an
> > > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > > returns -ETIMEDOUT?
> > > 
> > > Br,
> > > G.G.
Rodrigo Vivi Jan. 30, 2023, 4:37 p.m. UTC | #14
On Mon, Jan 30, 2023 at 05:12:48PM +0100, Andi Shyti wrote:
> Hi Rodrigo,
> 
> > > > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > > > where further clarification is needed in this scenario, and a description of
> > > > the suspicious scenario at the bottom.
> > > > 
> > > > -------------------------------------------------------------------------
> > > > intel_pcode_init()
> > > >  |
> > > >  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > >                        DG1_UNCORE_GET_INIT_STATUS,
> > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > >        |
> > > >        +-> skl_pcode_try_request()
> > > >              |
> > > >              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> > > >                                            500, 0, true);
> > > > 
> > > > -------------------------------------------------------------------------
> > > > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > > > 		          u32 *val, u32 *val1,
> > > > 			  int fast_timeout_us, int slow_timeout_ms,
> > > > 			  bool is_read)
> > > > {
> > > > ...
> > > >         /* Before writing a value to the GEN6_PCODE_DATA register,
> > > >            check if the bit in the GEN6_PCODE_MAILBOX register indicates
> > > >            BUSY. */
> > > > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > > 		return -EAGAIN;
> > > 
> > > what if we fail here because the punit is still initializing and
> > > will be ready, say, in 10 seconds?
> > > 
> > > GG, without going any further, we fail here! The -EAGAIN we
> > > receive from the test comes from this point. We don't fail with
> > > -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> > > is not ready to perform the very fist communication and we fail
> > > the probing.
> > > 
> > > It doesn't mean, though, that there is anything wrong, we just
> > > need to wait a bit before "taking drastic decisions"!
> > > 
> > > This patch is suggesting to wait up to 10s for the punit to be
> > > ready and eventually try to probe again... and, indeed, it works!
> > 
> > As GG, what I still don't understand is how this extra 10 seconds
> > wait helps... have you tried to simple add the 10 to the 180 and
> > make the code 190 sec instead?
> 
> maybe I haven't been able to explain the issue properly.
> 
> I can even set that timer to 2hrs and a half and nothing changes
> because we fail before.
> 
> Here it's not a matter of how much do I wait but when do I check
> the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
> in the GEN6_PCODE_MAILBOX register).
> 
> During a normal run we are always sure that communicating with
> the punit works, because we made it sure during the previous
> transaction.
> 
> During probe there is no previous transaction and we start
> communicating with the punit without making sure that it is
> ready. And indeed some times it is not, so that we suppress the
> probing on purpose instead of giving it another chance.
> 
> I admit that the commit message is not written properly and
> rather misleading, but here it's not at all a matter of how much
> do I wait.

The commit message was initially confused because it looks like
we are just adding a wait, without doing anything....

But looking to the code we can see that it will wait until
pcode is ready with a timeout of 10 seconds.

But if pcode is ready in 10 seconds, why pcode is not ready
in 190 seconds. We are doing absolutely nothing more that could
make pcode ready in 10 seconds that won't be in 190.

This is what we are missing here... The code as is doesn't make
a lot of sense to us and it looks like it is solving the issue
by the 10 extra seconds and not by some special status checking.

> 
> Thanks, Rodrigo!
> Andi
> 
> > > 
> > > Andi
> > > 
> > > > 
> > > >         /* write value to GEN6_PCODE_DATA register */
> > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > > > 
> > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > > > 
> > > >         /* In this scenario, the value
> > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> > > >            is written to the GEN6_PCODE_MAILBOX register,
> > > >            so that the Busy status of the GEN6_PCODE_MAILBOX register
> > > >            can be checked later.
> > > >            (When the value of the GEN6_PCODE_READY bit of the
> > > >             GEN6_PCODE_MAILBOX register changes to 0, the operation can
> > > >             be considered completed.) */
> > > > 	intel_uncore_write_fw(uncore,
> > > > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > > > 
> > > >         /* In this scenario, verify that the BUSY status bit in the
> > > >            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > > > 	if (__intel_wait_for_register_fw(uncore,
> > > > 					 GEN6_PCODE_MAILBOX,
> > > > 					 GEN6_PCODE_READY, 0,
> > > > 					 fast_timeout_us,
> > > > 					 slow_timeout_ms,
> > > > 					 &mbox))
> > > > 		return -ETIMEDOUT;
> > > >         /* If there is a failure here, it may be considered that the
> > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> > > >            completed within 500us */
> > > > ...
> > > > }
> > > > 
> > > > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > > > {
> > > > 	u32 status;
> > > > 	int ret;
> > > > 
> > > > 	mutex_lock(&uncore->i915->sb_lock);
> > > > 
> > > > #define COND \
> > > > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > > > 
> > > >         /* the first trial for skl_pcode_try_request() can return
> > > >            -EAGAIN or -ETIMEDOUT. And the code did not check the error
> > > >            code here, so we don't know how far the __snb_pcode_rw()
> > > >            function went. It is not known whether the pcode_mailbox
> > > >            status was busy before writing the value to the
> > > >            GEN6_PCODE_DATA register or after.*/
> > > > 	if (COND) {
> > > > 		ret = 0;
> > > > 		goto out;
> > > > 	}
> > > > 
> > > >         /* In this scenario, skl_pcode_try_request() is invoked every
> > > >            10us for 180 seconds. When skl_pcode_try_request() returns
> > > >            -EAGAIN and -ETIMEDOUT by _wait_for(),
> > > >            -ETIMEDOUT is returned to a variable ret. */
> > > > 
> > > > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > > > 
> > > > 	if (!ret)
> > > > 		goto out;
> > > > 
> > > > 	/*
> > > > 	 * The above can time out if the number of requests was low (2 in the
> > > > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > > > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > > > 	 * the poll with preemption disabled to maximize the number of
> > > > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > > > 	 * account for interrupts that could reduce the number of these
> > > > 	 * requests, and for any quirks of the PCODE firmware that delays
> > > > 	 * the request completion.
> > > > 	 */
> > > > 	drm_dbg_kms(&uncore->i915->drm,
> > > > 		    "PCODE timeout, retrying with preemption disabled\n");
> > > > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > > > 	preempt_disable();
> > > > 	ret = wait_for_atomic(COND, 50);
> > > > 
> > > > 	preempt_enable();
> > > > 
> > > > out:
> > > > 	mutex_unlock(&uncore->i915->sb_lock);
> > > > 	return status ? status : ret;
> > > > #undef COND
> > > > }
> > > > 
> > > > ---------------------------------------------------------
> > > > 
> > > > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > > > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > > > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > > > invokes always fails. if then it does not make sense to me why this patch
> > > > fixes it by just waiting 10 seconds.This is because if it was called with
> > > > the flow described above, 180 seconds is longer than 10 seconds, so the
> > > > scenario you mentioned is also covered in the existing code.
> > > > 
> > > > To describe in more detail the second scenario I previously commented on:
> > > > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> > > >  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > > > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> > > >  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > > > of 1) does not complete within 500us.
> > > >  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > > > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > > > skl_pcode_request()
> > > > 
> > > > If the reason why this problem occurred is because of this scenario,
> > > > shouldn't there be an attempt to increase fast_timeout_us used as an
> > > > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > > > returns -ETIMEDOUT?
> > > > 
> > > > Br,
> > > > G.G.
Rodrigo Vivi Jan. 30, 2023, 5:30 p.m. UTC | #15
On Mon, Jan 30, 2023 at 11:37:29AM -0500, Rodrigo Vivi wrote:
> On Mon, Jan 30, 2023 at 05:12:48PM +0100, Andi Shyti wrote:
> > Hi Rodrigo,
> > 
> > > > > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > > > > where further clarification is needed in this scenario, and a description of
> > > > > the suspicious scenario at the bottom.
> > > > > 
> > > > > -------------------------------------------------------------------------
> > > > > intel_pcode_init()
> > > > >  |
> > > > >  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > > >                        DG1_UNCORE_GET_INIT_STATUS,
> > > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > > >        |
> > > > >        +-> skl_pcode_try_request()
> > > > >              |
> > > > >              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> > > > >                                            500, 0, true);
> > > > > 
> > > > > -------------------------------------------------------------------------
> > > > > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > > > > 		          u32 *val, u32 *val1,
> > > > > 			  int fast_timeout_us, int slow_timeout_ms,
> > > > > 			  bool is_read)
> > > > > {
> > > > > ...
> > > > >         /* Before writing a value to the GEN6_PCODE_DATA register,
> > > > >            check if the bit in the GEN6_PCODE_MAILBOX register indicates
> > > > >            BUSY. */
> > > > > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > > > 		return -EAGAIN;
> > > > 
> > > > what if we fail here because the punit is still initializing and
> > > > will be ready, say, in 10 seconds?
> > > > 
> > > > GG, without going any further, we fail here! The -EAGAIN we
> > > > receive from the test comes from this point. We don't fail with
> > > > -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> > > > is not ready to perform the very fist communication and we fail
> > > > the probing.
> > > > 
> > > > It doesn't mean, though, that there is anything wrong, we just
> > > > need to wait a bit before "taking drastic decisions"!
> > > > 
> > > > This patch is suggesting to wait up to 10s for the punit to be
> > > > ready and eventually try to probe again... and, indeed, it works!
> > > 
> > > As GG, what I still don't understand is how this extra 10 seconds
> > > wait helps... have you tried to simple add the 10 to the 180 and
> > > make the code 190 sec instead?
> > 
> > maybe I haven't been able to explain the issue properly.
> > 
> > I can even set that timer to 2hrs and a half and nothing changes
> > because we fail before.
> > 
> > Here it's not a matter of how much do I wait but when do I check
> > the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
> > in the GEN6_PCODE_MAILBOX register).
> > 
> > During a normal run we are always sure that communicating with
> > the punit works, because we made it sure during the previous
> > transaction.
> > 
> > During probe there is no previous transaction and we start
> > communicating with the punit without making sure that it is
> > ready. And indeed some times it is not, so that we suppress the
> > probing on purpose instead of giving it another chance.
> > 
> > I admit that the commit message is not written properly and
> > rather misleading, but here it's not at all a matter of how much
> > do I wait.
> 
> The commit message was initially confused because it looks like
> we are just adding a wait, without doing anything....
> 
> But looking to the code we can see that it will wait until
> pcode is ready with a timeout of 10 seconds.
> 
> But if pcode is ready in 10 seconds, why pcode is not ready
> in 190 seconds. We are doing absolutely nothing more that could
> make pcode ready in 10 seconds that won't be in 190.
> 
> This is what we are missing here... The code as is doesn't make
> a lot of sense to us and it looks like it is solving the issue
> by the 10 extra seconds and not by some special status checking.

Okay, after an offline talk I am convinced now that we need some
check like this in some place.

But the commit message needs be be fully re-written.

It needs to be clear that underneath, the pcode communication
functions will do a check for ready without any wait, what will
make desired timeout to never really wait for the pcode done
and prematurely return.

at __snb_pcode_rw():

 if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
                return -EAGAIN;

So, for this reason we need to ensure that pcode is really ready
before we wait.

Other options are to handle the EAGAIN return and then wait.
Or even change the __snb_pcode_rw to ensure that it is ready...

Something like:

-       if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
-               return -EAGAIN;
+       if (__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX,
+                                        GEN6_PCODE_READY, GEN6_PCODE_READY,
+                                        fast_timeout_us,
+                                        slow_timeout_ms,
+                                        NULL))
+               return -ETIMEDOUT;

In the best regular case it will not change anything since the wait
for will return 0 immediatelly... in the worst case we would double
the timeout value, but this would be only in this insane case of
180 seconds anyway and if it goes really really nasty...

But anyway, now the motivation and the source of the issue is pretty
clear, I can add my rv-b there, but I really want a better commit msg
at least...

> 
> > 
> > Thanks, Rodrigo!
> > Andi
> > 
> > > > 
> > > > Andi
> > > > 
> > > > > 
> > > > >         /* write value to GEN6_PCODE_DATA register */
> > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > > > > 
> > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > > > > 
> > > > >         /* In this scenario, the value
> > > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> > > > >            is written to the GEN6_PCODE_MAILBOX register,
> > > > >            so that the Busy status of the GEN6_PCODE_MAILBOX register
> > > > >            can be checked later.
> > > > >            (When the value of the GEN6_PCODE_READY bit of the
> > > > >             GEN6_PCODE_MAILBOX register changes to 0, the operation can
> > > > >             be considered completed.) */
> > > > > 	intel_uncore_write_fw(uncore,
> > > > > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > > > > 
> > > > >         /* In this scenario, verify that the BUSY status bit in the
> > > > >            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > > > > 	if (__intel_wait_for_register_fw(uncore,
> > > > > 					 GEN6_PCODE_MAILBOX,
> > > > > 					 GEN6_PCODE_READY, 0,
> > > > > 					 fast_timeout_us,
> > > > > 					 slow_timeout_ms,
> > > > > 					 &mbox))
> > > > > 		return -ETIMEDOUT;
> > > > >         /* If there is a failure here, it may be considered that the
> > > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> > > > >            completed within 500us */
> > > > > ...
> > > > > }
> > > > > 
> > > > > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > > > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > > > > {
> > > > > 	u32 status;
> > > > > 	int ret;
> > > > > 
> > > > > 	mutex_lock(&uncore->i915->sb_lock);
> > > > > 
> > > > > #define COND \
> > > > > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > > > > 
> > > > >         /* the first trial for skl_pcode_try_request() can return
> > > > >            -EAGAIN or -ETIMEDOUT. And the code did not check the error
> > > > >            code here, so we don't know how far the __snb_pcode_rw()
> > > > >            function went. It is not known whether the pcode_mailbox
> > > > >            status was busy before writing the value to the
> > > > >            GEN6_PCODE_DATA register or after.*/
> > > > > 	if (COND) {
> > > > > 		ret = 0;
> > > > > 		goto out;
> > > > > 	}
> > > > > 
> > > > >         /* In this scenario, skl_pcode_try_request() is invoked every
> > > > >            10us for 180 seconds. When skl_pcode_try_request() returns
> > > > >            -EAGAIN and -ETIMEDOUT by _wait_for(),
> > > > >            -ETIMEDOUT is returned to a variable ret. */
> > > > > 
> > > > > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > > > > 
> > > > > 	if (!ret)
> > > > > 		goto out;
> > > > > 
> > > > > 	/*
> > > > > 	 * The above can time out if the number of requests was low (2 in the
> > > > > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > > > > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > > > > 	 * the poll with preemption disabled to maximize the number of
> > > > > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > > > > 	 * account for interrupts that could reduce the number of these
> > > > > 	 * requests, and for any quirks of the PCODE firmware that delays
> > > > > 	 * the request completion.
> > > > > 	 */
> > > > > 	drm_dbg_kms(&uncore->i915->drm,
> > > > > 		    "PCODE timeout, retrying with preemption disabled\n");
> > > > > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > > > > 	preempt_disable();
> > > > > 	ret = wait_for_atomic(COND, 50);
> > > > > 
> > > > > 	preempt_enable();
> > > > > 
> > > > > out:
> > > > > 	mutex_unlock(&uncore->i915->sb_lock);
> > > > > 	return status ? status : ret;
> > > > > #undef COND
> > > > > }
> > > > > 
> > > > > ---------------------------------------------------------
> > > > > 
> > > > > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > > > > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > > > > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > > > > invokes always fails. if then it does not make sense to me why this patch
> > > > > fixes it by just waiting 10 seconds.This is because if it was called with
> > > > > the flow described above, 180 seconds is longer than 10 seconds, so the
> > > > > scenario you mentioned is also covered in the existing code.
> > > > > 
> > > > > To describe in more detail the second scenario I previously commented on:
> > > > > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> > > > >  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > > > > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> > > > >  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > > > > of 1) does not complete within 500us.
> > > > >  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > > > > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > > > > skl_pcode_request()
> > > > > 
> > > > > If the reason why this problem occurred is because of this scenario,
> > > > > shouldn't there be an attempt to increase fast_timeout_us used as an
> > > > > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > > > > returns -ETIMEDOUT?
> > > > > 
> > > > > Br,
> > > > > G.G.
Andi Shyti Jan. 30, 2023, 8:41 p.m. UTC | #16
Hi Rodrigo,

first of all, thanks for looking into this!

> > > > > > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > > > > > where further clarification is needed in this scenario, and a description of
> > > > > > the suspicious scenario at the bottom.
> > > > > > 
> > > > > > -------------------------------------------------------------------------
> > > > > > intel_pcode_init()
> > > > > >  |
> > > > > >  +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > > > >                        DG1_UNCORE_GET_INIT_STATUS,
> > > > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > > > >                        DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > > > >        |
> > > > > >        +-> skl_pcode_try_request()
> > > > > >              |
> > > > > >              +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> > > > > >                                            500, 0, true);
> > > > > > 
> > > > > > -------------------------------------------------------------------------
> > > > > > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > > > > > 		          u32 *val, u32 *val1,
> > > > > > 			  int fast_timeout_us, int slow_timeout_ms,
> > > > > > 			  bool is_read)
> > > > > > {
> > > > > > ...
> > > > > >         /* Before writing a value to the GEN6_PCODE_DATA register,
> > > > > >            check if the bit in the GEN6_PCODE_MAILBOX register indicates
> > > > > >            BUSY. */
> > > > > > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > > > > 		return -EAGAIN;
> > > > > 
> > > > > what if we fail here because the punit is still initializing and
> > > > > will be ready, say, in 10 seconds?
> > > > > 
> > > > > GG, without going any further, we fail here! The -EAGAIN we
> > > > > receive from the test comes from this point. We don't fail with
> > > > > -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> > > > > is not ready to perform the very fist communication and we fail
> > > > > the probing.
> > > > > 
> > > > > It doesn't mean, though, that there is anything wrong, we just
> > > > > need to wait a bit before "taking drastic decisions"!
> > > > > 
> > > > > This patch is suggesting to wait up to 10s for the punit to be
> > > > > ready and eventually try to probe again... and, indeed, it works!
> > > > 
> > > > As GG, what I still don't understand is how this extra 10 seconds
> > > > wait helps... have you tried to simple add the 10 to the 180 and
> > > > make the code 190 sec instead?
> > > 
> > > maybe I haven't been able to explain the issue properly.
> > > 
> > > I can even set that timer to 2hrs and a half and nothing changes
> > > because we fail before.
> > > 
> > > Here it's not a matter of how much do I wait but when do I check
> > > the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
> > > in the GEN6_PCODE_MAILBOX register).
> > > 
> > > During a normal run we are always sure that communicating with
> > > the punit works, because we made it sure during the previous
> > > transaction.
> > > 
> > > During probe there is no previous transaction and we start
> > > communicating with the punit without making sure that it is
> > > ready. And indeed some times it is not, so that we suppress the
> > > probing on purpose instead of giving it another chance.
> > > 
> > > I admit that the commit message is not written properly and
> > > rather misleading, but here it's not at all a matter of how much
> > > do I wait.
> > 
> > The commit message was initially confused because it looks like
> > we are just adding a wait, without doing anything....
> > 
> > But looking to the code we can see that it will wait until
> > pcode is ready with a timeout of 10 seconds.
> > 
> > But if pcode is ready in 10 seconds, why pcode is not ready
> > in 190 seconds. We are doing absolutely nothing more that could
> > make pcode ready in 10 seconds that won't be in 190.
> > 
> > This is what we are missing here... The code as is doesn't make
> > a lot of sense to us and it looks like it is solving the issue
> > by the 10 extra seconds and not by some special status checking.
> 
> Okay, after an offline talk I am convinced now that we need some
> check like this in some place.
> 
> But the commit message needs be be fully re-written.
> 
> It needs to be clear that underneath, the pcode communication
> functions will do a check for ready without any wait, what will
> make desired timeout to never really wait for the pcode done
> and prematurely return.
> 
> at __snb_pcode_rw():
> 
>  if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>                 return -EAGAIN;
> 
> So, for this reason we need to ensure that pcode is really ready
> before we wait.
> 
> Other options are to handle the EAGAIN return and then wait.
> Or even change the __snb_pcode_rw to ensure that it is ready...
> 
> Something like:
> 
> -       if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> -               return -EAGAIN;
> +       if (__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX,
> +                                        GEN6_PCODE_READY, GEN6_PCODE_READY,
> +                                        fast_timeout_us,
> +                                        slow_timeout_ms,
> +                                        NULL))
> +               return -ETIMEDOUT;

This works, but the difference is that you are putting it on any
call to the __snb_pcode_rw(), even when not necessary.

Putting it in the schema we used in our offline chat, the
original implementation was:

 +-----------------------+
 | r/w -> check if ready |-> r/w -> cir -> r/w -> cir -> r/w -> ...
 +-----------------------+
   probe time

(where cir stands for "check if ready"). Because in the first
r/w we didn't check the status of the punit we added:

 +-------------------+
 | cir -> r/w -> cir |-> r/w -> cir -> r/w -> cir -> r/w -> ...
 +-------------------+
    probe time

So that we are sure that the first r/w works. What you are
suggesting is:

 +-------------------+
 | cir -> r/w -> cir |-> cir -> r/w -> cir -> cir -> r/w -> cir ->
 --------------------+
    probe time

As you can se we have two "check if ready" in a raw, which might
be a bit of an overkill, it doesn't hurt much but it would look
like:

  if (__intel_wait_for_register_fw())
  	return -EAGAIN;

  intel_uncore_write_fw......

  if (__intel_wait_for_register_fw())
  	return -TIMEDOUT;

and this for every single snb_pcode_r/w.

Besides some functions might need the first wait, some might not.
Check, for example icl_tc_cold_exit() where the -EAGAIN is
handled. With your version the retries would be quite ponderous.

I'm sorry, but I can't take your suggestion as it can have major
consequences along i915, it requires more changes and and it
needs to be carefully tested.

On top of that I like Ashutosh refactoring that is quite an
elegant way to gain more flexibility at boot time without any
further impact.

> In the best regular case it will not change anything since the wait
> for will return 0 immediatelly... in the worst case we would double
> the timeout value, but this would be only in this insane case of
> 180 seconds anyway and if it goes really really nasty...
> 
> But anyway, now the motivation and the source of the issue is pretty
> clear, I can add my rv-b there, but I really want a better commit msg
> at least...

I definitely need to make a better commit message :)

Thanks a lot,
Andi

> > 
> > > 
> > > Thanks, Rodrigo!
> > > Andi
> > > 
> > > > > 
> > > > > Andi
> > > > > 
> > > > > > 
> > > > > >         /* write value to GEN6_PCODE_DATA register */
> > > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > > > > > 
> > > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > > > > > 
> > > > > >         /* In this scenario, the value
> > > > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> > > > > >            is written to the GEN6_PCODE_MAILBOX register,
> > > > > >            so that the Busy status of the GEN6_PCODE_MAILBOX register
> > > > > >            can be checked later.
> > > > > >            (When the value of the GEN6_PCODE_READY bit of the
> > > > > >             GEN6_PCODE_MAILBOX register changes to 0, the operation can
> > > > > >             be considered completed.) */
> > > > > > 	intel_uncore_write_fw(uncore,
> > > > > > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > > > > > 
> > > > > >         /* In this scenario, verify that the BUSY status bit in the
> > > > > >            GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > > > > > 	if (__intel_wait_for_register_fw(uncore,
> > > > > > 					 GEN6_PCODE_MAILBOX,
> > > > > > 					 GEN6_PCODE_READY, 0,
> > > > > > 					 fast_timeout_us,
> > > > > > 					 slow_timeout_ms,
> > > > > > 					 &mbox))
> > > > > > 		return -ETIMEDOUT;
> > > > > >         /* If there is a failure here, it may be considered that the
> > > > > >            "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> > > > > >            completed within 500us */
> > > > > > ...
> > > > > > }
> > > > > > 
> > > > > > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > > > > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > > > > > {
> > > > > > 	u32 status;
> > > > > > 	int ret;
> > > > > > 
> > > > > > 	mutex_lock(&uncore->i915->sb_lock);
> > > > > > 
> > > > > > #define COND \
> > > > > > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > > > > > 
> > > > > >         /* the first trial for skl_pcode_try_request() can return
> > > > > >            -EAGAIN or -ETIMEDOUT. And the code did not check the error
> > > > > >            code here, so we don't know how far the __snb_pcode_rw()
> > > > > >            function went. It is not known whether the pcode_mailbox
> > > > > >            status was busy before writing the value to the
> > > > > >            GEN6_PCODE_DATA register or after.*/
> > > > > > 	if (COND) {
> > > > > > 		ret = 0;
> > > > > > 		goto out;
> > > > > > 	}
> > > > > > 
> > > > > >         /* In this scenario, skl_pcode_try_request() is invoked every
> > > > > >            10us for 180 seconds. When skl_pcode_try_request() returns
> > > > > >            -EAGAIN and -ETIMEDOUT by _wait_for(),
> > > > > >            -ETIMEDOUT is returned to a variable ret. */
> > > > > > 
> > > > > > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > > > > > 
> > > > > > 	if (!ret)
> > > > > > 		goto out;
> > > > > > 
> > > > > > 	/*
> > > > > > 	 * The above can time out if the number of requests was low (2 in the
> > > > > > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > > > > > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > > > > > 	 * the poll with preemption disabled to maximize the number of
> > > > > > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > > > > > 	 * account for interrupts that could reduce the number of these
> > > > > > 	 * requests, and for any quirks of the PCODE firmware that delays
> > > > > > 	 * the request completion.
> > > > > > 	 */
> > > > > > 	drm_dbg_kms(&uncore->i915->drm,
> > > > > > 		    "PCODE timeout, retrying with preemption disabled\n");
> > > > > > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > > > > > 	preempt_disable();
> > > > > > 	ret = wait_for_atomic(COND, 50);
> > > > > > 
> > > > > > 	preempt_enable();
> > > > > > 
> > > > > > out:
> > > > > > 	mutex_unlock(&uncore->i915->sb_lock);
> > > > > > 	return status ? status : ret;
> > > > > > #undef COND
> > > > > > }
> > > > > > 
> > > > > > ---------------------------------------------------------
> > > > > > 
> > > > > > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > > > > > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > > > > > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > > > > > invokes always fails. if then it does not make sense to me why this patch
> > > > > > fixes it by just waiting 10 seconds.This is because if it was called with
> > > > > > the flow described above, 180 seconds is longer than 10 seconds, so the
> > > > > > scenario you mentioned is also covered in the existing code.
> > > > > > 
> > > > > > To describe in more detail the second scenario I previously commented on:
> > > > > > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> > > > > >  1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > > > > > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> > > > > >  2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > > > > > of 1) does not complete within 500us.
> > > > > >  3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > > > > > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > > > > > skl_pcode_request()
> > > > > > 
> > > > > > If the reason why this problem occurred is because of this scenario,
> > > > > > shouldn't there be an attempt to increase fast_timeout_us used as an
> > > > > > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > > > > > returns -ETIMEDOUT?
> > > > > > 
> > > > > > Br,
> > > > > > G.G.
Gwan-gyeong Mun Feb. 2, 2023, 8:22 a.m. UTC | #17
Hi Andi,

You gave a lot of explanations, and confirmed that this patch solves the 
problem, but the root cause of this problem still seems to be unclear.

In the logs where this problem was reported, the logs were output in the 
following order.

link
  : 
https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324

[log]
<7> [268.276586] i915 0000:4d:00.0: [drm:skl_pcode_request [i915]] PCODE 
timeout, retrying with preemption disabled
...
<4> [268.276942] WARNING: CPU: 1 PID: 1678 at 
drivers/gpu/drm/i915/intel_pcode.c:196 skl_pcode_request+0x242/0x260 [i915]
...
<3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init 
failed -11


In other words, -EAGAIN was not returned when the first 
skl_pcode_try_request() was executed in the skl_pcode_request() 
function, but the logs confirmed that the _wait_for(COND ...) of the 
skl_pcode_request() function was executed.

And, I reviewed bpsec in more detail to see if I missed anything related 
to the status and restrictions of the initial GTDRIVER_MAILBOX_INTERFACE 
(PCODE).

I've reviewed bpsec in detail for the two below where 
GTDRIVER_MAILBOX_INTERFACE is mentioned, but I haven't identified any 
additional constraints other than those described below.


Pcode Driver Mailbox Spec: 34151
Driver Initialization Programming Sequence: 33149

RUN_BUSY: SW may write to the two mailbox registers only when RUN_BUSY 
is cleared (0b)

My suggestion is that the HW should explain and analyze the root cause 
of the situation where it takes more than 180s for the Busy bit of 
GTDRIVER_MAILBOX_INTERFACE to change to 0.

And the method of the proposed patch, the newly added line 
"__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX, 
GEN6_PCODE_READY, 0, 500, 10000, NULL))" succeeded, but the existing 
method of trying 180s with checking intel_uncore_read_fw(uncore, 
GEN6_PCODE_MAILBOX)&GEN6_PCODE_READY) failed. Therefore please review it 
again.
(Checking the code, it appears that both call usleep_range(wait__, 
wait__ * 2); in __wait_for().)

Br,

G.G.

int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
		      u32 reply_mask, u32 reply, int timeout_base_ms)
{
	u32 status;
	int ret;

	mutex_lock(&uncore->i915->sb_lock);

#define COND \
	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)

	/*
	 * Prime the PCODE by doing a request first. Normally it guarantees
	 * that a subsequent request, at most @timeout_base_ms later, succeeds.
	 * _wait_for() doesn't guarantee when its passed condition is evaluated
	 * first, so send the first request explicitly.
	 */
	if (COND) {
		ret = 0;
		goto out;
	}
	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
	if (!ret)
		goto out;

	/*
	 * The above can time out if the number of requests was low (2 in the
	 * worst case) _and_ PCODE was busy for some reason even after a
	 * (queued) request and @timeout_base_ms delay. As a workaround retry
	 * the poll with preemption disabled to maximize the number of
	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
	 * account for interrupts that could reduce the number of these
	 * requests, and for any quirks of the PCODE firmware that delays
	 * the request completion.
	 */
	drm_dbg_kms(&uncore->i915->drm,
		    "PCODE timeout, retrying with preemption disabled\n");
	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
	preempt_disable();
	ret = wait_for_atomic(COND, 50);
	preempt_enable();

out:
	mutex_unlock(&uncore->i915->sb_lock);
	return status ? status : ret;
#undef COND
}






On 1/30/23 10:41 PM, Andi Shyti wrote:
> Hi Rodrigo,
> 
> first of all, thanks for looking into this!
> 
>>>>>>> In the call flow invoked by intel_pcode_init(), I've added brief comments
>>>>>>> where further clarification is needed in this scenario, and a description of
>>>>>>> the suspicious scenario at the bottom.
>>>>>>>
>>>>>>> -------------------------------------------------------------------------
>>>>>>> intel_pcode_init()
>>>>>>>   |
>>>>>>>   +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
>>>>>>>                         DG1_UNCORE_GET_INIT_STATUS,
>>>>>>>                         DG1_UNCORE_INIT_STATUS_COMPLETE,
>>>>>>>                         DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>>>>>>>         |
>>>>>>>         +-> skl_pcode_try_request()
>>>>>>>               |
>>>>>>>               +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
>>>>>>>                                             500, 0, true);
>>>>>>>
>>>>>>> -------------------------------------------------------------------------
>>>>>>> static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
>>>>>>> 		          u32 *val, u32 *val1,
>>>>>>> 			  int fast_timeout_us, int slow_timeout_ms,
>>>>>>> 			  bool is_read)
>>>>>>> {
>>>>>>> ...
>>>>>>>          /* Before writing a value to the GEN6_PCODE_DATA register,
>>>>>>>             check if the bit in the GEN6_PCODE_MAILBOX register indicates
>>>>>>>             BUSY. */
>>>>>>> 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>>>>>>> 		return -EAGAIN;
>>>>>>
>>>>>> what if we fail here because the punit is still initializing and
>>>>>> will be ready, say, in 10 seconds?
>>>>>>
>>>>>> GG, without going any further, we fail here! The -EAGAIN we
>>>>>> receive from the test comes from this point. We don't fail with
>>>>>> -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
>>>>>> is not ready to perform the very fist communication and we fail
>>>>>> the probing.
>>>>>>
>>>>>> It doesn't mean, though, that there is anything wrong, we just
>>>>>> need to wait a bit before "taking drastic decisions"!
>>>>>>
>>>>>> This patch is suggesting to wait up to 10s for the punit to be
>>>>>> ready and eventually try to probe again... and, indeed, it works!
>>>>>
>>>>> As GG, what I still don't understand is how this extra 10 seconds
>>>>> wait helps... have you tried to simple add the 10 to the 180 and
>>>>> make the code 190 sec instead?
>>>>
>>>> maybe I haven't been able to explain the issue properly.
>>>>
>>>> I can even set that timer to 2hrs and a half and nothing changes
>>>> because we fail before.
>>>>
>>>> Here it's not a matter of how much do I wait but when do I check
>>>> the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
>>>> in the GEN6_PCODE_MAILBOX register).
>>>>
>>>> During a normal run we are always sure that communicating with
>>>> the punit works, because we made it sure during the previous
>>>> transaction.
>>>>
>>>> During probe there is no previous transaction and we start
>>>> communicating with the punit without making sure that it is
>>>> ready. And indeed some times it is not, so that we suppress the
>>>> probing on purpose instead of giving it another chance.
>>>>
>>>> I admit that the commit message is not written properly and
>>>> rather misleading, but here it's not at all a matter of how much
>>>> do I wait.
>>>
>>> The commit message was initially confused because it looks like
>>> we are just adding a wait, without doing anything....
>>>
>>> But looking to the code we can see that it will wait until
>>> pcode is ready with a timeout of 10 seconds.
>>>
>>> But if pcode is ready in 10 seconds, why pcode is not ready
>>> in 190 seconds. We are doing absolutely nothing more that could
>>> make pcode ready in 10 seconds that won't be in 190.
>>>
>>> This is what we are missing here... The code as is doesn't make
>>> a lot of sense to us and it looks like it is solving the issue
>>> by the 10 extra seconds and not by some special status checking.
>>
>> Okay, after an offline talk I am convinced now that we need some
>> check like this in some place.
>>
>> But the commit message needs be be fully re-written.
>>
>> It needs to be clear that underneath, the pcode communication
>> functions will do a check for ready without any wait, what will
>> make desired timeout to never really wait for the pcode done
>> and prematurely return.
>>
>> at __snb_pcode_rw():
>>
>>   if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>>                  return -EAGAIN;
>>
>> So, for this reason we need to ensure that pcode is really ready
>> before we wait.
>>
>> Other options are to handle the EAGAIN return and then wait.
>> Or even change the __snb_pcode_rw to ensure that it is ready...
>>
>> Something like:
>>
>> -       if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>> -               return -EAGAIN;
>> +       if (__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX,
>> +                                        GEN6_PCODE_READY, GEN6_PCODE_READY,
>> +                                        fast_timeout_us,
>> +                                        slow_timeout_ms,
>> +                                        NULL))
>> +               return -ETIMEDOUT;
> 
> This works, but the difference is that you are putting it on any
> call to the __snb_pcode_rw(), even when not necessary.
> 
> Putting it in the schema we used in our offline chat, the
> original implementation was:
> 
>   +-----------------------+
>   | r/w -> check if ready |-> r/w -> cir -> r/w -> cir -> r/w -> ...
>   +-----------------------+
>     probe time
> 
> (where cir stands for "check if ready"). Because in the first
> r/w we didn't check the status of the punit we added:
> 
>   +-------------------+
>   | cir -> r/w -> cir |-> r/w -> cir -> r/w -> cir -> r/w -> ...
>   +-------------------+
>      probe time
> 
> So that we are sure that the first r/w works. What you are
> suggesting is:
> 
>   +-------------------+
>   | cir -> r/w -> cir |-> cir -> r/w -> cir -> cir -> r/w -> cir ->
>   --------------------+
>      probe time
> 
> As you can se we have two "check if ready" in a raw, which might
> be a bit of an overkill, it doesn't hurt much but it would look
> like:
> 
>    if (__intel_wait_for_register_fw())
>    	return -EAGAIN;
> 
>    intel_uncore_write_fw......
> 
>    if (__intel_wait_for_register_fw())
>    	return -TIMEDOUT;
> 
> and this for every single snb_pcode_r/w.
> 
> Besides some functions might need the first wait, some might not.
> Check, for example icl_tc_cold_exit() where the -EAGAIN is
> handled. With your version the retries would be quite ponderous.
> 
> I'm sorry, but I can't take your suggestion as it can have major
> consequences along i915, it requires more changes and and it
> needs to be carefully tested.
> 
> On top of that I like Ashutosh refactoring that is quite an
> elegant way to gain more flexibility at boot time without any
> further impact.
> 
>> In the best regular case it will not change anything since the wait
>> for will return 0 immediatelly... in the worst case we would double
>> the timeout value, but this would be only in this insane case of
>> 180 seconds anyway and if it goes really really nasty...
>>
>> But anyway, now the motivation and the source of the issue is pretty
>> clear, I can add my rv-b there, but I really want a better commit msg
>> at least...
> 
> I definitely need to make a better commit message :)
> 
> Thanks a lot,
> Andi
> 
>>>
>>>>
>>>> Thanks, Rodrigo!
>>>> Andi
>>>>
>>>>>>
>>>>>> Andi
>>>>>>
>>>>>>>
>>>>>>>          /* write value to GEN6_PCODE_DATA register */
>>>>>>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
>>>>>>>
>>>>>>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
>>>>>>>
>>>>>>>          /* In this scenario, the value
>>>>>>>             "DG1_PCODE_STATUS | GEN6_PCODE_READY"
>>>>>>>             is written to the GEN6_PCODE_MAILBOX register,
>>>>>>>             so that the Busy status of the GEN6_PCODE_MAILBOX register
>>>>>>>             can be checked later.
>>>>>>>             (When the value of the GEN6_PCODE_READY bit of the
>>>>>>>              GEN6_PCODE_MAILBOX register changes to 0, the operation can
>>>>>>>              be considered completed.) */
>>>>>>> 	intel_uncore_write_fw(uncore,
>>>>>>> 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
>>>>>>>
>>>>>>>          /* In this scenario, verify that the BUSY status bit in the
>>>>>>>             GEN6_PCODE_MAILBOX register turns off for up to 500us. */
>>>>>>> 	if (__intel_wait_for_register_fw(uncore,
>>>>>>> 					 GEN6_PCODE_MAILBOX,
>>>>>>> 					 GEN6_PCODE_READY, 0,
>>>>>>> 					 fast_timeout_us,
>>>>>>> 					 slow_timeout_ms,
>>>>>>> 					 &mbox))
>>>>>>> 		return -ETIMEDOUT;
>>>>>>>          /* If there is a failure here, it may be considered that the
>>>>>>>             "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
>>>>>>>             completed within 500us */
>>>>>>> ...
>>>>>>> }
>>>>>>>
>>>>>>> int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>>>>>>> 		      u32 reply_mask, u32 reply, int timeout_base_ms)
>>>>>>> {
>>>>>>> 	u32 status;
>>>>>>> 	int ret;
>>>>>>>
>>>>>>> 	mutex_lock(&uncore->i915->sb_lock);
>>>>>>>
>>>>>>> #define COND \
>>>>>>> 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
>>>>>>>
>>>>>>>          /* the first trial for skl_pcode_try_request() can return
>>>>>>>             -EAGAIN or -ETIMEDOUT. And the code did not check the error
>>>>>>>             code here, so we don't know how far the __snb_pcode_rw()
>>>>>>>             function went. It is not known whether the pcode_mailbox
>>>>>>>             status was busy before writing the value to the
>>>>>>>             GEN6_PCODE_DATA register or after.*/
>>>>>>> 	if (COND) {
>>>>>>> 		ret = 0;
>>>>>>> 		goto out;
>>>>>>> 	}
>>>>>>>
>>>>>>>          /* In this scenario, skl_pcode_try_request() is invoked every
>>>>>>>             10us for 180 seconds. When skl_pcode_try_request() returns
>>>>>>>             -EAGAIN and -ETIMEDOUT by _wait_for(),
>>>>>>>             -ETIMEDOUT is returned to a variable ret. */
>>>>>>>
>>>>>>> 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
>>>>>>>
>>>>>>> 	if (!ret)
>>>>>>> 		goto out;
>>>>>>>
>>>>>>> 	/*
>>>>>>> 	 * The above can time out if the number of requests was low (2 in the
>>>>>>> 	 * worst case) _and_ PCODE was busy for some reason even after a
>>>>>>> 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
>>>>>>> 	 * the poll with preemption disabled to maximize the number of
>>>>>>> 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
>>>>>>> 	 * account for interrupts that could reduce the number of these
>>>>>>> 	 * requests, and for any quirks of the PCODE firmware that delays
>>>>>>> 	 * the request completion.
>>>>>>> 	 */
>>>>>>> 	drm_dbg_kms(&uncore->i915->drm,
>>>>>>> 		    "PCODE timeout, retrying with preemption disabled\n");
>>>>>>> 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
>>>>>>> 	preempt_disable();
>>>>>>> 	ret = wait_for_atomic(COND, 50);
>>>>>>>
>>>>>>> 	preempt_enable();
>>>>>>>
>>>>>>> out:
>>>>>>> 	mutex_unlock(&uncore->i915->sb_lock);
>>>>>>> 	return status ? status : ret;
>>>>>>> #undef COND
>>>>>>> }
>>>>>>>
>>>>>>> ---------------------------------------------------------
>>>>>>>
>>>>>>> If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
>>>>>>> and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
>>>>>>> GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
>>>>>>> invokes always fails. if then it does not make sense to me why this patch
>>>>>>> fixes it by just waiting 10 seconds.This is because if it was called with
>>>>>>> the flow described above, 180 seconds is longer than 10 seconds, so the
>>>>>>> scenario you mentioned is also covered in the existing code.
>>>>>>>
>>>>>>> To describe in more detail the second scenario I previously commented on:
>>>>>>> skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
>>>>>>>   1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
>>>>>>> BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
>>>>>>>   2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
>>>>>>> of 1) does not complete within 500us.
>>>>>>>   3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
>>>>>>> and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
>>>>>>> skl_pcode_request()
>>>>>>>
>>>>>>> If the reason why this problem occurred is because of this scenario,
>>>>>>> shouldn't there be an attempt to increase fast_timeout_us used as an
>>>>>>> argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
>>>>>>> returns -ETIMEDOUT?
>>>>>>>
>>>>>>> Br,
>>>>>>> G.G.
Andi Shyti Feb. 2, 2023, 9:28 a.m. UTC | #18
Hi GG,

On Thu, Feb 02, 2023 at 10:22:30AM +0200, Gwan-gyeong Mun wrote:
> Hi Andi,
> 
> You gave a lot of explanations, and confirmed that this patch solves the
> problem, but the root cause of this problem still seems to be unclear.
> 
> In the logs where this problem was reported, the logs were output in the
> following order.
> 
> link
>  : https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324
> 
> [log]
> <7> [268.276586] i915 0000:4d:00.0: [drm:skl_pcode_request [i915]] PCODE
> timeout, retrying with preemption disabled
> ...
> <4> [268.276942] WARNING: CPU: 1 PID: 1678 at
> drivers/gpu/drm/i915/intel_pcode.c:196 skl_pcode_request+0x242/0x260 [i915]
> ...
> <3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init
> failed -11
> 
> 
> In other words, -EAGAIN was not returned when the first
> skl_pcode_try_request() was executed in the skl_pcode_request() function,
> but the logs confirmed that the _wait_for(COND ...) of the
> skl_pcode_request() function was executed.

I'm not following... how can you say that -EAGAIN was not
returned, it's written explecetly that pcode_init failed with
-11, i.e. -EAGAIN.

Besides if we get as far as to receive "PCODE timeout, retrying
with preemption disabled" isn't it because we failed twice? And
apparently with -EAGAIN?

How can you say from the logs above that this

	if (COND) {
		ret = 0;
		goto out;
	}

has not failed with -EAGAIN? The logs are not telling us how the
calls to __skl_pcode_try_request() failed. I am telling you after
I tested it.

I hope I understood what you are saying. Otherwise, let's have an
offline chat.

Andi

> And, I reviewed bpsec in more detail to see if I missed anything related to
> the status and restrictions of the initial GTDRIVER_MAILBOX_INTERFACE
> (PCODE).
> 
> I've reviewed bpsec in detail for the two below where
> GTDRIVER_MAILBOX_INTERFACE is mentioned, but I haven't identified any
> additional constraints other than those described below.
> 
> 
> Pcode Driver Mailbox Spec: 34151
> Driver Initialization Programming Sequence: 33149
> 
> RUN_BUSY: SW may write to the two mailbox registers only when RUN_BUSY is
> cleared (0b)
> 
> My suggestion is that the HW should explain and analyze the root cause of
> the situation where it takes more than 180s for the Busy bit of
> GTDRIVER_MAILBOX_INTERFACE to change to 0.
> 
> And the method of the proposed patch, the newly added line
> "__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX, GEN6_PCODE_READY,
> 0, 500, 10000, NULL))" succeeded, but the existing method of trying 180s
> with checking intel_uncore_read_fw(uncore,
> GEN6_PCODE_MAILBOX)&GEN6_PCODE_READY) failed. Therefore please review it
> again.
> (Checking the code, it appears that both call usleep_range(wait__, wait__ *
> 2); in __wait_for().)
> 
> Br,
> 
> G.G.
> 
> int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> {
> 	u32 status;
> 	int ret;
> 
> 	mutex_lock(&uncore->i915->sb_lock);
> 
> #define COND \
> 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> 
> 	/*
> 	 * Prime the PCODE by doing a request first. Normally it guarantees
> 	 * that a subsequent request, at most @timeout_base_ms later, succeeds.
> 	 * _wait_for() doesn't guarantee when its passed condition is evaluated
> 	 * first, so send the first request explicitly.
> 	 */
> 	if (COND) {
> 		ret = 0;
> 		goto out;
> 	}
> 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> 	if (!ret)
> 		goto out;
> 
> 	/*
> 	 * The above can time out if the number of requests was low (2 in the
> 	 * worst case) _and_ PCODE was busy for some reason even after a
> 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> 	 * the poll with preemption disabled to maximize the number of
> 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> 	 * account for interrupts that could reduce the number of these
> 	 * requests, and for any quirks of the PCODE firmware that delays
> 	 * the request completion.
> 	 */
> 	drm_dbg_kms(&uncore->i915->drm,
> 		    "PCODE timeout, retrying with preemption disabled\n");
> 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> 	preempt_disable();
> 	ret = wait_for_atomic(COND, 50);
> 	preempt_enable();
> 
> out:
> 	mutex_unlock(&uncore->i915->sb_lock);
> 	return status ? status : ret;
> #undef COND
> }
> 
> 
> 
> 
> 
> 
> On 1/30/23 10:41 PM, Andi Shyti wrote:
> > Hi Rodrigo,
> > 
> > first of all, thanks for looking into this!
> > 
> > > > > > > > In the call flow invoked by intel_pcode_init(), I've added brief comments
> > > > > > > > where further clarification is needed in this scenario, and a description of
> > > > > > > > the suspicious scenario at the bottom.
> > > > > > > > 
> > > > > > > > -------------------------------------------------------------------------
> > > > > > > > intel_pcode_init()
> > > > > > > >   |
> > > > > > > >   +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
> > > > > > > >                         DG1_UNCORE_GET_INIT_STATUS,
> > > > > > > >                         DG1_UNCORE_INIT_STATUS_COMPLETE,
> > > > > > > >                         DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
> > > > > > > >         |
> > > > > > > >         +-> skl_pcode_try_request()
> > > > > > > >               |
> > > > > > > >               +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
> > > > > > > >                                             500, 0, true);
> > > > > > > > 
> > > > > > > > -------------------------------------------------------------------------
> > > > > > > > static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
> > > > > > > > 		          u32 *val, u32 *val1,
> > > > > > > > 			  int fast_timeout_us, int slow_timeout_ms,
> > > > > > > > 			  bool is_read)
> > > > > > > > {
> > > > > > > > ...
> > > > > > > >          /* Before writing a value to the GEN6_PCODE_DATA register,
> > > > > > > >             check if the bit in the GEN6_PCODE_MAILBOX register indicates
> > > > > > > >             BUSY. */
> > > > > > > > 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > > > > > > 		return -EAGAIN;
> > > > > > > 
> > > > > > > what if we fail here because the punit is still initializing and
> > > > > > > will be ready, say, in 10 seconds?
> > > > > > > 
> > > > > > > GG, without going any further, we fail here! The -EAGAIN we
> > > > > > > receive from the test comes from this point. We don't fail with
> > > > > > > -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
> > > > > > > is not ready to perform the very fist communication and we fail
> > > > > > > the probing.
> > > > > > > 
> > > > > > > It doesn't mean, though, that there is anything wrong, we just
> > > > > > > need to wait a bit before "taking drastic decisions"!
> > > > > > > 
> > > > > > > This patch is suggesting to wait up to 10s for the punit to be
> > > > > > > ready and eventually try to probe again... and, indeed, it works!
> > > > > > 
> > > > > > As GG, what I still don't understand is how this extra 10 seconds
> > > > > > wait helps... have you tried to simple add the 10 to the 180 and
> > > > > > make the code 190 sec instead?
> > > > > 
> > > > > maybe I haven't been able to explain the issue properly.
> > > > > 
> > > > > I can even set that timer to 2hrs and a half and nothing changes
> > > > > because we fail before.
> > > > > 
> > > > > Here it's not a matter of how much do I wait but when do I check
> > > > > the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
> > > > > in the GEN6_PCODE_MAILBOX register).
> > > > > 
> > > > > During a normal run we are always sure that communicating with
> > > > > the punit works, because we made it sure during the previous
> > > > > transaction.
> > > > > 
> > > > > During probe there is no previous transaction and we start
> > > > > communicating with the punit without making sure that it is
> > > > > ready. And indeed some times it is not, so that we suppress the
> > > > > probing on purpose instead of giving it another chance.
> > > > > 
> > > > > I admit that the commit message is not written properly and
> > > > > rather misleading, but here it's not at all a matter of how much
> > > > > do I wait.
> > > > 
> > > > The commit message was initially confused because it looks like
> > > > we are just adding a wait, without doing anything....
> > > > 
> > > > But looking to the code we can see that it will wait until
> > > > pcode is ready with a timeout of 10 seconds.
> > > > 
> > > > But if pcode is ready in 10 seconds, why pcode is not ready
> > > > in 190 seconds. We are doing absolutely nothing more that could
> > > > make pcode ready in 10 seconds that won't be in 190.
> > > > 
> > > > This is what we are missing here... The code as is doesn't make
> > > > a lot of sense to us and it looks like it is solving the issue
> > > > by the 10 extra seconds and not by some special status checking.
> > > 
> > > Okay, after an offline talk I am convinced now that we need some
> > > check like this in some place.
> > > 
> > > But the commit message needs be be fully re-written.
> > > 
> > > It needs to be clear that underneath, the pcode communication
> > > functions will do a check for ready without any wait, what will
> > > make desired timeout to never really wait for the pcode done
> > > and prematurely return.
> > > 
> > > at __snb_pcode_rw():
> > > 
> > >   if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > >                  return -EAGAIN;
> > > 
> > > So, for this reason we need to ensure that pcode is really ready
> > > before we wait.
> > > 
> > > Other options are to handle the EAGAIN return and then wait.
> > > Or even change the __snb_pcode_rw to ensure that it is ready...
> > > 
> > > Something like:
> > > 
> > > -       if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
> > > -               return -EAGAIN;
> > > +       if (__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX,
> > > +                                        GEN6_PCODE_READY, GEN6_PCODE_READY,
> > > +                                        fast_timeout_us,
> > > +                                        slow_timeout_ms,
> > > +                                        NULL))
> > > +               return -ETIMEDOUT;
> > 
> > This works, but the difference is that you are putting it on any
> > call to the __snb_pcode_rw(), even when not necessary.
> > 
> > Putting it in the schema we used in our offline chat, the
> > original implementation was:
> > 
> >   +-----------------------+
> >   | r/w -> check if ready |-> r/w -> cir -> r/w -> cir -> r/w -> ...
> >   +-----------------------+
> >     probe time
> > 
> > (where cir stands for "check if ready"). Because in the first
> > r/w we didn't check the status of the punit we added:
> > 
> >   +-------------------+
> >   | cir -> r/w -> cir |-> r/w -> cir -> r/w -> cir -> r/w -> ...
> >   +-------------------+
> >      probe time
> > 
> > So that we are sure that the first r/w works. What you are
> > suggesting is:
> > 
> >   +-------------------+
> >   | cir -> r/w -> cir |-> cir -> r/w -> cir -> cir -> r/w -> cir ->
> >   --------------------+
> >      probe time
> > 
> > As you can se we have two "check if ready" in a raw, which might
> > be a bit of an overkill, it doesn't hurt much but it would look
> > like:
> > 
> >    if (__intel_wait_for_register_fw())
> >    	return -EAGAIN;
> > 
> >    intel_uncore_write_fw......
> > 
> >    if (__intel_wait_for_register_fw())
> >    	return -TIMEDOUT;
> > 
> > and this for every single snb_pcode_r/w.
> > 
> > Besides some functions might need the first wait, some might not.
> > Check, for example icl_tc_cold_exit() where the -EAGAIN is
> > handled. With your version the retries would be quite ponderous.
> > 
> > I'm sorry, but I can't take your suggestion as it can have major
> > consequences along i915, it requires more changes and and it
> > needs to be carefully tested.
> > 
> > On top of that I like Ashutosh refactoring that is quite an
> > elegant way to gain more flexibility at boot time without any
> > further impact.
> > 
> > > In the best regular case it will not change anything since the wait
> > > for will return 0 immediatelly... in the worst case we would double
> > > the timeout value, but this would be only in this insane case of
> > > 180 seconds anyway and if it goes really really nasty...
> > > 
> > > But anyway, now the motivation and the source of the issue is pretty
> > > clear, I can add my rv-b there, but I really want a better commit msg
> > > at least...
> > 
> > I definitely need to make a better commit message :)
> > 
> > Thanks a lot,
> > Andi
> > 
> > > > 
> > > > > 
> > > > > Thanks, Rodrigo!
> > > > > Andi
> > > > > 
> > > > > > > 
> > > > > > > Andi
> > > > > > > 
> > > > > > > > 
> > > > > > > >          /* write value to GEN6_PCODE_DATA register */
> > > > > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
> > > > > > > > 
> > > > > > > > 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
> > > > > > > > 
> > > > > > > >          /* In this scenario, the value
> > > > > > > >             "DG1_PCODE_STATUS | GEN6_PCODE_READY"
> > > > > > > >             is written to the GEN6_PCODE_MAILBOX register,
> > > > > > > >             so that the Busy status of the GEN6_PCODE_MAILBOX register
> > > > > > > >             can be checked later.
> > > > > > > >             (When the value of the GEN6_PCODE_READY bit of the
> > > > > > > >              GEN6_PCODE_MAILBOX register changes to 0, the operation can
> > > > > > > >              be considered completed.) */
> > > > > > > > 	intel_uncore_write_fw(uncore,
> > > > > > > > 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
> > > > > > > > 
> > > > > > > >          /* In this scenario, verify that the BUSY status bit in the
> > > > > > > >             GEN6_PCODE_MAILBOX register turns off for up to 500us. */
> > > > > > > > 	if (__intel_wait_for_register_fw(uncore,
> > > > > > > > 					 GEN6_PCODE_MAILBOX,
> > > > > > > > 					 GEN6_PCODE_READY, 0,
> > > > > > > > 					 fast_timeout_us,
> > > > > > > > 					 slow_timeout_ms,
> > > > > > > > 					 &mbox))
> > > > > > > > 		return -ETIMEDOUT;
> > > > > > > >          /* If there is a failure here, it may be considered that the
> > > > > > > >             "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
> > > > > > > >             completed within 500us */
> > > > > > > > ...
> > > > > > > > }
> > > > > > > > 
> > > > > > > > int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
> > > > > > > > 		      u32 reply_mask, u32 reply, int timeout_base_ms)
> > > > > > > > {
> > > > > > > > 	u32 status;
> > > > > > > > 	int ret;
> > > > > > > > 
> > > > > > > > 	mutex_lock(&uncore->i915->sb_lock);
> > > > > > > > 
> > > > > > > > #define COND \
> > > > > > > > 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
> > > > > > > > 
> > > > > > > >          /* the first trial for skl_pcode_try_request() can return
> > > > > > > >             -EAGAIN or -ETIMEDOUT. And the code did not check the error
> > > > > > > >             code here, so we don't know how far the __snb_pcode_rw()
> > > > > > > >             function went. It is not known whether the pcode_mailbox
> > > > > > > >             status was busy before writing the value to the
> > > > > > > >             GEN6_PCODE_DATA register or after.*/
> > > > > > > > 	if (COND) {
> > > > > > > > 		ret = 0;
> > > > > > > > 		goto out;
> > > > > > > > 	}
> > > > > > > > 
> > > > > > > >          /* In this scenario, skl_pcode_try_request() is invoked every
> > > > > > > >             10us for 180 seconds. When skl_pcode_try_request() returns
> > > > > > > >             -EAGAIN and -ETIMEDOUT by _wait_for(),
> > > > > > > >             -ETIMEDOUT is returned to a variable ret. */
> > > > > > > > 
> > > > > > > > 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
> > > > > > > > 
> > > > > > > > 	if (!ret)
> > > > > > > > 		goto out;
> > > > > > > > 
> > > > > > > > 	/*
> > > > > > > > 	 * The above can time out if the number of requests was low (2 in the
> > > > > > > > 	 * worst case) _and_ PCODE was busy for some reason even after a
> > > > > > > > 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
> > > > > > > > 	 * the poll with preemption disabled to maximize the number of
> > > > > > > > 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
> > > > > > > > 	 * account for interrupts that could reduce the number of these
> > > > > > > > 	 * requests, and for any quirks of the PCODE firmware that delays
> > > > > > > > 	 * the request completion.
> > > > > > > > 	 */
> > > > > > > > 	drm_dbg_kms(&uncore->i915->drm,
> > > > > > > > 		    "PCODE timeout, retrying with preemption disabled\n");
> > > > > > > > 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
> > > > > > > > 	preempt_disable();
> > > > > > > > 	ret = wait_for_atomic(COND, 50);
> > > > > > > > 
> > > > > > > > 	preempt_enable();
> > > > > > > > 
> > > > > > > > out:
> > > > > > > > 	mutex_unlock(&uncore->i915->sb_lock);
> > > > > > > > 	return status ? status : ret;
> > > > > > > > #undef COND
> > > > > > > > }
> > > > > > > > 
> > > > > > > > ---------------------------------------------------------
> > > > > > > > 
> > > > > > > > If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
> > > > > > > > and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
> > > > > > > > GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
> > > > > > > > invokes always fails. if then it does not make sense to me why this patch
> > > > > > > > fixes it by just waiting 10 seconds.This is because if it was called with
> > > > > > > > the flow described above, 180 seconds is longer than 10 seconds, so the
> > > > > > > > scenario you mentioned is also covered in the existing code.
> > > > > > > > 
> > > > > > > > To describe in more detail the second scenario I previously commented on:
> > > > > > > > skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
> > > > > > > >   1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
> > > > > > > > BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
> > > > > > > >   2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
> > > > > > > > of 1) does not complete within 500us.
> > > > > > > >   3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
> > > > > > > > and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
> > > > > > > > skl_pcode_request()
> > > > > > > > 
> > > > > > > > If the reason why this problem occurred is because of this scenario,
> > > > > > > > shouldn't there be an attempt to increase fast_timeout_us used as an
> > > > > > > > argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
> > > > > > > > returns -ETIMEDOUT?
> > > > > > > > 
> > > > > > > > Br,
> > > > > > > > G.G.
Gwan-gyeong Mun Feb. 2, 2023, 11:01 a.m. UTC | #19
On 2/2/23 11:28 AM, Andi Shyti wrote:
> Hi GG,
> 
> On Thu, Feb 02, 2023 at 10:22:30AM +0200, Gwan-gyeong Mun wrote:
>> Hi Andi,
>>
>> You gave a lot of explanations, and confirmed that this patch solves the
>> problem, but the root cause of this problem still seems to be unclear.
>>
>> In the logs where this problem was reported, the logs were output in the
>> following order.
>>
>> link
>>   : https://intel-gfx-ci.01.org/tree/drm-tip/drmtip_1294/bat-atsm-1/igt@i915_module_load@resize-bar.html#dmesg-warnings17324
>>
>> [log]
>> <7> [268.276586] i915 0000:4d:00.0: [drm:skl_pcode_request [i915]] PCODE
>> timeout, retrying with preemption disabled
>> ...
>> <4> [268.276942] WARNING: CPU: 1 PID: 1678 at
>> drivers/gpu/drm/i915/intel_pcode.c:196 skl_pcode_request+0x242/0x260 [i915]
>> ...
>> <3> [268.329058] i915 0000:4d:00.0: [drm] *ERROR* gt0: intel_pcode_init
>> failed -11
>>
>>
>> In other words, -EAGAIN was not returned when the first
>> skl_pcode_try_request() was executed in the skl_pcode_request() function,
>> but the logs confirmed that the _wait_for(COND ...) of the
>> skl_pcode_request() function was executed.
> 
> I'm not following... how can you say that -EAGAIN was not
> returned, it's written explecetly that pcode_init failed with
> -11, i.e. -EAGAIN.
> 
> Besides if we get as far as to receive "PCODE timeout, retrying
> with preemption disabled" isn't it because we failed twice? And
> apparently with -EAGAIN?
> 
> How can you say from the logs above that this
> 
> 	if (COND) {
> 		ret = 0;
> 		goto out;
> 	}
Hi Andi,
Because, the log outputs "<7> [268.276586] i915 0000:4d:00.0: 
[drm:skl_pcode_request [i915]] PCODE
timeout, retrying with preemption disabled"

Br,
G.G.
> 
> has not failed with -EAGAIN? The logs are not telling us how the
> calls to __skl_pcode_try_request() failed. I am telling you after
> I tested it.
> 
> I hope I understood what you are saying. Otherwise, let's have an
> offline chat.
> 
> Andi
> 
>> And, I reviewed bpsec in more detail to see if I missed anything related to
>> the status and restrictions of the initial GTDRIVER_MAILBOX_INTERFACE
>> (PCODE).
>>
>> I've reviewed bpsec in detail for the two below where
>> GTDRIVER_MAILBOX_INTERFACE is mentioned, but I haven't identified any
>> additional constraints other than those described below.
>>
>>
>> Pcode Driver Mailbox Spec: 34151
>> Driver Initialization Programming Sequence: 33149
>>
>> RUN_BUSY: SW may write to the two mailbox registers only when RUN_BUSY is
>> cleared (0b)
>>
>> My suggestion is that the HW should explain and analyze the root cause of
>> the situation where it takes more than 180s for the Busy bit of
>> GTDRIVER_MAILBOX_INTERFACE to change to 0.
>>
>> And the method of the proposed patch, the newly added line
>> "__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX, GEN6_PCODE_READY,
>> 0, 500, 10000, NULL))" succeeded, but the existing method of trying 180s
>> with checking intel_uncore_read_fw(uncore,
>> GEN6_PCODE_MAILBOX)&GEN6_PCODE_READY) failed. Therefore please review it
>> again.
>> (Checking the code, it appears that both call usleep_range(wait__, wait__ *
>> 2); in __wait_for().)
>>
>> Br,
>>
>> G.G.
>>
>> int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>> 		      u32 reply_mask, u32 reply, int timeout_base_ms)
>> {
>> 	u32 status;
>> 	int ret;
>>
>> 	mutex_lock(&uncore->i915->sb_lock);
>>
>> #define COND \
>> 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
>>
>> 	/*
>> 	 * Prime the PCODE by doing a request first. Normally it guarantees
>> 	 * that a subsequent request, at most @timeout_base_ms later, succeeds.
>> 	 * _wait_for() doesn't guarantee when its passed condition is evaluated
>> 	 * first, so send the first request explicitly.
>> 	 */
>> 	if (COND) {
>> 		ret = 0;
>> 		goto out;
>> 	}
>> 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
>> 	if (!ret)
>> 		goto out;
>>
>> 	/*
>> 	 * The above can time out if the number of requests was low (2 in the
>> 	 * worst case) _and_ PCODE was busy for some reason even after a
>> 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
>> 	 * the poll with preemption disabled to maximize the number of
>> 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
>> 	 * account for interrupts that could reduce the number of these
>> 	 * requests, and for any quirks of the PCODE firmware that delays
>> 	 * the request completion.
>> 	 */
>> 	drm_dbg_kms(&uncore->i915->drm,
>> 		    "PCODE timeout, retrying with preemption disabled\n");
>> 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
>> 	preempt_disable();
>> 	ret = wait_for_atomic(COND, 50);
>> 	preempt_enable();
>>
>> out:
>> 	mutex_unlock(&uncore->i915->sb_lock);
>> 	return status ? status : ret;
>> #undef COND
>> }
>>
>>
>>
>>
>>
>>
>> On 1/30/23 10:41 PM, Andi Shyti wrote:
>>> Hi Rodrigo,
>>>
>>> first of all, thanks for looking into this!
>>>
>>>>>>>>> In the call flow invoked by intel_pcode_init(), I've added brief comments
>>>>>>>>> where further clarification is needed in this scenario, and a description of
>>>>>>>>> the suspicious scenario at the bottom.
>>>>>>>>>
>>>>>>>>> -------------------------------------------------------------------------
>>>>>>>>> intel_pcode_init()
>>>>>>>>>    |
>>>>>>>>>    +-> skl_pcode_request(uncore, DG1_PCODE_STATUS,
>>>>>>>>>                          DG1_UNCORE_GET_INIT_STATUS,
>>>>>>>>>                          DG1_UNCORE_INIT_STATUS_COMPLETE,
>>>>>>>>>                          DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
>>>>>>>>>          |
>>>>>>>>>          +-> skl_pcode_try_request()
>>>>>>>>>                |
>>>>>>>>>                +->  *status = __snb_pcode_rw(uncore, mbox, &request, NULL,
>>>>>>>>>                                              500, 0, true);
>>>>>>>>>
>>>>>>>>> -------------------------------------------------------------------------
>>>>>>>>> static int __snb_pcode_rw(struct intel_uncore *uncore, u32 mbox,
>>>>>>>>> 		          u32 *val, u32 *val1,
>>>>>>>>> 			  int fast_timeout_us, int slow_timeout_ms,
>>>>>>>>> 			  bool is_read)
>>>>>>>>> {
>>>>>>>>> ...
>>>>>>>>>           /* Before writing a value to the GEN6_PCODE_DATA register,
>>>>>>>>>              check if the bit in the GEN6_PCODE_MAILBOX register indicates
>>>>>>>>>              BUSY. */
>>>>>>>>> 	if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>>>>>>>>> 		return -EAGAIN;
>>>>>>>>
>>>>>>>> what if we fail here because the punit is still initializing and
>>>>>>>> will be ready, say, in 10 seconds?
>>>>>>>>
>>>>>>>> GG, without going any further, we fail here! The -EAGAIN we
>>>>>>>> receive from the test comes from this point. We don't fail with
>>>>>>>> -ETIMEDOUT, but with -EAGAIN and the reason is because the punit
>>>>>>>> is not ready to perform the very fist communication and we fail
>>>>>>>> the probing.
>>>>>>>>
>>>>>>>> It doesn't mean, though, that there is anything wrong, we just
>>>>>>>> need to wait a bit before "taking drastic decisions"!
>>>>>>>>
>>>>>>>> This patch is suggesting to wait up to 10s for the punit to be
>>>>>>>> ready and eventually try to probe again... and, indeed, it works!
>>>>>>>
>>>>>>> As GG, what I still don't understand is how this extra 10 seconds
>>>>>>> wait helps... have you tried to simple add the 10 to the 180 and
>>>>>>> make the code 190 sec instead?
>>>>>>
>>>>>> maybe I haven't been able to explain the issue properly.
>>>>>>
>>>>>> I can even set that timer to 2hrs and a half and nothing changes
>>>>>> because we fail before.
>>>>>>
>>>>>> Here it's not a matter of how much do I wait but when do I check
>>>>>> the pcode readiness (i.e. signalled by the GEN6_PCODE_READY bit
>>>>>> in the GEN6_PCODE_MAILBOX register).
>>>>>>
>>>>>> During a normal run we are always sure that communicating with
>>>>>> the punit works, because we made it sure during the previous
>>>>>> transaction.
>>>>>>
>>>>>> During probe there is no previous transaction and we start
>>>>>> communicating with the punit without making sure that it is
>>>>>> ready. And indeed some times it is not, so that we suppress the
>>>>>> probing on purpose instead of giving it another chance.
>>>>>>
>>>>>> I admit that the commit message is not written properly and
>>>>>> rather misleading, but here it's not at all a matter of how much
>>>>>> do I wait.
>>>>>
>>>>> The commit message was initially confused because it looks like
>>>>> we are just adding a wait, without doing anything....
>>>>>
>>>>> But looking to the code we can see that it will wait until
>>>>> pcode is ready with a timeout of 10 seconds.
>>>>>
>>>>> But if pcode is ready in 10 seconds, why pcode is not ready
>>>>> in 190 seconds. We are doing absolutely nothing more that could
>>>>> make pcode ready in 10 seconds that won't be in 190.
>>>>>
>>>>> This is what we are missing here... The code as is doesn't make
>>>>> a lot of sense to us and it looks like it is solving the issue
>>>>> by the 10 extra seconds and not by some special status checking.
>>>>
>>>> Okay, after an offline talk I am convinced now that we need some
>>>> check like this in some place.
>>>>
>>>> But the commit message needs be be fully re-written.
>>>>
>>>> It needs to be clear that underneath, the pcode communication
>>>> functions will do a check for ready without any wait, what will
>>>> make desired timeout to never really wait for the pcode done
>>>> and prematurely return.
>>>>
>>>> at __snb_pcode_rw():
>>>>
>>>>    if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>>>>                   return -EAGAIN;
>>>>
>>>> So, for this reason we need to ensure that pcode is really ready
>>>> before we wait.
>>>>
>>>> Other options are to handle the EAGAIN return and then wait.
>>>> Or even change the __snb_pcode_rw to ensure that it is ready...
>>>>
>>>> Something like:
>>>>
>>>> -       if (intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY)
>>>> -               return -EAGAIN;
>>>> +       if (__intel_wait_for_register_fw(uncore, GEN6_PCODE_MAILBOX,
>>>> +                                        GEN6_PCODE_READY, GEN6_PCODE_READY,
>>>> +                                        fast_timeout_us,
>>>> +                                        slow_timeout_ms,
>>>> +                                        NULL))
>>>> +               return -ETIMEDOUT;
>>>
>>> This works, but the difference is that you are putting it on any
>>> call to the __snb_pcode_rw(), even when not necessary.
>>>
>>> Putting it in the schema we used in our offline chat, the
>>> original implementation was:
>>>
>>>    +-----------------------+
>>>    | r/w -> check if ready |-> r/w -> cir -> r/w -> cir -> r/w -> ...
>>>    +-----------------------+
>>>      probe time
>>>
>>> (where cir stands for "check if ready"). Because in the first
>>> r/w we didn't check the status of the punit we added:
>>>
>>>    +-------------------+
>>>    | cir -> r/w -> cir |-> r/w -> cir -> r/w -> cir -> r/w -> ...
>>>    +-------------------+
>>>       probe time
>>>
>>> So that we are sure that the first r/w works. What you are
>>> suggesting is:
>>>
>>>    +-------------------+
>>>    | cir -> r/w -> cir |-> cir -> r/w -> cir -> cir -> r/w -> cir ->
>>>    --------------------+
>>>       probe time
>>>
>>> As you can se we have two "check if ready" in a raw, which might
>>> be a bit of an overkill, it doesn't hurt much but it would look
>>> like:
>>>
>>>     if (__intel_wait_for_register_fw())
>>>     	return -EAGAIN;
>>>
>>>     intel_uncore_write_fw......
>>>
>>>     if (__intel_wait_for_register_fw())
>>>     	return -TIMEDOUT;
>>>
>>> and this for every single snb_pcode_r/w.
>>>
>>> Besides some functions might need the first wait, some might not.
>>> Check, for example icl_tc_cold_exit() where the -EAGAIN is
>>> handled. With your version the retries would be quite ponderous.
>>>
>>> I'm sorry, but I can't take your suggestion as it can have major
>>> consequences along i915, it requires more changes and and it
>>> needs to be carefully tested.
>>>
>>> On top of that I like Ashutosh refactoring that is quite an
>>> elegant way to gain more flexibility at boot time without any
>>> further impact.
>>>
>>>> In the best regular case it will not change anything since the wait
>>>> for will return 0 immediatelly... in the worst case we would double
>>>> the timeout value, but this would be only in this insane case of
>>>> 180 seconds anyway and if it goes really really nasty...
>>>>
>>>> But anyway, now the motivation and the source of the issue is pretty
>>>> clear, I can add my rv-b there, but I really want a better commit msg
>>>> at least...
>>>
>>> I definitely need to make a better commit message :)
>>>
>>> Thanks a lot,
>>> Andi
>>>
>>>>>
>>>>>>
>>>>>> Thanks, Rodrigo!
>>>>>> Andi
>>>>>>
>>>>>>>>
>>>>>>>> Andi
>>>>>>>>
>>>>>>>>>
>>>>>>>>>           /* write value to GEN6_PCODE_DATA register */
>>>>>>>>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA, *val);
>>>>>>>>>
>>>>>>>>> 	intel_uncore_write_fw(uncore, GEN6_PCODE_DATA1, val1 ? *val1 : 0);
>>>>>>>>>
>>>>>>>>>           /* In this scenario, the value
>>>>>>>>>              "DG1_PCODE_STATUS | GEN6_PCODE_READY"
>>>>>>>>>              is written to the GEN6_PCODE_MAILBOX register,
>>>>>>>>>              so that the Busy status of the GEN6_PCODE_MAILBOX register
>>>>>>>>>              can be checked later.
>>>>>>>>>              (When the value of the GEN6_PCODE_READY bit of the
>>>>>>>>>               GEN6_PCODE_MAILBOX register changes to 0, the operation can
>>>>>>>>>               be considered completed.) */
>>>>>>>>> 	intel_uncore_write_fw(uncore,
>>>>>>>>> 			      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
>>>>>>>>>
>>>>>>>>>           /* In this scenario, verify that the BUSY status bit in the
>>>>>>>>>              GEN6_PCODE_MAILBOX register turns off for up to 500us. */
>>>>>>>>> 	if (__intel_wait_for_register_fw(uncore,
>>>>>>>>> 					 GEN6_PCODE_MAILBOX,
>>>>>>>>> 					 GEN6_PCODE_READY, 0,
>>>>>>>>> 					 fast_timeout_us,
>>>>>>>>> 					 slow_timeout_ms,
>>>>>>>>> 					 &mbox))
>>>>>>>>> 		return -ETIMEDOUT;
>>>>>>>>>           /* If there is a failure here, it may be considered that the
>>>>>>>>>              "DG1_PCODE_STATUS | GEN6_PCODE_READY" operation was not
>>>>>>>>>              completed within 500us */
>>>>>>>>> ...
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
>>>>>>>>> 		      u32 reply_mask, u32 reply, int timeout_base_ms)
>>>>>>>>> {
>>>>>>>>> 	u32 status;
>>>>>>>>> 	int ret;
>>>>>>>>>
>>>>>>>>> 	mutex_lock(&uncore->i915->sb_lock);
>>>>>>>>>
>>>>>>>>> #define COND \
>>>>>>>>> 	skl_pcode_try_request(uncore, mbox, request, reply_mask, reply, &status)
>>>>>>>>>
>>>>>>>>>           /* the first trial for skl_pcode_try_request() can return
>>>>>>>>>              -EAGAIN or -ETIMEDOUT. And the code did not check the error
>>>>>>>>>              code here, so we don't know how far the __snb_pcode_rw()
>>>>>>>>>              function went. It is not known whether the pcode_mailbox
>>>>>>>>>              status was busy before writing the value to the
>>>>>>>>>              GEN6_PCODE_DATA register or after.*/
>>>>>>>>> 	if (COND) {
>>>>>>>>> 		ret = 0;
>>>>>>>>> 		goto out;
>>>>>>>>> 	}
>>>>>>>>>
>>>>>>>>>           /* In this scenario, skl_pcode_try_request() is invoked every
>>>>>>>>>              10us for 180 seconds. When skl_pcode_try_request() returns
>>>>>>>>>              -EAGAIN and -ETIMEDOUT by _wait_for(),
>>>>>>>>>              -ETIMEDOUT is returned to a variable ret. */
>>>>>>>>>
>>>>>>>>> 	ret = _wait_for(COND, timeout_base_ms * 1000, 10, 10);
>>>>>>>>>
>>>>>>>>> 	if (!ret)
>>>>>>>>> 		goto out;
>>>>>>>>>
>>>>>>>>> 	/*
>>>>>>>>> 	 * The above can time out if the number of requests was low (2 in the
>>>>>>>>> 	 * worst case) _and_ PCODE was busy for some reason even after a
>>>>>>>>> 	 * (queued) request and @timeout_base_ms delay. As a workaround retry
>>>>>>>>> 	 * the poll with preemption disabled to maximize the number of
>>>>>>>>> 	 * requests. Increase the timeout from @timeout_base_ms to 50ms to
>>>>>>>>> 	 * account for interrupts that could reduce the number of these
>>>>>>>>> 	 * requests, and for any quirks of the PCODE firmware that delays
>>>>>>>>> 	 * the request completion.
>>>>>>>>> 	 */
>>>>>>>>> 	drm_dbg_kms(&uncore->i915->drm,
>>>>>>>>> 		    "PCODE timeout, retrying with preemption disabled\n");
>>>>>>>>> 	drm_WARN_ON_ONCE(&uncore->i915->drm, timeout_base_ms > 3);
>>>>>>>>> 	preempt_disable();
>>>>>>>>> 	ret = wait_for_atomic(COND, 50);
>>>>>>>>>
>>>>>>>>> 	preempt_enable();
>>>>>>>>>
>>>>>>>>> out:
>>>>>>>>> 	mutex_unlock(&uncore->i915->sb_lock);
>>>>>>>>> 	return status ? status : ret;
>>>>>>>>> #undef COND
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> ---------------------------------------------------------
>>>>>>>>>
>>>>>>>>> If you try skl_pcode_try_request() for 180 seconds in skl_pcode_request(),
>>>>>>>>> and the first "intel_uncore_read_fw(uncore, GEN6_PCODE_MAILBOX) &
>>>>>>>>> GEN6_PCODE_READY)" call in __snb_pcode_rw() that skl_pcode_try_request()
>>>>>>>>> invokes always fails. if then it does not make sense to me why this patch
>>>>>>>>> fixes it by just waiting 10 seconds.This is because if it was called with
>>>>>>>>> the flow described above, 180 seconds is longer than 10 seconds, so the
>>>>>>>>> scenario you mentioned is also covered in the existing code.
>>>>>>>>>
>>>>>>>>> To describe in more detail the second scenario I previously commented on:
>>>>>>>>> skl_pcode_request() tries skl_pcode_try_request() for 180 seconds
>>>>>>>>>    1) In skl_pcode_try_request(), the GEN6_PCODE_MAILBOX register bit is not
>>>>>>>>> BUSY, so the value is written to the GEN6_PCODE_DATA1 register.
>>>>>>>>>    2) skl_pcode_try_request() returns -ETIMEDOUT value because the operation
>>>>>>>>> of 1) does not complete within 500us.
>>>>>>>>>    3) Scenario in which the GEN6_PCODE_MAILBOX register bit is checked as BUSY
>>>>>>>>> and returns -EAGAIN in the last call of skl_pcode_try_request() invoked by
>>>>>>>>> skl_pcode_request()
>>>>>>>>>
>>>>>>>>> If the reason why this problem occurred is because of this scenario,
>>>>>>>>> shouldn't there be an attempt to increase fast_timeout_us used as an
>>>>>>>>> argument of __snb_pcode_rw() to 500us or more when skl_pcode_try_request()
>>>>>>>>> returns -ETIMEDOUT?
>>>>>>>>>
>>>>>>>>> Br,
>>>>>>>>> G.G.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c
index a234d9b4ed14..3db2ba439bb5 100644
--- a/drivers/gpu/drm/i915/intel_pcode.c
+++ b/drivers/gpu/drm/i915/intel_pcode.c
@@ -204,15 +204,42 @@  int skl_pcode_request(struct intel_uncore *uncore, u32 mbox, u32 request,
 #undef COND
 }
 
+static int pcode_init_wait(struct intel_uncore *uncore, int timeout_ms)
+{
+	if (__intel_wait_for_register_fw(uncore,
+					 GEN6_PCODE_MAILBOX,
+					 GEN6_PCODE_READY, 0,
+					 500, timeout_ms,
+					 NULL))
+		return -EPROBE_DEFER;
+
+	return skl_pcode_request(uncore,
+				 DG1_PCODE_STATUS,
+				 DG1_UNCORE_GET_INIT_STATUS,
+				 DG1_UNCORE_INIT_STATUS_COMPLETE,
+				 DG1_UNCORE_INIT_STATUS_COMPLETE, timeout_ms);
+}
+
 int intel_pcode_init(struct intel_uncore *uncore)
 {
+	int err;
+
 	if (!IS_DGFX(uncore->i915))
 		return 0;
 
-	return skl_pcode_request(uncore, DG1_PCODE_STATUS,
-				 DG1_UNCORE_GET_INIT_STATUS,
-				 DG1_UNCORE_INIT_STATUS_COMPLETE,
-				 DG1_UNCORE_INIT_STATUS_COMPLETE, 180000);
+	/*
+	 * Wait 10 seconds so that the punit to settle and complete
+	 * any outstanding transactions upon module load
+	 */
+	err = pcode_init_wait(uncore, 10000);
+
+	if (err) {
+		drm_notice(&uncore->i915->drm,
+			   "Waiting for HW initialisation...\n");
+		err = pcode_init_wait(uncore, 180000);
+	}
+
+	return err;
 }
 
 int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val)