diff mbox series

[v2] drm/i915/selftest/gsc: Ensure GSC Proxy init completes before selftests

Message ID 20230530170104.2192126-1-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915/selftest/gsc: Ensure GSC Proxy init completes before selftests | expand

Commit Message

Teres Alexis, Alan Previn May 30, 2023, 5:01 p.m. UTC
On MTL, if the GSC Proxy init flows haven't completed, submissions to the
GSC engine will fail. Those init flows are dependent on the mei's
gsc_proxy component that is loaded in parallel with i915 and a
worker that could potentially start after i915 driver init is done.

That said, all subsytems that access the GSC engine today does check
for such init flow completion before using the GSC engine. However,
selftests currently don't wait on anything before starting.

To fix this, add a waiter function at the start of __run_selftests
that waits for gsc-proxy init flows to complete. While implementing this,
use an table of function pointers so its scalable to add additional
waiter functions for future such "wait on dependency" cases that.

Difference from prior versions:
   v1: Based on internal testing, increase the timeout for gsc-proxy
       specific case to 8 seconds.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 .../gpu/drm/i915/selftests/i915_selftest.c    | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)


base-commit: 0ae4ee2c735979030a0219218081eee661606921

Comments

Zhanjun Dong June 8, 2023, 6:14 p.m. UTC | #1
See my comments below.

> -----Original Message-----
> From: Alan Previn <alan.previn.teres.alexis@intel.com>
> Sent: May 30, 2023 1:01 PM
> To: intel-gfx@lists.freedesktop.org
> Cc: dri-devel@lists.freedesktop.org; Alan Previn
> <alan.previn.teres.alexis@intel.com>
> Subject: [v2] drm/i915/selftest/gsc: Ensure GSC Proxy init completes before
> selftests
> 
> On MTL, if the GSC Proxy init flows haven't completed, submissions to the
> GSC engine will fail. Those init flows are dependent on the mei's
> gsc_proxy component that is loaded in parallel with i915 and a
> worker that could potentially start after i915 driver init is done.
> 
> That said, all subsytems that access the GSC engine today does check
> for such init flow completion before using the GSC engine. However,
> selftests currently don't wait on anything before starting.
> 
> To fix this, add a waiter function at the start of __run_selftests
> that waits for gsc-proxy init flows to complete. While implementing this,
> use an table of function pointers so its scalable to add additional
> waiter functions for future such "wait on dependency" cases that.
> 
> Difference from prior versions:
>    v1: Based on internal testing, increase the timeout for gsc-proxy
>        specific case to 8 seconds.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>  .../gpu/drm/i915/selftests/i915_selftest.c    | 61 +++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> 
> base-commit: 0ae4ee2c735979030a0219218081eee661606921
> 
> diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c
> b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> index 39da0fb0d6d2..77168a7a7e3f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> @@ -24,6 +24,8 @@
>  #include <linux/random.h>
> 
>  #include "gt/intel_gt_pm.h"
> +#include "gt/uc/intel_gsc_fw.h"
> +
>  #include "i915_driver.h"
>  #include "i915_drv.h"
>  #include "i915_selftest.h"
> @@ -127,6 +129,63 @@ static void set_default_test_all(struct selftest *st,
> unsigned int count)
>  		st[i].enabled = true;
>  }
> 
> +static int
> +__wait_gsc_proxy_completed(struct drm_i915_private *i915,
> +			   unsigned long timeout_ms)
> +{
> +	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY)
> &&
> +			     i915->media_gt &&
> +			     HAS_ENGINE(i915->media_gt, GSC0) &&
> +			     intel_uc_fw_is_loadable(&i915->media_gt-
> >uc.gsc.fw));
> +
> +	/*
> +	 * For gsc proxy component loading + init, we need a much longer
> timeout
> +	 * than what CI selftest infrastrucutre currently uses. This longer wait
> +	 * period depends on the kernel config and component driver load
> ordering
> +	 */
> +	if (timeout_ms < 8000)
> +		timeout_ms = 8000;


Lgtm, just an concern about the fixed number here, shall we set the minimal here, or let i915_selftest.timeout_ms take control? Thus no longer need coding change here in the future.

Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>
> +
> +	if (need_to_wait &&
> +	    (wait_for(intel_gsc_uc_fw_proxy_init_done(&i915->media_gt-
> >uc.gsc),
> +	    timeout_ms)))
> +		return -ETIME;
> +
> +	return 0;
> +}
> +
> +struct __startup_waiter {
> +	const char *name;
> +	int (*wait_to_completed)(struct drm_i915_private *i915, unsigned
> long timeout_ms);
> +};
> +
> +static struct __startup_waiter all_startup_waiters[] = { \
> +	{"gsc_proxy", __wait_gsc_proxy_completed} \
> +	};
> +
> +static int __wait_on_all_system_dependencies(struct drm_i915_private
> *i915)
> +{
> +	struct __startup_waiter *waiter = all_startup_waiters;
> +	int count = ARRAY_SIZE(all_startup_waiters);
> +	int ret;
> +
> +	if (!waiter || !count || !i915)
> +		return 0;
> +
> +	for (; count--; waiter++) {
> +		if (!waiter->wait_to_completed)
> +			continue;
> +		ret = waiter->wait_to_completed(i915,
> i915_selftest.timeout_ms);
> +		if (ret) {
> +			pr_info(DRIVER_NAME ": Pre-selftest waiter %s failed
> with %d\n",
> +				waiter->name, ret);
> +			return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int __run_selftests(const char *name,
>  			   struct selftest *st,
>  			   unsigned int count,
> @@ -134,6 +193,8 @@ static int __run_selftests(const char *name,
>  {
>  	int err = 0;
> 
> +	__wait_on_all_system_dependencies(data);
> +
>  	while (!i915_selftest.random_seed)
>  		i915_selftest.random_seed = get_random_u32();
>
Teres Alexis, Alan Previn June 8, 2023, 6:31 p.m. UTC | #2
On Thu, 2023-06-08 at 18:14 +0000, Dong, Zhanjun wrote:
> See my comments below.
> 
> > -----Original Message-----
> > From: Alan Previn <alan.previn.teres.alexis@intel.com>
alan:snip

> > +static int
> > +__wait_gsc_proxy_completed(struct drm_i915_private *i915,
> > +			   unsigned long timeout_ms)
> > +{
> > +	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY)
> > &&
> > +			     i915->media_gt &&
> > +			     HAS_ENGINE(i915->media_gt, GSC0) &&
> > +			     intel_uc_fw_is_loadable(&i915->media_gt-
> > > uc.gsc.fw));
> > +
> > +	/*
> > +	 * For gsc proxy component loading + init, we need a much longer
> > timeout
> > +	 * than what CI selftest infrastrucutre currently uses. This longer wait
> > +	 * period depends on the kernel config and component driver load
> > ordering
> > +	 */
> > +	if (timeout_ms < 8000)
> > +		timeout_ms = 8000;
> 
> 
> Lgtm, just an concern about the fixed number here, shall we set the minimal here, or let i915_selftest.timeout_ms take control? Thus no longer need coding change here in the future.
> 
> Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>

Thanks Zhanjun, unfortunately, based on internal testing, i915_selftest.timeout_ms default is too
low that it does occasionally timeout for CI. From experience, with a lean ubuntu config, it typically
takes about ~1 seconds for the mei-gsc-sw-proxy component driver to load after i915 loads.
Since CI regular unloads and reloads i915, the timeout observed ends up being reported as issue.

8 seconds was based on internal testing of the worst case scenario - which hardly ever happens.
We've only seen the 8 second happen when the kernel config has configs enabled for very many SOC IP
drivers and component driver (seen one at least one customer config) or if the MTL board IFWI was only
just reflashed (this would be a one-off 8 seconds, we suspect due to the firmware doing additional steps)
Zhanjun Dong June 8, 2023, 7:01 p.m. UTC | #3
> -----Original Message-----
> From: Teres Alexis, Alan Previn <alan.previn.teres.alexis@intel.com>
> Sent: June 8, 2023 2:31 PM
> To: Dong, Zhanjun <zhanjun.dong@intel.com>; intel-
> gfx@lists.freedesktop.org
> Cc: dri-devel@lists.freedesktop.org
> Subject: Re: [v2] drm/i915/selftest/gsc: Ensure GSC Proxy init completes
> before selftests
> 
> On Thu, 2023-06-08 at 18:14 +0000, Dong, Zhanjun wrote:
> > See my comments below.
> >
> > > -----Original Message-----
> > > From: Alan Previn <alan.previn.teres.alexis@intel.com>
> alan:snip
> 
> > > +static int
> > > +__wait_gsc_proxy_completed(struct drm_i915_private *i915,
> > > +			   unsigned long timeout_ms)
> > > +{
> > > +	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY)
> > > &&
> > > +			     i915->media_gt &&
> > > +			     HAS_ENGINE(i915->media_gt, GSC0) &&
> > > +			     intel_uc_fw_is_loadable(&i915->media_gt-
> > > > uc.gsc.fw));
> > > +
> > > +	/*
> > > +	 * For gsc proxy component loading + init, we need a much longer
> > > timeout
> > > +	 * than what CI selftest infrastrucutre currently uses. This longer wait
> > > +	 * period depends on the kernel config and component driver load
> > > ordering
> > > +	 */
> > > +	if (timeout_ms < 8000)
> > > +		timeout_ms = 8000;
> >
> >
> > Lgtm, just an concern about the fixed number here, shall we set the
> minimal here, or let i915_selftest.timeout_ms take control? Thus no longer
> need coding change here in the future.
> >
> > Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>
> 
> Thanks Zhanjun, unfortunately, based on internal testing,
> i915_selftest.timeout_ms default is too
> low that it does occasionally timeout for CI. From experience, with a lean
> ubuntu config, it typically
> takes about ~1 seconds for the mei-gsc-sw-proxy component driver to load
> after i915 loads.
> Since CI regular unloads and reloads i915, the timeout observed ends up
> being reported as issue.
> 
> 8 seconds was based on internal testing of the worst case scenario - which
> hardly ever happens.
> We've only seen the 8 second happen when the kernel config has configs
> enabled for very many SOC IP
> drivers and component driver (seen one at least one customer config) or if
> the MTL board IFWI was only
> just reflashed (this would be a one-off 8 seconds, we suspect due to the
> firmware doing additional steps)
> 

Thanks for detailed info. The i915_selftest.timeout_ms is too low for this test, so it need an special minimal for itself, valid reason.
The concern I raised is at minor level. Looks good to me.

Regards,
Zhanjun
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c
index 39da0fb0d6d2..77168a7a7e3f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
+++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
@@ -24,6 +24,8 @@ 
 #include <linux/random.h>
 
 #include "gt/intel_gt_pm.h"
+#include "gt/uc/intel_gsc_fw.h"
+
 #include "i915_driver.h"
 #include "i915_drv.h"
 #include "i915_selftest.h"
@@ -127,6 +129,63 @@  static void set_default_test_all(struct selftest *st, unsigned int count)
 		st[i].enabled = true;
 }
 
+static int
+__wait_gsc_proxy_completed(struct drm_i915_private *i915,
+			   unsigned long timeout_ms)
+{
+	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY) &&
+			     i915->media_gt &&
+			     HAS_ENGINE(i915->media_gt, GSC0) &&
+			     intel_uc_fw_is_loadable(&i915->media_gt->uc.gsc.fw));
+
+	/*
+	 * For gsc proxy component loading + init, we need a much longer timeout
+	 * than what CI selftest infrastrucutre currently uses. This longer wait
+	 * period depends on the kernel config and component driver load ordering
+	 */
+	if (timeout_ms < 8000)
+		timeout_ms = 8000;
+
+	if (need_to_wait &&
+	    (wait_for(intel_gsc_uc_fw_proxy_init_done(&i915->media_gt->uc.gsc),
+	    timeout_ms)))
+		return -ETIME;
+
+	return 0;
+}
+
+struct __startup_waiter {
+	const char *name;
+	int (*wait_to_completed)(struct drm_i915_private *i915, unsigned long timeout_ms);
+};
+
+static struct __startup_waiter all_startup_waiters[] = { \
+	{"gsc_proxy", __wait_gsc_proxy_completed} \
+	};
+
+static int __wait_on_all_system_dependencies(struct drm_i915_private *i915)
+{
+	struct __startup_waiter *waiter = all_startup_waiters;
+	int count = ARRAY_SIZE(all_startup_waiters);
+	int ret;
+
+	if (!waiter || !count || !i915)
+		return 0;
+
+	for (; count--; waiter++) {
+		if (!waiter->wait_to_completed)
+			continue;
+		ret = waiter->wait_to_completed(i915, i915_selftest.timeout_ms);
+		if (ret) {
+			pr_info(DRIVER_NAME ": Pre-selftest waiter %s failed with %d\n",
+				waiter->name, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
 static int __run_selftests(const char *name,
 			   struct selftest *st,
 			   unsigned int count,
@@ -134,6 +193,8 @@  static int __run_selftests(const char *name,
 {
 	int err = 0;
 
+	__wait_on_all_system_dependencies(data);
+
 	while (!i915_selftest.random_seed)
 		i915_selftest.random_seed = get_random_u32();