diff mbox series

[v3] drm/i915/selftest/gsc: Ensure GSC Proxy init completes before selftests

Message ID 20230629204248.1283601-1-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Headers show
Series [v3] drm/i915/selftest/gsc: Ensure GSC Proxy init completes before selftests | expand

Commit Message

Alan Previn June 29, 2023, 8:42 p.m. UTC
On MTL, if the GSC Proxy init flows haven't completed, submissions to the
GSC engine will fail. Those init flows are dependent on the mei's
gsc_proxy component that is loaded in parallel with i915 and a
worker that could potentially start after i915 driver init is done.

That said, all subsystems that access the GSC engine today do check
for such init flow completion before using the GSC engine. However,
selftests currently don't wait on anything before starting.

To fix this, add a waiter function at the start of __run_selftests
that waits for gsc-proxy init flows to complete. While implementing this,
use a table of function pointers so it's scalable to add additional
waiter functions for future such "wait on dependency" cases.

Difference from prior versions:
   v3: Rebase to latest drm-tip.
   v2: Based on internal testing, increase the timeout for gsc-proxy
       specific case to 8 seconds.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>

---
 .../gpu/drm/i915/selftests/i915_selftest.c    | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)


base-commit: 6f8963ce33be65c67e53b16fa18325e12ab76861

Comments

Tvrtko Ursulin June 29, 2023, 9:44 p.m. UTC | #1
On 29/06/2023 21:42, Alan Previn wrote:
> On MTL, if the GSC Proxy init flows haven't completed, submissions to the
> GSC engine will fail. Those init flows are dependent on the mei's
> gsc_proxy component that is loaded in parallel with i915 and a
> worker that could potentially start after i915 driver init is done.
> 
> That said, all subsytems that access the GSC engine today does check
> for such init flow completion before using the GSC engine. However,
> selftests currently don't wait on anything before starting.
> 
> To fix this, add a waiter function at the start of __run_selftests
> that waits for gsc-proxy init flows to complete. While implementing this,
> use an table of function pointers so its scalable to add additional
> waiter functions for future such "wait on dependency" cases that.
> 
> Difference from prior versions:
>     v3: Rebase to latest drm-tip.
>     v2: Based on internal testing, increase the timeout for gsc-proxy
>         specific case to 8 seconds.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> Reviewed-by: Zhanjun Dong <zhanjun.dong@intel.com>
> 
> ---
>   .../gpu/drm/i915/selftests/i915_selftest.c    | 61 +++++++++++++++++++
>   1 file changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> index 39da0fb0d6d2..a74b7e264d92 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> @@ -24,6 +24,8 @@
>   #include <linux/random.h>
>   
>   #include "gt/intel_gt_pm.h"
> +#include "gt/uc/intel_gsc_fw.h"
> +
>   #include "i915_driver.h"
>   #include "i915_drv.h"
>   #include "i915_selftest.h"
> @@ -127,6 +129,63 @@ static void set_default_test_all(struct selftest *st, unsigned int count)
>   		st[i].enabled = true;
>   }
>   
> +static int
> +__wait_gsc_proxy_completed(struct drm_i915_private *i915,
> +			   unsigned long timeout_ms)
> +{
> +	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY) &&
> +			     i915->media_gt &&
> +			     HAS_ENGINE(i915->media_gt, GSC0) &&
> +			     intel_uc_fw_is_loadable(&i915->media_gt->uc.gsc.fw));
> +
> +	/*
> +	 * For gsc proxy component loading + init, we need a much longer timeout
> +	 * than what CI selftest infrastrucutre currently uses. This longer wait
> +	 * period depends on the kernel config and component driver load ordering
> +	 */

How is a CI timeout value relevant?

Plus from the commit message it sounds like the point of wait is so 
submission to gsc does not fail if loading is still in progress, not 
that the CI times out. So what is the main problem?

> +	if (timeout_ms < 8000)
> +		timeout_ms = 8000;
> +
> +	if (need_to_wait &&
> +	    (wait_for(intel_gsc_uc_fw_proxy_init_done(&i915->media_gt->uc.gsc, true),
> +	    timeout_ms)))
> +		return -ETIME;
> +
> +	return 0;
> +}
> +
> +struct __startup_waiter {
> +	const char *name;
> +	int (*wait_to_completed)(struct drm_i915_private *i915, unsigned long timeout_ms);
> +};
> +
> +static struct __startup_waiter all_startup_waiters[] = { \
> +	{"gsc_proxy", __wait_gsc_proxy_completed} \
> +	};
> +
> +static int __wait_on_all_system_dependencies(struct drm_i915_private *i915)
> +{
> +	struct __startup_waiter *waiter = all_startup_waiters;
> +	int count = ARRAY_SIZE(all_startup_waiters);
> +	int ret;
> +
> +	if (!waiter || !count || !i915)
> +		return 0;

Ugh.

If it ever becomes an empty array just zap this whole code and not have 
these checks.

Also, no i915 is a possibility?

But actually.. please don't add the function table generalization unless 
it is already known something else is coming to be plugged into it.

> +
> +	for (; count--; waiter++) {
> +		if (!waiter->wait_to_completed)
> +			continue;
> +		ret = waiter->wait_to_completed(i915, i915_selftest.timeout_ms);
> +		if (ret) {
> +			pr_info(DRIVER_NAME ": Pre-selftest waiter %s failed with %d\n",
> +				waiter->name, ret);
> +			return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>   static int __run_selftests(const char *name,
>   			   struct selftest *st,
>   			   unsigned int count,
> @@ -134,6 +193,8 @@ static int __run_selftests(const char *name,
>   {
>   	int err = 0;
>   
> +	__wait_on_all_system_dependencies(data);

Why does this need to be top level selftests and not just a wait for 
intel_gsc_uc_fw_proxy_init_done in the tests where it is relevant, via 
some helper or something?

Regards,

Tvrtko

> +
>   	while (!i915_selftest.random_seed)
>   		i915_selftest.random_seed = get_random_u32();
>   
> 
> base-commit: 6f8963ce33be65c67e53b16fa18325e12ab76861
Alan Previn July 11, 2023, 6:15 p.m. UTC | #2
Thanks for reviewing, Tvrtko; below are my responses.
I'll rerev without generalized func ptr and only for the subtests that need it.
...alan

On Thu, 2023-06-29 at 22:44 +0100, Tvrtko Ursulin wrote:
> On 29/06/2023 21:42, Alan Previn wrote:
> > On MTL, if the GSC Proxy init flows haven't completed, submissions to the
> > GSC engine will fail. Those init flows are dependent on the mei's
> > gsc_proxy component that is loaded in parallel with i915 and a
> > worker that could potentially start after i915 driver init is done.
> > 
> > That said, all subsytems that access the GSC engine today does check
> > for such init flow completion before using the GSC engine. However,
> > selftests currently don't wait on anything before starting.

alan:snip
> > +static int
> > +__wait_gsc_proxy_completed(struct drm_i915_private *i915,
> > +			   unsigned long timeout_ms)
> > +{
> > +	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY) &&
> > +			     i915->media_gt &&
> > +			     HAS_ENGINE(i915->media_gt, GSC0) &&
> > +			     intel_uc_fw_is_loadable(&i915->media_gt->uc.gsc.fw));
> > +
> > +	/*
> > +	 * For gsc proxy component loading + init, we need a much longer timeout
> > +	 * than what CI selftest infrastrucutre currently uses. This longer wait
> > +	 * period depends on the kernel config and component driver load ordering
> > +	 */
> 
> How is a CI timeout value relevant?
> 
> Plus from the commit message it sounds like the point of wait is so 
> submission to gsc does not fail if loading is still in progress, not 
> that the CI times out. So what is the main problem?

Alan: The comment was meant to explain why we override the CI selftest timeout (an input param
to the generalized func ptr loop) with something much larger specifically for gsc-proxy waiting.
However, since your other review comment below is to remove the generalization, this comment
will no longer make sense, so I'll remove it accordingly. The point was that CI might
have a system-level selftest timeout of something much smaller, like 500 milliseconds (used to
have some control over the execution time), but the gsc-proxy wait is not in i915's
control; it depends on the kernel component driver loading flow (and, on rare occasions, on
a fresh IFWI having been flashed, which causes a one-time longer period for fw-proxy flows to complete).
In any case, I'll remove the comment as per your direction.

> 
> > +	if (timeout_ms < 8000)
> > +		timeout_ms = 8000;
> > +
> 

alan:snip
> > +static int __wait_on_all_system_dependencies(struct drm_i915_private *i915)
> > +{
> > +	struct __startup_waiter *waiter = all_startup_waiters;
> > +	int count = ARRAY_SIZE(all_startup_waiters);
> > +	int ret;
> > +
> > +	if (!waiter || !count || !i915)
> > +		return 0;
> 
> Ugh.
> 
> If it ever becomes an empty array just zap this whole code and not have 
> these checks.
Alan: Okay sure - will remove these checks except the i915 one - see below.
> 
> Also, no i915 is a possibility?
Alan: i915_mock_selftests passes in NULL for i915. This checking of the i915
aligns with the existing __run_selftests code - but in that function the
param is called "data" even though in all callers of __run_selftests, that
"data" is actually i915 when it's not NULL.
> 
> But actually.. please don't add the function table generalization unless 
> it is already known something else is coming to be plugged into it.
Alan: Okay- I'll remove it.

alan:snip
> 

> > @@ -134,6 +193,8 @@ static int __run_selftests(const char *name,
> >   {
> >   	int err = 0;
> >   
> > +	__wait_on_all_system_dependencies(data);
> 
> Why does this need to be top level selftests and not just a wait for 
> intel_gsc_uc_fw_proxy_init_done in the tests where it is relevant, via 
> some helper or something?
Alan: it was an offline decision because we didn't want to repeat
the same check for all permutations of selftests' subtests (i.e. considering
module params can dictate to skip some subtests but execute others).

Anyways, let me get back to you on how many selftests' subtests actually exercise the
need for proxy-init to complete - if it's just 1-to-2 subtests, I'll remove the code
from here and move it into the individual subtests.

alan:snip
Daniele Ceraolo Spurio July 11, 2023, 6:49 p.m. UTC | #3
<snip>
>>> @@ -134,6 +193,8 @@ static int __run_selftests(const char *name,
>>>    {
>>>    	int err = 0;
>>>    
>>> +	__wait_on_all_system_dependencies(data);
>> Why does this need to be top level selftests and not just a wait for
>> intel_gsc_uc_fw_proxy_init_done in the tests where it is relevant, via
>> some helper or something?
> Alan: it was an offline decision because we didn't want to repeat
> the same check for all permutations of selftests' subtests (i.e. considering
> module params can dictate to skip some subtests but execute others).
>
> Anyways, let me get back to you on how how many selftests' subtests actually excercise the
> need for proxy-init to complete - if its just 1-to-2 subtest I'll move the remove the code
> from here and move them into the individual subtests.

I don't think it is going to be easy to figure out which selftests are 
impacted. All selftests looping on all engines of course, but also tests 
triggering GT resets and/or messing with the system in other ways. Any 
new tests added will also need to be evaluated.

IMO there is minimal impact of having this check on every test. When 
running selftests we load i915 after the rest of the system has already 
fully booted, so there are no delays in getting the mei component up and 
therefore proxy init is sometimes completed even before the selftest 
code starts; when we do have to wait, it's usually for a very short 
time, because the expected total execution time for the GSC worker when 
not having to wait for the mei component to load is ~750ms (~200ms for 
GSC load + 20ms for HuC auth + ~500ms for proxy init). Having a few 
seconds added to the total selftests runtime is IMO a better option than 
having to maintain a list of impacted tests.

Daniele
Alan Previn July 11, 2023, 10:06 p.m. UTC | #4
On Tue, 2023-07-11 at 11:49 -0700, Ceraolo Spurio, Daniele wrote:
> <snip>
> > > > @@ -134,6 +193,8 @@ static int __run_selftests(const char *name,
> > > >    {
> > > >    	int err = 0;
> > > >    
> > > > +	__wait_on_all_system_dependencies(data);
> > > Why does this need to be top level selftests and not just a wait for
> > > intel_gsc_uc_fw_proxy_init_done in the tests where it is relevant, via
> > > some helper or something?
> > Alan: it was an offline decision because we didn't want to repeat
> > the same check for all permutations of selftests' subtests (i.e. considering
> > module params can dictate to skip some subtests but execute others).
> > 
> > Anyways, let me get back to you on how how many selftests' subtests actually excercise the
> > need for proxy-init to complete - if its just 1-to-2 subtest I'll move the remove the code
> > from here and move them into the individual subtests.
> 
> I don't think it is going to be easy to figure out which selftest are 
> impacted. All selftests looping on all engines of course, but also tests 
> triggering GT resets and/or messing with the system in other ways. Any 
> new tests added will also need to be evaluated.
> 
> IMO there is minimal impact of having this check on every test. When 
> running selftests we load i915 after the rest of the system has already 
> fully booted, so there are no delays in getting the mei component up and 
> therefore proxy init is sometimes completed even before the selftest 
> code starts; when we do have to wait, it's usually for a very short 
> time, because the expected total execution time for the GSC worker when 
> not having to wait for the mei component to load is ~750ms (~200ms for 
> GSC load + 20ms for HuC auth + ~500ms for proxy init). Having a few 
> seconds added to the total selftests runtime is IMO a better option that 
> having to maintain a list of impacted tests.
> 
> Daniele
> 

Thanks Daniele - I completely forgot about reset or other system disruptive tests.
For now I'll re-rev to address Tvrtko's other comments but will keep the waiter
as 'once-top-down' for now and wait for Tvrtko's thoughts on that next rev.
...alan
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c b/drivers/gpu/drm/i915/selftests/i915_selftest.c
index 39da0fb0d6d2..a74b7e264d92 100644
--- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
+++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
@@ -24,6 +24,8 @@ 
 #include <linux/random.h>
 
 #include "gt/intel_gt_pm.h"
+#include "gt/uc/intel_gsc_fw.h"
+
 #include "i915_driver.h"
 #include "i915_drv.h"
 #include "i915_selftest.h"
@@ -127,6 +129,63 @@  static void set_default_test_all(struct selftest *st, unsigned int count)
 		st[i].enabled = true;
 }
 
+static int
+__wait_gsc_proxy_completed(struct drm_i915_private *i915,
+			   unsigned long timeout_ms)
+{
+	bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_GSC_PROXY) &&
+			     i915->media_gt &&
+			     HAS_ENGINE(i915->media_gt, GSC0) &&
+			     intel_uc_fw_is_loadable(&i915->media_gt->uc.gsc.fw));
+
+	/*
+	 * For gsc proxy component loading + init, we need a much longer timeout
+	 * than what CI selftest infrastrucutre currently uses. This longer wait
+	 * period depends on the kernel config and component driver load ordering
+	 */
+	if (timeout_ms < 8000)
+		timeout_ms = 8000;
+
+	if (need_to_wait &&
+	    (wait_for(intel_gsc_uc_fw_proxy_init_done(&i915->media_gt->uc.gsc, true),
+	    timeout_ms)))
+		return -ETIME;
+
+	return 0;
+}
+
+struct __startup_waiter {
+	const char *name;
+	int (*wait_to_completed)(struct drm_i915_private *i915, unsigned long timeout_ms);
+};
+
+static struct __startup_waiter all_startup_waiters[] = { \
+	{"gsc_proxy", __wait_gsc_proxy_completed} \
+	};
+
+static int __wait_on_all_system_dependencies(struct drm_i915_private *i915)
+{
+	struct __startup_waiter *waiter = all_startup_waiters;
+	int count = ARRAY_SIZE(all_startup_waiters);
+	int ret;
+
+	if (!waiter || !count || !i915)
+		return 0;
+
+	for (; count--; waiter++) {
+		if (!waiter->wait_to_completed)
+			continue;
+		ret = waiter->wait_to_completed(i915, i915_selftest.timeout_ms);
+		if (ret) {
+			pr_info(DRIVER_NAME ": Pre-selftest waiter %s failed with %d\n",
+				waiter->name, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
 static int __run_selftests(const char *name,
 			   struct selftest *st,
 			   unsigned int count,
@@ -134,6 +193,8 @@  static int __run_selftests(const char *name,
 {
 	int err = 0;
 
+	__wait_on_all_system_dependencies(data);
+
 	while (!i915_selftest.random_seed)
 		i915_selftest.random_seed = get_random_u32();