diff mbox series

[i-g-t] tests/core_hotunplug: Take care of closing fences before failing

Message ID 20201014165535.25668-1-janusz.krzysztofik@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [i-g-t] tests/core_hotunplug: Take care of closing fences before failing | expand

Commit Message

Janusz Krzysztofik Oct. 14, 2020, 4:55 p.m. UTC
The test was designed to keep track of open device file descriptors
for safe driver unbind on recovery from a failed subtest.  In that
context, fences introduced by commit 1fbd127bd4e1 ("core_hotplug:
Teach the healthcheck how to check execution status") can affect device
recovery as much as an open device file if not closed before unbind.

Moreover, forced GPU reset which used to be applied on recovery from a
failed i915 GPU health check is no longer reachable since a GPU hang
hopefully detected by the new health check algorithm can now break the
whole recovery procedure prematurely.

Refactor local_i915_healthcheck() so it takes care of closing fences
and returns a result to its caller instead of long jumping on failures
believed to be recoverable.  While avoiding use of igt_assert() and
friends, report actual source and error code of failures via
igt_warn_on_f().

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/core_hotunplug.c | 46 ++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

Comments

Bernatowicz, Marcin Oct. 15, 2020, 7:40 a.m. UTC | #1
On Wed, 2020-10-14 at 18:55 +0200, Janusz Krzysztofik wrote:
> The test was designed to keep track of open device file descriptors
> for safe driver unbind on recovery from a failed subtest.  In that
> context, fences introduced by commit 1fbd127bd4e1 ("core_hotplug:
> Teach the healthcheck how to check execution status") can affect
> device
> recovery as much as an open device file if not closed before unbind.
> 
> Moreover, forced GPU reset which used to be applied on recovery from
> a
> failed i915 GPU health check is no longer reachable since a GPU hang
> hopefully detected by the new health check algorithm can now break
> the
> whole recovery procedure prematurely.
> 
> Refactor local_i915_healthcheck() so it takes care of closing fences
> and returns a result to its caller instead of long jumping on
> failures
> believed to be recoverable.  While avoiding use of igt_assert() and
> friends, report actual source and error code of failures via
> igt_warn_on_f().
> 
> Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com
> >
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  tests/core_hotunplug.c | 46 ++++++++++++++++++++++++++++++++------
> ----
>  1 file changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
> index 70669c590..d6db02bad 100644
> --- a/tests/core_hotunplug.c
> +++ b/tests/core_hotunplug.c
> @@ -233,9 +233,9 @@ static int merge_fences(int old, int new)
>  		return new;
>  
>  	merge = sync_fence_merge(old, new);
> -	igt_assert(merge != -1);
> -	close(old);
> -	close(new);
> +	/* Assume fence close errors don't affect device close status
> */
> +	igt_ignore_warn(local_close(old, "old fence close failed"));
> +	igt_ignore_warn(local_close(new, "new fence close failed"));
>  
>  	return merge;
>  }
> @@ -249,29 +249,53 @@ static int local_i915_healthcheck(int i915,
> const char *prefix)
>  		.buffer_count = 1,
>  	};
>  	const struct intel_execution_engine2 *engine;
> -	int fence = -1;
> +	int fence = -1, err = 0, status = 1;
>  
>  	local_debug("%s%s\n", prefix, "running i915 GPU healthcheck");
> -	if (local_i915_is_wedged(i915))
> +	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU found
> wedged\n"))
>  		return -EIO;
>  
> +	/* Assume gem_create()/gem_write() failures are unrecoverable
> */
>  	obj.handle = gem_create(i915, 4096);
>  	gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
>  
> +	/* As soon as a fence is open, don't fail before closing it */
>  	__for_each_physical_engine(i915, engine) {
>  		execbuf.flags = engine->flags | I915_EXEC_FENCE_OUT;
> -		gem_execbuf_wr(i915, &execbuf);
> +		err = __gem_execbuf_wr(i915, &execbuf);
> +		if (igt_warn_on_f(err < 0, "__gem_execbuf_wr() returned
> %d\n",
> +				  err))
> +			break;
>  
>  		fence = merge_fences(fence, execbuf.rsvd2 >> 32);
> +		if (igt_warn_on_f(fence < 0, "merge_fences() returned
> %d\n",
> +				  fence)) {
> +			err = fence;
> +			break;
> +		}
> +	}
> +	if (fence >= 0) {
> +		status = sync_fence_wait(fence, -1);
> +		if (igt_warn_on_f(status < 0, "sync_fence_wait()
> returned %d\n",
> +				  status))
> +			err = status;
> +		if (!err)
> +			status = sync_fence_status(fence);
> +
> +		/* Assume fence close errors don't affect device close
> status */
> +		igt_ignore_warn(local_close(fence, "fence close
> failed"));
>  	}
> -	igt_assert(fence != -1);
> +
> +	/* Assume gem_close() failure is unrecoverable */
>  	gem_close(i915, obj.handle);
>  
> -	igt_assert_eq(sync_fence_wait(fence, -1), 0);
> -	igt_assert_eq(sync_fence_status(fence), 1);
> -	close(fence);
> +	if (err < 0)
> +		return err;
> +	if (igt_warn_on_f(status != 1, "sync_fence_status() returned
> %d\n",
> +			  status))
> +		return -1;
>  
> -	if (local_i915_is_wedged(i915))
> +	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU turned
> wedged\n"))
>  		return -EIO;
LGTM,
Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz@linux.intel.com>

>  
>  	return 0;
Janusz Krzysztofik Oct. 15, 2020, 5:18 p.m. UTC | #2
On Thu, 2020-10-15 at 09:40 +0200, Marcin Bernatowicz wrote:
> On Wed, 2020-10-14 at 18:55 +0200, Janusz Krzysztofik wrote:
> > The test was designed to keep track of open device file descriptors
> > for safe driver unbind on recovery from a failed subtest.  In that
> > context, fences introduced by commit 1fbd127bd4e1 ("core_hotplug:
> > Teach the healthcheck how to check execution status") can affect
> > device
> > recovery as much as an open device file if not closed before unbind.
> > 
> > Moreover, forced GPU reset which used to be applied on recovery from
> > a
> > failed i915 GPU health check is no longer reachable since a GPU hang
> > hopefully detected by the new health check algorithm can now break
> > the
> > whole recovery procedure prematurely.
> > 
> > Refactor local_i915_healthcheck() so it takes care of closing fences
> > and returns a result to its caller instead of long jumping on
> > failures
> > believed to be recoverable.  While avoiding use of igt_assert() and
> > friends, report actual source and error code of failures via
> > igt_warn_on_f().
> > 
> > Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  tests/core_hotunplug.c | 46 ++++++++++++++++++++++++++++++++------
> > ----
> >  1 file changed, 35 insertions(+), 11 deletions(-)
> > 
> > diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
> > index 70669c590..d6db02bad 100644
> > --- a/tests/core_hotunplug.c
> > +++ b/tests/core_hotunplug.c
> > @@ -233,9 +233,9 @@ static int merge_fences(int old, int new)
> >  		return new;
> >  
> >  	merge = sync_fence_merge(old, new);
> > -	igt_assert(merge != -1);
> > -	close(old);
> > -	close(new);
> > +	/* Assume fence close errors don't affect device close status
> > */
> > +	igt_ignore_warn(local_close(old, "old fence close failed"));
> > +	igt_ignore_warn(local_close(new, "new fence close failed"));
> >  
> >  	return merge;
> >  }
> > @@ -249,29 +249,53 @@ static int local_i915_healthcheck(int i915,
> > const char *prefix)
> >  		.buffer_count = 1,
> >  	};
> >  	const struct intel_execution_engine2 *engine;
> > -	int fence = -1;
> > +	int fence = -1, err = 0, status = 1;
> >  
> >  	local_debug("%s%s\n", prefix, "running i915 GPU healthcheck");
> > -	if (local_i915_is_wedged(i915))
> > +	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU found
> > wedged\n"))
> >  		return -EIO;
> >  
> > +	/* Assume gem_create()/gem_write() failures are unrecoverable
> > */
> >  	obj.handle = gem_create(i915, 4096);
> >  	gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
> >  
> > +	/* As soon as a fence is open, don't fail before closing it */
> >  	__for_each_physical_engine(i915, engine) {
> >  		execbuf.flags = engine->flags | I915_EXEC_FENCE_OUT;
> > -		gem_execbuf_wr(i915, &execbuf);
> > +		err = __gem_execbuf_wr(i915, &execbuf);
> > +		if (igt_warn_on_f(err < 0, "__gem_execbuf_wr() returned
> > %d\n",
> > +				  err))
> > +			break;
> >  
> >  		fence = merge_fences(fence, execbuf.rsvd2 >> 32);
> > +		if (igt_warn_on_f(fence < 0, "merge_fences() returned
> > %d\n",
> > +				  fence)) {
> > +			err = fence;
> > +			break;
> > +		}
> > +	}
> > +	if (fence >= 0) {
> > +		status = sync_fence_wait(fence, -1);
> > +		if (igt_warn_on_f(status < 0, "sync_fence_wait()
> > returned %d\n",
> > +				  status))
> > +			err = status;
> > +		if (!err)
> > +			status = sync_fence_status(fence);
> > +
> > +		/* Assume fence close errors don't affect device close
> > status */
> > +		igt_ignore_warn(local_close(fence, "fence close
> > failed"));
> >  	}
> > -	igt_assert(fence != -1);
> > +
> > +	/* Assume gem_close() failure is unrecoverable */
> >  	gem_close(i915, obj.handle);
> >  
> > -	igt_assert_eq(sync_fence_wait(fence, -1), 0);
> > -	igt_assert_eq(sync_fence_status(fence), 1);
> > -	close(fence);
> > +	if (err < 0)
> > +		return err;
> > +	if (igt_warn_on_f(status != 1, "sync_fence_status() returned
> > %d\n",
> > +			  status))
> > +		return -1;
> >  
> > -	if (local_i915_is_wedged(i915))
> > +	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU turned
> > wedged\n"))
> >  		return -EIO;
> LGTM,
> Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz@linux.intel.com>

Thanks Marcin, pushed.

Unfortunately I forgot to include the R-b and pushed without it so it
will exist only in the list archives, sorry.

Thanks,
Janusz

> 
> >  
> >  	return 0;
diff mbox series

Patch

diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 70669c590..d6db02bad 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -233,9 +233,9 @@  static int merge_fences(int old, int new)
 		return new;
 
 	merge = sync_fence_merge(old, new);
-	igt_assert(merge != -1);
-	close(old);
-	close(new);
+	/* Assume fence close errors don't affect device close status */
+	igt_ignore_warn(local_close(old, "old fence close failed"));
+	igt_ignore_warn(local_close(new, "new fence close failed"));
 
 	return merge;
 }
@@ -249,29 +249,53 @@  static int local_i915_healthcheck(int i915, const char *prefix)
 		.buffer_count = 1,
 	};
 	const struct intel_execution_engine2 *engine;
-	int fence = -1;
+	int fence = -1, err = 0, status = 1;
 
 	local_debug("%s%s\n", prefix, "running i915 GPU healthcheck");
-	if (local_i915_is_wedged(i915))
+	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU found wedged\n"))
 		return -EIO;
 
+	/* Assume gem_create()/gem_write() failures are unrecoverable */
 	obj.handle = gem_create(i915, 4096);
 	gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
 
+	/* As soon as a fence is open, don't fail before closing it */
 	__for_each_physical_engine(i915, engine) {
 		execbuf.flags = engine->flags | I915_EXEC_FENCE_OUT;
-		gem_execbuf_wr(i915, &execbuf);
+		err = __gem_execbuf_wr(i915, &execbuf);
+		if (igt_warn_on_f(err < 0, "__gem_execbuf_wr() returned %d\n",
+				  err))
+			break;
 
 		fence = merge_fences(fence, execbuf.rsvd2 >> 32);
+		if (igt_warn_on_f(fence < 0, "merge_fences() returned %d\n",
+				  fence)) {
+			err = fence;
+			break;
+		}
+	}
+	if (fence >= 0) {
+		status = sync_fence_wait(fence, -1);
+		if (igt_warn_on_f(status < 0, "sync_fence_wait() returned %d\n",
+				  status))
+			err = status;
+		if (!err)
+			status = sync_fence_status(fence);
+
+		/* Assume fence close errors don't affect device close status */
+		igt_ignore_warn(local_close(fence, "fence close failed"));
 	}
-	igt_assert(fence != -1);
+
+	/* Assume gem_close() failure is unrecoverable */
 	gem_close(i915, obj.handle);
 
-	igt_assert_eq(sync_fence_wait(fence, -1), 0);
-	igt_assert_eq(sync_fence_status(fence), 1);
-	close(fence);
+	if (err < 0)
+		return err;
+	if (igt_warn_on_f(status != 1, "sync_fence_status() returned %d\n",
+			  status))
+		return -1;
 
-	if (local_i915_is_wedged(i915))
+	if (igt_warn_on_f(local_i915_is_wedged(i915), "GPU turned wedged\n"))
 		return -EIO;
 
 	return 0;