diff mbox series

[i-g-t] i915/gem_mmap_gtt: Reset faster and longer to catch fencing errors

Message ID 20190118003238.30588-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [i-g-t] i915/gem_mmap_gtt: Reset faster and longer to catch fencing errors | expand

Commit Message

Chris Wilson Jan. 18, 2019, 12:32 a.m. UTC
Performing a GPU reset clobbers the fence registers, affecting which
addresses the tiled GTT mmap access. If the driver does not take
precautions across a GPU reset, a client may read the wrong values (but
only within their own buffer as the fence will only be degraded to
I915_TILING_NONE, reducing the access area). However, as this requires
performing a read using the indirect GTT at exactly the same time as the
reset occurs, it can be quite difficult to catch, so repeat the test
many times and across all cores simultaneously.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/i915/gem_mmap_gtt.c | 99 +++++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 31 deletions(-)

Comments

Mika Kuoppala Jan. 24, 2019, 3:25 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Performing a GPU reset clobbers the fence registers, affecting which
> addresses the tiled GTT mmap access. If the driver does not take
> precautions across a GPU reset, a client may read the wrong values (but
> only within their own buffer as the fence will only be degraded to
> I915_TILING_NONE, reducing the access area). However, as this requires
> performing a read using the indirect GTT at exactly the same time as the
> reset occurs, it can be quite difficult to catch, so repeat the test
> many times and across all cores simultaneously.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  tests/i915/gem_mmap_gtt.c | 99 +++++++++++++++++++++++++++------------
>  1 file changed, 68 insertions(+), 31 deletions(-)
>
> diff --git a/tests/i915/gem_mmap_gtt.c b/tests/i915/gem_mmap_gtt.c
> index f63535556..21880d31d 100644
> --- a/tests/i915/gem_mmap_gtt.c
> +++ b/tests/i915/gem_mmap_gtt.c
> @@ -38,6 +38,7 @@
>  #include "drm.h"
>  
>  #include "igt.h"
> +#include "igt_sysfs.h"
>  #include "igt_x86.h"
>  
>  #ifndef PAGE_SIZE
> @@ -375,50 +376,86 @@ test_clflush(int fd)
>  static void
>  test_hang(int fd)
>  {
> -	igt_hang_t hang;
> -	uint32_t patterns[] = {
> +	const uint32_t patterns[] = {
>  		0, 0xaaaaaaaa, 0x55555555, 0xcccccccc,
>  	};
> -	uint32_t *gtt[3];
> -	int last_pattern = 0;
> -	int next_pattern = 1;
> -	int i;
> +	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> +	struct {
> +		bool done;
> +		bool error;
> +	} *control;
> +	unsigned long count;
> +	igt_hang_t hang;
> +	int dir;
>  
> -	for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) {
> -		uint32_t handle;
> +	hang = igt_allow_hang(fd, 0, 0);
> +	igt_sysfs_set_parameter(fd, "reset", "1"); /* global resets */

igt_assert to be sure that you made it?

igt_sysfs_set_module_parameter would be less misleadning :P

>  
> -		handle = gem_create(fd, OBJECT_SIZE);
> -		gem_set_tiling(fd, handle, i, 2048);
> +	control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +	igt_assert(control != MAP_FAILED);
>  
> -		gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE);
> -		set_domain_gtt(fd, handle);
> -		gem_close(fd, handle);
> -	}
> +	igt_fork(child, ncpus) {
> +		int last_pattern = 0;
> +		int next_pattern = 1;
> +		uint32_t *gtt[2];

You throw tiling none out as it is just a distraction and
waste of cycles?

>  
> -	hang = igt_hang_ring(fd, I915_EXEC_RENDER);
> +		for (int i = 0; i < ARRAY_SIZE(gtt); i++) {
> +			uint32_t handle;
>  
> -	do {
> -		for (i = 0; i < OBJECT_SIZE / 64; i++) {
> -			int x = 16*i + (i%16);
> +			handle = gem_create(fd, OBJECT_SIZE);
> +			gem_set_tiling(fd, handle, I915_TILING_X + i, 2048);

You could have setup a priori. But this prolly is faster than
one reset cycle of tests so nothing to gain.

>  
> -			igt_assert(gtt[0][x] == patterns[last_pattern]);
> -			igt_assert(gtt[1][x] == patterns[last_pattern]);
> -			igt_assert(gtt[2][x] == patterns[last_pattern]);
> +			gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE);
> +			set_domain_gtt(fd, handle);
> +			gem_close(fd, handle);
> +		}
>  
> -			gtt[0][x] = patterns[next_pattern];
> -			gtt[1][x] = patterns[next_pattern];
> -			gtt[2][x] = patterns[next_pattern];
> +		while (!READ_ONCE(control->done)) {
> +			for (int i = 0; i < OBJECT_SIZE / 64; i++) {
> +				uint32_t expected = patterns[last_pattern];
> +				uint32_t found[2];
> +				int x = 16*i + (i%16);

nitpicking here for consts and unsigned x.

> +
> +				found[0] = READ_ONCE(gtt[0][x]);
> +				found[1] = READ_ONCE(gtt[1][x]);
> +
> +				if (found[0] != expected ||
> +				    found[1] != expected) {
> +					igt_warn("child[%d] found (%x, %x), expecting %x\n",
> +						 child,
> +						 found[0], found[1],
> +						 expected);
> +					control->error = true;
> +					exit(0);
> +				}
> +
> +				gtt[0][x] = patterns[next_pattern];
> +				gtt[1][x] = patterns[next_pattern];
> +			}
> +
> +			last_pattern = next_pattern;
> +			next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns);
>  		}
> +	}
>  
> -		last_pattern = next_pattern;
> -		next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns);
> -	} while (gem_bo_busy(fd, hang.spin->handle));

Well, no concern here anymore that something would sync on
there.

Only nitpicks so,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


> +	count = 0;
> +	dir = igt_debugfs_dir(fd);
> +	igt_until_timeout(5) {
> +		igt_sysfs_set(dir, "i915_wedged", "-1");
> +		if (READ_ONCE(control->error))
> +			break;
> +		count++;
> +	}
> +	close(dir);
> +	igt_info("%lu resets\n", count);
> +
> +	control->done = true;
> +	igt_waitchildren();
>  
> -	igt_post_hang_ring(fd, hang);
> +	igt_assert(!control->error);
> +	munmap(control, 4096);
>  
> -	munmap(gtt[0], OBJECT_SIZE);
> -	munmap(gtt[1], OBJECT_SIZE);
> -	munmap(gtt[2], OBJECT_SIZE);
> +	igt_disallow_hang(fd, hang);
>  }
>  
>  static int min_tile_width(uint32_t devid, int tiling)
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 24, 2019, 3:34 p.m. UTC | #2
Quoting Mika Kuoppala (2019-01-24 15:25:10)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Performing a GPU reset clobbers the fence registers, affecting which
> > addresses the tiled GTT mmap access. If the driver does not take
> > precautions across a GPU reset, a client may read the wrong values (but
> > only within their own buffer as the fence will only be degraded to
> > I915_TILING_NONE, reducing the access area). However, as this requires
> > performing a read using the indirect GTT at exactly the same time as the
> > reset occurs, it can be quite difficult to catch, so repeat the test
> > many times and across all cores simultaneously.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> > -             gem_set_tiling(fd, handle, i, 2048);
> > +     control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> > +     igt_assert(control != MAP_FAILED);
> >  
> > -             gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE);
> > -             set_domain_gtt(fd, handle);
> > -             gem_close(fd, handle);
> > -     }
> > +     igt_fork(child, ncpus) {
> > +             int last_pattern = 0;
> > +             int next_pattern = 1;
> > +             uint32_t *gtt[2];
> 
> You throw tiling none out as it is just a distraction and
> waste of cycles?

Fences being the name of game, waiting on unfenced GTT accessed
represents a missed opportunity of detecting the glitch. And the glitch
is hard enough to detect.

> > -     hang = igt_hang_ring(fd, I915_EXEC_RENDER);
> > +             for (int i = 0; i < ARRAY_SIZE(gtt); i++) {
> > +                     uint32_t handle;
> >  
> > -     do {
> > -             for (i = 0; i < OBJECT_SIZE / 64; i++) {
> > -                     int x = 16*i + (i%16);
> > +                     handle = gem_create(fd, OBJECT_SIZE);
> > +                     gem_set_tiling(fd, handle, I915_TILING_X + i, 2048);
> 
> You could have setup a priori. But this prolly is faster than
> one reset cycle of tests so nothing to gain.

I was thinking per cpu so we use more fences. But probably not, it
depends on the memory access occurring between the reset and the revoke,
so number of fences less important, just the sheer volume of traffic to
hit the small timing window.
-Chris
diff mbox series

Patch

diff --git a/tests/i915/gem_mmap_gtt.c b/tests/i915/gem_mmap_gtt.c
index f63535556..21880d31d 100644
--- a/tests/i915/gem_mmap_gtt.c
+++ b/tests/i915/gem_mmap_gtt.c
@@ -38,6 +38,7 @@ 
 #include "drm.h"
 
 #include "igt.h"
+#include "igt_sysfs.h"
 #include "igt_x86.h"
 
 #ifndef PAGE_SIZE
@@ -375,50 +376,86 @@  test_clflush(int fd)
 static void
 test_hang(int fd)
 {
-	igt_hang_t hang;
-	uint32_t patterns[] = {
+	const uint32_t patterns[] = {
 		0, 0xaaaaaaaa, 0x55555555, 0xcccccccc,
 	};
-	uint32_t *gtt[3];
-	int last_pattern = 0;
-	int next_pattern = 1;
-	int i;
+	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	struct {
+		bool done;
+		bool error;
+	} *control;
+	unsigned long count;
+	igt_hang_t hang;
+	int dir;
 
-	for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) {
-		uint32_t handle;
+	hang = igt_allow_hang(fd, 0, 0);
+	igt_sysfs_set_parameter(fd, "reset", "1"); /* global resets */
 
-		handle = gem_create(fd, OBJECT_SIZE);
-		gem_set_tiling(fd, handle, i, 2048);
+	control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(control != MAP_FAILED);
 
-		gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE);
-		set_domain_gtt(fd, handle);
-		gem_close(fd, handle);
-	}
+	igt_fork(child, ncpus) {
+		int last_pattern = 0;
+		int next_pattern = 1;
+		uint32_t *gtt[2];
 
-	hang = igt_hang_ring(fd, I915_EXEC_RENDER);
+		for (int i = 0; i < ARRAY_SIZE(gtt); i++) {
+			uint32_t handle;
 
-	do {
-		for (i = 0; i < OBJECT_SIZE / 64; i++) {
-			int x = 16*i + (i%16);
+			handle = gem_create(fd, OBJECT_SIZE);
+			gem_set_tiling(fd, handle, I915_TILING_X + i, 2048);
 
-			igt_assert(gtt[0][x] == patterns[last_pattern]);
-			igt_assert(gtt[1][x] == patterns[last_pattern]);
-			igt_assert(gtt[2][x] == patterns[last_pattern]);
+			gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE);
+			set_domain_gtt(fd, handle);
+			gem_close(fd, handle);
+		}
 
-			gtt[0][x] = patterns[next_pattern];
-			gtt[1][x] = patterns[next_pattern];
-			gtt[2][x] = patterns[next_pattern];
+		while (!READ_ONCE(control->done)) {
+			for (int i = 0; i < OBJECT_SIZE / 64; i++) {
+				uint32_t expected = patterns[last_pattern];
+				uint32_t found[2];
+				int x = 16*i + (i%16);
+
+				found[0] = READ_ONCE(gtt[0][x]);
+				found[1] = READ_ONCE(gtt[1][x]);
+
+				if (found[0] != expected ||
+				    found[1] != expected) {
+					igt_warn("child[%d] found (%x, %x), expecting %x\n",
+						 child,
+						 found[0], found[1],
+						 expected);
+					control->error = true;
+					exit(0);
+				}
+
+				gtt[0][x] = patterns[next_pattern];
+				gtt[1][x] = patterns[next_pattern];
+			}
+
+			last_pattern = next_pattern;
+			next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns);
 		}
+	}
 
-		last_pattern = next_pattern;
-		next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns);
-	} while (gem_bo_busy(fd, hang.spin->handle));
+	count = 0;
+	dir = igt_debugfs_dir(fd);
+	igt_until_timeout(5) {
+		igt_sysfs_set(dir, "i915_wedged", "-1");
+		if (READ_ONCE(control->error))
+			break;
+		count++;
+	}
+	close(dir);
+	igt_info("%lu resets\n", count);
+
+	control->done = true;
+	igt_waitchildren();
 
-	igt_post_hang_ring(fd, hang);
+	igt_assert(!control->error);
+	munmap(control, 4096);
 
-	munmap(gtt[0], OBJECT_SIZE);
-	munmap(gtt[1], OBJECT_SIZE);
-	munmap(gtt[2], OBJECT_SIZE);
+	igt_disallow_hang(fd, hang);
 }
 
 static int min_tile_width(uint32_t devid, int tiling)