Message ID | 20190118003238.30588-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [i-g-t] i915/gem_mmap_gtt: Reset faster and longer to catch fencing errors | expand |
Chris Wilson <chris@chris-wilson.co.uk> writes: > Performing a GPU reset clobbers the fence registers, affecting which > addresses the tiled GTT mmap access. If the driver does not take > precautions across a GPU reset, a client may read the wrong values (but > only within their own buffer as the fence will only be degraded to > I915_TILING_NONE, reducing the access area). However, as this requires > performing a read using the indirect GTT at exactly the same time as the > reset occurs, it can be quite difficult to catch, so repeat the test > many times and across all cores simultaneously. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > tests/i915/gem_mmap_gtt.c | 99 +++++++++++++++++++++++++++------------ > 1 file changed, 68 insertions(+), 31 deletions(-) > > diff --git a/tests/i915/gem_mmap_gtt.c b/tests/i915/gem_mmap_gtt.c > index f63535556..21880d31d 100644 > --- a/tests/i915/gem_mmap_gtt.c > +++ b/tests/i915/gem_mmap_gtt.c > @@ -38,6 +38,7 @@ > #include "drm.h" > > #include "igt.h" > +#include "igt_sysfs.h" > #include "igt_x86.h" > > #ifndef PAGE_SIZE > @@ -375,50 +376,86 @@ test_clflush(int fd) > static void > test_hang(int fd) > { > - igt_hang_t hang; > - uint32_t patterns[] = { > + const uint32_t patterns[] = { > 0, 0xaaaaaaaa, 0x55555555, 0xcccccccc, > }; > - uint32_t *gtt[3]; > - int last_pattern = 0; > - int next_pattern = 1; > - int i; > + const int ncpus = sysconf(_SC_NPROCESSORS_ONLN); > + struct { > + bool done; > + bool error; > + } *control; > + unsigned long count; > + igt_hang_t hang; > + int dir; > > - for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) { > - uint32_t handle; > + hang = igt_allow_hang(fd, 0, 0); > + igt_sysfs_set_parameter(fd, "reset", "1"); /* global resets */ igt_assert to be sure that you made it? igt_sysfs_set_module_parameter would be less misleadning :P > > - handle = gem_create(fd, OBJECT_SIZE); > - gem_set_tiling(fd, handle, i, 2048); > + control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); > + igt_assert(control != MAP_FAILED); > > - gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE); > - set_domain_gtt(fd, handle); > - gem_close(fd, handle); > - } > + igt_fork(child, ncpus) { > + int last_pattern = 0; > + int next_pattern = 1; > + uint32_t *gtt[2]; You throw tiling none out as it is just a distraction and waste of cycles? > > - hang = igt_hang_ring(fd, I915_EXEC_RENDER); > + for (int i = 0; i < ARRAY_SIZE(gtt); i++) { > + uint32_t handle; > > - do { > - for (i = 0; i < OBJECT_SIZE / 64; i++) { > - int x = 16*i + (i%16); > + handle = gem_create(fd, OBJECT_SIZE); > + gem_set_tiling(fd, handle, I915_TILING_X + i, 2048); You could have setup a priori. But this prolly is faster than one reset cycle of tests so nothing to gain. > > - igt_assert(gtt[0][x] == patterns[last_pattern]); > - igt_assert(gtt[1][x] == patterns[last_pattern]); > - igt_assert(gtt[2][x] == patterns[last_pattern]); > + gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE); > + set_domain_gtt(fd, handle); > + gem_close(fd, handle); > + } > > - gtt[0][x] = patterns[next_pattern]; > - gtt[1][x] = patterns[next_pattern]; > - gtt[2][x] = patterns[next_pattern]; > + while (!READ_ONCE(control->done)) { > + for (int i = 0; i < OBJECT_SIZE / 64; i++) { > + uint32_t expected = patterns[last_pattern]; > + uint32_t found[2]; > + int x = 16*i + (i%16); nitpicking here for consts and unsigned x. > + > + found[0] = READ_ONCE(gtt[0][x]); > + found[1] = READ_ONCE(gtt[1][x]); > + > + if (found[0] != expected || > + found[1] != expected) { > + igt_warn("child[%d] found (%x, %x), expecting %x\n", > + child, > + found[0], found[1], > + expected); > + control->error = true; > + exit(0); > + } > + > + gtt[0][x] = patterns[next_pattern]; > + gtt[1][x] = patterns[next_pattern]; > + } > + > + last_pattern = next_pattern; > + next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns); > } > + } > > - last_pattern = next_pattern; > - next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns); > - } while (gem_bo_busy(fd, hang.spin->handle)); Well, no concern here anymore that something would sync on there. Only nitpicks so, Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> > + count = 0; > + dir = igt_debugfs_dir(fd); > + igt_until_timeout(5) { > + igt_sysfs_set(dir, "i915_wedged", "-1"); > + if (READ_ONCE(control->error)) > + break; > + count++; > + } > + close(dir); > + igt_info("%lu resets\n", count); > + > + control->done = true; > + igt_waitchildren(); > > - igt_post_hang_ring(fd, hang); > + igt_assert(!control->error); > + munmap(control, 4096); > > - munmap(gtt[0], OBJECT_SIZE); > - munmap(gtt[1], OBJECT_SIZE); > - munmap(gtt[2], OBJECT_SIZE); > + igt_disallow_hang(fd, hang); > } > > static int min_tile_width(uint32_t devid, int tiling) > -- > 2.20.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Quoting Mika Kuoppala (2019-01-24 15:25:10) > Chris Wilson <chris@chris-wilson.co.uk> writes: > > > Performing a GPU reset clobbers the fence registers, affecting which > > addresses the tiled GTT mmap access. If the driver does not take > > precautions across a GPU reset, a client may read the wrong values (but > > only within their own buffer as the fence will only be degraded to > > I915_TILING_NONE, reducing the access area). However, as this requires > > performing a read using the indirect GTT at exactly the same time as the > > reset occurs, it can be quite difficult to catch, so repeat the test > > many times and across all cores simultaneously. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > --- > > - gem_set_tiling(fd, handle, i, 2048); > > + control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); > > + igt_assert(control != MAP_FAILED); > > > > - gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE); > > - set_domain_gtt(fd, handle); > > - gem_close(fd, handle); > > - } > > + igt_fork(child, ncpus) { > > + int last_pattern = 0; > > + int next_pattern = 1; > > + uint32_t *gtt[2]; > > You throw tiling none out as it is just a distraction and > waste of cycles? Fences being the name of game, waiting on unfenced GTT accessed represents a missed opportunity of detecting the glitch. And the glitch is hard enough to detect. > > - hang = igt_hang_ring(fd, I915_EXEC_RENDER); > > + for (int i = 0; i < ARRAY_SIZE(gtt); i++) { > > + uint32_t handle; > > > > - do { > > - for (i = 0; i < OBJECT_SIZE / 64; i++) { > > - int x = 16*i + (i%16); > > + handle = gem_create(fd, OBJECT_SIZE); > > + gem_set_tiling(fd, handle, I915_TILING_X + i, 2048); > > You could have setup a priori. But this prolly is faster than > one reset cycle of tests so nothing to gain. I was thinking per cpu so we use more fences. But probably not, it depends on the memory access occurring between the reset and the revoke, so number of fences less important, just the sheer volume of traffic to hit the small timing window. -Chris
diff --git a/tests/i915/gem_mmap_gtt.c b/tests/i915/gem_mmap_gtt.c index f63535556..21880d31d 100644 --- a/tests/i915/gem_mmap_gtt.c +++ b/tests/i915/gem_mmap_gtt.c @@ -38,6 +38,7 @@ #include "drm.h" #include "igt.h" +#include "igt_sysfs.h" #include "igt_x86.h" #ifndef PAGE_SIZE @@ -375,50 +376,86 @@ test_clflush(int fd) static void test_hang(int fd) { - igt_hang_t hang; - uint32_t patterns[] = { + const uint32_t patterns[] = { 0, 0xaaaaaaaa, 0x55555555, 0xcccccccc, }; - uint32_t *gtt[3]; - int last_pattern = 0; - int next_pattern = 1; - int i; + const int ncpus = sysconf(_SC_NPROCESSORS_ONLN); + struct { + bool done; + bool error; + } *control; + unsigned long count; + igt_hang_t hang; + int dir; - for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) { - uint32_t handle; + hang = igt_allow_hang(fd, 0, 0); + igt_sysfs_set_parameter(fd, "reset", "1"); /* global resets */ - handle = gem_create(fd, OBJECT_SIZE); - gem_set_tiling(fd, handle, i, 2048); + control = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + igt_assert(control != MAP_FAILED); - gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE); - set_domain_gtt(fd, handle); - gem_close(fd, handle); - } + igt_fork(child, ncpus) { + int last_pattern = 0; + int next_pattern = 1; + uint32_t *gtt[2]; - hang = igt_hang_ring(fd, I915_EXEC_RENDER); + for (int i = 0; i < ARRAY_SIZE(gtt); i++) { + uint32_t handle; - do { - for (i = 0; i < OBJECT_SIZE / 64; i++) { - int x = 16*i + (i%16); + handle = gem_create(fd, OBJECT_SIZE); + gem_set_tiling(fd, handle, I915_TILING_X + i, 2048); - igt_assert(gtt[0][x] == patterns[last_pattern]); - igt_assert(gtt[1][x] == patterns[last_pattern]); - igt_assert(gtt[2][x] == patterns[last_pattern]); + gtt[i] = gem_mmap__gtt(fd, handle, OBJECT_SIZE, PROT_WRITE); + set_domain_gtt(fd, handle); + gem_close(fd, handle); + } - gtt[0][x] = patterns[next_pattern]; - gtt[1][x] = patterns[next_pattern]; - gtt[2][x] = patterns[next_pattern]; + while (!READ_ONCE(control->done)) { + for (int i = 0; i < OBJECT_SIZE / 64; i++) { + uint32_t expected = patterns[last_pattern]; + uint32_t found[2]; + int x = 16*i + (i%16); + + found[0] = READ_ONCE(gtt[0][x]); + found[1] = READ_ONCE(gtt[1][x]); + + if (found[0] != expected || + found[1] != expected) { + igt_warn("child[%d] found (%x, %x), expecting %x\n", + child, + found[0], found[1], + expected); + control->error = true; + exit(0); + } + + gtt[0][x] = patterns[next_pattern]; + gtt[1][x] = patterns[next_pattern]; + } + + last_pattern = next_pattern; + next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns); } + } - last_pattern = next_pattern; - next_pattern = (next_pattern + 1) % ARRAY_SIZE(patterns); - } while (gem_bo_busy(fd, hang.spin->handle)); + count = 0; + dir = igt_debugfs_dir(fd); + igt_until_timeout(5) { + igt_sysfs_set(dir, "i915_wedged", "-1"); + if (READ_ONCE(control->error)) + break; + count++; + } + close(dir); + igt_info("%lu resets\n", count); + + control->done = true; + igt_waitchildren(); - igt_post_hang_ring(fd, hang); + igt_assert(!control->error); + munmap(control, 4096); - munmap(gtt[0], OBJECT_SIZE); - munmap(gtt[1], OBJECT_SIZE); - munmap(gtt[2], OBJECT_SIZE); + igt_disallow_hang(fd, hang); } static int min_tile_width(uint32_t devid, int tiling)
Performing a GPU reset clobbers the fence registers, affecting which addresses the tiled GTT mmap access. If the driver does not take precautions across a GPU reset, a client may read the wrong values (but only within their own buffer as the fence will only be degraded to I915_TILING_NONE, reducing the access area). However, as this requires performing a read using the indirect GTT at exactly the same time as the reset occurs, it can be quite difficult to catch, so repeat the test many times and across all cores simultaneously. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- tests/i915/gem_mmap_gtt.c | 99 +++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 31 deletions(-)