Message ID | 20211213232914.2523139-8-John.C.Harrison@Intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Fixes for i915_hangman and gem_exec_capture | expand |
On Mon, Dec 13, 2021 at 03:29:10PM -0800, John.C.Harrison@Intel.com wrote: > From: John Harrison <John.C.Harrison@Intel.com> > > Added a an extra step to the i915_hangman tests to check that the > system is still alive after the hang and recovery. This submits a > simple batch to each engine which does a write to memory and checks > that the write occurred. > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > --- > tests/i915/i915_hangman.c | 115 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 115 insertions(+) > > diff --git a/tests/i915/i915_hangman.c b/tests/i915/i915_hangman.c > index b77705206..20653b479 100644 > --- a/tests/i915/i915_hangman.c > +++ b/tests/i915/i915_hangman.c > @@ -47,8 +47,113 @@ > static int device = -1; > static int sysfs = -1; > > +#define OFFSET_ALIVE 10 > + > IGT_TEST_DESCRIPTION("Tests for hang detection and recovery"); > > +/* Requires master for STORE_DWORD on gen4/5 */ > +static void store(int fd, const struct intel_execution_engine2 *e, > + int fence, uint32_t target, unsigned offset_value) > +{ > + const int SCRATCH = 0; > + const int BATCH = 1; > + const int gen = intel_gen(intel_get_drm_devid(fd)); > + struct drm_i915_gem_exec_object2 obj[2]; > + struct drm_i915_gem_relocation_entry reloc; > + struct drm_i915_gem_execbuffer2 execbuf; > + uint32_t batch[16]; > + int i; > + > + memset(&execbuf, 0, sizeof(execbuf)); > + execbuf.buffers_ptr = to_user_pointer(obj); > + execbuf.buffer_count = ARRAY_SIZE(obj); > + execbuf.flags = e->flags; > + if (fence != -1) { > + execbuf.flags |= I915_EXEC_FENCE_IN; > + execbuf.rsvd2 = fence; > + } > + if (gen < 6) > + execbuf.flags |= I915_EXEC_SECURE; > + > + memset(obj, 0, sizeof(obj)); > + obj[SCRATCH].handle = target; > + > + obj[BATCH].handle = gem_create(fd, 4096); > + obj[BATCH].relocs_ptr = to_user_pointer(&reloc); > + obj[BATCH].relocation_count = 1; > + memset(&reloc, 0, sizeof(reloc)); > + > + i = 0; > + reloc.target_handle = obj[SCRATCH].handle; > + reloc.presumed_offset = -1; > + reloc.offset = sizeof(uint32_t) * (i + 1); > + reloc.delta = sizeof(uint32_t) * offset_value; > + reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION; > + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; > + batch[i] = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); > + if (gen >= 8) { > + batch[++i] = reloc.delta; > + batch[++i] = 0; > + } else if (gen >= 4) { > + batch[++i] = 0; > + batch[++i] = reloc.delta; > + reloc.offset += sizeof(uint32_t); > + } else { > + batch[i]--; > + batch[++i] = reloc.delta; > + } > + batch[++i] = offset_value; > + batch[++i] = MI_BATCH_BUFFER_END; > + gem_write(fd, obj[BATCH].handle, 0, batch, sizeof(batch)); > + gem_execbuf(fd, &execbuf); > + gem_close(fd, obj[BATCH].handle); > +} > + > +static void check_alive(void) > +{ > + const struct intel_execution_engine2 *engine; > + const intel_ctx_t *ctx; > + uint32_t scratch, *out; > + int fd, i = 0; > + uint64_t ahnd; > + > + fd = drm_open_driver(DRIVER_INTEL); > + igt_require(gem_class_can_store_dword(fd, 0)); > + > + ctx = intel_ctx_create_all_physical(fd); > + ahnd = get_reloc_ahnd(fd, ctx->id); > + scratch = gem_create(fd, 4096); > + out = gem_mmap__wc(fd, scratch, 0, 4096, PROT_WRITE); > + gem_set_domain(fd, scratch, > + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); > + > + for_each_physical_engine(fd, engine) { > + igt_assert_eq_u32(out[i + OFFSET_ALIVE], 0); > + i++; > + } > + > + i = 0; > + for_each_ctx_engine(fd, ctx, engine) { > + if (!gem_class_can_store_dword(fd, engine->class)) > + continue; > + > + /* +OFFSET_ALIVE to ensure engine zero doesn't get a false negative */ > + store(fd, engine, -1, scratch, i + OFFSET_ALIVE); You need to pass ctx + ahnd to store() to add softpin path. Relocs won't work above Tigerlake. -- Zbigniew > + i++; > + } > + > + gem_set_domain(fd, scratch, I915_GEM_DOMAIN_GTT, 0); > + > + while (i--) > + igt_assert_eq_u32(out[i + OFFSET_ALIVE], i + OFFSET_ALIVE); > + > + munmap(out, 4096); > + gem_close(fd, scratch); > + put_ahnd(ahnd); > + intel_ctx_destroy(fd, ctx); > + close(fd); > +} > + > static bool has_error_state(int dir) > { > bool result; > @@ -230,6 +335,8 @@ static void test_error_state_capture(const intel_ctx_t *ctx, > check_error_state(e->name, offset, batch); > munmap(batch, 4096); > put_ahnd(ahnd); > + > + check_alive(); > } > > static void > @@ -288,6 +395,8 @@ test_engine_hang(const intel_ctx_t *ctx, > put_ahnd(ahndN); > } > put_ahnd(ahnd); > + > + check_alive(); > } > > static int hang_count; > @@ -320,6 +429,8 @@ static void test_hang_detector(const intel_ctx_t *ctx, > > /* Did it work? */ > igt_assert(hang_count == 1); > + > + check_alive(); > } > > /* This test covers the case where we end up in an uninitialised area of the > @@ -355,6 +466,8 @@ static void hangcheck_unterminated(const intel_ctx_t *ctx) > igt_force_gpu_reset(device); > igt_assert_f(0, "unterminated batch did not trigger a hang!"); > } > + > + check_alive(); > } > > static void do_tests(const char *name, const char *prefix, > @@ -432,6 +545,8 @@ igt_main > igt_assert(sysfs != -1); > > igt_require(has_error_state(sysfs)); > + > + gem_require_mmap_wc(device); > } > > igt_describe("Basic error capture"); > -- > 2.25.1 >
On 12/15/2021 23:23, Zbigniew Kempczyński wrote: > On Mon, Dec 13, 2021 at 03:29:10PM -0800, John.C.Harrison@Intel.com wrote: >> From: John Harrison <John.C.Harrison@Intel.com> >> >> Added a an extra step to the i915_hangman tests to check that the >> system is still alive after the hang and recovery. This submits a >> simple batch to each engine which does a write to memory and checks >> that the write occurred. >> >> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> >> --- >> tests/i915/i915_hangman.c | 115 ++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 115 insertions(+) >> >> diff --git a/tests/i915/i915_hangman.c b/tests/i915/i915_hangman.c >> index b77705206..20653b479 100644 >> --- a/tests/i915/i915_hangman.c >> +++ b/tests/i915/i915_hangman.c >> @@ -47,8 +47,113 @@ >> static int device = -1; >> static int sysfs = -1; >> >> +#define OFFSET_ALIVE 10 >> + >> IGT_TEST_DESCRIPTION("Tests for hang detection and recovery"); >> >> +/* Requires master for STORE_DWORD on gen4/5 */ >> +static void store(int fd, const struct intel_execution_engine2 *e, >> + int fence, uint32_t target, unsigned offset_value) >> +{ >> + const int SCRATCH = 0; >> + const int BATCH = 1; >> + const int gen = intel_gen(intel_get_drm_devid(fd)); >> + struct drm_i915_gem_exec_object2 obj[2]; >> + struct drm_i915_gem_relocation_entry reloc; >> + struct drm_i915_gem_execbuffer2 execbuf; >> + uint32_t batch[16]; >> + int i; >> + >> + memset(&execbuf, 0, sizeof(execbuf)); >> + execbuf.buffers_ptr = to_user_pointer(obj); >> + execbuf.buffer_count = ARRAY_SIZE(obj); >> + execbuf.flags = e->flags; >> + if (fence != -1) { >> + execbuf.flags |= I915_EXEC_FENCE_IN; >> + execbuf.rsvd2 = fence; >> + } >> + if (gen < 6) >> + execbuf.flags |= I915_EXEC_SECURE; >> + >> + memset(obj, 0, sizeof(obj)); >> + obj[SCRATCH].handle = target; >> + >> + obj[BATCH].handle = gem_create(fd, 4096); >> + obj[BATCH].relocs_ptr = to_user_pointer(&reloc); >> + obj[BATCH].relocation_count = 1; >> + memset(&reloc, 0, sizeof(reloc)); >> + >> + i = 0; >> + reloc.target_handle = obj[SCRATCH].handle; >> + reloc.presumed_offset = -1; >> + reloc.offset = sizeof(uint32_t) * (i + 1); >> + reloc.delta = sizeof(uint32_t) * offset_value; >> + reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION; >> + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; >> + batch[i] = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); >> + if (gen >= 8) { >> + batch[++i] = reloc.delta; >> + batch[++i] = 0; >> + } else if (gen >= 4) { >> + batch[++i] = 0; >> + batch[++i] = reloc.delta; >> + reloc.offset += sizeof(uint32_t); >> + } else { >> + batch[i]--; >> + batch[++i] = reloc.delta; >> + } >> + batch[++i] = offset_value; >> + batch[++i] = MI_BATCH_BUFFER_END; >> + gem_write(fd, obj[BATCH].handle, 0, batch, sizeof(batch)); >> + gem_execbuf(fd, &execbuf); >> + gem_close(fd, obj[BATCH].handle); >> +} >> + >> +static void check_alive(void) >> +{ >> + const struct intel_execution_engine2 *engine; >> + const intel_ctx_t *ctx; >> + uint32_t scratch, *out; >> + int fd, i = 0; >> + uint64_t ahnd; >> + >> + fd = drm_open_driver(DRIVER_INTEL); >> + igt_require(gem_class_can_store_dword(fd, 0)); >> + >> + ctx = intel_ctx_create_all_physical(fd); >> + ahnd = get_reloc_ahnd(fd, ctx->id); >> + scratch = gem_create(fd, 4096); >> + out = gem_mmap__wc(fd, scratch, 0, 4096, PROT_WRITE); >> + gem_set_domain(fd, scratch, >> + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); >> + >> + for_each_physical_engine(fd, engine) { >> + igt_assert_eq_u32(out[i + OFFSET_ALIVE], 0); >> + i++; >> + } >> + >> + i = 0; >> + for_each_ctx_engine(fd, ctx, engine) { >> + if (!gem_class_can_store_dword(fd, engine->class)) >> + continue; >> + >> + /* +OFFSET_ALIVE to ensure engine zero doesn't get a false negative */ >> + store(fd, engine, -1, scratch, i + OFFSET_ALIVE); > You need to pass ctx + ahnd to store() to add softpin path. Relocs > won't work above Tigerlake. There didn't seem much point in reworking the code in this file when it is going to be removed in the next patch. I might just re-order the two for the next rev. Create the helper first and then never actually add the almost-duplicate here at all. John. > > -- > Zbigniew > > >> + i++; >> + } >> + >> + gem_set_domain(fd, scratch, I915_GEM_DOMAIN_GTT, 0); >> + >> + while (i--) >> + igt_assert_eq_u32(out[i + OFFSET_ALIVE], i + OFFSET_ALIVE); >> + >> + munmap(out, 4096); >> + gem_close(fd, scratch); >> + put_ahnd(ahnd); >> + intel_ctx_destroy(fd, ctx); >> + close(fd); >> +} >> + >> static bool has_error_state(int dir) >> { >> bool result; >> @@ -230,6 +335,8 @@ static void test_error_state_capture(const intel_ctx_t *ctx, >> check_error_state(e->name, offset, batch); >> munmap(batch, 4096); >> put_ahnd(ahnd); >> + >> + check_alive(); >> } >> >> static void >> @@ -288,6 +395,8 @@ test_engine_hang(const intel_ctx_t *ctx, >> put_ahnd(ahndN); >> } >> put_ahnd(ahnd); >> + >> + check_alive(); >> } >> >> static int hang_count; >> @@ -320,6 +429,8 @@ static void test_hang_detector(const intel_ctx_t *ctx, >> >> /* Did it work? */ >> igt_assert(hang_count == 1); >> + >> + check_alive(); >> } >> >> /* This test covers the case where we end up in an uninitialised area of the >> @@ -355,6 +466,8 @@ static void hangcheck_unterminated(const intel_ctx_t *ctx) >> igt_force_gpu_reset(device); >> igt_assert_f(0, "unterminated batch did not trigger a hang!"); >> } >> + >> + check_alive(); >> } >> >> static void do_tests(const char *name, const char *prefix, >> @@ -432,6 +545,8 @@ igt_main >> igt_assert(sysfs != -1); >> >> igt_require(has_error_state(sysfs)); >> + >> + gem_require_mmap_wc(device); >> } >> >> igt_describe("Basic error capture"); >> -- >> 2.25.1 >>
diff --git a/tests/i915/i915_hangman.c b/tests/i915/i915_hangman.c index b77705206..20653b479 100644 --- a/tests/i915/i915_hangman.c +++ b/tests/i915/i915_hangman.c @@ -47,8 +47,113 @@ static int device = -1; static int sysfs = -1; +#define OFFSET_ALIVE 10 + IGT_TEST_DESCRIPTION("Tests for hang detection and recovery"); +/* Requires master for STORE_DWORD on gen4/5 */ +static void store(int fd, const struct intel_execution_engine2 *e, + int fence, uint32_t target, unsigned offset_value) +{ + const int SCRATCH = 0; + const int BATCH = 1; + const int gen = intel_gen(intel_get_drm_devid(fd)); + struct drm_i915_gem_exec_object2 obj[2]; + struct drm_i915_gem_relocation_entry reloc; + struct drm_i915_gem_execbuffer2 execbuf; + uint32_t batch[16]; + int i; + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffers_ptr = to_user_pointer(obj); + execbuf.buffer_count = ARRAY_SIZE(obj); + execbuf.flags = e->flags; + if (fence != -1) { + execbuf.flags |= I915_EXEC_FENCE_IN; + execbuf.rsvd2 = fence; + } + if (gen < 6) + execbuf.flags |= I915_EXEC_SECURE; + + memset(obj, 0, sizeof(obj)); + obj[SCRATCH].handle = target; + + obj[BATCH].handle = gem_create(fd, 4096); + obj[BATCH].relocs_ptr = to_user_pointer(&reloc); + obj[BATCH].relocation_count = 1; + memset(&reloc, 0, sizeof(reloc)); + + i = 0; + reloc.target_handle = obj[SCRATCH].handle; + reloc.presumed_offset = -1; + reloc.offset = sizeof(uint32_t) * (i + 1); + reloc.delta = sizeof(uint32_t) * offset_value; + reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION; + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; + batch[i] = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); + if (gen >= 8) { + batch[++i] = reloc.delta; + batch[++i] = 0; + } else if (gen >= 4) { + batch[++i] = 0; + batch[++i] = reloc.delta; + reloc.offset += sizeof(uint32_t); + } else { + batch[i]--; + batch[++i] = reloc.delta; + } + batch[++i] = offset_value; + batch[++i] = MI_BATCH_BUFFER_END; + gem_write(fd, obj[BATCH].handle, 0, batch, sizeof(batch)); + gem_execbuf(fd, &execbuf); + gem_close(fd, obj[BATCH].handle); +} + +static void check_alive(void) +{ + const struct intel_execution_engine2 *engine; + const intel_ctx_t *ctx; + uint32_t scratch, *out; + int fd, i = 0; + uint64_t ahnd; + + fd = drm_open_driver(DRIVER_INTEL); + igt_require(gem_class_can_store_dword(fd, 0)); + + ctx = intel_ctx_create_all_physical(fd); + ahnd = get_reloc_ahnd(fd, ctx->id); + scratch = gem_create(fd, 4096); + out = gem_mmap__wc(fd, scratch, 0, 4096, PROT_WRITE); + gem_set_domain(fd, scratch, + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); + + for_each_physical_engine(fd, engine) { + igt_assert_eq_u32(out[i + OFFSET_ALIVE], 0); + i++; + } + + i = 0; + for_each_ctx_engine(fd, ctx, engine) { + if (!gem_class_can_store_dword(fd, engine->class)) + continue; + + /* +OFFSET_ALIVE to ensure engine zero doesn't get a false negative */ + store(fd, engine, -1, scratch, i + OFFSET_ALIVE); + i++; + } + + gem_set_domain(fd, scratch, I915_GEM_DOMAIN_GTT, 0); + + while (i--) + igt_assert_eq_u32(out[i + OFFSET_ALIVE], i + OFFSET_ALIVE); + + munmap(out, 4096); + gem_close(fd, scratch); + put_ahnd(ahnd); + intel_ctx_destroy(fd, ctx); + close(fd); +} + static bool has_error_state(int dir) { bool result; @@ -230,6 +335,8 @@ static void test_error_state_capture(const intel_ctx_t *ctx, check_error_state(e->name, offset, batch); munmap(batch, 4096); put_ahnd(ahnd); + + check_alive(); } static void @@ -288,6 +395,8 @@ test_engine_hang(const intel_ctx_t *ctx, put_ahnd(ahndN); } put_ahnd(ahnd); + + check_alive(); } static int hang_count; @@ -320,6 +429,8 @@ static void test_hang_detector(const intel_ctx_t *ctx, /* Did it work? */ igt_assert(hang_count == 1); + + check_alive(); } /* This test covers the case where we end up in an uninitialised area of the @@ -355,6 +466,8 @@ static void hangcheck_unterminated(const intel_ctx_t *ctx) igt_force_gpu_reset(device); igt_assert_f(0, "unterminated batch did not trigger a hang!"); } + + check_alive(); } static void do_tests(const char *name, const char *prefix, @@ -432,6 +545,8 @@ igt_main igt_assert(sysfs != -1); igt_require(has_error_state(sysfs)); + + gem_require_mmap_wc(device); } igt_describe("Basic error capture");