Message ID | 20180403113644.6855-1-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Quoting Tvrtko Ursulin (2018-04-03 12:36:44) > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Reset and unwedge stress testing is supposed to trigger wedging or resets > at incovenient times and then re-use the context so either the context or > driver tracking might get confused and break. > > v2: > * Renamed for more sensible naming. > * Added some comments to explain what the test is doing. (Chris Wilson) You bring shame unto my tests with such beauty. > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chris
On 03/04/18 04:36, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > Reset and unwedge stress testing is supposed to trigger wedging or resets > at incovenient times and then re-use the context so either the context or > driver tracking might get confused and break. > > v2: > * Renamed for more sensible naming. > * Added some comments to explain what the test is doing. (Chris Wilson) > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > tests/gem_eio.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 74 insertions(+) > > diff --git a/tests/gem_eio.c b/tests/gem_eio.c > index b7c5047f0816..9599e73db736 100644 > --- a/tests/gem_eio.c > +++ b/tests/gem_eio.c > @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, unsigned int wait) > close(fd); > } > > +/* > + * Verify that we can submit and execute work after unwedging the GPU. > + */ > +static void test_reset_stress(int fd, unsigned int flags) > +{ > + uint32_t ctx0 = gem_context_create(fd); > + > + igt_until_timeout(5) { > + struct drm_i915_gem_execbuffer2 execbuf = { }; > + struct drm_i915_gem_exec_object2 obj = { }; > + uint32_t bbe = MI_BATCH_BUFFER_END; > + igt_spin_t *hang; > + unsigned int i; > + uint32_t ctx; > + > + gem_quiescent_gpu(fd); > + > + igt_require(i915_reset_control(flags & TEST_WEDGE ? > + false : true)); > + > + ctx = context_create_safe(fd); > + > + /* > + * Start executing a spin batch with some queued batches > + * against a different context after it. > + */ Aren't all batches queued on ctx0? Or is this a reference to the check on ctx you have later in the test. Thanks, Antonio > + hang = spin_sync(fd, ctx0, 0); > + > + obj.handle = gem_create(fd, 4096); > + gem_write(fd, obj.handle, 0, &bbe, sizeof(bbe)); > + > + execbuf.buffers_ptr = to_user_pointer(&obj); > + execbuf.buffer_count = 1; > + execbuf.rsvd1 = ctx0; > + > + for (i = 0; i < 10; i++) > + gem_execbuf(fd, &execbuf); > + > + /* Wedge after a small delay. */ > + igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0); > + > + /* Unwedge by forcing a reset. */ > + igt_assert(i915_reset_control(true)); > + trigger_reset(fd); > + > + gem_quiescent_gpu(fd); > + > + /* > + * Verify that we are able to submit work after unwedging from > + * both contexts. > + */ > + execbuf.rsvd1 = ctx; > + for (i = 0; i < 5; i++) > + gem_execbuf(fd, &execbuf); > + > + execbuf.rsvd1 = ctx0; > + for (i = 0; i < 5; i++) > + gem_execbuf(fd, &execbuf); > + > + gem_sync(fd, obj.handle); > + igt_spin_batch_free(fd, hang); > + gem_context_destroy(fd, ctx); > + gem_close(fd, obj.handle); > + } > + > + gem_context_destroy(fd, ctx0); > +} > + > static int fd = -1; > > static void > @@ -635,6 +703,12 @@ igt_main > igt_subtest("in-flight-suspend") > test_inflight_suspend(fd); > > + igt_subtest("reset-stress") > + test_reset_stress(fd, 0); > + > + igt_subtest("unwedge-stress") > + test_reset_stress(fd, TEST_WEDGE); > + > igt_subtest_group { > const struct { > unsigned int wait; >
On 03/04/18 11:24, Antonio Argenziano wrote: > > > On 03/04/18 04:36, Tvrtko Ursulin wrote: >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> Reset and unwedge stress testing is supposed to trigger wedging or resets >> at incovenient times and then re-use the context so either the context or >> driver tracking might get confused and break. >> >> v2: >> * Renamed for more sensible naming. >> * Added some comments to explain what the test is doing. (Chris Wilson) >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> --- >> tests/gem_eio.c | 74 >> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 74 insertions(+) >> >> diff --git a/tests/gem_eio.c b/tests/gem_eio.c >> index b7c5047f0816..9599e73db736 100644 >> --- a/tests/gem_eio.c >> +++ b/tests/gem_eio.c >> @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, >> unsigned int wait) >> close(fd); >> } >> +/* >> + * Verify that we can submit and execute work after unwedging the GPU. >> + */ >> +static void test_reset_stress(int fd, unsigned int flags) >> +{ >> + uint32_t ctx0 = gem_context_create(fd); >> + >> + igt_until_timeout(5) { >> + struct drm_i915_gem_execbuffer2 execbuf = { }; >> + struct drm_i915_gem_exec_object2 obj = { }; >> + uint32_t bbe = MI_BATCH_BUFFER_END; >> + igt_spin_t *hang; >> + unsigned int i; >> + uint32_t ctx; >> + >> + gem_quiescent_gpu(fd); >> + >> + igt_require(i915_reset_control(flags & TEST_WEDGE ? >> + false : true)); >> + >> + ctx = context_create_safe(fd); >> + >> + /* >> + * Start executing a spin batch with some queued batches >> + * against a different context after it. >> + */ > > Aren't all batches queued on ctx0? Or is this a reference to the check > on ctx you have later in the test. > > Thanks, > Antonio > >> + hang = spin_sync(fd, ctx0, 0); I think you meant to send this^ on ctx. Antonio. >> + >> + obj.handle = gem_create(fd, 4096); >> + gem_write(fd, obj.handle, 0, &bbe, sizeof(bbe)); >> + >> + execbuf.buffers_ptr = to_user_pointer(&obj); >> + execbuf.buffer_count = 1; >> + execbuf.rsvd1 = ctx0; >> + >> + for (i = 0; i < 10; i++) >> + gem_execbuf(fd, &execbuf); >> + >> + /* Wedge after a small delay. */ >> + igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0); >> + >> + /* Unwedge by forcing a reset. */ >> + igt_assert(i915_reset_control(true)); >> + trigger_reset(fd); >> + >> + gem_quiescent_gpu(fd); >> + >> + /* >> + * Verify that we are able to submit work after unwedging from >> + * both contexts. >> + */ >> + execbuf.rsvd1 = ctx; >> + for (i = 0; i < 5; i++) >> + gem_execbuf(fd, &execbuf); >> + >> + execbuf.rsvd1 = ctx0; >> + for (i = 0; i < 5; i++) >> + gem_execbuf(fd, &execbuf); >> + >> + gem_sync(fd, obj.handle); >> + igt_spin_batch_free(fd, hang); >> + gem_context_destroy(fd, ctx); >> + gem_close(fd, obj.handle); >> + } >> + >> + gem_context_destroy(fd, ctx0); >> +} >> + >> static int fd = -1; >> static void >> @@ -635,6 +703,12 @@ igt_main >> igt_subtest("in-flight-suspend") >> test_inflight_suspend(fd); >> + igt_subtest("reset-stress") >> + test_reset_stress(fd, 0); >> + >> + igt_subtest("unwedge-stress") >> + test_reset_stress(fd, TEST_WEDGE); >> + >> igt_subtest_group { >> const struct { >> unsigned int wait; >> > _______________________________________________ > igt-dev mailing list > igt-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/igt-dev
On 03/04/2018 19:34, Antonio Argenziano wrote: > > > On 03/04/18 11:24, Antonio Argenziano wrote: >> >> >> On 03/04/18 04:36, Tvrtko Ursulin wrote: >>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> >>> Reset and unwedge stress testing is supposed to trigger wedging or >>> resets >>> at incovenient times and then re-use the context so either the >>> context or >>> driver tracking might get confused and break. >>> >>> v2: >>> * Renamed for more sensible naming. >>> * Added some comments to explain what the test is doing. (Chris >>> Wilson) >>> >>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> --- >>> tests/gem_eio.c | 74 >>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 74 insertions(+) >>> >>> diff --git a/tests/gem_eio.c b/tests/gem_eio.c >>> index b7c5047f0816..9599e73db736 100644 >>> --- a/tests/gem_eio.c >>> +++ b/tests/gem_eio.c >>> @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, >>> unsigned int wait) >>> close(fd); >>> } >>> +/* >>> + * Verify that we can submit and execute work after unwedging the GPU. >>> + */ >>> +static void test_reset_stress(int fd, unsigned int flags) >>> +{ >>> + uint32_t ctx0 = gem_context_create(fd); >>> + >>> + igt_until_timeout(5) { >>> + struct drm_i915_gem_execbuffer2 execbuf = { }; >>> + struct drm_i915_gem_exec_object2 obj = { }; >>> + uint32_t bbe = MI_BATCH_BUFFER_END; >>> + igt_spin_t *hang; >>> + unsigned int i; >>> + uint32_t ctx; >>> + >>> + gem_quiescent_gpu(fd); >>> + >>> + igt_require(i915_reset_control(flags & TEST_WEDGE ? >>> + false : true)); >>> + >>> + ctx = context_create_safe(fd); >>> + >>> + /* >>> + * Start executing a spin batch with some queued batches >>> + * against a different context after it. >>> + */ >> >> Aren't all batches queued on ctx0? Or is this a reference to the check >> on ctx you have later in the test. Yes, a mistake in comment text. >>> + hang = spin_sync(fd, ctx0, 0); > > I think you meant to send this^ on ctx. Why do you think so? Did you find a different or better way to trigger the bug this test is trying to hit? Regards, Tvrtko > Antonio. > >>> + >>> + obj.handle = gem_create(fd, 4096); >>> + gem_write(fd, obj.handle, 0, &bbe, sizeof(bbe)); >>> + >>> + execbuf.buffers_ptr = to_user_pointer(&obj); >>> + execbuf.buffer_count = 1; >>> + execbuf.rsvd1 = ctx0; >>> + >>> + for (i = 0; i < 10; i++) >>> + gem_execbuf(fd, &execbuf); >>> + >>> + /* Wedge after a small delay. */ >>> + igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0); >>> + >>> + /* Unwedge by forcing a reset. */ >>> + igt_assert(i915_reset_control(true)); >>> + trigger_reset(fd); >>> + >>> + gem_quiescent_gpu(fd); >>> + >>> + /* >>> + * Verify that we are able to submit work after unwedging from >>> + * both contexts. >>> + */ >>> + execbuf.rsvd1 = ctx; >>> + for (i = 0; i < 5; i++) >>> + gem_execbuf(fd, &execbuf); >>> + >>> + execbuf.rsvd1 = ctx0; >>> + for (i = 0; i < 5; i++) >>> + gem_execbuf(fd, &execbuf); >>> + >>> + gem_sync(fd, obj.handle); >>> + igt_spin_batch_free(fd, hang); >>> + gem_context_destroy(fd, ctx); >>> + gem_close(fd, obj.handle); >>> + } >>> + >>> + gem_context_destroy(fd, ctx0); >>> +} >>> + >>> static int fd = -1; >>> static void >>> @@ -635,6 +703,12 @@ igt_main >>> igt_subtest("in-flight-suspend") >>> test_inflight_suspend(fd); >>> + igt_subtest("reset-stress") >>> + test_reset_stress(fd, 0); >>> + >>> + igt_subtest("unwedge-stress") >>> + test_reset_stress(fd, TEST_WEDGE); >>> + >>> igt_subtest_group { >>> const struct { >>> unsigned int wait; >>> >> _______________________________________________ >> igt-dev mailing list >> igt-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/igt-dev > _______________________________________________ > igt-dev mailing list > igt-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/igt-dev
Quoting Tvrtko Ursulin (2018-04-04 10:58:14) > > On 03/04/2018 19:34, Antonio Argenziano wrote: > > > > > > On 03/04/18 11:24, Antonio Argenziano wrote: > >> > >> > >> On 03/04/18 04:36, Tvrtko Ursulin wrote: > >>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>> > >>> Reset and unwedge stress testing is supposed to trigger wedging or > >>> resets > >>> at incovenient times and then re-use the context so either the > >>> context or > >>> driver tracking might get confused and break. > >>> > >>> v2: > >>> * Renamed for more sensible naming. > >>> * Added some comments to explain what the test is doing. (Chris > >>> Wilson) > >>> > >>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>> --- > >>> tests/gem_eio.c | 74 > >>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >>> 1 file changed, 74 insertions(+) > >>> > >>> diff --git a/tests/gem_eio.c b/tests/gem_eio.c > >>> index b7c5047f0816..9599e73db736 100644 > >>> --- a/tests/gem_eio.c > >>> +++ b/tests/gem_eio.c > >>> @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, > >>> unsigned int wait) > >>> close(fd); > >>> } > >>> +/* > >>> + * Verify that we can submit and execute work after unwedging the GPU. > >>> + */ > >>> +static void test_reset_stress(int fd, unsigned int flags) > >>> +{ > >>> + uint32_t ctx0 = gem_context_create(fd); > >>> + > >>> + igt_until_timeout(5) { > >>> + struct drm_i915_gem_execbuffer2 execbuf = { }; > >>> + struct drm_i915_gem_exec_object2 obj = { }; > >>> + uint32_t bbe = MI_BATCH_BUFFER_END; > >>> + igt_spin_t *hang; > >>> + unsigned int i; > >>> + uint32_t ctx; > >>> + > >>> + gem_quiescent_gpu(fd); > >>> + > >>> + igt_require(i915_reset_control(flags & TEST_WEDGE ? > >>> + false : true)); > >>> + > >>> + ctx = context_create_safe(fd); > >>> + > >>> + /* > >>> + * Start executing a spin batch with some queued batches > >>> + * against a different context after it. > >>> + */ > >> > >> Aren't all batches queued on ctx0? Or is this a reference to the check > >> on ctx you have later in the test. > > Yes, a mistake in comment text. > > >>> + hang = spin_sync(fd, ctx0, 0); > > > > I think you meant to send this^ on ctx. > > Why do you think so? Did you find a different or better way to trigger > the bug this test is trying to hit? You might need to explain that this test was trying to reproduce a kernel bug around unwedging you found earlier, and instead managed to find a similar one. ;) -Chris
On 04/04/18 02:58, Tvrtko Ursulin wrote: > > On 03/04/2018 19:34, Antonio Argenziano wrote: >> >> >> On 03/04/18 11:24, Antonio Argenziano wrote: >>> >>> >>> On 03/04/18 04:36, Tvrtko Ursulin wrote: >>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> >>>> Reset and unwedge stress testing is supposed to trigger wedging or >>>> resets >>>> at incovenient times and then re-use the context so either the >>>> context or >>>> driver tracking might get confused and break. >>>> >>>> v2: >>>> * Renamed for more sensible naming. >>>> * Added some comments to explain what the test is doing. (Chris >>>> Wilson) >>>> >>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> --- >>>> tests/gem_eio.c | 74 >>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>> 1 file changed, 74 insertions(+) >>>> >>>> diff --git a/tests/gem_eio.c b/tests/gem_eio.c >>>> index b7c5047f0816..9599e73db736 100644 >>>> --- a/tests/gem_eio.c >>>> +++ b/tests/gem_eio.c >>>> @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, >>>> unsigned int wait) >>>> close(fd); >>>> } >>>> +/* >>>> + * Verify that we can submit and execute work after unwedging the GPU. >>>> + */ >>>> +static void test_reset_stress(int fd, unsigned int flags) >>>> +{ >>>> + uint32_t ctx0 = gem_context_create(fd); >>>> + >>>> + igt_until_timeout(5) { >>>> + struct drm_i915_gem_execbuffer2 execbuf = { }; >>>> + struct drm_i915_gem_exec_object2 obj = { }; >>>> + uint32_t bbe = MI_BATCH_BUFFER_END; >>>> + igt_spin_t *hang; >>>> + unsigned int i; >>>> + uint32_t ctx; >>>> + >>>> + gem_quiescent_gpu(fd); >>>> + >>>> + igt_require(i915_reset_control(flags & TEST_WEDGE ? >>>> + false : true)); >>>> + >>>> + ctx = context_create_safe(fd); >>>> + >>>> + /* >>>> + * Start executing a spin batch with some queued batches >>>> + * against a different context after it. >>>> + */ >>> >>> Aren't all batches queued on ctx0? Or is this a reference to the >>> check on ctx you have later in the test. > > Yes, a mistake in comment text. > >>>> + hang = spin_sync(fd, ctx0, 0); >> >> I think you meant to send this^ on ctx. > > Why do you think so? Did you find a different or better way to trigger > the bug this test is trying to hit? Nope, I just misunderstood the code :). I thought you were creating ctx as 'safe' to be not 'bannable' because you were going to reuse the same context across multiple resets and didn't want it to be banned. BTW given that this is not the case wouldn't ctx0 be banned after so many resets? Apologies for the cryptic comment, Antonio. > > Regards, > > Tvrtko > >> Antonio. >> >>>> + >>>> + obj.handle = gem_create(fd, 4096); >>>> + gem_write(fd, obj.handle, 0, &bbe, sizeof(bbe)); >>>> + >>>> + execbuf.buffers_ptr = to_user_pointer(&obj); >>>> + execbuf.buffer_count = 1; >>>> + execbuf.rsvd1 = ctx0; >>>> + >>>> + for (i = 0; i < 10; i++) >>>> + gem_execbuf(fd, &execbuf); >>>> + >>>> + /* Wedge after a small delay. */ >>>> + igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0); >>>> + >>>> + /* Unwedge by forcing a reset. */ >>>> + igt_assert(i915_reset_control(true)); >>>> + trigger_reset(fd); >>>> + >>>> + gem_quiescent_gpu(fd); >>>> + >>>> + /* >>>> + * Verify that we are able to submit work after unwedging from >>>> + * both contexts. >>>> + */ >>>> + execbuf.rsvd1 = ctx; >>>> + for (i = 0; i < 5; i++) >>>> + gem_execbuf(fd, &execbuf); >>>> + >>>> + execbuf.rsvd1 = ctx0; >>>> + for (i = 0; i < 5; i++) >>>> + gem_execbuf(fd, &execbuf); >>>> + >>>> + gem_sync(fd, obj.handle); >>>> + igt_spin_batch_free(fd, hang); >>>> + gem_context_destroy(fd, ctx); >>>> + gem_close(fd, obj.handle); >>>> + } >>>> + >>>> + gem_context_destroy(fd, ctx0); >>>> +} >>>> + >>>> static int fd = -1; >>>> static void >>>> @@ -635,6 +703,12 @@ igt_main >>>> igt_subtest("in-flight-suspend") >>>> test_inflight_suspend(fd); >>>> + igt_subtest("reset-stress") >>>> + test_reset_stress(fd, 0); >>>> + >>>> + igt_subtest("unwedge-stress") >>>> + test_reset_stress(fd, TEST_WEDGE); >>>> + >>>> igt_subtest_group { >>>> const struct { >>>> unsigned int wait; >>>> >>> _______________________________________________ >>> igt-dev mailing list >>> igt-dev@lists.freedesktop.org >>> https://lists.freedesktop.org/mailman/listinfo/igt-dev >> _______________________________________________ >> igt-dev mailing list >> igt-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/igt-dev
diff --git a/tests/gem_eio.c b/tests/gem_eio.c index b7c5047f0816..9599e73db736 100644 --- a/tests/gem_eio.c +++ b/tests/gem_eio.c @@ -591,6 +591,74 @@ static void test_inflight_internal(int fd, unsigned int wait) close(fd); } +/* + * Verify that we can submit and execute work after unwedging the GPU. + */ +static void test_reset_stress(int fd, unsigned int flags) +{ + uint32_t ctx0 = gem_context_create(fd); + + igt_until_timeout(5) { + struct drm_i915_gem_execbuffer2 execbuf = { }; + struct drm_i915_gem_exec_object2 obj = { }; + uint32_t bbe = MI_BATCH_BUFFER_END; + igt_spin_t *hang; + unsigned int i; + uint32_t ctx; + + gem_quiescent_gpu(fd); + + igt_require(i915_reset_control(flags & TEST_WEDGE ? + false : true)); + + ctx = context_create_safe(fd); + + /* + * Start executing a spin batch with some queued batches + * against a different context after it. + */ + hang = spin_sync(fd, ctx0, 0); + + obj.handle = gem_create(fd, 4096); + gem_write(fd, obj.handle, 0, &bbe, sizeof(bbe)); + + execbuf.buffers_ptr = to_user_pointer(&obj); + execbuf.buffer_count = 1; + execbuf.rsvd1 = ctx0; + + for (i = 0; i < 10; i++) + gem_execbuf(fd, &execbuf); + + /* Wedge after a small delay. */ + igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0); + + /* Unwedge by forcing a reset. */ + igt_assert(i915_reset_control(true)); + trigger_reset(fd); + + gem_quiescent_gpu(fd); + + /* + * Verify that we are able to submit work after unwedging from + * both contexts. + */ + execbuf.rsvd1 = ctx; + for (i = 0; i < 5; i++) + gem_execbuf(fd, &execbuf); + + execbuf.rsvd1 = ctx0; + for (i = 0; i < 5; i++) + gem_execbuf(fd, &execbuf); + + gem_sync(fd, obj.handle); + igt_spin_batch_free(fd, hang); + gem_context_destroy(fd, ctx); + gem_close(fd, obj.handle); + } + + gem_context_destroy(fd, ctx0); +} + static int fd = -1; static void @@ -635,6 +703,12 @@ igt_main igt_subtest("in-flight-suspend") test_inflight_suspend(fd); + igt_subtest("reset-stress") + test_reset_stress(fd, 0); + + igt_subtest("unwedge-stress") + test_reset_stress(fd, TEST_WEDGE); + igt_subtest_group { const struct { unsigned int wait;