Message ID: 1436534819-11328-1-git-send-email-tim.gore@intel.com (mailing list archive)
State: New, archived
On Fri, Jul 10, 2015 at 02:26:59PM +0100, tim.gore@intel.com wrote:
> From: Tim Gore <tim.gore@intel.com>
>
> The tests for context banning fail when the gpu scheduler is enabled.
> The test causes a hang (using an infinite-loop batch) and then queues
> up some work behind it on both the hanging context and also on a
> second "good" context. On the "good" context it queues up 2 batch
> buffers. After the hanging ring has been reset (not a full gpu reset)
> the test checks the values of batch_active and batch_pending returned
> by the i915_get_reset_stats_ioctl. For the "good" context it expects
> to see batch_pending == 2, because two batch buffers were queued up
> behind the hang on this context. But with the scheduler enabled
> (android, gen8), one of these batch buffers is still waiting in the
> scheduler and has not made it as far as the ring->request_list, so
> this batch buffer is unaffected by the ring reset, and batch_pending
> is only 1.
>
> I considered putting in a test for the scheduler being enabled, but
> decided that a simpler solution is to only queue up 1 batch buffer on
> the good context. This does not change the test logic in any way and
> ensures that we should always have batch_pending == 1, with or
> without the scheduler.

For the scheduler/tdr we probably need to split this up into two
testcases each:
- one where the 2nd batch depends upon the first (cross-context
  dependency);
- one where the 2nd batch doesn't depend upon the first (it should
  execute unhampered with scheduler/tdr).

Since we don't yet have a scheduler/tdr, both of these will result in
the same outcome (since there's always the temporal dependency). But
with your patch you only test the 2nd case (and incompletely, since we
should also assert that the 2nd batch did run) and ignore the first
case.

In short, this failure is telling you that your test coverage for these
features is lacking. The correct fix is not to mangle the existing
test, but to fix it up so that it correctly covers the new expectations
in all cases. And that should be done as part of the tdr/scheduler
submission.
-Daniel
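To make the proposal above concrete, here is a rough sketch of what the two subtests could look like. It is only a sketch: it assumes the static helpers already present in tests/gem_reset_stats.c (exec_valid(), inject_hang_no_ban_error(), gem_reset_stats(), assert_reset_status(), and the local reset-stats struct), while exec_valid_with_dependency() is a hypothetical placeholder for whatever mechanism (for example, putting the hanging batch's buffer into the second execbuf's object list) ends up expressing a cross-context dependency. The asserted outcomes follow the discussion above and would need to be pinned down by the tdr/scheduler series itself.

```c
static void test_ban_dependent_batch(int fd, int ctx_bad, int ctx_good)
{
	int hang, dep;

	/* Hang the bad context with a non-banning infinite-loop batch. */
	hang = inject_hang_no_ban_error(fd, ctx_bad);
	igt_assert_lte(0, hang);

	/*
	 * Queue a batch on the good context that explicitly depends on the
	 * hanging batch.  exec_valid_with_dependency() is hypothetical.
	 */
	dep = exec_valid_with_dependency(fd, ctx_good, hang);
	igt_assert_lte(0, dep);

	gem_sync(fd, dep);

	/*
	 * Even with a scheduler this batch was stuck behind the hang, so it
	 * should be accounted against the good context; the exact
	 * expectation is what the tdr/scheduler series must define.
	 */
	assert_reset_status(fd, ctx_good, RS_BATCH_PENDING);

	gem_close(fd, hang);
	gem_close(fd, dep);
}

static void test_ban_independent_batch(int fd, int ctx_bad, int ctx_good)
{
	struct local_drm_i915_reset_stats rs;	/* struct name as assumed from the test */
	int hang, indep;

	hang = inject_hang_no_ban_error(fd, ctx_bad);
	igt_assert_lte(0, hang);

	/* No dependency on the hang: a scheduler should run this unhampered. */
	indep = exec_valid(fd, ctx_good);
	igt_assert_lte(0, indep);

	/*
	 * Per the review comment, also check that the batch actually ran:
	 * gem_sync() returns once the request has completed (or the hang
	 * has been handled), and the good context must never be guilty.
	 */
	gem_sync(fd, indep);
	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs), 0);
	igt_assert(rs.batch_active == 0);

	gem_close(fd, hang);
	gem_close(fd, indep);
}
```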
Tim Gore
Intel Corporation (UK) Ltd. - Co. Reg. #1134945 - Pipers Way, Swindon SN3 1RJ

> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter
> Sent: Monday, July 13, 2015 10:35 AM
> To: Gore, Tim
> Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas; Kuoppala, Mika
> Subject: Re: [Intel-gfx] [PATCH i-g-t] tests/gem_reset_stats.c: fix "ban" tests with scheduler
>
> For the scheduler/tdr we probably need to split this up into two testcases
> each:
> - one where the 2nd batch depends upon the first (cross-context dependency);
> - one where the 2nd batch doesn't depend upon the first (it should execute
>   unhampered with scheduler/tdr).
>
> Since we don't yet have a scheduler/tdr, both of these will result in the same
> outcome (since there's always the temporal dependency). But with your patch
> you only test the 2nd case (and incompletely, since we should also assert that
> the 2nd batch did run) and ignore the first case.
>
> In short, this failure is telling you that your test coverage for these
> features is lacking. The correct fix is not to mangle the existing test, but to
> fix it up so that it correctly covers the new expectations in all cases. And
> that should be done as part of the tdr/scheduler submission.
> -Daniel

Should gem_reset_stats be testing the operation of the scheduler? I would have
thought that the scheduler operation should have its own test(s).
gem_reset_stats is just about the reset mechanism and the stats collected. I
can add this test, it just seems like the wrong place.
Tim
On Mon, Jul 13, 2015 at 10:09:59AM +0000, Gore, Tim wrote:
> Should gem_reset_stats be testing the operation of the scheduler? I would have
> thought that the scheduler operation should have its own test(s).
> gem_reset_stats is just about the reset mechanism and the stats collected. I
> can add this test, it just seems like the wrong place.

Well, gem_reset_stats assumes that a hang will victimize all subsequent
batches, irrespective of context. tdr/scheduler change that rather
fundamentally (which is the entire point really, at least of tdr). So
yes, we need to adjust those existing testcases. And I think it's
clearer if we do that by changing the existing test-cases; that way the
impact of the new code on existing features is much clearer.
-Daniel
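For comparison, the scheduler-aware expectation the thread is arguing about boils down to a check along the lines of the sketch below. This is only an illustration of the two sets of numbers mentioned above: has_scheduler() is a hypothetical query (upstream i915 had no scheduler at the time), the struct name is whatever the test already declares, and the direction the review actually recommends is to encode the two regimes as separate subtests rather than branch like this.

```c
static void check_good_context_counts(int fd, int ctx, bool scheduler_enabled)
{
	struct local_drm_i915_reset_stats rs;	/* type name as assumed from the test */

	igt_assert_eq(gem_reset_stats(fd, ctx, &rs), 0);
	igt_assert(rs.batch_active == 0);	/* the good context never hung */

	if (scheduler_enabled) {
		/* Only the batch that had already reached the ring behind the
		 * hang is counted; the second one was still held back in the
		 * scheduler and never victimised by the per-ring reset. */
		igt_assert(rs.batch_pending == 1);
	} else {
		/* Without a scheduler both queued batches sit on the ring
		 * behind the hang and are victimised. */
		igt_assert(rs.batch_pending == 2);
	}
}
```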
diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
index 2bb4291..6529463 100644
--- a/tests/gem_reset_stats.c
+++ b/tests/gem_reset_stats.c
@@ -468,7 +468,7 @@ static void test_rs_ctx(int num_fds, int num_ctx, int hang_index,
 
 static void test_ban(void)
 {
-	int h1,h2,h3,h4,h5,h6,h7;
+	int h1,h2,h3,h4,h5,h6;
 	int fd_bad, fd_good;
 	int retry = 10;
 	int active_count = 0, pending_count = 0;
@@ -496,7 +496,6 @@ static void test_ban(void)
 	pending_count++;
 
 	h6 = exec_valid(fd_good, 0);
-	h7 = exec_valid(fd_good, 0);
 
 	while (retry--) {
 		h3 = inject_hang_no_ban_error(fd_bad, 0);
@@ -525,7 +524,7 @@ static void test_ban(void)
 	igt_assert_eq(h4, -EIO);
 	assert_reset_status(fd_bad, 0, RS_BATCH_ACTIVE);
 
-	gem_sync(fd_good, h7);
+	gem_sync(fd_good, h6);
 	assert_reset_status(fd_good, 0, RS_BATCH_PENDING);
 
 	igt_assert_eq(gem_reset_stats(fd_good, 0, &rs_good), 0);
@@ -534,12 +533,11 @@ static void test_ban(void)
 	igt_assert(rs_bad.batch_active == active_count);
 	igt_assert(rs_bad.batch_pending == pending_count);
 	igt_assert(rs_good.batch_active == 0);
-	igt_assert(rs_good.batch_pending == 2);
+	igt_assert(rs_good.batch_pending == 1);
 
 	gem_close(fd_bad, h1);
 	gem_close(fd_bad, h2);
 	gem_close(fd_good, h6);
-	gem_close(fd_good, h7);
 
 	h1 = exec_valid(fd_good, 0);
 	igt_assert_lte(0, h1);
@@ -554,7 +552,7 @@ static void test_ban(void)
 
 static void test_ban_ctx(void)
 {
-	int h1,h2,h3,h4,h5,h6,h7;
+	int h1,h2,h3,h4,h5,h6;
 	int ctx_good, ctx_bad;
 	int fd;
 	int retry = 10;
@@ -587,7 +585,6 @@ static void test_ban_ctx(void)
 	pending_count++;
 
 	h6 = exec_valid(fd, ctx_good);
-	h7 = exec_valid(fd, ctx_good);
 
 	while (retry--) {
 		h3 = inject_hang_no_ban_error(fd, ctx_bad);
@@ -616,7 +613,7 @@ static void test_ban_ctx(void)
 	igt_assert_eq(h4, -EIO);
 	assert_reset_status(fd, ctx_bad, RS_BATCH_ACTIVE);
 
-	gem_sync(fd, h7);
+	gem_sync(fd, h6);
 	assert_reset_status(fd, ctx_good, RS_BATCH_PENDING);
 
 	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs_good), 0);
@@ -625,12 +622,11 @@ static void test_ban_ctx(void)
 	igt_assert(rs_bad.batch_active == active_count);
 	igt_assert(rs_bad.batch_pending == pending_count);
 	igt_assert(rs_good.batch_active == 0);
-	igt_assert(rs_good.batch_pending == 2);
+	igt_assert(rs_good.batch_pending == 1);
 
 	gem_close(fd, h1);
 	gem_close(fd, h2);
 	gem_close(fd, h6);
-	gem_close(fd, h7);
 
 	h1 = exec_valid(fd, ctx_good);
 	igt_assert_lte(0, h1);
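For reference, the batch_active/batch_pending counters asserted in this patch come from the i915 reset-stats ioctl. The sketch below shows a minimal standalone query; the field layout and macro names follow the upstream i915 uAPI headers (i915_drm.h), while the test itself may carry local copies of these definitions so it can build against older libdrm, so treat this as illustrative rather than as the test's own helper.

```c
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>	/* DRM_IOCTL_I915_GET_RESET_STATS, struct drm_i915_reset_stats */

/*
 * struct drm_i915_reset_stats (upstream uAPI, for reference):
 *   __u32 ctx_id;        context to query (0 = default context)
 *   __u32 flags;         must be 0
 *   __u32 reset_count;   GPU resets since boot/module reload
 *   __u32 batch_active;  batches of this context executing when a hang was detected
 *   __u32 batch_pending; batches of this context queued behind a hang and victimised
 *   __u32 pad;
 */
static int query_reset_stats(int fd, uint32_t ctx_id,
			     struct drm_i915_reset_stats *rs)
{
	memset(rs, 0, sizeof(*rs));
	rs->ctx_id = ctx_id;

	if (drmIoctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, rs))
		return -errno;
	return 0;
}
```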