diff mbox

[i-g-t] tests/gem_reset_stats.c: fix "ban" tests with scheduler

Message ID 1436534819-11328-1-git-send-email-tim.gore@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

tim.gore@intel.com July 10, 2015, 1:26 p.m. UTC
From: Tim Gore <tim.gore@intel.com>

The tests for context banning fail when the gpu scheduler
is enabled. The test causes a hang (using an infinite loop
batch) and then queues up some work behind it on both the
hanging context and also on a second "good" context. On
the "good" context it queues up 2 batch buffers. After the
hanging ring has been reset (not a full gpu reset) the
test checks the values of batch_active and batch_pending
returned by the i915_get_reset_stats_ioctl. For the "good"
context it expects to see batch_pending == 2, because two
batch buffers were queued up behind the hang on this
context. But, with the scheduler enabled (android, gen8),
one of these batch buffers is still waiting in the
scheduler and has not made it as far as the
ring->request_list, so this batch buffer is unaffected by
the ring reset, and batch_pending is only 1.

I considered putting in a test for the scheduler being
enabled, but decided that a simpler solution is to only
queue up 1 batch buffer on the good context. This does
not change the test logic in any way and ensures that we
should always have batch_pending=1, with or without the
scheduler.

Signed-off-by: Tim Gore <tim.gore@intel.com>
---
 tests/gem_reset_stats.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

Comments

Daniel Vetter July 13, 2015, 9:35 a.m. UTC | #1
On Fri, Jul 10, 2015 at 02:26:59PM +0100, tim.gore@intel.com wrote:
> From: Tim Gore <tim.gore@intel.com>
> 
> The tests for context banning fail when the gpu scheduler
> is enabled. The test causes a hang (using an infinite loop
> batch) and then queues up some work behind it on both the
> hanging context and also on a second "good" context. On
> the "good" context it queues up 2 batch buffers. After the
> hanging ring has been reset (not a full gpu reset) the
> test checks the values of batch_active and batch_pending
> returned by the i915_get_reset_stats_ioctl. For the "good"
> context it expects to see batch_pending == 2, because two
> batch buffers we queued up behind the hang on this
> context. But, with the scheduler enabled (android, gen8),
> one of these batch buffers is still waiting in the
> scheduler and has not made it as far as the
> ring->request_list, so this batch buffer is unaffected by
> the ring reset, and batch_pending is only 1.
> 
> I considered putting in a test for the scheduler being
> enabled, but decided that a simpler solution is to only
> queue up 1 batch buffer on the good context. This does
> not change the test logic in any way and ensures that we
> should always have batch_pending=1, with or without the
> scheduler.

For the scheduler/tdr we probably need to split this up into two testcases
each:
- one where the 2nd batch depends upon the first (cross-context dependency).
- one where the 2nd batch doesn't depend upon the first (should execute
  unhampered with scheduler/tdr).

Since we don't yet have a scheduler/tdr both of these will result in the
same outcome (since there's always the temporal dependency). But with your
patch you only test the 2nd case (and incompletely, we should assert that
the 2nd batch did run) and ignore the first case.

In short this failure here is telling you that your test coverage for
these features is lacking. The correct fix is not to mangle the existing,
but fix it up to correctly cover the new expectations in all cases. And
that should be done as part of the tdr/scheduler submission.
-Daniel

> 
> Signed-off-by: Tim Gore <tim.gore@intel.com>
> ---
>  tests/gem_reset_stats.c | 16 ++++++----------
>  1 file changed, 6 insertions(+), 10 deletions(-)
> 
> diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
> index 2bb4291..6529463 100644
> --- a/tests/gem_reset_stats.c
> +++ b/tests/gem_reset_stats.c
> @@ -468,7 +468,7 @@ static void test_rs_ctx(int num_fds, int num_ctx, int hang_index,
>  
>  static void test_ban(void)
>  {
> -	int h1,h2,h3,h4,h5,h6,h7;
> +	int h1,h2,h3,h4,h5,h6;
>  	int fd_bad, fd_good;
>  	int retry = 10;
>  	int active_count = 0, pending_count = 0;
> @@ -496,7 +496,6 @@ static void test_ban(void)
>  	pending_count++;
>  
>  	h6 = exec_valid(fd_good, 0);
> -	h7 = exec_valid(fd_good, 0);
>  
>          while (retry--) {
>                  h3 = inject_hang_no_ban_error(fd_bad, 0);
> @@ -525,7 +524,7 @@ static void test_ban(void)
>  	igt_assert_eq(h4, -EIO);
>  	assert_reset_status(fd_bad, 0, RS_BATCH_ACTIVE);
>  
> -	gem_sync(fd_good, h7);
> +	gem_sync(fd_good, h6);
>  	assert_reset_status(fd_good, 0, RS_BATCH_PENDING);
>  
>  	igt_assert_eq(gem_reset_stats(fd_good, 0, &rs_good), 0);
> @@ -534,12 +533,11 @@ static void test_ban(void)
>  	igt_assert(rs_bad.batch_active == active_count);
>  	igt_assert(rs_bad.batch_pending == pending_count);
>  	igt_assert(rs_good.batch_active == 0);
> -	igt_assert(rs_good.batch_pending == 2);
> +	igt_assert(rs_good.batch_pending == 1);
>  
>  	gem_close(fd_bad, h1);
>  	gem_close(fd_bad, h2);
>  	gem_close(fd_good, h6);
> -	gem_close(fd_good, h7);
>  
>  	h1 = exec_valid(fd_good, 0);
>  	igt_assert_lte(0, h1);
> @@ -554,7 +552,7 @@ static void test_ban(void)
>  
>  static void test_ban_ctx(void)
>  {
> -	int h1,h2,h3,h4,h5,h6,h7;
> +	int h1,h2,h3,h4,h5,h6;
>  	int ctx_good, ctx_bad;
>  	int fd;
>  	int retry = 10;
> @@ -587,7 +585,6 @@ static void test_ban_ctx(void)
>  	pending_count++;
>  
>  	h6 = exec_valid(fd, ctx_good);
> -	h7 = exec_valid(fd, ctx_good);
>  
>          while (retry--) {
>                  h3 = inject_hang_no_ban_error(fd, ctx_bad);
> @@ -616,7 +613,7 @@ static void test_ban_ctx(void)
>  	igt_assert_eq(h4, -EIO);
>  	assert_reset_status(fd, ctx_bad, RS_BATCH_ACTIVE);
>  
> -	gem_sync(fd, h7);
> +	gem_sync(fd, h6);
>  	assert_reset_status(fd, ctx_good, RS_BATCH_PENDING);
>  
>  	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs_good), 0);
> @@ -625,12 +622,11 @@ static void test_ban_ctx(void)
>  	igt_assert(rs_bad.batch_active == active_count);
>  	igt_assert(rs_bad.batch_pending == pending_count);
>  	igt_assert(rs_good.batch_active == 0);
> -	igt_assert(rs_good.batch_pending == 2);
> +	igt_assert(rs_good.batch_pending == 1);
>  
>  	gem_close(fd, h1);
>  	gem_close(fd, h2);
>  	gem_close(fd, h6);
> -	gem_close(fd, h7);
>  
>  	h1 = exec_valid(fd, ctx_good);
>  	igt_assert_lte(0, h1);
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
tim.gore@intel.com July 13, 2015, 10:09 a.m. UTC | #2
Tim Gore
Intel Corporation (UK) Ltd. - Co. Reg. #1134945 - Pipers Way, Swindon SN3 1RJ


> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel
> Vetter
> Sent: Monday, July 13, 2015 10:35 AM
> To: Gore, Tim
> Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas; Kuoppala, Mika
> Subject: Re: [Intel-gfx] [PATCH i-g-t] tests/gem_reset_stats.c: fix "ban" tests
> with scheduler
> 
> On Fri, Jul 10, 2015 at 02:26:59PM +0100, tim.gore@intel.com wrote:
> > From: Tim Gore <tim.gore@intel.com>
> >
> > The tests for context banning fail when the gpu scheduler is enabled.
> > The test causes a hang (using an infinite loop
> > batch) and then queues up some work behind it on both the hanging
> > context and also on a second "good" context. On the "good" context it
> > queues up 2 batch buffers. After the hanging ring has been reset (not
> > a full gpu reset) the test checks the values of batch_active and
> > batch_pending returned by the i915_get_reset_stats_ioctl. For the
> > "good"
> > context it expects to see batch_pending == 2, because two batch
> > buffers we queued up behind the hang on this context. But, with the
> > scheduler enabled (android, gen8), one of these batch buffers is still
> > waiting in the scheduler and has not made it as far as the
> > ring->request_list, so this batch buffer is unaffected by
> > the ring reset, and batch_pending is only 1.
> >
> > I considered putting in a test for the scheduler being enabled, but
> > decided that a simpler solution is to only queue up 1 batch buffer on
> > the good context. This does not change the test logic in any way and
> > ensures that we should always have batch_pending=1, with or without
> > the scheduler.
> 
> For the scheduler/tdr we probably need to split this up into two testcases
> each:
> - one where the 2nd batch depends upon the first (cross-context depency).
> - one where the 2nd batch doesn't depend upon the first (should execute
>   unhampered with scheduler/tdr).
> 
> Since we don't yet have a scheduler/tdr both of these will result in the same
> outcome (since there's always the temporal depency). But with your patch
> you only test the 2nd case (and incompletely, we should assert that the 2nd
> batch did run) and ignore the first case.
> 
> In short this failure here is telling you that your test coverage for these
> features is lacking. The correct fix is not to mangle the existing, but fix it up to
> correctly cover the new expectations in all cases. And that should be done as
> part of the tdr/scheduler submission.
> -Daniel
> 
Should gem_reset_stats be testing the operation of the scheduler? I would have
thought that the scheduler operation should have its own test(s). Gem_reset_stats
is just about the reset mechanism and the stats collected. I can add this test, just
seems like the wrong place.

  Tim


> >
> > Signed-off-by: Tim Gore <tim.gore@intel.com>
> > ---
> >  tests/gem_reset_stats.c | 16 ++++++----------
> >  1 file changed, 6 insertions(+), 10 deletions(-)
> >
> > diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c index
> > 2bb4291..6529463 100644
> > --- a/tests/gem_reset_stats.c
> > +++ b/tests/gem_reset_stats.c
> > @@ -468,7 +468,7 @@ static void test_rs_ctx(int num_fds, int num_ctx,
> > int hang_index,
> >
> >  static void test_ban(void)
> >  {
> > -	int h1,h2,h3,h4,h5,h6,h7;
> > +	int h1,h2,h3,h4,h5,h6;
> >  	int fd_bad, fd_good;
> >  	int retry = 10;
> >  	int active_count = 0, pending_count = 0; @@ -496,7 +496,6 @@ static
> > void test_ban(void)
> >  	pending_count++;
> >
> >  	h6 = exec_valid(fd_good, 0);
> > -	h7 = exec_valid(fd_good, 0);
> >
> >          while (retry--) {
> >                  h3 = inject_hang_no_ban_error(fd_bad, 0); @@ -525,7
> > +524,7 @@ static void test_ban(void)
> >  	igt_assert_eq(h4, -EIO);
> >  	assert_reset_status(fd_bad, 0, RS_BATCH_ACTIVE);
> >
> > -	gem_sync(fd_good, h7);
> > +	gem_sync(fd_good, h6);
> >  	assert_reset_status(fd_good, 0, RS_BATCH_PENDING);
> >
> >  	igt_assert_eq(gem_reset_stats(fd_good, 0, &rs_good), 0); @@ -
> 534,12
> > +533,11 @@ static void test_ban(void)
> >  	igt_assert(rs_bad.batch_active == active_count);
> >  	igt_assert(rs_bad.batch_pending == pending_count);
> >  	igt_assert(rs_good.batch_active == 0);
> > -	igt_assert(rs_good.batch_pending == 2);
> > +	igt_assert(rs_good.batch_pending == 1);
> >
> >  	gem_close(fd_bad, h1);
> >  	gem_close(fd_bad, h2);
> >  	gem_close(fd_good, h6);
> > -	gem_close(fd_good, h7);
> >
> >  	h1 = exec_valid(fd_good, 0);
> >  	igt_assert_lte(0, h1);
> > @@ -554,7 +552,7 @@ static void test_ban(void)
> >
> >  static void test_ban_ctx(void)
> >  {
> > -	int h1,h2,h3,h4,h5,h6,h7;
> > +	int h1,h2,h3,h4,h5,h6;
> >  	int ctx_good, ctx_bad;
> >  	int fd;
> >  	int retry = 10;
> > @@ -587,7 +585,6 @@ static void test_ban_ctx(void)
> >  	pending_count++;
> >
> >  	h6 = exec_valid(fd, ctx_good);
> > -	h7 = exec_valid(fd, ctx_good);
> >
> >          while (retry--) {
> >                  h3 = inject_hang_no_ban_error(fd, ctx_bad); @@ -616,7
> > +613,7 @@ static void test_ban_ctx(void)
> >  	igt_assert_eq(h4, -EIO);
> >  	assert_reset_status(fd, ctx_bad, RS_BATCH_ACTIVE);
> >
> > -	gem_sync(fd, h7);
> > +	gem_sync(fd, h6);
> >  	assert_reset_status(fd, ctx_good, RS_BATCH_PENDING);
> >
> >  	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs_good), 0); @@
> > -625,12 +622,11 @@ static void test_ban_ctx(void)
> >  	igt_assert(rs_bad.batch_active == active_count);
> >  	igt_assert(rs_bad.batch_pending == pending_count);
> >  	igt_assert(rs_good.batch_active == 0);
> > -	igt_assert(rs_good.batch_pending == 2);
> > +	igt_assert(rs_good.batch_pending == 1);
> >
> >  	gem_close(fd, h1);
> >  	gem_close(fd, h2);
> >  	gem_close(fd, h6);
> > -	gem_close(fd, h7);
> >
> >  	h1 = exec_valid(fd, ctx_good);
> >  	igt_assert_lte(0, h1);
> > --
> > 1.9.1
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter July 13, 2015, 3 p.m. UTC | #3
On Mon, Jul 13, 2015 at 10:09:59AM +0000, Gore, Tim wrote:
> 
> 
> Tim Gore
> Intel Corporation (UK) Ltd. - Co. Reg. #1134945 - Pipers Way, Swindon SN3 1RJ
> 
> 
> > -----Original Message-----
> > From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel
> > Vetter
> > Sent: Monday, July 13, 2015 10:35 AM
> > To: Gore, Tim
> > Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas; Kuoppala, Mika
> > Subject: Re: [Intel-gfx] [PATCH i-g-t] tests/gem_reset_stats.c: fix "ban" tests
> > with scheduler
> > 
> > On Fri, Jul 10, 2015 at 02:26:59PM +0100, tim.gore@intel.com wrote:
> > > From: Tim Gore <tim.gore@intel.com>
> > >
> > > The tests for context banning fail when the gpu scheduler is enabled.
> > > The test causes a hang (using an infinite loop
> > > batch) and then queues up some work behind it on both the hanging
> > > context and also on a second "good" context. On the "good" context it
> > > queues up 2 batch buffers. After the hanging ring has been reset (not
> > > a full gpu reset) the test checks the values of batch_active and
> > > batch_pending returned by the i915_get_reset_stats_ioctl. For the
> > > "good"
> > > context it expects to see batch_pending == 2, because two batch
> > > buffers we queued up behind the hang on this context. But, with the
> > > scheduler enabled (android, gen8), one of these batch buffers is still
> > > waiting in the scheduler and has not made it as far as the
> > > ring->request_list, so this batch buffer is unaffected by
> > > the ring reset, and batch_pending is only 1.
> > >
> > > I considered putting in a test for the scheduler being enabled, but
> > > decided that a simpler solution is to only queue up 1 batch buffer on
> > > the good context. This does not change the test logic in any way and
> > > ensures that we should always have batch_pending=1, with or without
> > > the scheduler.
> > 
> > For the scheduler/tdr we probably need to split this up into two testcases
> > each:
> > - one where the 2nd batch depends upon the first (cross-context depency).
> > - one where the 2nd batch doesn't depend upon the first (should execute
> >   unhampered with scheduler/tdr).
> > 
> > Since we don't yet have a scheduler/tdr both of these will result in the same
> > outcome (since there's always the temporal depency). But with your patch
> > you only test the 2nd case (and incompletely, we should assert that the 2nd
> > batch did run) and ignore the first case.
> > 
> > In short this failure here is telling you that your test coverage for these
> > features is lacking. The correct fix is not to mangle the existing, but fix it up to
> > correctly cover the new expectations in all cases. And that should be done as
> > part of the tdr/scheduler submission.
> > -Daniel
> > 
> Should gem_rest_stats be testing the operation of the scheduler? I would have
> thought that the scheduler operation should have its own test(s). Gem_reset_stats
> is just about the reset mechanism and the stats collected. I can add this test, just
> seems like the wrong place.

Well gem_reset_stats assumes that a hang will victimize all subsequent
batches, irrespective of context. tdr/scheduler change that rather
fundamentally (which is the entire point really, at least of tdr). So yeah
we need to adjust those existing testcase.

And I think it's clearer if we do that by changing the existing
test-cases, that way the impact on existing features for new code is much
clearer.
-Daniel
diff mbox

Patch

diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
index 2bb4291..6529463 100644
--- a/tests/gem_reset_stats.c
+++ b/tests/gem_reset_stats.c
@@ -468,7 +468,7 @@  static void test_rs_ctx(int num_fds, int num_ctx, int hang_index,
 
 static void test_ban(void)
 {
-	int h1,h2,h3,h4,h5,h6,h7;
+	int h1,h2,h3,h4,h5,h6;
 	int fd_bad, fd_good;
 	int retry = 10;
 	int active_count = 0, pending_count = 0;
@@ -496,7 +496,6 @@  static void test_ban(void)
 	pending_count++;
 
 	h6 = exec_valid(fd_good, 0);
-	h7 = exec_valid(fd_good, 0);
 
         while (retry--) {
                 h3 = inject_hang_no_ban_error(fd_bad, 0);
@@ -525,7 +524,7 @@  static void test_ban(void)
 	igt_assert_eq(h4, -EIO);
 	assert_reset_status(fd_bad, 0, RS_BATCH_ACTIVE);
 
-	gem_sync(fd_good, h7);
+	gem_sync(fd_good, h6);
 	assert_reset_status(fd_good, 0, RS_BATCH_PENDING);
 
 	igt_assert_eq(gem_reset_stats(fd_good, 0, &rs_good), 0);
@@ -534,12 +533,11 @@  static void test_ban(void)
 	igt_assert(rs_bad.batch_active == active_count);
 	igt_assert(rs_bad.batch_pending == pending_count);
 	igt_assert(rs_good.batch_active == 0);
-	igt_assert(rs_good.batch_pending == 2);
+	igt_assert(rs_good.batch_pending == 1);
 
 	gem_close(fd_bad, h1);
 	gem_close(fd_bad, h2);
 	gem_close(fd_good, h6);
-	gem_close(fd_good, h7);
 
 	h1 = exec_valid(fd_good, 0);
 	igt_assert_lte(0, h1);
@@ -554,7 +552,7 @@  static void test_ban(void)
 
 static void test_ban_ctx(void)
 {
-	int h1,h2,h3,h4,h5,h6,h7;
+	int h1,h2,h3,h4,h5,h6;
 	int ctx_good, ctx_bad;
 	int fd;
 	int retry = 10;
@@ -587,7 +585,6 @@  static void test_ban_ctx(void)
 	pending_count++;
 
 	h6 = exec_valid(fd, ctx_good);
-	h7 = exec_valid(fd, ctx_good);
 
         while (retry--) {
                 h3 = inject_hang_no_ban_error(fd, ctx_bad);
@@ -616,7 +613,7 @@  static void test_ban_ctx(void)
 	igt_assert_eq(h4, -EIO);
 	assert_reset_status(fd, ctx_bad, RS_BATCH_ACTIVE);
 
-	gem_sync(fd, h7);
+	gem_sync(fd, h6);
 	assert_reset_status(fd, ctx_good, RS_BATCH_PENDING);
 
 	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs_good), 0);
@@ -625,12 +622,11 @@  static void test_ban_ctx(void)
 	igt_assert(rs_bad.batch_active == active_count);
 	igt_assert(rs_bad.batch_pending == pending_count);
 	igt_assert(rs_good.batch_active == 0);
-	igt_assert(rs_good.batch_pending == 2);
+	igt_assert(rs_good.batch_pending == 1);
 
 	gem_close(fd, h1);
 	gem_close(fd, h2);
 	gem_close(fd, h6);
-	gem_close(fd, h7);
 
 	h1 = exec_valid(fd, ctx_good);
 	igt_assert_lte(0, h1);