diff mbox

[i-g-t,1/3] tests/gem_reset_stats: Change pending/active assertions

Message ID 1480599105-31479-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Dec. 1, 2016, 1:31 p.m. UTC
Now that we replay the non guilty contexts and always replay
the default ctx, even when guilty, the assumptions of how many
active and pending batches there was in the time of reset has
changed.

Driver doesn't increment pending counts for contexts that it
considered unaffected by reset. Because it can now replay the
queued requests.

For contexts, guilty of reset, we replay the request envelopes,
but nop the batchbuffer starts. This changes of how many 'hangs'
we think there were queued. As future queued hangs are now NOPed
out, we need to change the assumption of active counts also.

Adapt to these changes of how the replaying of batches affect
the assertions in pending/active counting and banning tests.
While doing this, throw out the retrying to be more strict about
the determinism we want to achieve.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 tests/gem_reset_stats.c | 154 ++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 98 deletions(-)

Comments

Chris Wilson Dec. 1, 2016, 1:51 p.m. UTC | #1
How about a patch 0 to enable hang testing contexts on all rings now?
Then exploration of how one ring affects another...

You will want to use busy batches to load the engines without hanging,
that will be tricky...

On Thu, Dec 01, 2016 at 03:31:43PM +0200, Mika Kuoppala wrote:
> Now that we replay the non guilty contexts and always replay
> the default ctx, even when guilty, the assumptions of how many
> active and pending batches there was in the time of reset has
> changed.
> 
> Driver doesn't increment pending counts for contexts that it
> considered unaffected by reset. Because it can now replay the
> queued requests.
> 
> For contexts, guilty of reset, we replay the request envelopes,
> but nop the batchbuffer starts. This changes of how many 'hangs'
> we think there were queued. As future queued hangs are now NOPed
> out, we need to change the assumption of active counts also.
> 
> Adapt to these changes of how the replaying of batches affect
> the assertions in pending/active counting and banning tests.
> While doing this, throw out the retrying to be more strict about
> the determinism we want to achieve.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  tests/gem_reset_stats.c | 154 ++++++++++++++++++------------------------------
>  1 file changed, 56 insertions(+), 98 deletions(-)
> 
> diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
> index 3de74af..2718a33 100644
> --- a/tests/gem_reset_stats.c
> +++ b/tests/gem_reset_stats.c
> @@ -152,7 +152,7 @@ static struct timespec ts_injected;
>  
>  #define BAN HANG_ALLOW_BAN
>  #define ASYNC 2
> -static void inject_hang(int fd, uint32_t ctx,
> +static bool inject_hang(int fd, uint32_t ctx,
>  			const struct intel_execution_engine *e,
>  			unsigned flags)
>  {
> @@ -163,6 +163,8 @@ static void inject_hang(int fd, uint32_t ctx,
>  	hang = igt_hang_ctx(fd, ctx, e->exec_id | e->flags, flags & BAN, NULL);
>  	if ((flags & ASYNC) == 0)
>  		igt_post_hang_ring(fd, hang);
> +
> +	return hang.handle >= 0;

Never returns false. Future patch?

>  static const char *status_to_string(int x)
> @@ -239,7 +241,7 @@ static void test_rs(const struct intel_execution_engine *e,
>  		if (i == hang_index)
>  			assert_reset_status(i, fd[i], 0, RS_BATCH_ACTIVE);
>  		if (i > hang_index)
> -			assert_reset_status(i, fd[i], 0, RS_BATCH_PENDING);
> +			assert_reset_status(i, fd[i], 0, RS_NO_ERROR);
>  	}
>  
>  	igt_assert(igt_seconds_elapsed(&ts_injected) <= 30);
> @@ -312,10 +314,10 @@ static void test_rs_ctx(const struct intel_execution_engine *e,
>  						    RS_BATCH_ACTIVE);
>  			if (i == hang_index && j > hang_context)
>  				assert_reset_status(i, fd[i], ctx[i][j],
> -						    RS_BATCH_PENDING);
> +						    RS_NO_ERROR);
>  			if (i > hang_index)
>  				assert_reset_status(i, fd[i], ctx[i][j],
> -						    RS_BATCH_PENDING);
> +						    RS_NO_ERROR);
>  		}
>  	}
>  
> @@ -325,128 +327,84 @@ static void test_rs_ctx(const struct intel_execution_engine *e,
>  	}
>  }
>  
> -static void test_ban(const struct intel_execution_engine *e)
> +static void test_ban_ctx(const struct intel_execution_engine *e,
> +			 const bool ban_default)
>  {
>  	struct local_drm_i915_reset_stats rs_bad, rs_good;
> -	int fd_bad, fd_good;
> -	int ban, retry = 10;
> +	int fd_bad, fd_good, i;
> +	uint32_t ctx_good, ctx_bad;
>  	int active_count = 0, pending_count = 0;
>  
>  	fd_bad = drm_open_driver(DRIVER_INTEL);
>  	fd_good = drm_open_driver(DRIVER_INTEL);
>  
> -	assert_reset_status(fd_bad, fd_bad, 0, RS_NO_ERROR);
> +	ctx_good = gem_context_create(fd_bad);
> +	if (!ban_default)
> +		ctx_bad = gem_context_create(fd_bad);
> +	else
> +		ctx_bad = 0;
> +
> +	assert_reset_status(fd_bad, fd_bad, ctx_bad, RS_NO_ERROR);
> +	assert_reset_status(fd_bad, fd_bad, ctx_good, RS_NO_ERROR);
>  	assert_reset_status(fd_good, fd_good, 0, RS_NO_ERROR);
>  
> -	noop(fd_bad, 0, e);
> -	noop(fd_good, 0, e);
> +	igt_assert_lt(0, noop(fd_bad, ctx_bad, e));
> +	igt_assert_lt(0, noop(fd_bad, ctx_good, e));
> +	igt_assert_lt(0, noop(fd_good, 0, e));

Yuck.

__noop();
noop() { igt_assert_eq(__noop, 0));

(Or better names than noop? )

Otherwise, lgtm. R-b if you feel so inclined.
-Chris
diff mbox

Patch

diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
index 3de74af..2718a33 100644
--- a/tests/gem_reset_stats.c
+++ b/tests/gem_reset_stats.c
@@ -152,7 +152,7 @@  static struct timespec ts_injected;
 
 #define BAN HANG_ALLOW_BAN
 #define ASYNC 2
-static void inject_hang(int fd, uint32_t ctx,
+static bool inject_hang(int fd, uint32_t ctx,
 			const struct intel_execution_engine *e,
 			unsigned flags)
 {
@@ -163,6 +163,8 @@  static void inject_hang(int fd, uint32_t ctx,
 	hang = igt_hang_ctx(fd, ctx, e->exec_id | e->flags, flags & BAN, NULL);
 	if ((flags & ASYNC) == 0)
 		igt_post_hang_ring(fd, hang);
+
+	return hang.handle >= 0;
 }
 
 static const char *status_to_string(int x)
@@ -239,7 +241,7 @@  static void test_rs(const struct intel_execution_engine *e,
 		if (i == hang_index)
 			assert_reset_status(i, fd[i], 0, RS_BATCH_ACTIVE);
 		if (i > hang_index)
-			assert_reset_status(i, fd[i], 0, RS_BATCH_PENDING);
+			assert_reset_status(i, fd[i], 0, RS_NO_ERROR);
 	}
 
 	igt_assert(igt_seconds_elapsed(&ts_injected) <= 30);
@@ -312,10 +314,10 @@  static void test_rs_ctx(const struct intel_execution_engine *e,
 						    RS_BATCH_ACTIVE);
 			if (i == hang_index && j > hang_context)
 				assert_reset_status(i, fd[i], ctx[i][j],
-						    RS_BATCH_PENDING);
+						    RS_NO_ERROR);
 			if (i > hang_index)
 				assert_reset_status(i, fd[i], ctx[i][j],
-						    RS_BATCH_PENDING);
+						    RS_NO_ERROR);
 		}
 	}
 
@@ -325,128 +327,84 @@  static void test_rs_ctx(const struct intel_execution_engine *e,
 	}
 }
 
-static void test_ban(const struct intel_execution_engine *e)
+static void test_ban_ctx(const struct intel_execution_engine *e,
+			 const bool ban_default)
 {
 	struct local_drm_i915_reset_stats rs_bad, rs_good;
-	int fd_bad, fd_good;
-	int ban, retry = 10;
+	int fd_bad, fd_good, i;
+	uint32_t ctx_good, ctx_bad;
 	int active_count = 0, pending_count = 0;
 
 	fd_bad = drm_open_driver(DRIVER_INTEL);
 	fd_good = drm_open_driver(DRIVER_INTEL);
 
-	assert_reset_status(fd_bad, fd_bad, 0, RS_NO_ERROR);
+	ctx_good = gem_context_create(fd_bad);
+	if (!ban_default)
+		ctx_bad = gem_context_create(fd_bad);
+	else
+		ctx_bad = 0;
+
+	assert_reset_status(fd_bad, fd_bad, ctx_bad, RS_NO_ERROR);
+	assert_reset_status(fd_bad, fd_bad, ctx_good, RS_NO_ERROR);
 	assert_reset_status(fd_good, fd_good, 0, RS_NO_ERROR);
 
-	noop(fd_bad, 0, e);
-	noop(fd_good, 0, e);
+	igt_assert_lt(0, noop(fd_bad, ctx_bad, e));
+	igt_assert_lt(0, noop(fd_bad, ctx_good, e));
+	igt_assert_lt(0, noop(fd_good, 0, e));
 
-	assert_reset_status(fd_bad, fd_bad, 0, RS_NO_ERROR);
+	assert_reset_status(fd_bad, fd_bad, ctx_bad, RS_NO_ERROR);
+	assert_reset_status(fd_bad, fd_bad, ctx_good, RS_NO_ERROR);
 	assert_reset_status(fd_good, fd_good, 0, RS_NO_ERROR);
 
-	inject_hang(fd_bad, 0, e, BAN | ASYNC);
-	active_count++;
+	inject_hang(fd_bad, ctx_bad, e, BAN | ASYNC);
+	/* The next synced ban below ban will get nopped if default ctx */
+	if (!ctx_bad)
+		active_count++;
 
-	noop(fd_good, 0, e);
-	noop(fd_good, 0, e);
+	igt_assert_lt(0, noop(fd_good, 0, e));
+	/* We don't skip requests for default ctx, so no
+	 * pending change as they will be replayed */
+	igt_assert_lt(0, noop(fd_bad, 0, e));
+	igt_assert_lt(0, noop(fd_bad, ctx_good, e));
 
-	/* The second hang will count as pending and be discarded */
-	active_count--;
-	pending_count += 1; /* inject hang does 1 real exec + 1 dummy */
-	while (retry--) {
-		inject_hang(fd_bad, 0, e, BAN);
-		active_count++;
+	i = 0;
+	do {
+		/* Check if we are already kicked out */
+		if (noop(fd_bad, ctx_bad, e) < 0)
+			break;
 
-		ban = noop(fd_bad, 0, e);
-		if (ban == -EIO)
+		if (inject_hang(fd_bad, ctx_bad, e, BAN))
+			active_count++;
+		else
 			break;
 
-		/* Should not happen often but sometimes hang is declared too
-		 * slow due to our way of faking hang using loop */
-		gem_close(fd_bad, ban);
+		i++;
+	} while (i < 15);
+
+	/* ctx case needs more as default one will get one nopped */
+	if (ctx_bad)
+		igt_assert_eq(i, 5);
+	else
+		igt_assert_eq(i, 4);
 
-		igt_info("retrying for ban (%d)\n", retry);
-	}
-	igt_assert_eq(ban, -EIO);
 	igt_assert_lt(0, noop(fd_good, 0, e));
+	igt_assert_eq(-EIO, noop(fd_bad, ctx_bad, e));
+	igt_assert_lt(0, noop(fd_bad, ctx_good, e));
 
-	assert_reset_status(fd_bad, fd_bad, 0, RS_BATCH_ACTIVE);
-	igt_assert_eq(gem_reset_stats(fd_bad, 0, &rs_bad), 0);
+	assert_reset_status(fd_bad, fd_bad, ctx_bad, RS_BATCH_ACTIVE);
+	igt_assert_eq(gem_reset_stats(fd_bad, ctx_bad, &rs_bad), 0);
 	igt_assert_eq(rs_bad.batch_active, active_count);
 	igt_assert_eq(rs_bad.batch_pending, pending_count);
 
-	assert_reset_status(fd_good, fd_good, 0, RS_BATCH_PENDING);
+	assert_reset_status(fd_good, fd_good, 0, RS_NO_ERROR);
 	igt_assert_eq(gem_reset_stats(fd_good, 0, &rs_good), 0);
 	igt_assert_eq(rs_good.batch_active, 0);
-	igt_assert_eq(rs_good.batch_pending, 2);
+	igt_assert_eq(rs_good.batch_pending, 0);
 
 	close(fd_bad);
 	close(fd_good);
 }
 
-static void test_ban_ctx(const struct intel_execution_engine *e)
-{
-	struct local_drm_i915_reset_stats rs_bad, rs_good;
-	int fd, ban, retry = 10;
-	uint32_t ctx_good, ctx_bad;
-	int active_count = 0, pending_count = 0;
-
-	fd = drm_open_driver(DRIVER_INTEL);
-
-	assert_reset_status(fd, fd, 0, RS_NO_ERROR);
-
-	ctx_good = gem_context_create(fd);
-	ctx_bad = gem_context_create(fd);
-
-	assert_reset_status(fd, fd, 0, RS_NO_ERROR);
-	assert_reset_status(fd, fd, ctx_good, RS_NO_ERROR);
-	assert_reset_status(fd, fd, ctx_bad, RS_NO_ERROR);
-
-	noop(fd, ctx_bad, e);
-	noop(fd, ctx_good, e);
-
-	assert_reset_status(fd, fd, ctx_good, RS_NO_ERROR);
-	assert_reset_status(fd, fd, ctx_bad, RS_NO_ERROR);
-
-	inject_hang(fd, ctx_bad, e, BAN | ASYNC);
-	active_count++;
-
-	noop(fd, ctx_good, e);
-	noop(fd, ctx_good, e);
-
-	/* This second hang will count as pending and be discarded */
-	active_count--;
-	pending_count++;
-	while (retry--) {
-		inject_hang(fd, ctx_bad, e, BAN);
-		active_count++;
-
-		ban = noop(fd, ctx_bad, e);
-		if (ban == -EIO)
-			break;
-
-		/* Should not happen often but sometimes hang is declared too
-		 * slow due to our way of faking hang using loop */
-		gem_close(fd, ban);
-
-		igt_info("retrying for ban (%d)\n", retry);
-	}
-	igt_assert_eq(ban, -EIO);
-	igt_assert_lt(0, noop(fd, ctx_good, e));
-
-	assert_reset_status(fd, fd, ctx_bad, RS_BATCH_ACTIVE);
-	igt_assert_eq(gem_reset_stats(fd, ctx_bad, &rs_bad), 0);
-	igt_assert_eq(rs_bad.batch_active, active_count);
-	igt_assert_eq(rs_bad.batch_pending, pending_count);
-
-	assert_reset_status(fd, fd, ctx_good, RS_BATCH_PENDING);
-	igt_assert_eq(gem_reset_stats(fd, ctx_good, &rs_good), 0);
-	igt_assert_eq(rs_good.batch_active, 0);
-	igt_assert_eq(rs_good.batch_pending, 2);
-
-	close(fd);
-}
-
 static void test_unrelated_ctx(const struct intel_execution_engine *e)
 {
 	int fd1,fd2;
@@ -812,10 +770,10 @@  igt_main
 			RUN_CTX_TEST(test_rs_ctx(e, 4, 4, 1, 2));
 
 		igt_subtest_f("ban-%s", e->name)
-			RUN_TEST(test_ban(e));
+			RUN_TEST(test_ban_ctx(e, true));
 
 		igt_subtest_f("ban-ctx-%s", e->name)
-			RUN_CTX_TEST(test_ban_ctx(e));
+			RUN_CTX_TEST(test_ban_ctx(e, false));
 
 		igt_subtest_f("reset-count-%s", e->name)
 			RUN_TEST(test_reset_count(e, false));