diff mbox

[i-g-t] tests/gem_eio: Never re-use contexts which were in the middle of GPU reset

Message ID 20180327164056.6301-1-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tvrtko Ursulin March 27, 2018, 4:40 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Contexts executing when reset triggers are potentially corrupt so trying to
use them from a subsequent test (like the default context) can hang the
GPU or even the driver.

Work around that by always creating a dedicated context which will be
running when GPU reset happens.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tests/gem_eio.c | 96 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 36 deletions(-)

Comments

Chris Wilson March 27, 2018, 7:49 p.m. UTC | #1
Quoting Tvrtko Ursulin (2018-03-27 17:40:56)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Contexts executing when reset triggers are potentialy corrupt so trying to
> use them from a subsequent test (like the default context) can hang the
> GPU or even the driver.
> 
> Workaround that by always creating a dedicated context which will be
> running when GPU reset happens.

Hmm, what CI won't say until it gets around to running on the shards
(i.e. post merge) is that the !contexts tests are expected to pass on
the older gen. This patch will now fail in gem_context_create().

I was only half jokingly suggesting reopen_device().

As CI is only running each subtest individually, I don't think we are
exposed to the issue, that should give us a bit of time to hopefully fix
it. So back to the perplexing puzzle.
-Chris
diff mbox

Patch

diff --git a/tests/gem_eio.c b/tests/gem_eio.c
index b824d9d4c9c0..cefe26adf893 100644
--- a/tests/gem_eio.c
+++ b/tests/gem_eio.c
@@ -249,14 +249,34 @@  static int __check_wait(int fd, uint32_t bo, unsigned int wait)
 	return ret;
 }
 
+static uint32_t context_create_safe(int i915)
+{
+	struct drm_i915_gem_context_param param;
+
+	memset(&param, 0, sizeof(param));
+
+	param.ctx_id = gem_context_create(i915);
+	param.param = I915_CONTEXT_PARAM_BANNABLE;
+	gem_context_set_param(i915, &param);
+
+	param.param = I915_CONTEXT_PARAM_NO_ERROR_CAPTURE;
+	param.value = 1;
+	gem_context_set_param(i915, &param);
+
+	return param.ctx_id;
+}
+
 #define TEST_WEDGE (1)
 
 static void test_wait(int fd, unsigned int flags, unsigned int wait)
 {
 	igt_spin_t *hang;
+	uint32_t ctx;
 
 	igt_require_gem(fd);
 
+	ctx = context_create_safe(fd);
+
 	/*
 	 * If the request we wait on completes due to a hang (even for
 	 * that request), the user expects the return value to 0 (success).
@@ -267,7 +287,7 @@  static void test_wait(int fd, unsigned int flags, unsigned int wait)
 	else
 		igt_require(i915_reset_control(true));
 
-	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
+	hang = spin_sync(fd, ctx, I915_EXEC_DEFAULT);
 
 	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
 
@@ -276,6 +296,8 @@  static void test_wait(int fd, unsigned int flags, unsigned int wait)
 	igt_require(i915_reset_control(true));
 
 	trigger_reset(fd);
+
+	gem_context_destroy(fd, ctx);
 }
 
 static void test_suspend(int fd, int state)
@@ -309,6 +331,7 @@  static void test_inflight(int fd, unsigned int wait)
 
 	for_each_engine(fd, engine) {
 		struct drm_i915_gem_execbuffer2 execbuf;
+		uint32_t ctx = context_create_safe(fd);
 		igt_spin_t *hang;
 		int fence[64]; /* conservative estimate of ring size */
 
@@ -316,7 +339,7 @@  static void test_inflight(int fd, unsigned int wait)
 		igt_debug("Starting %s on engine '%s'\n", __func__, e__->name);
 		igt_require(i915_reset_control(false));
 
-		hang = spin_sync(fd, 0, engine);
+		hang = spin_sync(fd, ctx, engine);
 		obj[0].handle = hang->handle;
 
 		memset(&execbuf, 0, sizeof(execbuf));
@@ -340,6 +363,8 @@  static void test_inflight(int fd, unsigned int wait)
 		igt_spin_batch_free(fd, hang);
 		igt_assert(i915_reset_control(true));
 		trigger_reset(fd);
+
+		gem_context_destroy(fd, ctx);
 	}
 }
 
@@ -350,17 +375,20 @@  static void test_inflight_suspend(int fd)
 	uint32_t bbe = MI_BATCH_BUFFER_END;
 	int fence[64]; /* conservative estimate of ring size */
 	igt_spin_t *hang;
+	uint32_t ctx;
 
 	igt_require_gem(fd);
 	igt_require(gem_has_exec_fence(fd));
 	igt_require(i915_reset_control(false));
 
+	ctx = context_create_safe(fd);
+
 	memset(obj, 0, sizeof(obj));
 	obj[0].flags = EXEC_OBJECT_WRITE;
 	obj[1].handle = gem_create(fd, 4096);
 	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
 
-	hang = spin_sync(fd, 0, 0);
+	hang = spin_sync(fd, ctx, 0);
 	obj[0].handle = hang->handle;
 
 	memset(&execbuf, 0, sizeof(execbuf));
@@ -387,23 +415,8 @@  static void test_inflight_suspend(int fd)
 	igt_spin_batch_free(fd, hang);
 	igt_assert(i915_reset_control(true));
 	trigger_reset(fd);
-}
-
-static uint32_t context_create_safe(int i915)
-{
-	struct drm_i915_gem_context_param param;
-
-	memset(&param, 0, sizeof(param));
 
-	param.ctx_id = gem_context_create(i915);
-	param.param = I915_CONTEXT_PARAM_BANNABLE;
-	gem_context_set_param(i915, &param);
-
-	param.param = I915_CONTEXT_PARAM_NO_ERROR_CAPTURE;
-	param.value = 1;
-	gem_context_set_param(i915, &param);
-
-	return param.ctx_id;
+	gem_context_destroy(fd, ctx);
 }
 
 static void test_inflight_contexts(int fd, unsigned int wait)
@@ -411,40 +424,40 @@  static void test_inflight_contexts(int fd, unsigned int wait)
 	struct drm_i915_gem_exec_object2 obj[2];
 	const uint32_t bbe = MI_BATCH_BUFFER_END;
 	unsigned int engine;
-	uint32_t ctx[64];
+	uint32_t ctx[65];
 
 	igt_require_gem(fd);
 	igt_require(gem_has_exec_fence(fd));
 	gem_require_contexts(fd);
 
-	for (unsigned int n = 0; n < ARRAY_SIZE(ctx); n++)
-		ctx[n] = context_create_safe(fd);
-
 	memset(obj, 0, sizeof(obj));
 	obj[0].flags = EXEC_OBJECT_WRITE;
 	obj[1].handle = gem_create(fd, 4096);
 	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
 
 	for_each_engine(fd, engine) {
-		struct drm_i915_gem_execbuffer2 execbuf;
+		struct drm_i915_gem_execbuffer2 execbuf = { };
 		igt_spin_t *hang;
 		int fence[64];
 
-		gem_quiescent_gpu(fd);
-
 		igt_debug("Starting %s on engine '%s'\n", __func__, e__->name);
+
 		igt_require(i915_reset_control(false));
 
-		hang = spin_sync(fd, 0, engine);
+		for (unsigned int n = 0; n < ARRAY_SIZE(ctx); n++)
+			ctx[n] = context_create_safe(fd);
+
+		gem_quiescent_gpu(fd);
+
+		hang = spin_sync(fd, ctx[0], engine);
 		obj[0].handle = hang->handle;
 
-		memset(&execbuf, 0, sizeof(execbuf));
 		execbuf.buffers_ptr = to_user_pointer(obj);
 		execbuf.buffer_count = 2;
 		execbuf.flags = engine | I915_EXEC_FENCE_OUT;
 
-		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
-			execbuf.rsvd1 = ctx[n];
+		for (unsigned int n = 0; n < (ARRAY_SIZE(fence) - 1); n++) {
+			execbuf.rsvd1 = ctx[n + 1];
 			gem_execbuf_wr(fd, &execbuf);
 			fence[n] = execbuf.rsvd2 >> 32;
 			igt_assert(fence[n] != -1);
@@ -452,18 +465,19 @@  static void test_inflight_contexts(int fd, unsigned int wait)
 
 		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
 
-		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
+		for (unsigned int n = 0; n < (ARRAY_SIZE(fence) - 1); n++) {
 			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
 			close(fence[n]);
 		}
 
 		igt_spin_batch_free(fd, hang);
+
+		for (unsigned int n = 0; n < ARRAY_SIZE(ctx); n++)
+			gem_context_destroy(fd, ctx[n]);
+
 		igt_assert(i915_reset_control(true));
 		trigger_reset(fd);
 	}
-
-	for (unsigned int n = 0; n < ARRAY_SIZE(ctx); n++)
-		gem_context_destroy(fd, ctx[n]);
 }
 
 static void test_inflight_external(int fd)
@@ -473,15 +487,18 @@  static void test_inflight_external(int fd)
 	struct drm_i915_gem_exec_object2 obj;
 	igt_spin_t *hang;
 	uint32_t fence;
+	uint32_t ctx;
 	IGT_CORK_FENCE(cork);
 
 	igt_require_sw_sync();
 	igt_require(gem_has_exec_fence(fd));
 
+	ctx = context_create_safe(fd);
+
 	fence = igt_cork_plug(&cork, fd);
 
 	igt_require(i915_reset_control(false));
-	hang = __spin_poll(fd, 0, 0);
+	hang = __spin_poll(fd, ctx, 0);
 
 	memset(&obj, 0, sizeof(obj));
 	obj.handle = gem_create(fd, 4096);
@@ -514,6 +531,8 @@  static void test_inflight_external(int fd)
 	igt_spin_batch_free(fd, hang);
 	igt_assert(i915_reset_control(true));
 	trigger_reset(fd);
+
+	gem_context_destroy(fd, ctx);
 }
 
 static void test_inflight_internal(int fd, unsigned int wait)
@@ -524,12 +543,15 @@  static void test_inflight_internal(int fd, unsigned int wait)
 	unsigned engine, nfence = 0;
 	int fences[16];
 	igt_spin_t *hang;
+	uint32_t ctx;
 
 	igt_require_gem(fd);
 	igt_require(gem_has_exec_fence(fd));
 
+	ctx = context_create_safe(fd);
+
 	igt_require(i915_reset_control(false));
-	hang = spin_sync(fd, 0, 0);
+	hang = spin_sync(fd, ctx, 0);
 
 	memset(obj, 0, sizeof(obj));
 	obj[0].handle = hang->handle;
@@ -560,6 +582,8 @@  static void test_inflight_internal(int fd, unsigned int wait)
 	igt_spin_batch_free(fd, hang);
 	igt_assert(i915_reset_control(true));
 	trigger_reset(fd);
+
+	gem_context_destroy(fd, ctx);
 }
 
 static int fd = -1;