
[RFC,i-g-t] igt/gem_ringfill: Adds full ringbuffer preemption test

Message ID 20170815214421.2136-1-antonio.argenziano@intel.com (mailing list archive)
State New, archived

Commit Message

Antonio Argenziano Aug. 15, 2017, 9:44 p.m. UTC
Sending as RFC to get feedback on what would be the correct behaviour of
the driver and, therefore, if the test is valid.

We do a wait while holding the mutex if we are adding a request and figure
out that there is no more space in the ring buffer.
Is that considered a bug? In the current driver all goes well
because even if you are waiting on a hanging request, hangcheck will
eventually come in and kill it, and since the new request would have
had to wait there anyway it is not a big deal. But, once preemption
is added, this will cause a high priority (preemptive) request to
wait for a long time before actually preempting.

The testcase added here exercises this scenario: a high priority
request is sent while another process keeps submitting requests and
filling its ringbuffer.

Cc: Chris Wilson <chris@chris-wilson.co.uk>

Signed-off-by: Antonio Argenziano <antonio.argenziano@intel.com>
---
 tests/gem_ringfill.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 129 insertions(+), 11 deletions(-)

Comments

Chris Wilson Aug. 15, 2017, 10:35 p.m. UTC | #1
Quoting Antonio Argenziano (2017-08-15 22:44:21)
> Sending as RFC to get feedback on what would be the correct behaviour of
> the driver and, therefore, if the test is valid.

It's not a preemption specific bug. It is fair to say that any client
blocking any other client over a non-contended resource is an issue.
Skip to end for a really easy way to demonstrate this.

> We do a wait while holding the mutex if we are adding a request and figure
> out that there is no more space in the ring buffer.
> Is that considered a bug?

Yes, but it is one of many priority inversion problems we have because
we have a BKL. Timeframe for fixing it is a few more years.

> +static void wait_batch(int fd, uint32_t handle)
> +{
> +       int64_t timeout = 1ull * NSEC_PER_SEC; //1 sec
> +
> +       if(gem_wait(fd, handle, &timeout) != 0) {
> +               //Force reset and fail the test
> +               igt_force_gpu_reset(fd);

Just terminate the spin batches.

> +               igt_assert_f(0, "[0x%x] batch did not complete!", handle);
> +       }
> +}
> +
> +/*
> + * This test checks that is possible for a high priority request to trigger a
> + * preemption if another context has filled its ringbuffer.
> + * The aim of the test is to make sure that high priority requests cannot be
> + * stalled by low priority tasks.
> + * */
> +static void preempt_while_ringbuffer_full(int fd, uint32_t engine)
> +{
> +       uint32_t hp_ctx, lp_ctx;
> +       uint32_t hp_batch;
> +       igt_spin_t *lp_batch;
> +
> +       struct drm_i915_gem_exec_object2 obj[2];
> +       struct drm_i915_gem_relocation_entry reloc[1024];

That's a bit excessive for this pi test, no ?

> +       struct drm_i915_gem_execbuffer2 execbuf;
> +       const unsigned timeout = 10;
> +
> +       lp_ctx = gem_context_create(fd);
> +       ctx_set_priority(fd, lp_ctx, -MAX_PRIO);
> +
> +       hp_ctx = gem_context_create(fd);
> +       ctx_set_priority(fd, hp_ctx, MAX_PRIO);
> +
> +       igt_require(setup_execbuf(fd, &execbuf, obj, reloc, engine) == 0);
> +       execbuf.rsvd1 = lp_ctx;
> +
> +       /*Stall execution and fill ring*/
> +       lp_batch = igt_spin_batch_new(fd, lp_ctx, engine, 0);
> +       igt_fork(child_no, 1) {
> +               fill_ring(fd, &execbuf, 0, timeout);
> +       }
> +
> +       /*We don't know when the ring will be full so keep sending in a loop*/

Yes we do. See measure_ring_size.

static void bind_to_cpu(int cpu)
{
	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	struct sched_param rt = {.sched_priority = 99 };
	cpu_set_t allowed;

	igt_assert(sched_setscheduler(getpid(), SCHED_RR | SCHED_RESET_ON_FORK, &rt) == 0);

	CPU_ZERO(&allowed);
	CPU_SET(cpu % ncpus, &allowed);
	igt_assert(sched_setaffinity(getpid(), sizeof(cpu_set_t), &allowed) == 0);
}

setup timer
execbuf.rsvd1 = ctx_lo;
while (__raw_gem_execbuf(fd, &execbuf) == 0)
	;

/* steal cpu */
bind_to_cpu(0);
igt_fork(child, 1) {
	/* this child is forced to wait for parent to sleep */
	execbuf.rsvd1 = ctx_hi;
	setup timer;
	*success = __raw_gem_execbuf(fd, &execbuf) == 0;
}
setup 2*timer
__raw_gem_execbuf(fd, &execbuf); /* sleeps under mutex, releasing child
*/

igt_terminate_spin_batches();
igt_waitchildren();

igt_assert(*success);

Timer can be safely 10ms.
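
For reference, the "setup timer" and __raw_gem_execbuf steps above are
pseudocode; a minimal sketch of one way to realise them follows. The helper
names are illustrative rather than existing IGT API, and it relies on a plain
ioctl() wrapper because drmIoctl() transparently restarts on EINTR and would
defeat the alarm.

#include <errno.h>
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <drm/i915_drm.h>

static void sigalrm_noop(int sig)
{
	/* Exists only so a blocking ioctl gets interrupted with EINTR. */
}

static void arm_timer_ms(int ms)
{
	struct sigaction sa;
	struct itimerval it;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sigalrm_noop;	/* no SA_RESTART on purpose */
	sigaction(SIGALRM, &sa, NULL);

	memset(&it, 0, sizeof(it));
	it.it_value.tv_usec = ms * 1000;
	it.it_interval.tv_usec = ms * 1000;	/* keep firing until disarmed */
	setitimer(ITIMER_REAL, &it, NULL);
}

static int raw_execbuf(int fd, struct drm_i915_gem_execbuffer2 *eb)
{
	/* Plain ioctl(): returns -EINTR if the alarm fires while we sleep. */
	if (ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, eb))
		return -errno;
	return 0;
}

With those helpers the low-priority loop is just arm_timer_ms(10); followed by
while (raw_execbuf(fd, &execbuf) == 0);, which exits once execbuf sleeps
waiting for ring space and the alarm interrupts it; whether the child's single
high-priority execbuf completes within its own alarm window is then the
pass/fail signal.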

Similarly:

race set-domain (pretty much any GEM ioctl ends up in set-domain) vs
spin-batch, when successful then try any set-domain ioctl from a second
client and observe that it too is blocked on the first client hogging.
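
A rough sketch of that set-domain experiment with two fully independent
clients could look like the following (spin-batch helpers as in IGT of this
era; whether the second client really stalls depends on whether the first
client's wait happens under struct_mutex in the kernel under test, so the
sketch only measures it):

static void set_domain_vs_spin(unsigned engine)
{
	int fd1 = drm_open_driver(DRIVER_INTEL);
	int fd2 = drm_open_driver(DRIVER_INTEL);
	uint32_t scratch = gem_create(fd2, 4096);
	igt_spin_t *spin = igt_spin_batch_new(fd1, 0, engine, 0);
	struct timespec t0, t1;

	igt_fork(child, 1) {
		/* Client 1 waits on its own busy spinner via set-domain. */
		gem_set_domain(fd1, spin->handle,
			       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	}
	usleep(10 * 1000); /* give the child time to block in its wait */

	/* Client 2 touches an unrelated, idle bo; ideally this is instant. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	gem_set_domain(fd2, scratch,
		       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	igt_info("independent set-domain took %.1fms\n",
		 (t1.tv_sec - t0.tv_sec) * 1e3 +
		 (t1.tv_nsec - t0.tv_nsec) * 1e-6);

	igt_spin_batch_free(fd1, spin); /* ends the spinner, unblocks the child */
	igt_waitchildren();
	gem_close(fd2, scratch);
	close(fd2);
	close(fd1);
}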

end:
For the purpose of testing, just create a debugfs that acquires
struct_mutex on opening. Then test every ioctl and trap from a second
client.
-Chris
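
The debugfs blocker suggested above does not exist in i915; a hypothetical
kernel-side sketch, assuming the 2017-era layout where struct_mutex lives in
the embedded drm_device, would be roughly:

/* Hypothetical debugfs entry: holding the file open holds struct_mutex,
 * so a shell can pin the "BKL" while a second client probes which ioctls
 * then block. Not an existing i915 debugfs file.
 */
static int i915_hold_struct_mutex_open(struct inode *inode, struct file *file)
{
	struct drm_i915_private *i915 = inode->i_private;

	return mutex_lock_interruptible(&i915->drm.struct_mutex);
}

static int i915_hold_struct_mutex_release(struct inode *inode, struct file *file)
{
	struct drm_i915_private *i915 = inode->i_private;

	mutex_unlock(&i915->drm.struct_mutex);
	return 0;
}

static const struct file_operations i915_hold_struct_mutex_fops = {
	.owner = THIS_MODULE,
	.open = i915_hold_struct_mutex_open,
	.release = i915_hold_struct_mutex_release,
};

Registered via debugfs_create_file() alongside the other i915 debugfs entries,
keeping the file open from a shell (e.g. exec 3<the-file) would hold the lock
until the descriptor is closed.
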
Chris Wilson Aug. 15, 2017, 11:32 p.m. UTC | #2
Quoting Antonio Argenziano (2017-08-15 22:44:21)
> Sending as RFC to get feedback on what would be the correct behaviour of
> the driver and, therefore, if the test is valid.
> 
> We do a wait while holding the mutex if we are adding a request and figure
> out that there is no more space in the ring buffer.
> Is that considered a bug?

Hmm, note that we may have contention on a struct drm_i915_file_private
lock for execbuf (as handles are in a per-fd namespace). To be 100% sure
that the clients are independent, use separate fds, and also note that we
can only have client separation for execbuf on full-ppgtt.
-Chris
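
A short sketch of that setup; the ppgtt check goes through GETPARAM directly
rather than any particular IGT helper, so treat the helper names as
illustrative:

static bool has_full_ppgtt(int fd)
{
	struct drm_i915_getparam gp;
	int val = 0;

	memset(&gp, 0, sizeof(gp));
	gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
	gp.value = &val;
	ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);

	return val >= 2; /* 0 = none, 1 = aliasing, 2 = full, 3 = full 48-bit */
}

static void two_client_setup(int *fd_lo, int *fd_hi)
{
	/* One fd per client, so per-fd execbuf locking cannot couple them. */
	*fd_lo = drm_open_driver(DRIVER_INTEL);
	*fd_hi = drm_open_driver(DRIVER_INTEL);

	/* Client separation for execbuf only exists with full-ppgtt. */
	igt_require(has_full_ppgtt(*fd_lo));
}

The low- and high-priority contexts from the patch would then be created on
fd_lo and fd_hi respectively before setting their priorities.
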
Chris Wilson Aug. 16, 2017, 7:39 a.m. UTC | #3
Quoting Chris Wilson (2017-08-15 23:35:46)
> end:
> For the purpose of testing, just create a debugfs that acquires
> struct_mutex on opening. Then test every ioctl and trap from a second
> client.

Whilst fun, this is focusing on the current implementation issue and
doesn't define the behaviour you want, namely that any two ioctls from
different clients for disparate resources do not block. The better test
will be setting up the typical conflicts, but playing with a
struct_mutex-blocker will show you just how widespread the immediate
problem is.
-Chris
Antonio Argenziano Aug. 17, 2017, 3:14 p.m. UTC | #4
On 15/08/17 15:35, Chris Wilson wrote:
> Quoting Antonio Argenziano (2017-08-15 22:44:21)
>> Sending as RFC to get feedback on what would be the correct behaviour of
>> the driver and, therefore, if the test is valid.
> 
> It's not a preemption specific bug. It is fair to say that any client
> blocking any other client over a non-contended resource is an issue.
> Skip to end for a really easy way to demonstrate this.

Ok, I'll push a patch then.

> 
>> We do a wait while holding the mutex if we are adding a request and figure
>> out that there is no more space in the ring buffer.
>> Is that considered a bug?
> 
> Yes, but it is one of many priority inversion problems we have because
> we have a BKL. Timeframe for fixing it is a few more years.
> 
>> +static void wait_batch(int fd, uint32_t handle)
>> +{
>> +       int64_t timeout = 1ull * NSEC_PER_SEC; //1 sec
>> +
>> +       if(gem_wait(fd, handle, &timeout) != 0) {
>> +               //Force reset and fail the test
>> +               igt_force_gpu_reset(fd);
> 
> Just terminate the spin batches.
> 
>> +               igt_assert_f(0, "[0x%x] batch did not complete!", handle);
>> +       }
>> +}
>> +
>> +/*
>> + * This test checks that is possible for a high priority request to trigger a
>> + * preemption if another context has filled its ringbuffer.
>> + * The aim of the test is to make sure that high priority requests cannot be
>> + * stalled by low priority tasks.
>> + * */
>> +static void preempt_while_ringbuffer_full(int fd, uint32_t engine)
>> +{
>> +       uint32_t hp_ctx, lp_ctx;
>> +       uint32_t hp_batch;
>> +       igt_spin_t *lp_batch;
>> +
>> +       struct drm_i915_gem_exec_object2 obj[2];
>> +       struct drm_i915_gem_relocation_entry reloc[1024];
> 
> That's a bit excessive for this pi test, no ?

Just wanted to reuse the utility functions in the test with minimal changes.

> 
>> +       struct drm_i915_gem_execbuffer2 execbuf;
>> +       const unsigned timeout = 10;
>> +
>> +       lp_ctx = gem_context_create(fd);
>> +       ctx_set_priority(fd, lp_ctx, -MAX_PRIO);
>> +
>> +       hp_ctx = gem_context_create(fd);
>> +       ctx_set_priority(fd, hp_ctx, MAX_PRIO);
>> +
>> +       igt_require(setup_execbuf(fd, &execbuf, obj, reloc, engine) == 0);
>> +       execbuf.rsvd1 = lp_ctx;
>> +
>> +       /*Stall execution and fill ring*/
>> +       lp_batch = igt_spin_batch_new(fd, lp_ctx, engine, 0);
>> +       igt_fork(child_no, 1) {
>> +               fill_ring(fd, &execbuf, 0, timeout);
>> +       }
>> +
>> +       /*We don't know when the ring will be full so keep sending in a loop*/
> 
> Yes we do. See measure_ring_size.
> 
> static void bind_to_cpu(int cpu)
> {
> 	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> 	struct sched_param rt = {.sched_priority = 99 };
> 	cpu_set_t allowed;
> 
> 	igt_assert(sched_setscheduler(getpid(), SCHED_RR | SCHED_RESET_ON_FORK, &rt) == 0);
> 
> 	CPU_ZERO(&allowed);
> 	CPU_SET(cpu % ncpus, &allowed);
> 	igt_assert(sched_setaffinity(getpid(), sizeof(cpu_set_t), &allowed) == 0);
> }
> 
> setup timer
> execbuf.rsvd1 = ctx_lo;
> while (__raw_gem_execbuf(fd, &execbuf) == 0)
> 	;
> 
> /* steal cpu */
> bind_to_cpu(0);
> igt_fork(child, 1) {
> 	/* this child is forced to wait for parent to sleep */
> 	execbuf.rsvd1 = ctx_hi;
> 	setup timer;
> 	*success = __raw_gem_execbuf(fd, &execbuf) == 0;
> }
> setup 2*timer
> __raw_gem_execbuf(fd, &execbuf); /* sleeps under mutex, releasing child
> */
> 
> igt_terminate_spin_batches();
> igt_waitchildren();
> 
> igt_assert(*success);
> 
> Timer can be safely 10ms.

Isn't this a bit too complicated? Wouldn't a "keep poking at it for a 
while" approach just do the same while being more readable?

-Antonio

> 
> Similarly:
> 
> race set-domain (pretty much any GEM ioctl ends up in set-domain) vs
> spin-batch, when successful then try any set-domain ioctl from a second
> client and observe that it too is blocked on the first client hogging.
> 
> end:
> For the purpose of testing, just create a debugfs that acquires
> struct_mutex on opening. Then test every ioctl and trap from a second
> client.
> -Chris
>
Chris Wilson Aug. 17, 2017, 3:38 p.m. UTC | #5
Quoting Antonio Argenziano (2017-08-17 16:14:04)
> 
> 
> On 15/08/17 15:35, Chris Wilson wrote:
> > Quoting Antonio Argenziano (2017-08-15 22:44:21)
> >> Sending as RFC to get feedback on what would be the correct behaviour of
> >> the driver and, therefore, if the test is valid.
> > 
> > It's not a preemption specific bug. It is fair to say that any client
> > blocking any other client over a non-contended resource is an issue.
> > Skip to end for a really easy way to demonstrate this.
> 
> Ok, I'll push a patch then.
> 
> > 
> >> We do a wait while holding the mutex if we are adding a request and figure
> >> out that there is no more space in the ring buffer.
> >> Is that considered a bug?
> > 
> > Yes, but it is one of many priority inversion problems we have because
> > we have a BKL. Timeframe for fixing it is a few more years.
> > 
> >> +static void wait_batch(int fd, uint32_t handle)
> >> +{
> >> +       int64_t timeout = 1ull * NSEC_PER_SEC; //1 sec
> >> +
> >> +       if(gem_wait(fd, handle, &timeout) != 0) {
> >> +               //Force reset and fail the test
> >> +               igt_force_gpu_reset(fd);
> > 
> > Just terminate the spin batches.
> > 
> >> +               igt_assert_f(0, "[0x%x] batch did not complete!", handle);
> >> +       }
> >> +}
> >> +
> >> +/*
> >> + * This test checks that is possible for a high priority request to trigger a
> >> + * preemption if another context has filled its ringbuffer.
> >> + * The aim of the test is to make sure that high priority requests cannot be
> >> + * stalled by low priority tasks.
> >> + * */
> >> +static void preempt_while_ringbuffer_full(int fd, uint32_t engine)
> >> +{
> >> +       uint32_t hp_ctx, lp_ctx;
> >> +       uint32_t hp_batch;
> >> +       igt_spin_t *lp_batch;
> >> +
> >> +       struct drm_i915_gem_exec_object2 obj[2];
> >> +       struct drm_i915_gem_relocation_entry reloc[1024];
> > 
> > That's a bit excessive for this pi test, no ?
> 
> Just wanted to reuse the utility functions in the test with minimal changes.
> 
> > 
> >> +       struct drm_i915_gem_execbuffer2 execbuf;
> >> +       const unsigned timeout = 10;
> >> +
> >> +       lp_ctx = gem_context_create(fd);
> >> +       ctx_set_priority(fd, lp_ctx, -MAX_PRIO);
> >> +
> >> +       hp_ctx = gem_context_create(fd);
> >> +       ctx_set_priority(fd, hp_ctx, MAX_PRIO);
> >> +
> >> +       igt_require(setup_execbuf(fd, &execbuf, obj, reloc, engine) == 0);
> >> +       execbuf.rsvd1 = lp_ctx;
> >> +
> >> +       /*Stall execution and fill ring*/
> >> +       lp_batch = igt_spin_batch_new(fd, lp_ctx, engine, 0);
> >> +       igt_fork(child_no, 1) {
> >> +               fill_ring(fd, &execbuf, 0, timeout);
> >> +       }
> >> +
> >> +       /*We don't know when the ring will be full so keep sending in a loop*/
> > 
> > Yes we do. See measure_ring_size.
> > 
> > static void bind_to_cpu(int cpu)
> > {
> >       const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> >       struct sched_param rt = {.sched_priority = 99 };
> >       cpu_set_t allowed;
> > 
> >       igt_assert(sched_setscheduler(getpid(), SCHED_RR | SCHED_RESET_ON_FORK, &rt) == 0);
> > 
> >       CPU_ZERO(&allowed);
> >       CPU_SET(cpu % ncpus, &allowed);
> >       igt_assert(sched_setaffinity(getpid(), sizeof(cpu_set_t), &allowed) == 0);
> > }
> > 
> > setup timer
> > execbuf.rsvd1 = ctx_lo;
> > while (__raw_gem_execbuf(fd, &execbuf) == 0)
> >       ;
> > 
> > /* steal cpu */
> > bind_to_cpu(0);
> > igt_fork(child, 1) {
> >       /* this child is forced to wait for parent to sleep */
> >       execbuf.rsvd1 = ctx_hi;
> >       setup timer;
> >       *success = __raw_gem_execbuf(fd, &execbuf) == 0;
> > }
> > setup 2*timer
> > __raw_gem_execbuf(fd, &execbuf); /* sleeps under mutex, releasing child
> > */
> > 
> > igt_terminate_spin_batches();
> > igt_waitchildren();
> > 
> > igt_assert(*success);
> > 
> > Timer can be safely 10ms.
> 
> Isn't this a bit too complicated? Wouldn't a "keep poking at it for a 
> while" approach just do the same while being more readable?

Be explicit and use fine control to describe exactly the behaviour you
want. The case here is that one client exhausts its ring and thereby
blocks not only itself but all other users.

Please don't add this to gem_ringfill; if you consider it a scheduling /
priority inversion issue, add it to gem_exec_scheduler. If you want to
tackle it more generally, as just one of many, many ways a client can
block another client, start a new test. Considering just how general
this problem is, I'd rather we tackle the problem of modelling the
driver and a system to find all such contention points.
-Chris

Patch

diff --git a/tests/gem_ringfill.c b/tests/gem_ringfill.c
index b52996a4..6ca8352e 100644
--- a/tests/gem_ringfill.c
+++ b/tests/gem_ringfill.c
@@ -47,8 +47,31 @@ 
 #define HIBERNATE 0x40
 #define NEWFD 0x80
 
+#define MAX_PRIO 1023
+#define LOCAL_CONTEXT_PARAM_PRIORITY 0x7
+
+IGT_TEST_DESCRIPTION("Check that we can manage the ringbuffer when full");
+
 static unsigned int ring_size;
 
+static int __ctx_set_priority(int fd, uint32_t ctx, int prio)
+{
+	struct local_i915_gem_context_param param;
+
+	memset(&param, 0, sizeof(param));
+	param.context = ctx;
+	param.size = 0;
+	param.param = LOCAL_CONTEXT_PARAM_PRIORITY;
+	param.value = prio;
+
+	return __gem_context_set_param(fd, &param);
+}
+
+static void ctx_set_priority(int fd, uint32_t ctx, int prio)
+{
+	igt_assert_eq(__ctx_set_priority(fd, ctx, prio), 0);
+}
+
 static void check_bo(int fd, uint32_t handle)
 {
 	uint32_t *map;
@@ -324,6 +347,88 @@  static unsigned int measure_ring_size(int fd)
 	return count;
 }
 
+static void exec_noop(int fd, uint32_t ctx, uint32_t engine, uint32_t *handle)
+{
+	uint32_t bbe = MI_BATCH_BUFFER_END;
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 exec;
+
+	gem_require_ring(fd, engine);
+
+	memset(&exec, 0, sizeof(exec));
+
+	*handle = exec.handle = gem_create(fd, 4096);
+
+	gem_write(fd, exec.handle, 0, &bbe, sizeof(bbe));
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	execbuf.buffers_ptr = to_user_pointer(&exec);
+	execbuf.buffer_count = 1;
+	execbuf.flags = engine;
+	execbuf.rsvd1 = ctx;
+
+	gem_execbuf(fd, &execbuf);
+}
+
+static void wait_batch(int fd, uint32_t handle)
+{
+	int64_t timeout = 1ull * NSEC_PER_SEC; //1 sec
+
+	if(gem_wait(fd, handle, &timeout) != 0) {
+		//Force reset and fail the test
+		igt_force_gpu_reset(fd);
+		igt_assert_f(0, "[0x%x] batch did not complete!", handle);
+	}
+}
+
+/*
+ * This test checks that is possible for a high priority request to trigger a
+ * preemption if another context has filled its ringbuffer.
+ * The aim of the test is to make sure that high priority requests cannot be
+ * stalled by low priority tasks.
+ * */
+static void preempt_while_ringbuffer_full(int fd, uint32_t engine)
+{
+	uint32_t hp_ctx, lp_ctx;
+	uint32_t hp_batch;
+	igt_spin_t *lp_batch;
+
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc[1024];
+	struct drm_i915_gem_execbuffer2 execbuf;
+	const unsigned timeout = 10;
+
+	lp_ctx = gem_context_create(fd);
+	ctx_set_priority(fd, lp_ctx, -MAX_PRIO);
+
+	hp_ctx = gem_context_create(fd);
+	ctx_set_priority(fd, hp_ctx, MAX_PRIO);
+
+	igt_require(setup_execbuf(fd, &execbuf, obj, reloc, engine) == 0);
+	execbuf.rsvd1 = lp_ctx;
+
+	/*Stall execution and fill ring*/
+	lp_batch = igt_spin_batch_new(fd, lp_ctx, engine, 0);
+	igt_fork(child_no, 1) {
+		fill_ring(fd, &execbuf, 0, timeout);
+	}
+
+	/*We don't know when the ring will be full so keep sending in a loop*/
+	igt_until_timeout(1) {
+		exec_noop(fd, hp_ctx, engine, &hp_batch);
+
+		/*Preemption expected, if HP batch doesn't complete test fails*/
+		wait_batch(fd, hp_batch);
+		igt_assert(gem_bo_busy(fd, lp_batch->handle));
+		gem_close(fd, hp_batch);
+
+		usleep(100);
+	}
+
+	igt_terminate_spin_batches();
+	igt_waitchildren();
+}
+
 igt_main
 {
 	const struct {
@@ -363,21 +468,34 @@  igt_main
 		igt_require(ring_size);
 	}
 
-	for (m = modes; m->suffix; m++) {
-		const struct intel_execution_engine *e;
-
-		for (e = intel_execution_engines; e->name; e++) {
-			igt_subtest_f("%s%s%s",
-				      m->basic && !e->exec_id ? "basic-" : "",
-				      e->name,
-				      m->suffix) {
-				igt_skip_on(m->flags & NEWFD && master);
-				run_test(fd, e->exec_id | e->flags,
-					 m->flags, m->timeout);
+	igt_subtest_group {
+		for (m = modes; m->suffix; m++) {
+			const struct intel_execution_engine *e;
+
+			for (e = intel_execution_engines; e->name; e++) {
+				igt_subtest_f("%s%s%s",
+						m->basic && !e->exec_id ? "basic-" : "",
+						e->name,
+						m->suffix) {
+					igt_skip_on(m->flags & NEWFD && master);
+					run_test(fd, e->exec_id | e->flags,
+							m->flags, m->timeout);
+				}
 			}
 		}
 	}
 
+	igt_subtest_group {
+			for (const struct intel_execution_engine *e
+					= intel_execution_engines; e->name; e++) {
+				igt_subtest_f("preempt-while-full-%s", e->name) {
+					igt_fork_hang_detector(fd);
+					preempt_while_ringbuffer_full(fd, e->exec_id | e->flags);
+					igt_stop_hang_detector();
+				}
+			}
+	}
+
 	igt_fixture
 		close(fd);
 }