
[i-g-t] i915/gem_ctx_persistence: Race context closure with replace-engines

Message ID 20200211192253.1120964-1-chris@chris-wilson.co.uk
State New, archived
Series [i-g-t] i915/gem_ctx_persistence: Race context closure with replace-engines

Commit Message

Chris Wilson Feb. 11, 2020, 7:22 p.m. UTC
Tvrtko spotted a race condition between replacing a set of hanging
engines and closing the context. So exercise it.

5s is not much time to hit the small window, but a little bit of testing
several times a day is better than nothing.
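
For reference, a minimal sketch of the window this subtest tries to hit
(illustrative only, not part of the patch; it reuses the same uAPI and igt
helpers the test itself relies on). One thread closes the context while
another replaces its engine map through the same fd:

	/* Thread A: close the context that owns the (hanging) engines. */
	static void race_close(int i915, uint32_t ctx)
	{
		gem_context_destroy(i915, ctx);
	}

	/*
	 * Thread B: concurrently swap in a fresh engine map for the same id.
	 * If this can still succeed after the close, the old engines escape
	 * the persistence bookkeeping -- that is the race being exercised.
	 */
	static void race_replace(int i915, uint32_t ctx)
	{
		I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = { .engines = {} };
		struct drm_i915_gem_context_param param = {
			.ctx_id = ctx,
			.param = I915_CONTEXT_PARAM_ENGINES,
			.value = to_user_pointer(&engines),
			.size = sizeof(engines),
		};

		/* losing the race is fine; this may fail with -ENOENT */
		__gem_context_set_param(i915, &param);
	}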

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tests/i915/gem_ctx_persistence.c | 93 ++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

Comments

Tvrtko Ursulin Feb. 13, 2020, 9:07 a.m. UTC | #1
On 11/02/2020 19:22, Chris Wilson wrote:
> Tvrtko spotted a race condition between replacing a set of hanging
> engines and closing the context. So exercise it.
> 
> 5s is not much time to hit the small window, but a little bit of testing
> several times a day is better than nothing.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   tests/i915/gem_ctx_persistence.c | 93 ++++++++++++++++++++++++++++++++
>   1 file changed, 93 insertions(+)
> 
> diff --git a/tests/i915/gem_ctx_persistence.c b/tests/i915/gem_ctx_persistence.c
> index 22f29d25e..6321dbe67 100644
> --- a/tests/i915/gem_ctx_persistence.c
> +++ b/tests/i915/gem_ctx_persistence.c
> @@ -33,6 +33,7 @@
>   #include "i915/gem_engine_topology.h"
>   #include "i915/gem_ring.h"
>   #include "i915/gem_submission.h"
> +#include "igt_aux.h"
>   #include "igt_debugfs.h"
>   #include "igt_dummyload.h"
>   #include "igt_gt.h"
> @@ -803,6 +804,95 @@ static void replace_engines(int i915, const struct intel_execution_engine2 *e)
>   	gem_quiescent_gpu(i915);
>   }
>   
> +struct close_race {
> +	int pipe[2];
> +};
> +
> +static void race_set_engines(int i915, int fd)
> +{
> +	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = {
> +		.engines = {}
> +	};
> +	struct drm_i915_gem_context_param param = {
> +		.param = I915_CONTEXT_PARAM_ENGINES,
> +		.value = to_user_pointer(&engines),
> +		.size = sizeof(engines),
> +	};
> +
> +	while (read(fd, &param.ctx_id, sizeof(param.ctx_id)) > 0) {
> +		if (!param.ctx_id)
> +			break;
> +		__gem_context_set_param(i915, &param);
> +	}
> +}
> +
> +static void close_replace_race(int i915)
> +{
> +	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> +	struct close_race *t;
> +	int fence = -1;
> +
> +	/*
> +	 * Tvrtko being the evil genius noticed that if we could successfully
> +	 * replace a set of engines after the context had been closed, those
> +	 * engines could escape oversight.
> +	 */

I think it would read better if you reworded this a bit so as not to mention 
names and claimed attributes. :)

> +
> +	t = malloc(sizeof(*t) * ncpus);
> +	igt_assert(t);
> +
> +	for (int i = 0; i < ncpus; i++)
> +		igt_assert(pipe(t[i].pipe) == 0);
> +
> +	igt_fork(child, ncpus) {
> +		close(t[child].pipe[1]);
> +		race_set_engines(i915, t[child].pipe[0]);
> +	}
> +
> +	for (int i = 0; i < ncpus; i++)
> +		close(t[i].pipe[0]);
> +
> +	igt_until_timeout(5) {
> +		igt_spin_t *spin;
> +		uint32_t ctx;
> +
> +		ctx = gem_context_clone_with_engines(i915, 0);
> +		gem_context_set_persistence(i915, ctx, true);
> +
> +		spin = igt_spin_new(i915, ctx, .flags = IGT_SPIN_FENCE_OUT);
> +		for (int i = 0; i < ncpus; i++)
> +			write(t[i].pipe[1], &ctx, sizeof(ctx));

It's early so I hope I am not too confused, but the drm client in the forked 
process is a different one. So I think it needs to use threads to be 
able to share.

Regards,

Tvrtko

> +
> +		if (fence < 0) {
> +			fence = spin->out_fence;
> +		} else {
> +			int tmp;
> +
> +			tmp = sync_fence_merge(fence, spin->out_fence);
> +			close(fence);
> +			close(spin->out_fence);
> +
> +			fence = tmp;
> +		}
> +		spin->out_fence = -1;
> +
> +		gem_context_destroy(i915, ctx);
> +	}
> +
> +	for (int i = 0; i < ncpus; i++) {
> +		uint32_t end = 0;
> +		write(t[i].pipe[1], &end, sizeof(end));
> +		close(t[i].pipe[1]);
> +	}
> +	igt_waitchildren();
> +	free(t);
> +
> +	igt_assert(sync_fence_wait(fence, MSEC_PER_SEC / 2) == 0);
> +	close(fence);
> +
> +	gem_quiescent_gpu(i915);
> +}
> +
>   static void replace_engines_hostile(int i915,
>   				    const struct intel_execution_engine2 *e)
>   {
> @@ -961,6 +1051,9 @@ igt_main
>   					replace_engines_hostile(i915, e);
>   			}
>   		}
> +
> +		igt_subtest("close-replace-race")
> +			close_replace_race(i915);
>   	}
>   
>   	igt_fixture {
>
Chris Wilson Feb. 13, 2020, 9:51 a.m. UTC | #2
Quoting Tvrtko Ursulin (2020-02-13 09:07:59)
> 
> On 11/02/2020 19:22, Chris Wilson wrote:
> > +     igt_until_timeout(5) {
> > +             igt_spin_t *spin;
> > +             uint32_t ctx;
> > +
> > +             ctx = gem_context_clone_with_engines(i915, 0);
> > +             gem_context_set_persistence(i915, ctx, true);
> > +
> > +             spin = igt_spin_new(i915, ctx, .flags = IGT_SPIN_FENCE_OUT);
> > +             for (int i = 0; i < ncpus; i++)
> > +                     write(t[i].pipe[1], &ctx, sizeof(ctx));
> 
> It's early so I hope I am not too confused, but the drm client in the forked 
> process is a different one. So I think it needs to use threads to be 
> able to share.

It is using the same fd, so the children have control over the parent's ctx
(and share the ctx id space via the fd).
-Chris
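
(A minimal standalone sketch of that point, using the usual igt helpers:
the context id lives in the fd, and the fd is inherited across fork(), so
the child addresses the same drm client and the same id space.)

	int i915 = drm_open_driver(DRIVER_INTEL);
	uint32_t ctx = gem_context_create(i915);	/* id allocated on this fd */

	igt_fork(child, 1) {
		/* same fd, same client: the parent's ctx id resolves here too */
		struct drm_i915_gem_context_param param = {
			.ctx_id = ctx,
			.param = I915_CONTEXT_PARAM_PRIORITY,
			.value = 0,
		};
		gem_context_set_param(i915, &param);
	}
	igt_waitchildren();
	gem_context_destroy(i915, ctx);
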
Tvrtko Ursulin Feb. 13, 2020, 11:02 a.m. UTC | #3
On 13/02/2020 09:51, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-02-13 09:07:59)
>>
>> On 11/02/2020 19:22, Chris Wilson wrote:
>>> +     igt_until_timeout(5) {
>>> +             igt_spin_t *spin;
>>> +             uint32_t ctx;
>>> +
>>> +             ctx = gem_context_clone_with_engines(i915, 0);
>>> +             gem_context_set_persistence(i915, ctx, true);
>>> +
>>> +             spin = igt_spin_new(i915, ctx, .flags = IGT_SPIN_FENCE_OUT);
>>> +             for (int i = 0; i < ncpus; i++)
>>> +                     write(t[i].pipe[1], &ctx, sizeof(ctx));
>>
>> It's early so I hope I am not too confused, but the drm client in the forked
>> process is a different one. So I think it needs to use threads to be
>> able to share.
> 
> It is using the same fd, so the children have control over the parent's ctx
> (and share the ctx id space via the fd).

It was too early then.

Then with a more neutral comment:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

Patch

diff --git a/tests/i915/gem_ctx_persistence.c b/tests/i915/gem_ctx_persistence.c
index 22f29d25e..6321dbe67 100644
--- a/tests/i915/gem_ctx_persistence.c
+++ b/tests/i915/gem_ctx_persistence.c
@@ -33,6 +33,7 @@ 
 #include "i915/gem_engine_topology.h"
 #include "i915/gem_ring.h"
 #include "i915/gem_submission.h"
+#include "igt_aux.h"
 #include "igt_debugfs.h"
 #include "igt_dummyload.h"
 #include "igt_gt.h"
@@ -803,6 +804,95 @@  static void replace_engines(int i915, const struct intel_execution_engine2 *e)
 	gem_quiescent_gpu(i915);
 }
 
+struct close_race {
+	int pipe[2];
+};
+
+static void race_set_engines(int i915, int fd)
+{
+	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = {
+		.engines = {}
+	};
+	struct drm_i915_gem_context_param param = {
+		.param = I915_CONTEXT_PARAM_ENGINES,
+		.value = to_user_pointer(&engines),
+		.size = sizeof(engines),
+	};
+
+	while (read(fd, &param.ctx_id, sizeof(param.ctx_id)) > 0) {
+		if (!param.ctx_id)
+			break;
+		__gem_context_set_param(i915, &param);
+	}
+}
+
+static void close_replace_race(int i915)
+{
+	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	struct close_race *t;
+	int fence = -1;
+
+	/*
+	 * Tvrtko being the evil genius noticed that if we could successfully
+	 * replace a set of engines after the context had been closed, those
+	 * engines could escape oversight.
+	 */
+
+	t = malloc(sizeof(*t) * ncpus);
+	igt_assert(t);
+
+	for (int i = 0; i < ncpus; i++)
+		igt_assert(pipe(t[i].pipe) == 0);
+
+	igt_fork(child, ncpus) {
+		close(t[child].pipe[1]);
+		race_set_engines(i915, t[child].pipe[0]);
+	}
+
+	for (int i = 0; i < ncpus; i++)
+		close(t[i].pipe[0]);
+
+	igt_until_timeout(5) {
+		igt_spin_t *spin;
+		uint32_t ctx;
+
+		ctx = gem_context_clone_with_engines(i915, 0);
+		gem_context_set_persistence(i915, ctx, true);
+
+		spin = igt_spin_new(i915, ctx, .flags = IGT_SPIN_FENCE_OUT);
+		for (int i = 0; i < ncpus; i++)
+			write(t[i].pipe[1], &ctx, sizeof(ctx));
+
+		if (fence < 0) {
+			fence = spin->out_fence;
+		} else {
+			int tmp;
+
+			tmp = sync_fence_merge(fence, spin->out_fence);
+			close(fence);
+			close(spin->out_fence);
+
+			fence = tmp;
+		}
+		spin->out_fence = -1;
+
+		gem_context_destroy(i915, ctx);
+	}
+
+	for (int i = 0; i < ncpus; i++) {
+		uint32_t end = 0;
+		write(t[i].pipe[1], &end, sizeof(end));
+		close(t[i].pipe[1]);
+	}
+	igt_waitchildren();
+	free(t);
+
+	igt_assert(sync_fence_wait(fence, MSEC_PER_SEC / 2) == 0);
+	close(fence);
+
+	gem_quiescent_gpu(i915);
+}
+
 static void replace_engines_hostile(int i915,
 				    const struct intel_execution_engine2 *e)
 {
@@ -961,6 +1051,9 @@  igt_main
 					replace_engines_hostile(i915, e);
 			}
 		}
+
+		igt_subtest("close-replace-race")
+			close_replace_race(i915);
 	}
 
 	igt_fixture {