Message ID | 20191114191546.149722-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [i-g-t] i915/gem_exec_balancer: Throw a few hangs into the virtual pipelines | expand |
On 14/11/2019 19:15, Chris Wilson wrote: > Although a virtual engine itself has no hang detection; that is on the > underlying physical engines, it does provide a unique means for clients > to try and break the system. Try and break it before they do. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ > 1 file changed, 105 insertions(+) > > diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c > index 70c4529b4..86028cfdd 100644 > --- a/tests/i915/gem_exec_balancer.c > +++ b/tests/i915/gem_exec_balancer.c > @@ -24,6 +24,7 @@ > #include <sched.h> > > #include "igt.h" > +#include "igt_gt.h" > #include "igt_perf.h" > #include "i915/gem_ring.h" > #include "sw_sync.h" > @@ -1314,6 +1315,102 @@ static void semaphore(int i915) > gem_quiescent_gpu(i915); > } > > +static void set_unbannable(int i915, uint32_t ctx) > +{ > + struct drm_i915_gem_context_param p = { > + .ctx_id = ctx, > + .param = I915_CONTEXT_PARAM_BANNABLE, > + }; > + > + igt_assert_eq(__gem_context_set_param(i915, &p), 0); > +} > + > +static void hangme(int i915) > +{ > + struct drm_i915_gem_exec_object2 batch = { > + .handle = batch_create(i915), > + }; > + > + /* > + * Fill the available engines with hanging virtual engines and verify > + * that execution continues onto the second batch. 
> + */ > + > + for (int class = 1; class < 32; class++) { > + struct i915_engine_class_instance *ci; > + struct client { > + igt_spin_t *spin[2]; > + } *client; > + unsigned int count; > + uint32_t bg; > + > + ci = list_engines(i915, 1u << class, &count); > + if (!ci) > + continue; > + > + if (count < 2) { > + free(ci); > + continue; > + } > + > + client = malloc(sizeof(*client) * count); > + igt_assert(client); > + > + for (int i = 0; i < count; i++) { > + uint32_t ctx = gem_context_create(i915); > + struct client *c = &client[i]; > + unsigned int flags; > + > + set_unbannable(i915, ctx); > + set_load_balancer(i915, ctx, ci, count, NULL); > + > + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; > + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { > + c->spin[j] = igt_spin_new(i915, ctx, > + .flags = flags); > + flags = IGT_SPIN_FENCE_OUT; > + } > + > + gem_context_destroy(i915, ctx); > + } > + > + /* Apply some background context to speed up hang detection */ > + bg = gem_context_create(i915); > + set_engines(i915, bg, ci, count); > + for (int i = 0; i < count; i++) { > + struct drm_i915_gem_execbuffer2 execbuf = { > + .buffers_ptr = to_user_pointer(&batch), > + .buffer_count = 1, > + .flags = i, > + .rsvd1 = bg, > + }; > + gem_execbuf(i915, &execbuf); > + } > + gem_context_destroy(i915, bg); > + > + for (int i = 0; i < count; i++) { > + struct client *c = &client[i]; > + > + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); > + gem_sync(i915, c->spin[0]->handle); > + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), > + -EIO); > + > + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); > + gem_sync(i915, c->spin[1]->handle); > + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), > + -EIO); > + > + igt_spin_free(i915, c->spin[0]); > + igt_spin_free(i915, c->spin[1]); > + } > + free(client); > + } > + > + gem_close(i915, batch.handle); > + gem_quiescent_gpu(i915); > +} > + > static void smoketest(int i915, int timeout) > { > struct 
drm_i915_gem_exec_object2 batch[2] = { > @@ -1486,4 +1583,12 @@ igt_main > igt_fixture { > igt_stop_hang_detector(); > } > + > + igt_subtest("hang") { > + igt_hang_t hang = igt_allow_hang(i915, 0, 0); > + > + hangme(i915); > + > + igt_disallow_hang(i915, hang); > + } > } > Looks good. But do we need some core helpers to figure out when preempt timeout is compiled out? Regards, Tvrtko
Quoting Tvrtko Ursulin (2019-11-15 13:02:24) > > On 14/11/2019 19:15, Chris Wilson wrote: > > Although a virtual engine itself has no hang detection; that is on the > > underlying physical engines, it does provide a unique means for clients > > to try and break the system. Try and break it before they do. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > --- > > tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ > > 1 file changed, 105 insertions(+) > > > > diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c > > index 70c4529b4..86028cfdd 100644 > > --- a/tests/i915/gem_exec_balancer.c > > +++ b/tests/i915/gem_exec_balancer.c > > @@ -24,6 +24,7 @@ > > #include <sched.h> > > > > #include "igt.h" > > +#include "igt_gt.h" > > #include "igt_perf.h" > > #include "i915/gem_ring.h" > > #include "sw_sync.h" > > @@ -1314,6 +1315,102 @@ static void semaphore(int i915) > > gem_quiescent_gpu(i915); > > } > > > > +static void set_unbannable(int i915, uint32_t ctx) > > +{ > > + struct drm_i915_gem_context_param p = { > > + .ctx_id = ctx, > > + .param = I915_CONTEXT_PARAM_BANNABLE, > > + }; > > + > > + igt_assert_eq(__gem_context_set_param(i915, &p), 0); > > +} > > + > > +static void hangme(int i915) > > +{ > > + struct drm_i915_gem_exec_object2 batch = { > > + .handle = batch_create(i915), > > + }; > > + > > + /* > > + * Fill the available engines with hanging virtual engines and verify > > + * that execution continues onto the second batch. 
> > + */ > > + > > + for (int class = 1; class < 32; class++) { > > + struct i915_engine_class_instance *ci; > > + struct client { > > + igt_spin_t *spin[2]; > > + } *client; > > + unsigned int count; > > + uint32_t bg; > > + > > + ci = list_engines(i915, 1u << class, &count); > > + if (!ci) > > + continue; > > + > > + if (count < 2) { > > + free(ci); > > + continue; > > + } > > + > > + client = malloc(sizeof(*client) * count); > > + igt_assert(client); > > + > > + for (int i = 0; i < count; i++) { > > + uint32_t ctx = gem_context_create(i915); > > + struct client *c = &client[i]; > > + unsigned int flags; > > + > > + set_unbannable(i915, ctx); > > + set_load_balancer(i915, ctx, ci, count, NULL); > > + > > + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; > > + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { > > + c->spin[j] = igt_spin_new(i915, ctx, > > + .flags = flags); > > + flags = IGT_SPIN_FENCE_OUT; > > + } > > + > > + gem_context_destroy(i915, ctx); > > + } > > + > > + /* Apply some background context to speed up hang detection */ > > + bg = gem_context_create(i915); > > + set_engines(i915, bg, ci, count); > > + for (int i = 0; i < count; i++) { > > + struct drm_i915_gem_execbuffer2 execbuf = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .flags = i, > > + .rsvd1 = bg, > > + }; > > + gem_execbuf(i915, &execbuf); > > + } > > + gem_context_destroy(i915, bg); > > + > > + for (int i = 0; i < count; i++) { > > + struct client *c = &client[i]; > > + > > + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); > > + gem_sync(i915, c->spin[0]->handle); > > + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), > > + -EIO); > > + > > + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); > > + gem_sync(i915, c->spin[1]->handle); > > + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), > > + -EIO); > > + > > + igt_spin_free(i915, c->spin[0]); > > + igt_spin_free(i915, c->spin[1]); > > + } > > + free(client); > > + } > > + 
> > + gem_close(i915, batch.handle); > > + gem_quiescent_gpu(i915); > > +} > > + > > static void smoketest(int i915, int timeout) > > { > > struct drm_i915_gem_exec_object2 batch[2] = { > > @@ -1486,4 +1583,12 @@ igt_main > > igt_fixture { > > igt_stop_hang_detector(); > > } > > + > > + igt_subtest("hang") { > > + igt_hang_t hang = igt_allow_hang(i915, 0, 0); > > + > > + hangme(i915); > > + > > + igt_disallow_hang(i915, hang); > > + } > > } > > > > Looks good. But do we need some core helpers to figure out when preempt > timeout is compiled out? It should still work the same, but slower; 10s hang detection rather than ~200ms. -Chris
On 15/11/2019 13:09, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2019-11-15 13:02:24) >> >> On 14/11/2019 19:15, Chris Wilson wrote: >>> Although a virtual engine itself has no hang detection; that is on the >>> underlying physical engines, it does provide a unique means for clients >>> to try and break the system. Try and break it before they do. >>> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>> --- >>> tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ >>> 1 file changed, 105 insertions(+) >>> >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c >>> index 70c4529b4..86028cfdd 100644 >>> --- a/tests/i915/gem_exec_balancer.c >>> +++ b/tests/i915/gem_exec_balancer.c >>> @@ -24,6 +24,7 @@ >>> #include <sched.h> >>> >>> #include "igt.h" >>> +#include "igt_gt.h" >>> #include "igt_perf.h" >>> #include "i915/gem_ring.h" >>> #include "sw_sync.h" >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915) >>> gem_quiescent_gpu(i915); >>> } >>> >>> +static void set_unbannable(int i915, uint32_t ctx) >>> +{ >>> + struct drm_i915_gem_context_param p = { >>> + .ctx_id = ctx, >>> + .param = I915_CONTEXT_PARAM_BANNABLE, >>> + }; >>> + >>> + igt_assert_eq(__gem_context_set_param(i915, &p), 0); >>> +} >>> + >>> +static void hangme(int i915) >>> +{ >>> + struct drm_i915_gem_exec_object2 batch = { >>> + .handle = batch_create(i915), >>> + }; >>> + >>> + /* >>> + * Fill the available engines with hanging virtual engines and verify >>> + * that execution continues onto the second batch. 
>>> + */ >>> + >>> + for (int class = 1; class < 32; class++) { >>> + struct i915_engine_class_instance *ci; >>> + struct client { >>> + igt_spin_t *spin[2]; >>> + } *client; >>> + unsigned int count; >>> + uint32_t bg; >>> + >>> + ci = list_engines(i915, 1u << class, &count); >>> + if (!ci) >>> + continue; >>> + >>> + if (count < 2) { >>> + free(ci); >>> + continue; >>> + } >>> + >>> + client = malloc(sizeof(*client) * count); >>> + igt_assert(client); >>> + >>> + for (int i = 0; i < count; i++) { >>> + uint32_t ctx = gem_context_create(i915); >>> + struct client *c = &client[i]; >>> + unsigned int flags; >>> + >>> + set_unbannable(i915, ctx); >>> + set_load_balancer(i915, ctx, ci, count, NULL); >>> + >>> + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; >>> + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { >>> + c->spin[j] = igt_spin_new(i915, ctx, >>> + .flags = flags); >>> + flags = IGT_SPIN_FENCE_OUT; >>> + } >>> + >>> + gem_context_destroy(i915, ctx); >>> + } >>> + >>> + /* Apply some background context to speed up hang detection */ >>> + bg = gem_context_create(i915); >>> + set_engines(i915, bg, ci, count); >>> + for (int i = 0; i < count; i++) { >>> + struct drm_i915_gem_execbuffer2 execbuf = { >>> + .buffers_ptr = to_user_pointer(&batch), >>> + .buffer_count = 1, >>> + .flags = i, >>> + .rsvd1 = bg, >>> + }; >>> + gem_execbuf(i915, &execbuf); >>> + } >>> + gem_context_destroy(i915, bg); >>> + >>> + for (int i = 0; i < count; i++) { >>> + struct client *c = &client[i]; >>> + >>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); >>> + gem_sync(i915, c->spin[0]->handle); >>> + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), >>> + -EIO); >>> + >>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); >>> + gem_sync(i915, c->spin[1]->handle); >>> + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), >>> + -EIO); >>> + >>> + igt_spin_free(i915, c->spin[0]); >>> + igt_spin_free(i915, c->spin[1]); >>> + } >>> + free(client); >>> + } >>> + 
>>> + gem_close(i915, batch.handle); >>> + gem_quiescent_gpu(i915); >>> +} >>> + >>> static void smoketest(int i915, int timeout) >>> { >>> struct drm_i915_gem_exec_object2 batch[2] = { >>> @@ -1486,4 +1583,12 @@ igt_main >>> igt_fixture { >>> igt_stop_hang_detector(); >>> } >>> + >>> + igt_subtest("hang") { >>> + igt_hang_t hang = igt_allow_hang(i915, 0, 0); >>> + >>> + hangme(i915); >>> + >>> + igt_disallow_hang(i915, hang); >>> + } >>> } >>> >> >> Looks good. But do we need some core helpers to figure out when preempt >> timeout is compiled out? > > It should still work the same, but slower; 10s hang detection rather > than ~200ms. You are talking about old hangcheck? I was thinking about all new Kconfig's compiled out. No heartbeats, no preemption timeout. Still works? Regards, Tvrtko
Quoting Tvrtko Ursulin (2019-11-15 14:52:16) > > On 15/11/2019 13:09, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2019-11-15 13:02:24) > >> > >> On 14/11/2019 19:15, Chris Wilson wrote: > >>> Although a virtual engine itself has no hang detection; that is on the > >>> underlying physical engines, it does provide a unique means for clients > >>> to try and break the system. Try and break it before they do. > >>> > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >>> --- > >>> tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ > >>> 1 file changed, 105 insertions(+) > >>> > >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c > >>> index 70c4529b4..86028cfdd 100644 > >>> --- a/tests/i915/gem_exec_balancer.c > >>> +++ b/tests/i915/gem_exec_balancer.c > >>> @@ -24,6 +24,7 @@ > >>> #include <sched.h> > >>> > >>> #include "igt.h" > >>> +#include "igt_gt.h" > >>> #include "igt_perf.h" > >>> #include "i915/gem_ring.h" > >>> #include "sw_sync.h" > >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915) > >>> gem_quiescent_gpu(i915); > >>> } > >>> > >>> +static void set_unbannable(int i915, uint32_t ctx) > >>> +{ > >>> + struct drm_i915_gem_context_param p = { > >>> + .ctx_id = ctx, > >>> + .param = I915_CONTEXT_PARAM_BANNABLE, > >>> + }; > >>> + > >>> + igt_assert_eq(__gem_context_set_param(i915, &p), 0); > >>> +} > >>> + > >>> +static void hangme(int i915) > >>> +{ > >>> + struct drm_i915_gem_exec_object2 batch = { > >>> + .handle = batch_create(i915), > >>> + }; > >>> + > >>> + /* > >>> + * Fill the available engines with hanging virtual engines and verify > >>> + * that execution continues onto the second batch. 
> >>> + */ > >>> + > >>> + for (int class = 1; class < 32; class++) { > >>> + struct i915_engine_class_instance *ci; > >>> + struct client { > >>> + igt_spin_t *spin[2]; > >>> + } *client; > >>> + unsigned int count; > >>> + uint32_t bg; > >>> + > >>> + ci = list_engines(i915, 1u << class, &count); > >>> + if (!ci) > >>> + continue; > >>> + > >>> + if (count < 2) { > >>> + free(ci); > >>> + continue; > >>> + } > >>> + > >>> + client = malloc(sizeof(*client) * count); > >>> + igt_assert(client); > >>> + > >>> + for (int i = 0; i < count; i++) { > >>> + uint32_t ctx = gem_context_create(i915); > >>> + struct client *c = &client[i]; > >>> + unsigned int flags; > >>> + > >>> + set_unbannable(i915, ctx); > >>> + set_load_balancer(i915, ctx, ci, count, NULL); > >>> + > >>> + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; > >>> + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { > >>> + c->spin[j] = igt_spin_new(i915, ctx, > >>> + .flags = flags); > >>> + flags = IGT_SPIN_FENCE_OUT; > >>> + } > >>> + > >>> + gem_context_destroy(i915, ctx); > >>> + } > >>> + > >>> + /* Apply some background context to speed up hang detection */ > >>> + bg = gem_context_create(i915); > >>> + set_engines(i915, bg, ci, count); > >>> + for (int i = 0; i < count; i++) { > >>> + struct drm_i915_gem_execbuffer2 execbuf = { > >>> + .buffers_ptr = to_user_pointer(&batch), > >>> + .buffer_count = 1, > >>> + .flags = i, > >>> + .rsvd1 = bg, > >>> + }; > >>> + gem_execbuf(i915, &execbuf); > >>> + } > >>> + gem_context_destroy(i915, bg); > >>> + > >>> + for (int i = 0; i < count; i++) { > >>> + struct client *c = &client[i]; > >>> + > >>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); > >>> + gem_sync(i915, c->spin[0]->handle); > >>> + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), > >>> + -EIO); > >>> + > >>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); > >>> + gem_sync(i915, c->spin[1]->handle); > >>> + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), > >>> + 
-EIO); > >>> + > >>> + igt_spin_free(i915, c->spin[0]); > >>> + igt_spin_free(i915, c->spin[1]); > >>> + } > >>> + free(client); > >>> + } > >>> + > >>> + gem_close(i915, batch.handle); > >>> + gem_quiescent_gpu(i915); > >>> +} > >>> + > >>> static void smoketest(int i915, int timeout) > >>> { > >>> struct drm_i915_gem_exec_object2 batch[2] = { > >>> @@ -1486,4 +1583,12 @@ igt_main > >>> igt_fixture { > >>> igt_stop_hang_detector(); > >>> } > >>> + > >>> + igt_subtest("hang") { > >>> + igt_hang_t hang = igt_allow_hang(i915, 0, 0); > >>> + > >>> + hangme(i915); > >>> + > >>> + igt_disallow_hang(i915, hang); > >>> + } > >>> } > >>> > >> > >> Looks good. But do we need some core helpers to figure out when preempt > >> timeout is compiled out? > > > > It should still work the same, but slower; 10s hang detection rather > > than ~200ms. > > You are talking about old hangcheck? I was thinking about all new > Kconfig's compiled out. No heartbeats, no preemption timeout. Still works? Works even faster. :) The spinners then get killed when the contexts are closed (default is non-persistent contexts if you disable heartbeats entirely). The challenge is really on the per-engine heartbeat controls to make sure we kick off the dead contexts, but that's for the future. -Chris
Quoting Chris Wilson (2019-11-15 14:58:00) > Quoting Tvrtko Ursulin (2019-11-15 14:52:16) > > > > On 15/11/2019 13:09, Chris Wilson wrote: > > > Quoting Tvrtko Ursulin (2019-11-15 13:02:24) > > >> > > >> On 14/11/2019 19:15, Chris Wilson wrote: > > >>> Although a virtual engine itself has no hang detection; that is on the > > >>> underlying physical engines, it does provide a unique means for clients > > >>> to try and break the system. Try and break it before they do. > > >>> > > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > >>> --- > > >>> tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ > > >>> 1 file changed, 105 insertions(+) > > >>> > > >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c > > >>> index 70c4529b4..86028cfdd 100644 > > >>> --- a/tests/i915/gem_exec_balancer.c > > >>> +++ b/tests/i915/gem_exec_balancer.c > > >>> @@ -24,6 +24,7 @@ > > >>> #include <sched.h> > > >>> > > >>> #include "igt.h" > > >>> +#include "igt_gt.h" > > >>> #include "igt_perf.h" > > >>> #include "i915/gem_ring.h" > > >>> #include "sw_sync.h" > > >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915) > > >>> gem_quiescent_gpu(i915); > > >>> } > > >>> > > >>> +static void set_unbannable(int i915, uint32_t ctx) > > >>> +{ > > >>> + struct drm_i915_gem_context_param p = { > > >>> + .ctx_id = ctx, > > >>> + .param = I915_CONTEXT_PARAM_BANNABLE, > > >>> + }; > > >>> + > > >>> + igt_assert_eq(__gem_context_set_param(i915, &p), 0); > > >>> +} > > >>> + > > >>> +static void hangme(int i915) > > >>> +{ > > >>> + struct drm_i915_gem_exec_object2 batch = { > > >>> + .handle = batch_create(i915), > > >>> + }; > > >>> + > > >>> + /* > > >>> + * Fill the available engines with hanging virtual engines and verify > > >>> + * that execution continues onto the second batch. 
> > >>> + */ > > >>> + > > >>> + for (int class = 1; class < 32; class++) { > > >>> + struct i915_engine_class_instance *ci; > > >>> + struct client { > > >>> + igt_spin_t *spin[2]; > > >>> + } *client; > > >>> + unsigned int count; > > >>> + uint32_t bg; > > >>> + > > >>> + ci = list_engines(i915, 1u << class, &count); > > >>> + if (!ci) > > >>> + continue; > > >>> + > > >>> + if (count < 2) { > > >>> + free(ci); > > >>> + continue; > > >>> + } > > >>> + > > >>> + client = malloc(sizeof(*client) * count); > > >>> + igt_assert(client); > > >>> + > > >>> + for (int i = 0; i < count; i++) { > > >>> + uint32_t ctx = gem_context_create(i915); > > >>> + struct client *c = &client[i]; > > >>> + unsigned int flags; > > >>> + > > >>> + set_unbannable(i915, ctx); > > >>> + set_load_balancer(i915, ctx, ci, count, NULL); > > >>> + > > >>> + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; > > >>> + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { > > >>> + c->spin[j] = igt_spin_new(i915, ctx, > > >>> + .flags = flags); > > >>> + flags = IGT_SPIN_FENCE_OUT; > > >>> + } > > >>> + > > >>> + gem_context_destroy(i915, ctx); > > >>> + } > > >>> + > > >>> + /* Apply some background context to speed up hang detection */ > > >>> + bg = gem_context_create(i915); > > >>> + set_engines(i915, bg, ci, count); > > >>> + for (int i = 0; i < count; i++) { > > >>> + struct drm_i915_gem_execbuffer2 execbuf = { > > >>> + .buffers_ptr = to_user_pointer(&batch), > > >>> + .buffer_count = 1, > > >>> + .flags = i, > > >>> + .rsvd1 = bg, > > >>> + }; > > >>> + gem_execbuf(i915, &execbuf); > > >>> + } > > >>> + gem_context_destroy(i915, bg); > > >>> + > > >>> + for (int i = 0; i < count; i++) { > > >>> + struct client *c = &client[i]; > > >>> + > > >>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); > > >>> + gem_sync(i915, c->spin[0]->handle); > > >>> + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), > > >>> + -EIO); > > >>> + > > >>> + igt_debug("Waiting for 
client[%d].spin[%d]\n", i, 1); > > >>> + gem_sync(i915, c->spin[1]->handle); > > >>> + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), > > >>> + -EIO); > > >>> + > > >>> + igt_spin_free(i915, c->spin[0]); > > >>> + igt_spin_free(i915, c->spin[1]); > > >>> + } > > >>> + free(client); > > >>> + } > > >>> + > > >>> + gem_close(i915, batch.handle); > > >>> + gem_quiescent_gpu(i915); > > >>> +} > > >>> + > > >>> static void smoketest(int i915, int timeout) > > >>> { > > >>> struct drm_i915_gem_exec_object2 batch[2] = { > > >>> @@ -1486,4 +1583,12 @@ igt_main > > >>> igt_fixture { > > >>> igt_stop_hang_detector(); > > >>> } > > >>> + > > >>> + igt_subtest("hang") { > > >>> + igt_hang_t hang = igt_allow_hang(i915, 0, 0); > > >>> + > > >>> + hangme(i915); > > >>> + > > >>> + igt_disallow_hang(i915, hang); > > >>> + } > > >>> } > > >>> > > >> > > >> Looks good. But do we need some core helpers to figure out when preempt > > >> timeout is compiled out? > > > > > > It should still work the same, but slower; 10s hang detection rather > > > than ~200ms. > > > > You are talking about old hangcheck? I was thinking about all new > > Kconfig's compiled out. No heartbeats, no preemption timeout. Still works? > > Works even faster. :) > > The spinners then get killed when the contexts are closed (default is > non-persistent contexts if you disable heartbeats entirely). The > challenge is really on the per-engine heartbeat controls to make sure we > kick off the dead contexts, but that's for the future. And for the other kconfig, with no preemption timeout, you just get regular heartbeats, so roughly the 10s hangcheck timeout. -Chris
On 15/11/2019 14:59, Chris Wilson wrote: > Quoting Chris Wilson (2019-11-15 14:58:00) >> Quoting Tvrtko Ursulin (2019-11-15 14:52:16) >>> >>> On 15/11/2019 13:09, Chris Wilson wrote: >>>> Quoting Tvrtko Ursulin (2019-11-15 13:02:24) >>>>> >>>>> On 14/11/2019 19:15, Chris Wilson wrote: >>>>>> Although a virtual engine itself has no hang detection; that is on the >>>>>> underlying physical engines, it does provide a unique means for clients >>>>>> to try and break the system. Try and break it before they do. >>>>>> >>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>>>> --- >>>>>> tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ >>>>>> 1 file changed, 105 insertions(+) >>>>>> >>>>>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c >>>>>> index 70c4529b4..86028cfdd 100644 >>>>>> --- a/tests/i915/gem_exec_balancer.c >>>>>> +++ b/tests/i915/gem_exec_balancer.c >>>>>> @@ -24,6 +24,7 @@ >>>>>> #include <sched.h> >>>>>> >>>>>> #include "igt.h" >>>>>> +#include "igt_gt.h" >>>>>> #include "igt_perf.h" >>>>>> #include "i915/gem_ring.h" >>>>>> #include "sw_sync.h" >>>>>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915) >>>>>> gem_quiescent_gpu(i915); >>>>>> } >>>>>> >>>>>> +static void set_unbannable(int i915, uint32_t ctx) >>>>>> +{ >>>>>> + struct drm_i915_gem_context_param p = { >>>>>> + .ctx_id = ctx, >>>>>> + .param = I915_CONTEXT_PARAM_BANNABLE, >>>>>> + }; >>>>>> + >>>>>> + igt_assert_eq(__gem_context_set_param(i915, &p), 0); >>>>>> +} >>>>>> + >>>>>> +static void hangme(int i915) >>>>>> +{ >>>>>> + struct drm_i915_gem_exec_object2 batch = { >>>>>> + .handle = batch_create(i915), >>>>>> + }; >>>>>> + >>>>>> + /* >>>>>> + * Fill the available engines with hanging virtual engines and verify >>>>>> + * that execution continues onto the second batch. 
>>>>>> + */ >>>>>> + >>>>>> + for (int class = 1; class < 32; class++) { >>>>>> + struct i915_engine_class_instance *ci; >>>>>> + struct client { >>>>>> + igt_spin_t *spin[2]; >>>>>> + } *client; >>>>>> + unsigned int count; >>>>>> + uint32_t bg; >>>>>> + >>>>>> + ci = list_engines(i915, 1u << class, &count); >>>>>> + if (!ci) >>>>>> + continue; >>>>>> + >>>>>> + if (count < 2) { >>>>>> + free(ci); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + client = malloc(sizeof(*client) * count); >>>>>> + igt_assert(client); >>>>>> + >>>>>> + for (int i = 0; i < count; i++) { >>>>>> + uint32_t ctx = gem_context_create(i915); >>>>>> + struct client *c = &client[i]; >>>>>> + unsigned int flags; >>>>>> + >>>>>> + set_unbannable(i915, ctx); >>>>>> + set_load_balancer(i915, ctx, ci, count, NULL); >>>>>> + >>>>>> + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; >>>>>> + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { >>>>>> + c->spin[j] = igt_spin_new(i915, ctx, >>>>>> + .flags = flags); >>>>>> + flags = IGT_SPIN_FENCE_OUT; >>>>>> + } >>>>>> + >>>>>> + gem_context_destroy(i915, ctx); >>>>>> + } >>>>>> + >>>>>> + /* Apply some background context to speed up hang detection */ >>>>>> + bg = gem_context_create(i915); >>>>>> + set_engines(i915, bg, ci, count); >>>>>> + for (int i = 0; i < count; i++) { >>>>>> + struct drm_i915_gem_execbuffer2 execbuf = { >>>>>> + .buffers_ptr = to_user_pointer(&batch), >>>>>> + .buffer_count = 1, >>>>>> + .flags = i, >>>>>> + .rsvd1 = bg, >>>>>> + }; >>>>>> + gem_execbuf(i915, &execbuf); >>>>>> + } >>>>>> + gem_context_destroy(i915, bg); >>>>>> + >>>>>> + for (int i = 0; i < count; i++) { >>>>>> + struct client *c = &client[i]; >>>>>> + >>>>>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); >>>>>> + gem_sync(i915, c->spin[0]->handle); >>>>>> + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), >>>>>> + -EIO); >>>>>> + >>>>>> + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); >>>>>> + gem_sync(i915, c->spin[1]->handle); >>>>>> + 
igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), >>>>>> + -EIO); >>>>>> + >>>>>> + igt_spin_free(i915, c->spin[0]); >>>>>> + igt_spin_free(i915, c->spin[1]); >>>>>> + } >>>>>> + free(client); >>>>>> + } >>>>>> + >>>>>> + gem_close(i915, batch.handle); >>>>>> + gem_quiescent_gpu(i915); >>>>>> +} >>>>>> + >>>>>> static void smoketest(int i915, int timeout) >>>>>> { >>>>>> struct drm_i915_gem_exec_object2 batch[2] = { >>>>>> @@ -1486,4 +1583,12 @@ igt_main >>>>>> igt_fixture { >>>>>> igt_stop_hang_detector(); >>>>>> } >>>>>> + >>>>>> + igt_subtest("hang") { >>>>>> + igt_hang_t hang = igt_allow_hang(i915, 0, 0); >>>>>> + >>>>>> + hangme(i915); >>>>>> + >>>>>> + igt_disallow_hang(i915, hang); >>>>>> + } >>>>>> } >>>>>> >>>>> >>>>> Looks good. But do we need some core helpers to figure out when preempt >>>>> timeout is compiled out? >>>> >>>> It should still work the same, but slower; 10s hang detection rather >>>> than ~200ms. >>> >>> You are talking about old hangcheck? I was thinking about all new >>> Kconfig's compiled out. No heartbeats, no preemption timeout. Still works? >> >> Works even faster. :) >> >> The spinners then get killed when the contexts are closed (default is >> non-persistent contexts if you disable heartbeats entirely). The >> challenge is really on the per-engine heartbeat controls to make sure we >> kick off the dead contexts, but that's for the future. > > And for the other kconfig, with no preemption timeout, you just get > regular heartbeats, so roughly the 10s hangcheck timeout. Good then. No other opens: Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko
diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c index 70c4529b4..86028cfdd 100644 --- a/tests/i915/gem_exec_balancer.c +++ b/tests/i915/gem_exec_balancer.c @@ -24,6 +24,7 @@ #include <sched.h> #include "igt.h" +#include "igt_gt.h" #include "igt_perf.h" #include "i915/gem_ring.h" #include "sw_sync.h" @@ -1314,6 +1315,102 @@ static void semaphore(int i915) gem_quiescent_gpu(i915); } +static void set_unbannable(int i915, uint32_t ctx) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = ctx, + .param = I915_CONTEXT_PARAM_BANNABLE, + }; + + igt_assert_eq(__gem_context_set_param(i915, &p), 0); +} + +static void hangme(int i915) +{ + struct drm_i915_gem_exec_object2 batch = { + .handle = batch_create(i915), + }; + + /* + * Fill the available engines with hanging virtual engines and verify + * that execution continues onto the second batch. + */ + + for (int class = 1; class < 32; class++) { + struct i915_engine_class_instance *ci; + struct client { + igt_spin_t *spin[2]; + } *client; + unsigned int count; + uint32_t bg; + + ci = list_engines(i915, 1u << class, &count); + if (!ci) + continue; + + if (count < 2) { + free(ci); + continue; + } + + client = malloc(sizeof(*client) * count); + igt_assert(client); + + for (int i = 0; i < count; i++) { + uint32_t ctx = gem_context_create(i915); + struct client *c = &client[i]; + unsigned int flags; + + set_unbannable(i915, ctx); + set_load_balancer(i915, ctx, ci, count, NULL); + + flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION; + for (int j = 0; j < ARRAY_SIZE(c->spin); j++) { + c->spin[j] = igt_spin_new(i915, ctx, + .flags = flags); + flags = IGT_SPIN_FENCE_OUT; + } + + gem_context_destroy(i915, ctx); + } + + /* Apply some background context to speed up hang detection */ + bg = gem_context_create(i915); + set_engines(i915, bg, ci, count); + for (int i = 0; i < count; i++) { + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = to_user_pointer(&batch), + .buffer_count = 1, + .flags = 
i, + .rsvd1 = bg, + }; + gem_execbuf(i915, &execbuf); + } + gem_context_destroy(i915, bg); + + for (int i = 0; i < count; i++) { + struct client *c = &client[i]; + + igt_debug("Waiting for client[%d].spin[%d]\n", i, 0); + gem_sync(i915, c->spin[0]->handle); + igt_assert_eq(sync_fence_status(c->spin[0]->out_fence), + -EIO); + + igt_debug("Waiting for client[%d].spin[%d]\n", i, 1); + gem_sync(i915, c->spin[1]->handle); + igt_assert_eq(sync_fence_status(c->spin[1]->out_fence), + -EIO); + + igt_spin_free(i915, c->spin[0]); + igt_spin_free(i915, c->spin[1]); + } + free(client); + } + + gem_close(i915, batch.handle); + gem_quiescent_gpu(i915); +} + static void smoketest(int i915, int timeout) { struct drm_i915_gem_exec_object2 batch[2] = { @@ -1486,4 +1583,12 @@ igt_main igt_fixture { igt_stop_hang_detector(); } + + igt_subtest("hang") { + igt_hang_t hang = igt_allow_hang(i915, 0, 0); + + hangme(i915); + + igt_disallow_hang(i915, hang); + } }
Although a virtual engine itself has no hang detection (that is handled on the underlying physical engines), it does provide a unique means for clients to try and break the system. Try and break it before they do. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+)