| Message ID | 20180514080251.11224-6-chris@chris-wilson.co.uk (mailing list archive) |
| --- | --- |
| State | New, archived |
On 14/05/2018 09:02, Chris Wilson wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> We want to make sure RT tasks which use a lot of CPU time can submit
> batch buffers with roughly the same latency (and certainly not worse)
> compared to normal tasks.
>
> v2: Add tests to run across all engines simultaneously to encourage
> ksoftirqd to kick in even more often.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> #v1
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  tests/gem_exec_latency.c | 207 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 207 insertions(+)
>
> diff --git a/tests/gem_exec_latency.c b/tests/gem_exec_latency.c
> index 9498c0921..c5816427b 100644
> --- a/tests/gem_exec_latency.c
> +++ b/tests/gem_exec_latency.c
> @@ -36,11 +36,15 @@
>  #include <sys/time.h>
>  #include <sys/signal.h>
>  #include <time.h>
> +#include <sched.h>
>
>  #include "drm.h"
>
>  #include "igt_sysfs.h"
>  #include "igt_vgem.h"
> +#include "igt_dummyload.h"
> +#include "igt_stats.h"
> +
>  #include "i915/gem_ring.h"
>
>  #define LOCAL_I915_EXEC_NO_RELOC (1<<11)
> @@ -351,6 +355,189 @@ static void latency_from_ring(int fd,
>  	}
>  }
>
> +static void __rearm_spin_batch(igt_spin_t *spin)
> +{
> +	const uint32_t mi_arb_chk = 0x5 << 23;
> +
> +	*spin->batch = mi_arb_chk;
> +	*spin->running = 0;
> +	__sync_synchronize();
> +}
> +
> +static void
> +__submit_spin_batch(int fd, igt_spin_t *spin, unsigned int flags)
> +{
> +	struct drm_i915_gem_execbuffer2 eb = spin->execbuf;
> +
> +	eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK);
> +	eb.flags |= flags | I915_EXEC_NO_RELOC;
> +
> +	gem_execbuf(fd, &eb);
> +}
> +
> +struct rt_pkt {
> +	struct igt_mean mean;
> +	double min, max;
> +};
> +
> +static bool __spin_wait(int fd, igt_spin_t *spin)
> +{
> +	while (!READ_ONCE(*spin->running)) {
> +		if (!gem_bo_busy(fd, spin->handle))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
> +/*
> + * Test whether RT thread which hogs the CPU a lot can submit work with
> + * reasonable latency.
> + */
> +static void
> +rthog_latency_on_ring(int fd, unsigned int engine, const char *name, unsigned int flags)
> +#define RTIDLE 0x1
> +{
> +	const char *passname[] = { "warmup", "normal", "rt" };
> +	struct rt_pkt *results;
> +	unsigned int engines[16];
> +	const char *names[16];
> +	unsigned int nengine;
> +	int ret;
> +
> +	results = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +	igt_assert(results != MAP_FAILED);
> +
> +	nengine = 0;
> +	if (engine == ALL_ENGINES) {
> +		for_each_physical_engine(fd, engine) {
> +			if (!gem_can_store_dword(fd, engine))
> +				continue;
> +
> +			engines[nengine] = engine;
> +			names[nengine] = e__->name;
> +			nengine++;
> +		}
> +		igt_require(nengine > 1);
> +	} else {
> +		igt_require(gem_can_store_dword(fd, engine));
> +		engines[nengine] = engine;
> +		names[nengine] = name;
> +		nengine++;
> +	}
> +
> +	igt_fork(child, nengine) {
> +		unsigned int pass = 0; /* Three passes: warmup, normal, rt. */
> +
> +		engine = engines[child];
> +		do {
> +			struct igt_mean mean;
> +			double min = HUGE_VAL;
> +			double max = -HUGE_VAL;
> +			igt_spin_t *spin;
> +
> +			igt_mean_init(&mean);
> +
> +			if (pass == 2) {
> +				struct sched_param rt =
> +				{ .sched_priority = 99 };
> +
> +				ret = sched_setscheduler(0,
> +							 SCHED_FIFO | SCHED_RESET_ON_FORK,
> +							 &rt);
> +				if (ret) {
> +					igt_warn("Failed to set scheduling policy!\n");
> +					break;
> +				}
> +			}
> +
> +			spin = __igt_spin_batch_new_poll(fd, 0, engine);
> +			if (!spin) {
> +				igt_warn("Failed to create spinner! (%s)\n",
> +					 passname[pass]);
> +				break;
> +			}
> +			igt_spin_busywait_until_running(spin);
> +
> +			igt_until_timeout(pass > 0 ? 5 : 2) {
> +				struct timespec ts = { };
> +				double t;
> +
> +				igt_spin_batch_end(spin);
> +				gem_sync(fd, spin->handle);
> +				if (flags & RTIDLE)
> +					usleep(250);

250us is how long you expect context complete to be received in?

s/RTIDLE/SUBMIT_IDLE/ ?

> +
> +				/*
> +				 * If we are oversubscribed (more RT hogs than
> +				 * cpus) give the others a chance to run;
> +				 * otherwise, they will interrupt us in the
> +				 * middle of the measurement.
> +				 */
> +				if (nengine > 1)
> +					usleep(10*nengine);

Could check for actual number of cpus here.

You want it on top of the RTIDLE sleep?

> +
> +				__rearm_spin_batch(spin);
> +
> +				igt_nsec_elapsed(&ts);
> +				__submit_spin_batch(fd, spin, engine);
> +				if (!__spin_wait(fd, spin)) {
> +					igt_warn("Wait timeout! (%s)\n",
> +						 passname[pass]);

It's not really a timeout, the way __spin_wait is implemented.

> +					break;
> +				}
> +
> +				t = igt_nsec_elapsed(&ts) * 1e-9;
> +				if (t > max)
> +					max = t;
> +				if (t < min)
> +					min = t;
> +
> +				igt_mean_add(&mean, t);
> +			}
> +
> +			igt_spin_batch_free(fd, spin);
> +
> +			igt_info("%8s %10s: mean=%.2fus stddev=%.3fus [%.2fus, %.2fus] (n=%lu)\n",
> +				 names[child],
> +				 passname[pass],
> +				 igt_mean_get(&mean) * 1e6,
> +				 sqrt(igt_mean_get_variance(&mean)) * 1e6,
> +				 min * 1e6, max * 1e6,
> +				 mean.count);
> +
> +			results[3 * child + pass].mean = mean;
> +			results[3 * child + pass].min = min;
> +			results[3 * child + pass].max = max;
> +		} while (++pass < 3);
> +	}
> +
> +	igt_waitchildren();
> +
> +	for (unsigned int child = 0; child < nengine; child++) {
> +		struct rt_pkt normal = results[3 * child + 1];
> +		struct rt_pkt rt = results[3 * child + 2];
> +
> +		igt_assert(rt.max);
> +
> +		igt_info("%8s: normal latency=%.2f±%.3fus, rt latency=%.2f±%.3fus\n",
> +			 names[child],
> +			 igt_mean_get(&normal.mean) * 1e6,
> +			 sqrt(igt_mean_get_variance(&normal.mean)) * 1e6,
> +			 igt_mean_get(&rt.mean) * 1e6,
> +			 sqrt(igt_mean_get_variance(&rt.mean)) * 1e6);
> +
> +		igt_assert(igt_mean_get(&rt.mean) <
> +			   igt_mean_get(&normal.mean) * 2);
> +
> +		/* The system is noisy; be conservative when declaring fail. */
> +		igt_assert(igt_mean_get_variance(&rt.mean) <
> +			   igt_mean_get_variance(&normal.mean) * 25);

I don't know what "* 25" means in statistical terms. At some point you
said variance doesn't work and you had to go with stdev?

> +	}
> +
> +	munmap(results, 4096);
> +}
> +
>  igt_main
>  {
>  	const struct intel_execution_engine *e;
> @@ -373,6 +560,12 @@ igt_main
>  		intel_register_access_init(intel_get_pci_device(), false, device);
>  	}
>
> +	igt_subtest("all-rtidle-submit")
> +		rthog_latency_on_ring(device, ALL_ENGINES, "all", RTIDLE);
> +
> +	igt_subtest("all-rthog-submit")
> +		rthog_latency_on_ring(device, ALL_ENGINES, "all", 0);
> +
>  	igt_subtest_group {
>  		igt_fixture
>  			igt_require(intel_gen(intel_get_drm_devid(device)) >= 7);
> @@ -391,6 +584,20 @@ igt_main
>  				e->exec_id | e->flags,
>  				e->name, 0);
>
> +		igt_subtest_f("%s-rtidle-submit", e->name)
> +			rthog_latency_on_ring(device,
> +					      e->exec_id |
> +					      e->flags,
> +					      e->name,
> +					      RTIDLE);
> +
> +		igt_subtest_f("%s-rthog-submit", e->name)
> +			rthog_latency_on_ring(device,
> +					      e->exec_id |
> +					      e->flags,
> +					      e->name,
> +					      0);
> +
>  		igt_subtest_f("%s-dispatch-queued", e->name)
>  			latency_on_ring(device,
>  					e->exec_id | e->flags,
>

Regards,

Tvrtko
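Tvrtko's "not really a timeout" remark refers to __spin_wait() above looping until the spinner is either seen running or the object goes idle, with no time bound of its own, so the "Wait timeout!" warning does not describe what can actually happen. A bounded variant could look roughly like the sketch below; __spin_wait_timeout() is a made-up name, while igt_nsec_elapsed(), READ_ONCE() and gem_bo_busy() are the helpers the patch already uses:

```c
static bool __spin_wait_timeout(int fd, igt_spin_t *spin,
				uint64_t timeout_ns)
{
	struct timespec ts = { };

	igt_nsec_elapsed(&ts); /* zeroed ts: records the start time */

	while (!READ_ONCE(*spin->running)) {
		/* Spinner retired without ever being seen running. */
		if (!gem_bo_busy(fd, spin->handle))
			return false;

		/* Give up eventually, making "Wait timeout!" accurate. */
		if (igt_nsec_elapsed(&ts) > timeout_ns)
			return false;
	}

	return true;
}
```

The existing call site would then pass some generous budget, say 2000000000ull (2 seconds), and keep the igt_warn() on failure.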
Quoting Tvrtko Ursulin (2018-05-14 12:13:10)
> On 14/05/2018 09:02, Chris Wilson wrote:

[snip]

> > +				igt_spin_batch_end(spin);
> > +				gem_sync(fd, spin->handle);
> > +				if (flags & RTIDLE)
> > +					usleep(250);
>
> 250us is how long you expect context complete to be received in?
>
> s/RTIDLE/SUBMIT_IDLE/ ?

And the others. I guess DROP_IDLE (i.e. doing a wait-for-idle) might be
fun.

> > +
> > +				/*
> > +				 * If we are oversubscribed (more RT hogs than
> > +				 * cpus) give the others a chance to run;
> > +				 * otherwise, they will interrupt us in the
> > +				 * middle of the measurement.
> > +				 */
> > +				if (nengine > 1)
> > +					usleep(10*nengine);
>
> Could check for actual number of cpus here.

Could be that fancy. There's still the issue that if we consume more
than 70% (or whatever the rt limit is) we can be throttled. So we do
need some sort of scheduler relinquishment.

> You want it on top of the RTIDLE sleep?

It's peanuts on top, so I don't mind.

> > +				__rearm_spin_batch(spin);
> > +
> > +				igt_nsec_elapsed(&ts);
> > +				__submit_spin_batch(fd, spin, engine);
> > +				if (!__spin_wait(fd, spin)) {
> > +					igt_warn("Wait timeout! (%s)\n",
> > +						 passname[pass]);
>
> It's not really a timeout, the way __spin_wait is implemented.

Can't blame me for this one ;)

[snip]

> > +		/* The system is noisy; be conservative when declaring fail. */
> > +		igt_assert(igt_mean_get_variance(&rt.mean) <
> > +			   igt_mean_get_variance(&normal.mean) * 25);
>
> I don't know what "* 25" means in statistical terms. At some point you
> said variance doesn't work and you had to go with stdev?

5 sigma.

I was complaining that reporting variance as 0.000us wasn't particularly
useful (wrong units for starters ;) and that we definitely could meet
the 3*variance requirement :)

At the moment, I'm plucking numbers to try and safely fail when it's bad
(we hit ksoftirqd and it's an order of magnitude worse), but with enough
slack for noise. The mean so far looks more reliable than the
variance/stddev, but testing the variability makes sense.

Oh well, if the system is that unstable, we could just use more passes
to improve the statistics.
-Chris
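To expand on the "5 sigma" answer: the standard deviation is the square root of the variance, so a factor of 25 on the variance bound is exactly a factor of 5 on the standard deviation. A small self-contained illustration of the equivalence (within_5_sigma() is a made-up name, not part of the patch; the variances are invented example numbers):

```c
#include <assert.h>
#include <math.h>

/*
 * Var(rt) < 25 * Var(normal)  <=>  stddev(rt) < 5 * stddev(normal),
 * because sqrt() is monotonic for non-negative inputs.
 */
static int within_5_sigma(double rt_variance, double normal_variance)
{
	return sqrt(rt_variance) < 5.0 * sqrt(normal_variance);
}

int main(void)
{
	/* Variances in s^2: stddev of 2us vs stddev of 1us passes... */
	assert(within_5_sigma(4e-12, 1e-12));

	/* ...while a stddev of 6us vs 1us exceeds the 5 sigma slack. */
	assert(!within_5_sigma(36e-12, 1e-12));

	return 0;
}
```

So the assert in the patch can be read as "the rt pass may be noisier than the normal pass, but its stddev must stay within 5x".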