Message ID | 20180522110044.26439-3-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 22/05/2018 12:00, Chris Wilson wrote: > While for stressing the system we want to submit as many batches as we > can as that shows us worst case impact on system latency, it is not a > very realistic case. To introduce a bit more realism allow the batches > run for a user defined duration. > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > benchmarks/gem_syslatency.c | 71 ++++++++++++++++++++++++++++++++++--- > 1 file changed, 67 insertions(+), 4 deletions(-) > > diff --git a/benchmarks/gem_syslatency.c b/benchmarks/gem_syslatency.c > index d1056773a..45cabe86c 100644 > --- a/benchmarks/gem_syslatency.c > +++ b/benchmarks/gem_syslatency.c > @@ -51,6 +51,7 @@ static volatile int done; > > struct gem_busyspin { > pthread_t thread; > + unsigned long sz; > unsigned long count; > bool leak; > bool interrupts; > @@ -96,7 +97,8 @@ static void *gem_busyspin(void *arg) > struct gem_busyspin *bs = arg; > struct drm_i915_gem_execbuffer2 execbuf; > struct drm_i915_gem_exec_object2 obj[2]; > - const unsigned sz = bs->leak ? 16 << 20 : 4 << 10; > + const unsigned sz = > + bs->sz ? bs->sz + sizeof(bbe) : bs->leak ? 16 << 20 : 4 << 10; > unsigned engines[16]; > unsigned nengine; > unsigned engine; > @@ -112,7 +114,7 @@ static void *gem_busyspin(void *arg) > obj[0].handle = gem_create(fd, 4096); > obj[0].flags = EXEC_OBJECT_WRITE; > obj[1].handle = gem_create(fd, sz); > - gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe)); > + gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe)); Hm what was the point in creating large batches here if bbend was always first? > > memset(&execbuf, 0, sizeof(execbuf)); > execbuf.buffers_ptr = (uintptr_t)(obj + !bs->interrupts); > @@ -125,6 +127,12 @@ static void *gem_busyspin(void *arg) > } > > while (!done) { > + for (int n = 0; n < nengine; n++) { > + const int m = rand() % nengine; > + unsigned int tmp = engines[n]; > + engines[n] = engines[m]; > + engines[m] = tmp; igt_exchange_int? 
Problem with frameworks getting more featureful is easier to forget what is there. :) Or even igt_permute_array? But what it has to do with batch duration? > + } > for (int n = 0; n < nengine; n++) { > execbuf.flags &= ~ENGINE_FLAGS; > execbuf.flags |= engines[n]; > @@ -134,7 +142,7 @@ static void *gem_busyspin(void *arg) > if (bs->leak) { > gem_madvise(fd, obj[1].handle, I915_MADV_DONTNEED); > obj[1].handle = gem_create(fd, sz); > - gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe)); > + gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe)); > } > } > > @@ -294,6 +302,50 @@ static void *background_fs(void *path) > return NULL; > } > > +static unsigned long calibrate_nop(unsigned int target_us, > + unsigned int tolerance_pct) > +{ > + const uint32_t bbe = MI_BATCH_BUFFER_END; > + const unsigned int loops = 100; > + struct drm_i915_gem_exec_object2 obj = {}; > + struct drm_i915_gem_execbuffer2 eb = > + { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj}; > + struct timespec t_0, t_end; > + long sz, prev; > + int fd; > + > + fd = drm_open_driver(DRIVER_INTEL); > + > + clock_gettime(CLOCK_MONOTONIC, &t_0); > + > + sz = 256 * 1024; > + do { > + struct timespec t_start; > + > + obj.handle = gem_create(fd, sz + sizeof(bbe)); > + gem_write(fd, obj.handle, sz, &bbe, sizeof(bbe)); > + gem_execbuf(fd, &eb); > + gem_sync(fd, obj.handle); > + > + clock_gettime(CLOCK_MONOTONIC, &t_start); > + for (int loop = 0; loop < loops; loop++) > + gem_execbuf(fd, &eb); > + gem_sync(fd, obj.handle); > + clock_gettime(CLOCK_MONOTONIC, &t_end); > + > + gem_close(fd, obj.handle); > + > + prev = sz; > + sz = loops * sz / elapsed(&t_start, &t_end) * 1e3 * target_us; > + sz = ALIGN(sz, sizeof(uint32_t)); > + } while (elapsed(&t_0, &t_end) < 5 || > + abs(sz - prev) > (sz * tolerance_pct / 100)); > + > + close(fd); > + > + return sz; > +} I presume this is a copy&paste so don't have to look into it in detail. 
> + > int main(int argc, char **argv) > { > struct gem_busyspin *busy; > @@ -309,9 +361,10 @@ int main(int argc, char **argv) > int enable_gem_sysbusy = 1; > bool leak = false; > bool interrupts = false; > + long batch = 0; > int n, c; > > - while ((c = getopt(argc, argv, "t:f:bmni1")) != -1) { > + while ((c = getopt(argc, argv, "r:t:f:bmni1")) != -1) { > switch (c) { > case '1': > ncpus = 1; > @@ -328,6 +381,10 @@ int main(int argc, char **argv) > if (time < 0) > time = INT_MAX; > break; > + case 'r': > + /* Duration of each batch (microseconds) */ > + batch = atoi(optarg); > + break; > case 'f': > /* Select an output field */ > field = atoi(optarg); > @@ -350,11 +407,17 @@ int main(int argc, char **argv) > force_low_latency(); > min = min_measurement_error(); > > + if (batch > 0) > + batch = calibrate_nop(batch, 2); > + else > + batch = -batch; > + No idea of the purpose of this. User passes in negative on the cmd line? But then calibration is missing. > busy = calloc(ncpus, sizeof(*busy)); > pthread_attr_init(&attr); > if (enable_gem_sysbusy) { > for (n = 0; n < ncpus; n++) { > bind_cpu(&attr, n); > + busy[n].sz = batch; > busy[n].leak = leak; > busy[n].interrupts = interrupts; > pthread_create(&busy[n].thread, &attr, > Regards, Tvrtko
diff --git a/benchmarks/gem_syslatency.c b/benchmarks/gem_syslatency.c index d1056773a..45cabe86c 100644 --- a/benchmarks/gem_syslatency.c +++ b/benchmarks/gem_syslatency.c @@ -51,6 +51,7 @@ static volatile int done; struct gem_busyspin { pthread_t thread; + unsigned long sz; unsigned long count; bool leak; bool interrupts; @@ -96,7 +97,8 @@ static void *gem_busyspin(void *arg) struct gem_busyspin *bs = arg; struct drm_i915_gem_execbuffer2 execbuf; struct drm_i915_gem_exec_object2 obj[2]; - const unsigned sz = bs->leak ? 16 << 20 : 4 << 10; + const unsigned sz = + bs->sz ? bs->sz + sizeof(bbe) : bs->leak ? 16 << 20 : 4 << 10; unsigned engines[16]; unsigned nengine; unsigned engine; @@ -112,7 +114,7 @@ static void *gem_busyspin(void *arg) obj[0].handle = gem_create(fd, 4096); obj[0].flags = EXEC_OBJECT_WRITE; obj[1].handle = gem_create(fd, sz); - gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe)); + gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe)); memset(&execbuf, 0, sizeof(execbuf)); execbuf.buffers_ptr = (uintptr_t)(obj + !bs->interrupts); @@ -125,6 +127,12 @@ static void *gem_busyspin(void *arg) } while (!done) { + for (int n = 0; n < nengine; n++) { + const int m = rand() % nengine; + unsigned int tmp = engines[n]; + engines[n] = engines[m]; + engines[m] = tmp; + } for (int n = 0; n < nengine; n++) { execbuf.flags &= ~ENGINE_FLAGS; execbuf.flags |= engines[n]; @@ -134,7 +142,7 @@ static void *gem_busyspin(void *arg) if (bs->leak) { gem_madvise(fd, obj[1].handle, I915_MADV_DONTNEED); obj[1].handle = gem_create(fd, sz); - gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe)); + gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe)); } } @@ -294,6 +302,50 @@ static void *background_fs(void *path) return NULL; } +static unsigned long calibrate_nop(unsigned int target_us, + unsigned int tolerance_pct) +{ + const uint32_t bbe = MI_BATCH_BUFFER_END; + const unsigned int loops = 100; + struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_execbuffer2 eb 
= + { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj}; + struct timespec t_0, t_end; + long sz, prev; + int fd; + + fd = drm_open_driver(DRIVER_INTEL); + + clock_gettime(CLOCK_MONOTONIC, &t_0); + + sz = 256 * 1024; + do { + struct timespec t_start; + + obj.handle = gem_create(fd, sz + sizeof(bbe)); + gem_write(fd, obj.handle, sz, &bbe, sizeof(bbe)); + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + + clock_gettime(CLOCK_MONOTONIC, &t_start); + for (int loop = 0; loop < loops; loop++) + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + gem_close(fd, obj.handle); + + prev = sz; + sz = loops * sz / elapsed(&t_start, &t_end) * 1e3 * target_us; + sz = ALIGN(sz, sizeof(uint32_t)); + } while (elapsed(&t_0, &t_end) < 5 || + abs(sz - prev) > (sz * tolerance_pct / 100)); + + close(fd); + + return sz; +} + int main(int argc, char **argv) { struct gem_busyspin *busy; @@ -309,9 +361,10 @@ int main(int argc, char **argv) int enable_gem_sysbusy = 1; bool leak = false; bool interrupts = false; + long batch = 0; int n, c; - while ((c = getopt(argc, argv, "t:f:bmni1")) != -1) { + while ((c = getopt(argc, argv, "r:t:f:bmni1")) != -1) { switch (c) { case '1': ncpus = 1; @@ -328,6 +381,10 @@ int main(int argc, char **argv) if (time < 0) time = INT_MAX; break; + case 'r': + /* Duration of each batch (microseconds) */ + batch = atoi(optarg); + break; case 'f': /* Select an output field */ field = atoi(optarg); @@ -350,11 +407,17 @@ int main(int argc, char **argv) force_low_latency(); min = min_measurement_error(); + if (batch > 0) + batch = calibrate_nop(batch, 2); + else + batch = -batch; + busy = calloc(ncpus, sizeof(*busy)); pthread_attr_init(&attr); if (enable_gem_sysbusy) { for (n = 0; n < ncpus; n++) { bind_cpu(&attr, n); + busy[n].sz = batch; busy[n].leak = leak; busy[n].interrupts = interrupts; pthread_create(&busy[n].thread, &attr,
While for stressing the system we want to submit as many batches as we can, as that shows us the worst-case impact on system latency, it is not a very realistic case. To introduce a bit more realism, allow the batches to run for a user-defined duration. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- benchmarks/gem_syslatency.c | 71 ++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 4 deletions(-)