Message ID | 20171212145300.25092-1-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Quoting Tvrtko Ursulin (2017-12-12 14:53:00) > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > A subtest to verify that the engine busyness is reported with expected > accuracy on platforms where the feature is available. > > We test three patterns: 2%, 50% and 98% load per engine. > > Problematic part is we also rely on scheduling latencies and the no-op > batch calibration accuracy. For these reasons we use a large-ish tolerance > and also set the load emitting process to SCHED_FIFO. > > Load calibration is also moved to a subtest group fixture so the set-up > time is shared between all subtests which use it. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > tests/perf_pmu.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 115 insertions(+), 10 deletions(-) > > diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > index db7696115a7b..ec6b0ee1cb86 100644 > --- a/tests/perf_pmu.c > +++ b/tests/perf_pmu.c > @@ -35,6 +35,7 @@ > #include <dirent.h> > #include <time.h> > #include <poll.h> > +#include <sched.h> > > #include "igt.h" > #include "igt_core.h" > @@ -79,6 +80,17 @@ init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample) > close(fd); > } > > +static uint64_t __pmu_read_single(int fd, uint64_t *ts) > +{ > + uint64_t data[2]; > + > + igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)); > + > + *ts = data[1]; > + > + return data[0]; > +} > + > static uint64_t pmu_read_single(int fd) > { > uint64_t data[2]; > @@ -665,6 +677,77 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e) > assert_within_epsilon(val[1], slept, tolerance); > } > > +static void > +accuracy(int gem_fd, const struct intel_execution_engine2 *e, > + unsigned long cal_ms_sz, unsigned long target_busy_pct) > +{ > + const unsigned long busy_us = 2500; > + const unsigned long idle_us = 100 * (busy_us - target_busy_pct * > + busy_us / 100) / target_busy_pct; > + const unsigned int test_us = 1e6; > + double 
busy_r; > + uint64_t val[2]; > + uint64_t ts[2]; > + int fd; > + > + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); > + > + assert_within_epsilon((double)busy_us / (busy_us + idle_us), > + (double)target_busy_pct / 100.0, tolerance); > + > + /* Emit PWM pattern on the engine from a child. */ > + igt_fork(child, 1) { > + struct sched_param rt = { .sched_priority = 99 }; > + const uint32_t bbe = MI_BATCH_BUFFER_END; > + const unsigned long loops = test_us / (busy_us + idle_us); > + const unsigned long sz = ALIGN(busy_us * cal_ms_sz / 1000, > + sizeof(uint32_t)); > + struct drm_i915_gem_exec_object2 obj = { }; > + struct drm_i915_gem_execbuffer2 eb = { > + .buffers_ptr = to_user_pointer(&obj), > + .buffer_count = 1, > + .flags = e2ring(gem_fd, e) > + }; > + unsigned long i; > + > + /* We need the best sleep accuracy we can get. */ > + igt_require(sched_setscheduler(0, > + SCHED_FIFO | SCHED_RESET_ON_FORK, > + &rt) == 0); > + > + obj.handle = gem_create(gem_fd, sz); > + gem_write(gem_fd, obj.handle, sz - sizeof(bbe), &bbe, > + sizeof(bbe)); > + > + for (i = 0; i < loops; i++) { > + gem_execbuf(gem_fd, &eb); > + gem_sync(gem_fd, obj.handle); > + usleep(idle_us); > + } > + > + gem_close(gem_fd, obj.handle); > + } Wouldn't using a signaling thread and a igt_spin_t give you better accuracy, with the bonus of not requiring calibration? -Chris
On 12/12/2017 15:05, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2017-12-12 14:53:00) >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> A subtest to verify that the engine busyness is reported with expected >> accuracy on platforms where the feature is available. >> >> We test three patterns: 2%, 50% and 98% load per engine. >> >> Problematic part is we also rely on scheduling latencies and the no-op >> batch calibration accuracy. For these reasons we use a large-ish tolerance >> and also set the load emitting process to SCHED_FIFO. >> >> Load calibration is also moved to a subtest group fixture so the set-up >> time is shared between all subtests which use it. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> --- >> tests/perf_pmu.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++----- >> 1 file changed, 115 insertions(+), 10 deletions(-) >> >> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c >> index db7696115a7b..ec6b0ee1cb86 100644 >> --- a/tests/perf_pmu.c >> +++ b/tests/perf_pmu.c >> @@ -35,6 +35,7 @@ >> #include <dirent.h> >> #include <time.h> >> #include <poll.h> >> +#include <sched.h> >> >> #include "igt.h" >> #include "igt_core.h" >> @@ -79,6 +80,17 @@ init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample) >> close(fd); >> } >> >> +static uint64_t __pmu_read_single(int fd, uint64_t *ts) >> +{ >> + uint64_t data[2]; >> + >> + igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)); >> + >> + *ts = data[1]; >> + >> + return data[0]; >> +} >> + >> static uint64_t pmu_read_single(int fd) >> { >> uint64_t data[2]; >> @@ -665,6 +677,77 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e) >> assert_within_epsilon(val[1], slept, tolerance); >> } >> >> +static void >> +accuracy(int gem_fd, const struct intel_execution_engine2 *e, >> + unsigned long cal_ms_sz, unsigned long target_busy_pct) >> +{ >> + const unsigned long busy_us = 2500; >> + const unsigned long idle_us = 100 * (busy_us - 
target_busy_pct * >> + busy_us / 100) / target_busy_pct; >> + const unsigned int test_us = 1e6; >> + double busy_r; >> + uint64_t val[2]; >> + uint64_t ts[2]; >> + int fd; >> + >> + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); >> + >> + assert_within_epsilon((double)busy_us / (busy_us + idle_us), >> + (double)target_busy_pct / 100.0, tolerance); >> + >> + /* Emit PWM pattern on the engine from a child. */ >> + igt_fork(child, 1) { >> + struct sched_param rt = { .sched_priority = 99 }; >> + const uint32_t bbe = MI_BATCH_BUFFER_END; >> + const unsigned long loops = test_us / (busy_us + idle_us); >> + const unsigned long sz = ALIGN(busy_us * cal_ms_sz / 1000, >> + sizeof(uint32_t)); >> + struct drm_i915_gem_exec_object2 obj = { }; >> + struct drm_i915_gem_execbuffer2 eb = { >> + .buffers_ptr = to_user_pointer(&obj), >> + .buffer_count = 1, >> + .flags = e2ring(gem_fd, e) >> + }; >> + unsigned long i; >> + >> + /* We need the best sleep accuracy we can get. */ >> + igt_require(sched_setscheduler(0, >> + SCHED_FIFO | SCHED_RESET_ON_FORK, >> + &rt) == 0); >> + >> + obj.handle = gem_create(gem_fd, sz); >> + gem_write(gem_fd, obj.handle, sz - sizeof(bbe), &bbe, >> + sizeof(bbe)); >> + >> + for (i = 0; i < loops; i++) { >> + gem_execbuf(gem_fd, &eb); >> + gem_sync(gem_fd, obj.handle); >> + usleep(idle_us); >> + } >> + >> + gem_close(gem_fd, obj.handle); >> + } > > Wouldn't using a signaling thread and a igt_spin_t give you better > accuracy, with the bonus of not requiring calibration? Sounds like it could be better if main source of error is calibration and not the scheduler. I have no idea at the moment which one it is so I'll give it a try. Regards, Tvrtko
Quoting Tvrtko Ursulin (2017-12-12 15:21:32) > > On 12/12/2017 15:05, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2017-12-12 14:53:00) > >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> > >> A subtest to verify that the engine busyness is reported with expected > >> accuracy on platforms where the feature is available. > >> > >> We test three patterns: 2%, 50% and 98% load per engine. > >> > >> Problematic part is we also rely on scheduling latencies and the no-op > >> batch calibration accuracy. For these reasons we use a large-ish tolerance > >> and also set the load emitting process to SCHED_FIFO. > >> > >> Load calibration is also moved to a subtest group fixture so the set-up > >> time is shared between all subtests which use it. > >> > >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> --- > >> tests/perf_pmu.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > >> 1 file changed, 115 insertions(+), 10 deletions(-) > >> > >> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > >> index db7696115a7b..ec6b0ee1cb86 100644 > >> --- a/tests/perf_pmu.c > >> +++ b/tests/perf_pmu.c > >> @@ -35,6 +35,7 @@ > >> #include <dirent.h> > >> #include <time.h> > >> #include <poll.h> > >> +#include <sched.h> > >> > >> #include "igt.h" > >> #include "igt_core.h" > >> @@ -79,6 +80,17 @@ init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample) > >> close(fd); > >> } > >> > >> +static uint64_t __pmu_read_single(int fd, uint64_t *ts) > >> +{ > >> + uint64_t data[2]; > >> + > >> + igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)); > >> + > >> + *ts = data[1]; > >> + > >> + return data[0]; > >> +} > >> + > >> static uint64_t pmu_read_single(int fd) > >> { > >> uint64_t data[2]; > >> @@ -665,6 +677,77 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e) > >> assert_within_epsilon(val[1], slept, tolerance); > >> } > >> > >> +static void > >> +accuracy(int gem_fd, const struct intel_execution_engine2 *e, > >> + 
unsigned long cal_ms_sz, unsigned long target_busy_pct) > >> +{ > >> + const unsigned long busy_us = 2500; > >> + const unsigned long idle_us = 100 * (busy_us - target_busy_pct * > >> + busy_us / 100) / target_busy_pct; > >> + const unsigned int test_us = 1e6; > >> + double busy_r; > >> + uint64_t val[2]; > >> + uint64_t ts[2]; > >> + int fd; > >> + > >> + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); > >> + > >> + assert_within_epsilon((double)busy_us / (busy_us + idle_us), > >> + (double)target_busy_pct / 100.0, tolerance); > >> + > >> + /* Emit PWM pattern on the engine from a child. */ > >> + igt_fork(child, 1) { > >> + struct sched_param rt = { .sched_priority = 99 }; > >> + const uint32_t bbe = MI_BATCH_BUFFER_END; > >> + const unsigned long loops = test_us / (busy_us + idle_us); > >> + const unsigned long sz = ALIGN(busy_us * cal_ms_sz / 1000, > >> + sizeof(uint32_t)); > >> + struct drm_i915_gem_exec_object2 obj = { }; > >> + struct drm_i915_gem_execbuffer2 eb = { > >> + .buffers_ptr = to_user_pointer(&obj), > >> + .buffer_count = 1, > >> + .flags = e2ring(gem_fd, e) > >> + }; > >> + unsigned long i; > >> + > >> + /* We need the best sleep accuracy we can get. */ > >> + igt_require(sched_setscheduler(0, > >> + SCHED_FIFO | SCHED_RESET_ON_FORK, > >> + &rt) == 0); > >> + > >> + obj.handle = gem_create(gem_fd, sz); > >> + gem_write(gem_fd, obj.handle, sz - sizeof(bbe), &bbe, > >> + sizeof(bbe)); > >> + > >> + for (i = 0; i < loops; i++) { > >> + gem_execbuf(gem_fd, &eb); > >> + gem_sync(gem_fd, obj.handle); > >> + usleep(idle_us); > >> + } > >> + > >> + gem_close(gem_fd, obj.handle); > >> + } > > > > Wouldn't using a signaling thread and a igt_spin_t give you better > > accuracy, with the bonus of not requiring calibration? > > Sounds like it could be better if main source of error is calibration > and not the scheduler. I have no idea at the moment which one it is so > I'll give it a try. 
I think you'll still need an RT-99 thread, but it's worth a shot. :) -Chris
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c index db7696115a7b..ec6b0ee1cb86 100644 --- a/tests/perf_pmu.c +++ b/tests/perf_pmu.c @@ -35,6 +35,7 @@ #include <dirent.h> #include <time.h> #include <poll.h> +#include <sched.h> #include "igt.h" #include "igt_core.h" @@ -79,6 +80,17 @@ init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample) close(fd); } +static uint64_t __pmu_read_single(int fd, uint64_t *ts) +{ + uint64_t data[2]; + + igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)); + + *ts = data[1]; + + return data[0]; +} + static uint64_t pmu_read_single(int fd) { uint64_t data[2]; @@ -665,6 +677,77 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e) assert_within_epsilon(val[1], slept, tolerance); } +static void +accuracy(int gem_fd, const struct intel_execution_engine2 *e, + unsigned long cal_ms_sz, unsigned long target_busy_pct) +{ + const unsigned long busy_us = 2500; + const unsigned long idle_us = 100 * (busy_us - target_busy_pct * + busy_us / 100) / target_busy_pct; + const unsigned int test_us = 1e6; + double busy_r; + uint64_t val[2]; + uint64_t ts[2]; + int fd; + + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); + + assert_within_epsilon((double)busy_us / (busy_us + idle_us), + (double)target_busy_pct / 100.0, tolerance); + + /* Emit PWM pattern on the engine from a child. */ + igt_fork(child, 1) { + struct sched_param rt = { .sched_priority = 99 }; + const uint32_t bbe = MI_BATCH_BUFFER_END; + const unsigned long loops = test_us / (busy_us + idle_us); + const unsigned long sz = ALIGN(busy_us * cal_ms_sz / 1000, + sizeof(uint32_t)); + struct drm_i915_gem_exec_object2 obj = { }; + struct drm_i915_gem_execbuffer2 eb = { + .buffers_ptr = to_user_pointer(&obj), + .buffer_count = 1, + .flags = e2ring(gem_fd, e) + }; + unsigned long i; + + /* We need the best sleep accuracy we can get. 
*/ + igt_require(sched_setscheduler(0, + SCHED_FIFO | SCHED_RESET_ON_FORK, + &rt) == 0); + + obj.handle = gem_create(gem_fd, sz); + gem_write(gem_fd, obj.handle, sz - sizeof(bbe), &bbe, + sizeof(bbe)); + + for (i = 0; i < loops; i++) { + gem_execbuf(gem_fd, &eb); + gem_sync(gem_fd, obj.handle); + usleep(idle_us); + } + + gem_close(gem_fd, obj.handle); + } + + /* Let child run. */ + usleep(test_us / 4); + + /* Collect engine busyness for a subset of child runtime. */ + fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); + val[0] = __pmu_read_single(fd, &ts[0]); + usleep(test_us / 2); + val[1] = __pmu_read_single(fd, &ts[1]); + close(fd); + + igt_waitchildren(); + + busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]); + + igt_info("error=%.2f%%\n", + 100.0 - 100.0 * (busy_r / ((double)target_busy_pct / 100.0))); + + assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.2); +} + /** * Tests that i915 PMU corectly errors out in invalid initialization. * i915 PMU is uncore PMU, thus: @@ -801,7 +884,7 @@ static void cpu_hotplug(int gem_fd) static unsigned long calibrate_nop(int fd, const uint64_t calibration_us) { - const uint64_t cal_min_us = calibration_us * 3; + const uint64_t cal_min_us = 2e6; const unsigned int tolerance_pct = 10; const uint32_t bbe = MI_BATCH_BUFFER_END; const unsigned int loops = 17; @@ -844,7 +927,7 @@ static unsigned long calibrate_nop(int fd, const uint64_t calibration_us) } static void -test_interrupts(int gem_fd) +test_interrupts(int gem_fd, unsigned long cal_ms_sz) { const uint32_t bbe = MI_BATCH_BUFFER_END; const unsigned int test_duration_ms = 1000; @@ -854,14 +937,14 @@ test_interrupts(int gem_fd) .buffer_count = 1, .flags = I915_EXEC_FENCE_OUT, }; - unsigned long sz; - igt_spin_t *spin; const int target = 30; + const unsigned long sz = ALIGN(test_duration_ms * cal_ms_sz / target, + sizeof(uint32_t)); + igt_spin_t *spin; struct pollfd pfd; uint64_t idle, busy; int fd; - sz = calibrate_nop(gem_fd, test_duration_ms * 
1000 / target); gem_quiescent_gpu(gem_fd); fd = open_pmu(I915_PMU_INTERRUPTS); @@ -1178,11 +1261,33 @@ igt_main igt_subtest("frequency") test_frequency(fd); - /** - * Test interrupt count reporting. - */ - igt_subtest("interrupts") - test_interrupts(fd); + igt_subtest_group { + unsigned long cal_ms_sz; + + igt_fixture { + cal_ms_sz = calibrate_nop(fd, 1e3); + igt_debug("%lu nops for a 1ms batch\n", cal_ms_sz / 4); + } + + /** + * Test interrupt count reporting. + */ + igt_subtest("interrupts") + test_interrupts(fd, cal_ms_sz); + + for_each_engine_class_instance(fd, e) { + unsigned int pct[] = { 2, 50, 98 }; + + /** + * Check engine busyness accuracy is as expected. + */ + for (i = 0; i < ARRAY_SIZE(pct); i++) { + igt_subtest_f("busy-accuracy-%u-%s", pct[i], + e->name) + accuracy(fd, e, cal_ms_sz, pct[i]); + } + } + } /** * Test RC6 residency reporting.