[i-g-t] i915/gem_exec_schedule: Try to spot unfairness

Message ID	20200602083241.1413087-1-chris@chris-wilson.co.uk (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=JImn=7P=lists.freedesktop.org=intel-gfx-bounces@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org EB6A8206C3 From: Chris Wilson <chris@chris-wilson.co.uk> To: intel-gfx@lists.freedesktop.org Date: Tue, 2 Jun 2020 09:32:41 +0100 Message-Id: <20200602083241.1413087-1-chris@chris-wilson.co.uk> In-Reply-To: <20200602082245.1356782-1-chris@chris-wilson.co.uk> References: <20200602082245.1356782-1-chris@chris-wilson.co.uk> MIME-Version: 1.0 Subject: [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness Precedence: list Cc: igt-dev@lists.freedesktop.org, Chris Wilson <chris@chris-wilson.co.uk> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
Series	[i-g-t] i915/gem_exec_schedule: Try to spot unfairness \| expand [i-g-t] i915/gem_exec_schedule: Try to spot unfairness

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 56c638833..911379cad 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -2495,6 +2495,481 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+/*
+ * Query I915_PARAM_CS_TIMESTAMP_FREQUENCY. The ioctl result is not checked,
+ * so the initial value of 0 is returned if the query does not fill it in.
+ */
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+/* Divide x by y, rounding the quotient up; y must be non-zero. */
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+/* Convert nanoseconds to CS timestamp ticks, rounding up. */
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+/* Convert CS timestamp ticks to nanoseconds, rounding up. */
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define MI_MATH_NOOP MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2)
+#define MI_MATH_LOADINV(op1, op2) MI_MATH_INSTR(0x480, op1, op2)
+#define MI_MATH_LOAD0(op1) MI_MATH_INSTR(0x081, op1, 0)
+#define MI_MATH_LOAD1(op1) MI_MATH_INSTR(0x481, op1, 0)
+#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define MI_MATH_AND MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define MI_MATH_OR MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define MI_MATH_XOR MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2)
+#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define MI_MATH_REG(x) (x)
+#define MI_MATH_REG_SRCA 0x20
+#define MI_MATH_REG_SRCB 0x21
+#define MI_MATH_REG_ACCU 0x31
+#define MI_MATH_REG_ZF 0x32
+#define MI_MATH_REG_CF 0x33
+
+#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1)
+
+/*
+ * Fill @handle with a self-looping batch, using @addr as its GPU address.
+ * TIMESTAMP is copied into GPR START_TS once; each pass after the
+ * MI_ARB_CHECK then copies it into NOW_TS, subtracts START_TS and stores
+ * the inverted difference at @addr + 4000. MI_COND_BATCH_BUFFER_END
+ * compares that dword against the inverted tick count for @ns, otherwise
+ * MI_BATCH_BUFFER_START jumps back to the MI_ARB_CHECK.
+ */
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
+
+	if (offset_in_page(cs) & 4)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+/*
+ * Create a batch, execute it once on @ctx/@e, then rewrite it as a delay()
+ * loop for @target_ns using the obj.offset seen after that execbuf. The
+ * object is returned with EXEC_OBJECT_PINNED set.
+ */
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+/*
+ * Write a batch at dword 512 of @handle that stores CS_TIMESTAMP to the
+ * address embedded in its first SRM. The batch then adds 4 to that address,
+ * clears bit 11 via the mask and stores the result back over its own SRM and
+ * LRI address dwords, so that the next execution writes to the next dword.
+ * @addr is used as the GPU address of @handle.
+ */
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { ONE, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ONE) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+/*
+ * Create a batch, execute it once on @ctx/@e, then write the tslog() batch
+ * using the obj.offset seen after that execbuf and set EXEC_OBJECT_PINNED.
+ */
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+/*
+ * qsort() comparator (ascending) for the uint32_t deltas in fair_child();
+ * the pointee type must match the sizeof(*map) stride passed to qsort().
+ */
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * Client loop: until *@ctl becomes non-zero, submit batches_per_frame delay
+ * batches followed by the timestamp logger. F_FLOW gates each frame on a
+ * sw_sync fence from @timeline, F_PACE waits for the previous frame's
+ * out-fence and F_SYNC for the current one. Afterwards the logged
+ * timestamps are reduced to deltas and their median, converted to ns, is
+ * written to *@out.
+ */
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeout,
+		       int timeline,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC (1 << 0)
+#define F_PACE (1 << 1)
+#define F_FLOW (1 << 2)
+#define F_HALF (1 << 3)
+#define F_SOLO (1 << 4)
+#define F_SPARE (1 << 8)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 prev =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 next =
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame);
+	struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	uint32_t *map;
+	int n;
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(&next),
+			.buffer_count = 1,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, count);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+
+		execbuf.buffers_ptr = to_user_pointer(&ts);
+		execbuf.batch_start_offset = 2048;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+		close(execbuf.rsvd2);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		igt_swap(prev, next);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, next.handle);
+	gem_close(i915, prev.handle);
+
+	gem_sync(i915, ts.handle);
+	map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
+	for (n = 1; n < min(count, 512); n++) {
+		igt_assert(map[n]);
+		map[n - 1] = map[n] - map[n - 1];
+	}
+	qsort(map, --n, sizeof(*map), cmp_u32);
+	*out = ticks_to_ns(i915, map[n / 2]);
+	munmap(map, 4096);
+
+	gem_close(i915, ts.handle);
+}
+
+/* qsort() comparator (ascending) for unsigned long values. */
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * For 1, 3, 7 and 15 forked clients: run fair_child() in each while the
+ * parent advances the sw_sync timeline every fence_ns, then log the range,
+ * quartiles, median and mean of the per-client results.
+ */
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+
+	igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	igt_assert(result != MAP_FAILED);
+
+	for (int n = 2; n <= 16; n <<= 1) {
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct igt_mean m;
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		igt_fork(child, nchild) {
+			uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeout, timeline, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--) {
+			struct timespec tv = { .tv_nsec = fence_ns };
+			nanosleep(&tv, NULL);
+			sw_sync_timeline_inc(timeline, 1);
+		}
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child])) {
+				struct timespec tv = { .tv_nsec = fence_ns };
+				nanosleep(&tv, NULL);
+				sw_sync_timeline_inc(timeline, 1);
+			}
+		}
+		igt_waitchildren();
+		close(timeline);
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0], 1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+#if 0
+		/* Mean within 10% of target */
+		igt_assert( 9 * igt_mean_get(&m) > 10 * frame_ns &&
+			   10 * igt_mean_get(&m) < 9 * frame_ns);
+
+		/* Variance [inter-quartile range] is less than 33% of median */
+		igt_assert(3 * result[hi] - result[lo] < result[nchild / 2]);
+#endif
+	}
+
+	munmap(result, 4096);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2589,6 +3064,21 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_each_engine_store("fair-none", fd, e)
+			fairness(fd, e, 2, 0);
+		test_each_engine_store("fair-pace", fd, e)
+			fairness(fd, e, 2, F_PACE);
+		test_each_engine_store("fair-sync", fd, e)
+			fairness(fd, e, 2, F_SYNC);
+		test_each_engine_store("fair-solo", fd, e)
+			fairness(fd, e, 2, F_SYNC | F_SOLO);
+		test_each_engine_store("fair-flow", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW);
+		test_each_engine_store("fair-spare", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_SPARE);
+		test_each_engine_store("fair-half", fd, e)
+			fairness(fd, e, 2, F_PACE | F_FLOW | F_HALF);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));

[i-g-t] i915/gem_exec_schedule: Try to spot unfairness

Commit Message

Patch