From patchwork Mon Aug  3 13:57:53 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Patchwork-Submitter: Chris Wilson <chris@chris-wilson.co.uk>
X-Patchwork-Id: 11698165
Return-Path: <SRS0=uD7q=BN=lists.freedesktop.org=intel-gfx-bounces@kernel.org>
Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org
 [172.30.200.123])
	by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 6119E1575
	for <patchwork-intel-gfx@patchwork.kernel.org>;
 Mon,  3 Aug 2020 13:58:07 +0000 (UTC)
Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by mail.kernel.org (Postfix) with ESMTPS id 3237720775
	for <patchwork-intel-gfx@patchwork.kernel.org>;
 Mon,  3 Aug 2020 13:58:07 +0000 (UTC)
DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 3237720775
Authentication-Results: mail.kernel.org;
 dmarc=none (p=none dis=none) header.from=chris-wilson.co.uk
Authentication-Results: mail.kernel.org;
 spf=none smtp.mailfrom=intel-gfx-bounces@lists.freedesktop.org
Received: from gabe.freedesktop.org (localhost [127.0.0.1])
	by gabe.freedesktop.org (Postfix) with ESMTP id 682A46E0FD;
	Mon,  3 Aug 2020 13:58:07 +0000 (UTC)
X-Original-To: intel-gfx@lists.freedesktop.org
Delivered-To: intel-gfx@lists.freedesktop.org
Received: from fireflyinternet.com (unknown [77.68.26.236])
 by gabe.freedesktop.org (Postfix) with ESMTPS id 691906E0FD;
 Mon,  3 Aug 2020 13:58:05 +0000 (UTC)
X-Default-Received-SPF: pass (skip=forwardok (res=PASS))
 x-ip-name=78.156.65.138;
Received: from haswell.alporthouse.com (unverified [78.156.65.138])
 by fireflyinternet.com (Firefly Internet (M1)) with ESMTP id 22018289-1500050
 for multiple; Mon, 03 Aug 2020 14:57:54 +0100
From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Date: Mon,  3 Aug 2020 14:57:53 +0100
Message-Id: <20200803135753.58654-1-chris@chris-wilson.co.uk>
X-Mailer: git-send-email 2.28.0
MIME-Version: 1.0
Subject: [Intel-gfx] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot
 unfairness
X-BeenThere: intel-gfx@lists.freedesktop.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Intel graphics driver community testing & development
 <intel-gfx.lists.freedesktop.org>
List-Unsubscribe: <https://lists.freedesktop.org/mailman/options/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=unsubscribe>
List-Archive: <https://lists.freedesktop.org/archives/intel-gfx>
List-Post: <mailto:intel-gfx@lists.freedesktop.org>
List-Help: <mailto:intel-gfx-request@lists.freedesktop.org?subject=help>
List-Subscribe: <https://lists.freedesktop.org/mailman/listinfo/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=subscribe>
Cc: igt-dev@lists.freedesktop.org, Chris Wilson <chris@chris-wilson.co.uk>
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

An important property for multi-client systems is that each client gets
a 'fair' allotment of system time. (Where fairness is at the whim of the
context properties, such as priorities.) This test forks N independent
clients (albeit they happen to share a single vm), and does an equal
amount of work in client and asserts that they take an equal amount of
time.

Though we have never claimed to have a completely fair scheduler, that
is what is expected.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Ramalingam C <ramalingam.c@intel.com>
---
 tests/i915/gem_exec_schedule.c | 816 +++++++++++++++++++++++++++++++++
 1 file changed, 816 insertions(+)

diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
index 488d93511..7c8ea6d70 100644
--- a/tests/i915/gem_exec_schedule.c
+++ b/tests/i915/gem_exec_schedule.c
@@ -29,6 +29,7 @@
 #include <sys/poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <signal.h>
@@ -2503,6 +2504,800 @@ static void measure_semaphore_power(int i915)
 	rapl_close(&pkg);
 }
 
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return (x + y - 1) / y;
+}
+
+static uint64_t ns_to_ticks(int i915, uint64_t ns)
+{
+	return div64_u64_round_up(ns * read_timestamp_frequency(i915),
+				  NSEC_PER_SEC);
+}
+
+static uint64_t ticks_to_ns(int i915, uint64_t ticks)
+{
+	return div64_u64_round_up(ticks * NSEC_PER_SEC,
+				  read_timestamp_frequency(i915));
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr,
+		  uint64_t ns)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define RUNTIME (base + 0x3a8)
+	enum { START_TS, NOW_TS };
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base);
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63)
+		*cs++ = 0;
+	jmp = cs;
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM;
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ticks(i915, ns);
+	*cs++ = addr + 4000;
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp);
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx,
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static void tslog(int i915,
+		  const struct intel_execution_engine2 *e,
+		  uint32_t handle,
+		  uint64_t addr)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define CS_TIMESTAMP (base + 0x358)
+	enum { INC, MASK, ADDR };
+	uint32_t *timestamp_lo, *addr_lo;
+	uint32_t *map, *cs;
+
+	igt_require(base);
+
+	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+	cs = map + 512;
+
+	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_TIMESTAMP;
+	timestamp_lo = cs;
+	*cs++ = addr;
+	*cs++ = addr >> 32;
+
+	/* Load the address + inc & mask variables */
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR);
+	addr_lo = cs;
+	*cs++ = addr;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(ADDR) + 4;
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC);
+	*cs++ = 4;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(INC) + 4;
+	*cs++ = 0;
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK);
+	*cs++ = 0xfffff7ff;
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(MASK) + 4;
+	*cs++ = 0xffffffff;
+
+	/* Increment the [ring] address for saving CS_TIMESTAMP */
+	*cs++ = MI_MATH(8);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
+	*cs++ = MI_MATH_AND;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
+
+	/* Rewrite the batch buffer for the next execution */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(timestamp_lo);
+	*cs++ = addr >> 32;
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(ADDR);
+	*cs++ = addr + offset_in_page(addr_lo);
+	*cs++ = addr >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	munmap(map, 4096);
+}
+
+static struct drm_i915_gem_exec_object2
+tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+
+	obj.offset = obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+
+	tslog(i915, e, obj.handle, obj.offset);
+
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static int cmp_u32(const void *A, const void *B)
+{
+	const uint32_t *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	const int gen = intel_gen(intel_get_drm_devid(i915));
+
+	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
+		return false; /* looks fubar */
+
+	return true;
+}
+
+static struct intel_execution_engine2
+pick_random_engine(int i915, const struct intel_execution_engine2 *not)
+{
+	const struct intel_execution_engine2 *e;
+	unsigned int count = 0;
+
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		count++;
+	}
+	if (!count)
+		return *not;
+
+	count = rand() % count;
+	__for_each_physical_engine(i915, e) {
+		if (e->flags == not->flags)
+			continue;
+		if (!gem_class_has_mutable_submission(i915, e->class))
+			continue;
+		if (!count--)
+			break;
+	}
+
+	return *e;
+}
+
+static void fair_child(int i915, uint32_t ctx,
+		       const struct intel_execution_engine2 *e,
+		       uint64_t frame_ns,
+		       int timeline,
+		       uint32_t common,
+		       unsigned int flags,
+		       unsigned long *ctl,
+		       unsigned long *out)
+#define F_SYNC		(1 << 0)
+#define F_PACE		(1 << 1)
+#define F_FLOW		(1 << 2)
+#define F_HALF		(1 << 3)
+#define F_SOLO		(1 << 4)
+#define F_SPARE		(1 << 5)
+#define F_NEXT		(1 << 6)
+#define F_VIP		(1 << 7)
+#define F_RRUL		(1 << 8)
+#define F_SHARE		(1 << 9)
+#define F_PING		(1 << 10)
+#define F_THROTTLE	(1 << 11)
+#define F_ISOLATE	(1 << 12)
+{
+	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
+	struct drm_i915_gem_exec_object2 obj[4] = {
+		{},
+		{
+			.handle = common ?: gem_create(i915, 4096),
+		},
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
+	};
+	struct intel_execution_engine2 ping = *e;
+	int p_fence = -1, n_fence = -1;
+	unsigned long count = 0;
+	int n;
+
+	srandom(getpid());
+	if (flags & F_PING)
+		ping = pick_random_engine(i915, e);
+	obj[0] = tslog_create(i915, ctx, &ping);
+
+	while (!READ_ONCE(*ctl)) {
+		struct drm_i915_gem_execbuffer2 execbuf = {
+			.buffers_ptr = to_user_pointer(obj),
+			.buffer_count = 4,
+			.rsvd1 = ctx,
+			.rsvd2 = -1,
+			.flags = e->flags,
+		};
+
+		if (flags & F_FLOW) {
+			unsigned int seq;
+
+			seq = count;
+			if (flags & F_NEXT)
+				seq++;
+
+			execbuf.rsvd2 =
+				sw_sync_timeline_create_fence(timeline, seq);
+			execbuf.flags |= I915_EXEC_FENCE_IN;
+		}
+
+		execbuf.flags |= I915_EXEC_FENCE_OUT;
+		gem_execbuf_wr(i915, &execbuf);
+		n_fence = execbuf.rsvd2 >> 32;
+		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
+		for (n = 1; n < batches_per_frame; n++)
+			gem_execbuf(i915, &execbuf);
+		close(execbuf.rsvd2);
+
+		execbuf.buffer_count = 1;
+		execbuf.batch_start_offset = 2048;
+		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
+		execbuf.rsvd2 = n_fence;
+		gem_execbuf(i915, &execbuf);
+
+		if (flags & F_PACE && p_fence != -1) {
+			struct pollfd pfd = {
+				.fd = p_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+		close(p_fence);
+
+		if (flags & F_SYNC) {
+			struct pollfd pfd = {
+				.fd = n_fence,
+				.events = POLLIN,
+			};
+			poll(&pfd, 1, -1);
+		}
+
+		if (flags & F_THROTTLE)
+			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
+
+		igt_swap(obj[2], obj[3]);
+		igt_swap(p_fence, n_fence);
+		count++;
+	}
+	close(p_fence);
+
+	gem_close(i915, obj[3].handle);
+	gem_close(i915, obj[2].handle);
+	if (obj[1].handle != common)
+		gem_close(i915, obj[1].handle);
+
+	gem_sync(i915, obj[0].handle);
+	if (out) {
+		uint32_t *map;
+
+		map = gem_mmap__device_coherent(i915, obj[0].handle,
+						0, 4096, PROT_WRITE);
+		for (n = 1; n < min(count, 512); n++) {
+			igt_assert(map[n]);
+			map[n - 1] = map[n] - map[n - 1];
+		}
+		qsort(map, --n, sizeof(*map), cmp_u32);
+		*out = ticks_to_ns(i915, map[n / 2]);
+		munmap(map, 4096);
+	}
+	gem_close(i915, obj[0].handle);
+}
+
+static int cmp_ul(const void *A, const void *B)
+{
+	const unsigned long *a = A, *b = B;
+
+	if (*a < *b)
+		return -1;
+	else if (*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
+{
+	uint64_t cpu_time = 0;
+
+	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
+
+	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
+	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
+
+	return cpu_time;
+}
+
+static void timeline_advance(int timeline, int delay_ns)
+{
+	struct timespec tv = { .tv_nsec = delay_ns };
+	nanosleep(&tv, NULL);
+	sw_sync_timeline_inc(timeline, 1);
+}
+
+static void fairness(int i915,
+		     const struct intel_execution_engine2 *e,
+		     int timeout, unsigned int flags)
+{
+	const int frame_ns = 16666 * 1000;
+	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
+	unsigned long *result;
+	uint32_t common = 0;
+
+	igt_require(has_ctx_timestamp(i915, e));
+	igt_require(gem_class_has_mutable_submission(i915, e->class));
+
+	if (flags & F_SHARE)
+		common = gem_create(i915, 4095);
+
+	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
+		int timeline = sw_sync_timeline_create();
+		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
+		const int nchild = n - 1; /* odd for easy medians */
+		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
+		const int lo = nchild / 4;
+		const int hi = (3 * nchild + 3) / 4 - 1;
+		struct rusage old_usage, usage;
+		uint64_t cpu_time, d_time;
+		unsigned long vip = -1;
+		struct timespec tv;
+		struct igt_mean m;
+
+		if (flags & F_PING) {
+			struct intel_execution_engine2 *ping;
+
+			__for_each_physical_engine(i915, ping) {
+				if (ping->flags == e->flags)
+					continue;
+
+				igt_fork(child, 1) {
+					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
+
+					fair_child(i915, ctx, ping,
+						   child_ns / 8,
+						   -1, common,
+						   F_SOLO | F_PACE | F_SHARE,
+						   &result[nchild],
+						   NULL);
+
+					gem_context_destroy(i915, ctx);
+				}
+			}
+		}
+
+		memset(result, 0, (nchild + 1) * sizeof(result[0]));
+		getrusage(RUSAGE_CHILDREN, &old_usage);
+		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
+		igt_fork(child, nchild) {
+			uint32_t ctx;
+
+			if (flags & F_ISOLATE) {
+				int clone, dmabuf = -1;
+
+				if (common)
+					dmabuf = prime_handle_to_fd(i915, common);
+
+				clone = gem_reopen_driver(i915);
+				gem_context_copy_engines(i915, 0, clone, 0);
+				i915 = clone;
+
+				if (dmabuf != -1)
+					common = prime_fd_to_handle(i915, dmabuf);
+			}
+
+			ctx = gem_context_clone_with_engines(i915, 0);
+
+			if (flags & F_VIP && child == 0) {
+				gem_context_set_priority(i915, ctx, MAX_PRIO);
+				flags |= F_FLOW;
+			}
+			if (flags & F_RRUL && child == 0)
+				flags |= F_SOLO | F_FLOW | F_SYNC;
+
+			fair_child(i915, ctx, e, child_ns,
+				   timeline, common, flags,
+				   &result[nchild],
+				   &result[child]);
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		while (nfences--)
+			timeline_advance(timeline, fence_ns);
+
+		result[nchild] = 1;
+		for (int child = 0; child < nchild; child++) {
+			while (!READ_ONCE(result[child]))
+				timeline_advance(timeline, fence_ns);
+		}
+
+		igt_waitchildren();
+		close(timeline);
+
+		/* Are we running out of CPU time, and fail to submit frames? */
+		d_time = igt_nsec_elapsed(&tv);
+		getrusage(RUSAGE_CHILDREN, &usage);
+		cpu_time = d_cpu_time(&usage, &old_usage);
+		if (10 * cpu_time > 9 * d_time) {
+			if (nchild > 7)
+				break;
+
+			igt_skip_on_f(10 * cpu_time > 9 * d_time,
+				      "%.0f%% CPU usage, presuming capacity exceeded\n",
+				      100. * cpu_time / d_time);
+		}
+
+		igt_mean_init(&m);
+		for (int child = 0; child < nchild; child++)
+			igt_mean_add(&m, result[child]);
+
+		if (flags & (F_VIP | F_RRUL))
+			vip = result[0];
+
+		qsort(result, nchild, sizeof(*result), cmp_ul);
+		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
+			 nchild,
+			 1e-6 * result[0],  1e-6 * result[nchild - 1],
+			 1e-6 * result[lo], 1e-6 * result[hi],
+			 1e-6 * result[nchild / 2],
+			 1e-6 * igt_mean_get(&m),
+			 1e-6 * sqrt(igt_mean_get_variance(&m)));
+
+		if (vip != -1) {
+			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
+			igt_assert(4 * vip > 3 * fence_ns &&
+				   3 * vip < 4 * fence_ns);
+		}
+
+		/* May be slowed due to sheer volume of context switches */
+		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
+			       igt_mean_get(&m) < 3 * fence_ns);
+
+		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
+			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
+
+		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);
+	}
+
+	munmap(result, 4096);
+	if (common)
+		gem_close(i915, common);
+}
+
+static void test_fairness(int i915, int timeout)
+{
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} fair[] = {
+		/*
+		 * none - maximal greed in each client
+		 *
+		 * Push as many frames from each client as fast as possible
+		 */
+		{ "none",       0 },
+		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
+		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
+		{ "none-share", F_SHARE }, /* read from a common buffer */
+		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
+		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
+
+		/*
+		 * throttle - original per client throttling
+		 *
+		 * Used for front buffering rendering where there is no
+		 * extenal frame marker. Each client tries to only keep
+		 * 20ms of work submitted, though that measurement is
+		 * flawed...
+		 *
+		 * This is used by Xory to try and maintain some resembalance
+		 * of input/output consistency when being feed a continuous
+		 * stream of X11 draw requests straight into scanout, where
+		 * the clients may submit the work faster than can be drawn.
+		 *
+		 * Throttling tracks requests per-file (and assumes that
+		 * all requests are in submission order across the whole file),
+		 * so we split each child to its own fd.
+		 */
+		{ "throttle",       F_THROTTLE | F_ISOLATE },
+		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
+		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
+		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
+		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
+
+		/*
+		 * pace - mesa "submit double buffering"
+		 *
+		 * Submit a frame, wait for previous frame to start. This
+		 * prevents each client from getting too far ahead of its
+		 * rendering, maintaining a consistent input/output latency.
+		 */
+		{ "pace",       F_PACE },
+		{ "pace-solo",  F_PACE | F_SOLO},
+		{ "pace-share", F_PACE | F_SHARE},
+		{ "pace-ping",  F_PACE | F_SHARE | F_PING},
+
+		/* sync - only submit a frame at a time */
+		{ "sync",      F_SYNC },
+		{ "sync-vip",  F_SYNC | F_VIP },
+		{ "sync-solo", F_SYNC | F_SOLO },
+
+		/* flow - synchronise execution against the clock (vblank) */
+		{ "flow",       F_PACE | F_FLOW },
+		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
+		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
+
+		/* next - submit ahead of the clock (vblank double buffering) */
+		{ "next",       F_PACE | F_FLOW | F_NEXT },
+		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
+		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
+
+		/* spare - underutilise by a single client timeslice */
+		{ "spare", F_PACE | F_FLOW | F_SPARE },
+
+		/* half - run at half pace (submit 16ms of work every 32ms) */
+		{ "half",  F_PACE | F_FLOW | F_HALF },
+
+		{}
+	};
+
+	for (typeof(*fair) *f = fair; f->name; f++) {
+		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
+			const struct intel_execution_engine2 *e;
+
+			igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
+
+			__for_each_physical_engine(i915, e) {
+				if (!gem_class_can_store_dword(i915, e->class))
+					continue;
+
+				igt_dynamic_f("%s", e->name)
+					fairness(i915, e, timeout, f->flags);
+			}
+		}
+	}
+}
+
+static uint32_t read_ctx_timestamp(int i915,
+				   uint32_t ctx,
+				   const struct intel_execution_engine2 *e)
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = gem_create(i915, 4096),
+		.offset = 32 << 20,
+		.relocs_ptr = to_user_pointer(&reloc),
+		.relocation_count = 1,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+		.rsvd1 = ctx,
+	};
+#define RUNTIME (base + 0x3a8)
+	uint32_t *map, *cs;
+	uint32_t ts;
+
+	igt_require(base);
+
+	cs = map = gem_mmap__device_coherent(i915, obj.handle,
+					     0, 4096, PROT_WRITE);
+
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = RUNTIME;
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj.handle;
+	reloc.presumed_offset = obj.offset;
+	reloc.offset = offset_in_page(cs);
+	reloc.delta = 4000;
+	*cs++ = obj.offset + 4000;
+	*cs++ = obj.offset >> 32;
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	ts = map[1000];
+	munmap(map, 4096);
+
+	return ts;
+}
+
+static void fairslice(int i915, const struct intel_execution_engine2 *e)
+{
+	igt_spin_t *spin[3];
+	uint32_t ctx[3];
+	uint32_t ts[3];
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		ctx[i] = gem_context_clone_with_engines(i915, 0);
+		spin[i] = igt_spin_new(i915, .ctx = ctx[i], .engine = e->flags);
+	}
+
+	sleep(2); /* over the course of many timeslices */
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_assert(gem_bo_busy(i915, spin[i]->handle));
+		igt_spin_end(spin[i]);
+
+		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
+		igt_spin_free(i915, spin[i]);
+		gem_context_destroy(i915, ctx[i]);
+	}
+
+	qsort(ts, 3, sizeof(*ts), cmp_u32);
+	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
+		 1e-6 * ticks_to_ns(i915, ts[0]),
+		 1e-6 * ticks_to_ns(i915, ts[2]));
+
+	igt_assert(ts[0] && ts[2] > ts[0]);
+	igt_assert(4 * ts[0] > 3 * ts[2]);
+}
+
 #define test_each_engine(T, i915, e) \
 	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
 		igt_dynamic_f("%s", e->name)
@@ -2569,6 +3364,25 @@ igt_main
 		test_each_engine("lateslice", fd, e)
 			lateslice(fd, e->flags);
 
+		igt_subtest_group {
+			igt_fixture {
+				igt_require(gem_scheduler_has_semaphores(fd));
+				igt_require(gem_scheduler_has_preemption(fd));
+				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			}
+
+			test_each_engine("fairslice", fd, e)
+				fairslice(fd, e);
+
+			igt_subtest("fairslice-all")  {
+				__for_each_physical_engine(fd, e) {
+					igt_fork(child, 1)
+						fairslice(fd, e);
+				}
+				igt_waitchildren();
+			}
+		}
+
 		test_each_engine("submit-early-slice", fd, e)
 			submit_slice(fd, e, EARLY_SUBMIT);
 		test_each_engine("submit-golden-slice", fd, e)
@@ -2597,6 +3411,8 @@ igt_main
 		test_each_engine_store("promotion", fd, e)
 			promotion(fd, e->flags);
 
+		test_fairness(fd, 2);
+
 		igt_subtest_group {
 			igt_fixture {
 				igt_require(gem_scheduler_has_preemption(fd));