
[i-g-t,4/5] tests/perf_pmu: Add tests for engine queued/runnable/running stats

Message ID 20180319182205.29454-5-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Tvrtko Ursulin March 19, 2018, 6:22 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Simple tests to check reported queue depths are correct.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tests/perf_pmu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)

Comments

Chris Wilson March 19, 2018, 8:58 p.m. UTC | #1
Quoting Tvrtko Ursulin (2018-03-19 18:22:04)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Simple tests to check reported queue depths are correct.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  tests/perf_pmu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 224 insertions(+)
> 
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 469b9becdbac..206c18960b7b 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -966,6 +966,196 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
>         assert_within_epsilon(val[1], perf_slept[1], tolerance);
>  }
>  
> +static double calc_queued(uint64_t d_val, uint64_t d_ns)
> +{
> +       return (double)d_val * 1e9 / I915_SAMPLE_QUEUED_DIVISOR / d_ns;
> +}
> +
> +static void
> +queued(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       const unsigned long engine = e2ring(gem_fd, e);
> +       const unsigned int max_rq = 10;
> +       double queued[max_rq + 1];
> +       uint32_t bo[max_rq + 1];
> +       unsigned int n, i;
> +       uint64_t val[2];
> +       uint64_t ts[2];
> +       int fd;

igt_require_sw_sync();

I guess we should do igt_require_cork(CORK_SYNC_FD) or something like
that.
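
For reference, a rough sketch of what such a helper could look like (igt_require_cork() is hypothetical, and the per-type preconditions are assumptions):

static void igt_require_cork(enum igt_cork_type type)
{
	switch (type) {
	case CORK_SYNC_FD:
		/* sync_fd corks are built on sw_sync timelines. */
		igt_require_sw_sync();
		break;
	case CORK_VGEM_HANDLE: {
		/* vgem corks need the vgem driver to be present. */
		int vgem = __drm_open_driver(DRIVER_VGEM);

		igt_require(vgem >= 0);
		close(vgem);
		break;
	}
	}
}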

> +
> +       memset(queued, 0, sizeof(queued));
> +       memset(bo, 0, sizeof(bo));
> +
> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
> +
> +       for (n = 0; n <= max_rq; n++) {
> +               int fence = -1;
> +               struct igt_cork cork = { .fd = fence, .type = CORK_SYNC_FD };

IGT_CORK_FENCE(cork); if you prefer
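
i.e. something along these lines at the top of the loop body (sketch, assuming the IGT_CORK_FENCE() convenience initialiser from igt_dummyload.h declares a CORK_SYNC_FD cork):

IGT_CORK_FENCE(cork); /* struct igt_cork pre-initialised for CORK_SYNC_FD */
int fence = -1;

gem_quiescent_gpu(gem_fd);

if (n)
	fence = igt_cork_plug(&cork, -1);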

> +
> +               gem_quiescent_gpu(gem_fd);
> +
> +               if (n)
> +                       fence = igt_cork_plug(&cork, -1);
> +
> +               for (i = 0; i < n; i++) {
> +                       struct drm_i915_gem_exec_object2 obj = { };
> +                       struct drm_i915_gem_execbuffer2 eb = { };
> +
> +                       if (!bo[i]) {
> +                               const uint32_t bbe = MI_BATCH_BUFFER_END;
> +
> +                               bo[i] = gem_create(gem_fd, 4096);
> +                               gem_write(gem_fd, bo[i], 4092, &bbe,
> +                                         sizeof(bbe));
> +                       }
> +
> +                       obj.handle = bo[i];

Looks like you can use just the one handle multiple times?
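
Something like this at the top of the function, with obj.handle = bo in every iteration (sketch):

/* One reusable dummy batch instead of the bo[] array. */
const uint32_t bbe = MI_BATCH_BUFFER_END;
uint32_t bo = gem_create(gem_fd, 4096);

gem_write(gem_fd, bo, 4092, &bbe, sizeof(bbe));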

> +
> +                       eb.buffer_count = 1;
> +                       eb.buffers_ptr = to_user_pointer(&obj);
> +
> +                       eb.flags = engine | I915_EXEC_FENCE_IN;
> +                       eb.rsvd2 = fence;

You do however also want to check with one context per execbuf.

if (flags & CONTEXTS)
	eb.rsvd1 = gem_context_create(fd);
> +
> +                       gem_execbuf(gem_fd, &eb);

if (flags & CONTEXTS)
	gem_context_destroy(fd, eb.rsvd1);
	eb.rsvd1 = gem_context_create();
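
Putting the two together, the inner loop could look roughly like this (sketch only; CONTEXTS and flags are hypothetical additions for a new subtest variant, and it reuses the single bo from above):

for (i = 0; i < n; i++) {
	struct drm_i915_gem_exec_object2 obj = { .handle = bo };
	struct drm_i915_gem_execbuffer2 eb = {
		.buffer_count = 1,
		.buffers_ptr = to_user_pointer(&obj),
		.flags = engine | I915_EXEC_FENCE_IN,
		.rsvd2 = fence,
	};

	/* Optionally give each request its own context. */
	if (flags & CONTEXTS)
		eb.rsvd1 = gem_context_create(gem_fd);

	gem_execbuf(gem_fd, &eb);

	if (flags & CONTEXTS)
		gem_context_destroy(gem_fd, eb.rsvd1);
}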

> +               }
> +
> +               val[0] = __pmu_read_single(fd, &ts[0]);
> +               usleep(batch_duration_ns / 1000);
> +               val[1] = __pmu_read_single(fd, &ts[1]);
> +
> +               queued[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
> +               igt_info("n=%u queued=%.2f\n", n, queued[n]);
> +
> +               if (fence >= 0)
> +                       igt_cork_unplug(&cork);

Maybe we should just make this a no-op when used on an unplugged cork.
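
(Sketch of that idea; whether cork->fd is the right "plugged" indicator for both cork types is an assumption:)

void igt_cork_unplug(struct igt_cork *cork)
{
	/* Unplugging a cork that was never plugged, or has already been
	 * unplugged, becomes a no-op instead of tripping an assert.
	 */
	if (cork->fd == -1)
		return;

	/* existing per-type unplug paths follow as before */
}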

> +
> +               for (i = 0; i < n; i++)
> +                       gem_sync(gem_fd, bo[i]);
> +       }
> +
> +       close(fd);
> +
> +       for (i = 0; i < max_rq; i++) {
> +               if (bo[i])
> +                       gem_close(gem_fd, bo[i]);
> +       }
> +
> +       for (i = 0; i <= max_rq; i++)
> +               assert_within_epsilon(queued[i], i, tolerance);
> +}
> +
> +static void
> +runnable(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       const unsigned long engine = e2ring(gem_fd, e);
> +       const unsigned int max_rq = 10;
> +       igt_spin_t *spin[max_rq + 1];
> +       double runnable[max_rq + 1];
> +       uint32_t ctx[max_rq];
> +       unsigned int n, i;
> +       uint64_t val[2];
> +       uint64_t ts[2];
> +       int fd;
> +
> +       memset(runnable, 0, sizeof(runnable));
> +       memset(ctx, 0, sizeof(ctx));
> +
> +       fd = open_pmu(I915_PMU_ENGINE_RUNNABLE(e->class, e->instance));
> +
> +       for (n = 0; n <= max_rq; n++) {
> +               gem_quiescent_gpu(gem_fd);
> +
> +               for (i = 0; i < n; i++) {
> +                       if (!ctx[i])
> +                               ctx[i] = gem_context_create(gem_fd);
> +
> +                       if (i == 0)
> +                               spin[i] = __spin_poll(gem_fd, ctx[i], engine);
> +                       else
> +                               spin[i] = __igt_spin_batch_new(gem_fd, ctx[i],
> +                                                              engine, 0);
> +               }
> +
> +               if (n)
> +                       __spin_wait(gem_fd, spin[0]);
> +
> +               val[0] = __pmu_read_single(fd, &ts[0]);
> +               usleep(batch_duration_ns / 1000);
> +               val[1] = __pmu_read_single(fd, &ts[1]);
> +
> +               runnable[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
> +               igt_info("n=%u runnable=%.2f\n", n, runnable[n]);
> +
> +               for (i = 0; i < n; i++) {
> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
> +                       igt_spin_batch_free(gem_fd, spin[i]);
> +               }
> +       }
> +
> +       for (i = 0; i < max_rq; i++) {
> +               if (ctx[i])
> +                       gem_context_destroy(gem_fd, ctx[i]);

I would just create the contexts unconditionally.
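
Something like (sketch), dropping the !ctx[i] checks from the submission loop:

/* Create every context up front... */
for (i = 0; i < max_rq; i++)
	ctx[i] = gem_context_create(gem_fd);

/* ...and tear them all down unconditionally at the end. */
for (i = 0; i < max_rq; i++)
	gem_context_destroy(gem_fd, ctx[i]);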

> +       }
> +
> +       close(fd);
> +
> +       assert_within_epsilon(runnable[0], 0, tolerance);
> +       igt_assert(runnable[max_rq] > 0.0);
> +       assert_within_epsilon(runnable[max_rq] - runnable[max_rq - 1], 1,
> +                             tolerance);
> +}
> +
> +static void
> +running(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       const unsigned long engine = e2ring(gem_fd, e);
> +       const unsigned int max_rq = 10;
> +       igt_spin_t *spin[max_rq + 1];
> +       double running[max_rq + 1];
> +       unsigned int n, i;
> +       uint64_t val[2];
> +       uint64_t ts[2];
> +       int fd;
> +
> +       memset(running, 0, sizeof(running));
> +       memset(spin, 0, sizeof(spin));
> +
> +       fd = open_pmu(I915_PMU_ENGINE_RUNNING(e->class, e->instance));
> +
> +       for (n = 0; n <= max_rq; n++) {
> +               gem_quiescent_gpu(gem_fd);
> +
> +               for (i = 0; i < n; i++) {
> +                       if (i == 0)
> +                               spin[i] = __spin_poll(gem_fd, 0, engine);
> +                       else
> +                               spin[i] = __igt_spin_batch_new(gem_fd, 0,
> +                                                              engine, 0);
> +               }
> +
> +               if (n)
> +                       __spin_wait(gem_fd, spin[0]);

So create N requests on the same context so that running == N due to
lite-restore every time. I have some caveats that this relies on the
precise implementation, e.g. I don't think it will work for guc (using
execlists emulation with no lite-restore) for N > 2 or 8, or if we get
creative with execlists.

> +
> +               val[0] = __pmu_read_single(fd, &ts[0]);
> +               usleep(batch_duration_ns / 1000);
> +               val[1] = __pmu_read_single(fd, &ts[1]);
> +
> +               running[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
> +               igt_info("n=%u running=%.2f\n", n, running[n]);
> +
> +               for (i = 0; i < n; i++) {
> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
> +                       igt_spin_batch_free(gem_fd, spin[i]);
> +               }
> +       }
> +
> +       close(fd);
> +
> +       for (i = 0; i <= max_rq; i++)
> +               assert_within_epsilon(running[i], i, tolerance);
> +}

Ok, the tests look like they should be covering the counters.

Do we need to do an all-engines pass to check concurrent usage?
-Chris
Tvrtko Ursulin March 23, 2018, 10:08 a.m. UTC | #2
On 19/03/2018 20:58, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-19 18:22:04)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Simple tests to check reported queue depths are correct.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   tests/perf_pmu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 224 insertions(+)
>>
>> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
>> index 469b9becdbac..206c18960b7b 100644
>> --- a/tests/perf_pmu.c
>> +++ b/tests/perf_pmu.c
>> @@ -966,6 +966,196 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
>>          assert_within_epsilon(val[1], perf_slept[1], tolerance);
>>   }
>>   
>> +static double calc_queued(uint64_t d_val, uint64_t d_ns)
>> +{
>> +       return (double)d_val * 1e9 / I915_SAMPLE_QUEUED_DIVISOR / d_ns;
>> +}
>> +
>> +static void
>> +queued(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       double queued[max_rq + 1];
>> +       uint32_t bo[max_rq + 1];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
> 
> igt_require_sw_sync();
> 
> I guess we should do igt_require_cork(CORK_SYNC_FD) or something like
> that.
> 
>> +
>> +       memset(queued, 0, sizeof(queued));
>> +       memset(bo, 0, sizeof(bo));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               int fence = -1;
>> +               struct igt_cork cork = { .fd = fence, .type = CORK_SYNC_FD };
> 
> IGT_CORK_FENCE(cork); if you prefer

Missed it.

> 
>> +
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               if (n)
>> +                       fence = igt_cork_plug(&cork, -1);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       struct drm_i915_gem_exec_object2 obj = { };
>> +                       struct drm_i915_gem_execbuffer2 eb = { };
>> +
>> +                       if (!bo[i]) {
>> +                               const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +
>> +                               bo[i] = gem_create(gem_fd, 4096);
>> +                               gem_write(gem_fd, bo[i], 4092, &bbe,
>> +                                         sizeof(bbe));
>> +                       }
>> +
>> +                       obj.handle = bo[i];
> 
> Looks like you can use just the one handle multiple times?

Hm yeah.

> 
>> +
>> +                       eb.buffer_count = 1;
>> +                       eb.buffers_ptr = to_user_pointer(&obj);
>> +
>> +                       eb.flags = engine | I915_EXEC_FENCE_IN;
>> +                       eb.rsvd2 = fence;
> 
> You do however also want to check with one context per execbuf.

Ok.

> 
> if (flags & CONTEXTS)
> 	eb.rsvd1 = gem_context_create(fd);
>> +
>> +                       gem_execbuf(gem_fd, &eb);
> 
> if (flags & CONTEXTS)
> 	gem_context_destroy(fd, eb.rsvd1);
> 	eb.rsvd1 = gem_context_create();
> 
>> +               }
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               queued[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u queued=%.2f\n", n, queued[n]);
>> +
>> +               if (fence >= 0)
>> +                       igt_cork_unplug(&cork);
> 
> Maybe we should just make this a no-op when used on an unplugged cork.

I don't know really; there's an assert in there and I didn't feel like
evaluating all the callers.

> 
>> +
>> +               for (i = 0; i < n; i++)
>> +                       gem_sync(gem_fd, bo[i]);
>> +       }
>> +
>> +       close(fd);
>> +
>> +       for (i = 0; i < max_rq; i++) {
>> +               if (bo[i])
>> +                       gem_close(gem_fd, bo[i]);
>> +       }
>> +
>> +       for (i = 0; i <= max_rq; i++)
>> +               assert_within_epsilon(queued[i], i, tolerance);
>> +}
>> +
>> +static void
>> +runnable(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       igt_spin_t *spin[max_rq + 1];
>> +       double runnable[max_rq + 1];
>> +       uint32_t ctx[max_rq];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
>> +
>> +       memset(runnable, 0, sizeof(runnable));
>> +       memset(ctx, 0, sizeof(ctx));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_RUNNABLE(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       if (!ctx[i])
>> +                               ctx[i] = gem_context_create(gem_fd);
>> +
>> +                       if (i == 0)
>> +                               spin[i] = __spin_poll(gem_fd, ctx[i], engine);
>> +                       else
>> +                               spin[i] = __igt_spin_batch_new(gem_fd, ctx[i],
>> +                                                              engine, 0);
>> +               }
>> +
>> +               if (n)
>> +                       __spin_wait(gem_fd, spin[0]);
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               runnable[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u runnable=%.2f\n", n, runnable[n]);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
>> +                       igt_spin_batch_free(gem_fd, spin[i]);
>> +               }
>> +       }
>> +
>> +       for (i = 0; i < max_rq; i++) {
>> +               if (ctx[i])
>> +                       gem_context_destroy(gem_fd, ctx[i]);
> 
> I would just create the contexts unconditionally.

Can do.

> 
>> +       }
>> +
>> +       close(fd);
>> +
>> +       assert_within_epsilon(runnable[0], 0, tolerance);
>> +       igt_assert(runnable[max_rq] > 0.0);
>> +       assert_within_epsilon(runnable[max_rq] - runnable[max_rq - 1], 1,
>> +                             tolerance);
>> +}
>> +
>> +static void
>> +running(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       igt_spin_t *spin[max_rq + 1];
>> +       double running[max_rq + 1];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
>> +
>> +       memset(running, 0, sizeof(running));
>> +       memset(spin, 0, sizeof(spin));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_RUNNING(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       if (i == 0)
>> +                               spin[i] = __spin_poll(gem_fd, 0, engine);
>> +                       else
>> +                               spin[i] = __igt_spin_batch_new(gem_fd, 0,
>> +                                                              engine, 0);
>> +               }
>> +
>> +               if (n)
>> +                       __spin_wait(gem_fd, spin[0]);
> 
> So create N requests on the same context so that running == N due to
> lite-restore every time. I have some caveats that this relies on the
> precise implementation, e.g. I don't think it will work for guc (using
> execlists emulation with no lite-restore) for N > 2 or 8, or if we get
> creative with execlists.

Yep, I think it doesn't work with GuC. Well, the assert would need to be
different at a minimum.

For N > 2 and execlists I think it works since it only needs port 0.

The runnable subtest, on the other hand, already tries to be flexible
towards different possible Ns. I guess for running I could downgrade the
asserts to just check running[N + 1] >= running[N] and running[1] > 0.
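
Roughly (just a sketch of the relaxed checks):

/* Idle must read (close to) zero, any work must read non-zero, and
 * more spinners must never report less running time.
 */
assert_within_epsilon(running[0], 0, tolerance);
igt_assert(running[1] > 0.0);
for (i = 0; i < max_rq; i++)
	igt_assert(running[i + 1] >= running[i]);
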

> 
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               running[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u running=%.2f\n", n, running[n]);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
>> +                       igt_spin_batch_free(gem_fd, spin[i]);
>> +               }
>> +       }
>> +
>> +       close(fd);
>> +
>> +       for (i = 0; i <= max_rq; i++)
>> +               assert_within_epsilon(running[i], i, tolerance);
>> +}
> 
> Ok, the tests look like they should be covering the counters.
> 
> Do we need to do an all-engines pass to check concurrent usage?

Depends on how grey your approach is, between white-box and black-box testing.

But what is definitely needed are some tests involving hangs, resets and
preemption, since I am pretty sure a bug has sneaked in somewhere in those
areas.

Regards,

Tvrtko

Patch

diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 469b9becdbac..206c18960b7b 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -966,6 +966,196 @@  multi_client(int gem_fd, const struct intel_execution_engine2 *e)
 	assert_within_epsilon(val[1], perf_slept[1], tolerance);
 }
 
+static double calc_queued(uint64_t d_val, uint64_t d_ns)
+{
+	return (double)d_val * 1e9 / I915_SAMPLE_QUEUED_DIVISOR / d_ns;
+}
+
+static void
+queued(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	const unsigned long engine = e2ring(gem_fd, e);
+	const unsigned int max_rq = 10;
+	double queued[max_rq + 1];
+	uint32_t bo[max_rq + 1];
+	unsigned int n, i;
+	uint64_t val[2];
+	uint64_t ts[2];
+	int fd;
+
+	memset(queued, 0, sizeof(queued));
+	memset(bo, 0, sizeof(bo));
+
+	fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
+
+	for (n = 0; n <= max_rq; n++) {
+		int fence = -1;
+		struct igt_cork cork = { .fd = fence, .type = CORK_SYNC_FD };
+
+		gem_quiescent_gpu(gem_fd);
+
+		if (n)
+			fence = igt_cork_plug(&cork, -1);
+
+		for (i = 0; i < n; i++) {
+			struct drm_i915_gem_exec_object2 obj = { };
+			struct drm_i915_gem_execbuffer2 eb = { };
+
+			if (!bo[i]) {
+				const uint32_t bbe = MI_BATCH_BUFFER_END;
+
+				bo[i] = gem_create(gem_fd, 4096);
+				gem_write(gem_fd, bo[i], 4092, &bbe,
+					  sizeof(bbe));
+			}
+
+			obj.handle = bo[i];
+
+			eb.buffer_count = 1;
+			eb.buffers_ptr = to_user_pointer(&obj);
+
+			eb.flags = engine | I915_EXEC_FENCE_IN;
+			eb.rsvd2 = fence;
+
+			gem_execbuf(gem_fd, &eb);
+		}
+
+		val[0] = __pmu_read_single(fd, &ts[0]);
+		usleep(batch_duration_ns / 1000);
+		val[1] = __pmu_read_single(fd, &ts[1]);
+
+		queued[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
+		igt_info("n=%u queued=%.2f\n", n, queued[n]);
+
+		if (fence >= 0)
+			igt_cork_unplug(&cork);
+
+		for (i = 0; i < n; i++)
+			gem_sync(gem_fd, bo[i]);
+	}
+
+	close(fd);
+
+	for (i = 0; i < max_rq; i++) {
+		if (bo[i])
+			gem_close(gem_fd, bo[i]);
+	}
+
+	for (i = 0; i <= max_rq; i++)
+		assert_within_epsilon(queued[i], i, tolerance);
+}
+
+static void
+runnable(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	const unsigned long engine = e2ring(gem_fd, e);
+	const unsigned int max_rq = 10;
+	igt_spin_t *spin[max_rq + 1];
+	double runnable[max_rq + 1];
+	uint32_t ctx[max_rq];
+	unsigned int n, i;
+	uint64_t val[2];
+	uint64_t ts[2];
+	int fd;
+
+	memset(runnable, 0, sizeof(runnable));
+	memset(ctx, 0, sizeof(ctx));
+
+	fd = open_pmu(I915_PMU_ENGINE_RUNNABLE(e->class, e->instance));
+
+	for (n = 0; n <= max_rq; n++) {
+		gem_quiescent_gpu(gem_fd);
+
+		for (i = 0; i < n; i++) {
+			if (!ctx[i])
+				ctx[i] = gem_context_create(gem_fd);
+
+			if (i == 0)
+				spin[i] = __spin_poll(gem_fd, ctx[i], engine);
+			else
+				spin[i] = __igt_spin_batch_new(gem_fd, ctx[i],
+							       engine, 0);
+		}
+
+		if (n)
+			__spin_wait(gem_fd, spin[0]);
+
+		val[0] = __pmu_read_single(fd, &ts[0]);
+		usleep(batch_duration_ns / 1000);
+		val[1] = __pmu_read_single(fd, &ts[1]);
+
+		runnable[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
+		igt_info("n=%u runnable=%.2f\n", n, runnable[n]);
+
+		for (i = 0; i < n; i++) {
+			end_spin(gem_fd, spin[i], FLAG_SYNC);
+			igt_spin_batch_free(gem_fd, spin[i]);
+		}
+	}
+
+	for (i = 0; i < max_rq; i++) {
+		if (ctx[i])
+			gem_context_destroy(gem_fd, ctx[i]);
+	}
+
+	close(fd);
+
+	assert_within_epsilon(runnable[0], 0, tolerance);
+	igt_assert(runnable[max_rq] > 0.0);
+	assert_within_epsilon(runnable[max_rq] - runnable[max_rq - 1], 1,
+			      tolerance);
+}
+
+static void
+running(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	const unsigned long engine = e2ring(gem_fd, e);
+	const unsigned int max_rq = 10;
+	igt_spin_t *spin[max_rq + 1];
+	double running[max_rq + 1];
+	unsigned int n, i;
+	uint64_t val[2];
+	uint64_t ts[2];
+	int fd;
+
+	memset(running, 0, sizeof(running));
+	memset(spin, 0, sizeof(spin));
+
+	fd = open_pmu(I915_PMU_ENGINE_RUNNING(e->class, e->instance));
+
+	for (n = 0; n <= max_rq; n++) {
+		gem_quiescent_gpu(gem_fd);
+
+		for (i = 0; i < n; i++) {
+			if (i == 0)
+				spin[i] = __spin_poll(gem_fd, 0, engine);
+			else
+				spin[i] = __igt_spin_batch_new(gem_fd, 0,
+							       engine, 0);
+		}
+
+		if (n)
+			__spin_wait(gem_fd, spin[0]);
+
+		val[0] = __pmu_read_single(fd, &ts[0]);
+		usleep(batch_duration_ns / 1000);
+		val[1] = __pmu_read_single(fd, &ts[1]);
+
+		running[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
+		igt_info("n=%u running=%.2f\n", n, running[n]);
+
+		for (i = 0; i < n; i++) {
+			end_spin(gem_fd, spin[i], FLAG_SYNC);
+			igt_spin_batch_free(gem_fd, spin[i]);
+		}
+	}
+
+	close(fd);
+
+	for (i = 0; i <= max_rq; i++)
+		assert_within_epsilon(running[i], i, tolerance);
+}
+
 /**
  * Tests that i915 PMU corectly errors out in invalid initialization.
  * i915 PMU is uncore PMU, thus:
@@ -1718,6 +1908,15 @@  igt_main
 		igt_subtest_f("init-sema-%s", e->name)
 			init(fd, e, I915_SAMPLE_SEMA);
 
+		igt_subtest_f("init-queued-%s", e->name)
+			init(fd, e, I915_SAMPLE_QUEUED);
+
+		igt_subtest_f("init-runnable-%s", e->name)
+			init(fd, e, I915_SAMPLE_RUNNABLE);
+
+		igt_subtest_f("init-running-%s", e->name)
+			init(fd, e, I915_SAMPLE_RUNNING);
+
 		igt_subtest_group {
 			igt_fixture {
 				gem_require_engine(fd, e->class, e->instance);
@@ -1823,6 +2022,24 @@  igt_main
 
 			igt_subtest_f("busy-hang-%s", e->name)
 				single(fd, e, TEST_BUSY | FLAG_HANG);
+
+			/**
+			 * Test that queued metric works.
+			 */
+			igt_subtest_f("queued-%s", e->name)
+				queued(fd, e);
+
+			/**
+			 * Test that runnable metric works.
+			 */
+			igt_subtest_f("runnable-%s", e->name)
+				runnable(fd, e);
+
+			/**
+			 * Test that running metric works.
+			 */
+			igt_subtest_f("running-%s", e->name)
+				running(fd, e);
 		}
 
 		/**
@@ -1915,6 +2132,13 @@  igt_main
 					      e->name)
 					single(fd, e,
 					       TEST_BUSY | TEST_TRAILING_IDLE);
+				igt_subtest_f("render-node-queued-%s", e->name)
+					queued(fd, e);
+				igt_subtest_f("render-node-runnable-%s",
+					      e->name)
+					runnable(fd, e);
+				igt_subtest_f("render-node-running-%s", e->name)
+					running(fd, e);
 			}
 		}