diff mbox

[i-g-t,5/5] tests/perf_pmu: Tests for i915 PMU API

Message ID 20170918113840.2684-6-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tvrtko Ursulin Sept. 18, 2017, 11:38 a.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

A bunch of tests for the new i915 PMU feature.

Parts of the code were initially sketched by Dmitry Rogozhkin.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 lib/igt_gt.c           |  23 +-
 lib/igt_gt.h           |   8 +
 tests/Makefile.sources |   1 +
 tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 738 insertions(+), 7 deletions(-)
 create mode 100644 tests/perf_pmu.c

Comments

Chris Wilson Sept. 18, 2017, 1:17 p.m. UTC | #1
Quoting Tvrtko Ursulin (2017-09-18 12:38:40)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> A bunch of tests for the new i915 PMU feature.
> 
> Parts of the code were initialy sketched by Dmitry Rogozhkin.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> ---
>  lib/igt_gt.c           |  23 +-
>  lib/igt_gt.h           |   8 +
>  tests/Makefile.sources |   1 +
>  tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 738 insertions(+), 7 deletions(-)
>  create mode 100644 tests/perf_pmu.c
> 
> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
> index b3f3b3809eee..102cc2841feb 100644
> --- a/lib/igt_gt.c
> +++ b/lib/igt_gt.c
> @@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)
>         return missed;
>  }
>  
> +enum drm_i915_gem_engine_class {
> +       I915_ENGINE_CLASS_OTHER = 0,
> +       I915_ENGINE_CLASS_RENDER = 1,
> +       I915_ENGINE_CLASS_COPY = 2,
> +       I915_ENGINE_CLASS_VIDEO = 3,
> +       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
> +       I915_ENGINE_CLASS_MAX /* non-ABI */
> +};
> +
>  const struct intel_execution_engine intel_execution_engines[] = {
> -       { "default", NULL, 0, 0 },
> -       { "render", "rcs0", I915_EXEC_RENDER, 0 },
> -       { "bsd", "vcs0", I915_EXEC_BSD, 0 },
> -       { "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
> -       { "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
> -       { "blt", "bcs0", I915_EXEC_BLT, 0 },
> -       { "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
> +       { "default", NULL, -1, -1, 0, 0 },
> +       { "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
> +       { "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
> +       { "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
> +       { "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
> +       { "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },
> +       { "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },
>         { NULL, 0, 0 }

I was anticipating a new struct for the explicit interface so that we
can easily phase out the old one with its aliasing.

>  };
>  
> diff --git a/lib/igt_gt.h b/lib/igt_gt.h
> index 2579cbd37be7..436041ce9cc0 100644
> --- a/lib/igt_gt.h
> +++ b/lib/igt_gt.h
> @@ -66,6 +66,8 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd);
>  extern const struct intel_execution_engine {
>         const char *name;
>         const char *full_name;
> +       int class;
> +       int instance;
>         unsigned exec_id;
>         unsigned flags;
>  } intel_execution_engines[];
> @@ -78,6 +80,12 @@ extern const struct intel_execution_engine {
>              e__++) \
>                 for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))
>  
> +#define for_each_engine_class_instance(fd__, e__) \
> +       for ((e__) = intel_execution_engines;\
> +            (e__)->name; \
> +            (e__)++) \
> +               for_if ((e__)->class > 0)
> +
>  bool gem_can_store_dword(int fd, unsigned int engine);
>  
>  #endif /* IGT_GT_H */
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index cf542df181a8..4bab6247151c 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -217,6 +217,7 @@ TESTS_progs = \
>         kms_vblank \
>         meta_test \
>         perf \
> +       perf_pmu \
>         pm_backlight \
>         pm_lpsp \
>         pm_rc6_residency \
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> new file mode 100644
> index 000000000000..2dbee586dacc
> --- /dev/null
> +++ b/tests/perf_pmu.c
> @@ -0,0 +1,713 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <errno.h>
> +#include <sys/stat.h>
> +#include <sys/time.h>
> +#include <sys/times.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <time.h>
> +#include <poll.h>
> +
> +#include "igt.h"
> +#include "igt_perf.h"
> +
> +IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
> +
> +const double tolerance = 0.02f;
> +const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;
> +
> +static void
> +init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)
> +{
> +       uint64_t config = __I915_PMU_ENGINE(e->class, e->instance, sample);
> +       int fd;
> +
> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

gem_require_ring()

> +
> +       fd = perf_i915_open(config);

Although the kernel interface is the authority here.

So the perf_i915_open() result should be the igt_require, followed by
igt_assert(has_ring);

> +       igt_assert(fd >= 0);
> +
> +       close(fd);
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +       uint64_t data[2];
> +       ssize_t len;
> +
> +       len = read(fd, data, sizeof(data));

Perf is a datagram api, right? A short read gives what you asked for and
discards the rest of the packet, iirc.

> +       igt_assert_eq(len, sizeof(data));
> +
> +       return data[0];
> +}
> +
> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
> +{
> +       uint64_t buf[2 + num];
> +       unsigned int i;
> +       ssize_t len;
> +
> +       len = read(fd, buf, sizeof(buf));
> +       igt_assert_eq(len, sizeof(buf));
> +       for (i = 0; i < num; i++)
> +               val[i] = buf[2 + i];
> +}
> +
> +#define assert_within_epsilon(x, ref, tolerance) \
> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
> +
> +static void
> +single(int gem_fd, const struct intel_execution_engine *e, bool busy)
> +{
> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> +       double ref = busy ? batch_duration_ns : 0.0f;
> +       igt_spin_t *spin;
> +       uint64_t val;
> +       int fd;
> +
> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> +
> +       if (busy) {
> +               spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       } else {
> +               usleep(batch_duration_ns / 1000);
> +       }
> +
> +       fd = perf_i915_open(config);
> +       igt_assert(fd >= 0);
> +
> +       if (busy)
> +               gem_sync(gem_fd, spin->handle);
> +
> +       val = pmu_read_single(fd);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       if (busy)
> +               igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +busy_check_all(int gem_fd, const struct intel_execution_engine *e,

busy_check_others

busy_check_all I would expect to be checking that all engines are
correctly recorded as being busy at the same time. And there should also
be permutations of (busy, idle, wait) across the engines.

> +              const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine *e_;
> +       uint64_t val[num_engines];
> +       int fd[2];
> +       igt_spin_t *spin;
> +       unsigned int busy_idx, i;
> +
> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e_) {
> +               if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
> +                       continue;
> +
> +               if (e == e_)
> +                       busy_idx = i;
> +
> +               fd[i == 0 ? 0 : 1] =
> +                       perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
> +                                                                 e_->instance),
> +                                            fd[0]);
> +               igt_assert(fd[0] > 0);
> +               igt_assert(i == 0 || fd[1] > 0);
> +               i++;
> +       }
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
> +       for (i = 0; i < num_engines; i++) {
> +               if (i == busy_idx)
> +                       continue;
> +               assert_within_epsilon(val[i], 0.0f, tolerance);
> +       }
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}

> +static void
> +no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)

This is just the sanity check half of the sema test.

No wait, no queued?

> +static void
> +multi_client(int gem_fd, const struct intel_execution_engine *e)
> +{
> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> +       igt_spin_t *spin;
> +       uint64_t val[2];
> +       int fd[2];
> +
> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       fd[0] = perf_i915_open(config);
> +       igt_assert(fd[0] >= 0);
> +
> +       usleep(batch_duration_ns / 4000);
> +
> +       fd[1] = perf_i915_open(config);
> +       igt_assert(fd[1] >= 0);
> +
> +       usleep(batch_duration_ns / 3000);
> +
> +       val[1] = pmu_read_single(fd[1]);
> +       close(fd[1]);
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       val[0] = pmu_read_single(fd[0]);
> +
> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}


> +static void cpu_hotplug(int gem_fd)
> +{
> +       struct timespec start, now;
> +       igt_spin_t *spin;
> +       uint64_t val, ref;
> +       int fd;
> +
> +       igt_require(cpu0_hotplug_support());
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
> +       igt_assert(fd >= 0);
> +
> +       clock_gettime(CLOCK_MONOTONIC, &start);
> +
> +       igt_fork(child, 1) {
> +               int cpu = 0;
> +
> +               for (;;) {
> +                       char name[128];
> +                       int cpufd;
> +
> +                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
> +                               cpu);
> +                       cpufd = open(name, O_WRONLY);
> +                       if (cpufd == -1) {
> +                               igt_assert(cpu > 0);
> +                               break;
> +                       }
> +                       igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +                       usleep(1000 * 1000);
> +
> +                       igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +                       close(cpufd);
> +                       cpu++;
> +               }
> +       }
> +
> +       igt_waitchildren();
> +
> +       igt_spin_batch_end(spin);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       clock_gettime(CLOCK_MONOTONIC, &now);

Did we ever export the igt routines for probing supported clocks?
In this case, this fits into igt_nsec_elapsed.

> +       val = pmu_read_single(fd);
> +
> +       ref = elapsed_ns(&start, &now);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +test_interrupts(int gem_fd)
> +{
> +       igt_spin_t *spin;
> +       uint64_t idle, busy, prev;
> +       int fd;
> +
> +       fd = perf_i915_open(I915_PMU_INTERRUPTS);
> +       igt_assert(fd >= 0);
> +
> +       gem_quiescent_gpu(gem_fd);
> +       sleep(2);
> +       prev = pmu_read_single(fd);
> +       usleep(batch_duration_ns / 1000);
> +       idle = pmu_read_single(fd);
> +
> +       igt_assert_eq(idle - prev, 0);
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       gem_sync(gem_fd, spin->handle);

There's no guaranteed interrupt here.

> +
> +       busy = pmu_read_single(fd);
> +       igt_assert(busy > idle);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +test_frequency(int gem_fd)
> +{
> +       igt_spin_t *spin;
> +       uint64_t idle[2], busy[2];
> +       int fd;
> +
> +       fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
> +       igt_assert(fd >= 0);

Ask the kernel if it is supported.

> +       igt_assert(perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd) >= 0);
> +
> +       gem_quiescent_gpu(gem_fd);
> +       usleep(batch_duration_ns / 1000);
> +
> +       pmu_read_multi(fd, 2, idle);
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd, 2, busy);
> +
> +       igt_assert(busy[0] > idle[0]);
> +       igt_assert(busy[1] > idle[1]);

Nothing guarantees busy[1] changes, it is hw/fw dependent.
busy[0] depends on user config.

> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +

> +static void
> +test_rc6p(int gem_fd)
> +{
> +       const unsigned int devid = intel_get_drm_devid(gem_fd);
> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> +       unsigned int num_pmu = 1;
> +       igt_spin_t *spin;
> +       uint64_t idle[3], busy[3], prev[3];
> +       unsigned int i;
> +       int fd, ret;
> +
> +       igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));

Ask the kernel. (Applies equally to rc6, rc6p).

No rc6pp testing?

> +
> +       fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
> +       igt_assert(fd >= 0);
> +
> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> +       if (ret > 0) {
> +               num_pmu++;
> +               ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> +               if (ret > 0)
> +                       num_pmu++;
> +       }
> +
> +       gem_quiescent_gpu(gem_fd);
> +       sleep(2);
> +
> +       pmu_read_multi(fd, num_pmu, prev);
> +       usleep(duration_ns / 1000);
> +       pmu_read_multi(fd, num_pmu, idle);
> +
> +       for (i = 0; i < num_pmu; i++)
> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
> +                                     tolerance);
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       igt_spin_batch_set_timeout(spin, duration_ns);

Are we sure the GPU isn't allowed to sleep? We expect i915_user_forcewake
to keep the GPU out of rc6.

> +igt_main
> +{
> +       const unsigned int num_other_metrics =
> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
> +       unsigned int num_engines = 0;
> +       int fd = -1;
> +       const struct intel_execution_engine *e;
> +       unsigned int i;
> +
> +       igt_fixture {
> +               fd = drm_open_driver_master(DRIVER_INTEL);
> +
> +               igt_require_gem(fd);
> +               igt_require(i915_type_id() > 0);
> +
> +               for_each_engine_class_instance(fd, e) {
> +                       if (gem_has_ring(fd, e->exec_id | e->flags))
> +                               num_engines++;
> +               }
> +       }
> +
> +       /**
> +        * Test invalid access via perf API is rejected.
> +        */

ARGH. No comments on the intentions of the code?
-Chris
Rogozhkin, Dmitry V Sept. 18, 2017, 9:18 p.m. UTC | #2
Did you try the tests on a system with 2 VDBOX engines? On my side, 2 tests
are failing on an SKL GT4e NUC:

(perf_pmu:5414) CRITICAL: Test assertion failure function
busy_check_all, file perf_pmu.c:164:
(perf_pmu:5414) CRITICAL: Failed assertion: (double)(val[i]) <= (1.0 +
tolerance) * (double)0.0f && (double)(val[i]) >= (1.0 - tolerance) *
(double)0.0f
(perf_pmu:5414) CRITICAL: 'val[i]' != '0.0f' (499984960.000000 not
within 2.000000% tolerance of 0.000000)
Subtest two-busy-check-all-bsd: FAIL (0.501s)

(perf_pmu:5414) CRITICAL: Test assertion failure function
two_busy_check_all, file perf_pmu.c:221:
(perf_pmu:5414) CRITICAL: Failed assertion: (double)(val[i]) <= (1.0 +
tolerance) * (double)0.0f && (double)(val[i]) >= (1.0 - tolerance) *
(double)0.0f
(perf_pmu:5414) CRITICAL: 'val[i]' != '0.0f' (499940146.000000 not
within 2.000000% tolerance of 0.000000)
Subtest two-busy-check-all-bsd1: FAIL (0.501s)

I am trying to speculate on the reasons below.


On Mon, 2017-09-18 at 12:38 +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

> 

> A bunch of tests for the new i915 PMU feature.

> 

> Parts of the code were initialy sketched by Dmitry Rogozhkin.

> 

> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

> Cc: Chris Wilson <chris@chris-wilson.co.uk>

> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>

> ---

>  lib/igt_gt.c           |  23 +-

>  lib/igt_gt.h           |   8 +

>  tests/Makefile.sources |   1 +

>  tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++

>  4 files changed, 738 insertions(+), 7 deletions(-)

>  create mode 100644 tests/perf_pmu.c

> 

> diff --git a/lib/igt_gt.c b/lib/igt_gt.c

> index b3f3b3809eee..102cc2841feb 100644

> --- a/lib/igt_gt.c

> +++ b/lib/igt_gt.c

> @@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)

>  	return missed;

>  }

>  

> +enum drm_i915_gem_engine_class {

> +	I915_ENGINE_CLASS_OTHER = 0,

> +	I915_ENGINE_CLASS_RENDER = 1,

> +	I915_ENGINE_CLASS_COPY = 2,

> +	I915_ENGINE_CLASS_VIDEO = 3,

> +	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,

> +	I915_ENGINE_CLASS_MAX /* non-ABI */

> +};

> +

>  const struct intel_execution_engine intel_execution_engines[] = {

> -	{ "default", NULL, 0, 0 },

> -	{ "render", "rcs0", I915_EXEC_RENDER, 0 },

> -	{ "bsd", "vcs0", I915_EXEC_BSD, 0 },

> -	{ "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },

> -	{ "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },

> -	{ "blt", "bcs0", I915_EXEC_BLT, 0 },

> -	{ "vebox", "vecs0", I915_EXEC_VEBOX, 0 },

> +	{ "default", NULL, -1, -1, 0, 0 },

> +	{ "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },

> +	{ "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },

With such a definition, we will probably detect "bsd" as an engine (as
well as "bsd1" and "bsd2"), right? As a result, we will run
two-busy-check-all-bsd for it and, according to the defined flags, we will
submit workloads to _both_ vcs0 and vcs1 evenly, following the i915 KMD
dispatching. Thus, two-busy-check-all-bsd will fail since it will
detect a load on 3 engines (rcs0, vcs0, vcs1) instead of 2.

I am not quite sure why two-busy-check-all-bsd1 fails as well on my
side, or rather, why it did not fail on your side too. The only
explanation I see is that the test thinks "bsd" and "bsd1" are separate
engines and thus counts them as 2. But that should fail on a single
VDBOX system as well... hm...

> +	{ "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },

> +	{ "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },

> +	{ "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },

> +	{ "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },

>  	{ NULL, 0, 0 }

>  };

>  

> diff --git a/lib/igt_gt.h b/lib/igt_gt.h

> index 2579cbd37be7..436041ce9cc0 100644

> --- a/lib/igt_gt.h

> +++ b/lib/igt_gt.h

> @@ -66,6 +66,8 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd);

>  extern const struct intel_execution_engine {

>  	const char *name;

>  	const char *full_name;

> +	int class;

> +	int instance;

>  	unsigned exec_id;

>  	unsigned flags;

>  } intel_execution_engines[];

> @@ -78,6 +80,12 @@ extern const struct intel_execution_engine {

>  	     e__++) \

>  		for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))

>  

> +#define for_each_engine_class_instance(fd__, e__) \

> +	for ((e__) = intel_execution_engines;\

> +	     (e__)->name; \

> +	     (e__)++) \

> +		for_if ((e__)->class > 0)

> +

>  bool gem_can_store_dword(int fd, unsigned int engine);

>  

>  #endif /* IGT_GT_H */

> diff --git a/tests/Makefile.sources b/tests/Makefile.sources

> index cf542df181a8..4bab6247151c 100644

> --- a/tests/Makefile.sources

> +++ b/tests/Makefile.sources

> @@ -217,6 +217,7 @@ TESTS_progs = \

>  	kms_vblank \

>  	meta_test \

>  	perf \

> +	perf_pmu \

>  	pm_backlight \

>  	pm_lpsp \

>  	pm_rc6_residency \

> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c

> new file mode 100644

> index 000000000000..2dbee586dacc

> --- /dev/null

> +++ b/tests/perf_pmu.c

> @@ -0,0 +1,713 @@

> +/*

> + * Copyright © 2017 Intel Corporation

> + *

> + * Permission is hereby granted, free of charge, to any person obtaining a

> + * copy of this software and associated documentation files (the "Software"),

> + * to deal in the Software without restriction, including without limitation

> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,

> + * and/or sell copies of the Software, and to permit persons to whom the

> + * Software is furnished to do so, subject to the following conditions:

> + *

> + * The above copyright notice and this permission notice (including the next

> + * paragraph) shall be included in all copies or substantial portions of the

> + * Software.

> + *

> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL

> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING

> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS

> + * IN THE SOFTWARE.

> + *

> + */

> +

> +#include <stdlib.h>

> +#include <stdio.h>

> +#include <string.h>

> +#include <fcntl.h>

> +#include <inttypes.h>

> +#include <errno.h>

> +#include <sys/stat.h>

> +#include <sys/time.h>

> +#include <sys/times.h>

> +#include <sys/types.h>

> +#include <dirent.h>

> +#include <time.h>

> +#include <poll.h>

> +

> +#include "igt.h"

> +#include "igt_perf.h"

> +

> +IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");

> +

> +const double tolerance = 0.02f;

> +const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;

> +

> +static void

> +init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)

> +{

> +	uint64_t config = __I915_PMU_ENGINE(e->class, e->instance, sample);

> +	int fd;

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +

> +	fd = perf_i915_open(config);

> +	igt_assert(fd >= 0);

> +

> +	close(fd);

> +}

> +

> +static uint64_t pmu_read_single(int fd)

> +{

> +	uint64_t data[2];

> +	ssize_t len;

> +

> +	len = read(fd, data, sizeof(data));

> +	igt_assert_eq(len, sizeof(data));

> +

> +	return data[0];

> +}

> +

> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)

> +{

> +	uint64_t buf[2 + num];

> +	unsigned int i;

> +	ssize_t len;

> +

> +	len = read(fd, buf, sizeof(buf));

> +	igt_assert_eq(len, sizeof(buf));

> +	for (i = 0; i < num; i++)

> +		val[i] = buf[2 + i];

> +}

> +

> +#define assert_within_epsilon(x, ref, tolerance) \

> +	igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \

> +		     (double)(x) >= (1.0 - tolerance) * (double)ref, \

> +		     "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\

> +		     #x, #ref, (double)x, tolerance * 100.0, (double)ref)

> +

> +static void

> +single(int gem_fd, const struct intel_execution_engine *e, bool busy)

> +{

> +	uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);

> +	double ref = busy ? batch_duration_ns : 0.0f;

> +	igt_spin_t *spin;

> +	uint64_t val;

> +	int fd;

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +

> +	if (busy) {

> +		spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);

> +		igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +	} else {

> +		usleep(batch_duration_ns / 1000);

> +	}

> +

> +	fd = perf_i915_open(config);

> +	igt_assert(fd >= 0);

> +

> +	if (busy)

> +		gem_sync(gem_fd, spin->handle);

> +

> +	val = pmu_read_single(fd);

> +

> +	assert_within_epsilon(val, ref, tolerance);

> +

> +	if (busy)

> +		igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +static void

> +busy_check_all(int gem_fd, const struct intel_execution_engine *e,

> +	       const unsigned int num_engines)

> +{

> +	const struct intel_execution_engine *e_;

> +	uint64_t val[num_engines];

> +	int fd[2];

> +	igt_spin_t *spin;

> +	unsigned int busy_idx, i;

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);

> +	igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +

> +	i = 0;

> +	fd[0] = -1;

> +	for_each_engine_class_instance(fd, e_) {

> +		if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))

> +			continue;

> +

> +		if (e == e_)

> +			busy_idx = i;

> +

> +		fd[i == 0 ? 0 : 1] =

> +			perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,

> +								  e_->instance),

> +					     fd[0]);

> +		igt_assert(fd[0] > 0);

> +		igt_assert(i == 0 || fd[1] > 0);

> +		i++;

> +	}

> +

> +	gem_sync(gem_fd, spin->handle);

> +

> +	pmu_read_multi(fd[0], num_engines, val);

> +

> +	assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);

> +	for (i = 0; i < num_engines; i++) {

> +		if (i == busy_idx)

> +			continue;

> +		assert_within_epsilon(val[i], 0.0f, tolerance);

> +	}

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd[0]);

> +}

> +

> +static void

> +two_busy_check_all(int gem_fd, const struct intel_execution_engine *e,

> +		   const unsigned int num_engines)

> +{

> +	const struct intel_execution_engine *e_;

> +	uint64_t val[num_engines];

> +	int fd[2];

> +	igt_spin_t *spin[2];

> +	unsigned int busy_idx[2], i;

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +	igt_assert(e->exec_id != 0 && e->exec_id != I915_EXEC_RENDER);

> +

> +	spin[0] = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);

> +	igt_spin_batch_set_timeout(spin[0], batch_duration_ns);

> +

> +	spin[1] = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	igt_spin_batch_set_timeout(spin[1], batch_duration_ns);

> +

> +	i = 0;

> +	fd[0] = -1;

> +	for_each_engine_class_instance(fd, e_) {

> +		unsigned int idx = i == 0 ? 0 : 1;

> +

> +		if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))

> +			continue;

> +

> +		if (e_->class == I915_ENGINE_CLASS_RENDER && e_->instance == 0)

> +			busy_idx[0] = i;

> +		else if (e == e_)

> +			busy_idx[1] = i;

> +

> +		fd[idx] =

> +			perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,

> +								  e_->instance),

> +					     fd[0]);

> +		igt_assert(fd[idx] > 0);

> +		i++;

> +	}

> +

> +	gem_sync(gem_fd, spin[0]->handle);

> +	gem_sync(gem_fd, spin[1]->handle);

> +

> +	pmu_read_multi(fd[0], num_engines, val);

> +

> +	for (i = 0; i < num_engines; i++) {

> +		if (i == busy_idx[0] || i == busy_idx[1])

> +			assert_within_epsilon(val[i], batch_duration_ns,

> +					      tolerance);

> +		else

> +			assert_within_epsilon(val[i], 0.0f, tolerance);

> +	}

> +

> +	igt_spin_batch_free(gem_fd, spin[0]);

> +	igt_spin_batch_free(gem_fd, spin[1]);

> +	close(fd[0]);

> +}

> +

> +static void

> +no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)

> +{

> +	igt_spin_t *spin;

> +	uint64_t val[2];

> +	int fd, fd2;

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +

> +	if (busy) {

> +		spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);

> +		igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +	} else {

> +		usleep(batch_duration_ns / 1000);

> +	}

> +

> +	fd = perf_i915_open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance),

> +				  -1);

> +	igt_assert(fd >= 0);

> +	fd2 = perf_i915_open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance),

> +				  fd);

> +

> +	pmu_read_multi(fd, 2, val);

> +

> +	assert_within_epsilon(val[0], 0.0f, tolerance);

> +	assert_within_epsilon(val[1], 0.0f, tolerance);

> +

> +	if (busy)

> +		igt_spin_batch_free(gem_fd, spin);

> +	close(fd2);

> +	close(fd);

> +}

> +

> +static void

> +multi_client(int gem_fd, const struct intel_execution_engine *e)

> +{

> +	uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);

> +	igt_spin_t *spin;

> +	uint64_t val[2];

> +	int fd[2];

> +

> +	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);

> +	igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +

> +	fd[0] = perf_i915_open(config);

> +	igt_assert(fd[0] >= 0);

> +

> +	usleep(batch_duration_ns / 4000);

> +

> +	fd[1] = perf_i915_open(config);

> +	igt_assert(fd[1] >= 0);

> +

> +	usleep(batch_duration_ns / 3000);

> +

> +	val[1] = pmu_read_single(fd[1]);

> +	close(fd[1]);

> +

> +	gem_sync(gem_fd, spin->handle);

> +

> +	val[0] = pmu_read_single(fd[0]);

> +

> +	assert_within_epsilon(val[0], batch_duration_ns, tolerance);

> +	assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd[0]);

> +}

> +

> +/**

> + * Tests that i915 PMU correctly errors out on invalid initialization.

> + * i915 PMU is uncore PMU, thus:

> + *  - sampling period is not supported

> + *  - pid > 0 is not supported since we can't count per-process (we count

> + *    per whole system)

> + *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0

> + */

> +static void invalid_init(void)

> +{

> +	struct perf_event_attr attr;

> +	int pid, cpu;

> +

> +#define ATTR_INIT() \

> +do { \

> +	memset(&attr, 0, sizeof (attr)); \

> +	attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \

> +	attr.type = i915_type_id(); \

> +	igt_assert(attr.type != 0); \

> +} while(0)

> +

> +	ATTR_INIT();

> +	attr.sample_period = 100;

> +	pid = -1;

> +	cpu = 0;

> +	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);

> +	igt_assert_eq(errno, EINVAL);

> +

> +	ATTR_INIT();

> +	pid = 0;

> +	cpu = 0;

> +	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);

> +	igt_assert_eq(errno, EINVAL);

> +

> +	ATTR_INIT();

> +	pid = -1;

> +	cpu = 1;

> +	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);

> +	igt_assert_eq(errno, ENODEV);

> +}

> +

> +static void init_other(unsigned int i, bool valid)

> +{

> +	int fd;

> +

> +	fd = perf_i915_open(__I915_PMU_OTHER(i));

> +	igt_require(!(fd < 0 && errno == ENODEV));

> +	if (valid) {

> +		igt_assert(fd >= 0);

> +	} else {

> +		igt_assert(fd < 0);

> +		return;

> +	}

> +

> +	close(fd);

> +}

> +

> +static void read_other(unsigned int i, bool valid)

> +{

> +	int fd;

> +

> +	fd = perf_i915_open(__I915_PMU_OTHER(i));

> +	igt_require(!(fd < 0 && errno == ENODEV));

> +	if (valid) {

> +		igt_assert(fd >= 0);

> +	} else {

> +		igt_assert(fd < 0);

> +		return;

> +	}

> +

> +	(void)pmu_read_single(fd);

> +

> +	close(fd);

> +}

> +

> +static bool cpu0_hotplug_support(void)

> +{

> +	int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);

> +

> +	close(fd);

> +

> +	return fd > 0;

> +}

> +

> +static uint64_t

> +elapsed_ns(const struct timespec *start, const struct timespec *end)

> +{

> +	return ((end->tv_sec - start->tv_sec) * 1e9 +

> +		(end->tv_nsec - start->tv_nsec));

> +}

> +

> +static void cpu_hotplug(int gem_fd)

> +{

> +	struct timespec start, now;

> +	igt_spin_t *spin;

> +	uint64_t val, ref;

> +	int fd;

> +

> +	igt_require(cpu0_hotplug_support());

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));

> +	igt_assert(fd >= 0);

> +

> +	clock_gettime(CLOCK_MONOTONIC, &start);

> +

> +	igt_fork(child, 1) {

> +		int cpu = 0;

> +

> +		for (;;) {

> +			char name[128];

> +			int cpufd;

> +

> +			sprintf(name, "/sys/devices/system/cpu/cpu%d/online",

> +				cpu);

> +			cpufd = open(name, O_WRONLY);

> +			if (cpufd == -1) {

> +				igt_assert(cpu > 0);

> +				break;

> +			}

> +			igt_assert_eq(write(cpufd, "0", 2), 2);

> +

> +			usleep(1000 * 1000);

> +

> +			igt_assert_eq(write(cpufd, "1", 2), 2);

> +

> +			close(cpufd);

> +			cpu++;

> +		}

> +	}

> +

> +	igt_waitchildren();

> +

> +	igt_spin_batch_end(spin);

> +	gem_sync(gem_fd, spin->handle);

> +

> +	clock_gettime(CLOCK_MONOTONIC, &now);

> +	val = pmu_read_single(fd);

> +

> +	ref = elapsed_ns(&start, &now);

> +

> +	assert_within_epsilon(val, ref, tolerance);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +static void

> +test_interrupts(int gem_fd)

> +{

> +	igt_spin_t *spin;

> +	uint64_t idle, busy, prev;

> +	int fd;

> +

> +	fd = perf_i915_open(I915_PMU_INTERRUPTS);

> +	igt_assert(fd >= 0);

> +

> +	gem_quiescent_gpu(gem_fd);

> +	sleep(2);

> +	prev = pmu_read_single(fd);

> +	usleep(batch_duration_ns / 1000);

> +	idle = pmu_read_single(fd);

> +

> +	igt_assert_eq(idle - prev, 0);

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +	gem_sync(gem_fd, spin->handle);

> +

> +	busy = pmu_read_single(fd);

> +	igt_assert(busy > idle);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +static void

> +test_frequency(int gem_fd)

> +{

> +	igt_spin_t *spin;

> +	uint64_t idle[2], busy[2];

> +	int fd;

> +

> +	fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);

> +	igt_assert(fd >= 0);

> +	igt_assert(perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd) >= 0);

> +

> +	gem_quiescent_gpu(gem_fd);

> +	usleep(batch_duration_ns / 1000);

> +

> +	pmu_read_multi(fd, 2, idle);

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	igt_spin_batch_set_timeout(spin, batch_duration_ns);

> +	gem_sync(gem_fd, spin->handle);

> +

> +	pmu_read_multi(fd, 2, busy);

> +

> +	igt_assert(busy[0] > idle[0]);

> +	igt_assert(busy[1] > idle[1]);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +static void

> +test_rc6(int gem_fd)

> +{

> +	int64_t duration_ns = 500 * 1000 * 1000;

> +	igt_spin_t *spin;

> +	uint64_t idle, busy, prev;

> +	int fd;

> +

> +	fd = perf_i915_open(I915_PMU_RC6_RESIDENCY);

> +	igt_assert(fd >= 0);

> +

> +	gem_quiescent_gpu(gem_fd);

> +	sleep(2);

> +

> +	prev = pmu_read_single(fd);

> +	usleep(duration_ns / 1000);

> +	idle = pmu_read_single(fd);

> +

> +	assert_within_epsilon(idle - prev, duration_ns, tolerance);

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	igt_spin_batch_set_timeout(spin, duration_ns);

> +

> +	prev = pmu_read_single(fd);

> +

> +	gem_sync(gem_fd, spin->handle);

> +

> +	busy = pmu_read_single(fd);

> +	assert_within_epsilon(busy - prev, 0.0, tolerance);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +static void

> +test_rc6p(int gem_fd)

> +{

> +	const unsigned int devid = intel_get_drm_devid(gem_fd);

> +	int64_t duration_ns = 2 * 1000 * 1000 * 1000;

> +	unsigned int num_pmu = 1;

> +	igt_spin_t *spin;

> +	uint64_t idle[3], busy[3], prev[3];

> +	unsigned int i;

> +	int fd, ret;

> +

> +	igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));

> +

> +	fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);

> +	igt_assert(fd >= 0);

> +

> +	ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);

> +	if (ret > 0) {

> +		num_pmu++;

> +		ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);

> +		if (ret > 0)

> +			num_pmu++;

> +	}

> +

> +	gem_quiescent_gpu(gem_fd);

> +	sleep(2);

> +

> +	pmu_read_multi(fd, num_pmu, prev);

> +	usleep(duration_ns / 1000);

> +	pmu_read_multi(fd, num_pmu, idle);

> +

> +	for (i = 0; i < num_pmu; i++)

> +		assert_within_epsilon(idle[i] - prev[i], duration_ns,

> +				      tolerance);

> +

> +	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);

> +	igt_spin_batch_set_timeout(spin, duration_ns);

> +

> +	pmu_read_multi(fd, num_pmu, prev);

> +

> +	gem_sync(gem_fd, spin->handle);

> +

> +	pmu_read_multi(fd, num_pmu, busy);

> +

> +	for (i = 0; i < num_pmu; i++)

> +		assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);

> +

> +	igt_spin_batch_free(gem_fd, spin);

> +	close(fd);

> +}

> +

> +igt_main

> +{

> +	const unsigned int num_other_metrics =

> +				I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;

> +	unsigned int num_engines = 0;

> +	int fd = -1;

> +	const struct intel_execution_engine *e;

> +	unsigned int i;

> +

> +	igt_fixture {

> +		fd = drm_open_driver_master(DRIVER_INTEL);

> +

> +		igt_require_gem(fd);

> +		igt_require(i915_type_id() > 0);

> +

> +		for_each_engine_class_instance(fd, e) {

> +			if (gem_has_ring(fd, e->exec_id | e->flags))

> +				num_engines++;

> +		}

> +	}

> +

> +	/**

> +	 * Test invalid access via perf API is rejected.

> +	 */

> +	igt_subtest("invalid-init")

> +		invalid_init();

> +

> +	for_each_engine_class_instance(fd, e) {

> +		/**

> +		 * Test that a single engine metric can be initialized.

> +		 */

> +		igt_subtest_f("init-busy-%s", e->name)

> +			init(fd, e, I915_SAMPLE_BUSY);

> +

> +		igt_subtest_f("init-wait-%s", e->name)

> +			init(fd, e, I915_SAMPLE_WAIT);

> +

> +		igt_subtest_f("init-sema-%s", e->name)

> +			init(fd, e, I915_SAMPLE_SEMA);

> +

> +		/**

> +		 * Test that engines show no load when idle.

> +		 */

> +		igt_subtest_f("idle-%s", e->name)

> +			single(fd, e, false);

> +

> +		/**

> +		 * Test that a single engine reports load correctly.

> +		 */

> +		igt_subtest_f("busy-%s", e->name)

> +			single(fd, e, true);

> +

> +		/**

> +		 * Test that when one engine is loaded others report no load.

> +		 */

> +		igt_subtest_f("busy-check-all-%s", e->name)

> +			busy_check_all(fd, e, num_engines);

> +

> +		/**

> +		 * Test that when two engines are loaded others report no load.

> +		 */

> +		if (!(e->class == I915_ENGINE_CLASS_RENDER && e->instance == 0))

> +			igt_subtest_f("two-busy-check-all-%s", e->name)

> +				two_busy_check_all(fd, e, num_engines);

> +

> +		/**

> +		 * Test that semaphore counters report no activity on idle

> +		 * engines.

> +		 */

> +		igt_subtest_f("idle-no-semaphores-%s", e->name)

> +			no_sema(fd, e, false);

> +

> +		igt_subtest_f("busy-no-semaphores-%s", e->name)

> +			no_sema(fd, e, true);

> +

> +		/**

> +		 * Check that two perf clients do not influence each other's

> +		 * observations.

> +		 */

> +		igt_subtest_f("multi-client-%s", e->name)

> +			multi_client(fd, e);

> +	}

> +

> +	/**

> +	 * Test that non-engine counters can be initialized and read. Apart

> +	 * from the invalid metric which should fail.

> +	 */

> +	for (i = 0; i < num_other_metrics + 1; i++) {

> +		igt_subtest_f("other-init-%u", i)

> +			init_other(i, i < num_other_metrics);

> +

> +		igt_subtest_f("other-read-%u", i)

> +			read_other(i, i < num_other_metrics);

> +	}

> +

> +	/**

> +	 * Test counters are not affected by CPU offline/online events.

> +	 */

> +	igt_subtest("cpu-hotplug")

> +		cpu_hotplug(fd);

> +

> +	/**

> +	 * Test GPU frequency.

> +	 */

> +	igt_subtest("frequency")

> +		test_frequency(fd);

> +

> +	/**

> +	 * Test interrupt count reporting.

> +	 */

> +	igt_subtest("interrupts")

> +		test_interrupts(fd);

> +

> +	/**

> +	 * Test RC6 residency reporting.

> +	 */

> +	igt_subtest("rc6")

> +		test_rc6(fd);

> +

> +	/**

> +	 * Test RC6p residency reporting.

> +	 */

> +	igt_subtest("rc6p")

> +		test_rc6p(fd);

> +}
Tvrtko Ursulin Sept. 19, 2017, 8:19 a.m. UTC | #3
On 18/09/2017 22:18, Rogozhkin, Dmitry V wrote:
> Did you try tests on the system with 2 VDBOX engines? On my side 2 tests
> are failing on SKL GT4e NUC:

Nope.

> (perf_pmu:5414) CRITICAL: Test assertion failure function
> busy_check_all, file perf_pmu.c:164:
> (perf_pmu:5414) CRITICAL: Failed assertion: (double)(val[i]) <= (1.0 +
> tolerance) * (double)0.0f && (double)(val[i]) >= (1.0 - tolerance) *
> (double)0.0f
> (perf_pmu:5414) CRITICAL: 'val[i]' != '0.0f' (499984960.000000 not
> within 2.000000% tolerance of 0.000000)
> Subtest two-busy-check-all-bsd: FAIL (0.501s)
> 
> (perf_pmu:5414) CRITICAL: Test assertion failure function
> two_busy_check_all, file perf_pmu.c:221:
> (perf_pmu:5414) CRITICAL: Failed assertion: (double)(val[i]) <= (1.0 +
> tolerance) * (double)0.0f && (double)(val[i]) >= (1.0 - tolerance) *
> (double)0.0f
> (perf_pmu:5414) CRITICAL: 'val[i]' != '0.0f' (499940146.000000 not
> within 2.000000% tolerance of 0.000000)
> Subtest two-busy-check-all-bsd1: FAIL (0.501s)
> 
> I am trying to speculate on the reasons below.
> 
> 
> On Mon, 2017-09-18 at 12:38 +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> A bunch of tests for the new i915 PMU feature.
>>
>> Parts of the code were initialy sketched by Dmitry Rogozhkin.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>> ---
>>   lib/igt_gt.c           |  23 +-
>>   lib/igt_gt.h           |   8 +
>>   tests/Makefile.sources |   1 +
>>   tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 738 insertions(+), 7 deletions(-)
>>   create mode 100644 tests/perf_pmu.c
>>
>> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
>> index b3f3b3809eee..102cc2841feb 100644
>> --- a/lib/igt_gt.c
>> +++ b/lib/igt_gt.c
>> @@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)
>>   	return missed;
>>   }
>>   
>> +enum drm_i915_gem_engine_class {
>> +	I915_ENGINE_CLASS_OTHER = 0,
>> +	I915_ENGINE_CLASS_RENDER = 1,
>> +	I915_ENGINE_CLASS_COPY = 2,
>> +	I915_ENGINE_CLASS_VIDEO = 3,
>> +	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
>> +	I915_ENGINE_CLASS_MAX /* non-ABI */
>> +};
>> +
>>   const struct intel_execution_engine intel_execution_engines[] = {
>> -	{ "default", NULL, 0, 0 },
>> -	{ "render", "rcs0", I915_EXEC_RENDER, 0 },
>> -	{ "bsd", "vcs0", I915_EXEC_BSD, 0 },
>> -	{ "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
>> -	{ "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
>> -	{ "blt", "bcs0", I915_EXEC_BLT, 0 },
>> -	{ "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
>> +	{ "default", NULL, -1, -1, 0, 0 },
>> +	{ "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
>> +	{ "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
> With such definition, we will probably detect "bsd" as an engine (as
> well as "bsd1" and "bsd2"), right? As a result, we will run
> two-busy-check-all-bsd for it and according to defined flags we will
> submit workloads to _both_ vcs0 and vcs1 evenly following i915 KMD
> dispatching. Thus, the two-busy-check-all-bsd will fail since it will
> detect a load on 3 engines (rcs0, vcs0, vcs1) instead of 2.
> 
> I am not quite sure why two-busy-check-all-bsd1 fails as well on my
> side? or rather, why it did not fail on your side as well? The only
> explanation I see is that the test thinks "bsd" and "bsd1" are separate
> engines, and, thus, count them as 2. But that should fail on single
> VDBOX system as well... hm...

What makes it fail on GT3+ parts is the fact I915_EXEC_BSD gets 
round-robinned per context.

I need to come up with an elegant solution in the world where we started 
using engine class/instance concepts but don't yet have the 
class-instance execbuf...

Regards,

Tvrtko
Tvrtko Ursulin Sept. 19, 2017, 8:37 a.m. UTC | #4
On 18/09/2017 14:17, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-18 12:38:40)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> A bunch of tests for the new i915 PMU feature.
>>
>> Parts of the code were initialy sketched by Dmitry Rogozhkin.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>> ---
>>   lib/igt_gt.c           |  23 +-
>>   lib/igt_gt.h           |   8 +
>>   tests/Makefile.sources |   1 +
>>   tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 738 insertions(+), 7 deletions(-)
>>   create mode 100644 tests/perf_pmu.c
>>
>> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
>> index b3f3b3809eee..102cc2841feb 100644
>> --- a/lib/igt_gt.c
>> +++ b/lib/igt_gt.c
>> @@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)
>>          return missed;
>>   }
>>   
>> +enum drm_i915_gem_engine_class {
>> +       I915_ENGINE_CLASS_OTHER = 0,
>> +       I915_ENGINE_CLASS_RENDER = 1,
>> +       I915_ENGINE_CLASS_COPY = 2,
>> +       I915_ENGINE_CLASS_VIDEO = 3,
>> +       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
>> +       I915_ENGINE_CLASS_MAX /* non-ABI */
>> +};
>> +
>>   const struct intel_execution_engine intel_execution_engines[] = {
>> -       { "default", NULL, 0, 0 },
>> -       { "render", "rcs0", I915_EXEC_RENDER, 0 },
>> -       { "bsd", "vcs0", I915_EXEC_BSD, 0 },
>> -       { "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
>> -       { "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
>> -       { "blt", "bcs0", I915_EXEC_BLT, 0 },
>> -       { "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
>> +       { "default", NULL, -1, -1, 0, 0 },
>> +       { "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
>> +       { "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
>> +       { "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
>> +       { "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
>> +       { "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },
>> +       { "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },
>>          { NULL, 0, 0 }
> 
> I was anticipating a new struct for the explicit interface so that we
> can easily phase out the old one with its aliasing.

It's definitely buggy as it is, as Dmitry has discovered. I'll have a 
think on how to do it elegantly. Too bad we can't piggy back the 
class-instance execbuf to this..

>>   };
>>   
>> diff --git a/lib/igt_gt.h b/lib/igt_gt.h
>> index 2579cbd37be7..436041ce9cc0 100644
>> --- a/lib/igt_gt.h
>> +++ b/lib/igt_gt.h
>> @@ -66,6 +66,8 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd);
>>   extern const struct intel_execution_engine {
>>          const char *name;
>>          const char *full_name;
>> +       int class;
>> +       int instance;
>>          unsigned exec_id;
>>          unsigned flags;
>>   } intel_execution_engines[];
>> @@ -78,6 +80,12 @@ extern const struct intel_execution_engine {
>>               e__++) \
>>                  for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))
>>   
>> +#define for_each_engine_class_instance(fd__, e__) \
>> +       for ((e__) = intel_execution_engines;\
>> +            (e__)->name; \
>> +            (e__)++) \
>> +               for_if ((e__)->class > 0)
>> +
>>   bool gem_can_store_dword(int fd, unsigned int engine);
>>   
>>   #endif /* IGT_GT_H */
>> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
>> index cf542df181a8..4bab6247151c 100644
>> --- a/tests/Makefile.sources
>> +++ b/tests/Makefile.sources
>> @@ -217,6 +217,7 @@ TESTS_progs = \
>>          kms_vblank \
>>          meta_test \
>>          perf \
>> +       perf_pmu \
>>          pm_backlight \
>>          pm_lpsp \
>>          pm_rc6_residency \
>> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
>> new file mode 100644
>> index 000000000000..2dbee586dacc
>> --- /dev/null
>> +++ b/tests/perf_pmu.c
>> @@ -0,0 +1,713 @@
>> +/*
>> + * Copyright © 2017 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + */
>> +
>> +#include <stdlib.h>
>> +#include <stdio.h>
>> +#include <string.h>
>> +#include <fcntl.h>
>> +#include <inttypes.h>
>> +#include <errno.h>
>> +#include <sys/stat.h>
>> +#include <sys/time.h>
>> +#include <sys/times.h>
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +#include <time.h>
>> +#include <poll.h>
>> +
>> +#include "igt.h"
>> +#include "igt_perf.h"
>> +
>> +IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
>> +
>> +const double tolerance = 0.02f;
>> +const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;
>> +
>> +static void
>> +init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)
>> +{
>> +       uint64_t config = __I915_PMU_ENGINE(e->class, e->instance, sample);
>> +       int fd;
>> +
>> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> 
> gem_require_ring()

Missed the existence of it.


> 
>> +
>> +       fd = perf_i915_open(config);
> 
> Although the kernel interface is the authority.
> 
> So this should be igt_require, and igt_assert(has_ring);

Don't get what you are saying, igt_require(has_ring) followed by 
igt_assert(has_ring)??

> 
>> +       igt_assert(fd >= 0);
>> +
>> +       close(fd);
>> +}
>> +
>> +static uint64_t pmu_read_single(int fd)
>> +{
>> +       uint64_t data[2];
>> +       ssize_t len;
>> +
>> +       len = read(fd, data, sizeof(data));
> 
> Perf is a datagram api, right? A short read gives what you asked for and
> discards the rest of the packet, iirc.

Nope, I've noticed overlay was failing due to that assumption and even 
traced the code in core perf which fails short reads. Hence the patch in 
this series to fix overlay in that respect.

> 
>> +       igt_assert_eq(len, sizeof(data));
>> +
>> +       return data[0];
>> +}
>> +
>> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>> +{
>> +       uint64_t buf[2 + num];
>> +       unsigned int i;
>> +       ssize_t len;
>> +
>> +       len = read(fd, buf, sizeof(buf));
>> +       igt_assert_eq(len, sizeof(buf));
>> +       for (i = 0; i < num; i++)
>> +               val[i] = buf[2 + i];
>> +}
>> +
>> +#define assert_within_epsilon(x, ref, tolerance) \
>> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
>> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
>> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
>> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
>> +
>> +static void
>> +single(int gem_fd, const struct intel_execution_engine *e, bool busy)
>> +{
>> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
>> +       double ref = busy ? batch_duration_ns : 0.0f;
>> +       igt_spin_t *spin;
>> +       uint64_t val;
>> +       int fd;
>> +
>> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
>> +
>> +       if (busy) {
>> +               spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
>> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +       } else {
>> +               usleep(batch_duration_ns / 1000);
>> +       }
>> +
>> +       fd = perf_i915_open(config);
>> +       igt_assert(fd >= 0);
>> +
>> +       if (busy)
>> +               gem_sync(gem_fd, spin->handle);
>> +
>> +       val = pmu_read_single(fd);
>> +
>> +       assert_within_epsilon(val, ref, tolerance);
>> +
>> +       if (busy)
>> +               igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +busy_check_all(int gem_fd, const struct intel_execution_engine *e,
> 
> busy_check_others
> 
> busy_check_all I would expect to be checking that all engines are
> correctly recorded as being busy at the same time. And there should also
> be permutations of (busy, idle, wait) across the engines.

I can do that, sure. But it is checking all engines, just some for 100% 
busy, and some for 100% idle. :) Naming it other would then not be 
correct either.

> 
>> +              const unsigned int num_engines)
>> +{
>> +       const struct intel_execution_engine *e_;
>> +       uint64_t val[num_engines];
>> +       int fd[2];
>> +       igt_spin_t *spin;
>> +       unsigned int busy_idx, i;
>> +
>> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +
>> +       i = 0;
>> +       fd[0] = -1;
>> +       for_each_engine_class_instance(fd, e_) {
>> +               if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
>> +                       continue;
>> +
>> +               if (e == e_)
>> +                       busy_idx = i;
>> +
>> +               fd[i == 0 ? 0 : 1] =
>> +                       perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
>> +                                                                 e_->instance),
>> +                                            fd[0]);
>> +               igt_assert(fd[0] > 0);
>> +               igt_assert(i == 0 || fd[1] > 0);
>> +               i++;
>> +       }
>> +
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       pmu_read_multi(fd[0], num_engines, val);
>> +
>> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
>> +       for (i = 0; i < num_engines; i++) {
>> +               if (i == busy_idx)
>> +                       continue;
>> +               assert_within_epsilon(val[i], 0.0f, tolerance);
>> +       }
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd[0]);
>> +}
> 
>> +static void
>> +no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)
> 
> This is just the sanity check half of the sema test.
> 
> No wait, no queued?

Forgot about queued completely!

And semaphores I left for later. I don't have any <gen9 machines to play 
with them locally.

> 
>> +static void
>> +multi_client(int gem_fd, const struct intel_execution_engine *e)
>> +{
>> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
>> +       igt_spin_t *spin;
>> +       uint64_t val[2];
>> +       int fd[2];
>> +
>> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +
>> +       fd[0] = perf_i915_open(config);
>> +       igt_assert(fd[0] >= 0);
>> +
>> +       usleep(batch_duration_ns / 4000);
>> +
>> +       fd[1] = perf_i915_open(config);
>> +       igt_assert(fd[1] >= 0);
>> +
>> +       usleep(batch_duration_ns / 3000);
>> +
>> +       val[1] = pmu_read_single(fd[1]);
>> +       close(fd[1]);
>> +
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       val[0] = pmu_read_single(fd[0]);
>> +
>> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
>> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd[0]);
>> +}
> 

Forgot to comment or changed your mind?

>> +static void cpu_hotplug(int gem_fd)
>> +{
>> +       struct timespec start, now;
>> +       igt_spin_t *spin;
>> +       uint64_t val, ref;
>> +       int fd;
>> +
>> +       igt_require(cpu0_hotplug_support());
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
>> +       igt_assert(fd >= 0);
>> +
>> +       clock_gettime(CLOCK_MONOTONIC, &start);
>> +
>> +       igt_fork(child, 1) {
>> +               int cpu = 0;
>> +
>> +               for (;;) {
>> +                       char name[128];
>> +                       int cpufd;
>> +
>> +                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
>> +                               cpu);
>> +                       cpufd = open(name, O_WRONLY);
>> +                       if (cpufd == -1) {
>> +                               igt_assert(cpu > 0);
>> +                               break;
>> +                       }
>> +                       igt_assert_eq(write(cpufd, "0", 2), 2);
>> +
>> +                       usleep(1000 * 1000);
>> +
>> +                       igt_assert_eq(write(cpufd, "1", 2), 2);
>> +
>> +                       close(cpufd);
>> +                       cpu++;
>> +               }
>> +       }
>> +
>> +       igt_waitchildren();
>> +
>> +       igt_spin_batch_end(spin);
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       clock_gettime(CLOCK_MONOTONIC, &now);
> 
> Did we ever export the igt routines for probing supported clocks?
> In this case, this fits into igt_nsec_elapsed.

Did not spot this one either.

> 
>> +       val = pmu_read_single(fd);
>> +
>> +       ref = elapsed_ns(&start, &now);
>> +
>> +       assert_within_epsilon(val, ref, tolerance);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +test_interrupts(int gem_fd)
>> +{
>> +       igt_spin_t *spin;
>> +       uint64_t idle, busy, prev;
>> +       int fd;
>> +
>> +       fd = perf_i915_open(I915_PMU_INTERRUPTS);
>> +       igt_assert(fd >= 0);
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +       sleep(2);
>> +       prev = pmu_read_single(fd);
>> +       usleep(batch_duration_ns / 1000);
>> +       idle = pmu_read_single(fd);
>> +
>> +       igt_assert_eq(idle - prev, 0);
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +       gem_sync(gem_fd, spin->handle);
> 
> There's no guaranteed interrupt here.

Hm yes.. bugger. Use fences instead of spin batch to ensure some? Or 
extend spin batch API to support fences?

> 
>> +
>> +       busy = pmu_read_single(fd);
>> +       igt_assert(busy > idle);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +test_frequency(int gem_fd)
>> +{
>> +       igt_spin_t *spin;
>> +       uint64_t idle[2], busy[2];
>> +       int fd;
>> +
>> +       fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
>> +       igt_assert(fd >= 0);
> 
> Ask the kernel if it is supported.

Yep.

> 
>> +       igt_assert(perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd) >= 0);
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +       usleep(batch_duration_ns / 1000);
>> +
>> +       pmu_read_multi(fd, 2, idle);
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       pmu_read_multi(fd, 2, busy);
>> +
>> +       igt_assert(busy[0] > idle[0]);
>> +       igt_assert(busy[1] > idle[1]);
> 
> Nothing guarantees busy[1] changes, it is hw/fw dependent.
> busy[0] depends on user config.

Do we reasonably expect IGT to be run in such environments? Or change 
this to not expect a change but just compare against debugfs?

> 
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
> 
>> +static void
>> +test_rc6p(int gem_fd)
>> +{
>> +       const unsigned int devid = intel_get_drm_devid(gem_fd);
>> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
>> +       unsigned int num_pmu = 1;
>> +       igt_spin_t *spin;
>> +       uint64_t idle[3], busy[3], prev[3];
>> +       unsigned int i;
>> +       int fd, ret;
>> +
>> +       igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));
> 
> Ask the kernel. (Applies equally to rc6, rc6p).

What is the way to do this? Don't see these in get_param.

> No rc6pp testing?

Copy and paste error.

>> +
>> +       fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
>> +       igt_assert(fd >= 0);
>> +
>> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
>> +       if (ret > 0) {
>> +               num_pmu++;
>> +               ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
>> +               if (ret > 0)
>> +                       num_pmu++;
>> +       }
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +       sleep(2);
>> +
>> +       pmu_read_multi(fd, num_pmu, prev);
>> +       usleep(duration_ns / 1000);
>> +       pmu_read_multi(fd, num_pmu, idle);
>> +
>> +       for (i = 0; i < num_pmu; i++)
>> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
>> +                                     tolerance);
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       igt_spin_batch_set_timeout(spin, duration_ns);
> 
> Are we sure the GPU isn't allowed to sleep? i915_user_forcewake we
> expect to keep the GPU out of rc6.

I was sure, but was I wrong? :)

Regardless, replacing spin batch with a forcewake sounds simpler so I 
can do that.

> 
>> +igt_main
>> +{
>> +       const unsigned int num_other_metrics =
>> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
>> +       unsigned int num_engines = 0;
>> +       int fd = -1;
>> +       const struct intel_execution_engine *e;
>> +       unsigned int i;
>> +
>> +       igt_fixture {
>> +               fd = drm_open_driver_master(DRIVER_INTEL);
>> +
>> +               igt_require_gem(fd);
>> +               igt_require(i915_type_id() > 0);
>> +
>> +               for_each_engine_class_instance(fd, e) {
>> +                       if (gem_has_ring(fd, e->exec_id | e->flags))
>> +                               num_engines++;
>> +               }
>> +       }
>> +
>> +       /**
>> +        * Test invalid access via perf API is rejected.
>> +        */
> 
> ARGH. No comments on the intentions of the code?

Will add.

Regards,

Tvrtko
Chris Wilson Sept. 19, 2017, 9:58 a.m. UTC | #5
Quoting Tvrtko Ursulin (2017-09-19 09:37:35)
> 
> On 18/09/2017 14:17, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-09-18 12:38:40)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> A bunch of tests for the new i915 PMU feature.
> >>
> >> Parts of the code were initially sketched by Dmitry Rogozhkin.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> >> ---
> >>   lib/igt_gt.c           |  23 +-
> >>   lib/igt_gt.h           |   8 +
> >>   tests/Makefile.sources |   1 +
> >>   tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>   4 files changed, 738 insertions(+), 7 deletions(-)
> >>   create mode 100644 tests/perf_pmu.c
> >>
> >> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
> >> index b3f3b3809eee..102cc2841feb 100644
> >> --- a/lib/igt_gt.c
> >> +++ b/lib/igt_gt.c
> >> @@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)
> >>          return missed;
> >>   }
> >>   
> >> +enum drm_i915_gem_engine_class {
> >> +       I915_ENGINE_CLASS_OTHER = 0,
> >> +       I915_ENGINE_CLASS_RENDER = 1,
> >> +       I915_ENGINE_CLASS_COPY = 2,
> >> +       I915_ENGINE_CLASS_VIDEO = 3,
> >> +       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
> >> +       I915_ENGINE_CLASS_MAX /* non-ABI */
> >> +};
> >> +
> >>   const struct intel_execution_engine intel_execution_engines[] = {
> >> -       { "default", NULL, 0, 0 },
> >> -       { "render", "rcs0", I915_EXEC_RENDER, 0 },
> >> -       { "bsd", "vcs0", I915_EXEC_BSD, 0 },
> >> -       { "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
> >> -       { "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
> >> -       { "blt", "bcs0", I915_EXEC_BLT, 0 },
> >> -       { "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
> >> +       { "default", NULL, -1, -1, 0, 0 },
> >> +       { "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
> >> +       { "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
> >> +       { "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
> >> +       { "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
> >> +       { "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },
> >> +       { "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },
> >>          { NULL, 0, 0 }
> > 
> > I was anticipating a new struct for the explicit interface so that we
> > can easily phase out the old one with its aliasing.
> 
> It's definitely buggy as it is as Dmitry has discovered. I'll have a 
> think on how to do it elegantly. Too bad we can't piggy back the 
> class-instance execbuf to this..

Well, I am hoping that it will be designed to slot into that interface. :)
Then we start going through the tests deciding which are covering ABI
and so need exercise on both, and which are designed to exercise
internal/hw paths and so only need to be run on specific engines and not
all aliases.
 
> >>   };
> >>   
> >> diff --git a/lib/igt_gt.h b/lib/igt_gt.h
> >> index 2579cbd37be7..436041ce9cc0 100644
> >> --- a/lib/igt_gt.h
> >> +++ b/lib/igt_gt.h
> >> @@ -66,6 +66,8 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd);
> >>   extern const struct intel_execution_engine {
> >>          const char *name;
> >>          const char *full_name;
> >> +       int class;
> >> +       int instance;
> >>          unsigned exec_id;
> >>          unsigned flags;
> >>   } intel_execution_engines[];
> >> @@ -78,6 +80,12 @@ extern const struct intel_execution_engine {
> >>               e__++) \
> >>                  for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))
> >>   
> >> +#define for_each_engine_class_instance(fd__, e__) \
> >> +       for ((e__) = intel_execution_engines;\
> >> +            (e__)->name; \
> >> +            (e__)++) \
> >> +               for_if ((e__)->class > 0)
> >> +
> >>   bool gem_can_store_dword(int fd, unsigned int engine);
> >>   
> >>   #endif /* IGT_GT_H */
> >> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> >> index cf542df181a8..4bab6247151c 100644
> >> --- a/tests/Makefile.sources
> >> +++ b/tests/Makefile.sources
> >> @@ -217,6 +217,7 @@ TESTS_progs = \
> >>          kms_vblank \
> >>          meta_test \
> >>          perf \
> >> +       perf_pmu \
> >>          pm_backlight \
> >>          pm_lpsp \
> >>          pm_rc6_residency \
> >> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> >> new file mode 100644
> >> index 000000000000..2dbee586dacc
> >> --- /dev/null
> >> +++ b/tests/perf_pmu.c
> >> @@ -0,0 +1,713 @@
> >> +/*
> >> + * Copyright © 2017 Intel Corporation
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person obtaining a
> >> + * copy of this software and associated documentation files (the "Software"),
> >> + * to deal in the Software without restriction, including without limitation
> >> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> >> + * and/or sell copies of the Software, and to permit persons to whom the
> >> + * Software is furnished to do so, subject to the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice (including the next
> >> + * paragraph) shall be included in all copies or substantial portions of the
> >> + * Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> >> + * IN THE SOFTWARE.
> >> + *
> >> + */
> >> +
> >> +#include <stdlib.h>
> >> +#include <stdio.h>
> >> +#include <string.h>
> >> +#include <fcntl.h>
> >> +#include <inttypes.h>
> >> +#include <errno.h>
> >> +#include <sys/stat.h>
> >> +#include <sys/time.h>
> >> +#include <sys/times.h>
> >> +#include <sys/types.h>
> >> +#include <dirent.h>
> >> +#include <time.h>
> >> +#include <poll.h>
> >> +
> >> +#include "igt.h"
> >> +#include "igt_perf.h"
> >> +
> >> +IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
> >> +
> >> +const double tolerance = 0.02f;
> >> +const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;
> >> +
> >> +static void
> >> +init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)
> >> +{
> >> +       uint64_t config = __I915_PMU_ENGINE(e->class, e->instance, sample);
> >> +       int fd;
> >> +
> >> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> > 
> > gem_require_ring()
> 
> Missed the existence of it.
> 
> 
> > 
> >> +
> >> +       fd = perf_i915_open(config);
> > 
> > Although the kernel interface is the authority.
> > 
> > So this should be igt_require, and igt_assert(has_ring);
> 
> Don't get what you are saying, igt_require(has_ring) followed by 
> igt_assert(has_ring)??

I'm saying the perf_i915_open() knows better than the test when it is
allowed to be run. i.e. don't second guess by preceding it with
gem_require_ring(), but assert afterwards that the result matches
expectation.

> > 
> >> +       igt_assert(fd >= 0);
> >> +
> >> +       close(fd);
> >> +}
> >> +
> >> +static uint64_t pmu_read_single(int fd)
> >> +{
> >> +       uint64_t data[2];
> >> +       ssize_t len;
> >> +
> >> +       len = read(fd, data, sizeof(data));
> > 
> > Perf is a datagram api, right? A short read gives what you asked for and
> > discards the rest of the packet, iirc.
> 
> Nope, I've noticed overlay was failing due to that assumption and even
> traced the code in core perf which fails short reads. Hence the patch in
> this series to fix overlay in that respect.

You now understand why that bug exists ;) Ta.

> 
> > 
> >> +       igt_assert_eq(len, sizeof(data));
> >> +
> >> +       return data[0];
> >> +}
> >> +
> >> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
> >> +{
> >> +       uint64_t buf[2 + num];
> >> +       unsigned int i;
> >> +       ssize_t len;
> >> +
> >> +       len = read(fd, buf, sizeof(buf));
> >> +       igt_assert_eq(len, sizeof(buf));
> >> +       for (i = 0; i < num; i++)
> >> +               val[i] = buf[2 + i];
> >> +}
> >> +
> >> +#define assert_within_epsilon(x, ref, tolerance) \
> >> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
> >> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
> >> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
> >> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
> >> +
> >> +static void
> >> +single(int gem_fd, const struct intel_execution_engine *e, bool busy)
> >> +{
> >> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> >> +       double ref = busy ? batch_duration_ns : 0.0f;
> >> +       igt_spin_t *spin;
> >> +       uint64_t val;
> >> +       int fd;
> >> +
> >> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> >> +
> >> +       if (busy) {
> >> +               spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> >> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
> >> +       } else {
> >> +               usleep(batch_duration_ns / 1000);
> >> +       }
> >> +
> >> +       fd = perf_i915_open(config);
> >> +       igt_assert(fd >= 0);
> >> +
> >> +       if (busy)
> >> +               gem_sync(gem_fd, spin->handle);
> >> +
> >> +       val = pmu_read_single(fd);
> >> +
> >> +       assert_within_epsilon(val, ref, tolerance);
> >> +
> >> +       if (busy)
> >> +               igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd);
> >> +}
> >> +
> >> +static void
> >> +busy_check_all(int gem_fd, const struct intel_execution_engine *e,
> > 
> > busy_check_others
> > 
> > busy_check_all I would expect to be checking that all engines are
> > correctly recorded as being busy at the same time. And there should also
> > be permutations of (busy, idle, wait) across the engines.
> 
> I can do that, sure. But it is checking all engines, just some for 100%
> busy, and some for 100% idle. :) Naming it "others" would then not be
> correct either.

Definitely like the idea of doing odd-one-out testing for both busy and
idle (1 busy, N-1 idle; 1 idle, N-1 busy) across all engines. I don't
expect to get any more insight from the other permutations. 2N passes
with say 100us batches isn't going to be an issue.

Do we have a resolution test? With your execlists interface, you should
get precise timings for even nop batches. But we don't expose the lower
limit on accuracy for legacy, do we?

> >> +              const unsigned int num_engines)
> >> +{
> >> +       const struct intel_execution_engine *e_;
> >> +       uint64_t val[num_engines];
> >> +       int fd[2];
> >> +       igt_spin_t *spin;
> >> +       unsigned int busy_idx, i;
> >> +
> >> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> >> +
> >> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> >> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> >> +
> >> +       i = 0;
> >> +       fd[0] = -1;
> >> +       for_each_engine_class_instance(fd, e_) {
> >> +               if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
> >> +                       continue;
> >> +
> >> +               if (e == e_)
> >> +                       busy_idx = i;
> >> +
> >> +               fd[i == 0 ? 0 : 1] =
> >> +                       perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
> >> +                                                                 e_->instance),
> >> +                                            fd[0]);
> >> +               igt_assert(fd[0] > 0);
> >> +               igt_assert(i == 0 || fd[1] > 0);
> >> +               i++;
> >> +       }
> >> +
> >> +       gem_sync(gem_fd, spin->handle);
> >> +
> >> +       pmu_read_multi(fd[0], num_engines, val);
> >> +
> >> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
> >> +       for (i = 0; i < num_engines; i++) {
> >> +               if (i == busy_idx)
> >> +                       continue;
> >> +               assert_within_epsilon(val[i], 0.0f, tolerance);
> >> +       }
> >> +
> >> +       igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd[0]);
> >> +}
> > 
> >> +static void
> >> +no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)
> > 
> > This is just the sanity check half of the sema test.
> > 
> > No wait, no queued?
> 
> Forgot about queued completely!
> 
> And semaphores I left for later. I don't have any <gen9 machines to play 
> with them locally.

You can have an ivb celery with my sympathies!

> >> +static void
> >> +multi_client(int gem_fd, const struct intel_execution_engine *e)
> >> +{
> >> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> >> +       igt_spin_t *spin;
> >> +       uint64_t val[2];
> >> +       int fd[2];
> >> +
> >> +       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
> >> +
> >> +       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
> >> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> >> +
> >> +       fd[0] = perf_i915_open(config);
> >> +       igt_assert(fd[0] >= 0);
> >> +
> >> +       usleep(batch_duration_ns / 4000);
> >> +
> >> +       fd[1] = perf_i915_open(config);
> >> +       igt_assert(fd[1] >= 0);
> >> +
> >> +       usleep(batch_duration_ns / 3000);
> >> +
> >> +       val[1] = pmu_read_single(fd[1]);
> >> +       close(fd[1]);
> >> +
> >> +       gem_sync(gem_fd, spin->handle);
> >> +
> >> +       val[0] = pmu_read_single(fd[0]);
> >> +
> >> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
> >> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
> >> +
> >> +       igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd[0]);
> >> +}
> > 
> 
> Forgot to comment or changed your mind?

Partly, I think I was reading backwards and was going to comment on
igt_require(gem_has_ring()), and the other part thinks I left it here to
read again afterwards.

Hmm. I don't trust usleep() to be accurate. Tolerance is 2%. I'd feel safer
if you wrapped usleep() with clock_gettime / igt_nsecs_elapsed.

> >> +static void
> >> +test_interrupts(int gem_fd)
> >> +{
> >> +       igt_spin_t *spin;
> >> +       uint64_t idle, busy, prev;
> >> +       int fd;
> >> +
> >> +       fd = perf_i915_open(I915_PMU_INTERRUPTS);
> >> +       igt_assert(fd >= 0);
> >> +
> >> +       gem_quiescent_gpu(gem_fd);
> >> +       sleep(2);
> >> +       prev = pmu_read_single(fd);
> >> +       usleep(batch_duration_ns / 1000);
> >> +       idle = pmu_read_single(fd);
> >> +
> >> +       igt_assert_eq(idle - prev, 0);
> >> +
> >> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> >> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> >> +       gem_sync(gem_fd, spin->handle);
> > 
> > There's no guaranteed interrupt here.
> 
> Hm yes.. bugger. Use fences instead of spin batch to ensure some? Or 
> extend spin batch API to support fences?

Limit to execlists and do a context-switch? Oh, that reminds me, knowing
context-switch per engine per second would also be useful.

(As would isolating the wait times for flips.)

Oh, and if we really wanted to be nasty, MI_USER_INTERRUPT from a batch.
It wouldn't break the kernel, but I expect we might need a secure batch.

> >> +
> >> +       busy = pmu_read_single(fd);
> >> +       igt_assert(busy > idle);
> >> +
> >> +       igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd);
> >> +}
> >> +
> >> +static void
> >> +test_frequency(int gem_fd)
> >> +{
> >> +       igt_spin_t *spin;
> >> +       uint64_t idle[2], busy[2];
> >> +       int fd;
> >> +
> >> +       fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
> >> +       igt_assert(fd >= 0);
> > 
> > Ask the kernel if it is supported.
> 
> Yep.
> 
> > 
> >> +       igt_assert(perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd) >= 0);
> >> +
> >> +       gem_quiescent_gpu(gem_fd);
> >> +       usleep(batch_duration_ns / 1000);
> >> +
> >> +       pmu_read_multi(fd, 2, idle);
> >> +
> >> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> >> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> >> +       gem_sync(gem_fd, spin->handle);
> >> +
> >> +       pmu_read_multi(fd, 2, busy);
> >> +
> >> +       igt_assert(busy[0] > idle[0]);
> >> +       igt_assert(busy[1] > idle[1]);
> > 
> > Nothing guarantees busy[1] changes, it is hw/fw dependent.
> > busy[0] depends on user config.
> 
> Do we reasonably expect IGT to be run in such environments? Or change
> this to not expect a change but just compare against debugfs?
> 
> > 
> >> +
> >> +       igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd);
> >> +}
> >> +
> > 
> >> +static void
> >> +test_rc6p(int gem_fd)
> >> +{
> >> +       const unsigned int devid = intel_get_drm_devid(gem_fd);
> >> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> >> +       unsigned int num_pmu = 1;
> >> +       igt_spin_t *spin;
> >> +       uint64_t idle[3], busy[3], prev[3];
> >> +       unsigned int i;
> >> +       int fd, ret;
> >> +
> >> +       igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));
> > 
> > Ask the kernel. (Applies equally to rc6, rc6p).
> 
> What is the way to do this? Don't see these in get_param.

Perf knows! I just don't like the test second guessing the kernel. The
kernel has both the list of supported hw, along with the user
restrictions and also what is allowed through the perf interface.
I expect the tests to be fairly agnostic, if the kernel says it has the
counter, then it should comply with our expectations on rc6 behaviour.
If the kernel doesn't say it has the counter, then it may not have for
any number of good reasons.

> > No rc6pp testing?
> 
> Copy and paste error.

Lack of hw would be a fine answer ;)

It's more of a question as to whether we should make sure the ABI covers
all rc6 possibilities, even though not all are currently implemented.

> >> +
> >> +       fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
> >> +       igt_assert(fd >= 0);
> >> +
> >> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> >> +       if (ret > 0) {
> >> +               num_pmu++;
> >> +               ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> >> +               if (ret > 0)
> >> +                       num_pmu++;
> >> +       }
> >> +
> >> +       gem_quiescent_gpu(gem_fd);
> >> +       sleep(2);
> >> +
> >> +       pmu_read_multi(fd, num_pmu, prev);
> >> +       usleep(duration_ns / 1000);
> >> +       pmu_read_multi(fd, num_pmu, idle);
> >> +
> >> +       for (i = 0; i < num_pmu; i++)
> >> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
> >> +                                     tolerance);
> >> +
> >> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> >> +       igt_spin_batch_set_timeout(spin, duration_ns);
> > 
> > Are we sure the GPU isn't allowed to sleep? i915_user_forcewake we
> > expect to keep the GPU out of rc6.
> 
> I was sure, but was I wrong? :)
> 
> Regardless, replacing spin batch with a forcewake sounds simpler so I 
> can do that.
> 
> > 
> >> +igt_main
> >> +{
> >> +       const unsigned int num_other_metrics =
> >> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
> >> +       unsigned int num_engines = 0;
> >> +       int fd = -1;
> >> +       const struct intel_execution_engine *e;
> >> +       unsigned int i;
> >> +
> >> +       igt_fixture {
> >> +               fd = drm_open_driver_master(DRIVER_INTEL);
> >> +
> >> +               igt_require_gem(fd);
> >> +               igt_require(i915_type_id() > 0);
> >> +
> >> +               for_each_engine_class_instance(fd, e) {
> >> +                       if (gem_has_ring(fd, e->exec_id | e->flags))
> >> +                               num_engines++;
> >> +               }
> >> +       }
> >> +
> >> +       /**
> >> +        * Test invalid access via perf API is rejected.
> >> +        */
> > 
> > ARGH. No comments on the intentions of the code?
> 
> Will add.

Pet peeve: adding comments for a hypothetical end user (to do what?)
rather than explaining the purpose and subtleties of the code (esp. things
like using a context-switch to trigger an interrupt, that's the inside
knowledge that perhaps not everyone will know first hand).
-Chris
diff mbox

Patch

diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index b3f3b3809eee..102cc2841feb 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -537,14 +537,23 @@  unsigned intel_detect_and_clear_missed_interrupts(int fd)
 	return missed;
 }
 
+enum drm_i915_gem_engine_class {
+	I915_ENGINE_CLASS_OTHER = 0,
+	I915_ENGINE_CLASS_RENDER = 1,
+	I915_ENGINE_CLASS_COPY = 2,
+	I915_ENGINE_CLASS_VIDEO = 3,
+	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+	I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
 const struct intel_execution_engine intel_execution_engines[] = {
-	{ "default", NULL, 0, 0 },
-	{ "render", "rcs0", I915_EXEC_RENDER, 0 },
-	{ "bsd", "vcs0", I915_EXEC_BSD, 0 },
-	{ "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
-	{ "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
-	{ "blt", "bcs0", I915_EXEC_BLT, 0 },
-	{ "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
+	{ "default", NULL, -1, -1, 0, 0 },
+	{ "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
+	{ "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
+	{ "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
+	{ "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
+	{ "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },
+	{ "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },
 	{ NULL, 0, 0 }
 };
 
diff --git a/lib/igt_gt.h b/lib/igt_gt.h
index 2579cbd37be7..436041ce9cc0 100644
--- a/lib/igt_gt.h
+++ b/lib/igt_gt.h
@@ -66,6 +66,8 @@  unsigned intel_detect_and_clear_missed_interrupts(int fd);
 extern const struct intel_execution_engine {
 	const char *name;
 	const char *full_name;
+	int class;
+	int instance;
 	unsigned exec_id;
 	unsigned flags;
 } intel_execution_engines[];
@@ -78,6 +80,12 @@  extern const struct intel_execution_engine {
 	     e__++) \
 		for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))
 
+#define for_each_engine_class_instance(fd__, e__) \
+	for ((e__) = intel_execution_engines;\
+	     (e__)->name; \
+	     (e__)++) \
+		for_if ((e__)->class > 0)
+
 bool gem_can_store_dword(int fd, unsigned int engine);
 
 #endif /* IGT_GT_H */
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index cf542df181a8..4bab6247151c 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -217,6 +217,7 @@  TESTS_progs = \
 	kms_vblank \
 	meta_test \
 	perf \
+	perf_pmu \
 	pm_backlight \
 	pm_lpsp \
 	pm_rc6_residency \
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
new file mode 100644
index 000000000000..2dbee586dacc
--- /dev/null
+++ b/tests/perf_pmu.c
@@ -0,0 +1,713 @@ 
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <time.h>
+#include <poll.h>
+
+#include "igt.h"
+#include "igt_perf.h"
+
+IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
+
+/* Relative tolerance (2%) passed to assert_within_epsilon() by the tests. */
+const double tolerance = 0.02f;
+/* Duration of the spin batches and idle sleeps: half a second, in ns. */
+const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;
+
+/*
+ * Check that the engine event selected by sample can be opened for engine e.
+ * The subtest is skipped when gem_fd does not expose that engine.
+ */
+static void
+init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)
+{
+	int fd;
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+	/* Opening is the whole test; the event is closed again straight away. */
+	fd = perf_i915_open(__I915_PMU_ENGINE(e->class, e->instance, sample));
+	igt_assert(fd >= 0);
+
+	close(fd);
+}
+
+/*
+ * Read two 64-bit words from the perf event fd, asserting that the read is
+ * complete, and return the first word.
+ */
+static uint64_t pmu_read_single(int fd)
+{
+	uint64_t data[2];
+	ssize_t len = read(fd, data, sizeof(data));
+
+	igt_assert_eq(len, sizeof(data));
+
+	return data[0];
+}
+
+/*
+ * Read a group of num events through the group leader fd. The read must
+ * return exactly 2 + num 64-bit words; the first two are skipped and the
+ * remaining num words are copied into val[0..num-1].
+ */
+static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
+{
+	uint64_t buf[2 + num];
+	ssize_t len = read(fd, buf, sizeof(buf));
+
+	igt_assert_eq(len, sizeof(buf));
+
+	/* Copy order is irrelevant, so walk the values from the back. */
+	while (num--)
+		val[num] = buf[2 + num];
+}
+
+/*
+ * Assert that x lies within the relative tolerance of ref, i.e. inside
+ * [(1 - tolerance) * ref, (1 + tolerance) * ref]. A ref of zero therefore
+ * only accepts x == 0.
+ *
+ * Every argument is parenthesised before it is cast or multiplied so that
+ * expressions such as "idle - prev" or "batch_duration_ns / 3" are evaluated
+ * as a whole rather than having the cast bind to their first operand only.
+ * The arguments are still evaluated more than once, so they must be free of
+ * side effects.
+ */
+#define assert_within_epsilon(x, ref, tolerance) \
+	igt_assert_f((double)(x) <= (1.0 + (tolerance)) * (double)(ref) && \
+		     (double)(x) >= (1.0 - (tolerance)) * (double)(ref), \
+		     "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
+		     #x, #ref, (double)(x), (tolerance) * 100.0, (double)(ref))
+
+/*
+ * Check the busy counter of a single engine.
+ *
+ * With busy set, a spin batch limited to batch_duration_ns is run on the
+ * engine and the counter is expected to report that duration; otherwise the
+ * engine is left alone for the same period and the counter must read zero.
+ *
+ * The event is opened before the load (or the idle sleep) starts so that the
+ * period being checked is actually covered by the counter. Opening it after
+ * the sleep would make the idle check pass without observing anything.
+ */
+static void
+single(int gem_fd, const struct intel_execution_engine *e, bool busy)
+{
+	uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
+	double ref = busy ? batch_duration_ns : 0.0f;
+	igt_spin_t *spin = NULL;
+	uint64_t val;
+	int fd;
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+	fd = perf_i915_open(config);
+	igt_assert(fd >= 0);
+
+	if (busy) {
+		spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+		igt_spin_batch_set_timeout(spin, batch_duration_ns);
+		/* Wait for the timed spinner to finish before sampling. */
+		gem_sync(gem_fd, spin->handle);
+	} else {
+		usleep(batch_duration_ns / 1000);
+	}
+
+	val = pmu_read_single(fd);
+
+	assert_within_epsilon(val, ref, tolerance);
+
+	if (spin)
+		igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+/*
+ * Load engine e with a timed spin batch and check, through one event group
+ * holding the busy counter of every available engine, that only e reports
+ * load.
+ *
+ * num_engines must be the number of table entries accepted by
+ * for_each_engine_class_instance() for which gem_has_ring() is true; this is
+ * asserted once the group has been built.
+ *
+ * Every event fd is kept so that all of them, not just the group leader, are
+ * closed at the end.
+ *
+ * NOTE(review): table entries sharing a class/instance pair (e.g. "bsd" and
+ * "bsd1") open the same counter twice, yet only the entry equal to e is
+ * expected to be busy - confirm whether such aliases should be skipped.
+ */
+static void
+busy_check_all(int gem_fd, const struct intel_execution_engine *e,
+	       const unsigned int num_engines)
+{
+	const struct intel_execution_engine *e_;
+	uint64_t val[num_engines];
+	int fd[num_engines];
+	igt_spin_t *spin;
+	unsigned int busy_idx = num_engines; /* sentinel: not found yet */
+	unsigned int i;
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+	spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+	i = 0;
+	for_each_engine_class_instance(gem_fd, e_) {
+		if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
+			continue;
+
+		/* Never write past the arrays sized by the caller's count. */
+		igt_assert(i < num_engines);
+
+		if (e == e_)
+			busy_idx = i;
+
+		/* The first event becomes the leader, the rest join its group. */
+		fd[i] = perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
+								  e_->instance),
+					     i == 0 ? -1 : fd[0]);
+		igt_assert(fd[i] >= 0);
+		i++;
+	}
+	igt_assert(i == num_engines);
+	igt_assert(busy_idx < num_engines);
+
+	gem_sync(gem_fd, spin->handle);
+
+	pmu_read_multi(fd[0], num_engines, val);
+
+	assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
+	for (i = 0; i < num_engines; i++) {
+		if (i == busy_idx)
+			continue;
+		assert_within_epsilon(val[i], 0.0f, tolerance);
+	}
+
+	igt_spin_batch_free(gem_fd, spin);
+	for (i = 0; i < num_engines; i++)
+		close(fd[i]);
+}
+
+/*
+ * Load both engine e and the render engine with timed spin batches and
+ * check, through one event group holding the busy counter of every available
+ * engine, that exactly those two report load.
+ *
+ * e must not be the render engine. num_engines must be the number of table
+ * entries accepted by for_each_engine_class_instance() for which
+ * gem_has_ring() is true; this is asserted once the group has been built.
+ *
+ * Every event fd is kept so that all of them, not just the group leader, are
+ * closed at the end.
+ */
+static void
+two_busy_check_all(int gem_fd, const struct intel_execution_engine *e,
+		   const unsigned int num_engines)
+{
+	const struct intel_execution_engine *e_;
+	uint64_t val[num_engines];
+	int fd[num_engines];
+	igt_spin_t *spin[2];
+	/* num_engines acts as the "not found yet" sentinel. */
+	unsigned int busy_idx[2] = { num_engines, num_engines };
+	unsigned int i;
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+	igt_assert(e->exec_id != 0 && e->exec_id != I915_EXEC_RENDER);
+
+	spin[0] = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+	igt_spin_batch_set_timeout(spin[0], batch_duration_ns);
+
+	spin[1] = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin[1], batch_duration_ns);
+
+	i = 0;
+	for_each_engine_class_instance(gem_fd, e_) {
+		if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
+			continue;
+
+		/* Never write past the arrays sized by the caller's count. */
+		igt_assert(i < num_engines);
+
+		if (e_->class == I915_ENGINE_CLASS_RENDER && e_->instance == 0)
+			busy_idx[0] = i;
+		else if (e == e_)
+			busy_idx[1] = i;
+
+		/* The first event becomes the leader, the rest join its group. */
+		fd[i] = perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
+								  e_->instance),
+					     i == 0 ? -1 : fd[0]);
+		igt_assert(fd[i] >= 0);
+		i++;
+	}
+	igt_assert(i == num_engines);
+	igt_assert(busy_idx[0] < num_engines && busy_idx[1] < num_engines);
+
+	gem_sync(gem_fd, spin[0]->handle);
+	gem_sync(gem_fd, spin[1]->handle);
+
+	pmu_read_multi(fd[0], num_engines, val);
+
+	for (i = 0; i < num_engines; i++) {
+		if (i == busy_idx[0] || i == busy_idx[1])
+			assert_within_epsilon(val[i], batch_duration_ns,
+					      tolerance);
+		else
+			assert_within_epsilon(val[i], 0.0f, tolerance);
+	}
+
+	igt_spin_batch_free(gem_fd, spin[0]);
+	igt_spin_batch_free(gem_fd, spin[1]);
+	for (i = 0; i < num_engines; i++)
+		close(fd[i]);
+}
+
+/*
+ * Check that the semaphore and wait counters of engine e stay at zero, both
+ * while the engine idles and while it runs a timed spin batch (busy).
+ *
+ * Both events are opened, and verified to have opened, before the idle sleep
+ * or the spin batch starts, and are read only once that period is over, so
+ * the counters actually cover it.
+ */
+static void
+no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)
+{
+	igt_spin_t *spin = NULL;
+	uint64_t val[2];
+	int fd, fd2;
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+	fd = perf_i915_open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance),
+				  -1);
+	igt_assert(fd >= 0);
+	fd2 = perf_i915_open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance),
+				  fd);
+	igt_assert(fd2 >= 0);
+
+	if (busy) {
+		spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+		igt_spin_batch_set_timeout(spin, batch_duration_ns);
+		/* Sample only after the timed spinner has finished. */
+		gem_sync(gem_fd, spin->handle);
+	} else {
+		usleep(batch_duration_ns / 1000);
+	}
+
+	pmu_read_multi(fd, 2, val);
+
+	assert_within_epsilon(val[0], 0.0f, tolerance);
+	assert_within_epsilon(val[1], 0.0f, tolerance);
+
+	if (spin)
+		igt_spin_batch_free(gem_fd, spin);
+	close(fd2);
+	close(fd);
+}
+
+/*
+ * Open the busy counter of engine e twice while a timed spin batch runs and
+ * check that each client only sees the load that occurred while its own event
+ * was open: the first client is expected to report the full batch duration,
+ * the second one third of it.
+ */
+static void
+multi_client(int gem_fd, const struct intel_execution_engine *e)
+{
+	uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
+	igt_spin_t *spin;
+	uint64_t val[2];
+	int fd[2];
+
+	igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+	spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+	fd[0] = perf_i915_open(config);
+	igt_assert(fd[0] >= 0);
+
+	/* Second client joins a quarter of the batch duration later... */
+	usleep(batch_duration_ns / 4000);
+
+	fd[1] = perf_i915_open(config);
+	igt_assert(fd[1] >= 0);
+
+	/* ...and is sampled after a further third of the batch duration. */
+	usleep(batch_duration_ns / 3000);
+
+	val[1] = pmu_read_single(fd[1]);
+	close(fd[1]);
+
+	/* The first client is sampled only once the batch has completed. */
+	gem_sync(gem_fd, spin->handle);
+
+	val[0] = pmu_read_single(fd[0]);
+
+	assert_within_epsilon(val[0], batch_duration_ns, tolerance);
+	assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd[0]);
+}
+
+/**
+ * Tests that the i915 PMU correctly errors out on invalid initialization.
+ * The i915 PMU is an uncore PMU, thus:
+ *  - a sampling period is not supported (EINVAL expected)
+ *  - a pid other than -1 is not supported since we can't count per-process
+ *    (we count for the whole system); the test uses pid 0 and expects EINVAL
+ *  - cpu != 0 is not supported since the i915 PMU exposes a cpumask for CPU0
+ *    (ENODEV expected)
+ */
+static void invalid_init(void)
+{
+	struct perf_event_attr attr;
+	int pid, cpu;
+
+#define ATTR_INIT() \
+do { \
+	memset(&attr, 0, sizeof (attr)); \
+	attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
+	attr.type = i915_type_id(); \
+	igt_assert(attr.type != 0); \
+} while(0)
+
+	ATTR_INIT();
+	attr.sample_period = 100;
+	pid = -1;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	ATTR_INIT();
+	pid = 0;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	ATTR_INIT();
+	pid = -1;
+	cpu = 1;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, ENODEV);
+}
+
+/*
+ * Try to open non-engine metric number i. A valid metric must open (the
+ * subtest is skipped if the open fails with ENODEV); an invalid one must be
+ * rejected.
+ */
+static void init_other(unsigned int i, bool valid)
+{
+	int fd = perf_i915_open(__I915_PMU_OTHER(i));
+
+	igt_require(!(fd < 0 && errno == ENODEV));
+
+	if (!valid) {
+		igt_assert(fd < 0);
+		return;
+	}
+
+	igt_assert(fd >= 0);
+	close(fd);
+}
+
+/*
+ * Like init_other(), but additionally read the counter once when the metric
+ * is valid. Only the success of the read is checked, not its value.
+ */
+static void read_other(unsigned int i, bool valid)
+{
+	int fd = perf_i915_open(__I915_PMU_OTHER(i));
+
+	igt_require(!(fd < 0 && errno == ENODEV));
+
+	if (!valid) {
+		igt_assert(fd < 0);
+		return;
+	}
+
+	igt_assert(fd >= 0);
+	(void)pmu_read_single(fd);
+	close(fd);
+}
+
+/*
+ * Report whether CPU0 can be taken offline, judged by whether its sysfs
+ * "online" control file can be opened for writing.
+ *
+ * Any successfully opened descriptor (including 0) counts as support, and
+ * close() is only called on a descriptor that was actually opened.
+ */
+static bool cpu0_hotplug_support(void)
+{
+	int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);
+
+	if (fd < 0)
+		return false;
+
+	close(fd);
+
+	return true;
+}
+
+/*
+ * Return the time from start to end in nanoseconds.
+ *
+ * The computation is done in 64-bit integer arithmetic rather than through a
+ * double, so no precision is lost and there is no floating point to unsigned
+ * conversion. end is expected not to precede start.
+ */
+static uint64_t
+elapsed_ns(const struct timespec *start, const struct timespec *end)
+{
+	const int64_t sec = end->tv_sec - start->tv_sec;
+	const int64_t nsec = end->tv_nsec - start->tv_nsec;
+
+	/* nsec may be negative when only the seconds part has advanced. */
+	return (uint64_t)(sec * 1000000000LL + nsec);
+}
+
+/*
+ * Check that the render engine busy counter keeps counting while CPUs are
+ * taken offline and brought back online: a spin batch keeps the engine busy
+ * for the whole test and the counter is compared against the wall clock time
+ * elapsed since the event was opened.
+ */
+static void cpu_hotplug(int gem_fd)
+{
+	struct timespec start, now;
+	igt_spin_t *spin;
+	uint64_t val, ref;
+	int fd;
+
+	igt_require(cpu0_hotplug_support());
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
+	igt_assert(fd >= 0);
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+
+	igt_fork(child, 1) {
+		int cpu = 0;
+
+		/*
+		 * Walk cpu0, cpu1, ... until a control file cannot be opened,
+		 * taking each CPU offline for one second in turn.
+		 */
+		for (;;) {
+			char name[128];
+			int cpufd;
+
+			sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
+				cpu);
+			cpufd = open(name, O_WRONLY);
+			if (cpufd == -1) {
+				/* At least cpu0 must have been cycled. */
+				igt_assert(cpu > 0);
+				break;
+			}
+			/* Length 2 writes the digit plus the terminating NUL. */
+			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			usleep(1000 * 1000);
+
+			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			close(cpufd);
+			cpu++;
+		}
+	}
+
+	igt_waitchildren();
+
+	igt_spin_batch_end(spin);
+	gem_sync(gem_fd, spin->handle);
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	val = pmu_read_single(fd);
+
+	ref = elapsed_ns(&start, &now);
+
+	assert_within_epsilon(val, ref, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+/*
+ * Check the interrupt counter: it must not advance while the GPU is idle and
+ * must have advanced after a timed spin batch ran on the render engine.
+ */
+static void
+test_interrupts(int gem_fd)
+{
+	igt_spin_t *spin;
+	uint64_t idle, busy, prev;
+	int fd;
+
+	fd = perf_i915_open(I915_PMU_INTERRUPTS);
+	igt_assert(fd >= 0);
+
+	gem_quiescent_gpu(gem_fd);
+	/* NOTE(review): presumably lets the GPU settle into idle - confirm. */
+	sleep(2);
+	prev = pmu_read_single(fd);
+	usleep(batch_duration_ns / 1000);
+	idle = pmu_read_single(fd);
+
+	igt_assert_eq(idle - prev, 0);
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+	gem_sync(gem_fd, spin->handle);
+
+	busy = pmu_read_single(fd);
+	igt_assert(busy > idle);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+/*
+ * Check the requested and actual frequency counters, read as one group: both
+ * must have advanced further after a timed spin batch on the render engine
+ * than they had after the preceding idle period.
+ *
+ * The second event is opened outside of igt_assert() and its descriptor is
+ * kept, so that it is closed again together with the group leader.
+ */
+static void
+test_frequency(int gem_fd)
+{
+	igt_spin_t *spin;
+	uint64_t idle[2], busy[2];
+	int fd, fd2;
+
+	fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
+	igt_assert(fd >= 0);
+	fd2 = perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd);
+	igt_assert(fd2 >= 0);
+
+	gem_quiescent_gpu(gem_fd);
+	usleep(batch_duration_ns / 1000);
+
+	pmu_read_multi(fd, 2, idle);
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+	gem_sync(gem_fd, spin->handle);
+
+	pmu_read_multi(fd, 2, busy);
+
+	igt_assert(busy[0] > idle[0]);
+	igt_assert(busy[1] > idle[1]);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd2);
+	close(fd);
+}
+
+/*
+ * Check the RC6 residency counter: over an idle period it must advance by
+ * that period's duration (within tolerance), and it must not advance at all
+ * while a timed spin batch runs on the render engine.
+ */
+static void
+test_rc6(int gem_fd)
+{
+	int64_t duration_ns = 500 * 1000 * 1000;
+	igt_spin_t *spin;
+	uint64_t idle, busy, prev;
+	int fd;
+
+	fd = perf_i915_open(I915_PMU_RC6_RESIDENCY);
+	igt_assert(fd >= 0);
+
+	gem_quiescent_gpu(gem_fd);
+	/* NOTE(review): presumably gives the GPU time to enter RC6 - confirm. */
+	sleep(2);
+
+	prev = pmu_read_single(fd);
+	usleep(duration_ns / 1000);
+	idle = pmu_read_single(fd);
+
+	assert_within_epsilon(idle - prev, duration_ns, tolerance);
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin, duration_ns);
+
+	/* Take the baseline only after the spinner has been submitted. */
+	prev = pmu_read_single(fd);
+
+	gem_sync(gem_fd, spin->handle);
+
+	busy = pmu_read_single(fd);
+	assert_within_epsilon(busy - prev, 0.0, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+/*
+ * Check the RC6, RC6p and RC6pp residency counters as one group: over an idle
+ * period each available counter must advance by that period's duration
+ * (within tolerance), and none may advance while a timed spin batch runs on
+ * the render engine.
+ *
+ * Only runs on pre-gen8 devices other than Haswell. RC6p and RC6pp are
+ * optional: each is added to the group only if it can be opened. The third
+ * event is RC6pp rather than a second RC6p, and every opened descriptor is
+ * closed at the end.
+ */
+static void
+test_rc6p(int gem_fd)
+{
+	const unsigned int devid = intel_get_drm_devid(gem_fd);
+	int64_t duration_ns = 2 * 1000 * 1000 * 1000;
+	unsigned int num_pmu = 1;
+	igt_spin_t *spin;
+	uint64_t idle[3], busy[3], prev[3];
+	unsigned int i;
+	int fd[3] = { -1, -1, -1 };
+
+	igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));
+
+	fd[0] = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
+	igt_assert(fd[0] >= 0);
+
+	fd[1] = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd[0]);
+	if (fd[1] >= 0) {
+		num_pmu++;
+		fd[2] = perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd[0]);
+		if (fd[2] >= 0)
+			num_pmu++;
+	}
+
+	gem_quiescent_gpu(gem_fd);
+	sleep(2);
+
+	pmu_read_multi(fd[0], num_pmu, prev);
+	usleep(duration_ns / 1000);
+	pmu_read_multi(fd[0], num_pmu, idle);
+
+	for (i = 0; i < num_pmu; i++)
+		assert_within_epsilon(idle[i] - prev[i], duration_ns,
+				      tolerance);
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin, duration_ns);
+
+	/* Take the baseline only after the spinner has been submitted. */
+	pmu_read_multi(fd[0], num_pmu, prev);
+
+	gem_sync(gem_fd, spin->handle);
+
+	pmu_read_multi(fd[0], num_pmu, busy);
+
+	for (i = 0; i < num_pmu; i++)
+		assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	/* The first num_pmu slots are exactly the events that opened. */
+	for (i = 0; i < num_pmu; i++)
+		close(fd[i]);
+}
+
+igt_main
+{
+	const unsigned int num_other_metrics =
+				I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
+	unsigned int num_engines = 0;
+	int fd = -1;
+	const struct intel_execution_engine *e;
+	unsigned int i;
+
+	igt_fixture {
+		fd = drm_open_driver_master(DRIVER_INTEL);
+
+		igt_require_gem(fd);
+		igt_require(i915_type_id() > 0);
+
+		/* Count the engines present; the group tests size arrays by it. */
+		for_each_engine_class_instance(fd, e) {
+			if (gem_has_ring(fd, e->exec_id | e->flags))
+				num_engines++;
+		}
+	}
+
+	/**
+	 * Test that invalid access via the perf API is rejected.
+	 */
+	igt_subtest("invalid-init")
+		invalid_init();
+
+	for_each_engine_class_instance(fd, e) {
+		/**
+		 * Test that a single engine metric can be initialized.
+		 */
+		igt_subtest_f("init-busy-%s", e->name)
+			init(fd, e, I915_SAMPLE_BUSY);
+
+		igt_subtest_f("init-wait-%s", e->name)
+			init(fd, e, I915_SAMPLE_WAIT);
+
+		igt_subtest_f("init-sema-%s", e->name)
+			init(fd, e, I915_SAMPLE_SEMA);
+
+		/**
+		 * Test that engines show no load when idle.
+		 */
+		igt_subtest_f("idle-%s", e->name)
+			single(fd, e, false);
+
+		/**
+		 * Test that a single engine reports load correctly.
+		 */
+		igt_subtest_f("busy-%s", e->name)
+			single(fd, e, true);
+
+		/**
+		 * Test that when one engine is loaded the others report no
+		 * load.
+		 */
+		igt_subtest_f("busy-check-all-%s", e->name)
+			busy_check_all(fd, e, num_engines);
+
+		/**
+		 * Test that when two engines are loaded the others report no
+		 * load.
+		 */
+		if (!(e->class == I915_ENGINE_CLASS_RENDER && e->instance == 0))
+			igt_subtest_f("two-busy-check-all-%s", e->name)
+				two_busy_check_all(fd, e, num_engines);
+
+		/**
+		 * Test that semaphore counters report no activity on idle
+		 * and on busy engines.
+		 */
+		igt_subtest_f("idle-no-semaphores-%s", e->name)
+			no_sema(fd, e, false);
+
+		igt_subtest_f("busy-no-semaphores-%s", e->name)
+			no_sema(fd, e, true);
+
+		/**
+		 * Check that two perf clients do not influence each other's
+		 * observations.
+		 */
+		igt_subtest_f("multi-client-%s", e->name)
+			multi_client(fd, e);
+	}
+
+	/**
+	 * Test that non-engine counters can be initialized and read. The
+	 * loop runs one index past the last valid metric, and that invalid
+	 * metric must fail to open.
+	 */
+	for (i = 0; i < num_other_metrics + 1; i++) {
+		igt_subtest_f("other-init-%u", i)
+			init_other(i, i < num_other_metrics);
+
+		igt_subtest_f("other-read-%u", i)
+			read_other(i, i < num_other_metrics);
+	}
+
+	/**
+	 * Test counters are not affected by CPU offline/online events.
+	 */
+	igt_subtest("cpu-hotplug")
+		cpu_hotplug(fd);
+
+	/**
+	 * Test GPU frequency.
+	 */
+	igt_subtest("frequency")
+		test_frequency(fd);
+
+	/**
+	 * Test interrupt count reporting.
+	 */
+	igt_subtest("interrupts")
+		test_interrupts(fd);
+
+	/**
+	 * Test RC6 residency reporting.
+	 */
+	igt_subtest("rc6")
+		test_rc6(fd);
+
+	/**
+	 * Test RC6p residency reporting.
+	 */
+	igt_subtest("rc6p")
+		test_rc6p(fd);
+}