Message ID | 20180315125617.12062-1-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Quoting Tvrtko Ursulin (2018-03-15 12:56:17) > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > More than one test assumes that the spinner is running pretty much > immediately after we have create or submitted it. > > In actuality there is a variable delay, especially on execlists platforms, > between submission and spin batch starting to run on the hardware. > > To enable tests which care about this level of timing to account for this, > we add a new spin batch constructor which provides an output field which > can be polled to determine when the batch actually started running. > > This is implemented via MI_STOREDW_IMM from the spin batch, writing into > memory mapped page shared with userspace. > > Using this facility from perf_pmu, where applicable, should improve very > occasional test fails across the set and platforms. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > lib/igt_dummyload.c | 99 +++++++++++++++++++++++++++++++---- > lib/igt_dummyload.h | 9 ++++ > tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++----------------- > 3 files changed, 196 insertions(+), 57 deletions(-) > > diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c > index 4b20f23dfe26..0447d2f14d57 100644 > --- a/lib/igt_dummyload.c > +++ b/lib/igt_dummyload.c > @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc, > reloc->write_domain = write_domains; > } > > -static int emit_recursive_batch(igt_spin_t *spin, > - int fd, uint32_t ctx, unsigned engine, > - uint32_t dep, bool out_fence) > +#define OUT_FENCE (1 << 0) > +#define POLL_RUN (1 << 1) > + > +static int > +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine, > + uint32_t dep, unsigned int flags) > { > #define SCRATCH 0 > #define BATCH 1 > @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin, > execbuf.buffer_count++; > > if (dep) { > + igt_assert(!(flags & POLL_RUN)); > + 
Challenge left to the reader :) > /* dummy write to dependency */ > obj[SCRATCH].handle = dep; > fill_reloc(&relocs[obj[BATCH].relocation_count++], > @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin, > I915_GEM_DOMAIN_RENDER, > I915_GEM_DOMAIN_RENDER); > execbuf.buffer_count++; > + } else if (flags & POLL_RUN) { > + unsigned int offset; > + > + igt_assert(!dep); > + > + spin->poll_handle = gem_create(fd, 4096); > + spin->running = __gem_mmap__wc(fd, spin->poll_handle, > + 0, 4096, PROT_READ | PROT_WRITE); Use mmap_cpu and gem_set_caching(). > + igt_assert(spin->running); > + igt_assert_eq(*spin->running, 0); > + > + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); Hmm, have we forgot the (len-2) or is this an unusual command that knows its own length? > + > + if (gen >= 8) { > + offset = sizeof(uint32_t); > + *batch++ = 0; > + *batch++ = 0; > + } else if (gen >= 4) { > + offset = 2 * sizeof(uint32_t); > + *batch++ = 0; > + *batch++ = 0; > + } else { > + offset = sizeof(uint32_t); > + batch[-1]--; > + *batch++ = 0; > + } > + > + *batch++ = 1; > + > + obj[SCRATCH].handle = spin->poll_handle; > + fill_reloc(&relocs[obj[BATCH].relocation_count++], > + spin->poll_handle, offset, > + I915_GEM_DOMAIN_INSTRUCTION, > + I915_GEM_DOMAIN_INSTRUCTION); DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up the possibility for using different dwords for different engines and then waiting for all-engines. > + execbuf.buffer_count++; gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd. We can just do something like if (gen == 4 || gen == 5) igt_require(igt_device_set_master(fd) == 0)); > +/** > + * igt_spin_batch_new_poll: > + * @fd: open i915 drm file descriptor > + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less > + * than 0, execute on all available rings. > + * > + * Start a recursive batch on a ring. 
Immediately returns a #igt_spin_t that > + * contains the batch's handle that can be waited upon. The returned structure > + * must be passed to igt_spin_batch_free() for post-processing. > + * > + * igt_spin_t->running will containt a pointer which target will change from > + * zero to one once the spinner actually starts executing on the GPU. > + * > + * Returns: > + * Structure with helper internal state for igt_spin_batch_free(). > + */ > +igt_spin_t * > +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) > +{ > + igt_spin_t *spin; > + > + igt_require_gem(fd); > + igt_require(gem_mmap__has_wc(fd)); igt_require(gem_can_store_dword(fd, engine)); Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses at least) and some platforms will die (*cough* snb *cough*). > + > + spin = __igt_spin_batch_new_poll(fd, ctx, engine); > + igt_assert(gem_bo_busy(fd, spin->handle)); > + > + return spin; > +} > igt_spin_t *__igt_spin_batch_new(int fd, > @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd, > uint32_t ctx, > unsigned engine); > > +igt_spin_t *__igt_spin_batch_new_poll(int fd, > + uint32_t ctx, > + unsigned engine); > +igt_spin_t *igt_spin_batch_new_poll(int fd, > + uint32_t ctx, > + unsigned engine); > + > void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns); > void igt_spin_batch_end(igt_spin_t *spin); > void igt_spin_batch_free(int fd, igt_spin_t *spin); > diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > index 19fcc95ffc7f..d1b7b23bc646 100644 > --- a/tests/perf_pmu.c > +++ b/tests/perf_pmu.c > @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags) > usleep(batch_duration_ns / 5000); > } > > +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags) > +{ > + return __igt_spin_batch_new_poll(fd, ctx, flags); > +} > + > +static unsigned long __spin_wait(igt_spin_t *spin) > +{ > + struct timespec start = { }; > + > + igt_nsec_elapsed(&start); > + > + while 
(!spin->running); Put ';' on a new line so it's clearly visible. > + > + return igt_nsec_elapsed(&start); > +} > + > +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags) > +{ > + igt_spin_t *spin = __spin_poll(fd, ctx, flags); > + > + __spin_wait(spin); > + > + return spin; > +} > + > +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags) spin_sync() has connotations with gem_sync(). gem_sync is wait for end, but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah. > +{ > + igt_require_gem(fd); > + > + return __spin_sync(fd, ctx, flags); > +} > static void > __submit_spin_batch(int gem_fd, > + igt_spin_t *spin, > struct drm_i915_gem_exec_object2 *obj, > const struct intel_execution_engine2 *e) > { > struct drm_i915_gem_execbuffer2 eb = { > - .buffer_count = 1, > .buffers_ptr = to_user_pointer(obj), > .flags = e2ring(gem_fd, e), > }; > > + if (spin->running) { > + obj[0].handle = spin->poll_handle; > + obj[0].flags = EXEC_OBJECT_ASYNC; > + obj[1].handle = spin->handle; > + eb.buffer_count = 2; > + } else { > + obj[0].handle = spin->handle; > + eb.buffer_count = 1; > + } obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are essential. Or else the kernel *will* move spin->poll_handle and then it is fubar. -Chris
On 15/03/2018 13:14, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-03-15 12:56:17) >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> More than one test assumes that the spinner is running pretty much >> immediately after we have create or submitted it. >> >> In actuality there is a variable delay, especially on execlists platforms, >> between submission and spin batch starting to run on the hardware. >> >> To enable tests which care about this level of timing to account for this, >> we add a new spin batch constructor which provides an output field which >> can be polled to determine when the batch actually started running. >> >> This is implemented via MI_STOREDW_IMM from the spin batch, writing into >> memory mapped page shared with userspace. >> >> Using this facility from perf_pmu, where applicable, should improve very >> occasional test fails across the set and platforms. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> >> --- >> lib/igt_dummyload.c | 99 +++++++++++++++++++++++++++++++---- >> lib/igt_dummyload.h | 9 ++++ >> tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++----------------- >> 3 files changed, 196 insertions(+), 57 deletions(-) >> >> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c >> index 4b20f23dfe26..0447d2f14d57 100644 >> --- a/lib/igt_dummyload.c >> +++ b/lib/igt_dummyload.c >> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc, >> reloc->write_domain = write_domains; >> } >> >> -static int emit_recursive_batch(igt_spin_t *spin, >> - int fd, uint32_t ctx, unsigned engine, >> - uint32_t dep, bool out_fence) >> +#define OUT_FENCE (1 << 0) >> +#define POLL_RUN (1 << 1) >> + >> +static int >> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine, >> + uint32_t dep, unsigned int flags) >> { >> #define SCRATCH 0 >> #define BATCH 1 >> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin, 
>> execbuf.buffer_count++; >> >> if (dep) { >> + igt_assert(!(flags & POLL_RUN)); >> + > > Challenge left to the reader :) Well not the reader, whoever gets to need both. :) >> /* dummy write to dependency */ >> obj[SCRATCH].handle = dep; >> fill_reloc(&relocs[obj[BATCH].relocation_count++], >> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin, >> I915_GEM_DOMAIN_RENDER, >> I915_GEM_DOMAIN_RENDER); >> execbuf.buffer_count++; >> + } else if (flags & POLL_RUN) { >> + unsigned int offset; >> + >> + igt_assert(!dep); >> + >> + spin->poll_handle = gem_create(fd, 4096); >> + spin->running = __gem_mmap__wc(fd, spin->poll_handle, >> + 0, 4096, PROT_READ | PROT_WRITE); > > Use mmap_cpu and gem_set_caching(). Wouldn't that get us into coherency issues on some platforms? I keep the page mapped for API users to poll on. > >> + igt_assert(spin->running); >> + igt_assert_eq(*spin->running, 0); >> + >> + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); > > Hmm, have we forgot the (len-2) or is this an unusual command that knows > its own length? I lifted the code from elsewhere. > >> + >> + if (gen >= 8) { >> + offset = sizeof(uint32_t); >> + *batch++ = 0; >> + *batch++ = 0; >> + } else if (gen >= 4) { >> + offset = 2 * sizeof(uint32_t); >> + *batch++ = 0; >> + *batch++ = 0; >> + } else { >> + offset = sizeof(uint32_t); >> + batch[-1]--; >> + *batch++ = 0; >> + } >> + >> + *batch++ = 1; >> + >> + obj[SCRATCH].handle = spin->poll_handle; >> + fill_reloc(&relocs[obj[BATCH].relocation_count++], >> + spin->poll_handle, offset, >> + I915_GEM_DOMAIN_INSTRUCTION, >> + I915_GEM_DOMAIN_INSTRUCTION); > > DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about > the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up Can do. > the possibility for using different dwords for different engines and then > waiting for all-engines. Yes, I could even use this so good to not let me be lazy. 
:) >> + execbuf.buffer_count++; > > gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd. > > We can just do something like > if (gen == 4 || gen == 5) > igt_require(igt_device_set_master(fd) == 0)); Okay. > >> +/** >> + * igt_spin_batch_new_poll: >> + * @fd: open i915 drm file descriptor >> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less >> + * than 0, execute on all available rings. >> + * >> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that >> + * contains the batch's handle that can be waited upon. The returned structure >> + * must be passed to igt_spin_batch_free() for post-processing. >> + * >> + * igt_spin_t->running will containt a pointer which target will change from >> + * zero to one once the spinner actually starts executing on the GPU. >> + * >> + * Returns: >> + * Structure with helper internal state for igt_spin_batch_free(). >> + */ >> +igt_spin_t * >> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) >> +{ >> + igt_spin_t *spin; >> + >> + igt_require_gem(fd); >> + igt_require(gem_mmap__has_wc(fd)); > > igt_require(gem_can_store_dword(fd, engine)); > > Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses > at least) and some platforms will die (*cough* snb *cough*). Grr that makes it all problematic. Well, maybe not completely, I can just fall back to less accurate method on those platforms. 
> >> + >> + spin = __igt_spin_batch_new_poll(fd, ctx, engine); >> + igt_assert(gem_bo_busy(fd, spin->handle)); >> + >> + return spin; >> +} > >> igt_spin_t *__igt_spin_batch_new(int fd, >> @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd, >> uint32_t ctx, >> unsigned engine); >> >> +igt_spin_t *__igt_spin_batch_new_poll(int fd, >> + uint32_t ctx, >> + unsigned engine); >> +igt_spin_t *igt_spin_batch_new_poll(int fd, >> + uint32_t ctx, >> + unsigned engine); >> + >> void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns); >> void igt_spin_batch_end(igt_spin_t *spin); >> void igt_spin_batch_free(int fd, igt_spin_t *spin); >> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c >> index 19fcc95ffc7f..d1b7b23bc646 100644 >> --- a/tests/perf_pmu.c >> +++ b/tests/perf_pmu.c >> @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags) >> usleep(batch_duration_ns / 5000); >> } >> >> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags) >> +{ >> + return __igt_spin_batch_new_poll(fd, ctx, flags); >> +} >> + >> +static unsigned long __spin_wait(igt_spin_t *spin) >> +{ >> + struct timespec start = { }; >> + >> + igt_nsec_elapsed(&start); >> + >> + while (!spin->running); > > Put ';' on a new line so it's clearly visible. Okay. > >> + >> + return igt_nsec_elapsed(&start); >> +} >> + >> +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags) >> +{ >> + igt_spin_t *spin = __spin_poll(fd, ctx, flags); >> + >> + __spin_wait(spin); >> + >> + return spin; >> +} >> + >> +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags) > > spin_sync() has connotations with gem_sync(). gem_sync is wait for end, > but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah. 
> >> +{ >> + igt_require_gem(fd); >> + >> + return __spin_sync(fd, ctx, flags); >> +} > >> static void >> __submit_spin_batch(int gem_fd, >> + igt_spin_t *spin, >> struct drm_i915_gem_exec_object2 *obj, >> const struct intel_execution_engine2 *e) >> { >> struct drm_i915_gem_execbuffer2 eb = { >> - .buffer_count = 1, >> .buffers_ptr = to_user_pointer(obj), >> .flags = e2ring(gem_fd, e), >> }; >> >> + if (spin->running) { >> + obj[0].handle = spin->poll_handle; >> + obj[0].flags = EXEC_OBJECT_ASYNC; >> + obj[1].handle = spin->handle; >> + eb.buffer_count = 2; >> + } else { >> + obj[0].handle = spin->handle; >> + eb.buffer_count = 1; >> + } > > obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are > essential. Or else the kernel *will* move spin->poll_handle and then it > is fubar. Why the caller has to do it? It is providing obj array which gets populated by the helper and by the kernel. If I add EXEC_OBJECT_PINNED to the helper is there a remaining problem? Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-03-15 13:36:26) > > On 15/03/2018 13:14, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2018-03-15 12:56:17) > >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> > >> More than one test assumes that the spinner is running pretty much > >> immediately after we have create or submitted it. > >> > >> In actuality there is a variable delay, especially on execlists platforms, > >> between submission and spin batch starting to run on the hardware. > >> > >> To enable tests which care about this level of timing to account for this, > >> we add a new spin batch constructor which provides an output field which > >> can be polled to determine when the batch actually started running. > >> > >> This is implemented via MI_STOREDW_IMM from the spin batch, writing into > >> memory mapped page shared with userspace. > >> > >> Using this facility from perf_pmu, where applicable, should improve very > >> occasional test fails across the set and platforms. > >> > >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> > >> --- > >> lib/igt_dummyload.c | 99 +++++++++++++++++++++++++++++++---- > >> lib/igt_dummyload.h | 9 ++++ > >> tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++----------------- > >> 3 files changed, 196 insertions(+), 57 deletions(-) > >> > >> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c > >> index 4b20f23dfe26..0447d2f14d57 100644 > >> --- a/lib/igt_dummyload.c > >> +++ b/lib/igt_dummyload.c > >> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc, > >> reloc->write_domain = write_domains; > >> } > >> > >> -static int emit_recursive_batch(igt_spin_t *spin, > >> - int fd, uint32_t ctx, unsigned engine, > >> - uint32_t dep, bool out_fence) > >> +#define OUT_FENCE (1 << 0) > >> +#define POLL_RUN (1 << 1) > >> + > >> +static int > >> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine, > >> + uint32_t dep, 
unsigned int flags) > >> { > >> #define SCRATCH 0 > >> #define BATCH 1 > >> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin, > >> execbuf.buffer_count++; > >> > >> if (dep) { > >> + igt_assert(!(flags & POLL_RUN)); > >> + > > > > Challenge left to the reader :) > > Well not the reader, whoever gets to need both. :) > > >> /* dummy write to dependency */ > >> obj[SCRATCH].handle = dep; > >> fill_reloc(&relocs[obj[BATCH].relocation_count++], > >> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin, > >> I915_GEM_DOMAIN_RENDER, > >> I915_GEM_DOMAIN_RENDER); > >> execbuf.buffer_count++; > >> + } else if (flags & POLL_RUN) { > >> + unsigned int offset; > >> + > >> + igt_assert(!dep); > >> + > >> + spin->poll_handle = gem_create(fd, 4096); > >> + spin->running = __gem_mmap__wc(fd, spin->poll_handle, > >> + 0, 4096, PROT_READ | PROT_WRITE); > > > > Use mmap_cpu and gem_set_caching(). > > Wouldn't that get us into coherency issues on some platforms? I keep the > page mapped for API users to poll on. bxt-a? The point of using gem_set_caching() is that it is coherent with the CPU cache even on !llc via snooping. It's then essentially the same as how we handle breadcrumbs. Now admittedly, we should do if (__gem_set_caching() == 0) running = __gem_mmap__wb(); else running = __gem_mmap__wc(); The caller need be known the wiser; except having to assume the worst and so __sync_synchronize() if they do *running = x themselves. > >> + igt_assert(spin->running); > >> + igt_assert_eq(*spin->running, 0); > >> + > >> + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); > > > > Hmm, have we forgot the (len-2) or is this an unusual command that knows > > its own length? > > I lifted the code from elsewhere. I checked, we have the same bug everywhere or nowhere. :| > >> +/** > >> + * igt_spin_batch_new_poll: > >> + * @fd: open i915 drm file descriptor > >> + * @engine: Ring to execute batch OR'd with execbuf flags. 
If value is less > >> + * than 0, execute on all available rings. > >> + * > >> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that > >> + * contains the batch's handle that can be waited upon. The returned structure > >> + * must be passed to igt_spin_batch_free() for post-processing. > >> + * > >> + * igt_spin_t->running will containt a pointer which target will change from > >> + * zero to one once the spinner actually starts executing on the GPU. > >> + * > >> + * Returns: > >> + * Structure with helper internal state for igt_spin_batch_free(). > >> + */ > >> +igt_spin_t * > >> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) > >> +{ > >> + igt_spin_t *spin; > >> + > >> + igt_require_gem(fd); > >> + igt_require(gem_mmap__has_wc(fd)); > > > > igt_require(gem_can_store_dword(fd, engine)); > > > > Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses > > at least) and some platforms will die (*cough* snb *cough*). > > Grr that makes it all problematic. Well, maybe not completely, I can > just fall back to less accurate method on those platforms. It's only a few, I don't think in the grand scheme of things it's enough to worry about. We should lose just a few pmu tests on snb. > >> static void > >> __submit_spin_batch(int gem_fd, > >> + igt_spin_t *spin, > >> struct drm_i915_gem_exec_object2 *obj, > >> const struct intel_execution_engine2 *e) > >> { > >> struct drm_i915_gem_execbuffer2 eb = { > >> - .buffer_count = 1, > >> .buffers_ptr = to_user_pointer(obj), > >> .flags = e2ring(gem_fd, e), > >> }; > >> > >> + if (spin->running) { > >> + obj[0].handle = spin->poll_handle; > >> + obj[0].flags = EXEC_OBJECT_ASYNC; > >> + obj[1].handle = spin->handle; > >> + eb.buffer_count = 2; > >> + } else { > >> + obj[0].handle = spin->handle; > >> + eb.buffer_count = 1; > >> + } > > > > obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are > > essential. 
Or else the kernel *will* move spin->poll_handle and then it > > is fubar. > > Why the caller has to do it? It is providing obj array which gets > populated by the helper and by the kernel. If I add EXEC_OBJECT_PINNED > to the helper is there a remaining problem? Yes. The caller needs to ensure that flags = PINNED *and* the offset is correct. We can't just randomly stuff PINNED in there as that pretty much guarantees the object will be moved, breaking the implicit relocations. As we are making changes to igt_spin_t, one of the ideas was that we put the obj[] array there (with the offsets and flags setup correctly) so that we could just feed that in again later without having to worry about the relocations. -Chris
On 15/03/2018 13:45, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-03-15 13:36:26) >> >> On 15/03/2018 13:14, Chris Wilson wrote: >>> Quoting Tvrtko Ursulin (2018-03-15 12:56:17) >>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> >>>> More than one test assumes that the spinner is running pretty much >>>> immediately after we have create or submitted it. >>>> >>>> In actuality there is a variable delay, especially on execlists platforms, >>>> between submission and spin batch starting to run on the hardware. >>>> >>>> To enable tests which care about this level of timing to account for this, >>>> we add a new spin batch constructor which provides an output field which >>>> can be polled to determine when the batch actually started running. >>>> >>>> This is implemented via MI_STOREDW_IMM from the spin batch, writing into >>>> memory mapped page shared with userspace. >>>> >>>> Using this facility from perf_pmu, where applicable, should improve very >>>> occasional test fails across the set and platforms. 
>>>> >>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> >>>> --- >>>> lib/igt_dummyload.c | 99 +++++++++++++++++++++++++++++++---- >>>> lib/igt_dummyload.h | 9 ++++ >>>> tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++----------------- >>>> 3 files changed, 196 insertions(+), 57 deletions(-) >>>> >>>> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c >>>> index 4b20f23dfe26..0447d2f14d57 100644 >>>> --- a/lib/igt_dummyload.c >>>> +++ b/lib/igt_dummyload.c >>>> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc, >>>> reloc->write_domain = write_domains; >>>> } >>>> >>>> -static int emit_recursive_batch(igt_spin_t *spin, >>>> - int fd, uint32_t ctx, unsigned engine, >>>> - uint32_t dep, bool out_fence) >>>> +#define OUT_FENCE (1 << 0) >>>> +#define POLL_RUN (1 << 1) >>>> + >>>> +static int >>>> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine, >>>> + uint32_t dep, unsigned int flags) >>>> { >>>> #define SCRATCH 0 >>>> #define BATCH 1 >>>> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin, >>>> execbuf.buffer_count++; >>>> >>>> if (dep) { >>>> + igt_assert(!(flags & POLL_RUN)); >>>> + >>> >>> Challenge left to the reader :) >> >> Well not the reader, whoever gets to need both. :) >> >>>> /* dummy write to dependency */ >>>> obj[SCRATCH].handle = dep; >>>> fill_reloc(&relocs[obj[BATCH].relocation_count++], >>>> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin, >>>> I915_GEM_DOMAIN_RENDER, >>>> I915_GEM_DOMAIN_RENDER); >>>> execbuf.buffer_count++; >>>> + } else if (flags & POLL_RUN) { >>>> + unsigned int offset; >>>> + >>>> + igt_assert(!dep); >>>> + >>>> + spin->poll_handle = gem_create(fd, 4096); >>>> + spin->running = __gem_mmap__wc(fd, spin->poll_handle, >>>> + 0, 4096, PROT_READ | PROT_WRITE); >>> >>> Use mmap_cpu and gem_set_caching(). 
>> >> Wouldn't that get us into coherency issues on some platforms? I keep the >> page mapped for API users to poll on. > > bxt-a? The point of using gem_set_caching() is that it is coherent with > the CPU cache even on !llc via snooping. It's then essentially the same > as how we handle breadcrumbs. > > Now admittedly, we should do > > if (__gem_set_caching() == 0) > running = __gem_mmap__wb(); > else > running = __gem_mmap__wc(); > > The caller need be known the wiser; except having to assume the worst > and so __sync_synchronize() if they do *running = x themselves. Ok. >>>> + igt_assert(spin->running); >>>> + igt_assert_eq(*spin->running, 0); >>>> + >>>> + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); >>> >>> Hmm, have we forgot the (len-2) or is this an unusual command that knows >>> its own length? >> >> I lifted the code from elsewhere. > > I checked, we have the same bug everywhere or nowhere. :| > >>>> +/** >>>> + * igt_spin_batch_new_poll: >>>> + * @fd: open i915 drm file descriptor >>>> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less >>>> + * than 0, execute on all available rings. >>>> + * >>>> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that >>>> + * contains the batch's handle that can be waited upon. The returned structure >>>> + * must be passed to igt_spin_batch_free() for post-processing. >>>> + * >>>> + * igt_spin_t->running will containt a pointer which target will change from >>>> + * zero to one once the spinner actually starts executing on the GPU. >>>> + * >>>> + * Returns: >>>> + * Structure with helper internal state for igt_spin_batch_free(). 
>>>> + */ >>>> +igt_spin_t * >>>> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) >>>> +{ >>>> + igt_spin_t *spin; >>>> + >>>> + igt_require_gem(fd); >>>> + igt_require(gem_mmap__has_wc(fd)); >>> >>> igt_require(gem_can_store_dword(fd, engine)); >>> >>> Not all platforms have a MI_STORE_DWORD/DATA_IMM (with virtual addresses >>> at least) and some platforms will die (*cough* snb *cough*). >> >> Grr that makes it all problematic. Well, maybe not completely, I can >> just fall back to less accurate method on those platforms. > > It's only a few, I don't think in the grand scheme of things it's enough > to worry about. We should lose just a few pmu tests on snb. > >>>> static void >>>> __submit_spin_batch(int gem_fd, >>>> + igt_spin_t *spin, >>>> struct drm_i915_gem_exec_object2 *obj, >>>> const struct intel_execution_engine2 *e) >>>> { >>>> struct drm_i915_gem_execbuffer2 eb = { >>>> - .buffer_count = 1, >>>> .buffers_ptr = to_user_pointer(obj), >>>> .flags = e2ring(gem_fd, e), >>>> }; >>>> >>>> + if (spin->running) { >>>> + obj[0].handle = spin->poll_handle; >>>> + obj[0].flags = EXEC_OBJECT_ASYNC; >>>> + obj[1].handle = spin->handle; >>>> + eb.buffer_count = 2; >>>> + } else { >>>> + obj[0].handle = spin->handle; >>>> + eb.buffer_count = 1; >>>> + } >>> >>> obj[] must be set up by the caller; the EXEC_OBJECT_PINNED are >>> essential. Or else the kernel *will* move spin->poll_handle and then it >>> is fubar. >> >> Why the caller has to do it? It is providing obj array which gets >> populated by the helper and by the kernel. If I add EXEC_OBJECT_PINNED >> to the helper is there a remaining problem? > > Yes. The caller needs to ensure that flags = PINNED *and* the offset is > correct. We can't just randomly stuff PINNED in there as that pretty > much guarantees the object will be moved, breaking the implicit > relocations. Nevermind I was confused, thought I was always populating the obj array. 
> As we are making changes to igt_spin_t, one of the ideas was that we put > the obj[] array there (with the offsets and flags setup correctly) so > that we could just feed that in again later without having to worry > about the relocations. I tried that before but we couldn't agree on resubmit semantics. My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact same batch, including the dependency. That would actually work well for this use case. So if you are happy with that, I can resurrect that patch, add one more to implement stuff from this patch, and rebase perf_pmu changes to follow. Actually I would change it to igt_spin_batch_restart(fd, spin, engine) - so the engine can be changed. Regards, Tvrtko > -Chris >
Quoting Tvrtko Ursulin (2018-03-15 14:37:59) > > On 15/03/2018 13:45, Chris Wilson wrote: > > As we are making changes to igt_spin_t, one of the ideas was that we put > > the obj[] array there (with the offsets and flags setup correctly) so > > that we could just feed that in again later without having to worry > > about the relocations. > > I tried that before but we couldn't agree on resubmit semantics. > > My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact > same batch, including the dependency. That would actually work well for > this use case. > > So if you are happy with that, I can resurrect that patch, add one more > to implement stuff from this patch, and rebase perf_pmu changes to follow. Honestly, best to do here first, as we will probably take forever to come up with something we both like and applies to more test cases. -Chris
On 15/03/2018 14:46, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-03-15 14:37:59) >> >> On 15/03/2018 13:45, Chris Wilson wrote: >>> As we are making changes to igt_spin_t, one of the ideas was that we put >>> the obj[] array there (with the offsets and flags setup correctly) so >>> that we could just feed that in again later without having to worry >>> about the relocations. >> >> I tried that before but we couldn't agree on resubmit semantics. >> >> My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact >> same batch, including the dependency. That would actually work well for >> this use case. >> >> So if you are happy with that, I can resurrect that patch, add one more >> to implement stuff from this patch, and rebase perf_pmu changes to follow. > > Honestly, best to do here first, as we will probably take forever to come > up with something we both like and applies to more test cases. Don't quite get what you mean by "best to do here first" - where is here? Still locally to perf_pmu so no generic spin batch resubmit yet? But I can cache the obj array under spin_batch_t as a shortcut for time being? Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-03-15 14:53:08) > > On 15/03/2018 14:46, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2018-03-15 14:37:59) > >> > >> On 15/03/2018 13:45, Chris Wilson wrote: > >>> As we are making changes to igt_spin_t, one of the ideas was that we put > >>> the obj[] array there (with the offsets and flags setup correctly) so > >>> that we could just feed that in again later without having to worry > >>> about the relocations. > >> > >> I tried that before but we couldn't agree on resubmit semantics. > >> > >> My patch had igt_spin_batch_restart(fd, spin) - so emitting the exact > >> same batch, including the dependency. That would actually work well for > >> this use case. > >> > >> So if you are happy with that, I can resurrect that patch, add one more > >> to implement stuff from this patch, and rebase perf_pmu changes to follow. > > > > Honestly, best to do here first, as we will probably take forever to come > > up with something we both like and applies to more test cases. > > Don't quite get what you mean by "best to do here first" - where is > here? Fix perf_pmu, then worry about API. We're still waiting for KASAN results; we may have more work ahead of us yet. > Still locally to perf_pmu so no generic spin batch resubmit yet? > But I can cache the obj array under spin_batch_t as a shortcut for time > being? I'd take igt_spin_t.obj[] :) But I don't insist on it; I'd like to get the wait-for-spin working before tackling the resubmit API. There are a few other places that either have an open-coded wait-for-submit, or need one. -Chris
diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c index 4b20f23dfe26..0447d2f14d57 100644 --- a/lib/igt_dummyload.c +++ b/lib/igt_dummyload.c @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc, reloc->write_domain = write_domains; } -static int emit_recursive_batch(igt_spin_t *spin, - int fd, uint32_t ctx, unsigned engine, - uint32_t dep, bool out_fence) +#define OUT_FENCE (1 << 0) +#define POLL_RUN (1 << 1) + +static int +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine, + uint32_t dep, unsigned int flags) { #define SCRATCH 0 #define BATCH 1 @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin, execbuf.buffer_count++; if (dep) { + igt_assert(!(flags & POLL_RUN)); + /* dummy write to dependency */ obj[SCRATCH].handle = dep; fill_reloc(&relocs[obj[BATCH].relocation_count++], @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); execbuf.buffer_count++; + } else if (flags & POLL_RUN) { + unsigned int offset; + + igt_assert(!dep); + + spin->poll_handle = gem_create(fd, 4096); + spin->running = __gem_mmap__wc(fd, spin->poll_handle, + 0, 4096, PROT_READ | PROT_WRITE); + igt_assert(spin->running); + igt_assert_eq(*spin->running, 0); + + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 
1 << 22 : 0); + + if (gen >= 8) { + offset = sizeof(uint32_t); + *batch++ = 0; + *batch++ = 0; + } else if (gen >= 4) { + offset = 2 * sizeof(uint32_t); + *batch++ = 0; + *batch++ = 0; + } else { + offset = sizeof(uint32_t); + batch[-1]--; + *batch++ = 0; + } + + *batch++ = 1; + + obj[SCRATCH].handle = spin->poll_handle; + fill_reloc(&relocs[obj[BATCH].relocation_count++], + spin->poll_handle, offset, + I915_GEM_DOMAIN_INSTRUCTION, + I915_GEM_DOMAIN_INSTRUCTION); + execbuf.buffer_count++; } spin->batch = batch; @@ -170,14 +210,14 @@ static int emit_recursive_batch(igt_spin_t *spin, execbuf.buffers_ptr = to_user_pointer(obj + (2 - execbuf.buffer_count)); execbuf.rsvd1 = ctx; - if (out_fence) + if (flags & OUT_FENCE) execbuf.flags |= I915_EXEC_FENCE_OUT; for (i = 0; i < nengine; i++) { execbuf.flags &= ~ENGINE_MASK; execbuf.flags |= engines[i]; gem_execbuf_wr(fd, &execbuf); - if (out_fence) { + if (flags & OUT_FENCE) { int _fd = execbuf.rsvd2 >> 32; igt_assert(_fd >= 0); @@ -199,7 +239,7 @@ static int emit_recursive_batch(igt_spin_t *spin, static igt_spin_t * ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep, - int out_fence) + unsigned int flags) { igt_spin_t *spin; @@ -207,7 +247,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep, igt_assert(spin); spin->out_fence = emit_recursive_batch(spin, fd, ctx, engine, dep, - out_fence); + flags); pthread_mutex_lock(&list_lock); igt_list_add(&spin->link, &spin_list); @@ -219,7 +259,7 @@ ___igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep, igt_spin_t * __igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep) { - return ___igt_spin_batch_new(fd, ctx, engine, dep, false); + return ___igt_spin_batch_new(fd, ctx, engine, dep, 0); } /** @@ -253,7 +293,7 @@ igt_spin_batch_new(int fd, uint32_t ctx, unsigned engine, uint32_t dep) igt_spin_t * __igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine) { - return ___igt_spin_batch_new(fd, 
ctx, engine, 0, true); + return ___igt_spin_batch_new(fd, ctx, engine, 0, OUT_FENCE); } /** @@ -286,6 +326,42 @@ igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine) return spin; } +igt_spin_t * +__igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) +{ + return ___igt_spin_batch_new(fd, ctx, engine, 0, POLL_RUN); +} + +/** + * igt_spin_batch_new_poll: + * @fd: open i915 drm file descriptor + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less + * than 0, execute on all available rings. + * + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that + * contains the batch's handle that can be waited upon. The returned structure + * must be passed to igt_spin_batch_free() for post-processing. + * + * igt_spin_t->running will contain a pointer whose target will change from + * zero to one once the spinner actually starts executing on the GPU. + * + * Returns: + * Structure with helper internal state for igt_spin_batch_free(). + */ +igt_spin_t * +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine) +{ + igt_spin_t *spin; + + igt_require_gem(fd); + igt_require(gem_mmap__has_wc(fd)); + + spin = __igt_spin_batch_new_poll(fd, ctx, engine); + igt_assert(gem_bo_busy(fd, spin->handle)); + + return spin; +} + static void notify(union sigval arg) { igt_spin_t *spin = arg.sival_ptr; @@ -367,6 +443,11 @@ void igt_spin_batch_free(int fd, igt_spin_t *spin) igt_spin_batch_end(spin); gem_munmap(spin->batch, BATCH_SIZE); + if (spin->running) { + gem_munmap(spin->running, 4096); + gem_close(fd, spin->poll_handle); + } + gem_close(fd, spin->handle); if (spin->out_fence >= 0) diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h index 4103e4ab9e36..7ed93a3884b9 100644 --- a/lib/igt_dummyload.h +++ b/lib/igt_dummyload.h @@ -36,6 +36,8 @@ typedef struct igt_spin { struct igt_list link; uint32_t *batch; int out_fence; + uint32_t poll_handle; + bool *running; } igt_spin_t; igt_spin_t *__igt_spin_batch_new(int fd, @@ 
-55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd, uint32_t ctx, unsigned engine); +igt_spin_t *__igt_spin_batch_new_poll(int fd, + uint32_t ctx, + unsigned engine); +igt_spin_t *igt_spin_batch_new_poll(int fd, + uint32_t ctx, + unsigned engine); + void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns); void igt_spin_batch_end(igt_spin_t *spin); void igt_spin_batch_free(int fd, igt_spin_t *spin); diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c index 19fcc95ffc7f..d1b7b23bc646 100644 --- a/tests/perf_pmu.c +++ b/tests/perf_pmu.c @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags) usleep(batch_duration_ns / 5000); } +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags) +{ + return __igt_spin_batch_new_poll(fd, ctx, flags); +} + +static unsigned long __spin_wait(igt_spin_t *spin) +{ + struct timespec start = { }; + + igt_nsec_elapsed(&start); + + while (!spin->running); + + return igt_nsec_elapsed(&start); +} + +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags) +{ + igt_spin_t *spin = __spin_poll(fd, ctx, flags); + + __spin_wait(spin); + + return spin; +} + +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags) +{ + igt_require_gem(fd); + + return __spin_sync(fd, ctx, flags); +} + static void single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags) { @@ -195,7 +227,7 @@ single(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags) fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); if (flags & TEST_BUSY) - spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e)); else spin = NULL; @@ -251,13 +283,7 @@ busy_start(int gem_fd, const struct intel_execution_engine2 *e) */ sleep(2); - spin = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); - - /* - * Sleep for a bit after making the engine busy to make sure the PMU - * gets enabled when the batch is 
already running. - */ - usleep(500e3); + spin = __spin_sync(gem_fd, 0, e2ring(gem_fd, e)); fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); @@ -300,7 +326,7 @@ busy_double_start(int gem_fd, const struct intel_execution_engine2 *e) * re-submission in execlists mode. Make sure busyness is correctly * reported with the engine busy, and after the engine went idle. */ - spin[0] = __igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + spin[0] = __spin_sync(gem_fd, 0, e2ring(gem_fd, e)); usleep(500e3); spin[1] = __igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0); @@ -386,7 +412,7 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, igt_assert_eq(i, num_engines); - spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e)); pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) @@ -413,15 +439,25 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, static void __submit_spin_batch(int gem_fd, + igt_spin_t *spin, struct drm_i915_gem_exec_object2 *obj, const struct intel_execution_engine2 *e) { struct drm_i915_gem_execbuffer2 eb = { - .buffer_count = 1, .buffers_ptr = to_user_pointer(obj), .flags = e2ring(gem_fd, e), }; + if (spin->running) { + obj[0].handle = spin->poll_handle; + obj[0].flags = EXEC_OBJECT_ASYNC; + obj[1].handle = spin->handle; + eb.buffer_count = 2; + } else { + obj[0].handle = spin->handle; + eb.buffer_count = 1; + } + gem_execbuf(gem_fd, &eb); } @@ -429,7 +465,7 @@ static void most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, const unsigned int num_engines, unsigned int flags) { - struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_exec_object2 obj[2]; const struct intel_execution_engine2 *e_; uint64_t tval[2][num_engines]; uint64_t val[num_engines]; @@ -438,20 +474,19 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, 
igt_spin_t *spin = NULL; unsigned int idle_idx, i; + memset(obj, 0, sizeof(obj)); + i = 0; for_each_engine_class_instance(fd, e_) { if (!gem_has_engine(gem_fd, e_->class, e_->instance)) continue; - if (e == e_) { + if (e == e_) idle_idx = i; - } else if (spin) { - __submit_spin_batch(gem_fd, &obj, e_); - } else { - spin = igt_spin_batch_new(gem_fd, 0, - e2ring(gem_fd, e_), 0); - obj.handle = spin->handle; - } + else if (spin) + __submit_spin_batch(gem_fd, spin, obj, e_); + else + spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e_)); val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance); } @@ -461,6 +496,9 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, for (i = 0; i < num_engines; i++) fd[i] = open_group(val[i], fd[0]); + /* Small delay to allow engines to start. */ + usleep(__spin_wait(spin) * num_engines / 1e3); + pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) @@ -489,7 +527,7 @@ static void all_busy_check_all(int gem_fd, const unsigned int num_engines, unsigned int flags) { - struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_exec_object2 obj[2]; const struct intel_execution_engine2 *e; uint64_t tval[2][num_engines]; uint64_t val[num_engines]; @@ -498,18 +536,17 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines, igt_spin_t *spin = NULL; unsigned int i; + memset(obj, 0, sizeof(obj)); + i = 0; for_each_engine_class_instance(fd, e) { if (!gem_has_engine(gem_fd, e->class, e->instance)) continue; - if (spin) { - __submit_spin_batch(gem_fd, &obj, e); - } else { - spin = igt_spin_batch_new(gem_fd, 0, - e2ring(gem_fd, e), 0); - obj.handle = spin->handle; - } + if (spin) + __submit_spin_batch(gem_fd, spin, obj, e); + else + spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e)); val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance); } @@ -519,6 +556,9 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines, for (i = 0; i < 
num_engines; i++) fd[i] = open_group(val[i], fd[0]); + /* Small delay to allow engines to start. */ + usleep(__spin_wait(spin) * num_engines / 1e3); + pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) @@ -550,7 +590,7 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, unsigned int flags) open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd); if (flags & TEST_BUSY) - spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e)); else spin = NULL; @@ -884,7 +924,7 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e) */ fd[1] = open_pmu(config); - spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); + spin = spin_sync(gem_fd, 0, e2ring(gem_fd, e)); val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]); slept[1] = measured_usleep(batch_duration_ns / 1000); @@ -1248,7 +1288,7 @@ test_frequency(int gem_fd) igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq); gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */ - spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0); + spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER); slept = pmu_read_multi(fd, 2, start); measured_usleep(batch_duration_ns / 1000); @@ -1274,7 +1314,7 @@ test_frequency(int gem_fd) igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq); gem_quiescent_gpu(gem_fd); - spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0); + spin = spin_sync(gem_fd, 0, I915_EXEC_RENDER); slept = pmu_read_multi(fd, 2, start); measured_usleep(batch_duration_ns / 1000); @@ -1455,6 +1495,8 @@ static void __rearm_spin_batch(igt_spin_t *spin) { const uint32_t mi_arb_chk = 0x5 << 23; + if (spin->running) + *spin->running = 0; *spin->batch = mi_arb_chk; __sync_synchronize(); } @@ -1517,7 +1559,7 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e, const unsigned long timeout[] = { pwm_calibration_us * 1000, 
test_us * 1000 }; - struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_exec_object2 obj[2]; uint64_t total_busy_ns = 0, total_idle_ns = 0; igt_spin_t *spin; int ret; @@ -1530,12 +1572,13 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e, igt_warn("Failed to set scheduling policy!\n"); /* Allocate our spin batch and idle it. */ - spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); - obj.handle = spin->handle; - __submit_spin_batch(gem_fd, &obj, e); /* record its location */ + spin = __spin_poll(gem_fd, 0, e2ring(gem_fd, e)); + memset(obj, 0, sizeof(obj)); + __submit_spin_batch(gem_fd, spin, obj, e); /* record its location */ igt_spin_batch_end(spin); - gem_sync(gem_fd, obj.handle); - obj.flags |= EXEC_OBJECT_PINNED; + gem_sync(gem_fd, spin->handle); + obj[0].flags |= EXEC_OBJECT_PINNED; + obj[1].flags |= EXEC_OBJECT_PINNED; /* 1st pass is calibration, second pass is the test. */ for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) { @@ -1545,24 +1588,30 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e, igt_nsec_elapsed(&test_start); do { - unsigned int target_idle_us, t_busy; + unsigned int target_idle_us; + struct timespec start = { }; + unsigned long prep_delay_ns; /* Restart the spinbatch. */ + igt_nsec_elapsed(&start); __rearm_spin_batch(spin); - __submit_spin_batch(gem_fd, &obj, e); + __submit_spin_batch(gem_fd, spin, obj, e); - /* - * Note that the submission may be delayed to a - * tasklet (ksoftirqd) which cannot run until we - * sleep as we hog the cpu (we are RT). - */ + /* Wait for batch to start executing. */ + __spin_wait(spin); + prep_delay_ns = igt_nsec_elapsed(&start); - t_busy = measured_usleep(busy_us); + /* PWM busy sleep. 
*/ + memset(&start, 0, sizeof(start)); + igt_nsec_elapsed(&start); + measured_usleep(busy_us); igt_spin_batch_end(spin); - gem_sync(gem_fd, obj.handle); + gem_sync(gem_fd, spin->handle); - total_busy_ns += t_busy; + total_busy_ns += igt_nsec_elapsed(&start); + total_idle_ns += prep_delay_ns; + /* Re-calibrate. */ target_idle_us = (100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000; total_idle_ns += measured_usleep(target_idle_us);