diff mbox series

[i-g-t,17/21] gem_wsim: Infinite batch support

Message ID 20190508121058.27038-18-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Media scalability tooling | expand

Commit Message

Tvrtko Ursulin May 8, 2019, 12:10 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

For simulating frame split workloads it is useful to express a batch which
ends at the same time as the parallel submission on the respective bonded
engine. For this we add support for infinite batch durations and the batch
terminate command ('T'). Syntax looks like this:

  1.RCS.*.0.0
  T.-1

First step starts an infinite batch, and second command terminates the
infinite batch with the usual relative workload step addressing.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 benchmarks/gem_wsim.c                  | 119 +++++++++++++++++++------
 benchmarks/wsim/README                 |   9 +-
 benchmarks/wsim/frame-split-60fps.wsim |   6 +-
 3 files changed, 102 insertions(+), 32 deletions(-)

Comments

Chris Wilson May 10, 2019, 1:48 p.m. UTC | #1
Quoting Tvrtko Ursulin (2019-05-08 13:10:54)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> For simulating frame split workloads it is useful to express a batch which
> ends at the same time as the parallel submission on the respective bonded
> engine. For this we add support for infinite batch durations and the batch
> terminate command ('T'). Syntax looks like this:
> 
>   1.RCS.*.0.0
>   T.-1
> 
> First step starts an infinite batch, and second command terminates the
> infinite batch with the usual relative workload step addressing.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  benchmarks/gem_wsim.c                  | 119 +++++++++++++++++++------
>  benchmarks/wsim/README                 |   9 +-
>  benchmarks/wsim/frame-split-60fps.wsim |   6 +-
>  3 files changed, 102 insertions(+), 32 deletions(-)
> 
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index cc6f4a742c12..97821b723b02 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -86,6 +86,7 @@ enum w_type
>         ENGINE_MAP,
>         LOAD_BALANCE,
>         BOND,
> +       TERMINATE,
>  };
>  
>  struct deps
> @@ -113,6 +114,7 @@ struct w_step
>         unsigned int context;
>         unsigned int engine;
>         struct duration duration;
> +       bool unbound_duration;
>         struct deps data_deps;
>         struct deps fence_deps;
>         int emit_fence;
> @@ -143,7 +145,7 @@ struct w_step
>  
>         struct drm_i915_gem_execbuffer2 eb;
>         struct drm_i915_gem_exec_object2 *obj;
> -       struct drm_i915_gem_relocation_entry reloc[4];
> +       struct drm_i915_gem_relocation_entry reloc[5];
>         unsigned long bb_sz;
>         uint32_t bb_handle;
>         uint32_t *seqno_value;
> @@ -153,6 +155,7 @@ struct w_step
>         uint32_t *rt1_address;
>         uint32_t *latch_value;
>         uint32_t *latch_address;
> +       uint32_t *recursive_bb_start;
>  };
>  
>  DECLARE_EWMA(uint64_t, rt, 4, 2)
> @@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>  
>                                 step.type = ENGINE_MAP;
>                                 goto add_step;
> +                       } else if (!strcmp(field, "T")) {
> +                               int_field(TERMINATE, target,
> +                                         tmp >= 0 || ((int)nr_steps + tmp) < 0,
> +                                         "Invalid terminate target at step %u!\n");
>                         } else if (!strcmp(field, "X")) {
>                                 unsigned int nr = 0;
>                                 while ((field = strtok_r(fstart, ".", &fctx))) {
> @@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>  
>                         fstart = NULL;
>  
> -                       tmpl = strtol(field, &sep, 10);
> -                       check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
> -                                 tmpl == LONG_MAX,
> -                                 "Invalid duration at step %u!\n", nr_steps);
> -                       step.duration.min = tmpl;
> -
> -                       if (sep && *sep == '-') {
> -                               tmpl = strtol(sep + 1, NULL, 10);
> -                               check_arg(tmpl <= 0 ||
> -                                         tmpl <= step.duration.min ||
> -                                         tmpl == LONG_MIN ||
> +                       if (field[0] == '*') {
> +                               step.unbound_duration = true;
> +                       } else {
> +                               tmpl = strtol(field, &sep, 10);
> +                               check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
>                                           tmpl == LONG_MAX,
> -                                         "Invalid duration range at step %u!\n",
> +                                         "Invalid duration at step %u!\n",
>                                           nr_steps);
> -                               step.duration.max = tmpl;
> -                       } else {
> -                               step.duration.max = step.duration.min;
> +                               step.duration.min = tmpl;
> +
> +                               if (sep && *sep == '-') {
> +                                       tmpl = strtol(sep + 1, NULL, 10);
> +                                       check_arg(tmpl <= 0 ||
> +                                               tmpl <= step.duration.min ||
> +                                               tmpl == LONG_MIN ||
> +                                               tmpl == LONG_MAX,
> +                                               "Invalid duration range at step %u!\n",
> +                                               nr_steps);
> +                                       step.duration.max = tmpl;
> +                               } else {
> +                                       step.duration.max = step.duration.min;
> +                               }
>                         }
>  
>                         valid++;
> @@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags)
>         unsigned int i;
>         uint32_t *ptr;
>  
> -       if (!arb_period)
> +       if (w->unbound_duration || !arb_period)
>                 return;
>  
>         gem_set_domain(fd, w->bb_handle,
> @@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         const uint32_t bbe = 0xa << 23;
>         unsigned long mmap_start, mmap_len;
>         unsigned long batch_start = w->bb_sz;
> +       unsigned int r = 0;
>         uint32_t *ptr, *cs;
>  
>         igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
> @@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         if (flags & RT)
>                 batch_start -= 12 * sizeof(uint32_t);
>  
> +       if (w->unbound_duration)
> +               batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
> +
>         mmap_start = rounddown(batch_start, PAGE_SIZE);
>         mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
>  
> @@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
>         cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
>  
> +       if (w->unbound_duration) {
> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
> +               batch_start += 4 * sizeof(uint32_t);
> +
> +               *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
> +               w->recursive_bb_start = cs;
> +               *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> +               *cs++ = 0;
> +               *cs++ = 0;

Hmm. Have we previously checked for gen >= 8?

So preemption check interval is given by batch_start - mmap_start.
Which is limited to a max of 64 bytes. That might be a bit excessive on
the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad
more forgiving i.e. it has more bw and doesn't starve the cpu as much.

> +       }
> +
>         if (flags & SEQNO) {
> -               w->reloc[0].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         }
>  
>         if (flags & RT) {
> -               w->reloc[1].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>                 w->rt0_value = cs;
>                 *cs++ = 0;
>  
> -               w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
> @@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>                 *cs++ = 0;
>                 *cs++ = 0;
>  
> -               w->reloc[3].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
>                 }
>         }
>  
> -       w->bb_sz = get_bb_sz(w->duration.max);
> -       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
> +       if (w->unbound_duration)
> +               /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
> +               w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
> +                          (1 + 3) * sizeof(uint32_t);
> +       else
> +               w->bb_sz = get_bb_sz(w->duration.max);
> +       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
>         init_bb(w, flags);
>         terminate_bb(w, flags);
>  
> -       if (flags & SEQNO) {
> +       if ((flags & SEQNO) || w->unbound_duration) {
>                 w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
> +               if (flags & SEQNO)
> +                       w->obj[j].relocation_count++;
>                 if (flags & RT)
> -                       w->obj[j].relocation_count = 4;
> -               else
> -                       w->obj[j].relocation_count = 1;
> +                       w->obj[j].relocation_count += 3;
> +               if (w->unbound_duration)
> +                       w->obj[j].relocation_count++;

Huh, I expected to see w->obj[j].relocation_count = r;
Already out of scope?

>                 for (i = 0; i < w->obj[j].relocation_count; i++)
>                         w->reloc[i].target_handle = 1;
> +               if (w->unbound_duration)
> +                       w->reloc[0].target_handle = j;

Ok, recursive BB_START.
>         }
>  
>         w->eb.buffers_ptr = to_user_pointer(w->obj);
> @@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
>         }
>  }
>  
> +static void
> +update_bb_start(struct w_step *w)
> +{
> +       if (!w->unbound_duration)
> +               return;
> +
> +       gem_set_domain(fd, w->bb_handle,
> +                      I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);

Hmm. A scary sync point. Do you just want to be sure you have flushed
the previous user?

> +       *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
> +}
> +
>  static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
>  {
>         if (target < 0)
> @@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
>         if (flags & RT)
>                 update_bb_rt(w, engine, seqno);
>  
> +       update_bb_start(w);
> +
>         w->eb.batch_start_offset =
> +               w->unbound_duration ?
> +               0 :
>                 ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
> -                       2 * sizeof(uint32_t));
> +                     2 * sizeof(uint32_t));
>  
>         for (i = 0; i < w->fence_deps.nr; i++) {
>                 int tgt = w->idx + w->fence_deps.list[i];
> @@ -2313,6 +2365,17 @@ static void *run_workload(void *data)
>                                                                     w->priority;
>                                 }
>                                 continue;
> +                       } else if (w->type == TERMINATE) {
> +                               unsigned int t_idx = i + w->target;
> +
> +                               igt_assert(t_idx >= 0 && t_idx < i);
> +                               igt_assert(wrk->steps[t_idx].type == BATCH);
> +                               igt_assert(wrk->steps[t_idx].unbound_duration);
> +
> +                               *wrk->steps[t_idx].recursive_bb_start =
> +                                       MI_BATCH_BUFFER_END;
> +                               __sync_synchronize();
> +                               continue;
>                         } else if (w->type == PREEMPTION ||
>                                    w->type == ENGINE_MAP ||
>                                    w->type == LOAD_BALANCE ||
> diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
> index 6aec718bc812..c94d01018419 100644
> --- a/benchmarks/wsim/README
> +++ b/benchmarks/wsim/README
> @@ -2,11 +2,11 @@ Workload descriptor format
>  ==========================
>  
>  ctx.engine.duration_us.dependency.wait,...
> -<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
> +<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
>  B.<uint>
>  M.<uint>.<str>[|<str>]...
>  P|X.<uint>.<int>
> -d|p|s|t|q|a.<int>,...
> +d|p|s|t|q|a|T.<int>,...
>  b.<uint>.<uint>.<str>
>  f
>  
> @@ -30,6 +30,7 @@ Additional workload steps are also supported:
>   'b' - Set up engine bonds.
>   'M' - Set up engine map.
>   'P' - Context priority.
> + 'T' - Terminate an infinite batch.
>   'X' - Context preemption control.
>  
>  Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
> @@ -77,6 +78,10 @@ Example:
>  
>  I this case the last step has a data dependency on both first and second steps.
>  
> +Batch durations can also be specified as infinite by using the '*' in the
> +duration field. Such batches must be ended by the terminate command ('T')
> +otherwise they will cause a GPU hang to be reported.
> +
>  Sync (fd) fences
>  ----------------
>  
> diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
> index cfbfcd39be7d..ea89da3add48 100644
> --- a/benchmarks/wsim/frame-split-60fps.wsim
> +++ b/benchmarks/wsim/frame-split-60fps.wsim
> @@ -6,10 +6,12 @@ M.2.VCS2
>  B.2
>  b.2.1.VCS1
>  f
> -1.DEFAULT.4000-6000.f-1.0
> +1.DEFAULT.*.f-1.0
>  2.DEFAULT.4000-6000.s-1.0
>  a.-3
> -3.RCS.2000-4000.-3/-2.0
> +s.-2
> +T.-4
> +3.RCS.2000-4000.-5/-4.0
>  3.VECS.2000.-1.0
>  4.BCS.1000.-1.0
>  s.-2

Usecase looks reasonable.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Tvrtko Ursulin May 13, 2019, 1:59 p.m. UTC | #2
On 10/05/2019 14:48, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-08 13:10:54)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> For simulating frame split workloads it is useful to express a batch which
>> ends at the same time as the parallel submission on the respective bonded
>> engine. For this we add support for infinite batch durations and the batch
>> terminate command ('T'). Syntax looks like this:
>>
>>    1.RCS.*.0.0
>>    T.-1
>>
>> First step starts an infinite batch, and second command terminates the
>> infinite batch with the usual relative workload step addressing.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   benchmarks/gem_wsim.c                  | 119 +++++++++++++++++++------
>>   benchmarks/wsim/README                 |   9 +-
>>   benchmarks/wsim/frame-split-60fps.wsim |   6 +-
>>   3 files changed, 102 insertions(+), 32 deletions(-)
>>
>> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
>> index cc6f4a742c12..97821b723b02 100644
>> --- a/benchmarks/gem_wsim.c
>> +++ b/benchmarks/gem_wsim.c
>> @@ -86,6 +86,7 @@ enum w_type
>>          ENGINE_MAP,
>>          LOAD_BALANCE,
>>          BOND,
>> +       TERMINATE,
>>   };
>>   
>>   struct deps
>> @@ -113,6 +114,7 @@ struct w_step
>>          unsigned int context;
>>          unsigned int engine;
>>          struct duration duration;
>> +       bool unbound_duration;
>>          struct deps data_deps;
>>          struct deps fence_deps;
>>          int emit_fence;
>> @@ -143,7 +145,7 @@ struct w_step
>>   
>>          struct drm_i915_gem_execbuffer2 eb;
>>          struct drm_i915_gem_exec_object2 *obj;
>> -       struct drm_i915_gem_relocation_entry reloc[4];
>> +       struct drm_i915_gem_relocation_entry reloc[5];
>>          unsigned long bb_sz;
>>          uint32_t bb_handle;
>>          uint32_t *seqno_value;
>> @@ -153,6 +155,7 @@ struct w_step
>>          uint32_t *rt1_address;
>>          uint32_t *latch_value;
>>          uint32_t *latch_address;
>> +       uint32_t *recursive_bb_start;
>>   };
>>   
>>   DECLARE_EWMA(uint64_t, rt, 4, 2)
>> @@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>>   
>>                                  step.type = ENGINE_MAP;
>>                                  goto add_step;
>> +                       } else if (!strcmp(field, "T")) {
>> +                               int_field(TERMINATE, target,
>> +                                         tmp >= 0 || ((int)nr_steps + tmp) < 0,
>> +                                         "Invalid terminate target at step %u!\n");
>>                          } else if (!strcmp(field, "X")) {
>>                                  unsigned int nr = 0;
>>                                  while ((field = strtok_r(fstart, ".", &fctx))) {
>> @@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>>   
>>                          fstart = NULL;
>>   
>> -                       tmpl = strtol(field, &sep, 10);
>> -                       check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
>> -                                 tmpl == LONG_MAX,
>> -                                 "Invalid duration at step %u!\n", nr_steps);
>> -                       step.duration.min = tmpl;
>> -
>> -                       if (sep && *sep == '-') {
>> -                               tmpl = strtol(sep + 1, NULL, 10);
>> -                               check_arg(tmpl <= 0 ||
>> -                                         tmpl <= step.duration.min ||
>> -                                         tmpl == LONG_MIN ||
>> +                       if (field[0] == '*') {
>> +                               step.unbound_duration = true;
>> +                       } else {
>> +                               tmpl = strtol(field, &sep, 10);
>> +                               check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
>>                                            tmpl == LONG_MAX,
>> -                                         "Invalid duration range at step %u!\n",
>> +                                         "Invalid duration at step %u!\n",
>>                                            nr_steps);
>> -                               step.duration.max = tmpl;
>> -                       } else {
>> -                               step.duration.max = step.duration.min;
>> +                               step.duration.min = tmpl;
>> +
>> +                               if (sep && *sep == '-') {
>> +                                       tmpl = strtol(sep + 1, NULL, 10);
>> +                                       check_arg(tmpl <= 0 ||
>> +                                               tmpl <= step.duration.min ||
>> +                                               tmpl == LONG_MIN ||
>> +                                               tmpl == LONG_MAX,
>> +                                               "Invalid duration range at step %u!\n",
>> +                                               nr_steps);
>> +                                       step.duration.max = tmpl;
>> +                               } else {
>> +                                       step.duration.max = step.duration.min;
>> +                               }
>>                          }
>>   
>>                          valid++;
>> @@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags)
>>          unsigned int i;
>>          uint32_t *ptr;
>>   
>> -       if (!arb_period)
>> +       if (w->unbound_duration || !arb_period)
>>                  return;
>>   
>>          gem_set_domain(fd, w->bb_handle,
>> @@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>          const uint32_t bbe = 0xa << 23;
>>          unsigned long mmap_start, mmap_len;
>>          unsigned long batch_start = w->bb_sz;
>> +       unsigned int r = 0;
>>          uint32_t *ptr, *cs;
>>   
>>          igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
>> @@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>          if (flags & RT)
>>                  batch_start -= 12 * sizeof(uint32_t);
>>   
>> +       if (w->unbound_duration)
>> +               batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
>> +
>>          mmap_start = rounddown(batch_start, PAGE_SIZE);
>>          mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
>>   
>> @@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>          ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
>>          cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
>>   
>> +       if (w->unbound_duration) {
>> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
>> +               batch_start += 4 * sizeof(uint32_t);
>> +
>> +               *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
>> +               w->recursive_bb_start = cs;
>> +               *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
>> +               *cs++ = 0;
>> +               *cs++ = 0;
> 
> Hmm. Have we previously checked for gen >= 8?

No, will add.

> So preemption check interval is given by batch_start - mmap_start.
> Which is limited to a max of 64 bytes. That might be a bit excessive on
> the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad
> more forgiving i.e. it has more bw and doesn't starve the cpu as much.

Nope, mmap_start is not controlling the batch buffer at all. It is just 
to find the calculated batch_start given that mmap() was given a 
round-down PAGE_ALIGN start address. Actual preemption check interval is 
one MI_NOOP. /o\ How much would you recommend to be safe?

>> +       }
>> +
>>          if (flags & SEQNO) {
>> -               w->reloc[0].offset = batch_start + sizeof(uint32_t);
>> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>>                  batch_start += 4 * sizeof(uint32_t);
>>   
>>                  *cs++ = MI_STORE_DWORD_IMM;
>> @@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>          }
>>   
>>          if (flags & RT) {
>> -               w->reloc[1].offset = batch_start + sizeof(uint32_t);
>> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>>                  batch_start += 4 * sizeof(uint32_t);
>>   
>>                  *cs++ = MI_STORE_DWORD_IMM;
>> @@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>                  w->rt0_value = cs;
>>                  *cs++ = 0;
>>   
>> -               w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
>> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
>>                  batch_start += 4 * sizeof(uint32_t);
>>   
>>                  *cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
>> @@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>>                  *cs++ = 0;
>>                  *cs++ = 0;
>>   
>> -               w->reloc[3].offset = batch_start + sizeof(uint32_t);
>> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>>                  batch_start += 4 * sizeof(uint32_t);
>>   
>>                  *cs++ = MI_STORE_DWORD_IMM;
>> @@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
>>                  }
>>          }
>>   
>> -       w->bb_sz = get_bb_sz(w->duration.max);
>> -       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
>> +       if (w->unbound_duration)
>> +               /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
>> +               w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
>> +                          (1 + 3) * sizeof(uint32_t);
>> +       else
>> +               w->bb_sz = get_bb_sz(w->duration.max);
>> +       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
>>          init_bb(w, flags);
>>          terminate_bb(w, flags);
>>   
>> -       if (flags & SEQNO) {
>> +       if ((flags & SEQNO) || w->unbound_duration) {
>>                  w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
>> +               if (flags & SEQNO)
>> +                       w->obj[j].relocation_count++;
>>                  if (flags & RT)
>> -                       w->obj[j].relocation_count = 4;
>> -               else
>> -                       w->obj[j].relocation_count = 1;
>> +                       w->obj[j].relocation_count += 3;
>> +               if (w->unbound_duration)
>> +                       w->obj[j].relocation_count++;
> 
> Huh, I expected to see w->obj[j].relocation_count = r;
> Already out of scope?

In a helper yes. Under danger that I got confused about what's what, I 
think I could make the helper return the count.

> 
>>                  for (i = 0; i < w->obj[j].relocation_count; i++)
>>                          w->reloc[i].target_handle = 1;
>> +               if (w->unbound_duration)
>> +                       w->reloc[0].target_handle = j;
> 
> Ok, recursive BB_START.
>>          }
>>   
>>          w->eb.buffers_ptr = to_user_pointer(w->obj);
>> @@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
>>          }
>>   }
>>   
>> +static void
>> +update_bb_start(struct w_step *w)
>> +{
>> +       if (!w->unbound_duration)
>> +               return;
>> +
>> +       gem_set_domain(fd, w->bb_handle,
>> +                      I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
> 
> Hmm. A scary sync point. Do you just want to be sure you have flushed
> the previous user?

Yes. By definition one infinite batch runs max once per frame. If it has 
been terminated the code needs to re-instate the BB_START, so I thought 
I need to be sure it is not executing before I do that. I guess if 
someone forgot to terminate it this would hang on second loop. But I 
think that's better than just carrying on with a potentially no-op 
instead of infinite batch.

> 
>> +       *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
>> +}
>> +
>>   static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
>>   {
>>          if (target < 0)
>> @@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
>>          if (flags & RT)
>>                  update_bb_rt(w, engine, seqno);
>>   
>> +       update_bb_start(w);
>> +
>>          w->eb.batch_start_offset =
>> +               w->unbound_duration ?
>> +               0 :
>>                  ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
>> -                       2 * sizeof(uint32_t));
>> +                     2 * sizeof(uint32_t));
>>   
>>          for (i = 0; i < w->fence_deps.nr; i++) {
>>                  int tgt = w->idx + w->fence_deps.list[i];
>> @@ -2313,6 +2365,17 @@ static void *run_workload(void *data)
>>                                                                      w->priority;
>>                                  }
>>                                  continue;
>> +                       } else if (w->type == TERMINATE) {
>> +                               unsigned int t_idx = i + w->target;
>> +
>> +                               igt_assert(t_idx >= 0 && t_idx < i);
>> +                               igt_assert(wrk->steps[t_idx].type == BATCH);
>> +                               igt_assert(wrk->steps[t_idx].unbound_duration);
>> +
>> +                               *wrk->steps[t_idx].recursive_bb_start =
>> +                                       MI_BATCH_BUFFER_END;
>> +                               __sync_synchronize();
>> +                               continue;
>>                          } else if (w->type == PREEMPTION ||
>>                                     w->type == ENGINE_MAP ||
>>                                     w->type == LOAD_BALANCE ||
>> diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
>> index 6aec718bc812..c94d01018419 100644
>> --- a/benchmarks/wsim/README
>> +++ b/benchmarks/wsim/README
>> @@ -2,11 +2,11 @@ Workload descriptor format
>>   ==========================
>>   
>>   ctx.engine.duration_us.dependency.wait,...
>> -<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
>> +<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
>>   B.<uint>
>>   M.<uint>.<str>[|<str>]...
>>   P|X.<uint>.<int>
>> -d|p|s|t|q|a.<int>,...
>> +d|p|s|t|q|a|T.<int>,...
>>   b.<uint>.<uint>.<str>
>>   f
>>   
>> @@ -30,6 +30,7 @@ Additional workload steps are also supported:
>>    'b' - Set up engine bonds.
>>    'M' - Set up engine map.
>>    'P' - Context priority.
>> + 'T' - Terminate an infinite batch.
>>    'X' - Context preemption control.
>>   
>>   Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
>> @@ -77,6 +78,10 @@ Example:
>>   
>>   I this case the last step has a data dependency on both first and second steps.
>>   
>> +Batch durations can also be specified as infinite by using the '*' in the
>> +duration field. Such batches must be ended by the terminate command ('T')
>> +otherwise they will cause a GPU hang to be reported.
>> +
>>   Sync (fd) fences
>>   ----------------
>>   
>> diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
>> index cfbfcd39be7d..ea89da3add48 100644
>> --- a/benchmarks/wsim/frame-split-60fps.wsim
>> +++ b/benchmarks/wsim/frame-split-60fps.wsim
>> @@ -6,10 +6,12 @@ M.2.VCS2
>>   B.2
>>   b.2.1.VCS1
>>   f
>> -1.DEFAULT.4000-6000.f-1.0
>> +1.DEFAULT.*.f-1.0
>>   2.DEFAULT.4000-6000.s-1.0
>>   a.-3
>> -3.RCS.2000-4000.-3/-2.0
>> +s.-2
>> +T.-4
>> +3.RCS.2000-4000.-5/-4.0
>>   3.VECS.2000.-1.0
>>   4.BCS.1000.-1.0
>>   s.-2
> 
> Usecase looks reasonable.
> 
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Thanks,

Tvrtko
Chris Wilson May 13, 2019, 2:11 p.m. UTC | #3
Quoting Tvrtko Ursulin (2019-05-13 14:59:01)
> 
> On 10/05/2019 14:48, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-05-08 13:10:54)
> > So preemption check interval is given by batch_start - mmap_start.
> > Which is limited to a max of 64 bytes. That might be a bit excessive on
> > the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad
> > more forgiving i.e. it has more bw and doesn't starve the cpu as much.
> 
> Nope, mmap_start is not controlling the batch buffer at all. It is just 
> to find the calculated batch_start given that mmap() was given a 
> round-down PAGE_ALIGN start address. Actual preemption check interval is 
> one MI_NOOP. /o\ How much would you recommend to be safe?

We've been using 64 bytes routinely without too much hassle, but that
can be noticeable. For the dummyload we use roughly the full page and
that seems ok, with a few microseconds of extra latency. If that's
tolerable, I would opt for trying to use a full page for the recursive
batch. Alternatively, we can use a MI_SEMA_WAIT | POLL on a user
address (just throwing it out there as an option).
-Chris
diff mbox series

Patch

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index cc6f4a742c12..97821b723b02 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -86,6 +86,7 @@  enum w_type
 	ENGINE_MAP,
 	LOAD_BALANCE,
 	BOND,
+	TERMINATE,
 };
 
 struct deps
@@ -113,6 +114,7 @@  struct w_step
 	unsigned int context;
 	unsigned int engine;
 	struct duration duration;
+	bool unbound_duration;
 	struct deps data_deps;
 	struct deps fence_deps;
 	int emit_fence;
@@ -143,7 +145,7 @@  struct w_step
 
 	struct drm_i915_gem_execbuffer2 eb;
 	struct drm_i915_gem_exec_object2 *obj;
-	struct drm_i915_gem_relocation_entry reloc[4];
+	struct drm_i915_gem_relocation_entry reloc[5];
 	unsigned long bb_sz;
 	uint32_t bb_handle;
 	uint32_t *seqno_value;
@@ -153,6 +155,7 @@  struct w_step
 	uint32_t *rt1_address;
 	uint32_t *latch_value;
 	uint32_t *latch_address;
+	uint32_t *recursive_bb_start;
 };
 
 DECLARE_EWMA(uint64_t, rt, 4, 2)
@@ -491,6 +494,10 @@  parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
 
 				step.type = ENGINE_MAP;
 				goto add_step;
+			} else if (!strcmp(field, "T")) {
+				int_field(TERMINATE, target,
+					  tmp >= 0 || ((int)nr_steps + tmp) < 0,
+					  "Invalid terminate target at step %u!\n");
 			} else if (!strcmp(field, "X")) {
 				unsigned int nr = 0;
 				while ((field = strtok_r(fstart, ".", &fctx))) {
@@ -605,23 +612,28 @@  parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
 
 			fstart = NULL;
 
-			tmpl = strtol(field, &sep, 10);
-			check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
-				  tmpl == LONG_MAX,
-				  "Invalid duration at step %u!\n", nr_steps);
-			step.duration.min = tmpl;
-
-			if (sep && *sep == '-') {
-				tmpl = strtol(sep + 1, NULL, 10);
-				check_arg(tmpl <= 0 ||
-					  tmpl <= step.duration.min ||
-					  tmpl == LONG_MIN ||
+			if (field[0] == '*') {
+				step.unbound_duration = true;
+			} else {
+				tmpl = strtol(field, &sep, 10);
+				check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
 					  tmpl == LONG_MAX,
-					  "Invalid duration range at step %u!\n",
+					  "Invalid duration at step %u!\n",
 					  nr_steps);
-				step.duration.max = tmpl;
-			} else {
-				step.duration.max = step.duration.min;
+				step.duration.min = tmpl;
+
+				if (sep && *sep == '-') {
+					tmpl = strtol(sep + 1, NULL, 10);
+					check_arg(tmpl <= 0 ||
+						tmpl <= step.duration.min ||
+						tmpl == LONG_MIN ||
+						tmpl == LONG_MAX,
+						"Invalid duration range at step %u!\n",
+						nr_steps);
+					step.duration.max = tmpl;
+				} else {
+					step.duration.max = step.duration.min;
+				}
 			}
 
 			valid++;
@@ -781,7 +793,7 @@  init_bb(struct w_step *w, unsigned int flags)
 	unsigned int i;
 	uint32_t *ptr;
 
-	if (!arb_period)
+	if (w->unbound_duration || !arb_period)
 		return;
 
 	gem_set_domain(fd, w->bb_handle,
@@ -801,6 +813,7 @@  terminate_bb(struct w_step *w, unsigned int flags)
 	const uint32_t bbe = 0xa << 23;
 	unsigned long mmap_start, mmap_len;
 	unsigned long batch_start = w->bb_sz;
+	unsigned int r = 0;
 	uint32_t *ptr, *cs;
 
 	igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
@@ -811,6 +824,9 @@  terminate_bb(struct w_step *w, unsigned int flags)
 	if (flags & RT)
 		batch_start -= 12 * sizeof(uint32_t);
 
+	if (w->unbound_duration)
+		batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
+
 	mmap_start = rounddown(batch_start, PAGE_SIZE);
 	mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
 
@@ -820,8 +836,19 @@  terminate_bb(struct w_step *w, unsigned int flags)
 	ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
 	cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
 
+	if (w->unbound_duration) {
+		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
+		batch_start += 4 * sizeof(uint32_t);
+
+		*cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
+		w->recursive_bb_start = cs;
+		*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
+		*cs++ = 0;
+		*cs++ = 0;
+	}
+
 	if (flags & SEQNO) {
-		w->reloc[0].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -833,7 +860,7 @@  terminate_bb(struct w_step *w, unsigned int flags)
 	}
 
 	if (flags & RT) {
-		w->reloc[1].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -843,7 +870,7 @@  terminate_bb(struct w_step *w, unsigned int flags)
 		w->rt0_value = cs;
 		*cs++ = 0;
 
-		w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
@@ -852,7 +879,7 @@  terminate_bb(struct w_step *w, unsigned int flags)
 		*cs++ = 0;
 		*cs++ = 0;
 
-		w->reloc[3].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -984,19 +1011,28 @@  alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
 		}
 	}
 
-	w->bb_sz = get_bb_sz(w->duration.max);
-	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
+	if (w->unbound_duration)
+		/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
+		w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
+			   (1 + 3) * sizeof(uint32_t);
+	else
+		w->bb_sz = get_bb_sz(w->duration.max);
+	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
 	init_bb(w, flags);
 	terminate_bb(w, flags);
 
-	if (flags & SEQNO) {
+	if ((flags & SEQNO) || w->unbound_duration) {
 		w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
+		if (flags & SEQNO)
+			w->obj[j].relocation_count++;
 		if (flags & RT)
-			w->obj[j].relocation_count = 4;
-		else
-			w->obj[j].relocation_count = 1;
+			w->obj[j].relocation_count += 3;
+		if (w->unbound_duration)
+			w->obj[j].relocation_count++;
 		for (i = 0; i < w->obj[j].relocation_count; i++)
 			w->reloc[i].target_handle = 1;
+		if (w->unbound_duration)
+			w->reloc[0].target_handle = j;
 	}
 
 	w->eb.buffers_ptr = to_user_pointer(w->obj);
@@ -2036,6 +2072,18 @@  update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
 	}
 }
 
+static void
+update_bb_start(struct w_step *w)
+{
+	if (!w->unbound_duration)
+		return;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+
+	*w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
+}
+
 static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
 {
 	if (target < 0)
@@ -2171,9 +2219,13 @@  do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
 	if (flags & RT)
 		update_bb_rt(w, engine, seqno);
 
+	update_bb_start(w);
+
 	w->eb.batch_start_offset =
+		w->unbound_duration ?
+		0 :
 		ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
-			2 * sizeof(uint32_t));
+		      2 * sizeof(uint32_t));
 
 	for (i = 0; i < w->fence_deps.nr; i++) {
 		int tgt = w->idx + w->fence_deps.list[i];
@@ -2313,6 +2365,17 @@  static void *run_workload(void *data)
 								    w->priority;
 				}
 				continue;
+			} else if (w->type == TERMINATE) {
+				unsigned int t_idx = i + w->target;
+
+				igt_assert(t_idx >= 0 && t_idx < i);
+				igt_assert(wrk->steps[t_idx].type == BATCH);
+				igt_assert(wrk->steps[t_idx].unbound_duration);
+
+				*wrk->steps[t_idx].recursive_bb_start =
+					MI_BATCH_BUFFER_END;
+				__sync_synchronize();
+				continue;
 			} else if (w->type == PREEMPTION ||
 				   w->type == ENGINE_MAP ||
 				   w->type == LOAD_BALANCE ||
diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
index 6aec718bc812..c94d01018419 100644
--- a/benchmarks/wsim/README
+++ b/benchmarks/wsim/README
@@ -2,11 +2,11 @@  Workload descriptor format
 ==========================
 
 ctx.engine.duration_us.dependency.wait,...
-<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
+<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
 B.<uint>
 M.<uint>.<str>[|<str>]...
 P|X.<uint>.<int>
-d|p|s|t|q|a.<int>,...
+d|p|s|t|q|a|T.<int>,...
 b.<uint>.<uint>.<str>
 f
 
@@ -30,6 +30,7 @@  Additional workload steps are also supported:
  'b' - Set up engine bonds.
  'M' - Set up engine map.
  'P' - Context priority.
+ 'T' - Terminate an infinite batch.
  'X' - Context preemption control.
 
 Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
@@ -77,6 +78,10 @@  Example:
 
 I this case the last step has a data dependency on both first and second steps.
 
+Batch durations can also be specified as infinite by using the '*' in the
+duration field. Such batches must be ended by the terminate command ('T')
+otherwise they will cause a GPU hang to be reported.
+
 Sync (fd) fences
 ----------------
 
diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
index cfbfcd39be7d..ea89da3add48 100644
--- a/benchmarks/wsim/frame-split-60fps.wsim
+++ b/benchmarks/wsim/frame-split-60fps.wsim
@@ -6,10 +6,12 @@  M.2.VCS2
 B.2
 b.2.1.VCS1
 f
-1.DEFAULT.4000-6000.f-1.0
+1.DEFAULT.*.f-1.0
 2.DEFAULT.4000-6000.s-1.0
 a.-3
-3.RCS.2000-4000.-3/-2.0
+s.-2
+T.-4
+3.RCS.2000-4000.-5/-4.0
 3.VECS.2000.-1.0
 4.BCS.1000.-1.0
 s.-2