Message ID | 20180511071141.24581-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 11/05/2018 08:11, Chris Wilson wrote: > A little tool I've been meaning to write for a while... Convert the > .wsim into their dag and find the longest chains and evaluate them on an > simulated machine. Very cool! But I think you need to handle the 's' command which appears in the interesting workloads. Maybe you could just fake it as a zero duration batch with a data dependency. Fence related commands would also be useful but more difficult I guess. As would throttling. Delays could maybe be faked as batches as well by adding input dependency to everything preceding it. Regards, Tvrtko > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > benchmarks/Makefile.sources | 1 + > benchmarks/sim_wsim.c | 434 ++++++++++++++++++++++++++++++++++++ > 2 files changed, 435 insertions(+) > create mode 100644 benchmarks/sim_wsim.c > > diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources > index d150035aa..b80a7644c 100644 > --- a/benchmarks/Makefile.sources > +++ b/benchmarks/Makefile.sources > @@ -17,6 +17,7 @@ benchmarks_prog_list = \ > gem_wsim \ > kms_vblank \ > prime_lookup \ > + sim_wsim \ > vgem_mmap \ > $(NULL) > > diff --git a/benchmarks/sim_wsim.c b/benchmarks/sim_wsim.c > new file mode 100644 > index 000000000..cb8e7adc4 > --- /dev/null > +++ b/benchmarks/sim_wsim.c > @@ -0,0 +1,434 @@ > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > + > +#include "igt_aux.h" > + > +#if 0 > +#define DBG(...) printf(__VA_ARGS__) > +#else > +#define DBG(...) do { } while(0) > +#endif > + > +struct dependency { > + struct task *signal; > + struct igt_list link; > +}; > + > +struct task { > + struct igt_list link; > + struct igt_list sched; > + struct igt_list signals; > + int step; > + int ctx; > + int class; > + int instance; > + long min, max; > + long duration; > + long deadline; > + bool visited; > +}; > + > +struct work { > + struct task **tasks; > + unsigned int count, size; > + > + struct igt_list ends; > +}; > + > +static void add_dependency(struct task *task, struct task *signal) > +{ > + struct dependency *dep; > + > + DBG("%s %d (context %d, engine %d) -> %d (context %d, engine %d)\n", > + __func__, > + signal->step, signal->ctx, signal->class, > + task->step, task->ctx, task->class); > + > + dep = malloc(sizeof(*dep)); > + dep->signal = signal; > + igt_list_add(&dep->link, &task->signals); > + igt_list_del(&signal->link); > + igt_list_init(&signal->link); > +} > + > +enum class { > + RCS, > + BCS, > + VCS, > + VECS, > +}; > + > +static void add_task(struct work *work, char *line) > +{ > + static const char *engines[] = { > + [RCS] = "rcs", > + [BCS] = "bcs", > + [VCS] = "vcs", > + [VECS] = "vecs", > + }; > + struct task *task; > + char *token; > + int i; > + > + task = malloc(sizeof(*task)); > + > + igt_list_init(&task->signals); > + igt_list_init(&task->sched); > + task->step = work->count; > + task->visited = false; > + > + /* > + * 1.RCS.2800-3100.-1.0 > + * - context > + * - engine > + * - delay > + * - dependencies > + * - sync > + */ > + DBG("line='%s'\n", line); > + > + /* context */ > + token = strtok(line, "."); > + DBG("context='%s'\n", token); > + task->ctx = atoi(token); > + if (!task->ctx) > + return; > + > + /* engine */ > + token = strtok(NULL, "."); > + DBG("engine='%s'\n", token); > + task->class = -1; > + for (i = 0; i < sizeof(engines)/sizeof(*engines); i++) { > + const int len = strlen(engines[i]); > + if (!strncasecmp(token, engines[i], len)) { > + task->class = i; > + if (token[len]) > + task->instance = atoi(token + len); > + else > + task->instance = -1; > + break; > + } > + } > + > + /* delay */ > + token = strtok(NULL, "."); > + DBG("delay='%s'\n", token); > + task->min = strtol(token, &token, 0); > + if (*token) > + task->max = strtol(token + 1, NULL, 0); > + else > + task->max = task->min; > + task->duration = (task->min + task->max) / 2; > + DBG("min=%lu, max=%lu; duration=%lu\n", task->min, task->max, task->duration); > + > + /* dependencies */ > + token = strtok(NULL, "."); > + DBG("deps='%s'\n", token); > + while ((i = strtol(token, &token, 0))) { > + add_dependency(task, work->tasks[work->count + i]); > + if (*token) > + token++; > + } > + > + /* add a dependency for the context+engine timeline */ > + for (i = work->count; --i >= 0; ) { > + if (work->tasks[i]->ctx == task->ctx && > + work->tasks[i]->class == task->class) { > + add_dependency(task, work->tasks[i]); > + break; > + } > + } > + > + igt_list_add(&task->link, &work->ends); > + work->tasks[work->count++] = task; > +} > + > +static struct work *parse_work(FILE *file) > +{ > + struct work *work; > + char *line = NULL; > + size_t len = 0; > + > + work = malloc(sizeof(*work)); > + igt_list_init(&work->ends); > + > + work->size = 64; > + work->count = 0; > + work->tasks = malloc(sizeof(*work->tasks) * work->size); > + > + while (getline(&line, &len, file) != -1) { > + if (work->count == work->size) { > + work->tasks = realloc(work->tasks, > + sizeof(*work->tasks) * work->size); > + work->size *= 2; > + } > + add_task(work, line); > + } > + > + free(line); > + > + DBG("%d tasks\n", work->count); > + return work; > +} > + > +static unsigned long sum_durations(struct task *task) > +{ > + unsigned long max_duration = 0; > + struct task *signal = NULL; > + struct dependency *dep; > + > + igt_list_for_each(dep, &task->signals, link) { > + if (dep->signal->duration > max_duration) { > + signal = dep->signal; > + max_duration = signal->duration; > + } > + } > + > + return task->duration + (signal ? sum_durations(signal) : 0); > +} > + > +static void ideal_depth(struct work *work) > +{ > + unsigned long total_duration; > + unsigned long max_duration; > + struct task *task; > + int i; > + > + /* > + * The ideal depth is the longest chain of dependencies as the > + * dependency chain requires sequential task execution. Each > + * chain is assumed to be run in parallel on an infinite set of > + * engines, so the ratelimiting step is its longest path. > + */ > + max_duration = 0; > + igt_list_for_each(task, &work->ends, link) { > + unsigned long duration = sum_durations(task); > + if (duration > max_duration) > + max_duration = duration; > + } > + > + total_duration = 0; > + for (i = 0; i < work->count; i++) > + total_duration += work->tasks[i]->duration; > + > + printf("Single client\n"); > + printf(" total duration %luus; %.2f wps\n", total_duration, 1e6/total_duration); > + printf(" ideal duration %luus; %.2f wps\n", max_duration, 1e6/max_duration); > +} > + > +struct sim_class { > + int ninstance; > + unsigned int instances[4]; > +}; > + > +static void single_client(struct work *work) > +{ > + struct sim_class sim[] = { > + [RCS] = { 1 }, > + [BCS] = { 1 }, > + [VCS] = { 2 }, > + [VECS] = { 1 }, > + }, *class; > + IGT_LIST(sched); > + struct task *task; > + unsigned long max; > + int i, j; > + > + for (i = 0; i < work->count; i++) > + igt_list_init(&work->tasks[i]->sched); > + > + igt_list_for_each(task, &work->ends, link) { > + igt_list_add_tail(&task->sched, &sched); > + } > + > + igt_list_for_each(task, &sched, sched) { > + struct dependency *dep; > + > + igt_list_for_each(dep, &task->signals, link) > + igt_list_move_tail(&dep->signal->sched, &sched); > + } > + > + igt_list_for_each_reverse(task, &sched, sched) { > + struct dependency *dep; > + int instance; > + > + class = &sim[task->class]; > + max = class->instances[0]; > + > + instance = task->instance; > + if (instance < 0) { > + instance = 0; > + for (i = 1; i < class->ninstance; i++) { > + if (class->instances[i] < max) { > + max = class->instances[i]; > + instance = i; > + } > + } > + } > + > + /* > + * Greedy (first available), not true optimal scheduling. > + * > + * For optimal, we do have to compute the global optimal > + * ordering by checking every permutation... > + */ > + igt_list_for_each(dep, &task->signals, link) { > + if (dep->signal->deadline > max) > + max = dep->signal->deadline; > + } > + > + DBG("task %d: engine %d, instance %d; finish %lu\n", > + task->step, task->class, instance, max); > + > + task->deadline = max + task->duration; > + class->instances[instance] = task->deadline; > + } > + > + max = 0; > + for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) { > + class = &sim[i]; > + for (j = 0; j < class->ninstance; j++) { > + if (class->instances[j] > max) > + max = class->instances[j]; > + } > + } > + > + printf("Simulated client:\n"); > + printf(" duration %luus; %.2f wps\n", max, 1e6/max); > +} > + > +static void packed_clients(struct work *work) > +{ > + struct sim_class sim[] = { > + [RCS] = { 1 }, > + [BCS] = { 1 }, > + [VCS] = { 2 }, > + [VECS] = { 1 }, > + }, *class; > + IGT_LIST(sched); > + struct task *task; > + unsigned long min; > + int i, j; > + > + for (i = 0; i < work->count; i++) > + igt_list_init(&work->tasks[i]->sched); > + > + igt_list_for_each(task, &work->ends, link) > + igt_list_add_tail(&task->sched, &sched); > + > + igt_list_for_each(task, &sched, sched) { > + struct dependency *dep; > + > + igt_list_for_each(dep, &task->signals, link) > + igt_list_move_tail(&dep->signal->sched, &sched); > + } > + > + /* > + * Compute the maximum duration required on any engine. > + * > + * With sufficient clients forcing maximum occupancy under their weight, > + * the ratelimiting step becomes a single engine and how many clients > + * it takes to fill. > + */ > + igt_list_for_each_reverse(task, &sched, sched) { > + int instance; > + > + class = &sim[task->class]; > + instance = task->instance; > + if (instance < 0) { > + instance = 0; > + min = class->instances[0]; > + for (i = 1; i < class->ninstance; i++) { > + if (class->instances[i] < min) { > + min = class->instances[i]; > + instance = i; > + } > + } > + } > + > + class->instances[instance] += task->duration; > + } > + > + min = 0; > + for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) { > + class = &sim[i]; > + for (j = 0; j < class->ninstance; j++) { > + if (class->instances[j] > min) > + min = class->instances[j]; > + } > + } > + > + printf("Packed clients:\n"); > + printf(" duration %luus; %.2f wps\n", min, 1e6/min); > +} > + > +static void graphviz(struct work *work) > +{ > +#if 0 > + int i, j; > + > + printf("digraph {\n"); > + printf(" rankdir=LR;\n"); > + printf(" splines=line;\n"); > + printf("\n"); > + > + for (i = 0; i < work->count; i++) { > + struct task *task = work->tasks[i]; > + > + if (task->visited) > + goto skip; > + > + printf(" subgraph cluster_%d {\n", task->ctx); > + printf(" label=\"Context %d\"\n", task->ctx); > + for (j = i; j < work->count; j++) { > + if (work->tasks[j]->ctx == task->ctx) { > + printf(" task_%03d;\n", j); > + work->tasks[j]->visited = true; > + } > + } > + printf(" }\n\n"); > + > +skip: > + task->visited = false; > + } > + > + for (i = 0; i < work->count; i++) { > + struct task *task = work->tasks[i]; > + struct dependency *dep; > + > + igt_list_for_each(dep, &task->signals, link) { > + printf(" task_%03d -> task_%03d;\n", > + dep->signal->step, task->step); > + } > + } > + > + printf("}\n"); > +#endif > +} > + > +int main(int argc, char **argv) > +{ > + int i; > + > + for (i = 1; i < argc; i++) { > + FILE *file = fopen(argv[i], "r"); > + struct work *work; > + > + if (!file) { > + perror(argv[i]); > + return 1; > + } > + > + work = parse_work(file); > + fclose(file); > + > + graphviz(work); > + > + ideal_depth(work); > + single_client(work); > + packed_clients(work); > + } > + > + return 0; > +} >
Quoting Tvrtko Ursulin (2018-05-11 09:31:52) > > On 11/05/2018 08:11, Chris Wilson wrote: > > A little tool I've been meaning to write for a while... Convert the > > .wsim into their dag and find the longest chains and evaluate them on an > > simulated machine. > > Very cool! > > But I think you need to handle the 's' command which appears in the > interesting workloads. Maybe you could just fake it as a zero duration > batch with a data dependency. Barriers were even more trivial than that. :) > Fence related commands would also be useful but more difficult I guess. > As would throttling. I haven't groked your fence command lines so skipped them. Fortunately, syncs appears to be the only one missed from the "interesting" set of wsim. > Delays could maybe be faked as batches as well by adding input > dependency to everything preceding it. It would definitely want to be a task with say .engine = -1. It implies a full barrier as well? Not limited to waiting for an earlier step? -Chris
On 11/05/2018 10:04, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-11 09:31:52) >> >> On 11/05/2018 08:11, Chris Wilson wrote: >>> A little tool I've been meaning to write for a while... Convert the >>> .wsim into their dag and find the longest chains and evaluate them on an >>> simulated machine. >> >> Very cool! >> >> But I think you need to handle the 's' command which appears in the >> interesting workloads. Maybe you could just fake it as a zero duration >> batch with a data dependency. > > Barriers were even more trivial than that. :) > >> Fence related commands would also be useful but more difficult I guess. >> As would throttling. > > I haven't groked your fence command lines so skipped them. Fortunately, > syncs appears to be the only one missed from the "interesting" set of > wsim. Fences are created with 'f' and then batches can reference them from the dependency list the same as with data dependencies. When created they are unsignaled and then 'a' advances the fence addresses relatively to the 'a' command. For instance 'a.-2' means "signal ready the fence created two steps above". >> Delays could maybe be faked as batches as well by adding input >> dependency to everything preceding it. > > It would definitely want to be a task with say .engine = -1. It implies > a full barrier as well? Not limited to waiting for an earlier step? There is 'd' which is a normal/dumb fixed delay, and then 'p' which is relative to start of the loop, meaning "wait how much is needed so that time from start of loop to this step is x". Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-05-11 10:21:05) > > On 11/05/2018 10:04, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2018-05-11 09:31:52) > >> > >> On 11/05/2018 08:11, Chris Wilson wrote: > >>> A little tool I've been meaning to write for a while... Convert the > >>> .wsim into their dag and find the longest chains and evaluate them on an > >>> simulated machine. > >> > >> Very cool! > >> > >> But I think you need to handle the 's' command which appears in the > >> interesting workloads. Maybe you could just fake it as a zero duration > >> batch with a data dependency. > > > > Barriers were even more trivial than that. :) > > > >> Fence related commands would also be useful but more difficult I guess. > >> As would throttling. > > > > I haven't groked your fence command lines so skipped them. Fortunately, > > syncs appears to be the only one missed from the "interesting" set of > > wsim. > > Fences are created with 'f' and then batches can reference them from the > dependency list the same as with data dependencies. When created they > are unsignaled and then 'a' advances the fence addresses relatively to > the 'a' command. For instance 'a.-2' means "signal ready the fence > created two steps above". I really wish you did named fences :) How is 'a' synchronised? If I just did the 'f' as a dependency then signaled it sometime later, it adds no delay to pipeline. media_nn_1080p_s1.wsim: f 1.VCS1.6500-8000.f-1.0 1.VCS2.6500-8000.f-2.0 a.-3 2.RCS.2000-4000.-2/-3.0 3.RCS.3000-5000.-1.0 3.RCS.23000-27000.0.0 3.VCS.16000-20000.-1.1 From the point of view of scheduling f,a become a no-op. media_nn_1080p_s2.wsim: 1.VCS.13000-17000.0.0 2.RCS.2000-4000.-1.0 3.RCS.3000-5000.-1.0 3.RCS.23000-27000.0.0 f 3.VCS1.8000-10000.-2/f-1.0 3.VCS2.8000-10000.-3/f-2.0 a.-3 s.-3 s.-3 Same here. "f" appears ungrounded. > >> Delays could maybe be faked as batches as well by adding input > >> dependency to everything preceding it. > > > > It would definitely want to be a task with say .engine = -1. It implies > > a full barrier as well? Not limited to waiting for an earlier step? > > There is 'd' which is a normal/dumb fixed delay, and then 'p' which is > relative to start of the loop, meaning "wait how much is needed so that > time from start of loop to this step is x". There aren't any '^[dp]' in the current set of .wsim, can I just pretend they don't exist for now? ;) -Chris
On 11/05/2018 10:33, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-11 10:21:05) >> >> On 11/05/2018 10:04, Chris Wilson wrote: >>> Quoting Tvrtko Ursulin (2018-05-11 09:31:52) >>>> >>>> On 11/05/2018 08:11, Chris Wilson wrote: >>>>> A little tool I've been meaning to write for a while... Convert the >>>>> .wsim into their dag and find the longest chains and evaluate them on an >>>>> simulated machine. >>>> >>>> Very cool! >>>> >>>> But I think you need to handle the 's' command which appears in the >>>> interesting workloads. Maybe you could just fake it as a zero duration >>>> batch with a data dependency. >>> >>> Barriers were even more trivial than that. :) >>> >>>> Fence related commands would also be useful but more difficult I guess. >>>> As would throttling. >>> >>> I haven't groked your fence command lines so skipped them. Fortunately, >>> syncs appears to be the only one missed from the "interesting" set of >>> wsim. >> >> Fences are created with 'f' and then batches can reference them from the >> dependency list the same as with data dependencies. When created they >> are unsignaled and then 'a' advances the fence addresses relatively to >> the 'a' command. For instance 'a.-2' means "signal ready the fence >> created two steps above". > > I really wish you did named fences :) Should be possible to add/replace but would need another data structure to look them up then. > How is 'a' synchronised? If I just did the 'f' as a dependency then > signaled it sometime later, it adds no delay to pipeline. > > media_nn_1080p_s1.wsim: > f > 1.VCS1.6500-8000.f-1.0 > 1.VCS2.6500-8000.f-2.0 > a.-3 > 2.RCS.2000-4000.-2/-3.0 > 3.RCS.3000-5000.-1.0 > 3.RCS.23000-27000.0.0 > 3.VCS.16000-20000.-1.1 > > From the point of view of scheduling f,a become a no-op. > > media_nn_1080p_s2.wsim: > 1.VCS.13000-17000.0.0 > 2.RCS.2000-4000.-1.0 > 3.RCS.3000-5000.-1.0 > 3.RCS.23000-27000.0.0 > f > 3.VCS1.8000-10000.-2/f-1.0 > 3.VCS2.8000-10000.-3/f-2.0 > a.-3 > s.-3 > s.-3 > > Same here. "f" appears ungrounded. Okay in these examples I guess it doesn't do much for a single client. In multiple client cases it can have an effect. Or if there was something in between the batches with input fence dependencies. >>>> Delays could maybe be faked as batches as well by adding input >>>> dependency to everything preceding it. >>> >>> It would definitely want to be a task with say .engine = -1. It implies >>> a full barrier as well? Not limited to waiting for an earlier step? >> >> There is 'd' which is a normal/dumb fixed delay, and then 'p' which is >> relative to start of the loop, meaning "wait how much is needed so that >> time from start of loop to this step is x". > > There aren't any '^[dp]' in the current set of .wsim, can I just > pretend they don't exist for now? ;) Probably. There is -a option to gem_wsim which media-bench uses in some modes essentially to convert any workload into realtime. For instance -a p.16667 (AFAIR) makes any workload not run faster than 60fps. But your sim_wsim I guess you don't care about that. Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-05-11 09:31:52) > > On 11/05/2018 08:11, Chris Wilson wrote: > > A little tool I've been meaning to write for a while... Convert the > > .wsim into their dag and find the longest chains and evaluate them on an > > simulated machine. > > Very cool! Would you care to ack it in its current form, knowing that we will fix it whenever we find a corner case interesting enough to study? -Chris
On 10/07/2018 14:47, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-05-11 09:31:52) >> >> On 11/05/2018 08:11, Chris Wilson wrote: >>> A little tool I've been meaning to write for a while... Convert the >>> .wsim into their dag and find the longest chains and evaluate them on an >>> simulated machine. >> >> Very cool! > > Would you care to ack it in its current form, knowing that we will fix > it whenever we find a corner case interesting enough to study? I think just a little bit more polish is needed. I've just tried it out and polish I can see is: 1. -h / --help with just a brief comment on what the tool does and usage. 2. Lead stats displayed for each cmdline argument with parsed filename. 3. Replace result section names "Single client", "Simulated clients" and item names "total/ideal/packed" with a more descriptive text which would hopefully be somewhat self-explanatory what the numbers represent. 4. Report garbage input (and unknown wsim commands) instead of reporting some numbers for instance for ./sim_wsim README :) 5. #ifdef 0 graphviz ? add --graphviz cmddline option? 6. Oh.. and meson makefiles.. :) Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-07-10 16:38:14) > > On 10/07/2018 14:47, Chris Wilson wrote: > > Quoting Tvrtko Ursulin (2018-05-11 09:31:52) > >> > >> On 11/05/2018 08:11, Chris Wilson wrote: > >>> A little tool I've been meaning to write for a while... Convert the > >>> .wsim into their dag and find the longest chains and evaluate them on an > >>> simulated machine. > >> > >> Very cool! > > > > Would you care to ack it in its current form, knowing that we will fix > > it whenever we find a corner case interesting enough to study? > > I think just a little bit more polish is needed. I've just tried it out > and polish I can see is: > > 1. -h / --help with just a brief comment on what the tool does and usage. > > 2. Lead stats displayed for each cmdline argument with parsed filename. > > 3. Replace result section names "Single client", "Simulated clients" and > item names "total/ideal/packed" with a more descriptive text which would > hopefully be somewhat self-explanatory what the numbers represent. > > 4. Report garbage input (and unknown wsim commands) instead of reporting > some numbers for instance for ./sim_wsim README :) > > 5. #ifdef 0 graphviz ? add --graphviz cmddline option? > > 6. Oh.. and meson makefiles.. :) But if it was upstream, you would just contribute those improvements yourself. So you are saying that this is just not useful? -Chris
On 10/07/2018 16:40, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-07-10 16:38:14) >> >> On 10/07/2018 14:47, Chris Wilson wrote: >>> Quoting Tvrtko Ursulin (2018-05-11 09:31:52) >>>> >>>> On 11/05/2018 08:11, Chris Wilson wrote: >>>>> A little tool I've been meaning to write for a while... Convert the >>>>> .wsim into their dag and find the longest chains and evaluate them on an >>>>> simulated machine. >>>> >>>> Very cool! >>> >>> Would you care to ack it in its current form, knowing that we will fix >>> it whenever we find a corner case interesting enough to study? >> >> I think just a little bit more polish is needed. I've just tried it out >> and polish I can see is: >> >> 1. -h / --help with just a brief comment on what the tool does and usage. >> >> 2. Lead stats displayed for each cmdline argument with parsed filename. >> >> 3. Replace result section names "Single client", "Simulated clients" and >> item names "total/ideal/packed" with a more descriptive text which would >> hopefully be somewhat self-explanatory what the numbers represent. >> >> 4. Report garbage input (and unknown wsim commands) instead of reporting >> some numbers for instance for ./sim_wsim README :) >> >> 5. #ifdef 0 graphviz ? add --graphviz cmddline option? >> >> 6. Oh.. and meson makefiles.. :) > > But if it was upstream, you would just contribute those improvements > yourself. > > So you are saying that this is just not useful? No, just that I think we have to have some minimum standard before we can make it upstream. If you want I can implement these bits and send a v3. Then it will just be a matter of finding a third person to review it since we will both be authors. :) Regards, Tvrtko
diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources index d150035aa..b80a7644c 100644 --- a/benchmarks/Makefile.sources +++ b/benchmarks/Makefile.sources @@ -17,6 +17,7 @@ benchmarks_prog_list = \ gem_wsim \ kms_vblank \ prime_lookup \ + sim_wsim \ vgem_mmap \ $(NULL) diff --git a/benchmarks/sim_wsim.c b/benchmarks/sim_wsim.c new file mode 100644 index 000000000..cb8e7adc4 --- /dev/null +++ b/benchmarks/sim_wsim.c @@ -0,0 +1,434 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "igt_aux.h" + +#if 0 +#define DBG(...) printf(__VA_ARGS__) +#else +#define DBG(...) do { } while(0) +#endif + +struct dependency { + struct task *signal; + struct igt_list link; +}; + +struct task { + struct igt_list link; + struct igt_list sched; + struct igt_list signals; + int step; + int ctx; + int class; + int instance; + long min, max; + long duration; + long deadline; + bool visited; +}; + +struct work { + struct task **tasks; + unsigned int count, size; + + struct igt_list ends; +}; + +static void add_dependency(struct task *task, struct task *signal) +{ + struct dependency *dep; + + DBG("%s %d (context %d, engine %d) -> %d (context %d, engine %d)\n", + __func__, + signal->step, signal->ctx, signal->class, + task->step, task->ctx, task->class); + + dep = malloc(sizeof(*dep)); + dep->signal = signal; + igt_list_add(&dep->link, &task->signals); + igt_list_del(&signal->link); + igt_list_init(&signal->link); +} + +enum class { + RCS, + BCS, + VCS, + VECS, +}; + +static void add_task(struct work *work, char *line) +{ + static const char *engines[] = { + [RCS] = "rcs", + [BCS] = "bcs", + [VCS] = "vcs", + [VECS] = "vecs", + }; + struct task *task; + char *token; + int i; + + task = malloc(sizeof(*task)); + + igt_list_init(&task->signals); + igt_list_init(&task->sched); + task->step = work->count; + task->visited = false; + + /* + * 1.RCS.2800-3100.-1.0 + * - context + * - engine + * - delay + * - dependencies + * - sync + */ + DBG("line='%s'\n", line); + + /* context */ + token = strtok(line, "."); + DBG("context='%s'\n", token); + task->ctx = atoi(token); + if (!task->ctx) + return; + + /* engine */ + token = strtok(NULL, "."); + DBG("engine='%s'\n", token); + task->class = -1; + for (i = 0; i < sizeof(engines)/sizeof(*engines); i++) { + const int len = strlen(engines[i]); + if (!strncasecmp(token, engines[i], len)) { + task->class = i; + if (token[len]) + task->instance = atoi(token + len); + else + task->instance = -1; + break; + } + } + + /* delay */ + token = strtok(NULL, "."); + DBG("delay='%s'\n", token); + task->min = strtol(token, &token, 0); + if (*token) + task->max = strtol(token + 1, NULL, 0); + else + task->max = task->min; + task->duration = (task->min + task->max) / 2; + DBG("min=%lu, max=%lu; duration=%lu\n", task->min, task->max, task->duration); + + /* dependencies */ + token = strtok(NULL, "."); + DBG("deps='%s'\n", token); + while ((i = strtol(token, &token, 0))) { + add_dependency(task, work->tasks[work->count + i]); + if (*token) + token++; + } + + /* add a dependency for the context+engine timeline */ + for (i = work->count; --i >= 0; ) { + if (work->tasks[i]->ctx == task->ctx && + work->tasks[i]->class == task->class) { + add_dependency(task, work->tasks[i]); + break; + } + } + + igt_list_add(&task->link, &work->ends); + work->tasks[work->count++] = task; +} + +static struct work *parse_work(FILE *file) +{ + struct work *work; + char *line = NULL; + size_t len = 0; + + work = malloc(sizeof(*work)); + igt_list_init(&work->ends); + + work->size = 64; + work->count = 0; + work->tasks = malloc(sizeof(*work->tasks) * work->size); + + while (getline(&line, &len, file) != -1) { + if (work->count == work->size) { + work->tasks = realloc(work->tasks, + sizeof(*work->tasks) * work->size); + work->size *= 2; + } + add_task(work, line); + } + + free(line); + + DBG("%d tasks\n", work->count); + return work; +} + +static unsigned long sum_durations(struct task *task) +{ + unsigned long max_duration = 0; + struct task *signal = NULL; + struct dependency *dep; + + igt_list_for_each(dep, &task->signals, link) { + if (dep->signal->duration > max_duration) { + signal = dep->signal; + max_duration = signal->duration; + } + } + + return task->duration + (signal ? sum_durations(signal) : 0); +} + +static void ideal_depth(struct work *work) +{ + unsigned long total_duration; + unsigned long max_duration; + struct task *task; + int i; + + /* + * The ideal depth is the longest chain of dependencies as the + * dependency chain requires sequential task execution. Each + * chain is assumed to be run in parallel on an infinite set of + * engines, so the ratelimiting step is its longest path. + */ + max_duration = 0; + igt_list_for_each(task, &work->ends, link) { + unsigned long duration = sum_durations(task); + if (duration > max_duration) + max_duration = duration; + } + + total_duration = 0; + for (i = 0; i < work->count; i++) + total_duration += work->tasks[i]->duration; + + printf("Single client\n"); + printf(" total duration %luus; %.2f wps\n", total_duration, 1e6/total_duration); + printf(" ideal duration %luus; %.2f wps\n", max_duration, 1e6/max_duration); +} + +struct sim_class { + int ninstance; + unsigned int instances[4]; +}; + +static void single_client(struct work *work) +{ + struct sim_class sim[] = { + [RCS] = { 1 }, + [BCS] = { 1 }, + [VCS] = { 2 }, + [VECS] = { 1 }, + }, *class; + IGT_LIST(sched); + struct task *task; + unsigned long max; + int i, j; + + for (i = 0; i < work->count; i++) + igt_list_init(&work->tasks[i]->sched); + + igt_list_for_each(task, &work->ends, link) { + igt_list_add_tail(&task->sched, &sched); + } + + igt_list_for_each(task, &sched, sched) { + struct dependency *dep; + + igt_list_for_each(dep, &task->signals, link) + igt_list_move_tail(&dep->signal->sched, &sched); + } + + igt_list_for_each_reverse(task, &sched, sched) { + struct dependency *dep; + int instance; + + class = &sim[task->class]; + max = class->instances[0]; + + instance = task->instance; + if (instance < 0) { + instance = 0; + for (i = 1; i < class->ninstance; i++) { + if (class->instances[i] < max) { + max = class->instances[i]; + instance = i; + } + } + } + + /* + * Greedy (first available), not true optimal scheduling. + * + * For optimal, we do have to compute the global optimal + * ordering by checking every permutation... + */ + igt_list_for_each(dep, &task->signals, link) { + if (dep->signal->deadline > max) + max = dep->signal->deadline; + } + + DBG("task %d: engine %d, instance %d; finish %lu\n", + task->step, task->class, instance, max); + + task->deadline = max + task->duration; + class->instances[instance] = task->deadline; + } + + max = 0; + for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) { + class = &sim[i]; + for (j = 0; j < class->ninstance; j++) { + if (class->instances[j] > max) + max = class->instances[j]; + } + } + + printf("Simulated client:\n"); + printf(" duration %luus; %.2f wps\n", max, 1e6/max); +} + +static void packed_clients(struct work *work) +{ + struct sim_class sim[] = { + [RCS] = { 1 }, + [BCS] = { 1 }, + [VCS] = { 2 }, + [VECS] = { 1 }, + }, *class; + IGT_LIST(sched); + struct task *task; + unsigned long min; + int i, j; + + for (i = 0; i < work->count; i++) + igt_list_init(&work->tasks[i]->sched); + + igt_list_for_each(task, &work->ends, link) + igt_list_add_tail(&task->sched, &sched); + + igt_list_for_each(task, &sched, sched) { + struct dependency *dep; + + igt_list_for_each(dep, &task->signals, link) + igt_list_move_tail(&dep->signal->sched, &sched); + } + + /* + * Compute the maximum duration required on any engine. + * + * With sufficient clients forcing maximum occupancy under their weight, + * the ratelimiting step becomes a single engine and how many clients + * it takes to fill. + */ + igt_list_for_each_reverse(task, &sched, sched) { + int instance; + + class = &sim[task->class]; + instance = task->instance; + if (instance < 0) { + instance = 0; + min = class->instances[0]; + for (i = 1; i < class->ninstance; i++) { + if (class->instances[i] < min) { + min = class->instances[i]; + instance = i; + } + } + } + + class->instances[instance] += task->duration; + } + + min = 0; + for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) { + class = &sim[i]; + for (j = 0; j < class->ninstance; j++) { + if (class->instances[j] > min) + min = class->instances[j]; + } + } + + printf("Packed clients:\n"); + printf(" duration %luus; %.2f wps\n", min, 1e6/min); +} + +static void graphviz(struct work *work) +{ +#if 0 + int i, j; + + printf("digraph {\n"); + printf(" rankdir=LR;\n"); + printf(" splines=line;\n"); + printf("\n"); + + for (i = 0; i < work->count; i++) { + struct task *task = work->tasks[i]; + + if (task->visited) + goto skip; + + printf(" subgraph cluster_%d {\n", task->ctx); + printf(" label=\"Context %d\"\n", task->ctx); + for (j = i; j < work->count; j++) { + if (work->tasks[j]->ctx == task->ctx) { + printf(" task_%03d;\n", j); + work->tasks[j]->visited = true; + } + } + printf(" }\n\n"); + +skip: + task->visited = false; + } + + for (i = 0; i < work->count; i++) { + struct task *task = work->tasks[i]; + struct dependency *dep; + + igt_list_for_each(dep, &task->signals, link) { + printf(" task_%03d -> task_%03d;\n", + dep->signal->step, task->step); + } + } + + printf("}\n"); +#endif +} + +int main(int argc, char **argv) +{ + int i; + + for (i = 1; i < argc; i++) { + FILE *file = fopen(argv[i], "r"); + struct work *work; + + if (!file) { + perror(argv[i]); + return 1; + } + + work = parse_work(file); + fclose(file); + + graphviz(work); + + ideal_depth(work); + single_client(work); + packed_clients(work); + } + + return 0; +}
A little tool I've been meaning to write for a while... Convert the .wsim into their dag and find the longest chains and evaluate them on an simulated machine. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> --- benchmarks/Makefile.sources | 1 + benchmarks/sim_wsim.c | 434 ++++++++++++++++++++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 benchmarks/sim_wsim.c