diff mbox

[i-g-t] benchmarks/wsim: Simulate and interpret .wsim

Message ID 20180511071141.24581-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson May 11, 2018, 7:11 a.m. UTC
A little tool I've been meaning to write for a while... Convert the
.wsim into their dag and find the longest chains and evaluate them on an
simulated machine.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 benchmarks/Makefile.sources |   1 +
 benchmarks/sim_wsim.c       | 434 ++++++++++++++++++++++++++++++++++++
 2 files changed, 435 insertions(+)
 create mode 100644 benchmarks/sim_wsim.c

Comments

Tvrtko Ursulin May 11, 2018, 8:31 a.m. UTC | #1
On 11/05/2018 08:11, Chris Wilson wrote:
> A little tool I've been meaning to write for a while... Convert the
> .wsim into their dag and find the longest chains and evaluate them on an
> simulated machine.

Very cool!

But I think you need to handle the 's' command which appears in the 
interesting workloads. Maybe you could just fake it as a zero duration 
batch with a data dependency.

Fence related commands would also be useful but more difficult I guess. 
As would throttling.

Delays could maybe be faked as batches as well by adding input 
dependency to everything preceding it.

Regards,

Tvrtko

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   benchmarks/Makefile.sources |   1 +
>   benchmarks/sim_wsim.c       | 434 ++++++++++++++++++++++++++++++++++++
>   2 files changed, 435 insertions(+)
>   create mode 100644 benchmarks/sim_wsim.c
> 
> diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
> index d150035aa..b80a7644c 100644
> --- a/benchmarks/Makefile.sources
> +++ b/benchmarks/Makefile.sources
> @@ -17,6 +17,7 @@ benchmarks_prog_list =			\
>   	gem_wsim			\
>   	kms_vblank			\
>   	prime_lookup			\
> +	sim_wsim			\
>   	vgem_mmap			\
>   	$(NULL)
>   
> diff --git a/benchmarks/sim_wsim.c b/benchmarks/sim_wsim.c
> new file mode 100644
> index 000000000..cb8e7adc4
> --- /dev/null
> +++ b/benchmarks/sim_wsim.c
> @@ -0,0 +1,434 @@
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include "igt_aux.h"
> +
> +#if 0
> +#define DBG(...) printf(__VA_ARGS__)
> +#else
> +#define DBG(...) do { } while(0)
> +#endif
> +
> +struct dependency {
> +	struct task *signal;
> +	struct igt_list link;
> +};
> +
> +struct task {
> +	struct igt_list link;
> +	struct igt_list sched;
> +	struct igt_list signals;
> +	int step;
> +	int ctx;
> +	int class;
> +	int instance;
> +	long min, max;
> +	long duration;
> +	long deadline;
> +	bool visited;
> +};
> +
> +struct work {
> +	struct task **tasks;
> +	unsigned int count, size;
> +
> +	struct igt_list ends;
> +};
> +
> +static void add_dependency(struct task *task, struct task *signal)
> +{
> +	struct dependency *dep;
> +
> +	DBG("%s %d (context %d, engine %d) -> %d (context %d, engine %d)\n",
> +	    __func__,
> +	    signal->step, signal->ctx, signal->class,
> +	    task->step, task->ctx, task->class);
> +
> +	dep = malloc(sizeof(*dep));
> +	dep->signal = signal;
> +	igt_list_add(&dep->link, &task->signals);
> +	igt_list_del(&signal->link);
> +	igt_list_init(&signal->link);
> +}
> +
> +enum class {
> +	RCS,
> +	BCS,
> +	VCS,
> +	VECS,
> +};
> +
> +static void add_task(struct work *work, char *line)
> +{
> +	static const char *engines[] = {
> +		[RCS]  = "rcs",
> +		[BCS]  = "bcs",
> +		[VCS]  = "vcs",
> +		[VECS] = "vecs",
> +	};
> +	struct task *task;
> +	char *token;
> +	int i;
> +
> +	task = malloc(sizeof(*task));
> +
> +	igt_list_init(&task->signals);
> +	igt_list_init(&task->sched);
> +	task->step = work->count;
> +	task->visited = false;
> +
> +	/*
> +	 * 1.RCS.2800-3100.-1.0
> +	 * - context
> +	 * - engine
> +	 * - delay
> +	 * - dependencies
> +	 * - sync
> +	 */
> +	DBG("line='%s'\n", line);
> +
> +	/* context */
> +	token = strtok(line, ".");
> +	DBG("context='%s'\n", token);
> +	task->ctx  = atoi(token);
> +	if (!task->ctx)
> +		return;
> +
> +	/* engine */
> +	token = strtok(NULL, ".");
> +	DBG("engine='%s'\n", token);
> +	task->class = -1;
> +	for (i = 0; i < sizeof(engines)/sizeof(*engines); i++) {
> +		const int len = strlen(engines[i]);
> +		if (!strncasecmp(token, engines[i], len)) {
> +			task->class = i;
> +			if (token[len])
> +				task->instance = atoi(token + len);
> +			else
> +				task->instance = -1;
> +			break;
> +		}
> +	}
> +
> +	/* delay */
> +	token = strtok(NULL, ".");
> +	DBG("delay='%s'\n", token);
> +	task->min = strtol(token, &token, 0);
> +	if (*token)
> +		task->max = strtol(token + 1, NULL, 0);
> +	else
> +		task->max = task->min;
> +	task->duration = (task->min + task->max) / 2;
> +	DBG("min=%lu, max=%lu; duration=%lu\n", task->min, task->max, task->duration);
> +
> +	/* dependencies */
> +	token = strtok(NULL, ".");
> +	DBG("deps='%s'\n", token);
> +	while ((i = strtol(token, &token, 0))) {
> +		add_dependency(task, work->tasks[work->count + i]);
> +		if (*token)
> +			token++;
> +	}
> +
> +	/* add a dependency for the context+engine timeline */
> +	for (i = work->count; --i >= 0; ) {
> +		if (work->tasks[i]->ctx == task->ctx &&
> +		    work->tasks[i]->class == task->class) {
> +			add_dependency(task, work->tasks[i]);
> +			break;
> +		}
> +	}
> +
> +	igt_list_add(&task->link, &work->ends);
> +	work->tasks[work->count++] = task;
> +}
> +
> +static struct work *parse_work(FILE *file)
> +{
> +	struct work *work;
> +	char *line = NULL;
> +	size_t len = 0;
> +
> +	work = malloc(sizeof(*work));
> +	igt_list_init(&work->ends);
> +
> +	work->size = 64;
> +	work->count = 0;
> +	work->tasks = malloc(sizeof(*work->tasks) * work->size);
> +
> +	while (getline(&line, &len, file) != -1) {
> +		if (work->count == work->size) {
> +			work->tasks = realloc(work->tasks,
> +					      sizeof(*work->tasks) * work->size);
> +			work->size *= 2;
> +		}
> +		add_task(work, line);
> +	}
> +
> +	free(line);
> +
> +	DBG("%d tasks\n", work->count);
> +	return work;
> +}
> +
> +static unsigned long sum_durations(struct task *task)
> +{
> +	unsigned long max_duration = 0;
> +	struct task *signal = NULL;
> +	struct dependency *dep;
> +
> +	igt_list_for_each(dep, &task->signals, link) {
> +		if (dep->signal->duration > max_duration) {
> +			signal = dep->signal;
> +			max_duration = signal->duration;
> +		}
> +	}
> +
> +	return task->duration + (signal ? sum_durations(signal) : 0);
> +}
> +
> +static void ideal_depth(struct work *work)
> +{
> +	unsigned long total_duration;
> +	unsigned long max_duration;
> +	struct task *task;
> +	int i;
> +
> +	/*
> +	 * The ideal depth is the longest chain of dependencies as the
> +	 * dependency chain requires sequential task execution. Each
> +	 * chain is assumed to be run in parallel on an infinite set of
> +	 * engines, so the ratelimiting step is its longest path.
> +	 */
> +	max_duration = 0;
> +	igt_list_for_each(task, &work->ends, link) {
> +		unsigned long duration = sum_durations(task);
> +		if (duration > max_duration)
> +			max_duration = duration;
> +	}
> +
> +	total_duration = 0;
> +	for (i = 0; i < work->count; i++)
> +		total_duration += work->tasks[i]->duration;
> +
> +	printf("Single client\n");
> +	printf("   total duration %luus; %.2f wps\n", total_duration, 1e6/total_duration);
> +	printf("   ideal duration %luus; %.2f wps\n", max_duration, 1e6/max_duration);
> +}
> +
> +struct sim_class {
> +	int ninstance;
> +	unsigned int instances[4];
> +};
> +
> +static void single_client(struct work *work)
> +{
> +	struct sim_class sim[] = {
> +		[RCS]  = { 1 },
> +		[BCS]  = { 1 },
> +		[VCS]  = { 2 },
> +		[VECS] = { 1 },
> +	}, *class;
> +	IGT_LIST(sched);
> +	struct task *task;
> +	unsigned long max;
> +	int i, j;
> +
> +	for (i = 0; i < work->count; i++)
> +		igt_list_init(&work->tasks[i]->sched);
> +
> +	igt_list_for_each(task, &work->ends, link) {
> +		igt_list_add_tail(&task->sched, &sched);
> +	}
> +
> +	igt_list_for_each(task, &sched, sched) {
> +		struct dependency *dep;
> +
> +		igt_list_for_each(dep, &task->signals, link)
> +			igt_list_move_tail(&dep->signal->sched, &sched);
> +	}
> +
> +	igt_list_for_each_reverse(task, &sched, sched) {
> +		struct dependency *dep;
> +		int instance;
> +
> +		class = &sim[task->class];
> +		max = class->instances[0];
> +
> +		instance = task->instance;
> +		if (instance < 0) {
> +			instance = 0;
> +			for (i = 1; i < class->ninstance; i++) {
> +				if (class->instances[i] < max) {
> +					max = class->instances[i];
> +					instance = i;
> +				}
> +			}
> +		}
> +
> +		/*
> +		 * Greedy (first available), not true optimal scheduling.
> +		 *
> +		 * For optimal, we do have to compute the global optimal
> +		 * ordering by checking every permutation...
> +		 */
> +		igt_list_for_each(dep, &task->signals, link) {
> +			if (dep->signal->deadline > max)
> +				max = dep->signal->deadline;
> +		}
> +
> +		DBG("task %d: engine %d, instance %d; finish %lu\n",
> +		    task->step, task->class, instance, max);
> +
> +		task->deadline = max + task->duration;
> +		class->instances[instance] = task->deadline;
> +	}
> +
> +	max = 0;
> +	for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) {
> +		class = &sim[i];
> +		for (j = 0; j < class->ninstance; j++) {
> +			if (class->instances[j] > max)
> +				max = class->instances[j];
> +		}
> +	}
> +
> +	printf("Simulated client:\n");
> +	printf("   duration %luus; %.2f wps\n", max, 1e6/max);
> +}
> +
> +static void packed_clients(struct work *work)
> +{
> +	struct sim_class sim[] = {
> +		[RCS]  = { 1 },
> +		[BCS]  = { 1 },
> +		[VCS]  = { 2 },
> +		[VECS] = { 1 },
> +	}, *class;
> +	IGT_LIST(sched);
> +	struct task *task;
> +	unsigned long min;
> +	int i, j;
> +
> +	for (i = 0; i < work->count; i++)
> +		igt_list_init(&work->tasks[i]->sched);
> +
> +	igt_list_for_each(task, &work->ends, link)
> +		igt_list_add_tail(&task->sched, &sched);
> +
> +	igt_list_for_each(task, &sched, sched) {
> +		struct dependency *dep;
> +
> +		igt_list_for_each(dep, &task->signals, link)
> +			igt_list_move_tail(&dep->signal->sched, &sched);
> +	}
> +
> +	/*
> +	 * Compute the maximum duration required on any engine.
> +	 *
> +	 * With sufficient clients forcing maximum occupancy under their weight,
> +	 * the ratelimiting step becomes a single engine and how many clients
> +	 * it takes to fill.
> +	 */
> +	igt_list_for_each_reverse(task, &sched, sched) {
> +		int instance;
> +
> +		class = &sim[task->class];
> +		instance = task->instance;
> +		if (instance < 0) {
> +			instance = 0;
> +			min = class->instances[0];
> +			for (i = 1; i < class->ninstance; i++) {
> +				if (class->instances[i] < min) {
> +					min = class->instances[i];
> +					instance = i;
> +				}
> +			}
> +		}
> +
> +		class->instances[instance] += task->duration;
> +	}
> +
> +	min = 0;
> +	for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) {
> +		class = &sim[i];
> +		for (j = 0; j < class->ninstance; j++) {
> +			if (class->instances[j] > min)
> +				min = class->instances[j];
> +		}
> +	}
> +
> +	printf("Packed clients:\n");
> +	printf("   duration %luus; %.2f wps\n", min, 1e6/min);
> +}
> +
> +static void graphviz(struct work *work)
> +{
> +#if 0
> +	int i, j;
> +
> +	printf("digraph {\n");
> +	printf("  rankdir=LR;\n");
> +	printf("  splines=line;\n");
> +	printf("\n");
> +
> +	for (i = 0; i < work->count; i++) {
> +		struct task *task = work->tasks[i];
> +
> +		if (task->visited)
> +			goto skip;
> +
> +		printf("  subgraph cluster_%d {\n", task->ctx);
> +		printf("    label=\"Context %d\"\n", task->ctx);
> +		for (j = i; j < work->count; j++) {
> +			if (work->tasks[j]->ctx == task->ctx) {
> +				printf("    task_%03d;\n", j);
> +				work->tasks[j]->visited = true;
> +			}
> +		}
> +		printf("  }\n\n");
> +
> +skip:
> +		task->visited = false;
> +	}
> +
> +	for (i = 0; i < work->count; i++) {
> +		struct task *task = work->tasks[i];
> +		struct dependency *dep;
> +
> +		igt_list_for_each(dep, &task->signals, link) {
> +			printf("  task_%03d -> task_%03d;\n",
> +			       dep->signal->step, task->step);
> +		}
> +	}
> +
> +	printf("}\n");
> +#endif
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	int i;
> +
> +	for (i = 1; i < argc; i++) {
> +		FILE *file = fopen(argv[i], "r");
> +		struct work *work;
> +
> +		if (!file) {
> +			perror(argv[i]);
> +			return 1;
> +		}
> +
> +		work = parse_work(file);
> +		fclose(file);
> +
> +		graphviz(work);
> +
> +		ideal_depth(work);
> +		single_client(work);
> +		packed_clients(work);
> +	}
> +
> +	return 0;
> +}
>
Chris Wilson May 11, 2018, 9:04 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
> 
> On 11/05/2018 08:11, Chris Wilson wrote:
> > A little tool I've been meaning to write for a while... Convert the
> > .wsim into their dag and find the longest chains and evaluate them on an
> > simulated machine.
> 
> Very cool!
> 
> But I think you need to handle the 's' command which appears in the 
> interesting workloads. Maybe you could just fake it as a zero duration 
> batch with a data dependency.

Barriers were even more trivial than that. :)
 
> Fence related commands would also be useful but more difficult I guess. 
> As would throttling.

I haven't groked your fence command lines so skipped them. Fortunately,
syncs appears to be the only one missed from the "interesting" set of
wsim.

> Delays could maybe be faked as batches as well by adding input 
> dependency to everything preceding it.

It would definitely want to be a task with say .engine = -1. It implies
a full barrier as well? Not limited to waiting for an earlier step?
-Chris
Tvrtko Ursulin May 11, 2018, 9:21 a.m. UTC | #3
On 11/05/2018 10:04, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
>>
>> On 11/05/2018 08:11, Chris Wilson wrote:
>>> A little tool I've been meaning to write for a while... Convert the
>>> .wsim into their dag and find the longest chains and evaluate them on an
>>> simulated machine.
>>
>> Very cool!
>>
>> But I think you need to handle the 's' command which appears in the
>> interesting workloads. Maybe you could just fake it as a zero duration
>> batch with a data dependency.
> 
> Barriers were even more trivial than that. :)
>   
>> Fence related commands would also be useful but more difficult I guess.
>> As would throttling.
> 
> I haven't groked your fence command lines so skipped them. Fortunately,
> syncs appears to be the only one missed from the "interesting" set of
> wsim.

Fences are created with 'f' and then batches can reference them from the 
dependency list the same as with data dependencies. When created they 
are unsignaled and then 'a' advances the fence addresses relatively to 
the 'a' command. For instance 'a.-2' means "signal ready the fence 
created two steps above".

>> Delays could maybe be faked as batches as well by adding input
>> dependency to everything preceding it.
> 
> It would definitely want to be a task with say .engine = -1. It implies
> a full barrier as well? Not limited to waiting for an earlier step?

There is 'd' which is a normal/dumb fixed delay, and then 'p' which is 
relative to start of the loop, meaning "wait how much is needed so that 
time from start of loop to this step is x".

Regards,

Tvrtko
Chris Wilson May 11, 2018, 9:33 a.m. UTC | #4
Quoting Tvrtko Ursulin (2018-05-11 10:21:05)
> 
> On 11/05/2018 10:04, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
> >>
> >> On 11/05/2018 08:11, Chris Wilson wrote:
> >>> A little tool I've been meaning to write for a while... Convert the
> >>> .wsim into their dag and find the longest chains and evaluate them on an
> >>> simulated machine.
> >>
> >> Very cool!
> >>
> >> But I think you need to handle the 's' command which appears in the
> >> interesting workloads. Maybe you could just fake it as a zero duration
> >> batch with a data dependency.
> > 
> > Barriers were even more trivial than that. :)
> >   
> >> Fence related commands would also be useful but more difficult I guess.
> >> As would throttling.
> > 
> > I haven't groked your fence command lines so skipped them. Fortunately,
> > syncs appears to be the only one missed from the "interesting" set of
> > wsim.
> 
> Fences are created with 'f' and then batches can reference them from the 
> dependency list the same as with data dependencies. When created they 
> are unsignaled and then 'a' advances the fence addresses relatively to 
> the 'a' command. For instance 'a.-2' means "signal ready the fence 
> created two steps above".

I really wish you did named fences :)

How is 'a' synchronised? If I just did the 'f' as a dependency then
signaled it sometime later, it adds no delay to pipeline.

media_nn_1080p_s1.wsim:
f
1.VCS1.6500-8000.f-1.0
1.VCS2.6500-8000.f-2.0
a.-3
2.RCS.2000-4000.-2/-3.0
3.RCS.3000-5000.-1.0
3.RCS.23000-27000.0.0
3.VCS.16000-20000.-1.1

From the point of view of scheduling f,a become a no-op.

media_nn_1080p_s2.wsim:
1.VCS.13000-17000.0.0
2.RCS.2000-4000.-1.0
3.RCS.3000-5000.-1.0
3.RCS.23000-27000.0.0
f
3.VCS1.8000-10000.-2/f-1.0
3.VCS2.8000-10000.-3/f-2.0
a.-3
s.-3
s.-3

Same here. "f" appears ungrounded.

> >> Delays could maybe be faked as batches as well by adding input
> >> dependency to everything preceding it.
> > 
> > It would definitely want to be a task with say .engine = -1. It implies
> > a full barrier as well? Not limited to waiting for an earlier step?
> 
> There is 'd' which is a normal/dumb fixed delay, and then 'p' which is 
> relative to start of the loop, meaning "wait how much is needed so that 
> time from start of loop to this step is x".

There aren't any '^[dp]' in the current set of .wsim, can I just
pretend they don't exist for now? ;)
-Chris
Tvrtko Ursulin May 11, 2018, 11:56 a.m. UTC | #5
On 11/05/2018 10:33, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-11 10:21:05)
>>
>> On 11/05/2018 10:04, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
>>>>
>>>> On 11/05/2018 08:11, Chris Wilson wrote:
>>>>> A little tool I've been meaning to write for a while... Convert the
>>>>> .wsim into their dag and find the longest chains and evaluate them on an
>>>>> simulated machine.
>>>>
>>>> Very cool!
>>>>
>>>> But I think you need to handle the 's' command which appears in the
>>>> interesting workloads. Maybe you could just fake it as a zero duration
>>>> batch with a data dependency.
>>>
>>> Barriers were even more trivial than that. :)
>>>    
>>>> Fence related commands would also be useful but more difficult I guess.
>>>> As would throttling.
>>>
>>> I haven't groked your fence command lines so skipped them. Fortunately,
>>> syncs appears to be the only one missed from the "interesting" set of
>>> wsim.
>>
>> Fences are created with 'f' and then batches can reference them from the
>> dependency list the same as with data dependencies. When created they
>> are unsignaled and then 'a' advances the fence addresses relatively to
>> the 'a' command. For instance 'a.-2' means "signal ready the fence
>> created two steps above".
> 
> I really wish you did named fences :)

Should be possible to add/replace but would need another data structure 
to look them up then.

> How is 'a' synchronised? If I just did the 'f' as a dependency then
> signaled it sometime later, it adds no delay to pipeline.
> 
> media_nn_1080p_s1.wsim:
> f
> 1.VCS1.6500-8000.f-1.0
> 1.VCS2.6500-8000.f-2.0
> a.-3
> 2.RCS.2000-4000.-2/-3.0
> 3.RCS.3000-5000.-1.0
> 3.RCS.23000-27000.0.0
> 3.VCS.16000-20000.-1.1
> 
>  From the point of view of scheduling f,a become a no-op.
> 
> media_nn_1080p_s2.wsim:
> 1.VCS.13000-17000.0.0
> 2.RCS.2000-4000.-1.0
> 3.RCS.3000-5000.-1.0
> 3.RCS.23000-27000.0.0
> f
> 3.VCS1.8000-10000.-2/f-1.0
> 3.VCS2.8000-10000.-3/f-2.0
> a.-3
> s.-3
> s.-3
> 
> Same here. "f" appears ungrounded.

Okay in these examples I guess it doesn't do much for a single client. 
In multiple client cases it can have an effect. Or if there was 
something in between the batches with input fence dependencies.

>>>> Delays could maybe be faked as batches as well by adding input
>>>> dependency to everything preceding it.
>>>
>>> It would definitely want to be a task with say .engine = -1. It implies
>>> a full barrier as well? Not limited to waiting for an earlier step?
>>
>> There is 'd' which is a normal/dumb fixed delay, and then 'p' which is
>> relative to start of the loop, meaning "wait how much is needed so that
>> time from start of loop to this step is x".
> 
> There aren't any '^[dp]' in the current set of .wsim, can I just
> pretend they don't exist for now? ;)

Probably. There is -a option to gem_wsim which media-bench uses in some 
modes essentially to convert any workload into realtime. For instance -a 
p.16667 (AFAIR) makes any workload not run faster than 60fps. But your 
sim_wsim I guess you don't care about that.

Regards,

Tvrtko
Chris Wilson July 10, 2018, 1:47 p.m. UTC | #6
Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
> 
> On 11/05/2018 08:11, Chris Wilson wrote:
> > A little tool I've been meaning to write for a while... Convert the
> > .wsim into their dag and find the longest chains and evaluate them on an
> > simulated machine.
> 
> Very cool!

Would you care to ack it in its current form, knowing that we will fix
it whenever we find a corner case interesting enough to study?
-Chris
Tvrtko Ursulin July 10, 2018, 3:38 p.m. UTC | #7
On 10/07/2018 14:47, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
>>
>> On 11/05/2018 08:11, Chris Wilson wrote:
>>> A little tool I've been meaning to write for a while... Convert the
>>> .wsim into their dag and find the longest chains and evaluate them on an
>>> simulated machine.
>>
>> Very cool!
> 
> Would you care to ack it in its current form, knowing that we will fix
> it whenever we find a corner case interesting enough to study?

I think just a little bit more polish is needed. I've just tried it out 
and polish I can see is:

1. -h / --help with just a brief comment on what the tool does and usage.

2. Lead stats displayed for each cmdline argument with parsed filename.

3. Replace result section names "Single client", "Simulated clients" and 
item names "total/ideal/packed" with a more descriptive text which would 
hopefully be somewhat self-explanatory what the numbers represent.

4. Report garbage input (and unknown wsim commands) instead of reporting 
some numbers for instance for ./sim_wsim README :)

5. #ifdef 0 graphviz ? add --graphviz cmddline option?

6. Oh.. and meson makefiles.. :)

Regards,

Tvrtko
Chris Wilson July 10, 2018, 3:40 p.m. UTC | #8
Quoting Tvrtko Ursulin (2018-07-10 16:38:14)
> 
> On 10/07/2018 14:47, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
> >>
> >> On 11/05/2018 08:11, Chris Wilson wrote:
> >>> A little tool I've been meaning to write for a while... Convert the
> >>> .wsim into their dag and find the longest chains and evaluate them on an
> >>> simulated machine.
> >>
> >> Very cool!
> > 
> > Would you care to ack it in its current form, knowing that we will fix
> > it whenever we find a corner case interesting enough to study?
> 
> I think just a little bit more polish is needed. I've just tried it out 
> and polish I can see is:
> 
> 1. -h / --help with just a brief comment on what the tool does and usage.
> 
> 2. Lead stats displayed for each cmdline argument with parsed filename.
> 
> 3. Replace result section names "Single client", "Simulated clients" and 
> item names "total/ideal/packed" with a more descriptive text which would 
> hopefully be somewhat self-explanatory what the numbers represent.
> 
> 4. Report garbage input (and unknown wsim commands) instead of reporting 
> some numbers for instance for ./sim_wsim README :)
> 
> 5. #ifdef 0 graphviz ? add --graphviz cmddline option?
> 
> 6. Oh.. and meson makefiles.. :)

But if it was upstream, you would just contribute those improvements
yourself.

So you are saying that this is just not useful?
-Chris
Tvrtko Ursulin July 10, 2018, 4:05 p.m. UTC | #9
On 10/07/2018 16:40, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-07-10 16:38:14)
>>
>> On 10/07/2018 14:47, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-05-11 09:31:52)
>>>>
>>>> On 11/05/2018 08:11, Chris Wilson wrote:
>>>>> A little tool I've been meaning to write for a while... Convert the
>>>>> .wsim into their dag and find the longest chains and evaluate them on an
>>>>> simulated machine.
>>>>
>>>> Very cool!
>>>
>>> Would you care to ack it in its current form, knowing that we will fix
>>> it whenever we find a corner case interesting enough to study?
>>
>> I think just a little bit more polish is needed. I've just tried it out
>> and polish I can see is:
>>
>> 1. -h / --help with just a brief comment on what the tool does and usage.
>>
>> 2. Lead stats displayed for each cmdline argument with parsed filename.
>>
>> 3. Replace result section names "Single client", "Simulated clients" and
>> item names "total/ideal/packed" with a more descriptive text which would
>> hopefully be somewhat self-explanatory what the numbers represent.
>>
>> 4. Report garbage input (and unknown wsim commands) instead of reporting
>> some numbers for instance for ./sim_wsim README :)
>>
>> 5. #ifdef 0 graphviz ? add --graphviz cmddline option?
>>
>> 6. Oh.. and meson makefiles.. :)
> 
> But if it was upstream, you would just contribute those improvements
> yourself.
> 
> So you are saying that this is just not useful?

No, just that I think we have to have some minimum standard before we 
can make it upstream. If you want I can implement these bits and send a 
v3. Then it will just be a matter of finding a third person to review it 
since we will both be authors. :)

Regards,

Tvrtko
diff mbox

Patch

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index d150035aa..b80a7644c 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -17,6 +17,7 @@  benchmarks_prog_list =			\
 	gem_wsim			\
 	kms_vblank			\
 	prime_lookup			\
+	sim_wsim			\
 	vgem_mmap			\
 	$(NULL)
 
diff --git a/benchmarks/sim_wsim.c b/benchmarks/sim_wsim.c
new file mode 100644
index 000000000..cb8e7adc4
--- /dev/null
+++ b/benchmarks/sim_wsim.c
@@ -0,0 +1,434 @@ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "igt_aux.h"
+
+#if 0
+#define DBG(...) printf(__VA_ARGS__)
+#else
+#define DBG(...) do { } while(0)
+#endif
+
+struct dependency {
+	struct task *signal;
+	struct igt_list link;
+};
+
+struct task {
+	struct igt_list link;
+	struct igt_list sched;
+	struct igt_list signals;
+	int step;
+	int ctx;
+	int class;
+	int instance;
+	long min, max;
+	long duration;
+	long deadline;
+	bool visited;
+};
+
+struct work {
+	struct task **tasks;
+	unsigned int count, size;
+
+	struct igt_list ends;
+};
+
+static void add_dependency(struct task *task, struct task *signal)
+{
+	struct dependency *dep;
+
+	DBG("%s %d (context %d, engine %d) -> %d (context %d, engine %d)\n",
+	    __func__,
+	    signal->step, signal->ctx, signal->class,
+	    task->step, task->ctx, task->class);
+
+	dep = malloc(sizeof(*dep));
+	dep->signal = signal;
+	igt_list_add(&dep->link, &task->signals);
+	igt_list_del(&signal->link);
+	igt_list_init(&signal->link);
+}
+
+enum class {
+	RCS,
+	BCS,
+	VCS,
+	VECS,
+};
+
+static void add_task(struct work *work, char *line)
+{
+	static const char *engines[] = {
+		[RCS]  = "rcs",
+		[BCS]  = "bcs",
+		[VCS]  = "vcs",
+		[VECS] = "vecs",
+	};
+	struct task *task;
+	char *token;
+	int i;
+
+	task = malloc(sizeof(*task));
+
+	igt_list_init(&task->signals);
+	igt_list_init(&task->sched);
+	task->step = work->count;
+	task->visited = false;
+
+	/*
+	 * 1.RCS.2800-3100.-1.0
+	 * - context
+	 * - engine
+	 * - delay
+	 * - dependencies
+	 * - sync
+	 */
+	DBG("line='%s'\n", line);
+
+	/* context */
+	token = strtok(line, ".");
+	DBG("context='%s'\n", token);
+	task->ctx  = atoi(token);
+	if (!task->ctx)
+		return;
+
+	/* engine */
+	token = strtok(NULL, ".");
+	DBG("engine='%s'\n", token);
+	task->class = -1;
+	for (i = 0; i < sizeof(engines)/sizeof(*engines); i++) {
+		const int len = strlen(engines[i]);
+		if (!strncasecmp(token, engines[i], len)) {
+			task->class = i;
+			if (token[len])
+				task->instance = atoi(token + len);
+			else
+				task->instance = -1;
+			break;
+		}
+	}
+
+	/* delay */
+	token = strtok(NULL, ".");
+	DBG("delay='%s'\n", token);
+	task->min = strtol(token, &token, 0);
+	if (*token)
+		task->max = strtol(token + 1, NULL, 0);
+	else
+		task->max = task->min;
+	task->duration = (task->min + task->max) / 2;
+	DBG("min=%lu, max=%lu; duration=%lu\n", task->min, task->max, task->duration);
+
+	/* dependencies */
+	token = strtok(NULL, ".");
+	DBG("deps='%s'\n", token);
+	while ((i = strtol(token, &token, 0))) {
+		add_dependency(task, work->tasks[work->count + i]);
+		if (*token)
+			token++;
+	}
+
+	/* add a dependency for the context+engine timeline */
+	for (i = work->count; --i >= 0; ) {
+		if (work->tasks[i]->ctx == task->ctx &&
+		    work->tasks[i]->class == task->class) {
+			add_dependency(task, work->tasks[i]);
+			break;
+		}
+	}
+
+	igt_list_add(&task->link, &work->ends);
+	work->tasks[work->count++] = task;
+}
+
+static struct work *parse_work(FILE *file)
+{
+	struct work *work;
+	char *line = NULL;
+	size_t len = 0;
+
+	work = malloc(sizeof(*work));
+	igt_list_init(&work->ends);
+
+	work->size = 64;
+	work->count = 0;
+	work->tasks = malloc(sizeof(*work->tasks) * work->size);
+
+	while (getline(&line, &len, file) != -1) {
+		if (work->count == work->size) {
+			work->tasks = realloc(work->tasks,
+					      sizeof(*work->tasks) * work->size);
+			work->size *= 2;
+		}
+		add_task(work, line);
+	}
+
+	free(line);
+
+	DBG("%d tasks\n", work->count);
+	return work;
+}
+
+static unsigned long sum_durations(struct task *task)
+{
+	unsigned long max_duration = 0;
+	struct task *signal = NULL;
+	struct dependency *dep;
+
+	igt_list_for_each(dep, &task->signals, link) {
+		if (dep->signal->duration > max_duration) {
+			signal = dep->signal;
+			max_duration = signal->duration;
+		}
+	}
+
+	return task->duration + (signal ? sum_durations(signal) : 0);
+}
+
+static void ideal_depth(struct work *work)
+{
+	unsigned long total_duration;
+	unsigned long max_duration;
+	struct task *task;
+	int i;
+
+	/*
+	 * The ideal depth is the longest chain of dependencies as the
+	 * dependency chain requires sequential task execution. Each
+	 * chain is assumed to be run in parallel on an infinite set of
+	 * engines, so the ratelimiting step is its longest path.
+	 */
+	max_duration = 0;
+	igt_list_for_each(task, &work->ends, link) {
+		unsigned long duration = sum_durations(task);
+		if (duration > max_duration)
+			max_duration = duration;
+	}
+
+	total_duration = 0;
+	for (i = 0; i < work->count; i++)
+		total_duration += work->tasks[i]->duration;
+
+	printf("Single client\n");
+	printf("   total duration %luus; %.2f wps\n", total_duration, 1e6/total_duration);
+	printf("   ideal duration %luus; %.2f wps\n", max_duration, 1e6/max_duration);
+}
+
+struct sim_class {
+	int ninstance;
+	unsigned int instances[4];
+};
+
+static void single_client(struct work *work)
+{
+	struct sim_class sim[] = {
+		[RCS]  = { 1 },
+		[BCS]  = { 1 },
+		[VCS]  = { 2 },
+		[VECS] = { 1 },
+	}, *class;
+	IGT_LIST(sched);
+	struct task *task;
+	unsigned long max;
+	int i, j;
+
+	for (i = 0; i < work->count; i++)
+		igt_list_init(&work->tasks[i]->sched);
+
+	igt_list_for_each(task, &work->ends, link) {
+		igt_list_add_tail(&task->sched, &sched);
+	}
+
+	igt_list_for_each(task, &sched, sched) {
+		struct dependency *dep;
+
+		igt_list_for_each(dep, &task->signals, link)
+			igt_list_move_tail(&dep->signal->sched, &sched);
+	}
+
+	igt_list_for_each_reverse(task, &sched, sched) {
+		struct dependency *dep;
+		int instance;
+
+		class = &sim[task->class];
+		max = class->instances[0];
+
+		instance = task->instance;
+		if (instance < 0) {
+			instance = 0;
+			for (i = 1; i < class->ninstance; i++) {
+				if (class->instances[i] < max) {
+					max = class->instances[i];
+					instance = i;
+				}
+			}
+		}
+
+		/*
+		 * Greedy (first available), not true optimal scheduling.
+		 *
+		 * For optimal, we do have to compute the global optimal
+		 * ordering by checking every permutation...
+		 */
+		igt_list_for_each(dep, &task->signals, link) {
+			if (dep->signal->deadline > max)
+				max = dep->signal->deadline;
+		}
+
+		DBG("task %d: engine %d, instance %d; finish %lu\n",
+		    task->step, task->class, instance, max);
+
+		task->deadline = max + task->duration;
+		class->instances[instance] = task->deadline;
+	}
+
+	max = 0;
+	for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) {
+		class = &sim[i];
+		for (j = 0; j < class->ninstance; j++) {
+			if (class->instances[j] > max)
+				max = class->instances[j];
+		}
+	}
+
+	printf("Simulated client:\n");
+	printf("   duration %luus; %.2f wps\n", max, 1e6/max);
+}
+
+static void packed_clients(struct work *work)
+{
+	struct sim_class sim[] = {
+		[RCS]  = { 1 },
+		[BCS]  = { 1 },
+		[VCS]  = { 2 },
+		[VECS] = { 1 },
+	}, *class;
+	IGT_LIST(sched);
+	struct task *task;
+	unsigned long min;
+	int i, j;
+
+	for (i = 0; i < work->count; i++)
+		igt_list_init(&work->tasks[i]->sched);
+
+	igt_list_for_each(task, &work->ends, link)
+		igt_list_add_tail(&task->sched, &sched);
+
+	igt_list_for_each(task, &sched, sched) {
+		struct dependency *dep;
+
+		igt_list_for_each(dep, &task->signals, link)
+			igt_list_move_tail(&dep->signal->sched, &sched);
+	}
+
+	/*
+	 * Compute the maximum duration required on any engine.
+	 *
+	 * With sufficient clients forcing maximum occupancy under their weight,
+	 * the ratelimiting step becomes a single engine and how many clients
+	 * it takes to fill.
+	 */
+	igt_list_for_each_reverse(task, &sched, sched) {
+		int instance;
+
+		class = &sim[task->class];
+		instance = task->instance;
+		if (instance < 0) {
+			instance = 0;
+			min = class->instances[0];
+			for (i = 1; i < class->ninstance; i++) {
+				if (class->instances[i] < min) {
+					min = class->instances[i];
+					instance = i;
+				}
+			}
+		}
+
+		class->instances[instance] += task->duration;
+	}
+
+	min = 0;
+	for (i = 0; i < sizeof(sim)/sizeof(sim[0]); i++) {
+		class = &sim[i];
+		for (j = 0; j < class->ninstance; j++) {
+			if (class->instances[j] > min)
+				min = class->instances[j];
+		}
+	}
+
+	printf("Packed clients:\n");
+	printf("   duration %luus; %.2f wps\n", min, 1e6/min);
+}
+
+static void graphviz(struct work *work)
+{
+#if 0
+	int i, j;
+
+	printf("digraph {\n");
+	printf("  rankdir=LR;\n");
+	printf("  splines=line;\n");
+	printf("\n");
+
+	for (i = 0; i < work->count; i++) {
+		struct task *task = work->tasks[i];
+
+		if (task->visited)
+			goto skip;
+
+		printf("  subgraph cluster_%d {\n", task->ctx);
+		printf("    label=\"Context %d\"\n", task->ctx);
+		for (j = i; j < work->count; j++) {
+			if (work->tasks[j]->ctx == task->ctx) {
+				printf("    task_%03d;\n", j);
+				work->tasks[j]->visited = true;
+			}
+		}
+		printf("  }\n\n");
+
+skip:
+		task->visited = false;
+	}
+
+	for (i = 0; i < work->count; i++) {
+		struct task *task = work->tasks[i];
+		struct dependency *dep;
+
+		igt_list_for_each(dep, &task->signals, link) {
+			printf("  task_%03d -> task_%03d;\n",
+			       dep->signal->step, task->step);
+		}
+	}
+
+	printf("}\n");
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+
+	for (i = 1; i < argc; i++) {
+		FILE *file = fopen(argv[i], "r");
+		struct work *work;
+
+		if (!file) {
+			perror(argv[i]);
+			return 1;
+		}
+
+		work = parse_work(file);
+		fclose(file);
+
+		graphviz(work);
+
+		ideal_depth(work);
+		single_client(work);
+		packed_clients(work);
+	}
+
+	return 0;
+}