diff mbox

[i-g-t,3/6] igt/gem_ctx_thrash: Order writes between contexts

Message ID 20180514080251.11224-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson May 14, 2018, 8:02 a.m. UTC
The test wrote to the same dwords from multiple contexts, assuming that
the writes would be ordered by its submission. However, as it was using
multiple contexts without a write hazard, those timelines are not
coupled and the requests may be emitted to hw in any order. So emit a
write hazard for each individual dword in the scratch (avoiding the
write hazard for the scratch as a whole) to ensure the writes do occur
in the expected order.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

Comments

Tvrtko Ursulin May 14, 2018, 10:59 a.m. UTC | #1
On 14/05/2018 09:02, Chris Wilson wrote:
> The test wrote to the same dwords from multiple contexts, assuming that
> the writes would be ordered by its submission. However, as it was using
> multiple contexts without a write hazard, those timelines are not
> coupled and the requests may be emitted to hw in any order. So emit a
> write hazard for each individual dword in the scratch (avoiding the
> write hazard for the scratch as a whole) to ensure the writes do occur
> in the expected order.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
>   1 file changed, 53 insertions(+), 39 deletions(-)
> 
> diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
> index 2cd9cfebf..b25f95f13 100644
> --- a/tests/gem_ctx_thrash.c
> +++ b/tests/gem_ctx_thrash.c
> @@ -90,17 +90,13 @@ static void single(const char *name, bool all_engines)
>   {
>   	struct drm_i915_gem_exec_object2 *obj;
>   	struct drm_i915_gem_relocation_entry *reloc;
> -	unsigned engines[16];
> -	uint64_t size;
> -	uint32_t *ctx, *map, scratch;
> -	unsigned num_ctx;
> -	int fd, gen, num_engines;
> +	unsigned int engines[16], num_engines, num_ctx;
> +	uint32_t *ctx, *map, scratch, size;
> +	int fd, gen;
>   #define MAX_LOOP 16
>   
> -	fd = drm_open_driver_master(DRIVER_INTEL);
> +	fd = drm_open_driver(DRIVER_INTEL);
>   	igt_require_gem(fd);
> -	igt_require(gem_can_store_dword(fd, 0));
> -
>   	gem_require_contexts(fd);
>   
>   	gen = intel_gen(intel_get_drm_devid(fd));
> @@ -108,54 +104,77 @@ static void single(const char *name, bool all_engines)
>   	num_engines = 0;
>   	if (all_engines) {
>   		unsigned engine;
> +
>   		for_each_physical_engine(fd, engine) {
> +			if (!gem_can_store_dword(fd, engine))
> +				continue;
> +
>   			engines[num_engines++] = engine;
>   			if (num_engines == ARRAY_SIZE(engines))
>   				break;
>   		}
> -	} else
> +	} else {
> +		igt_require(gem_can_store_dword(fd, 0));
>   		engines[num_engines++] = 0;
> +	}
> +	igt_require(num_engines);
>   
>   	num_ctx = get_num_contexts(fd, num_engines);
>   
>   	size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
> -	scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
> +	scratch = gem_create(fd, size);
>   	gem_set_caching(fd, scratch, I915_CACHING_CACHED);
> -	obj = calloc(num_ctx, 2 * sizeof(*obj));
> -	reloc = calloc(num_ctx, sizeof(*reloc));
> +	obj = calloc(num_ctx, 3 * sizeof(*obj));
> +	reloc = calloc(num_ctx, 2 * sizeof(*reloc));
>   
>   	ctx = malloc(num_ctx * sizeof(uint32_t));
>   	igt_assert(ctx);
>   	for (unsigned n = 0; n < num_ctx; n++) {
>   		ctx[n] = gem_context_create(fd);
> -		obj[2*n + 0].handle = scratch;
> -
> -		reloc[n].target_handle = scratch;
> -		reloc[n].presumed_offset = 0;
> -		reloc[n].offset = sizeof(uint32_t);
> -		reloc[n].delta = n * sizeof(uint32_t);
> -		reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> -		reloc[n].write_domain = 0; /* lies! */
> +
> +		obj[3*n + 0].handle = gem_create(fd, 4096);
> +		reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
> +		reloc[2*n + 0].presumed_offset = 0;
> +		reloc[2*n + 0].offset = 4000;
> +		reloc[2*n + 0].delta = 0;
> +		reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
> +		reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
> +
> +		obj[3*n + 1].handle = scratch;
> +		reloc[2*n + 1].target_handle = scratch;
> +		reloc[2*n + 1].presumed_offset = 0;
> +		reloc[2*n + 1].offset = sizeof(uint32_t);
> +		reloc[2*n + 1].delta = n * sizeof(uint32_t);
> +		reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
> +		reloc[2*n + 1].write_domain = 0; /* lies! */
>   		if (gen >= 4 && gen < 8)
> -			reloc[n].offset += sizeof(uint32_t);
> +			reloc[2*n + 1].offset += sizeof(uint32_t);
>   
> -		obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
> -		obj[2*n + 1].relocation_count = 1;
> +		obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
> +		obj[3*n + 2].relocation_count = 2;
>   	}
>   
>   	map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
> -	for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> -		unsigned count = loop * num_ctx;
> +	for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> +		const unsigned int count = loop * num_ctx;
>   		uint32_t *all;
>   
>   		all = malloc(count * sizeof(uint32_t));
> -		for (unsigned n = 0; n < count; n++)
> +		for (unsigned int n = 0; n < count; n++)
>   			all[n] = ctx[n % num_ctx];
>   		igt_permute_array(all, count, xchg_int);
> -		for (unsigned n = 0; n < count; n++) {
> -			struct drm_i915_gem_execbuffer2 execbuf;
> -			unsigned r = n % num_ctx;
> -			uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
> +
> +		for (unsigned int n = 0; n < count; n++) {
> +			const unsigned int r = n % num_ctx;
> +			struct drm_i915_gem_execbuffer2 execbuf = {
> +				.buffers_ptr = to_user_pointer(&obj[3*r]),
> +				.buffer_count = 3,
> +				.flags = engines[n % num_engines],
> +				.rsvd1 = all[n],
> +			};
> +			uint64_t offset =
> +				reloc[2*r + 1].presumed_offset +
> +				reloc[2*r + 1].delta;
>   			uint32_t handle = gem_create(fd, 4096);
>   			uint32_t buf[16];
>   			int i;
> @@ -176,27 +195,22 @@ static void single(const char *name, bool all_engines)
>   			buf[++i] = all[n];
>   			buf[++i] = MI_BATCH_BUFFER_END;
>   			gem_write(fd, handle, 0, buf, sizeof(buf));
> -			obj[2*r + 1].handle = handle;
> +			obj[3*r + 2].handle = handle;
>   
> -			memset(&execbuf, 0, sizeof(execbuf));
> -			execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
> -			execbuf.buffer_count = 2;
> -			execbuf.flags = engines[n % num_engines];
> -			execbuf.rsvd1 = all[n];
>   			gem_execbuf(fd, &execbuf);
>   			gem_close(fd, handle);
>   		}
>   
> -		/* Note we lied about the write-domain when writing from the
> +		/*
> +		 * Note we lied about the write-domain when writing from the
>   		 * GPU (in order to avoid inter-ring synchronisation), so now
>   		 * we have to force the synchronisation here.
>   		 */
>   		gem_set_domain(fd, scratch,
>   			       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> -		for (unsigned n = count - num_ctx; n < count; n++)
> +		for (unsigned int n = count - num_ctx; n < count; n++)
>   			igt_assert_eq(map[n % num_ctx], all[n]);
>   		free(all);
> -		igt_progress(name, loop, MAX_LOOP);
>   	}
>   	munmap(map, size);
>   
> 

Just one thing I failed to figure out - what would be the difference 
from simply putting a write hazard on the scratch page write? Wouldn't 
both guarantee execution in order of execbuf?

Regards,

Tvrtko
Chris Wilson May 14, 2018, 3:10 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-05-14 11:59:09)
> 
> On 14/05/2018 09:02, Chris Wilson wrote:
> > The test wrote to the same dwords from multiple contexts, assuming that
> > the writes would be ordered by its submission. However, as it was using
> > multiple contexts without a write hazard, those timelines are not
> > coupled and the requests may be emitted to hw in any order. So emit a
> > write hazard for each individual dword in the scratch (avoiding the
> > write hazard for the scratch as a whole) to ensure the writes do occur
> > in the expected order.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
> >   1 file changed, 53 insertions(+), 39 deletions(-)
> > 
> > diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
> > index 2cd9cfebf..b25f95f13 100644
> > --- a/tests/gem_ctx_thrash.c
> > +++ b/tests/gem_ctx_thrash.c
> > @@ -90,17 +90,13 @@ static void single(const char *name, bool all_engines)
> >   {
> >       struct drm_i915_gem_exec_object2 *obj;
> >       struct drm_i915_gem_relocation_entry *reloc;
> > -     unsigned engines[16];
> > -     uint64_t size;
> > -     uint32_t *ctx, *map, scratch;
> > -     unsigned num_ctx;
> > -     int fd, gen, num_engines;
> > +     unsigned int engines[16], num_engines, num_ctx;
> > +     uint32_t *ctx, *map, scratch, size;
> > +     int fd, gen;
> >   #define MAX_LOOP 16
> >   
> > -     fd = drm_open_driver_master(DRIVER_INTEL);
> > +     fd = drm_open_driver(DRIVER_INTEL);
> >       igt_require_gem(fd);
> > -     igt_require(gem_can_store_dword(fd, 0));
> > -
> >       gem_require_contexts(fd);
> >   
> >       gen = intel_gen(intel_get_drm_devid(fd));
> > @@ -108,54 +104,77 @@ static void single(const char *name, bool all_engines)
> >       num_engines = 0;
> >       if (all_engines) {
> >               unsigned engine;
> > +
> >               for_each_physical_engine(fd, engine) {
> > +                     if (!gem_can_store_dword(fd, engine))
> > +                             continue;
> > +
> >                       engines[num_engines++] = engine;
> >                       if (num_engines == ARRAY_SIZE(engines))
> >                               break;
> >               }
> > -     } else
> > +     } else {
> > +             igt_require(gem_can_store_dword(fd, 0));
> >               engines[num_engines++] = 0;
> > +     }
> > +     igt_require(num_engines);
> >   
> >       num_ctx = get_num_contexts(fd, num_engines);
> >   
> >       size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
> > -     scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
> > +     scratch = gem_create(fd, size);
> >       gem_set_caching(fd, scratch, I915_CACHING_CACHED);
> > -     obj = calloc(num_ctx, 2 * sizeof(*obj));
> > -     reloc = calloc(num_ctx, sizeof(*reloc));
> > +     obj = calloc(num_ctx, 3 * sizeof(*obj));
> > +     reloc = calloc(num_ctx, 2 * sizeof(*reloc));
> >   
> >       ctx = malloc(num_ctx * sizeof(uint32_t));
> >       igt_assert(ctx);
> >       for (unsigned n = 0; n < num_ctx; n++) {
> >               ctx[n] = gem_context_create(fd);
> > -             obj[2*n + 0].handle = scratch;
> > -
> > -             reloc[n].target_handle = scratch;
> > -             reloc[n].presumed_offset = 0;
> > -             reloc[n].offset = sizeof(uint32_t);
> > -             reloc[n].delta = n * sizeof(uint32_t);
> > -             reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> > -             reloc[n].write_domain = 0; /* lies! */
> > +
> > +             obj[3*n + 0].handle = gem_create(fd, 4096);
> > +             reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
> > +             reloc[2*n + 0].presumed_offset = 0;
> > +             reloc[2*n + 0].offset = 4000;
> > +             reloc[2*n + 0].delta = 0;
> > +             reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
> > +             reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
> > +
> > +             obj[3*n + 1].handle = scratch;
> > +             reloc[2*n + 1].target_handle = scratch;
> > +             reloc[2*n + 1].presumed_offset = 0;
> > +             reloc[2*n + 1].offset = sizeof(uint32_t);
> > +             reloc[2*n + 1].delta = n * sizeof(uint32_t);
> > +             reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
> > +             reloc[2*n + 1].write_domain = 0; /* lies! */
> >               if (gen >= 4 && gen < 8)
> > -                     reloc[n].offset += sizeof(uint32_t);
> > +                     reloc[2*n + 1].offset += sizeof(uint32_t);
> >   
> > -             obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
> > -             obj[2*n + 1].relocation_count = 1;
> > +             obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
> > +             obj[3*n + 2].relocation_count = 2;
> >       }
> >   
> >       map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
> > -     for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> > -             unsigned count = loop * num_ctx;
> > +     for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> > +             const unsigned int count = loop * num_ctx;
> >               uint32_t *all;
> >   
> >               all = malloc(count * sizeof(uint32_t));
> > -             for (unsigned n = 0; n < count; n++)
> > +             for (unsigned int n = 0; n < count; n++)
> >                       all[n] = ctx[n % num_ctx];
> >               igt_permute_array(all, count, xchg_int);
> > -             for (unsigned n = 0; n < count; n++) {
> > -                     struct drm_i915_gem_execbuffer2 execbuf;
> > -                     unsigned r = n % num_ctx;
> > -                     uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
> > +
> > +             for (unsigned int n = 0; n < count; n++) {
> > +                     const unsigned int r = n % num_ctx;
> > +                     struct drm_i915_gem_execbuffer2 execbuf = {
> > +                             .buffers_ptr = to_user_pointer(&obj[3*r]),
> > +                             .buffer_count = 3,
> > +                             .flags = engines[n % num_engines],
> > +                             .rsvd1 = all[n],
> > +                     };
> > +                     uint64_t offset =
> > +                             reloc[2*r + 1].presumed_offset +
> > +                             reloc[2*r + 1].delta;
> >                       uint32_t handle = gem_create(fd, 4096);
> >                       uint32_t buf[16];
> >                       int i;
> > @@ -176,27 +195,22 @@ static void single(const char *name, bool all_engines)
> >                       buf[++i] = all[n];
> >                       buf[++i] = MI_BATCH_BUFFER_END;
> >                       gem_write(fd, handle, 0, buf, sizeof(buf));
> > -                     obj[2*r + 1].handle = handle;
> > +                     obj[3*r + 2].handle = handle;
> >   
> > -                     memset(&execbuf, 0, sizeof(execbuf));
> > -                     execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
> > -                     execbuf.buffer_count = 2;
> > -                     execbuf.flags = engines[n % num_engines];
> > -                     execbuf.rsvd1 = all[n];
> >                       gem_execbuf(fd, &execbuf);
> >                       gem_close(fd, handle);
> >               }
> >   
> > -             /* Note we lied about the write-domain when writing from the
> > +             /*
> > +              * Note we lied about the write-domain when writing from the
> >                * GPU (in order to avoid inter-ring synchronisation), so now
> >                * we have to force the synchronisation here.
> >                */
> >               gem_set_domain(fd, scratch,
> >                              I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> > -             for (unsigned n = count - num_ctx; n < count; n++)
> > +             for (unsigned int n = count - num_ctx; n < count; n++)
> >                       igt_assert_eq(map[n % num_ctx], all[n]);
> >               free(all);
> > -             igt_progress(name, loop, MAX_LOOP);
> >       }
> >       munmap(map, size);
> >   
> > 
> 
> Just one thing I failed to figure out - what would be the difference 
> from simply putting a write hazard on the scratch page write? Wouldn't 
> both guarantee execution in order of execbuf?

That would cause every request to be executed in order and prevent
concurrent execution across engines. The intent of the test is to check
what happens when we run out of GTT space for the context/ring objects,
and part of that is to load up the GPU as much and as wide as possible.

Instead of forcing any ordering, we could just make the scratch big
enough for all writes; but it was such a quirky bug that I wanted to
share the fix. :)
-Chris
Tvrtko Ursulin May 15, 2018, 8:20 a.m. UTC | #3
On 14/05/2018 16:10, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-14 11:59:09)
>>
>> On 14/05/2018 09:02, Chris Wilson wrote:
>>> The test wrote to the same dwords from multiple contexts, assuming that
>>> the writes would be ordered by its submission. However, as it was using
>>> multiple contexts without a write hazard, those timelines are not
>>> coupled and the requests may be emitted to hw in any order. So emit a
>>> write hazard for each individual dword in the scratch (avoiding the
>>> write hazard for the scratch as a whole) to ensure the writes do occur
>>> in the expected order.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
>>>    1 file changed, 53 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
>>> index 2cd9cfebf..b25f95f13 100644
>>> --- a/tests/gem_ctx_thrash.c
>>> +++ b/tests/gem_ctx_thrash.c
>>> @@ -90,17 +90,13 @@ static void single(const char *name, bool all_engines)
>>>    {
>>>        struct drm_i915_gem_exec_object2 *obj;
>>>        struct drm_i915_gem_relocation_entry *reloc;
>>> -     unsigned engines[16];
>>> -     uint64_t size;
>>> -     uint32_t *ctx, *map, scratch;
>>> -     unsigned num_ctx;
>>> -     int fd, gen, num_engines;
>>> +     unsigned int engines[16], num_engines, num_ctx;
>>> +     uint32_t *ctx, *map, scratch, size;
>>> +     int fd, gen;
>>>    #define MAX_LOOP 16
>>>    
>>> -     fd = drm_open_driver_master(DRIVER_INTEL);
>>> +     fd = drm_open_driver(DRIVER_INTEL);
>>>        igt_require_gem(fd);
>>> -     igt_require(gem_can_store_dword(fd, 0));
>>> -
>>>        gem_require_contexts(fd);
>>>    
>>>        gen = intel_gen(intel_get_drm_devid(fd));
>>> @@ -108,54 +104,77 @@ static void single(const char *name, bool all_engines)
>>>        num_engines = 0;
>>>        if (all_engines) {
>>>                unsigned engine;
>>> +
>>>                for_each_physical_engine(fd, engine) {
>>> +                     if (!gem_can_store_dword(fd, engine))
>>> +                             continue;
>>> +
>>>                        engines[num_engines++] = engine;
>>>                        if (num_engines == ARRAY_SIZE(engines))
>>>                                break;
>>>                }
>>> -     } else
>>> +     } else {
>>> +             igt_require(gem_can_store_dword(fd, 0));
>>>                engines[num_engines++] = 0;
>>> +     }
>>> +     igt_require(num_engines);
>>>    
>>>        num_ctx = get_num_contexts(fd, num_engines);
>>>    
>>>        size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
>>> -     scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
>>> +     scratch = gem_create(fd, size);
>>>        gem_set_caching(fd, scratch, I915_CACHING_CACHED);
>>> -     obj = calloc(num_ctx, 2 * sizeof(*obj));
>>> -     reloc = calloc(num_ctx, sizeof(*reloc));
>>> +     obj = calloc(num_ctx, 3 * sizeof(*obj));
>>> +     reloc = calloc(num_ctx, 2 * sizeof(*reloc));
>>>    
>>>        ctx = malloc(num_ctx * sizeof(uint32_t));
>>>        igt_assert(ctx);
>>>        for (unsigned n = 0; n < num_ctx; n++) {
>>>                ctx[n] = gem_context_create(fd);
>>> -             obj[2*n + 0].handle = scratch;
>>> -
>>> -             reloc[n].target_handle = scratch;
>>> -             reloc[n].presumed_offset = 0;
>>> -             reloc[n].offset = sizeof(uint32_t);
>>> -             reloc[n].delta = n * sizeof(uint32_t);
>>> -             reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
>>> -             reloc[n].write_domain = 0; /* lies! */
>>> +
>>> +             obj[3*n + 0].handle = gem_create(fd, 4096);
>>> +             reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
>>> +             reloc[2*n + 0].presumed_offset = 0;
>>> +             reloc[2*n + 0].offset = 4000;
>>> +             reloc[2*n + 0].delta = 0;
>>> +             reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
>>> +             reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
>>> +
>>> +             obj[3*n + 1].handle = scratch;
>>> +             reloc[2*n + 1].target_handle = scratch;
>>> +             reloc[2*n + 1].presumed_offset = 0;
>>> +             reloc[2*n + 1].offset = sizeof(uint32_t);
>>> +             reloc[2*n + 1].delta = n * sizeof(uint32_t);
>>> +             reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
>>> +             reloc[2*n + 1].write_domain = 0; /* lies! */
>>>                if (gen >= 4 && gen < 8)
>>> -                     reloc[n].offset += sizeof(uint32_t);
>>> +                     reloc[2*n + 1].offset += sizeof(uint32_t);
>>>    
>>> -             obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
>>> -             obj[2*n + 1].relocation_count = 1;
>>> +             obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
>>> +             obj[3*n + 2].relocation_count = 2;
>>>        }
>>>    
>>>        map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
>>> -     for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
>>> -             unsigned count = loop * num_ctx;
>>> +     for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
>>> +             const unsigned int count = loop * num_ctx;
>>>                uint32_t *all;
>>>    
>>>                all = malloc(count * sizeof(uint32_t));
>>> -             for (unsigned n = 0; n < count; n++)
>>> +             for (unsigned int n = 0; n < count; n++)
>>>                        all[n] = ctx[n % num_ctx];
>>>                igt_permute_array(all, count, xchg_int);
>>> -             for (unsigned n = 0; n < count; n++) {
>>> -                     struct drm_i915_gem_execbuffer2 execbuf;
>>> -                     unsigned r = n % num_ctx;
>>> -                     uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
>>> +
>>> +             for (unsigned int n = 0; n < count; n++) {
>>> +                     const unsigned int r = n % num_ctx;
>>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
>>> +                             .buffers_ptr = to_user_pointer(&obj[3*r]),
>>> +                             .buffer_count = 3,
>>> +                             .flags = engines[n % num_engines],
>>> +                             .rsvd1 = all[n],
>>> +                     };
>>> +                     uint64_t offset =
>>> +                             reloc[2*r + 1].presumed_offset +
>>> +                             reloc[2*r + 1].delta;
>>>                        uint32_t handle = gem_create(fd, 4096);
>>>                        uint32_t buf[16];
>>>                        int i;
>>> @@ -176,27 +195,22 @@ static void single(const char *name, bool all_engines)
>>>                        buf[++i] = all[n];
>>>                        buf[++i] = MI_BATCH_BUFFER_END;
>>>                        gem_write(fd, handle, 0, buf, sizeof(buf));
>>> -                     obj[2*r + 1].handle = handle;
>>> +                     obj[3*r + 2].handle = handle;
>>>    
>>> -                     memset(&execbuf, 0, sizeof(execbuf));
>>> -                     execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
>>> -                     execbuf.buffer_count = 2;
>>> -                     execbuf.flags = engines[n % num_engines];
>>> -                     execbuf.rsvd1 = all[n];
>>>                        gem_execbuf(fd, &execbuf);
>>>                        gem_close(fd, handle);
>>>                }
>>>    
>>> -             /* Note we lied about the write-domain when writing from the
>>> +             /*
>>> +              * Note we lied about the write-domain when writing from the
>>>                 * GPU (in order to avoid inter-ring synchronisation), so now
>>>                 * we have to force the synchronisation here.
>>>                 */
>>>                gem_set_domain(fd, scratch,
>>>                               I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
>>> -             for (unsigned n = count - num_ctx; n < count; n++)
>>> +             for (unsigned int n = count - num_ctx; n < count; n++)
>>>                        igt_assert_eq(map[n % num_ctx], all[n]);
>>>                free(all);
>>> -             igt_progress(name, loop, MAX_LOOP);
>>>        }
>>>        munmap(map, size);
>>>    
>>>
>>
>> Just one thing I failed to figure out - what would be the difference
>> from simply putting a write hazard on the scratch page write? Wouldn't
>> both guarantee execution in order of execbuf?
> 
> That would cause every request to be executed in order and prevent
> concurrent execution across engines. The intent of the test is to check

True, I forgot the context of the patch - that each context gets its own 
"ordering" bo, while the execbuf loop is separate.

> what happens when we run out of GTT space for the context/ring objects,
> and part of that is to load up the GPU as much and as wide as possible.
> 
> Instead of forcing any ordering, we could just make the scratch big
> enough for all writes; but it was such a quirky bug that I wanted to
> share the fix. :)

However, if the "ordering" bo is one for each context, I still don't 
quite understand what is ordered with the patch which wouldn't normally be.

And also, the verification loop runs after all execbufs have completed - 
so why is the order of execution even important?

Regards,

Tvrtko
Chris Wilson May 15, 2018, 8:29 a.m. UTC | #4
Quoting Tvrtko Ursulin (2018-05-15 09:20:13)
> 
> On 14/05/2018 16:10, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-05-14 11:59:09)
> >>
> >> On 14/05/2018 09:02, Chris Wilson wrote:
> >>> The test wrote to the same dwords from multiple contexts, assuming that
> >>> the writes would be ordered by its submission. However, as it was using
> >>> multiple contexts without a write hazard, those timelines are not
> >>> coupled and the requests may be emitted to hw in any order. So emit a
> >>> write hazard for each individual dword in the scratch (avoiding the
> >>> write hazard for the scratch as a whole) to ensure the writes do occur
> >>> in the expected order.
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
> >>>    1 file changed, 53 insertions(+), 39 deletions(-)
> >>>
> >>> diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
> >>> index 2cd9cfebf..b25f95f13 100644
> >>> --- a/tests/gem_ctx_thrash.c
> >>> +++ b/tests/gem_ctx_thrash.c
> >>> @@ -90,17 +90,13 @@ static void single(const char *name, bool all_engines)
> >>>    {
> >>>        struct drm_i915_gem_exec_object2 *obj;
> >>>        struct drm_i915_gem_relocation_entry *reloc;
> >>> -     unsigned engines[16];
> >>> -     uint64_t size;
> >>> -     uint32_t *ctx, *map, scratch;
> >>> -     unsigned num_ctx;
> >>> -     int fd, gen, num_engines;
> >>> +     unsigned int engines[16], num_engines, num_ctx;
> >>> +     uint32_t *ctx, *map, scratch, size;
> >>> +     int fd, gen;
> >>>    #define MAX_LOOP 16
> >>>    
> >>> -     fd = drm_open_driver_master(DRIVER_INTEL);
> >>> +     fd = drm_open_driver(DRIVER_INTEL);
> >>>        igt_require_gem(fd);
> >>> -     igt_require(gem_can_store_dword(fd, 0));
> >>> -
> >>>        gem_require_contexts(fd);
> >>>    
> >>>        gen = intel_gen(intel_get_drm_devid(fd));
> >>> @@ -108,54 +104,77 @@ static void single(const char *name, bool all_engines)
> >>>        num_engines = 0;
> >>>        if (all_engines) {
> >>>                unsigned engine;
> >>> +
> >>>                for_each_physical_engine(fd, engine) {
> >>> +                     if (!gem_can_store_dword(fd, engine))
> >>> +                             continue;
> >>> +
> >>>                        engines[num_engines++] = engine;
> >>>                        if (num_engines == ARRAY_SIZE(engines))
> >>>                                break;
> >>>                }
> >>> -     } else
> >>> +     } else {
> >>> +             igt_require(gem_can_store_dword(fd, 0));
> >>>                engines[num_engines++] = 0;
> >>> +     }
> >>> +     igt_require(num_engines);
> >>>    
> >>>        num_ctx = get_num_contexts(fd, num_engines);
> >>>    
> >>>        size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
> >>> -     scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
> >>> +     scratch = gem_create(fd, size);
> >>>        gem_set_caching(fd, scratch, I915_CACHING_CACHED);
> >>> -     obj = calloc(num_ctx, 2 * sizeof(*obj));
> >>> -     reloc = calloc(num_ctx, sizeof(*reloc));
> >>> +     obj = calloc(num_ctx, 3 * sizeof(*obj));
> >>> +     reloc = calloc(num_ctx, 2 * sizeof(*reloc));
> >>>    
> >>>        ctx = malloc(num_ctx * sizeof(uint32_t));
> >>>        igt_assert(ctx);
> >>>        for (unsigned n = 0; n < num_ctx; n++) {
> >>>                ctx[n] = gem_context_create(fd);
> >>> -             obj[2*n + 0].handle = scratch;
> >>> -
> >>> -             reloc[n].target_handle = scratch;
> >>> -             reloc[n].presumed_offset = 0;
> >>> -             reloc[n].offset = sizeof(uint32_t);
> >>> -             reloc[n].delta = n * sizeof(uint32_t);
> >>> -             reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> >>> -             reloc[n].write_domain = 0; /* lies! */
> >>> +
> >>> +             obj[3*n + 0].handle = gem_create(fd, 4096);
> >>> +             reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
> >>> +             reloc[2*n + 0].presumed_offset = 0;
> >>> +             reloc[2*n + 0].offset = 4000;
> >>> +             reloc[2*n + 0].delta = 0;
> >>> +             reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
> >>> +             reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
> >>> +
> >>> +             obj[3*n + 1].handle = scratch;
> >>> +             reloc[2*n + 1].target_handle = scratch;
> >>> +             reloc[2*n + 1].presumed_offset = 0;
> >>> +             reloc[2*n + 1].offset = sizeof(uint32_t);
> >>> +             reloc[2*n + 1].delta = n * sizeof(uint32_t);
> >>> +             reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
> >>> +             reloc[2*n + 1].write_domain = 0; /* lies! */
> >>>                if (gen >= 4 && gen < 8)
> >>> -                     reloc[n].offset += sizeof(uint32_t);
> >>> +                     reloc[2*n + 1].offset += sizeof(uint32_t);
> >>>    
> >>> -             obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
> >>> -             obj[2*n + 1].relocation_count = 1;
> >>> +             obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
> >>> +             obj[3*n + 2].relocation_count = 2;
> >>>        }
> >>>    
> >>>        map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
> >>> -     for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> >>> -             unsigned count = loop * num_ctx;
> >>> +     for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
> >>> +             const unsigned int count = loop * num_ctx;
> >>>                uint32_t *all;
> >>>    
> >>>                all = malloc(count * sizeof(uint32_t));
> >>> -             for (unsigned n = 0; n < count; n++)
> >>> +             for (unsigned int n = 0; n < count; n++)
> >>>                        all[n] = ctx[n % num_ctx];
> >>>                igt_permute_array(all, count, xchg_int);
> >>> -             for (unsigned n = 0; n < count; n++) {
> >>> -                     struct drm_i915_gem_execbuffer2 execbuf;
> >>> -                     unsigned r = n % num_ctx;
> >>> -                     uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
> >>> +
> >>> +             for (unsigned int n = 0; n < count; n++) {
> >>> +                     const unsigned int r = n % num_ctx;
> >>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
> >>> +                             .buffers_ptr = to_user_pointer(&obj[3*r]),
> >>> +                             .buffer_count = 3,
> >>> +                             .flags = engines[n % num_engines],
> >>> +                             .rsvd1 = all[n],
> >>> +                     };
> >>> +                     uint64_t offset =
> >>> +                             reloc[2*r + 1].presumed_offset +
> >>> +                             reloc[2*r + 1].delta;
> >>>                        uint32_t handle = gem_create(fd, 4096);
> >>>                        uint32_t buf[16];
> >>>                        int i;
> >>> @@ -176,27 +195,22 @@ static void single(const char *name, bool all_engines)
> >>>                        buf[++i] = all[n];
> >>>                        buf[++i] = MI_BATCH_BUFFER_END;
> >>>                        gem_write(fd, handle, 0, buf, sizeof(buf));
> >>> -                     obj[2*r + 1].handle = handle;
> >>> +                     obj[3*r + 2].handle = handle;
> >>>    
> >>> -                     memset(&execbuf, 0, sizeof(execbuf));
> >>> -                     execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
> >>> -                     execbuf.buffer_count = 2;
> >>> -                     execbuf.flags = engines[n % num_engines];
> >>> -                     execbuf.rsvd1 = all[n];
> >>>                        gem_execbuf(fd, &execbuf);
> >>>                        gem_close(fd, handle);
> >>>                }
> >>>    
> >>> -             /* Note we lied about the write-domain when writing from the
> >>> +             /*
> >>> +              * Note we lied about the write-domain when writing from the
> >>>                 * GPU (in order to avoid inter-ring synchronisation), so now
> >>>                 * we have to force the synchronisation here.
> >>>                 */
> >>>                gem_set_domain(fd, scratch,
> >>>                               I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> >>> -             for (unsigned n = count - num_ctx; n < count; n++)
> >>> +             for (unsigned int n = count - num_ctx; n < count; n++)
> >>>                        igt_assert_eq(map[n % num_ctx], all[n]);
> >>>                free(all);
> >>> -             igt_progress(name, loop, MAX_LOOP);
> >>>        }
> >>>        munmap(map, size);
> >>>    
> >>>
> >>
> >> Just one thing I failed to figure out - what would be the difference
> >> from simply putting a write hazard on the scratch page write? Wouldn't
> >> both guarantee execution in order of execbuf?
> > 
> > That would cause every request to be executed in order and prevent
> > concurrent execution across engines. The intent of the test is to check
> 
> True, I forgot the context of a patch that each context gets its own 
> "ordering" bo, while the execbuf loop is separate.
> 
> > what happens when we run out of GTT space for the context/ring objects,
> > and part of that is to load up the GPU as much and as wide as possible.
> > 
> > Instead of forcing any ordering, we could just make the scratch big
> > enough for all writes; but it was such a quirky bug that I wanted to
> > share the fix. :)
> 
> However, if the "ordering" bo is one for each context, I still don't 
> quite understand what is ordered with the patch which wouldn't normally be.

Not one for each context, one for each dword in the scratch.
 
> And also, the verification loop is after all execbufs have completed - 
> so why is even the order of execution important?

Huh? Many contexts write into a single dword, we only check the last
value of the dword and presume which context wrote its id. We never told
the kernel what order the contexts had to be executed in, so the last
value is unspecified.
-Chris
Tvrtko Ursulin May 15, 2018, 8:45 a.m. UTC | #5
On 15/05/2018 09:29, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-05-15 09:20:13)
>>
>> On 14/05/2018 16:10, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2018-05-14 11:59:09)
>>>>
>>>> On 14/05/2018 09:02, Chris Wilson wrote:
>>>>> The test wrote to the same dwords from multiple contexts, assuming that
>>>>> the writes would be ordered by its submission. However, as it was using
>>>>> multiple contexts without a write hazard, those timelines are not
>>>>> coupled and the requests may be emitted to hw in any order. So emit a
>>>>> write hazard for each individual dword in the scratch (avoiding the
>>>>> write hazard for the scratch as a whole) to ensure the writes do occur
>>>>> in the expected order.
>>>>>
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> ---
>>>>>     tests/gem_ctx_thrash.c | 92 ++++++++++++++++++++++++------------------
>>>>>     1 file changed, 53 insertions(+), 39 deletions(-)
>>>>>
>>>>> diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
>>>>> index 2cd9cfebf..b25f95f13 100644
>>>>> --- a/tests/gem_ctx_thrash.c
>>>>> +++ b/tests/gem_ctx_thrash.c
>>>>> @@ -90,17 +90,13 @@ static void single(const char *name, bool all_engines)
>>>>>     {
>>>>>         struct drm_i915_gem_exec_object2 *obj;
>>>>>         struct drm_i915_gem_relocation_entry *reloc;
>>>>> -     unsigned engines[16];
>>>>> -     uint64_t size;
>>>>> -     uint32_t *ctx, *map, scratch;
>>>>> -     unsigned num_ctx;
>>>>> -     int fd, gen, num_engines;
>>>>> +     unsigned int engines[16], num_engines, num_ctx;
>>>>> +     uint32_t *ctx, *map, scratch, size;
>>>>> +     int fd, gen;
>>>>>     #define MAX_LOOP 16
>>>>>     
>>>>> -     fd = drm_open_driver_master(DRIVER_INTEL);
>>>>> +     fd = drm_open_driver(DRIVER_INTEL);
>>>>>         igt_require_gem(fd);
>>>>> -     igt_require(gem_can_store_dword(fd, 0));
>>>>> -
>>>>>         gem_require_contexts(fd);
>>>>>     
>>>>>         gen = intel_gen(intel_get_drm_devid(fd));
>>>>> @@ -108,54 +104,77 @@ static void single(const char *name, bool all_engines)
>>>>>         num_engines = 0;
>>>>>         if (all_engines) {
>>>>>                 unsigned engine;
>>>>> +
>>>>>                 for_each_physical_engine(fd, engine) {
>>>>> +                     if (!gem_can_store_dword(fd, engine))
>>>>> +                             continue;
>>>>> +
>>>>>                         engines[num_engines++] = engine;
>>>>>                         if (num_engines == ARRAY_SIZE(engines))
>>>>>                                 break;
>>>>>                 }
>>>>> -     } else
>>>>> +     } else {
>>>>> +             igt_require(gem_can_store_dword(fd, 0));
>>>>>                 engines[num_engines++] = 0;
>>>>> +     }
>>>>> +     igt_require(num_engines);
>>>>>     
>>>>>         num_ctx = get_num_contexts(fd, num_engines);
>>>>>     
>>>>>         size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
>>>>> -     scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
>>>>> +     scratch = gem_create(fd, size);
>>>>>         gem_set_caching(fd, scratch, I915_CACHING_CACHED);
>>>>> -     obj = calloc(num_ctx, 2 * sizeof(*obj));
>>>>> -     reloc = calloc(num_ctx, sizeof(*reloc));
>>>>> +     obj = calloc(num_ctx, 3 * sizeof(*obj));
>>>>> +     reloc = calloc(num_ctx, 2 * sizeof(*reloc));
>>>>>     
>>>>>         ctx = malloc(num_ctx * sizeof(uint32_t));
>>>>>         igt_assert(ctx);
>>>>>         for (unsigned n = 0; n < num_ctx; n++) {
>>>>>                 ctx[n] = gem_context_create(fd);
>>>>> -             obj[2*n + 0].handle = scratch;
>>>>> -
>>>>> -             reloc[n].target_handle = scratch;
>>>>> -             reloc[n].presumed_offset = 0;
>>>>> -             reloc[n].offset = sizeof(uint32_t);
>>>>> -             reloc[n].delta = n * sizeof(uint32_t);
>>>>> -             reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
>>>>> -             reloc[n].write_domain = 0; /* lies! */
>>>>> +
>>>>> +             obj[3*n + 0].handle = gem_create(fd, 4096);
>>>>> +             reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
>>>>> +             reloc[2*n + 0].presumed_offset = 0;
>>>>> +             reloc[2*n + 0].offset = 4000;
>>>>> +             reloc[2*n + 0].delta = 0;
>>>>> +             reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
>>>>> +             reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
>>>>> +
>>>>> +             obj[3*n + 1].handle = scratch;
>>>>> +             reloc[2*n + 1].target_handle = scratch;
>>>>> +             reloc[2*n + 1].presumed_offset = 0;
>>>>> +             reloc[2*n + 1].offset = sizeof(uint32_t);
>>>>> +             reloc[2*n + 1].delta = n * sizeof(uint32_t);
>>>>> +             reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
>>>>> +             reloc[2*n + 1].write_domain = 0; /* lies! */
>>>>>                 if (gen >= 4 && gen < 8)
>>>>> -                     reloc[n].offset += sizeof(uint32_t);
>>>>> +                     reloc[2*n + 1].offset += sizeof(uint32_t);
>>>>>     
>>>>> -             obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
>>>>> -             obj[2*n + 1].relocation_count = 1;
>>>>> +             obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
>>>>> +             obj[3*n + 2].relocation_count = 2;
>>>>>         }
>>>>>     
>>>>>         map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
>>>>> -     for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
>>>>> -             unsigned count = loop * num_ctx;
>>>>> +     for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
>>>>> +             const unsigned int count = loop * num_ctx;
>>>>>                 uint32_t *all;
>>>>>     
>>>>>                 all = malloc(count * sizeof(uint32_t));
>>>>> -             for (unsigned n = 0; n < count; n++)
>>>>> +             for (unsigned int n = 0; n < count; n++)
>>>>>                         all[n] = ctx[n % num_ctx];
>>>>>                 igt_permute_array(all, count, xchg_int);
>>>>> -             for (unsigned n = 0; n < count; n++) {
>>>>> -                     struct drm_i915_gem_execbuffer2 execbuf;
>>>>> -                     unsigned r = n % num_ctx;
>>>>> -                     uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
>>>>> +
>>>>> +             for (unsigned int n = 0; n < count; n++) {
>>>>> +                     const unsigned int r = n % num_ctx;
>>>>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
>>>>> +                             .buffers_ptr = to_user_pointer(&obj[3*r]),
>>>>> +                             .buffer_count = 3,
>>>>> +                             .flags = engines[n % num_engines],
>>>>> +                             .rsvd1 = all[n],
>>>>> +                     };
>>>>> +                     uint64_t offset =
>>>>> +                             reloc[2*r + 1].presumed_offset +
>>>>> +                             reloc[2*r + 1].delta;
>>>>>                         uint32_t handle = gem_create(fd, 4096);
>>>>>                         uint32_t buf[16];
>>>>>                         int i;
>>>>> @@ -176,27 +195,22 @@ static void single(const char *name, bool all_engines)
>>>>>                         buf[++i] = all[n];
>>>>>                         buf[++i] = MI_BATCH_BUFFER_END;
>>>>>                         gem_write(fd, handle, 0, buf, sizeof(buf));
>>>>> -                     obj[2*r + 1].handle = handle;
>>>>> +                     obj[3*r + 2].handle = handle;
>>>>>     
>>>>> -                     memset(&execbuf, 0, sizeof(execbuf));
>>>>> -                     execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
>>>>> -                     execbuf.buffer_count = 2;
>>>>> -                     execbuf.flags = engines[n % num_engines];
>>>>> -                     execbuf.rsvd1 = all[n];
>>>>>                         gem_execbuf(fd, &execbuf);
>>>>>                         gem_close(fd, handle);
>>>>>                 }
>>>>>     
>>>>> -             /* Note we lied about the write-domain when writing from the
>>>>> +             /*
>>>>> +              * Note we lied about the write-domain when writing from the
>>>>>                  * GPU (in order to avoid inter-ring synchronisation), so now
>>>>>                  * we have to force the synchronisation here.
>>>>>                  */
>>>>>                 gem_set_domain(fd, scratch,
>>>>>                                I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
>>>>> -             for (unsigned n = count - num_ctx; n < count; n++)
>>>>> +             for (unsigned int n = count - num_ctx; n < count; n++)
>>>>>                         igt_assert_eq(map[n % num_ctx], all[n]);
>>>>>                 free(all);
>>>>> -             igt_progress(name, loop, MAX_LOOP);
>>>>>         }
>>>>>         munmap(map, size);
>>>>>     
>>>>>
>>>>
>>>> Just one thing I failed to figure out - what would be the difference
>>>> from simply putting a write hazard on the scratch page write? Wouldn't
>>>> both guarantee execution in order of execbuf?
>>>
>>> That would cause every request to be executed in order and prevent
>>> concurrent execution across engines. The intent of the test is to check
>>
>> True, I forgot the context of a patch that each context gets its own
>> "ordering" bo, while the execbuf loop is separate.
>>
>>> what happens when we run out of GTT space for the context/ring objects,
>>> and part of that is to load up the GPU as much and as wide as possible.
>>>
>>> Instead of forcing any ordering, we could just make the scratch big
>>> enough for all writes; but it was such a quirky bug that I wanted to
>>> share the fix. :)
>>
>> However, if the "ordering" bo is one for each context, I still don't
>> quite understand what is ordered with the patch which wouldn't normally be.
> 
> Not one for each context, one for each dword in the scratch.

I see gem_context_create and gem_create for the new "ordering" bo in the 
same loop "for n = 0; n < num_ctx". So I conclude number of "ordering" 
bos is equal to number of contexts.

>> And also, the verification loop is after all execbufs have completed -
>> so why is even the order of execution important?
> 
> Huh? Many contexts write into a single dword, we only check the last
> value of the dword and presume which context wrote its id. We never told
> the kernel what order the contexts had to be executed in, so the last
> value is unspecified.

Then in the execbuf loop I saw for each execbuf context being picked 
with "n % num_ctx" and the write destination as "n % num_ctx" as well.

But I missed the igt_permute_array is only on the context list, which 
decouples the two.

I guess case in point why complex test patterns without any high level 
comments on what and how test intends to work suck since now both of us 
lost way too much time for this.

So I guess its fine, but a pain to review since I don't have a compiler 
and a virtual machine in my head.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
diff mbox

Patch

diff --git a/tests/gem_ctx_thrash.c b/tests/gem_ctx_thrash.c
index 2cd9cfebf..b25f95f13 100644
--- a/tests/gem_ctx_thrash.c
+++ b/tests/gem_ctx_thrash.c
@@ -90,17 +90,13 @@  static void single(const char *name, bool all_engines)
 {
 	struct drm_i915_gem_exec_object2 *obj;
 	struct drm_i915_gem_relocation_entry *reloc;
-	unsigned engines[16];
-	uint64_t size;
-	uint32_t *ctx, *map, scratch;
-	unsigned num_ctx;
-	int fd, gen, num_engines;
+	unsigned int engines[16], num_engines, num_ctx;
+	uint32_t *ctx, *map, scratch, size;
+	int fd, gen;
 #define MAX_LOOP 16
 
-	fd = drm_open_driver_master(DRIVER_INTEL);
+	fd = drm_open_driver(DRIVER_INTEL);
 	igt_require_gem(fd);
-	igt_require(gem_can_store_dword(fd, 0));
-
 	gem_require_contexts(fd);
 
 	gen = intel_gen(intel_get_drm_devid(fd));
@@ -108,54 +104,77 @@  static void single(const char *name, bool all_engines)
 	num_engines = 0;
 	if (all_engines) {
 		unsigned engine;
+
 		for_each_physical_engine(fd, engine) {
+			if (!gem_can_store_dword(fd, engine))
+				continue;
+
 			engines[num_engines++] = engine;
 			if (num_engines == ARRAY_SIZE(engines))
 				break;
 		}
-	} else
+	} else {
+		igt_require(gem_can_store_dword(fd, 0));
 		engines[num_engines++] = 0;
+	}
+	igt_require(num_engines);
 
 	num_ctx = get_num_contexts(fd, num_engines);
 
 	size = ALIGN(num_ctx * sizeof(uint32_t), 4096);
-	scratch = gem_create(fd, ALIGN(num_ctx * sizeof(uint32_t), 4096));
+	scratch = gem_create(fd, size);
 	gem_set_caching(fd, scratch, I915_CACHING_CACHED);
-	obj = calloc(num_ctx, 2 * sizeof(*obj));
-	reloc = calloc(num_ctx, sizeof(*reloc));
+	obj = calloc(num_ctx, 3 * sizeof(*obj));
+	reloc = calloc(num_ctx, 2 * sizeof(*reloc));
 
 	ctx = malloc(num_ctx * sizeof(uint32_t));
 	igt_assert(ctx);
 	for (unsigned n = 0; n < num_ctx; n++) {
 		ctx[n] = gem_context_create(fd);
-		obj[2*n + 0].handle = scratch;
-
-		reloc[n].target_handle = scratch;
-		reloc[n].presumed_offset = 0;
-		reloc[n].offset = sizeof(uint32_t);
-		reloc[n].delta = n * sizeof(uint32_t);
-		reloc[n].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
-		reloc[n].write_domain = 0; /* lies! */
+
+		obj[3*n + 0].handle = gem_create(fd, 4096);
+		reloc[2*n + 0].target_handle = obj[3*n + 0].handle;
+		reloc[2*n + 0].presumed_offset = 0;
+		reloc[2*n + 0].offset = 4000;
+		reloc[2*n + 0].delta = 0;
+		reloc[2*n + 0].read_domains = I915_GEM_DOMAIN_RENDER;
+		reloc[2*n + 0].write_domain = I915_GEM_DOMAIN_RENDER;
+
+		obj[3*n + 1].handle = scratch;
+		reloc[2*n + 1].target_handle = scratch;
+		reloc[2*n + 1].presumed_offset = 0;
+		reloc[2*n + 1].offset = sizeof(uint32_t);
+		reloc[2*n + 1].delta = n * sizeof(uint32_t);
+		reloc[2*n + 1].read_domains = I915_GEM_DOMAIN_RENDER;
+		reloc[2*n + 1].write_domain = 0; /* lies! */
 		if (gen >= 4 && gen < 8)
-			reloc[n].offset += sizeof(uint32_t);
+			reloc[2*n + 1].offset += sizeof(uint32_t);
 
-		obj[2*n + 1].relocs_ptr = to_user_pointer(&reloc[n]);
-		obj[2*n + 1].relocation_count = 1;
+		obj[3*n + 2].relocs_ptr = to_user_pointer(&reloc[2*n]);
+		obj[3*n + 2].relocation_count = 2;
 	}
 
 	map = gem_mmap__cpu(fd, scratch, 0, size, PROT_WRITE);
-	for (unsigned loop = 1; loop <= MAX_LOOP; loop <<= 1) {
-		unsigned count = loop * num_ctx;
+	for (unsigned int loop = 1; loop <= MAX_LOOP; loop <<= 1) {
+		const unsigned int count = loop * num_ctx;
 		uint32_t *all;
 
 		all = malloc(count * sizeof(uint32_t));
-		for (unsigned n = 0; n < count; n++)
+		for (unsigned int n = 0; n < count; n++)
 			all[n] = ctx[n % num_ctx];
 		igt_permute_array(all, count, xchg_int);
-		for (unsigned n = 0; n < count; n++) {
-			struct drm_i915_gem_execbuffer2 execbuf;
-			unsigned r = n % num_ctx;
-			uint64_t offset = reloc[r].presumed_offset + reloc[r].delta;
+
+		for (unsigned int n = 0; n < count; n++) {
+			const unsigned int r = n % num_ctx;
+			struct drm_i915_gem_execbuffer2 execbuf = {
+				.buffers_ptr = to_user_pointer(&obj[3*r]),
+				.buffer_count = 3,
+				.flags = engines[n % num_engines],
+				.rsvd1 = all[n],
+			};
+			uint64_t offset =
+				reloc[2*r + 1].presumed_offset +
+				reloc[2*r + 1].delta;
 			uint32_t handle = gem_create(fd, 4096);
 			uint32_t buf[16];
 			int i;
@@ -176,27 +195,22 @@  static void single(const char *name, bool all_engines)
 			buf[++i] = all[n];
 			buf[++i] = MI_BATCH_BUFFER_END;
 			gem_write(fd, handle, 0, buf, sizeof(buf));
-			obj[2*r + 1].handle = handle;
+			obj[3*r + 2].handle = handle;
 
-			memset(&execbuf, 0, sizeof(execbuf));
-			execbuf.buffers_ptr = to_user_pointer(&obj[2*r]);
-			execbuf.buffer_count = 2;
-			execbuf.flags = engines[n % num_engines];
-			execbuf.rsvd1 = all[n];
 			gem_execbuf(fd, &execbuf);
 			gem_close(fd, handle);
 		}
 
-		/* Note we lied about the write-domain when writing from the
+		/*
+		 * Note we lied about the write-domain when writing from the
 		 * GPU (in order to avoid inter-ring synchronisation), so now
 		 * we have to force the synchronisation here.
 		 */
 		gem_set_domain(fd, scratch,
 			       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
-		for (unsigned n = count - num_ctx; n < count; n++)
+		for (unsigned int n = count - num_ctx; n < count; n++)
 			igt_assert_eq(map[n % num_ctx], all[n]);
 		free(all);
-		igt_progress(name, loop, MAX_LOOP);
 	}
 	munmap(map, size);