diff mbox series

[i-g-t] i915/gem_exec_balancer: Throw a few hangs into the virtual pipelines

Message ID 20191114191546.149722-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [i-g-t] i915/gem_exec_balancer: Throw a few hangs into the virtual pipelines | expand

Commit Message

Chris Wilson Nov. 14, 2019, 7:15 p.m. UTC
Although a virtual engine itself has no hang detection (that is handled
by the underlying physical engines), it does provide a unique means for
clients to try and break the system. Try to break it before they do.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

Comments

Tvrtko Ursulin Nov. 15, 2019, 1:02 p.m. UTC | #1
On 14/11/2019 19:15, Chris Wilson wrote:
> Although a virtual engine itself has no hang detection; that is on the
> underlying physical engines, it does provide a unique means for clients
> to try and break the system. Try and break it before they do.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
>   1 file changed, 105 insertions(+)
> 
> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> index 70c4529b4..86028cfdd 100644
> --- a/tests/i915/gem_exec_balancer.c
> +++ b/tests/i915/gem_exec_balancer.c
> @@ -24,6 +24,7 @@
>   #include <sched.h>
>   
>   #include "igt.h"
> +#include "igt_gt.h"
>   #include "igt_perf.h"
>   #include "i915/gem_ring.h"
>   #include "sw_sync.h"
> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
>   	gem_quiescent_gpu(i915);
>   }
>   
> +static void set_unbannable(int i915, uint32_t ctx)
> +{
> +	struct drm_i915_gem_context_param p = {
> +		.ctx_id = ctx,
> +		.param = I915_CONTEXT_PARAM_BANNABLE,
> +	};
> +
> +	igt_assert_eq(__gem_context_set_param(i915, &p), 0);
> +}
> +
> +static void hangme(int i915)
> +{
> +	struct drm_i915_gem_exec_object2 batch = {
> +		.handle = batch_create(i915),
> +	};
> +
> +	/*
> +	 * Fill the available engines with hanging virtual engines and verify
> +	 * that execution continues onto the second batch.
> +	 */
> +
> +	for (int class = 1; class < 32; class++) {
> +		struct i915_engine_class_instance *ci;
> +		struct client {
> +			igt_spin_t *spin[2];
> +		} *client;
> +		unsigned int count;
> +		uint32_t bg;
> +
> +		ci = list_engines(i915, 1u << class, &count);
> +		if (!ci)
> +			continue;
> +
> +		if (count < 2) {
> +			free(ci);
> +			continue;
> +		}
> +
> +		client = malloc(sizeof(*client) * count);
> +		igt_assert(client);
> +
> +		for (int i = 0; i < count; i++) {
> +			uint32_t ctx = gem_context_create(i915);
> +			struct client *c = &client[i];
> +			unsigned int flags;
> +
> +			set_unbannable(i915, ctx);
> +			set_load_balancer(i915, ctx, ci, count, NULL);
> +
> +			flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
> +			for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
> +				c->spin[j] = igt_spin_new(i915, ctx,
> +							  .flags = flags);
> +				flags = IGT_SPIN_FENCE_OUT;
> +			}
> +
> +			gem_context_destroy(i915, ctx);
> +		}
> +
> +		/* Apply some background context to speed up hang detection */
> +		bg = gem_context_create(i915);
> +		set_engines(i915, bg, ci, count);
> +		for (int i = 0; i < count; i++) {
> +			struct drm_i915_gem_execbuffer2 execbuf = {
> +				.buffers_ptr = to_user_pointer(&batch),
> +				.buffer_count = 1,
> +				.flags = i,
> +				.rsvd1 = bg,
> +			};
> +			gem_execbuf(i915, &execbuf);
> +		}
> +		gem_context_destroy(i915, bg);
> +
> +		for (int i = 0; i < count; i++) {
> +			struct client *c = &client[i];
> +
> +			igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
> +			gem_sync(i915, c->spin[0]->handle);
> +			igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
> +				      -EIO);
> +
> +			igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
> +			gem_sync(i915, c->spin[1]->handle);
> +			igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
> +				      -EIO);
> +
> +			igt_spin_free(i915, c->spin[0]);
> +			igt_spin_free(i915, c->spin[1]);
> +		}
> +		free(client);
> +	}
> +
> +	gem_close(i915, batch.handle);
> +	gem_quiescent_gpu(i915);
> +}
> +
>   static void smoketest(int i915, int timeout)
>   {
>   	struct drm_i915_gem_exec_object2 batch[2] = {
> @@ -1486,4 +1583,12 @@ igt_main
>   	igt_fixture {
>   		igt_stop_hang_detector();
>   	}
> +
> +	igt_subtest("hang") {
> +		igt_hang_t hang = igt_allow_hang(i915, 0, 0);
> +
> +		hangme(i915);
> +
> +		igt_disallow_hang(i915, hang);
> +	}
>   }
> 

Looks good. But do we need some core helpers to figure out when preempt 
timeout is compiled out?

Regards,

Tvrtko
Chris Wilson Nov. 15, 2019, 1:09 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
> 
> On 14/11/2019 19:15, Chris Wilson wrote:
> > Although a virtual engine itself has no hang detection; that is on the
> > underlying physical engines, it does provide a unique means for clients
> > to try and break the system. Try and break it before they do.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
> >   1 file changed, 105 insertions(+)
> > 
> > diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> > index 70c4529b4..86028cfdd 100644
> > --- a/tests/i915/gem_exec_balancer.c
> > +++ b/tests/i915/gem_exec_balancer.c
> > @@ -24,6 +24,7 @@
> >   #include <sched.h>
> >   
> >   #include "igt.h"
> > +#include "igt_gt.h"
> >   #include "igt_perf.h"
> >   #include "i915/gem_ring.h"
> >   #include "sw_sync.h"
> > @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
> >       gem_quiescent_gpu(i915);
> >   }
> >   
> > +static void set_unbannable(int i915, uint32_t ctx)
> > +{
> > +     struct drm_i915_gem_context_param p = {
> > +             .ctx_id = ctx,
> > +             .param = I915_CONTEXT_PARAM_BANNABLE,
> > +     };
> > +
> > +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
> > +}
> > +
> > +static void hangme(int i915)
> > +{
> > +     struct drm_i915_gem_exec_object2 batch = {
> > +             .handle = batch_create(i915),
> > +     };
> > +
> > +     /*
> > +      * Fill the available engines with hanging virtual engines and verify
> > +      * that execution continues onto the second batch.
> > +      */
> > +
> > +     for (int class = 1; class < 32; class++) {
> > +             struct i915_engine_class_instance *ci;
> > +             struct client {
> > +                     igt_spin_t *spin[2];
> > +             } *client;
> > +             unsigned int count;
> > +             uint32_t bg;
> > +
> > +             ci = list_engines(i915, 1u << class, &count);
> > +             if (!ci)
> > +                     continue;
> > +
> > +             if (count < 2) {
> > +                     free(ci);
> > +                     continue;
> > +             }
> > +
> > +             client = malloc(sizeof(*client) * count);
> > +             igt_assert(client);
> > +
> > +             for (int i = 0; i < count; i++) {
> > +                     uint32_t ctx = gem_context_create(i915);
> > +                     struct client *c = &client[i];
> > +                     unsigned int flags;
> > +
> > +                     set_unbannable(i915, ctx);
> > +                     set_load_balancer(i915, ctx, ci, count, NULL);
> > +
> > +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
> > +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
> > +                             c->spin[j] = igt_spin_new(i915, ctx,
> > +                                                       .flags = flags);
> > +                             flags = IGT_SPIN_FENCE_OUT;
> > +                     }
> > +
> > +                     gem_context_destroy(i915, ctx);
> > +             }
> > +
> > +             /* Apply some background context to speed up hang detection */
> > +             bg = gem_context_create(i915);
> > +             set_engines(i915, bg, ci, count);
> > +             for (int i = 0; i < count; i++) {
> > +                     struct drm_i915_gem_execbuffer2 execbuf = {
> > +                             .buffers_ptr = to_user_pointer(&batch),
> > +                             .buffer_count = 1,
> > +                             .flags = i,
> > +                             .rsvd1 = bg,
> > +                     };
> > +                     gem_execbuf(i915, &execbuf);
> > +             }
> > +             gem_context_destroy(i915, bg);
> > +
> > +             for (int i = 0; i < count; i++) {
> > +                     struct client *c = &client[i];
> > +
> > +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
> > +                     gem_sync(i915, c->spin[0]->handle);
> > +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
> > +                                   -EIO);
> > +
> > +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
> > +                     gem_sync(i915, c->spin[1]->handle);
> > +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
> > +                                   -EIO);
> > +
> > +                     igt_spin_free(i915, c->spin[0]);
> > +                     igt_spin_free(i915, c->spin[1]);
> > +             }
> > +             free(client);
> > +     }
> > +
> > +     gem_close(i915, batch.handle);
> > +     gem_quiescent_gpu(i915);
> > +}
> > +
> >   static void smoketest(int i915, int timeout)
> >   {
> >       struct drm_i915_gem_exec_object2 batch[2] = {
> > @@ -1486,4 +1583,12 @@ igt_main
> >       igt_fixture {
> >               igt_stop_hang_detector();
> >       }
> > +
> > +     igt_subtest("hang") {
> > +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
> > +
> > +             hangme(i915);
> > +
> > +             igt_disallow_hang(i915, hang);
> > +     }
> >   }
> > 
> 
> Looks good. But do we need some core helpers to figure out when preempt 
> timeout is compiled out?

It should still work the same, but slower; 10s hang detection rather
than ~200ms.
-Chris
Tvrtko Ursulin Nov. 15, 2019, 2:52 p.m. UTC | #3
On 15/11/2019 13:09, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
>>
>> On 14/11/2019 19:15, Chris Wilson wrote:
>>> Although a virtual engine itself has no hang detection; that is on the
>>> underlying physical engines, it does provide a unique means for clients
>>> to try and break the system. Try and break it before they do.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
>>>    1 file changed, 105 insertions(+)
>>>
>>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
>>> index 70c4529b4..86028cfdd 100644
>>> --- a/tests/i915/gem_exec_balancer.c
>>> +++ b/tests/i915/gem_exec_balancer.c
>>> @@ -24,6 +24,7 @@
>>>    #include <sched.h>
>>>    
>>>    #include "igt.h"
>>> +#include "igt_gt.h"
>>>    #include "igt_perf.h"
>>>    #include "i915/gem_ring.h"
>>>    #include "sw_sync.h"
>>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
>>>        gem_quiescent_gpu(i915);
>>>    }
>>>    
>>> +static void set_unbannable(int i915, uint32_t ctx)
>>> +{
>>> +     struct drm_i915_gem_context_param p = {
>>> +             .ctx_id = ctx,
>>> +             .param = I915_CONTEXT_PARAM_BANNABLE,
>>> +     };
>>> +
>>> +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
>>> +}
>>> +
>>> +static void hangme(int i915)
>>> +{
>>> +     struct drm_i915_gem_exec_object2 batch = {
>>> +             .handle = batch_create(i915),
>>> +     };
>>> +
>>> +     /*
>>> +      * Fill the available engines with hanging virtual engines and verify
>>> +      * that execution continues onto the second batch.
>>> +      */
>>> +
>>> +     for (int class = 1; class < 32; class++) {
>>> +             struct i915_engine_class_instance *ci;
>>> +             struct client {
>>> +                     igt_spin_t *spin[2];
>>> +             } *client;
>>> +             unsigned int count;
>>> +             uint32_t bg;
>>> +
>>> +             ci = list_engines(i915, 1u << class, &count);
>>> +             if (!ci)
>>> +                     continue;
>>> +
>>> +             if (count < 2) {
>>> +                     free(ci);
>>> +                     continue;
>>> +             }
>>> +
>>> +             client = malloc(sizeof(*client) * count);
>>> +             igt_assert(client);
>>> +
>>> +             for (int i = 0; i < count; i++) {
>>> +                     uint32_t ctx = gem_context_create(i915);
>>> +                     struct client *c = &client[i];
>>> +                     unsigned int flags;
>>> +
>>> +                     set_unbannable(i915, ctx);
>>> +                     set_load_balancer(i915, ctx, ci, count, NULL);
>>> +
>>> +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
>>> +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
>>> +                             c->spin[j] = igt_spin_new(i915, ctx,
>>> +                                                       .flags = flags);
>>> +                             flags = IGT_SPIN_FENCE_OUT;
>>> +                     }
>>> +
>>> +                     gem_context_destroy(i915, ctx);
>>> +             }
>>> +
>>> +             /* Apply some background context to speed up hang detection */
>>> +             bg = gem_context_create(i915);
>>> +             set_engines(i915, bg, ci, count);
>>> +             for (int i = 0; i < count; i++) {
>>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
>>> +                             .buffers_ptr = to_user_pointer(&batch),
>>> +                             .buffer_count = 1,
>>> +                             .flags = i,
>>> +                             .rsvd1 = bg,
>>> +                     };
>>> +                     gem_execbuf(i915, &execbuf);
>>> +             }
>>> +             gem_context_destroy(i915, bg);
>>> +
>>> +             for (int i = 0; i < count; i++) {
>>> +                     struct client *c = &client[i];
>>> +
>>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
>>> +                     gem_sync(i915, c->spin[0]->handle);
>>> +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
>>> +                                   -EIO);
>>> +
>>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
>>> +                     gem_sync(i915, c->spin[1]->handle);
>>> +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
>>> +                                   -EIO);
>>> +
>>> +                     igt_spin_free(i915, c->spin[0]);
>>> +                     igt_spin_free(i915, c->spin[1]);
>>> +             }
>>> +             free(client);
>>> +     }
>>> +
>>> +     gem_close(i915, batch.handle);
>>> +     gem_quiescent_gpu(i915);
>>> +}
>>> +
>>>    static void smoketest(int i915, int timeout)
>>>    {
>>>        struct drm_i915_gem_exec_object2 batch[2] = {
>>> @@ -1486,4 +1583,12 @@ igt_main
>>>        igt_fixture {
>>>                igt_stop_hang_detector();
>>>        }
>>> +
>>> +     igt_subtest("hang") {
>>> +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
>>> +
>>> +             hangme(i915);
>>> +
>>> +             igt_disallow_hang(i915, hang);
>>> +     }
>>>    }
>>>
>>
>> Looks good. But do we need some core helpers to figure out when preempt
>> timeout is compiled out?
> 
> It should still work the same, but slower; 10s hang detection rather
> than ~200ms.

Are you talking about the old hangcheck? I was thinking about all the new
Kconfig options being compiled out: no heartbeats, no preemption timeout.
Does it still work?

Regards,

Tvrtko
Chris Wilson Nov. 15, 2019, 2:58 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-11-15 14:52:16)
> 
> On 15/11/2019 13:09, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
> >>
> >> On 14/11/2019 19:15, Chris Wilson wrote:
> >>> Although a virtual engine itself has no hang detection; that is on the
> >>> underlying physical engines, it does provide a unique means for clients
> >>> to try and break the system. Try and break it before they do.
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> ---
> >>>    tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
> >>>    1 file changed, 105 insertions(+)
> >>>
> >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> >>> index 70c4529b4..86028cfdd 100644
> >>> --- a/tests/i915/gem_exec_balancer.c
> >>> +++ b/tests/i915/gem_exec_balancer.c
> >>> @@ -24,6 +24,7 @@
> >>>    #include <sched.h>
> >>>    
> >>>    #include "igt.h"
> >>> +#include "igt_gt.h"
> >>>    #include "igt_perf.h"
> >>>    #include "i915/gem_ring.h"
> >>>    #include "sw_sync.h"
> >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
> >>>        gem_quiescent_gpu(i915);
> >>>    }
> >>>    
> >>> +static void set_unbannable(int i915, uint32_t ctx)
> >>> +{
> >>> +     struct drm_i915_gem_context_param p = {
> >>> +             .ctx_id = ctx,
> >>> +             .param = I915_CONTEXT_PARAM_BANNABLE,
> >>> +     };
> >>> +
> >>> +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
> >>> +}
> >>> +
> >>> +static void hangme(int i915)
> >>> +{
> >>> +     struct drm_i915_gem_exec_object2 batch = {
> >>> +             .handle = batch_create(i915),
> >>> +     };
> >>> +
> >>> +     /*
> >>> +      * Fill the available engines with hanging virtual engines and verify
> >>> +      * that execution continues onto the second batch.
> >>> +      */
> >>> +
> >>> +     for (int class = 1; class < 32; class++) {
> >>> +             struct i915_engine_class_instance *ci;
> >>> +             struct client {
> >>> +                     igt_spin_t *spin[2];
> >>> +             } *client;
> >>> +             unsigned int count;
> >>> +             uint32_t bg;
> >>> +
> >>> +             ci = list_engines(i915, 1u << class, &count);
> >>> +             if (!ci)
> >>> +                     continue;
> >>> +
> >>> +             if (count < 2) {
> >>> +                     free(ci);
> >>> +                     continue;
> >>> +             }
> >>> +
> >>> +             client = malloc(sizeof(*client) * count);
> >>> +             igt_assert(client);
> >>> +
> >>> +             for (int i = 0; i < count; i++) {
> >>> +                     uint32_t ctx = gem_context_create(i915);
> >>> +                     struct client *c = &client[i];
> >>> +                     unsigned int flags;
> >>> +
> >>> +                     set_unbannable(i915, ctx);
> >>> +                     set_load_balancer(i915, ctx, ci, count, NULL);
> >>> +
> >>> +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
> >>> +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
> >>> +                             c->spin[j] = igt_spin_new(i915, ctx,
> >>> +                                                       .flags = flags);
> >>> +                             flags = IGT_SPIN_FENCE_OUT;
> >>> +                     }
> >>> +
> >>> +                     gem_context_destroy(i915, ctx);
> >>> +             }
> >>> +
> >>> +             /* Apply some background context to speed up hang detection */
> >>> +             bg = gem_context_create(i915);
> >>> +             set_engines(i915, bg, ci, count);
> >>> +             for (int i = 0; i < count; i++) {
> >>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
> >>> +                             .buffers_ptr = to_user_pointer(&batch),
> >>> +                             .buffer_count = 1,
> >>> +                             .flags = i,
> >>> +                             .rsvd1 = bg,
> >>> +                     };
> >>> +                     gem_execbuf(i915, &execbuf);
> >>> +             }
> >>> +             gem_context_destroy(i915, bg);
> >>> +
> >>> +             for (int i = 0; i < count; i++) {
> >>> +                     struct client *c = &client[i];
> >>> +
> >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
> >>> +                     gem_sync(i915, c->spin[0]->handle);
> >>> +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
> >>> +                                   -EIO);
> >>> +
> >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
> >>> +                     gem_sync(i915, c->spin[1]->handle);
> >>> +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
> >>> +                                   -EIO);
> >>> +
> >>> +                     igt_spin_free(i915, c->spin[0]);
> >>> +                     igt_spin_free(i915, c->spin[1]);
> >>> +             }
> >>> +             free(client);
> >>> +     }
> >>> +
> >>> +     gem_close(i915, batch.handle);
> >>> +     gem_quiescent_gpu(i915);
> >>> +}
> >>> +
> >>>    static void smoketest(int i915, int timeout)
> >>>    {
> >>>        struct drm_i915_gem_exec_object2 batch[2] = {
> >>> @@ -1486,4 +1583,12 @@ igt_main
> >>>        igt_fixture {
> >>>                igt_stop_hang_detector();
> >>>        }
> >>> +
> >>> +     igt_subtest("hang") {
> >>> +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
> >>> +
> >>> +             hangme(i915);
> >>> +
> >>> +             igt_disallow_hang(i915, hang);
> >>> +     }
> >>>    }
> >>>
> >>
> >> Looks good. But do we need some core helpers to figure out when preempt
> >> timeout is compiled out?
> > 
> > It should still work the same, but slower; 10s hang detection rather
> > than ~200ms.
> 
> You are talking about old hangcheck? I was thinking about all new 
> Kconfig's compiled out. No heartbeats, no preemption timeout. Still works?

Works even faster. :)

The spinners then get killed when the contexts are closed (default is
non-persistent contexts if you disable heartbeats entirely). The
challenge is really on the per-engine heartbeat controls to make sure we
kick off the dead contexts, but that's for the future.
-Chris
Chris Wilson Nov. 15, 2019, 2:59 p.m. UTC | #5
Quoting Chris Wilson (2019-11-15 14:58:00)
> Quoting Tvrtko Ursulin (2019-11-15 14:52:16)
> > 
> > On 15/11/2019 13:09, Chris Wilson wrote:
> > > Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
> > >>
> > >> On 14/11/2019 19:15, Chris Wilson wrote:
> > >>> Although a virtual engine itself has no hang detection; that is on the
> > >>> underlying physical engines, it does provide a unique means for clients
> > >>> to try and break the system. Try and break it before they do.
> > >>>
> > >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > >>> ---
> > >>>    tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
> > >>>    1 file changed, 105 insertions(+)
> > >>>
> > >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> > >>> index 70c4529b4..86028cfdd 100644
> > >>> --- a/tests/i915/gem_exec_balancer.c
> > >>> +++ b/tests/i915/gem_exec_balancer.c
> > >>> @@ -24,6 +24,7 @@
> > >>>    #include <sched.h>
> > >>>    
> > >>>    #include "igt.h"
> > >>> +#include "igt_gt.h"
> > >>>    #include "igt_perf.h"
> > >>>    #include "i915/gem_ring.h"
> > >>>    #include "sw_sync.h"
> > >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
> > >>>        gem_quiescent_gpu(i915);
> > >>>    }
> > >>>    
> > >>> +static void set_unbannable(int i915, uint32_t ctx)
> > >>> +{
> > >>> +     struct drm_i915_gem_context_param p = {
> > >>> +             .ctx_id = ctx,
> > >>> +             .param = I915_CONTEXT_PARAM_BANNABLE,
> > >>> +     };
> > >>> +
> > >>> +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
> > >>> +}
> > >>> +
> > >>> +static void hangme(int i915)
> > >>> +{
> > >>> +     struct drm_i915_gem_exec_object2 batch = {
> > >>> +             .handle = batch_create(i915),
> > >>> +     };
> > >>> +
> > >>> +     /*
> > >>> +      * Fill the available engines with hanging virtual engines and verify
> > >>> +      * that execution continues onto the second batch.
> > >>> +      */
> > >>> +
> > >>> +     for (int class = 1; class < 32; class++) {
> > >>> +             struct i915_engine_class_instance *ci;
> > >>> +             struct client {
> > >>> +                     igt_spin_t *spin[2];
> > >>> +             } *client;
> > >>> +             unsigned int count;
> > >>> +             uint32_t bg;
> > >>> +
> > >>> +             ci = list_engines(i915, 1u << class, &count);
> > >>> +             if (!ci)
> > >>> +                     continue;
> > >>> +
> > >>> +             if (count < 2) {
> > >>> +                     free(ci);
> > >>> +                     continue;
> > >>> +             }
> > >>> +
> > >>> +             client = malloc(sizeof(*client) * count);
> > >>> +             igt_assert(client);
> > >>> +
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     uint32_t ctx = gem_context_create(i915);
> > >>> +                     struct client *c = &client[i];
> > >>> +                     unsigned int flags;
> > >>> +
> > >>> +                     set_unbannable(i915, ctx);
> > >>> +                     set_load_balancer(i915, ctx, ci, count, NULL);
> > >>> +
> > >>> +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
> > >>> +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
> > >>> +                             c->spin[j] = igt_spin_new(i915, ctx,
> > >>> +                                                       .flags = flags);
> > >>> +                             flags = IGT_SPIN_FENCE_OUT;
> > >>> +                     }
> > >>> +
> > >>> +                     gem_context_destroy(i915, ctx);
> > >>> +             }
> > >>> +
> > >>> +             /* Apply some background context to speed up hang detection */
> > >>> +             bg = gem_context_create(i915);
> > >>> +             set_engines(i915, bg, ci, count);
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
> > >>> +                             .buffers_ptr = to_user_pointer(&batch),
> > >>> +                             .buffer_count = 1,
> > >>> +                             .flags = i,
> > >>> +                             .rsvd1 = bg,
> > >>> +                     };
> > >>> +                     gem_execbuf(i915, &execbuf);
> > >>> +             }
> > >>> +             gem_context_destroy(i915, bg);
> > >>> +
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     struct client *c = &client[i];
> > >>> +
> > >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
> > >>> +                     gem_sync(i915, c->spin[0]->handle);
> > >>> +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
> > >>> +                                   -EIO);
> > >>> +
> > >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
> > >>> +                     gem_sync(i915, c->spin[1]->handle);
> > >>> +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
> > >>> +                                   -EIO);
> > >>> +
> > >>> +                     igt_spin_free(i915, c->spin[0]);
> > >>> +                     igt_spin_free(i915, c->spin[1]);
> > >>> +             }
> > >>> +             free(client);
> > >>> +     }
> > >>> +
> > >>> +     gem_close(i915, batch.handle);
> > >>> +     gem_quiescent_gpu(i915);
> > >>> +}
> > >>> +
> > >>>    static void smoketest(int i915, int timeout)
> > >>>    {
> > >>>        struct drm_i915_gem_exec_object2 batch[2] = {
> > >>> @@ -1486,4 +1583,12 @@ igt_main
> > >>>        igt_fixture {
> > >>>                igt_stop_hang_detector();
> > >>>        }
> > >>> +
> > >>> +     igt_subtest("hang") {
> > >>> +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
> > >>> +
> > >>> +             hangme(i915);
> > >>> +
> > >>> +             igt_disallow_hang(i915, hang);
> > >>> +     }
> > >>>    }
> > >>>
> > >>
> > >> Looks good. But do we need some core helpers to figure out when preempt
> > >> timeout is compiled out?
> > > 
> > > It should still work the same, but slower; 10s hang detection rather
> > > than ~200ms.
> > 
> > You are talking about old hangcheck? I was thinking about all new 
> > Kconfig's compiled out. No heartbeats, no preemption timeout. Still works?
> 
> Works even faster. :)
> 
> The spinners then get killed when the contexts are closed (default is
> non-persistent contexts if you disable heartbeats entirely). The
> challenge is really on the per-engine heartbeat controls to make sure we
> kick off the dead contexts, but that's for the future.

And for the other kconfig, with no preemption timeout, you just get
regular heartbeats, so roughly the 10s hangcheck timeout.
-Chris
Tvrtko Ursulin Nov. 15, 2019, 3:26 p.m. UTC | #6
On 15/11/2019 14:59, Chris Wilson wrote:
> Quoting Chris Wilson (2019-11-15 14:58:00)
>> Quoting Tvrtko Ursulin (2019-11-15 14:52:16)
>>>
>>> On 15/11/2019 13:09, Chris Wilson wrote:
>>>> Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
>>>>>
>>>>> On 14/11/2019 19:15, Chris Wilson wrote:
>>>>>> Although a virtual engine itself has no hang detection; that is on the
>>>>>> underlying physical engines, it does provide a unique means for clients
>>>>>> to try and break the system. Try and break it before they do.
>>>>>>
>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>> ---
>>>>>>     tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
>>>>>>     1 file changed, 105 insertions(+)
>>>>>>
>>>>>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
>>>>>> index 70c4529b4..86028cfdd 100644
>>>>>> --- a/tests/i915/gem_exec_balancer.c
>>>>>> +++ b/tests/i915/gem_exec_balancer.c
>>>>>> @@ -24,6 +24,7 @@
>>>>>>     #include <sched.h>
>>>>>>     
>>>>>>     #include "igt.h"
>>>>>> +#include "igt_gt.h"
>>>>>>     #include "igt_perf.h"
>>>>>>     #include "i915/gem_ring.h"
>>>>>>     #include "sw_sync.h"
>>>>>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
>>>>>>         gem_quiescent_gpu(i915);
>>>>>>     }
>>>>>>     
>>>>>> +static void set_unbannable(int i915, uint32_t ctx)
>>>>>> +{
>>>>>> +     struct drm_i915_gem_context_param p = {
>>>>>> +             .ctx_id = ctx,
>>>>>> +             .param = I915_CONTEXT_PARAM_BANNABLE,
>>>>>> +     };
>>>>>> +
>>>>>> +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
>>>>>> +}
>>>>>> +
>>>>>> +static void hangme(int i915)
>>>>>> +{
>>>>>> +     struct drm_i915_gem_exec_object2 batch = {
>>>>>> +             .handle = batch_create(i915),
>>>>>> +     };
>>>>>> +
>>>>>> +     /*
>>>>>> +      * Fill the available engines with hanging virtual engines and verify
>>>>>> +      * that execution continues onto the second batch.
>>>>>> +      */
>>>>>> +
>>>>>> +     for (int class = 1; class < 32; class++) {
>>>>>> +             struct i915_engine_class_instance *ci;
>>>>>> +             struct client {
>>>>>> +                     igt_spin_t *spin[2];
>>>>>> +             } *client;
>>>>>> +             unsigned int count;
>>>>>> +             uint32_t bg;
>>>>>> +
>>>>>> +             ci = list_engines(i915, 1u << class, &count);
>>>>>> +             if (!ci)
>>>>>> +                     continue;
>>>>>> +
>>>>>> +             if (count < 2) {
>>>>>> +                     free(ci);
>>>>>> +                     continue;
>>>>>> +             }
>>>>>> +
>>>>>> +             client = malloc(sizeof(*client) * count);
>>>>>> +             igt_assert(client);
>>>>>> +
>>>>>> +             for (int i = 0; i < count; i++) {
>>>>>> +                     uint32_t ctx = gem_context_create(i915);
>>>>>> +                     struct client *c = &client[i];
>>>>>> +                     unsigned int flags;
>>>>>> +
>>>>>> +                     set_unbannable(i915, ctx);
>>>>>> +                     set_load_balancer(i915, ctx, ci, count, NULL);
>>>>>> +
>>>>>> +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
>>>>>> +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
>>>>>> +                             c->spin[j] = igt_spin_new(i915, ctx,
>>>>>> +                                                       .flags = flags);
>>>>>> +                             flags = IGT_SPIN_FENCE_OUT;
>>>>>> +                     }
>>>>>> +
>>>>>> +                     gem_context_destroy(i915, ctx);
>>>>>> +             }
>>>>>> +
>>>>>> +             /* Apply some background context to speed up hang detection */
>>>>>> +             bg = gem_context_create(i915);
>>>>>> +             set_engines(i915, bg, ci, count);
>>>>>> +             for (int i = 0; i < count; i++) {
>>>>>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
>>>>>> +                             .buffers_ptr = to_user_pointer(&batch),
>>>>>> +                             .buffer_count = 1,
>>>>>> +                             .flags = i,
>>>>>> +                             .rsvd1 = bg,
>>>>>> +                     };
>>>>>> +                     gem_execbuf(i915, &execbuf);
>>>>>> +             }
>>>>>> +             gem_context_destroy(i915, bg);
>>>>>> +
>>>>>> +             for (int i = 0; i < count; i++) {
>>>>>> +                     struct client *c = &client[i];
>>>>>> +
>>>>>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
>>>>>> +                     gem_sync(i915, c->spin[0]->handle);
>>>>>> +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
>>>>>> +                                   -EIO);
>>>>>> +
>>>>>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
>>>>>> +                     gem_sync(i915, c->spin[1]->handle);
>>>>>> +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
>>>>>> +                                   -EIO);
>>>>>> +
>>>>>> +                     igt_spin_free(i915, c->spin[0]);
>>>>>> +                     igt_spin_free(i915, c->spin[1]);
>>>>>> +             }
>>>>>> +             free(client);
>>>>>> +     }
>>>>>> +
>>>>>> +     gem_close(i915, batch.handle);
>>>>>> +     gem_quiescent_gpu(i915);
>>>>>> +}
>>>>>> +
>>>>>>     static void smoketest(int i915, int timeout)
>>>>>>     {
>>>>>>         struct drm_i915_gem_exec_object2 batch[2] = {
>>>>>> @@ -1486,4 +1583,12 @@ igt_main
>>>>>>         igt_fixture {
>>>>>>                 igt_stop_hang_detector();
>>>>>>         }
>>>>>> +
>>>>>> +     igt_subtest("hang") {
>>>>>> +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
>>>>>> +
>>>>>> +             hangme(i915);
>>>>>> +
>>>>>> +             igt_disallow_hang(i915, hang);
>>>>>> +     }
>>>>>>     }
>>>>>>
>>>>>
>>>>> Looks good. But do we need some core helpers to figure out when the
>>>>> preempt timeout is compiled out?
>>>>
>>>> It should still work the same, but slower; 10s hang detection rather
>>>> than ~200ms.
>>>
>>> You are talking about the old hangcheck? I was thinking about all the new
>>> Kconfig options compiled out. No heartbeats, no preemption timeout. Still works?
>>
>> Works even faster. :)
>>
>> The spinners then get killed when the contexts are closed (default is
>> non-persistent contexts if you disable heartbeats entirely). The
>> challenge is really on the per-engine heartbeat controls to make sure we
>> kick off the dead contexts, but that's for the future.
> 
> And for the other kconfig, with no preemption timeout, you just get
> regular heartbeats, so roughly the 10s hangcheck timeout.

Good then. No other opens:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
index 70c4529b4..86028cfdd 100644
--- a/tests/i915/gem_exec_balancer.c
+++ b/tests/i915/gem_exec_balancer.c
@@ -24,6 +24,7 @@ 
 #include <sched.h>
 
 #include "igt.h"
+#include "igt_gt.h"
 #include "igt_perf.h"
 #include "i915/gem_ring.h"
 #include "sw_sync.h"
@@ -1314,6 +1315,102 @@  static void semaphore(int i915)
 	gem_quiescent_gpu(i915);
 }
 
+static void set_unbannable(int i915, uint32_t ctx)
+{
+	struct drm_i915_gem_context_param p = {
+		.ctx_id = ctx,
+		.param = I915_CONTEXT_PARAM_BANNABLE,
+	};
+
+	igt_assert_eq(__gem_context_set_param(i915, &p), 0);
+}
+
+static void hangme(int i915)
+{
+	struct drm_i915_gem_exec_object2 batch = {
+		.handle = batch_create(i915),
+	};
+
+	/*
+	 * Fill the available engines with hanging virtual engines and verify
+	 * that execution continues onto the second batch.
+	 */
+
+	for (int class = 1; class < 32; class++) {
+		struct i915_engine_class_instance *ci;
+		struct client {
+			igt_spin_t *spin[2];
+		} *client;
+		unsigned int count;
+		uint32_t bg;
+
+		ci = list_engines(i915, 1u << class, &count);
+		if (!ci)
+			continue;
+
+		if (count < 2) {
+			free(ci);
+			continue;
+		}
+
+		client = malloc(sizeof(*client) * count);
+		igt_assert(client);
+
+		for (int i = 0; i < count; i++) {
+			uint32_t ctx = gem_context_create(i915);
+			struct client *c = &client[i];
+			unsigned int flags;
+
+			set_unbannable(i915, ctx);
+			set_load_balancer(i915, ctx, ci, count, NULL);
+
+			flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
+			for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
+				c->spin[j] = igt_spin_new(i915, ctx,
+							  .flags = flags);
+				flags = IGT_SPIN_FENCE_OUT;
+			}
+
+			gem_context_destroy(i915, ctx);
+		}
+
+		/* Apply some background context to speed up hang detection */
+		bg = gem_context_create(i915);
+		set_engines(i915, bg, ci, count);
+		for (int i = 0; i < count; i++) {
+			struct drm_i915_gem_execbuffer2 execbuf = {
+				.buffers_ptr = to_user_pointer(&batch),
+				.buffer_count = 1,
+				.flags = i,
+				.rsvd1 = bg,
+			};
+			gem_execbuf(i915, &execbuf);
+		}
+		gem_context_destroy(i915, bg);
+
+		for (int i = 0; i < count; i++) {
+			struct client *c = &client[i];
+
+			igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
+			gem_sync(i915, c->spin[0]->handle);
+			igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
+				      -EIO);
+
+			igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
+			gem_sync(i915, c->spin[1]->handle);
+			igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
+				      -EIO);
+
+			igt_spin_free(i915, c->spin[0]);
+			igt_spin_free(i915, c->spin[1]);
+		}
+		free(client);
+	}
+
+	gem_close(i915, batch.handle);
+	gem_quiescent_gpu(i915);
+}
+
 static void smoketest(int i915, int timeout)
 {
 	struct drm_i915_gem_exec_object2 batch[2] = {
@@ -1486,4 +1583,12 @@  igt_main
 	igt_fixture {
 		igt_stop_hang_detector();
 	}
+
+	igt_subtest("hang") {
+		igt_hang_t hang = igt_allow_hang(i915, 0, 0);
+
+		hangme(i915);
+
+		igt_disallow_hang(i915, hang);
+	}
 }