| Message ID | 20180716134009.13143-1-chris@chris-wilson.co.uk (mailing list archive) |
|---|---|
| State | New, archived |
On 16/07/2018 14:40, Chris Wilson wrote:
> We must be able to reset the GPU while we are waiting on it to perform
> an eviction (unbinding an active vma). So attach a spinning request to a
> target vma and try to evict it from a thread to see if that blocks
> indefinitely.
>
> v2: Add a wait for the thread to start just in case that takes more than
> 10ms...
>
> v3: complete() not completion_done() to signal the completion.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  .../gpu/drm/i915/selftests/intel_hangcheck.c | 171 +++++++++++++++++-
>  1 file changed, 169 insertions(+), 2 deletions(-)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
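The v2/v3 changelog entries quoted above both concern the handshake between the selftest and the kthread it spawns: the parent must wait until the thread has actually started (v2), and the thread has to signal that with complete(), because completion_done() only reports whether a completion was already signalled and wakes nobody (v3). Below is a minimal, self-contained sketch of that pattern; the function and variable names are illustrative, not taken from the patch.

```c
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>

struct worker_arg {
	struct completion started;	/* signalled once the thread is running */
};

static int worker_fn(void *data)
{
	struct worker_arg *arg = data;

	/*
	 * complete() wakes the waiter; completion_done() would merely
	 * report whether the completion had already been signalled.
	 */
	complete(&arg->started);

	/* ... real work would go here (e.g. attempt an eviction) ... */
	while (!kthread_should_stop())
		msleep(1);

	return 0;
}

static int run_worker(void)
{
	struct worker_arg arg;
	struct task_struct *tsk;

	init_completion(&arg.started);

	tsk = kthread_run(worker_fn, &arg, "example/worker");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* v2 of the patch adds exactly this step: wait for the thread to start. */
	wait_for_completion(&arg.started);

	return kthread_stop(tsk);
}
```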
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 73462a65a330..65d66cdedd26 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -27,6 +27,7 @@
 #include "../i915_selftest.h"
 #include "i915_random.h"
 #include "igt_flush_test.h"
+#include "igt_wedge_me.h"
 
 #include "mock_context.h"
 #include "mock_drm.h"
@@ -921,7 +922,7 @@ static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
 	return reset_count;
 }
 
-static int igt_wait_reset(void *arg)
+static int igt_reset_wait(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
 	struct i915_request *rq;
@@ -995,6 +996,170 @@ static int igt_wait_reset(void *arg)
 	return err;
 }
 
+struct evict_vma {
+	struct completion completion;
+	struct i915_vma *vma;
+};
+
+static int evict_vma(void *data)
+{
+	struct evict_vma *arg = data;
+	struct i915_address_space *vm = arg->vma->vm;
+	struct drm_i915_private *i915 = vm->i915;
+	struct drm_mm_node evict = arg->vma->node;
+	int err;
+
+	complete(&arg->completion);
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = i915_gem_evict_for_node(vm, &evict, 0);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return err;
+}
+
+static int __igt_reset_evict_vma(struct drm_i915_private *i915,
+				 struct i915_address_space *vm)
+{
+	struct drm_i915_gem_object *obj;
+	struct task_struct *tsk = NULL;
+	struct i915_request *rq;
+	struct evict_vma arg;
+	struct hang h;
+	int err;
+
+	if (!intel_engine_can_store_dword(i915->engine[RCS]))
+		return 0;
+
+	/* Check that we can recover an unbind stuck on a hanging request */
+
+	global_reset_lock(i915);
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = hang_init(&h, i915);
+	if (err)
+		goto unlock;
+
+	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto fini;
+	}
+
+	arg.vma = i915_vma_instance(obj, vm, NULL);
+	if (IS_ERR(arg.vma)) {
+		err = PTR_ERR(arg.vma);
+		goto out_obj;
+	}
+
+	rq = hang_create_request(&h, i915->engine[RCS]);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_obj;
+	}
+
+	err = i915_vma_pin(arg.vma, 0, 0,
+			   i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
+	if (err)
+		goto out_obj;
+
+	err = i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
+	i915_vma_unpin(arg.vma);
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+	if (err)
+		goto out_rq;
+
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	if (!wait_until_running(&h, rq)) {
+		struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+		pr_err("%s: Failed to start request %x, at %x\n",
+		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
+		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
+
+		i915_gem_set_wedged(i915);
+		goto out_reset;
+	}
+
+	init_completion(&arg.completion);
+
+	tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
+	if (IS_ERR(tsk)) {
+		err = PTR_ERR(tsk);
+		tsk = NULL;
+		goto out_reset;
+	}
+
+	wait_for_completion(&arg.completion);
+
+	if (wait_for(waitqueue_active(&rq->execute), 10)) {
+		struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+		pr_err("igt/evict_vma kthread did not wait\n");
+		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
+
+		i915_gem_set_wedged(i915);
+		goto out_reset;
+	}
+
+out_reset:
+	fake_hangcheck(rq, intel_engine_flag(rq->engine));
+
+	if (tsk) {
+		struct igt_wedge_me w;
+
+		/* The reset, even indirectly, should take less than 10ms. */
+		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
+			err = kthread_stop(tsk);
+	}
+
+	mutex_lock(&i915->drm.struct_mutex);
+out_rq:
+	i915_request_put(rq);
+out_obj:
+	i915_gem_object_put(obj);
+fini:
+	hang_fini(&h);
+unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	global_reset_unlock(i915);
+
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return -EIO;
+
+	return err;
+}
+
+static int igt_reset_evict_ggtt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+
+	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
+}
+
+static int igt_reset_evict_ppgtt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct i915_gem_context *ctx;
+	int err;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	ctx = kernel_context(i915);
+	mutex_unlock(&i915->drm.struct_mutex);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	err = 0;
+	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
+		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);
+
+	kernel_context_close(ctx);
+	return err;
+}
+
 static int wait_for_others(struct drm_i915_private *i915,
			    struct intel_engine_cs *exclude)
 {
@@ -1240,8 +1405,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(igt_reset_idle_engine),
 		SUBTEST(igt_reset_active_engine),
 		SUBTEST(igt_reset_engines),
-		SUBTEST(igt_wait_reset),
 		SUBTEST(igt_reset_queue),
+		SUBTEST(igt_reset_wait),
+		SUBTEST(igt_reset_evict_ggtt),
+		SUBTEST(igt_reset_evict_ppgtt),
 		SUBTEST(igt_handle_error),
 	};
 	bool saved_hangcheck;
We must be able to reset the GPU while we are waiting on it to perform
an eviction (unbinding an active vma). So attach a spinning request to a
target vma and try to evict it from a thread to see if that blocks
indefinitely.

v2: Add a wait for the thread to start just in case that takes more than
10ms...

v3: complete() not completion_done() to signal the completion.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 .../gpu/drm/i915/selftests/intel_hangcheck.c | 171 +++++++++++++++++-
 1 file changed, 169 insertions(+), 2 deletions(-)
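Stripped of locking, error handling and the igt_wedge_me guard, the flow that the new __igt_reset_evict_vma() follows can be summarised as below. This is a simplified paraphrase of the diff above for readability, not a compilable excerpt of the patch, although every call shown comes from the diff.

```c
static int reset_evict_vma_flow(struct drm_i915_private *i915,
				struct i915_address_space *vm)
{
	struct drm_i915_gem_object *obj;
	struct i915_request *rq;
	struct task_struct *tsk;
	struct evict_vma arg;
	struct hang h;

	hang_init(&h, i915);

	/* 1. Place a page-sized object in the address space under test. */
	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	arg.vma = i915_vma_instance(obj, vm, NULL);

	/* 2. Keep that vma busy with a spinning request that never completes. */
	rq = hang_create_request(&h, i915->engine[RCS]);
	i915_vma_pin(arg.vma, 0, 0,
		     i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
	i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
	i915_vma_unpin(arg.vma);
	i915_request_get(rq);
	i915_request_add(rq);

	/* 3. From a separate thread, try to evict the still-active vma. */
	init_completion(&arg.completion);
	tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
	wait_for_completion(&arg.completion);

	/* 4. Trigger a reset and check that it unblocks the eviction. */
	fake_hangcheck(rq, intel_engine_flag(rq->engine));

	return kthread_stop(tsk);	/* returns evict_vma()'s result */
}
```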