diff mbox

[RESEND,1/6] drm/i915/selftests: Force a preemption hang

Message ID 20180716080332.32283-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson July 16, 2018, 8:03 a.m. UTC
Inject a failure into preemption completion to pretend as if the HW
didn't successfully handle preemption and we are forced to do a reset in
the middle.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_guc_submission.c |   3 +
 drivers/gpu/drm/i915/intel_lrc.c            |   3 +
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   2 +
 drivers/gpu/drm/i915/selftests/intel_lrc.c  | 101 ++++++++++++++++++++
 4 files changed, 109 insertions(+)

Comments

Tvrtko Ursulin July 16, 2018, 9:08 a.m. UTC | #1
On 16/07/2018 09:03, Chris Wilson wrote:
> Inject a failure into preemption completion to pretend as if the HW
> didn't successfully handle preemption and we are forced to do a reset in
> the middle.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_guc_submission.c |   3 +
>   drivers/gpu/drm/i915/intel_lrc.c            |   3 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h     |   2 +
>   drivers/gpu/drm/i915/selftests/intel_lrc.c  | 101 ++++++++++++++++++++
>   4 files changed, 109 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index cc444dc5f3ad..de57cf6085d1 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -628,6 +628,9 @@ static void complete_preempt_context(struct intel_engine_cs *engine)
>   
>   	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
>   
> +	if (I915_SELFTEST_ONLY(execlists->preempt_hang))
> +		return;
> +
>   	execlists_cancel_port_requests(execlists);
>   	execlists_unwind_incomplete_requests(execlists);
>   
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 8fd8de71c2b5..703b76dcfcd2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -559,6 +559,9 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
>   {
>   	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
>   
> +	if (I915_SELFTEST_ONLY(execlists->preempt_hang))
> +		return;
> +
>   	execlists_cancel_port_requests(execlists);
>   	__unwind_incomplete_requests(container_of(execlists,
>   						  struct intel_engine_cs,
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index d1eee08e5f6b..37a7c819435a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -333,6 +333,8 @@ struct intel_engine_execlists {
>   	 * @csb_head: context status buffer head
>   	 */
>   	u8 csb_head;
> +
> +	I915_SELFTEST_DECLARE(bool preempt_hang;)
>   };
>   
>   #define INTEL_ENGINE_CS_MAX_NAME 8
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 636cb68191e3..d642e78ef145 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -451,12 +451,113 @@ static int live_late_preempt(void *arg)
>   	goto err_ctx_lo;
>   }
>   
> +static int live_preempt_hang(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct i915_gem_context *ctx_hi, *ctx_lo;
> +	struct spinner spin_hi, spin_lo;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	int err = -ENOMEM;
> +
> +	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
> +		return 0;
> +
> +	if (!intel_has_reset_engine(i915))
> +		return 0;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +
> +	if (spinner_init(&spin_hi, i915))
> +		goto err_unlock;
> +
> +	if (spinner_init(&spin_lo, i915))
> +		goto err_spin_hi;
> +
> +	ctx_hi = kernel_context(i915);
> +	if (!ctx_hi)
> +		goto err_spin_lo;
> +	ctx_hi->sched.priority = I915_CONTEXT_MAX_USER_PRIORITY;

Use a priority higher than the maximum user priority, to be more robust
against possible scheduling policy changes in the future?

> +
> +	ctx_lo = kernel_context(i915);
> +	if (!ctx_lo)
> +		goto err_ctx_hi;
> +	ctx_lo->sched.priority = I915_CONTEXT_MIN_USER_PRIORITY;
> +
> +	for_each_engine(engine, i915, id) {
> +		struct i915_request *rq;
> +
> +		rq = spinner_create_request(&spin_lo, ctx_lo, engine,
> +					    MI_ARB_CHECK);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto err_ctx_lo;
> +		}
> +
> +		i915_request_add(rq);
> +		if (!wait_for_spinner(&spin_lo, rq)) {
> +			GEM_TRACE("lo spinner failed to start\n");
> +			GEM_TRACE_DUMP();
> +			i915_gem_set_wedged(i915);
> +			err = -EIO;
> +			goto err_ctx_lo;
> +		}
> +
> +		rq = spinner_create_request(&spin_hi, ctx_hi, engine,
> +					    MI_ARB_CHECK);

This one doesn't need to be preemptible, but it doesn't matter.

> +		if (IS_ERR(rq)) {
> +			spinner_end(&spin_lo);
> +			err = PTR_ERR(rq);
> +			goto err_ctx_lo;
> +		}
> +
> +		engine->execlists.preempt_hang = true;
> +		i915_request_add(rq);

Wait for a timeout while waiting for the preempting spinner to run? Otherwise
we are less sure that the preempt_hang injection worked.

> +
> +		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> +		i915_reset_engine(engine, NULL);
> +		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> +
> +		engine->execlists.preempt_hang = false;
> +
> +		if (!wait_for_spinner(&spin_hi, rq)) {
> +			GEM_TRACE("hi spinner failed to start\n");
> +			GEM_TRACE_DUMP();
> +			i915_gem_set_wedged(i915);
> +			err = -EIO;
> +			goto err_ctx_lo;
> +		}
> +
> +		spinner_end(&spin_hi);
> +		spinner_end(&spin_lo);
> +		if (igt_flush_test(i915, I915_WAIT_LOCKED)) {
> +			err = -EIO;
> +			goto err_ctx_lo;
> +		}
> +	}
> +
> +	err = 0;
> +err_ctx_lo:
> +	kernel_context_close(ctx_lo);
> +err_ctx_hi:
> +	kernel_context_close(ctx_hi);
> +err_spin_lo:
> +	spinner_fini(&spin_lo);
> +err_spin_hi:
> +	spinner_fini(&spin_hi);
> +err_unlock:
> +	igt_flush_test(i915, I915_WAIT_LOCKED);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
>   int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
>   		SUBTEST(live_sanitycheck),
>   		SUBTEST(live_preempt),
>   		SUBTEST(live_late_preempt),
> +		SUBTEST(live_preempt_hang),
>   	};
>   
>   	if (!HAS_EXECLISTS(i915))
> 

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index cc444dc5f3ad..de57cf6085d1 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -628,6 +628,9 @@  static void complete_preempt_context(struct intel_engine_cs *engine)
 
 	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
 
+	if (I915_SELFTEST_ONLY(execlists->preempt_hang))
+		return;
+
 	execlists_cancel_port_requests(execlists);
 	execlists_unwind_incomplete_requests(execlists);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8fd8de71c2b5..703b76dcfcd2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -559,6 +559,9 @@  static void complete_preempt_context(struct intel_engine_execlists *execlists)
 {
 	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
 
+	if (I915_SELFTEST_ONLY(execlists->preempt_hang))
+		return;
+
 	execlists_cancel_port_requests(execlists);
 	__unwind_incomplete_requests(container_of(execlists,
 						  struct intel_engine_cs,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d1eee08e5f6b..37a7c819435a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -333,6 +333,8 @@  struct intel_engine_execlists {
 	 * @csb_head: context status buffer head
 	 */
 	u8 csb_head;
+
+	I915_SELFTEST_DECLARE(bool preempt_hang;)
 };
 
 #define INTEL_ENGINE_CS_MAX_NAME 8
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 636cb68191e3..d642e78ef145 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -451,12 +451,113 @@  static int live_late_preempt(void *arg)
 	goto err_ctx_lo;
 }
 
+static int live_preempt_hang(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct i915_gem_context *ctx_hi, *ctx_lo;
+	struct spinner spin_hi, spin_lo;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err = -ENOMEM;
+
+	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
+		return 0;
+
+	if (!intel_has_reset_engine(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	if (spinner_init(&spin_hi, i915))
+		goto err_unlock;
+
+	if (spinner_init(&spin_lo, i915))
+		goto err_spin_hi;
+
+	ctx_hi = kernel_context(i915);
+	if (!ctx_hi)
+		goto err_spin_lo;
+	ctx_hi->sched.priority = I915_CONTEXT_MAX_USER_PRIORITY;
+
+	ctx_lo = kernel_context(i915);
+	if (!ctx_lo)
+		goto err_ctx_hi;
+	ctx_lo->sched.priority = I915_CONTEXT_MIN_USER_PRIORITY;
+
+	for_each_engine(engine, i915, id) {
+		struct i915_request *rq;
+
+		rq = spinner_create_request(&spin_lo, ctx_lo, engine,
+					    MI_ARB_CHECK);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto err_ctx_lo;
+		}
+
+		i915_request_add(rq);
+		if (!wait_for_spinner(&spin_lo, rq)) {
+			GEM_TRACE("lo spinner failed to start\n");
+			GEM_TRACE_DUMP();
+			i915_gem_set_wedged(i915);
+			err = -EIO;
+			goto err_ctx_lo;
+		}
+
+		rq = spinner_create_request(&spin_hi, ctx_hi, engine,
+					    MI_ARB_CHECK);
+		if (IS_ERR(rq)) {
+			spinner_end(&spin_lo);
+			err = PTR_ERR(rq);
+			goto err_ctx_lo;
+		}
+
+		engine->execlists.preempt_hang = true;
+		i915_request_add(rq);
+
+		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+		i915_reset_engine(engine, NULL);
+		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+
+		engine->execlists.preempt_hang = false;
+
+		if (!wait_for_spinner(&spin_hi, rq)) {
+			GEM_TRACE("hi spinner failed to start\n");
+			GEM_TRACE_DUMP();
+			i915_gem_set_wedged(i915);
+			err = -EIO;
+			goto err_ctx_lo;
+		}
+
+		spinner_end(&spin_hi);
+		spinner_end(&spin_lo);
+		if (igt_flush_test(i915, I915_WAIT_LOCKED)) {
+			err = -EIO;
+			goto err_ctx_lo;
+		}
+	}
+
+	err = 0;
+err_ctx_lo:
+	kernel_context_close(ctx_lo);
+err_ctx_hi:
+	kernel_context_close(ctx_hi);
+err_spin_lo:
+	spinner_fini(&spin_lo);
+err_spin_hi:
+	spinner_fini(&spin_hi);
+err_unlock:
+	igt_flush_test(i915, I915_WAIT_LOCKED);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
 		SUBTEST(live_sanitycheck),
 		SUBTEST(live_preempt),
 		SUBTEST(live_late_preempt),
+		SUBTEST(live_preempt_hang),
 	};
 
 	if (!HAS_EXECLISTS(i915))