diff mbox series

[19/24] drm/i915/selftests: Be a little more lenient for reset workers

Message ID 20200228082330.2411941-19-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/24] drm/i915/gt: Check engine-is-awake on reset later | expand

Commit Message

Chris Wilson Feb. 28, 2020, 8:23 a.m. UTC
Give the reset worker a kick before losing hope when waiting for hang
recovery, as the CPU scheduler is a little unreliable.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 74 ++++++++++++++++++--------
 1 file changed, 52 insertions(+), 22 deletions(-)

Comments

Mika Kuoppala Feb. 28, 2020, 3:38 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Give the reset worker a kick before losing hope when waiting for hang
> recovery, as the CPU scheduler is a little unreliable.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 74 ++++++++++++++++++--------
>  1 file changed, 52 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 95da6b880e3f..af5b3da6d894 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -90,6 +90,48 @@ static int wait_for_submit(struct intel_engine_cs *engine,
>  	return -ETIME;
>  }
>  
> +static int wait_for_reset(struct intel_engine_cs *engine,
> +			  struct i915_request *rq,
> +			  unsigned long timeout)
> +{
> +	timeout += jiffies;
> +	do {
> +		cond_resched();
> +		intel_engine_flush_submission(engine);
> +
> +		if (READ_ONCE(engine->execlists.pending[0]))
> +			continue;
> +
> +		if (i915_request_completed(rq))
> +			break;
> +
> +		if (READ_ONCE(rq->fence.error))
> +			break;
> +	} while (time_before(jiffies, timeout));
> +
> +	flush_scheduled_work();
> +
> +	if (rq->fence.error != -EIO) {
> +		pr_err("%s: hanging request %llx:%lld not reset\n",
> +		       engine->name,
> +		       rq->fence.context,
> +		       rq->fence.seqno);
> +		return -EINVAL;
> +	}
> +
> +	/* Give the request a jiffie to complete after flushing the worker */
> +	if (i915_request_wait(rq, 0,
> +			      max(0l, (long)(timeout - jiffies)) + 1) < 0) {
> +		pr_err("%s: hanging request %llx:%lld did not complete\n",
> +		       engine->name,
> +		       rq->fence.context,
> +		       rq->fence.seqno);
> +		return -ETIME;
> +	}
> +
> +	return 0;
> +}
> +
>  static int live_sanitycheck(void *arg)
>  {
>  	struct intel_gt *gt = arg;
> @@ -1805,14 +1847,9 @@ static int __cancel_active0(struct live_preempt_cancel *arg)
>  	if (err)
>  		goto out;
>  
> -	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> -		err = -EIO;
> -		goto out;
> -	}
> -
> -	if (rq->fence.error != -EIO) {
> -		pr_err("Cancelled inflight0 request did not report -EIO\n");
> -		err = -EINVAL;
> +	err = wait_for_reset(arg->engine, rq, HZ / 2);
> +	if (err) {
> +		pr_err("Cancelled inflight0 request did not reset\n");
>  		goto out;
>  	}
>  
> @@ -1870,10 +1907,9 @@ static int __cancel_active1(struct live_preempt_cancel *arg)
>  		goto out;
>  
>  	igt_spinner_end(&arg->a.spin);
> -	if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
> -		err = -EIO;
> +	err = wait_for_reset(arg->engine, rq[1], HZ / 2);
> +	if (err)
>  		goto out;
> -	}
>  
>  	if (rq[0]->fence.error != 0) {
>  		pr_err("Normal inflight0 request did not complete\n");
> @@ -1953,10 +1989,9 @@ static int __cancel_queued(struct live_preempt_cancel *arg)
>  	if (err)
>  		goto out;
>  
> -	if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
> -		err = -EIO;
> +	err = wait_for_reset(arg->engine, rq[2], HZ / 2);
> +	if (err)
>  		goto out;
> -	}
>  
>  	if (rq[0]->fence.error != -EIO) {
>  		pr_err("Cancelled inflight0 request did not report -EIO\n");
> @@ -2014,14 +2049,9 @@ static int __cancel_hostile(struct live_preempt_cancel *arg)
>  	if (err)
>  		goto out;
>  
> -	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> -		err = -EIO;
> -		goto out;
> -	}
> -
> -	if (rq->fence.error != -EIO) {
> -		pr_err("Cancelled inflight0 request did not report -EIO\n");
> -		err = -EINVAL;
> +	err = wait_for_reset(arg->engine, rq, HZ / 2);
> +	if (err) {
> +		pr_err("Cancelled inflight0 request did not reset\n");
>  		goto out;
>  	}
>  
> -- 
> 2.25.1
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 95da6b880e3f..af5b3da6d894 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -90,6 +90,48 @@  static int wait_for_submit(struct intel_engine_cs *engine,
 	return -ETIME;
 }
 
+static int wait_for_reset(struct intel_engine_cs *engine,
+			  struct i915_request *rq,
+			  unsigned long timeout)
+{
+	timeout += jiffies;
+	do {
+		cond_resched();
+		intel_engine_flush_submission(engine);
+
+		if (READ_ONCE(engine->execlists.pending[0]))
+			continue;
+
+		if (i915_request_completed(rq))
+			break;
+
+		if (READ_ONCE(rq->fence.error))
+			break;
+	} while (time_before(jiffies, timeout));
+
+	flush_scheduled_work();
+
+	if (rq->fence.error != -EIO) {
+		pr_err("%s: hanging request %llx:%lld not reset\n",
+		       engine->name,
+		       rq->fence.context,
+		       rq->fence.seqno);
+		return -EINVAL;
+	}
+
+	/* Give the request a jiffie to complete after flushing the worker */
+	if (i915_request_wait(rq, 0,
+			      max(0l, (long)(timeout - jiffies)) + 1) < 0) {
+		pr_err("%s: hanging request %llx:%lld did not complete\n",
+		       engine->name,
+		       rq->fence.context,
+		       rq->fence.seqno);
+		return -ETIME;
+	}
+
+	return 0;
+}
+
 static int live_sanitycheck(void *arg)
 {
 	struct intel_gt *gt = arg;
@@ -1805,14 +1847,9 @@  static int __cancel_active0(struct live_preempt_cancel *arg)
 	if (err)
 		goto out;
 
-	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
-		err = -EIO;
-		goto out;
-	}
-
-	if (rq->fence.error != -EIO) {
-		pr_err("Cancelled inflight0 request did not report -EIO\n");
-		err = -EINVAL;
+	err = wait_for_reset(arg->engine, rq, HZ / 2);
+	if (err) {
+		pr_err("Cancelled inflight0 request did not reset\n");
 		goto out;
 	}
 
@@ -1870,10 +1907,9 @@  static int __cancel_active1(struct live_preempt_cancel *arg)
 		goto out;
 
 	igt_spinner_end(&arg->a.spin);
-	if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
-		err = -EIO;
+	err = wait_for_reset(arg->engine, rq[1], HZ / 2);
+	if (err)
 		goto out;
-	}
 
 	if (rq[0]->fence.error != 0) {
 		pr_err("Normal inflight0 request did not complete\n");
@@ -1953,10 +1989,9 @@  static int __cancel_queued(struct live_preempt_cancel *arg)
 	if (err)
 		goto out;
 
-	if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
-		err = -EIO;
+	err = wait_for_reset(arg->engine, rq[2], HZ / 2);
+	if (err)
 		goto out;
-	}
 
 	if (rq[0]->fence.error != -EIO) {
 		pr_err("Cancelled inflight0 request did not report -EIO\n");
@@ -2014,14 +2049,9 @@  static int __cancel_hostile(struct live_preempt_cancel *arg)
 	if (err)
 		goto out;
 
-	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
-		err = -EIO;
-		goto out;
-	}
-
-	if (rq->fence.error != -EIO) {
-		pr_err("Cancelled inflight0 request did not report -EIO\n");
-		err = -EINVAL;
+	err = wait_for_reset(arg->engine, rq, HZ / 2);
+	if (err) {
+		pr_err("Cancelled inflight0 request did not reset\n");
 		goto out;
 	}