[v2] drm/i915/execlists: Take a reference while capturing the guilty request
diff mbox series

Message ID 20200122113421.482427-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [v2] drm/i915/execlists: Take a reference while capturing the guilty request
Related show

Commit Message

Chris Wilson Jan. 22, 2020, 11:34 a.m. UTC
Thanks to preempt-to-busy, we leave the request on the HW as we submit
the preemption request. This means that the request may complete at any
moment as we process HW events, and in particular the request may be
retired as we are planning to capture it for a preemption timeout.

Be more careful while obtaining the request to capture after a
preemption timeout, and check to see if it completed before we were able
to put it on the on-hold list. If we do see it did complete just before
we capture the request, proclaim the preemption-timeout a false positive
and pardon the reset as we should hit an arbitration point momentarily
and so be able to process the preemption.

Note that even after we move the request to be on hold it may be retired
(as the reset to stop the HW comes after), so we do require to hold our
own reference as we work on the request for capture (and all of the
peeking at state within the request needs to be carefully protected).

Fixes: 32ff621fd744 ("drm/i915/gt: Allow temporary suspension of inflight requests")
Closes: https://gitlab.freedesktop.org/drm/intel/issues/997
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 39 +++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 8 deletions(-)

Comments

Tvrtko Ursulin Jan. 22, 2020, 1:28 p.m. UTC | #1
On 22/01/2020 11:34, Chris Wilson wrote:
> Thanks to preempt-to-busy, we leave the request on the HW as we submit
> the preemption request. This means that the request may complete at any
> moment as we process HW events, and in particular the request may be
> retired as we are planning to capture it for a preemption timeout.
> 
> Be more careful while obtaining the request to capture after a
> preemption timeout, and check to see if it completed before we were able
> to put it on the on-hold list. If we do see it did complete just before
> we capture the request, proclaim the preemption-timeout a false positive
> and pardon the reset as we should hit an arbitration point momentarily
> and so be able to process the preemption.
> 
> Note that even after we move the request to be on hold it may be retired
> (as the reset to stop the HW comes after), so we do require to hold our
> own reference as we work on the request for capture (and all of the
> peeking at state within the request needs to be carefully protected).
> 
> Fixes: 32ff621fd744 ("drm/i915/gt: Allow temporary suspension of inflight requests")
> Closes: https://gitlab.freedesktop.org/drm/intel/issues/997
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c | 39 +++++++++++++++++++++++------
>   1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 3a30767ff0c4..c4826d49571f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2393,11 +2393,16 @@ static void __execlists_hold(struct i915_request *rq)
>   	} while (rq);
>   }
>   
> -static void execlists_hold(struct intel_engine_cs *engine,
> +static bool execlists_hold(struct intel_engine_cs *engine,
>   			   struct i915_request *rq)
>   {
>   	spin_lock_irq(&engine->active.lock);
>   
> +	if (!i915_request_completed(rq)) { /* too late! */
> +		rq = NULL;
> +		goto unlock;
> +	}
> +
>   	/*
>   	 * Transfer this request onto the hold queue to prevent it
>   	 * being resumbitted to HW (and potentially completed) before we have
> @@ -2408,7 +2413,9 @@ static void execlists_hold(struct intel_engine_cs *engine,
>   	GEM_BUG_ON(rq->engine != engine);
>   	__execlists_hold(rq);
>   
> +unlock:
>   	spin_unlock_irq(&engine->active.lock);
> +	return rq;
>   }
>   
>   static bool hold_request(const struct i915_request *rq)
> @@ -2524,6 +2531,7 @@ static void execlists_capture_work(struct work_struct *work)
>   
>   	/* Return this request and all that depend upon it for signaling */
>   	execlists_unhold(engine, cap->rq);
> +	i915_request_put(cap->rq);
>   
>   	kfree(cap);
>   }
> @@ -2560,12 +2568,12 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
>   	return NULL;
>   }
>   
> -static void execlists_capture(struct intel_engine_cs *engine)
> +static bool execlists_capture(struct intel_engine_cs *engine)
>   {
>   	struct execlists_capture *cap;
>   
>   	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
> -		return;
> +		return true;
>   
>   	/*
>   	 * We need to _quickly_ capture the engine state before we reset.
> @@ -2574,13 +2582,17 @@ static void execlists_capture(struct intel_engine_cs *engine)
>   	 */
>   	cap = capture_regs(engine);
>   	if (!cap)
> -		return;
> +		return true;
>   
>   	cap->rq = execlists_active(&engine->execlists);
>   	GEM_BUG_ON(!cap->rq);
>   
> +	rcu_read_lock();
>   	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> -	GEM_BUG_ON(!cap->rq);
> +	cap->rq = i915_request_get_rcu(cap->rq);
> +	rcu_read_unlock();
> +	if (!cap->rq)
> +		goto err_free;
>   
>   	/*
>   	 * Remove the request from the execlists queue, and take ownership
> @@ -2602,10 +2614,19 @@ static void execlists_capture(struct intel_engine_cs *engine)
>   	 * simply hold that request accountable for being non-preemptible
>   	 * long enough to force the reset.
>   	 */
> -	execlists_hold(engine, cap->rq);
> +	if (!execlists_hold(engine, cap->rq))
> +		goto err_rq;
>   
>   	INIT_WORK(&cap->work, execlists_capture_work);
>   	schedule_work(&cap->work);
> +	return true;
> +
> +err_rq:
> +	i915_request_put(cap->rq);
> +err_free:
> +	i915_gpu_coredump_put(cap->error);
> +	kfree(cap);
> +	return false;
>   }
>   
>   static noinline void preempt_reset(struct intel_engine_cs *engine)
> @@ -2627,8 +2648,10 @@ static noinline void preempt_reset(struct intel_engine_cs *engine)
>   		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
>   
>   	ring_set_paused(engine, 1); /* Freeze the current request in place */
> -	execlists_capture(engine);
> -	intel_engine_reset(engine, "preemption time out");
> +	if (execlists_capture(engine))
> +		intel_engine_reset(engine, "preemption time out");
> +	else
> +		ring_set_paused(engine, 0);
>   
>   	tasklet_enable(&engine->execlists.tasklet);
>   	clear_and_wake_up_bit(bit, lock);
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 3a30767ff0c4..c4826d49571f 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2393,11 +2393,16 @@  static void __execlists_hold(struct i915_request *rq)
 	} while (rq);
 }
 
-static void execlists_hold(struct intel_engine_cs *engine,
+static bool execlists_hold(struct intel_engine_cs *engine,
 			   struct i915_request *rq)
 {
 	spin_lock_irq(&engine->active.lock);
 
+	if (!i915_request_completed(rq)) { /* too late! */
+		rq = NULL;
+		goto unlock;
+	}
+
 	/*
 	 * Transfer this request onto the hold queue to prevent it
 	 * being resumbitted to HW (and potentially completed) before we have
@@ -2408,7 +2413,9 @@  static void execlists_hold(struct intel_engine_cs *engine,
 	GEM_BUG_ON(rq->engine != engine);
 	__execlists_hold(rq);
 
+unlock:
 	spin_unlock_irq(&engine->active.lock);
+	return rq;
 }
 
 static bool hold_request(const struct i915_request *rq)
@@ -2524,6 +2531,7 @@  static void execlists_capture_work(struct work_struct *work)
 
 	/* Return this request and all that depend upon it for signaling */
 	execlists_unhold(engine, cap->rq);
+	i915_request_put(cap->rq);
 
 	kfree(cap);
 }
@@ -2560,12 +2568,12 @@  static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
 	return NULL;
 }
 
-static void execlists_capture(struct intel_engine_cs *engine)
+static bool execlists_capture(struct intel_engine_cs *engine)
 {
 	struct execlists_capture *cap;
 
 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
-		return;
+		return true;
 
 	/*
 	 * We need to _quickly_ capture the engine state before we reset.
@@ -2574,13 +2582,17 @@  static void execlists_capture(struct intel_engine_cs *engine)
 	 */
 	cap = capture_regs(engine);
 	if (!cap)
-		return;
+		return true;
 
 	cap->rq = execlists_active(&engine->execlists);
 	GEM_BUG_ON(!cap->rq);
 
+	rcu_read_lock();
 	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
-	GEM_BUG_ON(!cap->rq);
+	cap->rq = i915_request_get_rcu(cap->rq);
+	rcu_read_unlock();
+	if (!cap->rq)
+		goto err_free;
 
 	/*
 	 * Remove the request from the execlists queue, and take ownership
@@ -2602,10 +2614,19 @@  static void execlists_capture(struct intel_engine_cs *engine)
 	 * simply hold that request accountable for being non-preemptible
 	 * long enough to force the reset.
 	 */
-	execlists_hold(engine, cap->rq);
+	if (!execlists_hold(engine, cap->rq))
+		goto err_rq;
 
 	INIT_WORK(&cap->work, execlists_capture_work);
 	schedule_work(&cap->work);
+	return true;
+
+err_rq:
+	i915_request_put(cap->rq);
+err_free:
+	i915_gpu_coredump_put(cap->error);
+	kfree(cap);
+	return false;
 }
 
 static noinline void preempt_reset(struct intel_engine_cs *engine)
@@ -2627,8 +2648,10 @@  static noinline void preempt_reset(struct intel_engine_cs *engine)
 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
 
 	ring_set_paused(engine, 1); /* Freeze the current request in place */
-	execlists_capture(engine);
-	intel_engine_reset(engine, "preemption time out");
+	if (execlists_capture(engine))
+		intel_engine_reset(engine, "preemption time out");
+	else
+		ring_set_paused(engine, 0);
 
 	tasklet_enable(&engine->execlists.tasklet);
 	clear_and_wake_up_bit(bit, lock);