
drm/i915/gt: Ignore error capturing a closed context

Message ID 20210129120620.6516-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series drm/i915/gt: Ignore error capturing a closed context

Commit Message

Chris Wilson Jan. 29, 2021, 12:06 p.m. UTC
To capture a context after a GPU hang, we suspend the request and then
resume its execution afterwards. If the context is already closed, we
can assume that no one is interested in the result and that we are
instead trying to terminate execution quickly as part of a forced
preemption. In that case, do not waste time suspending the request and
capturing the error; just cancel the request instead.

Testcase: igt/gem_ctx_persistence/many-contexts
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../drm/i915/gt/intel_execlists_submission.c   | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)
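
For reference, a condensed sketch of the control flow through
execlists_capture() once this patch is applied, using only the helpers
that appear in the patch itself; the allocation of the capture state,
the request pinning, and the deferred capture worker are elided here,
so see the full diff below for the real thing:

static void execlists_capture(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
		return;

	/* The request belonging to the context currently on the engine. */
	rq = active_context(engine, active_ccid(engine));

	/*
	 * A closed context has no user left to read the error state, so
	 * skip the expensive suspend/capture/resume cycle entirely and
	 * let the reset simply cancel the guilty request.
	 */
	if (!rq || intel_context_is_closed(rq->context))
		return;

	/* ... allocate the capture state, pin rq, defer the capture ... */
}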

Comments

Chris Wilson Jan. 29, 2021, 12:07 p.m. UTC | #1
Quoting Chris Wilson (2021-01-29 12:06:20)
> To capture a context after a GPU hang, we suspend the request and then
> resume its execution afterwards. If the context is already closed, we
> can assume that no one is interested in the result and that we are
> instead trying to terminate execution quickly as part of a forced
> preemption. In that case, do not waste time suspending the request and
> capturing the error; just cancel the request instead.

+ before throwing away the error state
-Chris
Mika Kuoppala Jan. 29, 2021, 3:01 p.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> To capture a context after a GPU hang, we suspend the request and then
> resume its execution afterwards. If the context is already closed, we
> can assume that no one is interested in the result and that we are
> instead trying to terminate execution quickly as part of a forced
> preemption. In that case, do not waste time suspending the request and
> capturing the error; just cancel the request instead.
>
> Testcase: igt/gem_ctx_persistence/many-contexts
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  .../drm/i915/gt/intel_execlists_submission.c   | 18 +++++++++++++-----
>  1 file changed, 13 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index e20ab2eab3a8..2280d1bd2c77 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -2249,10 +2249,21 @@ static u32 active_ccid(struct intel_engine_cs *engine)
>  static void execlists_capture(struct intel_engine_cs *engine)
>  {
>  	struct execlists_capture *cap;
> +	struct i915_request *rq;
>  
>  	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
>  		return;
>  
> +	rq = active_context(engine, active_ccid(engine));
> +
> +	/*
> +	 * If the context is closed, assume no one is listening for the
> +	 * associated state; the user is already gone. We can save a lot of
> +	 * time around forced-preemption by just cancelling the guilty request.
> +	 */
> +	if (!rq || intel_context_is_closed(rq->context))
> +		return;
> +
>  	/*
>  	 * We need to _quickly_ capture the engine state before we reset.
>  	 * We are inside an atomic section (softirq) here and we are delaying
> @@ -2262,11 +2273,8 @@ static void execlists_capture(struct intel_engine_cs *engine)
>  	if (!cap)
>  		return;
>  
> -	cap->rq = active_context(engine, active_ccid(engine));
> -	if (cap->rq) {
> -		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> -		cap->rq = i915_request_get_rcu(cap->rq);
> -	}
> +	rq = active_request(rq->context->timeline, rq);
> +	cap->rq = i915_request_get_rcu(rq);
>  	if (!cap->rq)
>  		goto err_free;
>  
> -- 
> 2.20.1
>

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index e20ab2eab3a8..2280d1bd2c77 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2249,10 +2249,21 @@  static u32 active_ccid(struct intel_engine_cs *engine)
 static void execlists_capture(struct intel_engine_cs *engine)
 {
 	struct execlists_capture *cap;
+	struct i915_request *rq;
 
 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
 		return;
 
+	rq = active_context(engine, active_ccid(engine));
+
+	/*
+	 * If the context is closed, assume no one is listening for the
+	 * associated state; the user is already gone. We can save a lot of
+	 * time around forced-preemption by just cancelling the guilty request.
+	 */
+	if (!rq || intel_context_is_closed(rq->context))
+		return;
+
 	/*
 	 * We need to _quickly_ capture the engine state before we reset.
 	 * We are inside an atomic section (softirq) here and we are delaying
@@ -2262,11 +2273,8 @@  static void execlists_capture(struct intel_engine_cs *engine)
 	if (!cap)
 		return;
 
-	cap->rq = active_context(engine, active_ccid(engine));
-	if (cap->rq) {
-		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
-		cap->rq = i915_request_get_rcu(cap->rq);
-	}
+	rq = active_request(rq->context->timeline, rq);
+	cap->rq = i915_request_get_rcu(rq);
 	if (!cap->rq)
 		goto err_free;