diff mbox series

[v3,2/6] drm/i915: Fix up locking around dumping requests lists

Message ID 20230119065000.1661857-3-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show
Series Allow error capture without a request & fix locking issues | expand

Commit Message

John Harrison Jan. 19, 2023, 6:49 a.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The debugfs dump of requests was confused about what state requires
the execlist lock versus the GuC lock. There was also a bunch of
duplicated messy code between it and the error capture code.

So refactor the hung request search into a re-usable function. And
reduce the span of the execlist state lock to only the execlist
specific code paths.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/intel_context.c   | 29 +++++++++++++
 drivers/gpu/drm/i915/gt/intel_context.h   |  3 ++
 drivers/gpu/drm/i915/gt/intel_engine_cs.c | 51 +++++++++++------------
 drivers/gpu/drm/i915/i915_gpu_error.c     | 27 ++----------
 4 files changed, 60 insertions(+), 50 deletions(-)

Comments

Daniele Ceraolo Spurio Jan. 20, 2023, 1:41 a.m. UTC | #1
On 1/18/2023 10:49 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> The debugfs dump of requests was confused about what state requires
> the execlist lock versus the GuC lock. There was also a bunch of
> duplicated messy code between it and the error capture code.
>
> So refactor the hung request search into a re-usable function. And
> reduce the span of the execlist state lock to only the execlist
> specific code paths.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_context.c   | 29 +++++++++++++
>   drivers/gpu/drm/i915/gt/intel_context.h   |  3 ++
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c | 51 +++++++++++------------
>   drivers/gpu/drm/i915/i915_gpu_error.c     | 27 ++----------
>   4 files changed, 60 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index e7c5509c48ef1..a61f052092ed9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -559,6 +559,35 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce)
>   	return active;
>   }
>   
> +void intel_get_hung_entity(struct intel_engine_cs *engine,
> +			   struct intel_context **ce, struct i915_request **rq)
IMO this belongs in the engine_cs.c file, given that the engine is the 
input. Might also be worth renaming to intel_engine_*

> +{
> +	unsigned long flags;
> +
> +	*ce = intel_engine_get_hung_context(engine);
> +	if (*ce) {
> +		intel_engine_clear_hung_context(engine);
> +
> +		/* This will reference count the request (if found) */
> +		*rq = intel_context_find_active_request(*ce);
> +
> +		return;
> +	}
> +
> +	/*
> +	 * Getting here with GuC enabled means it is a forced error capture
> +	 * with no actual hang. So, no need to attempt the execlist search.
> +	 */
> +	if (intel_uc_uses_guc_submission(&engine->gt->uc))
> +		return;
> +
> +	spin_lock_irqsave(&engine->sched_engine->lock, flags);
> +	*rq = intel_engine_execlist_find_hung_request(engine);
> +	if (*rq)
> +		*rq = i915_request_get_rcu(*rq);
> +	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
>   void intel_context_bind_parent_child(struct intel_context *parent,
>   				     struct intel_context *child)
>   {
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index fb62b7b8cbcda..ca50f3312a941 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -271,6 +271,9 @@ struct i915_request *intel_context_create_request(struct intel_context *ce);
>   struct i915_request *
>   intel_context_find_active_request(struct intel_context *ce);
>   
> +void intel_get_hung_entity(struct intel_engine_cs *engine,
> +			   struct intel_context **ce, struct i915_request **rq);
> +
>   static inline bool intel_context_is_barrier(const struct intel_context *ce)
>   {
>   	return test_bit(CONTEXT_BARRIER_BIT, &ce->flags);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 6a082658d0082..5e173dfc8849e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -2216,11 +2216,27 @@ void intel_engine_dump_active_requests(struct list_head *requests,
>   	}
>   }
>   
> -static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
> +static void execlist_dump_active_requests(struct intel_engine_cs *engine,

Might be worth moving this to the execlists submission file, to be 
symmetrical with the GuC submission one. Other execlists functions are 
in this file though, so not a blocker.

> +					  struct i915_request *hung_rq,
> +					  struct drm_printer *m)
>   {
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&engine->sched_engine->lock, flags);
> +
> +	intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m);
> +
> +	drm_printf(m, "\tOn hold?: %lu\n",
> +		   list_count(&engine->sched_engine->hold));
> +
> +	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> +}
> +
> +static void engine_dump_active_requests(struct intel_engine_cs *engine,
> +					struct drm_printer *m)
> +{
> +	struct intel_context *hung_ce = NULL;
>   	struct i915_request *hung_rq = NULL;
> -	struct intel_context *ce;
> -	bool guc;
>   
>   	/*
>   	 * No need for an engine->irq_seqno_barrier() before the seqno reads.
> @@ -2229,31 +2245,20 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
>   	 * But the intention here is just to report an instantaneous snapshot
>   	 * so that's fine.
>   	 */
> -	lockdep_assert_held(&engine->sched_engine->lock);
> +	intel_get_hung_entity(engine, &hung_ce, &hung_rq);
>   
>   	drm_printf(m, "\tRequests:\n");
>   
> -	guc = intel_uc_uses_guc_submission(&engine->gt->uc);
> -	if (guc) {
> -		ce = intel_engine_get_hung_context(engine);
> -		if (ce) {
> -			/* This will reference count the request (if found) */
> -			hung_rq = intel_context_find_active_request(ce);
> -		}
> -	} else {
> -		hung_rq = intel_engine_execlist_find_hung_request(engine);
> -		if (hung_rq)
> -			hung_rq = i915_request_get_rcu(hung_rq);
> -	}
> -
>   	if (hung_rq)
>   		engine_dump_request(hung_rq, m, "\t\thung");
> +	else if (hung_ce)
> +		drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
>   
> -	if (guc)
> +	if (intel_uc_uses_guc_submission(&engine->gt->uc))
>   		intel_guc_dump_active_requests(engine, hung_rq, m);
>   	else
> -		intel_engine_dump_active_requests(&engine->sched_engine->requests,
> -						  hung_rq, m);
> +		execlist_dump_active_requests(engine, hung_rq, m);
> +
>   	if (hung_rq)
>   		i915_request_put(hung_rq);
>   }
> @@ -2265,7 +2270,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	struct i915_gpu_error * const error = &engine->i915->gpu_error;
>   	struct i915_request *rq;
>   	intel_wakeref_t wakeref;
> -	unsigned long flags;
>   	ktime_t dummy;
>   
>   	if (header) {
> @@ -2302,13 +2306,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   		   i915_reset_count(error));
>   	print_properties(engine, m);
>   
> -	spin_lock_irqsave(&engine->sched_engine->lock, flags);
>   	engine_dump_active_requests(engine, m);
>   
> -	drm_printf(m, "\tOn hold?: %lu\n",
> -		   list_count(&engine->sched_engine->hold));

This print moves from the global function to the execlists submission 
one. AFAICS this is only used in execlists mode so not an issue, but a 
note in the commit message would've been nice.

> -	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
> -
>   	drm_printf(m, "\tMMIO base:  0x%08x\n", engine->mmio_base);
>   	wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
>   	if (wakeref) {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 7ea36478ee52d..78cf95e4dd230 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1596,36 +1596,15 @@ capture_engine(struct intel_engine_cs *engine,
>   {
>   	struct intel_engine_capture_vma *capture = NULL;
>   	struct intel_engine_coredump *ee;
> -	struct intel_context *ce;
> +	struct intel_context *ce = NULL;
>   	struct i915_request *rq = NULL;
> -	unsigned long flags;
>   
>   	ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
>   	if (!ee)
>   		return NULL;
>   
> -	ce = intel_engine_get_hung_context(engine);
> -	if (ce) {
> -		intel_engine_clear_hung_context(engine);
> -		/* This will reference count the request (if found) */
> -		rq = intel_context_find_active_request(ce);
> -		if (!rq || !i915_request_started(rq))
> -			goto no_request_capture;
> -	} else {
> -		/*
> -		 * Getting here with GuC enabled means it is a forced error capture
> -		 * with no actual hang. So, no need to attempt the execlist search.
> -		 */
> -		if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
> -			spin_lock_irqsave(&engine->sched_engine->lock, flags);
> -			rq = intel_engine_execlist_find_hung_request(engine);
> -			if (rq)
> -				rq = i915_request_get_rcu(rq);
> -			spin_unlock_irqrestore(&engine->sched_engine->lock,
> -					       flags);
> -		}
> -	}
> -	if (!rq)
> +	intel_get_hung_entity(engine, &ce, &rq);
> +	if (!rq || !i915_request_started(rq))
>   		goto no_request_capture;

This has been a pain to review, but AFAICS all the locking is correct, 
so with the get_hung function moved to the intel_engine file this is:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

>   
>   	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index e7c5509c48ef1..a61f052092ed9 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -559,6 +559,35 @@  struct i915_request *intel_context_find_active_request(struct intel_context *ce)
 	return active;
 }
 
+void intel_get_hung_entity(struct intel_engine_cs *engine,
+			   struct intel_context **ce, struct i915_request **rq)
+{
+	unsigned long flags;
+
+	*ce = intel_engine_get_hung_context(engine);
+	if (*ce) {
+		intel_engine_clear_hung_context(engine);
+
+		/* This will reference count the request (if found) */
+		*rq = intel_context_find_active_request(*ce);
+
+		return;
+	}
+
+	/*
+	 * Getting here with GuC enabled means it is a forced error capture
+	 * with no actual hang. So, no need to attempt the execlist search.
+	 */
+	if (intel_uc_uses_guc_submission(&engine->gt->uc))
+		return;
+
+	spin_lock_irqsave(&engine->sched_engine->lock, flags);
+	*rq = intel_engine_execlist_find_hung_request(engine);
+	if (*rq)
+		*rq = i915_request_get_rcu(*rq);
+	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
+}
+
 void intel_context_bind_parent_child(struct intel_context *parent,
 				     struct intel_context *child)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index fb62b7b8cbcda..ca50f3312a941 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -271,6 +271,9 @@  struct i915_request *intel_context_create_request(struct intel_context *ce);
 struct i915_request *
 intel_context_find_active_request(struct intel_context *ce);
 
+void intel_get_hung_entity(struct intel_engine_cs *engine,
+			   struct intel_context **ce, struct i915_request **rq);
+
 static inline bool intel_context_is_barrier(const struct intel_context *ce)
 {
 	return test_bit(CONTEXT_BARRIER_BIT, &ce->flags);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 6a082658d0082..5e173dfc8849e 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -2216,11 +2216,27 @@  void intel_engine_dump_active_requests(struct list_head *requests,
 	}
 }
 
-static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
+static void execlist_dump_active_requests(struct intel_engine_cs *engine,
+					  struct i915_request *hung_rq,
+					  struct drm_printer *m)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&engine->sched_engine->lock, flags);
+
+	intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m);
+
+	drm_printf(m, "\tOn hold?: %lu\n",
+		   list_count(&engine->sched_engine->hold));
+
+	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
+}
+
+static void engine_dump_active_requests(struct intel_engine_cs *engine,
+					struct drm_printer *m)
+{
+	struct intel_context *hung_ce = NULL;
 	struct i915_request *hung_rq = NULL;
-	struct intel_context *ce;
-	bool guc;
 
 	/*
 	 * No need for an engine->irq_seqno_barrier() before the seqno reads.
@@ -2229,31 +2245,20 @@  static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
 	 * But the intention here is just to report an instantaneous snapshot
 	 * so that's fine.
 	 */
-	lockdep_assert_held(&engine->sched_engine->lock);
+	intel_get_hung_entity(engine, &hung_ce, &hung_rq);
 
 	drm_printf(m, "\tRequests:\n");
 
-	guc = intel_uc_uses_guc_submission(&engine->gt->uc);
-	if (guc) {
-		ce = intel_engine_get_hung_context(engine);
-		if (ce) {
-			/* This will reference count the request (if found) */
-			hung_rq = intel_context_find_active_request(ce);
-		}
-	} else {
-		hung_rq = intel_engine_execlist_find_hung_request(engine);
-		if (hung_rq)
-			hung_rq = i915_request_get_rcu(hung_rq);
-	}
-
 	if (hung_rq)
 		engine_dump_request(hung_rq, m, "\t\thung");
+	else if (hung_ce)
+		drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
 
-	if (guc)
+	if (intel_uc_uses_guc_submission(&engine->gt->uc))
 		intel_guc_dump_active_requests(engine, hung_rq, m);
 	else
-		intel_engine_dump_active_requests(&engine->sched_engine->requests,
-						  hung_rq, m);
+		execlist_dump_active_requests(engine, hung_rq, m);
+
 	if (hung_rq)
 		i915_request_put(hung_rq);
 }
@@ -2265,7 +2270,6 @@  void intel_engine_dump(struct intel_engine_cs *engine,
 	struct i915_gpu_error * const error = &engine->i915->gpu_error;
 	struct i915_request *rq;
 	intel_wakeref_t wakeref;
-	unsigned long flags;
 	ktime_t dummy;
 
 	if (header) {
@@ -2302,13 +2306,8 @@  void intel_engine_dump(struct intel_engine_cs *engine,
 		   i915_reset_count(error));
 	print_properties(engine, m);
 
-	spin_lock_irqsave(&engine->sched_engine->lock, flags);
 	engine_dump_active_requests(engine, m);
 
-	drm_printf(m, "\tOn hold?: %lu\n",
-		   list_count(&engine->sched_engine->hold));
-	spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
-
 	drm_printf(m, "\tMMIO base:  0x%08x\n", engine->mmio_base);
 	wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
 	if (wakeref) {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 7ea36478ee52d..78cf95e4dd230 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1596,36 +1596,15 @@  capture_engine(struct intel_engine_cs *engine,
 {
 	struct intel_engine_capture_vma *capture = NULL;
 	struct intel_engine_coredump *ee;
-	struct intel_context *ce;
+	struct intel_context *ce = NULL;
 	struct i915_request *rq = NULL;
-	unsigned long flags;
 
 	ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
 	if (!ee)
 		return NULL;
 
-	ce = intel_engine_get_hung_context(engine);
-	if (ce) {
-		intel_engine_clear_hung_context(engine);
-		/* This will reference count the request (if found) */
-		rq = intel_context_find_active_request(ce);
-		if (!rq || !i915_request_started(rq))
-			goto no_request_capture;
-	} else {
-		/*
-		 * Getting here with GuC enabled means it is a forced error capture
-		 * with no actual hang. So, no need to attempt the execlist search.
-		 */
-		if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
-			spin_lock_irqsave(&engine->sched_engine->lock, flags);
-			rq = intel_engine_execlist_find_hung_request(engine);
-			if (rq)
-				rq = i915_request_get_rcu(rq);
-			spin_unlock_irqrestore(&engine->sched_engine->lock,
-					       flags);
-		}
-	}
-	if (!rq)
+	intel_get_hung_entity(engine, &ce, &rq);
+	if (!rq || !i915_request_started(rq))
 		goto no_request_capture;
 
 	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);