diff mbox

[06/11] drm/i915/tdr: Restart submission after engine reset

Message ID 1469551257-26803-7-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

arun.siluvery@linux.intel.com July 26, 2016, 4:40 p.m. UTC
We stop the engine during reset and recovery. After a successful reset, the
request that caused the hang will have been removed from the queue, so we
can now restart submissions to the ELSP.

Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 45 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

Comments

Chris Wilson July 26, 2016, 9:32 p.m. UTC | #1
On Tue, Jul 26, 2016 at 05:40:52PM +0100, Arun Siluvery wrote:
> We stop the engine during reset and recovery so after a successful reset
> the request that caused the hang would've been removed from the queue so we
> can now restart submissions to elsp.
> 
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 45 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 42 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 8fc5a3b..7834edc 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -418,7 +418,8 @@ static inline void execlists_context_status_change(
>  	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
>  }
>  
> -static void execlists_context_unqueue(struct intel_engine_cs *engine)
> +static void execlists_context_unqueue(struct intel_engine_cs *engine,
> +				      bool submission_after_reset)
>  {
>  	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
>  	struct drm_i915_gem_request *cursor, *tmp;
> @@ -436,6 +437,27 @@ static void execlists_context_unqueue(struct intel_engine_cs *engine)
>  				 execlist_link) {
>  		if (!req0) {
>  			req0 = cursor;
> +
> +			/*
> +			 * we submit two requests at a time, req0 and req1.
> +			 * Assume that req0 is the one that causes hang and
> +			 * req1 is a normal batch.
> +
> +			 * After engine reset, once engine is
> +			 * reinitialized, we skip req0 and submit req1
> +			 * along with next request in the queue so we endup
> +			 * incrementing req1->elsp_submitted again. But
> +			 * after reset HW would've switched to req1 and
> +			 * executed it so just this once, submit only req1
> +			 * (which is req0 now) and don't increment
> +			 * submission count. Once this is removed we submit
> +			 * two requests as usual.
> +			 */
> +			if (submission_after_reset) {
> +				if (req0->elsp_submitted)
> +					req0->elsp_submitted--;
> +				break;

And no. Put the special case handling in the reset to cancel the
submitted hw tracking and re-establish the request queue.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8fc5a3b..7834edc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -418,7 +418,8 @@  static inline void execlists_context_status_change(
 	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
+static void execlists_context_unqueue(struct intel_engine_cs *engine,
+				      bool submission_after_reset)
 {
 	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
 	struct drm_i915_gem_request *cursor, *tmp;
@@ -436,6 +437,27 @@  static void execlists_context_unqueue(struct intel_engine_cs *engine)
 				 execlist_link) {
 		if (!req0) {
 			req0 = cursor;
+
+			/*
+			 * We submit two requests at a time, req0 and req1.
+			 * Assume that req0 is the one that causes the hang
+			 * and req1 is a normal batch.
+			 *
+			 * After engine reset, once the engine is
+			 * reinitialized, we skip req0 and submit req1
+			 * along with the next request in the queue, so we
+			 * end up incrementing req1->elsp_submitted again.
+			 * But after reset the HW would've switched to req1
+			 * and executed it, so just this once, submit only
+			 * req1 (which is req0 now) and don't increment the
+			 * submission count. Once this is removed we submit
+			 * two requests as usual.
+			 */
+			if (submission_after_reset) {
+				if (req0->elsp_submitted)
+					req0->elsp_submitted--;
+				break;
+			}
 		} else if (req0->ctx == cursor->ctx) {
 			/* Same ctx: ignore first request, as second request
 			 * will update tail past first request's workload */
@@ -600,7 +622,7 @@  static void intel_lrc_irq_handler(unsigned long data)
 	if (submit_contexts) {
 		if (!engine->disable_lite_restore_wa ||
 		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
-			execlists_context_unqueue(engine);
+			execlists_context_unqueue(engine, false);
 	}
 
 	spin_unlock(&engine->execlist_lock);
@@ -640,7 +662,7 @@  static void execlists_context_queue(struct drm_i915_gem_request *request)
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	request->ctx_hw_id = request->ctx->hw_id;
 	if (num_elements == 0)
-		execlists_context_unqueue(engine);
+		execlists_context_unqueue(engine, false);
 
 	spin_unlock_bh(&engine->execlist_lock);
 }
@@ -1142,6 +1164,23 @@  unlock:
 	return !continue_with_reset;
 }
 
+/**
+ * intel_execlists_restart_submission() - restarts elsp submissions after
+ * reset
+ *
+ * @engine: engine to be re-started
+ *
+ */
+void intel_execlists_restart_submission(struct intel_engine_cs *engine)
+{
+	spin_lock_bh(&engine->execlist_lock);
+
+	if (!list_empty(&engine->execlist_queue))
+		execlists_context_unqueue(engine, true);
+
+	spin_unlock_bh(&engine->execlist_lock);
+}
+
 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	int ret, i;