
[v2,05/15] drm/i915/tdr: Prepare execlist submission to handle tdr resubmission after reset

Message ID 1466147355-4635-6-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State New, archived

Commit Message

arun.siluvery@linux.intel.com June 17, 2016, 7:09 a.m. UTC
To resume execution after an engine reset we resubmit the context, and this
needs to be treated differently from a normal submission; otherwise it would
be counted as a completely new submission. This change modifies the
submission path to account for this.

During resubmission we submit only the head request, i.e. the one that
caused the hang; once that is done we continue with the normal submission of
two contexts at a time. The intention is to restore the submission state as
it was at the time of the hang.
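
This patch only threads the flag through the submission path; the caller
that performs the actual resubmission is expected to arrive later in the
series. A minimal sketch of how such a recovery path would use it (the
helper name here is hypothetical):

static void intel_execlists_resubmit(struct intel_engine_cs *engine)
{
	spin_lock_bh(&engine->execlist_lock);

	/*
	 * Replay only the head request that hung: passing true suppresses
	 * the elsp_submitted increment and limits the ELSP write to a
	 * single context.
	 */
	if (!list_empty(&engine->execlist_queue))
		execlists_context_unqueue(engine, true);

	spin_unlock_bh(&engine->execlist_lock);
}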

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

Comments

Chris Wilson June 17, 2016, 7:39 a.m. UTC | #1
On Fri, Jun 17, 2016 at 08:09:05AM +0100, Arun Siluvery wrote:
> To resume execution after an engine reset we resubmit the context, and this
> needs to be treated differently from a normal submission; otherwise it would
> be counted as a completely new submission. This change modifies the
> submission path to account for this.
> 
> During resubmission we submit only the head request, i.e. the one that
> caused the hang; once that is done we continue with the normal submission of
> two contexts at a time. The intention is to restore the submission state as
> it was at the time of the hang.

Why do we want to resubmit the hanging request (with HEAD incremented by a
couple of dwords)? Kill the job, move on.

How quickly do you decide to actually abandon the request?
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4fad830..35255ce 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -340,7 +340,8 @@  uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
 }
 
 static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
+				 struct drm_i915_gem_request *rq1,
+				 bool tdr_resubmission)
 {
 
 	struct intel_engine_cs *engine = rq0->engine;
@@ -349,13 +350,15 @@  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 
 	if (rq1) {
 		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
+		if (!tdr_resubmission)
+			rq1->elsp_submitted++;
 	} else {
 		desc[1] = 0;
 	}
 
 	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
+	if (!tdr_resubmission)
+		rq0->elsp_submitted++;
 
 	/* You must always write both descriptors in the order below. */
 	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
@@ -396,7 +399,8 @@  static void execlists_update_context(struct drm_i915_gem_request *rq)
 }
 
 static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
-				      struct drm_i915_gem_request *rq1)
+				      struct drm_i915_gem_request *rq1,
+				      bool tdr_resubmission)
 {
 	struct drm_i915_private *dev_priv = rq0->i915;
 	unsigned int fw_domains = rq0->engine->fw_domains;
@@ -409,13 +413,14 @@  static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
 	spin_lock_irq(&dev_priv->uncore.lock);
 	intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
 
-	execlists_elsp_write(rq0, rq1);
+	execlists_elsp_write(rq0, rq1, tdr_resubmission);
 
 	intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
 	spin_unlock_irq(&dev_priv->uncore.lock);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
+static void execlists_context_unqueue(struct intel_engine_cs *engine,
+				      bool tdr_resubmission)
 {
 	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
 	struct drm_i915_gem_request *cursor, *tmp;
@@ -433,6 +438,19 @@  static void execlists_context_unqueue(struct intel_engine_cs *engine)
 				 execlist_link) {
 		if (!req0) {
 			req0 = cursor;
+
+			/*
+			 * Only submit head request if this is a resubmission
+			 * following engine reset. The intention is to restore
+			 * the original submission state from the situation
+			 * when the hang originally happened. Once the request
+			 * that caused the hang is resubmitted we can continue
+			 * normally by submitting two requests at a time.
+			 */
+			if (tdr_resubmission) {
+				req1 = NULL;
+				break;
+			}
 		} else if (req0->ctx == cursor->ctx) {
 			/* Same ctx: ignore first request, as second request
 			 * will update tail past first request's workload */
@@ -466,7 +484,7 @@  static void execlists_context_unqueue(struct intel_engine_cs *engine)
 		req0->tail &= ringbuf->size - 1;
 	}
 
-	execlists_submit_requests(req0, req1);
+	execlists_submit_requests(req0, req1, tdr_resubmission);
 }
 
 static unsigned int
@@ -578,7 +596,7 @@  static void intel_lrc_irq_handler(unsigned long data)
 	if (submit_contexts) {
 		if (!engine->disable_lite_restore_wa ||
 		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
-			execlists_context_unqueue(engine);
+			execlists_context_unqueue(engine, false);
 	}
 
 	spin_unlock(&engine->execlist_lock);
@@ -618,7 +636,7 @@  static void execlists_context_queue(struct drm_i915_gem_request *request)
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	request->ctx_hw_id = request->ctx->hw_id;
 	if (num_elements == 0)
-		execlists_context_unqueue(engine);
+		execlists_context_unqueue(engine, false);
 
 	spin_unlock_bh(&engine->execlist_lock);
 }
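
The reason the counter must be left alone on resubmission: elsp_submitted is
decremented once per context-switch event coming back from the hardware, and
a request only leaves the execlist queue when the count drains to zero (this
is how the lite-restore double submission is absorbed). The hung request was
already counted when it was first submitted, so bumping the count again on
the post-reset replay would leave it expecting one more completion event
than the hardware will ever send. A simplified sketch of that pairing
(condensed from the driver's event handling; the function name is
illustrative, not part of this patch):

static void execlists_event_sketch(struct drm_i915_gem_request *head_req)
{
	/* One context-switch event retires one ELSP submission. */
	if (--head_req->elsp_submitted > 0)
		return; /* lite restore: another event is still expected */

	/* All submissions accounted for; request can leave the queue. */
	list_del(&head_req->execlist_link);
}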