[089/190] drm/i915: Tidy execlists submission and tracking
diff mbox

Message ID 1452509174-16671-3-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson Jan. 11, 2016, 10:44 a.m. UTC
As well as dramatically simplifying the submission code (requests ftw),
this lets us reduce how long the execlist spinlock is held and,
importantly, avoid having to hold it across the context-switch
register reads.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  20 +-
 drivers/gpu/drm/i915/i915_gem.c            |   8 +-
 drivers/gpu/drm/i915/i915_gem_request.h    |  21 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |  31 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 505 +++++++++++------------------
 drivers/gpu/drm/i915/intel_lrc.h           |   3 -
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +-
 7 files changed, 209 insertions(+), 387 deletions(-)

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 15a6fddfb79b..a5ea90944bbb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2005,8 +2005,7 @@  static void i915_dump_lrc_obj(struct seq_file *m,
 		return;
 	}
 
-	seq_printf(m, "CONTEXT: %s %u\n", ring->name,
-		   intel_execlists_ctx_id(ctx_obj));
+	seq_printf(m, "CONTEXT: %s\n", ring->name);
 
 	if (!i915_gem_obj_ggtt_bound(ctx_obj))
 		seq_puts(m, "\tNot bound in GGTT\n");
@@ -2092,7 +2091,6 @@  static int i915_execlists(struct seq_file *m, void *data)
 	intel_runtime_pm_get(dev_priv);
 
 	for_each_ring(ring, dev_priv, ring_id) {
-		struct drm_i915_gem_request *head_req = NULL;
 		int count = 0;
 
 		seq_printf(m, "%s\n", ring->name);
@@ -2105,8 +2103,8 @@  static int i915_execlists(struct seq_file *m, void *data)
 		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
 		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
 
-		read_pointer = ring->next_context_status_buffer;
-		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
+		read_pointer = (status_pointer >> 8) & GEN8_CSB_PTR_MASK;
+		write_pointer = status_pointer & GEN8_CSB_PTR_MASK;
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;
 		seq_printf(m, "\tRead pointer: 0x%08X, write pointer 0x%08X\n",
@@ -2123,21 +2121,9 @@  static int i915_execlists(struct seq_file *m, void *data)
 		spin_lock(&ring->execlist_lock);
 		list_for_each(cursor, &ring->execlist_queue)
 			count++;
-		head_req = list_first_entry_or_null(&ring->execlist_queue,
-				struct drm_i915_gem_request, execlist_link);
 		spin_unlock(&ring->execlist_lock);
 
 		seq_printf(m, "\t%d requests in queue\n", count);
-		if (head_req) {
-			struct drm_i915_gem_object *ctx_obj;
-
-			ctx_obj = head_req->ctx->engine[ring_id].state;
-			seq_printf(m, "\tHead request id: %u\n",
-				   intel_execlists_ctx_id(ctx_obj));
-			seq_printf(m, "\tHead request tail: %u\n",
-				   head_req->tail);
-		}
-
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index eb875ecd7907..054e11cff00f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2193,12 +2193,12 @@  static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
 
 	if (i915.enable_execlists) {
 		spin_lock(&engine->execlist_lock);
-
-		/* list_splice_tail_init checks for empty lists */
 		list_splice_tail_init(&engine->execlist_queue,
-				      &engine->execlist_retired_req_list);
-
+				      &engine->execlist_completed);
+		memset(&engine->execlist_port, 0,
+		       sizeof(engine->execlist_port));
 		spin_unlock(&engine->execlist_lock);
+
 		intel_execlists_retire_requests(engine);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 59957d5edfdb..c2e83584f8a2 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -63,10 +63,11 @@  struct drm_i915_gem_request {
 	 * This is required to calculate the maximum available ringbuffer
 	 * space without overwriting the postfix.
 	 */
-	 u32 postfix;
+	u32 postfix;
 
 	/** Position in the ringbuffer of the end of the whole request */
 	u32 tail;
+	u32 wa_tail;
 
 	/**
 	 * Context and ring buffer related to this request
@@ -99,24 +100,8 @@  struct drm_i915_gem_request {
 	/** process identifier submitting this request */
 	struct pid *pid;
 
-	/**
-	 * The ELSP only accepts two elements at a time, so we queue
-	 * context/tail pairs on a given queue (ring->execlist_queue) until the
-	 * hardware is available. The queue serves a double purpose: we also use
-	 * it to keep track of the up to 2 contexts currently in the hardware
-	 * (usually one in execution and the other queued up by the GPU): We
-	 * only remove elements from the head of the queue when the hardware
-	 * informs us that an element has been completed.
-	 *
-	 * All accesses to the queue are mediated by a spinlock
-	 * (ring->execlist_lock).
-	 */
-
 	/** Execlist link in the submission queue.*/
-	struct list_head execlist_link;
-
-	/** Execlists no. of times this request has been sent to the ELSP */
-	int elsp_submitted;
+	struct list_head execlist_link; /* guarded by engine->execlist_lock */
 };
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 5a6251926367..f4e09952d52c 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -393,7 +393,6 @@  static void guc_init_ctx_desc(struct intel_guc *guc,
 		struct intel_ring *ring = ctx->engine[i].ring;
 		struct intel_engine_cs *engine;
 		struct drm_i915_gem_object *obj;
-		uint64_t ctx_desc;
 
 		/* TODO: We have a design issue to be solved here. Only when we
 		 * receive the first batch, we know which engine is used by the
@@ -407,8 +406,7 @@  static void guc_init_ctx_desc(struct intel_guc *guc,
 			break;	/* XXX: continue? */
 
 		engine = ring->engine;
-		ctx_desc = intel_lr_context_descriptor(ctx, engine);
-		lrc->context_desc = (u32)ctx_desc;
+		lrc->context_desc = engine->execlist_context_descriptor;
 
 		/* The state page is after PPHWSP */
 		lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
@@ -548,7 +546,7 @@  static int guc_add_workqueue_item(struct i915_guc_client *gc,
 			WQ_NO_WCFLUSH_WAIT;
 
 	/* The GuC wants only the low-order word of the context descriptor */
-	wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, rq->engine);
+	wqi->context_desc = rq->engine->execlist_context_descriptor;
 
 	/* The GuC firmware wants the tail index in QWords, not bytes */
 	tail = rq->ring->tail >> 3;
@@ -562,27 +560,6 @@  static int guc_add_workqueue_item(struct i915_guc_client *gc,
 
 #define CTX_RING_BUFFER_START		0x08
 
-/* Update the ringbuffer pointer in a saved context image */
-static void lr_context_update(struct drm_i915_gem_request *rq)
-{
-	enum intel_engine_id ring_id = rq->engine->id;
-	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring_id].state;
-	struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-	struct page *page;
-	uint32_t *reg_state;
-
-	BUG_ON(!ctx_obj);
-	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-	reg_state = kmap_atomic(page);
-
-	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
-
-	kunmap_atomic(reg_state);
-}
-
 /**
  * i915_guc_submit() - Submit commands through GuC
  * @client:	the guc client where commands will go through
@@ -597,10 +574,6 @@  int i915_guc_submit(struct i915_guc_client *client,
 	enum intel_engine_id ring_id = rq->engine->id;
 	int q_ret, b_ret;
 
-	/* Need this because of the deferred pin ctx and ring */
-	/* Shall we move this right after ring is pinned? */
-	lr_context_update(rq);
-
 	q_ret = guc_add_workqueue_item(client, rq);
 	if (q_ret == 0)
 		b_ret = guc_ring_doorbell(client);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index de5889e95d6d..80b346a3fd8a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -265,233 +265,133 @@  int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
 	return 0;
 }
 
-/**
- * intel_execlists_ctx_id() - get the Execlists Context ID
- * @ctx_obj: Logical Ring Context backing object.
- *
- * Do not confuse with ctx->id! Unfortunately we have a name overload
- * here: the old context ID we pass to userspace as a handler so that
- * they can refer to a context, and the new context ID we pass to the
- * ELSP so that the GPU can inform us of the context status via
- * interrupts.
- *
- * Return: 20-bits globally unique context ID.
- */
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
-{
-	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-			LRC_PPHWSP_PN * PAGE_SIZE;
-
-	/* LRCA is required to be 4K aligned so the more significant 20 bits
-	 * are globally unique */
-	return lrca >> 12;
-}
-
-static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
-{
-	return (IS_SKL_REVID(ring->dev, 0, SKL_REVID_B0) ||
-		IS_BXT_REVID(ring->dev, 0, BXT_REVID_A1)) &&
-		(ring->id == VCS || ring->id == VCS2);
-}
-
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-				     struct intel_engine_cs *ring)
+static u32 execlists_request_write_tail(struct drm_i915_gem_request *req)
 {
-	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
-	uint64_t desc;
-	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-			LRC_PPHWSP_PN * PAGE_SIZE;
-
-	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
-
-	desc = GEN8_CTX_VALID;
-	desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
-	if (IS_GEN8(ring->i915))
-		desc |= GEN8_CTX_L3LLC_COHERENT;
-	desc |= GEN8_CTX_PRIVILEGE;
-	desc |= lrca;
-	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
-
-	/* TODO: WaDisableLiteRestore when we start using semaphore
-	 * signalling between Command Streamers */
-	/* desc |= GEN8_CTX_FORCE_RESTORE; */
+	struct intel_ring *ring = req->ring;
+	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
 
-	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
-	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
-	if (disable_lite_restore_wa(ring))
-		desc |= GEN8_CTX_FORCE_RESTORE;
+	if (ppgtt && !USES_FULL_48BIT_PPGTT(req->i915)) {
+		/* True 32b PPGTT with dynamic page allocation: update PDP
+		 * registers and point the unallocated PDPs to scratch page.
+		 * PML4 is allocated during ppgtt init, so this is not needed
+		 * in 48-bit mode.
+		 */
+		if (ppgtt->pd_dirty_rings & intel_engine_flag(req->engine)) {
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 3);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 2);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 1);
+			ASSIGN_CTX_PDP(ppgtt, ring->registers, 0);
+			ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
+		}
+	}
 
-	return desc;
+	ring->registers[CTX_RING_TAIL+1] = req->tail;
+	return ring->context_descriptor;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
+static void execlists_submit_pair(struct intel_engine_cs *ring)
 {
+	struct drm_i915_private *dev_priv = ring->i915;
+	uint32_t desc[4];
 
-	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_i915_private *dev_priv = rq0->i915;
-	uint64_t desc[2];
-
-	if (rq1) {
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
-	} else {
-		desc[1] = 0;
-	}
+	if (ring->execlist_port[1]) {
+		desc[0] = execlists_request_write_tail(ring->execlist_port[1]);
+		desc[1] = ring->execlist_port[1]->fence.seqno;
+	} else
+		desc[1] = desc[0] = 0;
 
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
+	desc[2] = execlists_request_write_tail(ring->execlist_port[0]);
+	desc[3] = ring->execlist_port[0]->fence.seqno;
 
-	/* You must always write both descriptors in the order below. */
-	spin_lock_irq(&dev_priv->uncore.lock);
-	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+	/* Note: You must always write both descriptors in the order below. */
+	I915_WRITE_FW(RING_ELSP(ring), desc[1]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[0]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[3]);
 
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
 	/* The context is automatically loaded after the following */
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
-	/* ELSP is a wo register, use another nearby reg for posting */
-	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
-	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-	spin_unlock_irq(&dev_priv->uncore.lock);
+	I915_WRITE_FW(RING_ELSP(ring), desc[2]);
 }
 
-static int execlists_update_context(struct drm_i915_gem_request *rq)
+static void execlists_context_unqueue(struct intel_engine_cs *engine)
 {
-	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[rq->engine->id].state;
-	struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-	struct page *page;
-	uint32_t *reg_state;
-
-	BUG_ON(!ctx_obj);
-	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-	reg_state = kmap_atomic(page);
+	struct drm_i915_gem_request *cursor;
+	bool submit = false;
+	int port = 0;
 
-	reg_state[CTX_RING_TAIL+1] = rq->tail;
-	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
+	assert_spin_locked(&engine->execlist_lock);
 
-	if (ppgtt && !USES_FULL_48BIT_PPGTT(rq->i915)) {
-		/* True 32b PPGTT with dynamic page allocation: update PDP
-		 * registers and point the unallocated PDPs to scratch page.
-		 * PML4 is allocated during ppgtt init, so this is not needed
-		 * in 48-bit mode.
+	/* Try to read in pairs and fill both submission ports */
+	cursor = engine->execlist_port[port];
+	if (cursor != NULL) {
+		/* WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+		 * as we resubmit the request. See gen8_emit_request()
+		 * for where we prepare the padding after the end of the
+		 * request.
 		 */
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
-	}
-
-	kunmap_atomic(reg_state);
-
-	return 0;
-}
+		cursor->tail = cursor->wa_tail;
+		cursor = list_next_entry(cursor, execlist_link);
+	} else
+		cursor = list_first_entry(&engine->execlist_queue,
+					  typeof(*cursor),
+					  execlist_link);
+	while (&cursor->execlist_link != &engine->execlist_queue) {
+		/* Same ctx: ignore earlier request, as the
+		 * second request extends the first.
+		 */
+		if (engine->execlist_port[port] &&
+		    cursor->ctx != engine->execlist_port[port]->ctx) {
+			if (++port == ARRAY_SIZE(engine->execlist_port))
+				break;
+		}
 
-static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
-				      struct drm_i915_gem_request *rq1)
-{
-	execlists_update_context(rq0);
+		engine->execlist_port[port] = cursor;
+		submit = true;
 
-	if (rq1)
-		execlists_update_context(rq1);
+		cursor = list_next_entry(cursor, execlist_link);
+	}
 
-	execlists_elsp_write(rq0, rq1);
+	if (submit)
+		execlists_submit_pair(engine);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
+static bool execlists_complete_requests(struct intel_engine_cs *engine,
+					u32 seqno)
 {
-	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-	struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
-
 	assert_spin_locked(&engine->execlist_lock);
 
-	/*
-	 * If irqs are not active generate a warning as batches that finish
-	 * without the irqs may get lost and a GPU Hang may occur.
-	 */
-	WARN_ON(!intel_irqs_enabled(engine->dev->dev_private));
+	do {
+		struct drm_i915_gem_request *req;
 
-	if (list_empty(&engine->execlist_queue))
-		return;
+		req = engine->execlist_port[0];
+		if (req == NULL)
+			break;
 
-	/* Try to read in pairs */
-	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
-				 execlist_link) {
-		if (!req0) {
-			req0 = cursor;
-		} else if (req0->ctx == cursor->ctx) {
-			/* Same ctx: ignore first request, as second request
-			 * will update tail past first request's workload */
-			cursor->elsp_submitted = req0->elsp_submitted;
-			list_del(&req0->execlist_link);
-			list_add_tail(&req0->execlist_link,
-				&engine->execlist_retired_req_list);
-			req0 = cursor;
-		} else {
-			req1 = cursor;
+		if (!i915_seqno_passed(seqno, req->fence.seqno))
 			break;
-		}
-	}
 
-	if (IS_GEN8(engine->dev) || IS_GEN9(engine->dev)) {
-		/*
-		 * WaIdleLiteRestore: make sure we never cause a lite
-		 * restore with HEAD==TAIL
+		/* Move the completed set of requests from the start of the
+		 * execlist_queue over to the tail of the execlist_completed.
 		 */
-		if (req0->elsp_submitted) {
-			/*
-			 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
-			 * as we resubmit the request. See gen8_add_request()
-			 * for where we prepare the padding after the end of the
-			 * request.
-			 */
-			struct intel_ring *ring;
-
-			ring = req0->ctx->engine[engine->id].ring;
-			req0->tail += 8;
-			req0->tail &= ring->size - 1;
-		}
-	}
-
-	WARN_ON(req1 && req1->elsp_submitted);
+		engine->execlist_completed.prev->next = engine->execlist_queue.next;
+		engine->execlist_completed.prev = &req->execlist_link;
 
-	execlists_submit_requests(req0, req1);
-}
-
-static bool execlists_check_remove_request(struct intel_engine_cs *ring,
-					   u32 request_id)
-{
-	struct drm_i915_gem_request *head_req;
+		engine->execlist_queue.next = req->execlist_link.next;
+		req->execlist_link.next->prev = &engine->execlist_queue;
 
-	assert_spin_locked(&ring->execlist_lock);
+		req->execlist_link.next = &engine->execlist_completed;
 
-	head_req = list_first_entry_or_null(&ring->execlist_queue,
-					    struct drm_i915_gem_request,
-					    execlist_link);
-
-	if (head_req != NULL) {
-		struct drm_i915_gem_object *ctx_obj =
-				head_req->ctx->engine[ring->id].state;
-		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
-			WARN(head_req->elsp_submitted == 0,
-			     "Never submitted head request\n");
-
-			if (--head_req->elsp_submitted <= 0) {
-				list_del(&head_req->execlist_link);
-				list_add_tail(&head_req->execlist_link,
-					&ring->execlist_retired_req_list);
-				return true;
-			}
-		}
-	}
+		/* The hardware has completed the request on this port, it
+		 * will switch to the next.
+		 */
+		engine->execlist_port[0] = engine->execlist_port[1];
+		engine->execlist_port[1] = NULL;
+	} while (1);
 
-	return false;
+	if (engine->execlist_context_descriptor & GEN8_CTX_FORCE_RESTORE)
+		return engine->execlist_port[0] == NULL;
+	else
+		return engine->execlist_port[1] == NULL;
 }
 
 static void set_rtpriority(void)
@@ -504,23 +404,29 @@  static int intel_execlists_submit(void *arg)
 {
 	struct intel_engine_cs *ring = arg;
 	struct drm_i915_private *dev_priv = ring->i915;
+	const i915_reg_t ptrs = RING_CONTEXT_STATUS_PTR(ring);
 
 	set_rtpriority();
 
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 	do {
-		u32 status;
-		u32 status_id;
-		u32 submit_contexts;
 		u8 head, tail;
+		u32 seqno;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		head = ring->next_context_status_buffer;
-		tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
+		head = tail = 0;
+		if (READ_ONCE(ring->execlist_port[0])) {
+			u32 x = I915_READ_FW(ptrs);
+			head = x >> 8;
+			tail = x;
+		}
 		if (head == tail) {
+			intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 			if (kthread_should_stop())
 				return 0;
 
 			schedule();
+			intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 			continue;
 		}
 		__set_current_state(TASK_RUNNING);
@@ -528,86 +434,46 @@  static int intel_execlists_submit(void *arg)
 		if (head > tail)
 			tail += GEN8_CSB_ENTRIES;
 
-		status = 0;
-		submit_contexts = 0;
-
-		spin_lock(&ring->execlist_lock);
-
+		seqno = 0;
 		while (head++ < tail) {
-			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
-			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
-
-			if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-				continue;
-
-			if (status & GEN8_CTX_STATUS_PREEMPTED) {
-				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-					if (execlists_check_remove_request(ring, status_id))
-						WARN(1, "Lite Restored request removed from queue\n");
-				} else
-					WARN(1, "Preemption without Lite Restore\n");
-			}
-
-			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
-			    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
-				if (execlists_check_remove_request(ring, status_id))
-					submit_contexts++;
+			u32 status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring,
+									     head % GEN8_CSB_ENTRIES));
+			if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED && 0)) {
+				DRM_ERROR("Pre-empted request %x %s Lite Restore\n",
+					  I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES)),
+					  status & GEN8_CTX_STATUS_LITE_RESTORE ? "with" : "without");
 			}
+			if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+				      GEN8_CTX_STATUS_ELEMENT_SWITCH))
+				seqno = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring,
+										head % GEN8_CSB_ENTRIES));
 		}
 
-		if (disable_lite_restore_wa(ring)) {
-			/* Prevent a ctx to preempt itself */
-			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
-					(submit_contexts != 0))
+		I915_WRITE_FW(ptrs,
+			      _MASKED_FIELD(GEN8_CSB_PTR_MASK<<8,
+					    (tail % GEN8_CSB_ENTRIES) << 8));
+
+		if (seqno) {
+			spin_lock(&ring->execlist_lock);
+			if (execlists_complete_requests(ring, seqno))
 				execlists_context_unqueue(ring);
-		} else if (submit_contexts != 0) {
-			execlists_context_unqueue(ring);
+			spin_unlock(&ring->execlist_lock);
 		}
-
-		spin_unlock(&ring->execlist_lock);
-
-		WARN(submit_contexts > 2, "More than two context complete events?\n");
-		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
-		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
-			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
-					 ring->next_context_status_buffer<<8));
 	} while (1);
 }
 
 static int execlists_context_queue(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	struct drm_i915_gem_request *cursor;
-	int num_elements = 0;
 
 	i915_gem_request_get(request);
 
 	spin_lock(&engine->execlist_lock);
-
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
-		if (++num_elements > 2)
-			break;
-
-	if (num_elements > 2) {
-		struct drm_i915_gem_request *tail_req;
-
-		tail_req = list_last_entry(&engine->execlist_queue,
-					   struct drm_i915_gem_request,
-					   execlist_link);
-
-		if (request->ctx == tail_req->ctx) {
-			WARN(tail_req->elsp_submitted != 0,
-				"More than 2 already-submitted reqs queued\n");
-			list_del(&tail_req->execlist_link);
-			list_add_tail(&tail_req->execlist_link,
-				&engine->execlist_retired_req_list);
-		}
-	}
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
-	if (num_elements == 0)
-		execlists_context_unqueue(engine);
-
+	if (engine->execlist_port[0] == NULL) {
+		engine->execlist_port[0] = request;
+		execlists_submit_pair(engine);
+	}
 	spin_unlock(&engine->execlist_lock);
 
 	return 0;
@@ -641,56 +507,32 @@  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	return 0;
 }
 
-/*
- * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static void
-intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
-{
-	struct drm_i915_private *dev_priv = request->i915;
-
-	intel_ring_advance(request->ring);
-	request->tail = request->ring->tail;
-
-	if (dev_priv->guc.execbuf_client)
-		i915_guc_submit(dev_priv->guc.execbuf_client, request);
-	else
-		execlists_context_queue(request);
-}
-
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *req, *tmp;
-	struct list_head retired_list;
+	struct list_head list;
 
-	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-	if (list_empty(&ring->execlist_retired_req_list))
+	lockdep_assert_held(&ring->dev->struct_mutex);
+	if (list_empty(&ring->execlist_completed))
 		goto out;
 
-	INIT_LIST_HEAD(&retired_list);
 	spin_lock(&ring->execlist_lock);
-	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
+	list_replace_init(&ring->execlist_completed, &list);
 	spin_unlock(&ring->execlist_lock);
 
-	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
+	list_for_each_entry_safe(req, tmp, &list, execlist_link) {
 		struct intel_context *ctx = req->ctx;
 		struct drm_i915_gem_object *ctx_obj =
 				ctx->engine[ring->id].state;
 
 		if (ctx_obj && (ctx != ring->default_context))
 			intel_lr_context_unpin(req);
-		list_del(&req->execlist_link);
+
 		i915_gem_request_put(req);
 	}
 
 out:
-	return list_empty(&ring->execlist_queue);
+	return READ_ONCE(ring->execlist_port[0]) == NULL;
 }
 
 void intel_logical_ring_stop(struct intel_engine_cs *ring)
@@ -720,6 +562,7 @@  static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
 		struct intel_ring *ringbuf)
 {
 	struct drm_i915_private *dev_priv = ring->i915;
+	u32 ggtt_offset;
 	int ret = 0;
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
@@ -734,6 +577,16 @@  static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
 
 	ctx_obj->dirty = true;
 
+	ggtt_offset =
+		i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE;
+	ringbuf->context_descriptor =
+		ggtt_offset | ring->execlist_context_descriptor;
+
+	ringbuf->registers =
+		kmap(i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN));
+	ringbuf->registers[CTX_RING_BUFFER_START+1] =
+		i915_gem_obj_ggtt_offset(ringbuf->obj);
+
 	/* Invalidate GuC TLB. */
 	if (i915.enable_guc_submission)
 		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
@@ -768,6 +621,7 @@  static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 
 void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 {
+	struct drm_i915_gem_object *ctx_obj;
 	int engine = rq->engine->id;
 
 	WARN_ON(!mutex_is_locked(&rq->i915->dev->struct_mutex));
@@ -775,7 +629,10 @@  void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 		return;
 
 	intel_ring_unmap(rq->ring);
-	i915_gem_object_ggtt_unpin(rq->ctx->engine[engine].state);
+
+	ctx_obj = rq->ctx->engine[engine].state;
+	kunmap(i915_gem_object_get_page(ctx_obj, LRC_STATE_PN));
+	i915_gem_object_ggtt_unpin(ctx_obj);
 	i915_gem_context_unreference(rq->ctx);
 }
 
@@ -1168,12 +1025,39 @@  out:
 	return ret;
 }
 
+static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
+{
+	return (IS_SKL_REVID(ring->i915, 0, SKL_REVID_B0) ||
+		IS_BXT_REVID(ring->i915, 0, BXT_REVID_A1)) &&
+		(ring->id == VCS || ring->id == VCS2);
+}
+
+static uint64_t lr_context_descriptor(struct intel_engine_cs *ring)
+{
+	uint64_t desc;
+
+	desc = GEN8_CTX_VALID;
+	desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
+	if (IS_GEN8(ring->i915))
+		desc |= GEN8_CTX_L3LLC_COHERENT;
+	desc |= GEN8_CTX_PRIVILEGE;
+
+	/* TODO: WaDisableLiteRestore when we start using semaphore
+	 * signalling between Command Streamers */
+	/* desc |= GEN8_CTX_FORCE_RESTORE; */
+
+	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
+	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
+	if (disable_lite_restore_wa(ring))
+		desc |= GEN8_CTX_FORCE_RESTORE;
+
+	return desc;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u8 next_context_status_buffer_hw;
-
 	lrc_setup_hardware_status_page(ring,
 				ring->default_context->engine[ring->id].state);
 
@@ -1197,18 +1081,6 @@  static int gen8_init_common_ring(struct intel_engine_cs *ring)
 	 * SKL  |         ?                |         ?            |
 	 * BXT  |         ?                |         ?            |
 	 */
-	next_context_status_buffer_hw =
-		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring)));
-
-	/*
-	 * When the CSB registers are reset (also after power-up / gpu reset),
-	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
-	 * this special case, so the first element read is CSB[0].
-	 */
-	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-
-	ring->next_context_status_buffer = next_context_status_buffer_hw;
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
 
 	memset(&ring->hangcheck, 0, sizeof(ring->hangcheck));
@@ -1482,7 +1354,8 @@  static int gen8_add_request(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, request->fence.seqno);
 	intel_ring_emit(ring, MI_USER_INTERRUPT);
 	intel_ring_emit(ring, MI_NOOP);
-	intel_logical_ring_advance_and_submit(request);
+	intel_ring_advance(ring);
+	request->tail = ring->tail;
 
 	/*
 	 * Here we add two extra NOOPs as padding to avoid
@@ -1491,6 +1364,12 @@  static int gen8_add_request(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
+	request->wa_tail = ring->tail;
+
+	if (request->i915->guc.execbuf_client)
+		i915_guc_submit(request->i915->guc.execbuf_client, request);
+	else
+		execlists_context_queue(request);
 
 	return 0;
 }
@@ -1569,9 +1448,11 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 
 	INIT_LIST_HEAD(&ring->buffers);
 	INIT_LIST_HEAD(&ring->execlist_queue);
-	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
+	INIT_LIST_HEAD(&ring->execlist_completed);
 	spin_lock_init(&ring->execlist_lock);
 
+	ring->execlist_context_descriptor = lr_context_descriptor(ring);
+
 	ret = i915_cmd_parser_init_ring(ring);
 	if (ret)
 		goto error;
@@ -1592,8 +1473,6 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 		goto error;
 	}
 
-	ring->next_context_status_buffer =
-			I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
 	task = kthread_run(intel_execlists_submit, ring,
 			   "irq/i915:%de", ring->id);
 	if (IS_ERR(task))
@@ -1904,9 +1783,7 @@  populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 					  CTX_CTRL_RS_CTX_ENABLE));
 	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
-	/* Ring buffer start address is not known until the buffer is pinned.
-	 * It is written to the context image in execlists_update_context()
-	 */
+	/* Ring buffer start address is not known until the buffer is pinned. */
 	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
 	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
 		       ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 33f82a84065a..37601a35d5fc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -74,12 +74,9 @@  int intel_lr_context_deferred_alloc(struct intel_context *ctx,
 void intel_lr_context_unpin(struct drm_i915_gem_request *req);
 void intel_lr_context_reset(struct drm_device *dev,
 			struct intel_context *ctx);
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-				     struct intel_engine_cs *ring);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index edaf07b2292e..3d4d5711aea9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -122,6 +122,9 @@  struct intel_ring {
 	 * we can detect new retirements.
 	 */
 	u32 last_retired_head;
+
+	u32 context_descriptor;
+	u32 *registers;
 };
 
 struct	intel_context;
@@ -293,9 +296,10 @@  struct intel_engine_cs {
 	/* Execlists */
 	struct task_struct *execlists_submit;
 	spinlock_t execlist_lock;
+	struct drm_i915_gem_request *execlist_port[2];
 	struct list_head execlist_queue;
-	struct list_head execlist_retired_req_list;
-	u8 next_context_status_buffer;
+	struct list_head execlist_completed;
+	u32 execlist_context_descriptor;
 	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
 
 	/**