[088/190] drm/i915: Move execlists interrupt based submission to a bottom-half
diff mbox

Message ID 1452509174-16671-2-git-send-email-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson Jan. 11, 2016, 10:44 a.m. UTC
[  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
[  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
[  196.992115] clocksource: Switched to clocksource refined-jiffies

Followed by a hard lockup.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
 drivers/gpu/drm/i915/i915_gem.c         |  15 +--
 drivers/gpu/drm/i915/i915_irq.c         |   2 +-
 drivers/gpu/drm/i915/intel_lrc.c        | 164 +++++++++++++++++---------------
 drivers/gpu/drm/i915/intel_lrc.h        |   3 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |   1 +
 6 files changed, 98 insertions(+), 92 deletions(-)

Comments

Tvrtko Ursulin Feb. 19, 2016, 12:08 p.m. UTC | #1
Hi,

On 11/01/16 10:44, Chris Wilson wrote:
> [  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
> [  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
> [  196.992115] clocksource: Switched to clocksource refined-jiffies
>
> Followed by a hard lockup.

What does the above mean? Irq handler taking too long interferes with 
time keeping ?

I like it BTW. Just the commit message needs more work. :)

How is performance impact with just this patch in isolation? Worth 
progressing it on its own?

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>   drivers/gpu/drm/i915/i915_gem.c         |  15 +--
>   drivers/gpu/drm/i915/i915_irq.c         |   2 +-
>   drivers/gpu/drm/i915/intel_lrc.c        | 164 +++++++++++++++++---------------
>   drivers/gpu/drm/i915/intel_lrc.h        |   3 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h |   1 +
>   6 files changed, 98 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 378bc73296aa..15a6fddfb79b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2094,7 +2094,6 @@ static int i915_execlists(struct seq_file *m, void *data)
>   	for_each_ring(ring, dev_priv, ring_id) {
>   		struct drm_i915_gem_request *head_req = NULL;
>   		int count = 0;
> -		unsigned long flags;
>
>   		seq_printf(m, "%s\n", ring->name);
>
> @@ -2121,12 +2120,12 @@ static int i915_execlists(struct seq_file *m, void *data)
>   				   i, status, ctx_id);
>   		}
>
> -		spin_lock_irqsave(&ring->execlist_lock, flags);
> +		spin_lock(&ring->execlist_lock);
>   		list_for_each(cursor, &ring->execlist_queue)
>   			count++;
>   		head_req = list_first_entry_or_null(&ring->execlist_queue,
>   				struct drm_i915_gem_request, execlist_link);
> -		spin_unlock_irqrestore(&ring->execlist_lock, flags);
> +		spin_unlock(&ring->execlist_lock);
>
>   		seq_printf(m, "\t%d requests in queue\n", count);
>   		if (head_req) {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 391f840d29b7..eb875ecd7907 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2192,13 +2192,13 @@ static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
>   	 */
>
>   	if (i915.enable_execlists) {
> -		spin_lock_irq(&engine->execlist_lock);
> +		spin_lock(&engine->execlist_lock);
>
>   		/* list_splice_tail_init checks for empty lists */
>   		list_splice_tail_init(&engine->execlist_queue,
>   				      &engine->execlist_retired_req_list);
>
> -		spin_unlock_irq(&engine->execlist_lock);
> +		spin_unlock(&engine->execlist_lock);
>   		intel_execlists_retire_requests(engine);
>   	}
>
> @@ -2290,15 +2290,8 @@ i915_gem_retire_requests(struct drm_device *dev)
>   	for_each_ring(ring, dev_priv, i) {
>   		i915_gem_retire_requests_ring(ring);
>   		idle &= list_empty(&ring->request_list);
> -		if (i915.enable_execlists) {
> -			unsigned long flags;
> -
> -			spin_lock_irqsave(&ring->execlist_lock, flags);
> -			idle &= list_empty(&ring->execlist_queue);
> -			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> -
> -			intel_execlists_retire_requests(ring);
> -		}
> +		if (i915.enable_execlists)
> +			idle &= intel_execlists_retire_requests(ring);
>   	}
>
>   	if (idle)
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ce047ac84f5f..b2ef2d0c211b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1316,7 +1316,7 @@ gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
>   	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>   		notify_ring(ring);
>   	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
> -		intel_lrc_irq_handler(ring);
> +		wake_up_process(ring->execlists_submit);
>   }
>
>   static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b5f62b5f4913..de5889e95d6d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -132,6 +132,8 @@
>    *
>    */
>
> +#include <linux/kthread.h>
> +
>   #include <drm/drmP.h>
>   #include <drm/i915_drm.h>
>   #include "i915_drv.h"
> @@ -341,7 +343,7 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>   	rq0->elsp_submitted++;
>
>   	/* You must always write both descriptors in the order below. */
> -	spin_lock(&dev_priv->uncore.lock);
> +	spin_lock_irq(&dev_priv->uncore.lock);
>   	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
>   	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
>   	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> @@ -353,7 +355,7 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>   	/* ELSP is a wo register, use another nearby reg for posting */
>   	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
>   	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
> -	spin_unlock(&dev_priv->uncore.lock);
> +	spin_unlock_irq(&dev_priv->uncore.lock);
>   }
>
>   static int execlists_update_context(struct drm_i915_gem_request *rq)
> @@ -492,89 +494,84 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>   	return false;
>   }
>
> -static void get_context_status(struct intel_engine_cs *ring,
> -			       u8 read_pointer,
> -			       u32 *status, u32 *context_id)
> +static void set_rtpriority(void)
>   {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -
> -	if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
> -		return;
> -
> -	*status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
> -	*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
> +	 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2-1 };
> +	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
>   }
>
> -/**
> - * intel_lrc_irq_handler() - handle Context Switch interrupts
> - * @ring: Engine Command Streamer to handle.
> - *
> - * Check the unread Context Status Buffers and manage the submission of new
> - * contexts to the ELSP accordingly.
> - */
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> +static int intel_execlists_submit(void *arg)
>   {
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -	u32 status_pointer;
> -	u8 read_pointer;
> -	u8 write_pointer;
> -	u32 status = 0;
> -	u32 status_id;
> -	u32 submit_contexts = 0;
> +	struct intel_engine_cs *ring = arg;
> +	struct drm_i915_private *dev_priv = ring->i915;
>
> -	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> +	set_rtpriority();
>
> -	read_pointer = ring->next_context_status_buffer;
> -	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> -	if (read_pointer > write_pointer)
> -		write_pointer += GEN8_CSB_ENTRIES;
> +	do {
> +		u32 status;
> +		u32 status_id;
> +		u32 submit_contexts;
> +		u8 head, tail;
>
> -	spin_lock(&ring->execlist_lock);
> +		set_current_state(TASK_INTERRUPTIBLE);

Hm, what is the effect of setting TASK_INTERRUPTIBLE at this stage 
rather than just before the call to schedule?

And why interruptible?

> +		head = ring->next_context_status_buffer;
> +		tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
> +		if (head == tail) {
> +			if (kthread_should_stop())
> +				return 0;
>
> -	while (read_pointer < write_pointer) {
> +			schedule();
> +			continue;
> +		}
> +		__set_current_state(TASK_RUNNING);
>
> -		get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
> -				   &status, &status_id);
> +		if (head > tail)
> +			tail += GEN8_CSB_ENTRIES;
>
> -		if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> -			continue;
> +		status = 0;
> +		submit_contexts = 0;
>
> -		if (status & GEN8_CTX_STATUS_PREEMPTED) {
> -			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
> -				if (execlists_check_remove_request(ring, status_id))
> -					WARN(1, "Lite Restored request removed from queue\n");
> -			} else
> -				WARN(1, "Preemption without Lite Restore\n");
> -		}
> +		spin_lock(&ring->execlist_lock);
>
> -		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
> -		    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
> -			if (execlists_check_remove_request(ring, status_id))
> -				submit_contexts++;
> -		}
> -	}
> +		while (head++ < tail) {
> +			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
> +			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));

One potentially cheap improvement I've been thinking of is to move the 
CSB reading outside the execlist_lock, to avoid slow MMIO contending the 
lock.

We could fetch all valid entries into a temporary local buffer, then 
grab the execlist_lock and process completions and submission from there.

> -	if (disable_lite_restore_wa(ring)) {
> -		/* Prevent a ctx to preempt itself */
> -		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
> -		    (submit_contexts != 0))
> -			execlists_context_unqueue(ring);
> -	} else if (submit_contexts != 0) {
> -		execlists_context_unqueue(ring);
> -	}
> +			if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> +				continue;
>
> -	spin_unlock(&ring->execlist_lock);
> +			if (status & GEN8_CTX_STATUS_PREEMPTED) {
> +				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
> +					if (execlists_check_remove_request(ring, status_id))
> +						WARN(1, "Lite Restored request removed from queue\n");
> +				} else
> +					WARN(1, "Preemption without Lite Restore\n");
> +			}
>
> -	if (unlikely(submit_contexts > 2))
> -		DRM_ERROR("More than two context complete events?\n");
> +			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
> +			    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
> +				if (execlists_check_remove_request(ring, status_id))
> +					submit_contexts++;
> +			}
> +		}
> +
> +		if (disable_lite_restore_wa(ring)) {
> +			/* Prevent a ctx to preempt itself */
> +			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
> +					(submit_contexts != 0))
> +				execlists_context_unqueue(ring);
> +		} else if (submit_contexts != 0) {
> +			execlists_context_unqueue(ring);
> +		}
>
> -	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
> +		spin_unlock(&ring->execlist_lock);

Would it be worth trying to grab the mutex and unpin the LRCs at this 
point? It would slow down the thread a bit but would get rid of the 
retired work queue. With the vma->iomap thingy could be quite cheap?

> -	/* Update the read pointer to the old write pointer. Manual ringbuffer
> -	 * management ftw </sarcasm> */
> -	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> -		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> -				 ring->next_context_status_buffer << 8));
> +		WARN(submit_contexts > 2, "More than two context complete events?\n");
> +		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
> +		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> +			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
> +					 ring->next_context_status_buffer<<8));
> +	} while (1);
>   }
>
>   static int execlists_context_queue(struct drm_i915_gem_request *request)
> @@ -585,7 +582,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>
>   	i915_gem_request_get(request);
>
> -	spin_lock_irq(&engine->execlist_lock);
> +	spin_lock(&engine->execlist_lock);
>
>   	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
>   		if (++num_elements > 2)
> @@ -611,7 +608,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>   	if (num_elements == 0)
>   		execlists_context_unqueue(engine);
>
> -	spin_unlock_irq(&engine->execlist_lock);
> +	spin_unlock(&engine->execlist_lock);

Hm, another thing which could be quite cool is if here we could just 
wake the thread and let it submit the request instead of doing it 
directly from 3rd party context.

That would make all ELSP accesses serialized for free since they would 
only be happening from a single thread. And potentially could reduce the 
scope of the lock even more.

But it would mean extra latency when transitioning the idle engine to 
busy. Maybe it wouldn't matter for such workloads.

Regards,

Tvrtko
Chris Wilson Feb. 19, 2016, 12:29 p.m. UTC | #2
On Fri, Feb 19, 2016 at 12:08:14PM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 11/01/16 10:44, Chris Wilson wrote:
> >[  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> >[  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
> >[  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
> >[  196.992115] clocksource: Switched to clocksource refined-jiffies
> >
> >Followed by a hard lockup.
> 
> What does the above mean? Irq handler taking too long interferes
> with time keeping ?

That's exactly what it means, we run for too long in interrupt context
(i.e. with interrupts disabled).
 
> I like it BTW. Just the commit message needs more work. :)
> 
> How is performance impact with just this patch in isolation? Worth
> progressing it on its own?

I only looked for regressions, which I didn't find. It fixes a machine
freeze/panic, so I wasn't looking for any other reason to justify the
patch!

> >-void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> >+static int intel_execlists_submit(void *arg)
> >  {
> >-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >-	u32 status_pointer;
> >-	u8 read_pointer;
> >-	u8 write_pointer;
> >-	u32 status = 0;
> >-	u32 status_id;
> >-	u32 submit_contexts = 0;
> >+	struct intel_engine_cs *ring = arg;
> >+	struct drm_i915_private *dev_priv = ring->i915;
> >
> >-	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> >+	set_rtpriority();
> >
> >-	read_pointer = ring->next_context_status_buffer;
> >-	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> >-	if (read_pointer > write_pointer)
> >-		write_pointer += GEN8_CSB_ENTRIES;
> >+	do {
> >+		u32 status;
> >+		u32 status_id;
> >+		u32 submit_contexts;
> >+		u8 head, tail;
> >
> >-	spin_lock(&ring->execlist_lock);
> >+		set_current_state(TASK_INTERRUPTIBLE);
> 
> Hm, what is the effect of setting TASK_INTERRUPTIBLE at this stage
> rather than just before the call to schedule?

We need to set the task state before doing the checks, so that we do not
miss a wakeup from the interrupt handler. Otherwise, we may check the
register, decide there is nothing to do, be interrupted by the irq
handler which then sets us to TASK_RUNNING, and then proceed with going
to sleep by setting TASK_INTERRUPTIBLE (and so missing that the
context-switch had just occurred and sleep forever).
 
> And why interruptible?

To hide ourselves from contributing to the system load and appear as
sleeping (rather than blocked) in the process lists.

> >+		while (head++ < tail) {
> >+			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
> >+			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
> 
> One potentially cheap improvement I've been thinking of is to move
> the CSB reading outside the execlist_lock, to avoid slow MMIO
> contending the lock.

Yes, that should be a small but noticeable improvement.
 
> We could fetch all valid entries into a temporary local buffer, then
> grab the execlist_lock and process completions and submission from
> there.

If you look at the next patch, you will see that's what I did do :)

> >-	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
> >+		spin_unlock(&ring->execlist_lock);
> 
> Would it be worth trying to grab the mutex and unpin the LRCs at
> this point? It would slow down the thread a bit but would get rid of
> the retired work queue. With the vma->iomap thingy could be quite
> cheap?

Exactly, we want the iomap/vmap caching thingy first :) But the
retired work queue disappears as a fallout of your previous-context idea
anyway plus the fix to avoid the struct_mutex when freeing requests.

> >-	/* Update the read pointer to the old write pointer. Manual ringbuffer
> >-	 * management ftw </sarcasm> */
> >-	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> >-		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> >-				 ring->next_context_status_buffer << 8));
> >+		WARN(submit_contexts > 2, "More than two context complete events?\n");
> >+		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
> >+		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> >+			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
> >+					 ring->next_context_status_buffer<<8));
> >+	} while (1);
> >  }
> >
> >  static int execlists_context_queue(struct drm_i915_gem_request *request)
> >@@ -585,7 +582,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
> >
> >  	i915_gem_request_get(request);
> >
> >-	spin_lock_irq(&engine->execlist_lock);
> >+	spin_lock(&engine->execlist_lock);
> >
> >  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> >  		if (++num_elements > 2)
> >@@ -611,7 +608,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
> >  	if (num_elements == 0)
> >  		execlists_context_unqueue(engine);
> >
> >-	spin_unlock_irq(&engine->execlist_lock);
> >+	spin_unlock(&engine->execlist_lock);
> 
> Hm, another thing which could be quite cool is if here we could just
> wake the thread and let it submit the request instead of doing it
> directly from 3rd party context.

Yes, this is something I played around with, and my conclusion was that
the extra overhead from calling ttwu (try_to_wake_up) on the majority of
workloads outweighed the few workloads that benefitted.
 
> That would make all ELSP accesses serialized for free since they
> would only be happening from a single thread. And potentially could
> reduce the scope of the lock even more.
> 
> But it would mean extra latency when transitioning the idle engine
> to busy. Maybe it wouldn't matter for such workloads.

Yup. I saw greater improvement from reducing the overhead along the
execlists context-switch handling path than the adding of requests.

There is certainly plenty of scope for improvement though.
-Chris
Tvrtko Ursulin Feb. 19, 2016, 2:10 p.m. UTC | #3
On 19/02/16 12:29, Chris Wilson wrote:
> On Fri, Feb 19, 2016 at 12:08:14PM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 11/01/16 10:44, Chris Wilson wrote:
>>> [  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
>>> [  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
>>> [  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
>>> [  196.992115] clocksource: Switched to clocksource refined-jiffies
>>>
>>> Followed by a hard lockup.
>>
>> What does the above mean? Irq handler taking too long interferes
>> with time keeping ?
>
> That's exactly what it means, we run for too long in interrupt context
> (i.e. with interrupts disabled).

Okay, just please spell it out in the commit.

>> I like it BTW. Just the commit message needs more work. :)
>>
>> How is performance impact with just this patch in isolation? Worth
>> progressing it on its own?
>
> I only looked for regressions, which I didn't find. It fixes a machine
> freeze/panic, so I wasn't looking for any other reason to justify the
> patch!

Then both of the above also need to be documented in the commit message.

>>> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>>> +static int intel_execlists_submit(void *arg)
>>>   {
>>> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>>> -	u32 status_pointer;
>>> -	u8 read_pointer;
>>> -	u8 write_pointer;
>>> -	u32 status = 0;
>>> -	u32 status_id;
>>> -	u32 submit_contexts = 0;
>>> +	struct intel_engine_cs *ring = arg;
>>> +	struct drm_i915_private *dev_priv = ring->i915;
>>>
>>> -	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
>>> +	set_rtpriority();
>>>
>>> -	read_pointer = ring->next_context_status_buffer;
>>> -	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
>>> -	if (read_pointer > write_pointer)
>>> -		write_pointer += GEN8_CSB_ENTRIES;
>>> +	do {
>>> +		u32 status;
>>> +		u32 status_id;
>>> +		u32 submit_contexts;
>>> +		u8 head, tail;
>>>
>>> -	spin_lock(&ring->execlist_lock);
>>> +		set_current_state(TASK_INTERRUPTIBLE);
>>
>> Hm, what is the effect of setting TASK_INTERRUPTIBLE at this stage
>> rather than just before the call to schedule?
>
> We need to set the task state before doing the checks, so that we do not
> miss a wakeup from the interrupt handler. Otherwise, we may check the
> register, decide there is nothing to do, be interrupted by the irq
> handler which then sets us to TASK_RUNNING, and then proceed with going
> to sleep by setting TASK_INTERRUPTIBLE (and so missing that the
> context-switch had just occurred and sleep forever).

Doh of course - haven't been exposed to this for a while.

>> And why interruptible?
>
> To hide ourselves from contributing to the system load and appear as
> sleeping (rather than blocked) in the process lists.

Oh yes, definitely.

>>> +		while (head++ < tail) {
>>> +			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
>>> +			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
>>
>> One potentially cheap improvement I've been thinking of is to move
>> the CSB reading outside the execlist_lock, to avoid slow MMIO
>> contending the lock.
>
> Yes, that should be a small but noticeable improvement.
>
>> We could fetch all valid entries into a temporary local buffer, then
>> grab the execlist_lock and process completions and submission from
>> there.
>
> If you look at the next patch, you will see that's what I did do :)

I have just about started to understand (or pretend to understand) the 
current code. I don't dare to look at a complete rewrite.

So maybe start with this one and go with small incremental changes?

>>> -	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
>>> +		spin_unlock(&ring->execlist_lock);
>>
>> Would it be worth trying to grab the mutex and unpin the LRCs at
>> this point? It would slow down the thread a bit but would get rid of
>> the retired work queue. With the vma->iomap thingy could be quite
>> cheap?
>
> Exactly, we want the iomap/vmap caching thingy first :) But the
> retired work queue disappears as a fallout of your previous-context idea
> anyway plus the fix to avoid the struct_mutex when freeing requests.

I did not get that working yet. I think it need N previous contexts 
pinned in a request, where N is equal to CSB size. Since most 
pessimistic thinking is we could get that many context complete events 
in a single interrupt.

Regards,

Tvrtko
Chris Wilson Feb. 19, 2016, 2:34 p.m. UTC | #4
On Fri, Feb 19, 2016 at 02:10:44PM +0000, Tvrtko Ursulin wrote:
> On 19/02/16 12:29, Chris Wilson wrote:
> >Exactly, we want the iomap/vmap caching thingy first :) But the
> >retired work queue disappears as a fallout of your previous-context idea
> >anyway plus the fix to avoid the struct_mutex when freeing requests.
> 
> I did not get that working yet. I think it need N previous contexts
> pinned in a request, where N is equal to CSB size. Since most
> pessimistic thinking is we could get that many context complete
> events in a single interrupt.

The completion order is still the same as our execution order (it has to
be otherwise it violates our serialisation rules), so worst case is you
only need to keep the engine->last_context pinned along with active
references on the outstanding contexts (and those active references are
held by the following request).
-Chris
Chris Wilson Feb. 19, 2016, 2:41 p.m. UTC | #5
On Fri, Feb 19, 2016 at 02:10:44PM +0000, Tvrtko Ursulin wrote:
> 
> On 19/02/16 12:29, Chris Wilson wrote:
> >On Fri, Feb 19, 2016 at 12:08:14PM +0000, Tvrtko Ursulin wrote:
> >>
> >>Hi,
> >>
> >>On 11/01/16 10:44, Chris Wilson wrote:
> >>>[  196.988204] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> >>>[  196.988512] clocksource:                       'refined-jiffies' wd_now: ffff9b48 wd_last: ffff9acb mask: ffffffff
> >>>[  196.988559] clocksource:                       'tsc' cs_now: 4fcfa84354 cs_last: 4f95425e98 mask: ffffffffffffffff
> >>>[  196.992115] clocksource: Switched to clocksource refined-jiffies
> >>>
> >>>Followed by a hard lockup.
> >>
> >>What does the above mean? Irq handler taking too long interferes
> >>with time keeping ?
> >
> >That's exactly what it means, we run for too long in interrupt context
> >(i.e. with interrupts disabled).
> 
> Okay, just please spell it out in the commit.
> 
> >>I like it BTW. Just the commit message needs more work. :)
> >>
> >>How is performance impact with just this patch in isolation? Worth
> >>progressing it on its own?
> >
> >I only looked for regressions, which I didn't find. It fixes a machine
> >freeze/panic, so I wasn't looking for any other reason to justify the
> >patch!
> 
> Then both of the above also need to be documented in the commit message.

Hah, one thing I just rediscovered was that the benchmarks for this kill
the machine without the patch.
-Chris
Tvrtko Ursulin Feb. 19, 2016, 2:52 p.m. UTC | #6
On 19/02/16 14:34, Chris Wilson wrote:
> On Fri, Feb 19, 2016 at 02:10:44PM +0000, Tvrtko Ursulin wrote:
>> On 19/02/16 12:29, Chris Wilson wrote:
>>> Exactly, we want the iomap/vmap caching thingy first :) But the
>>> retired work queue disappears as a fallout of your previous-context idea
>>> anyway plus the fix to avoid the struct_mutex when freeing requests.
>>
>> I did not get that working yet. I think it need N previous contexts
>> pinned in a request, where N is equal to CSB size. Since most
>> pessimistic thinking is we could get that many context complete
>> events in a single interrupt.
>
> The completion order is still the same as our execution order (it has to
> be otherwise it violates our serialisation rules), so worst case is you
> only need to keep the engine->last_context pinned along with active
> references on the outstanding contexts (and those active references are
> held by the following request).

Yes completion order is the same but seqno in HWS could theoretically be 
ahead of the context completes by the size of CSB I thought.

Theory is that GPU could have completed N contexts, stuffed the 
notifications in the CSB and then maybe the actual interrupt got 
coalesced, delayed, or something. At least I thought this was what I was 
observing. Not 100% sure.

But it was only a problem in the current code, where I tried to unpin 
the previous contexts from the seqno based timeline of course.

Regards,

Tvrtko
Chris Wilson Feb. 19, 2016, 3:02 p.m. UTC | #7
On Fri, Feb 19, 2016 at 02:52:18PM +0000, Tvrtko Ursulin wrote:
> 
> On 19/02/16 14:34, Chris Wilson wrote:
> >On Fri, Feb 19, 2016 at 02:10:44PM +0000, Tvrtko Ursulin wrote:
> >>On 19/02/16 12:29, Chris Wilson wrote:
> >>>Exactly, we want the iomap/vmap caching thingy first :) But the
> >>>retired work queue disappears as a fallout of your previous-context idea
> >>>anyway plus the fix to avoid the struct_mutex when freeing requests.
> >>
> >>I did not get that working yet. I think it need N previous contexts
> >>pinned in a request, where N is equal to CSB size. Since most
> >>pessimistic thinking is we could get that many context complete
> >>events in a single interrupt.
> >
> >The completion order is still the same as our execution order (it has to
> >be otherwise it violates our serialisation rules), so worst case is you
> >only need to keep the engine->last_context pinned along with active
> >references on the outstanding contexts (and those active references are
> >held by the following request).
> 
> Yes completion order is the same but seqno in HWS could
> theoretically be ahead of the context completes by the size of CSB I
> thought.
> 
> Theory is that GPU could have completed N contexts, stuffed the
> notifications in the CSB and then maybe the actual interrupt got
> coalesced, delayed, or something. At least I thought this was what I
> was observing. Not 100% sure.

Aiui, the issue is where we unpin (because of breadcrumb completion)
before the HW completes (finishes writing out the context). For legacy,
we rely on that the MI_SET_CONTEXT is a serialising instruction (i.e.
all writes are completed to the old context before execution continues)
and only then need to keep the old context alive until the request
containing the MI_SET_CONTEXT away is complete. For the same techinque
to be applicable to execlists, just needs the same guarantee that the
hardware will not execute from the next context until it has finished
saving state from the previous context.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 378bc73296aa..15a6fddfb79b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2094,7 +2094,6 @@  static int i915_execlists(struct seq_file *m, void *data)
 	for_each_ring(ring, dev_priv, ring_id) {
 		struct drm_i915_gem_request *head_req = NULL;
 		int count = 0;
-		unsigned long flags;
 
 		seq_printf(m, "%s\n", ring->name);
 
@@ -2121,12 +2120,12 @@  static int i915_execlists(struct seq_file *m, void *data)
 				   i, status, ctx_id);
 		}
 
-		spin_lock_irqsave(&ring->execlist_lock, flags);
+		spin_lock(&ring->execlist_lock);
 		list_for_each(cursor, &ring->execlist_queue)
 			count++;
 		head_req = list_first_entry_or_null(&ring->execlist_queue,
 				struct drm_i915_gem_request, execlist_link);
-		spin_unlock_irqrestore(&ring->execlist_lock, flags);
+		spin_unlock(&ring->execlist_lock);
 
 		seq_printf(m, "\t%d requests in queue\n", count);
 		if (head_req) {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 391f840d29b7..eb875ecd7907 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2192,13 +2192,13 @@  static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
 	 */
 
 	if (i915.enable_execlists) {
-		spin_lock_irq(&engine->execlist_lock);
+		spin_lock(&engine->execlist_lock);
 
 		/* list_splice_tail_init checks for empty lists */
 		list_splice_tail_init(&engine->execlist_queue,
 				      &engine->execlist_retired_req_list);
 
-		spin_unlock_irq(&engine->execlist_lock);
+		spin_unlock(&engine->execlist_lock);
 		intel_execlists_retire_requests(engine);
 	}
 
@@ -2290,15 +2290,8 @@  i915_gem_retire_requests(struct drm_device *dev)
 	for_each_ring(ring, dev_priv, i) {
 		i915_gem_retire_requests_ring(ring);
 		idle &= list_empty(&ring->request_list);
-		if (i915.enable_execlists) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&ring->execlist_lock, flags);
-			idle &= list_empty(&ring->execlist_queue);
-			spin_unlock_irqrestore(&ring->execlist_lock, flags);
-
-			intel_execlists_retire_requests(ring);
-		}
+		if (i915.enable_execlists)
+			idle &= intel_execlists_retire_requests(ring);
 	}
 
 	if (idle)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ce047ac84f5f..b2ef2d0c211b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1316,7 +1316,7 @@  gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
 	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
 		notify_ring(ring);
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
-		intel_lrc_irq_handler(ring);
+		wake_up_process(ring->execlists_submit);
 }
 
 static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b5f62b5f4913..de5889e95d6d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -132,6 +132,8 @@ 
  *
  */
 
+#include <linux/kthread.h>
+
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
@@ -341,7 +343,7 @@  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	rq0->elsp_submitted++;
 
 	/* You must always write both descriptors in the order below. */
-	spin_lock(&dev_priv->uncore.lock);
+	spin_lock_irq(&dev_priv->uncore.lock);
 	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
 	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
 	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
@@ -353,7 +355,7 @@  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	/* ELSP is a wo register, use another nearby reg for posting */
 	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
 	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-	spin_unlock(&dev_priv->uncore.lock);
+	spin_unlock_irq(&dev_priv->uncore.lock);
 }
 
 static int execlists_update_context(struct drm_i915_gem_request *rq)
@@ -492,89 +494,84 @@  static bool execlists_check_remove_request(struct intel_engine_cs *ring,
 	return false;
 }
 
-static void get_context_status(struct intel_engine_cs *ring,
-			       u8 read_pointer,
-			       u32 *status, u32 *context_id)
+static void set_rtpriority(void)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-
-	if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
-		return;
-
-	*status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
-	*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
+	 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2-1 };
+	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
 }
 
-/**
- * intel_lrc_irq_handler() - handle Context Switch interrupts
- * @ring: Engine Command Streamer to handle.
- *
- * Check the unread Context Status Buffers and manage the submission of new
- * contexts to the ELSP accordingly.
- */
-void intel_lrc_irq_handler(struct intel_engine_cs *ring)
+static int intel_execlists_submit(void *arg)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 status_pointer;
-	u8 read_pointer;
-	u8 write_pointer;
-	u32 status = 0;
-	u32 status_id;
-	u32 submit_contexts = 0;
+	struct intel_engine_cs *ring = arg;
+	struct drm_i915_private *dev_priv = ring->i915;
 
-	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
+	set_rtpriority();
 
-	read_pointer = ring->next_context_status_buffer;
-	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
-	if (read_pointer > write_pointer)
-		write_pointer += GEN8_CSB_ENTRIES;
+	do {
+		u32 status;
+		u32 status_id;
+		u32 submit_contexts;
+		u8 head, tail;
 
-	spin_lock(&ring->execlist_lock);
+		set_current_state(TASK_INTERRUPTIBLE);
+		head = ring->next_context_status_buffer;
+		tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
+		if (head == tail) {
+			if (kthread_should_stop())
+				return 0;
 
-	while (read_pointer < write_pointer) {
+			schedule();
+			continue;
+		}
+		__set_current_state(TASK_RUNNING);
 
-		get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
-				   &status, &status_id);
+		if (head > tail)
+			tail += GEN8_CSB_ENTRIES;
 
-		if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-			continue;
+		status = 0;
+		submit_contexts = 0;
 
-		if (status & GEN8_CTX_STATUS_PREEMPTED) {
-			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-				if (execlists_check_remove_request(ring, status_id))
-					WARN(1, "Lite Restored request removed from queue\n");
-			} else
-				WARN(1, "Preemption without Lite Restore\n");
-		}
+		spin_lock(&ring->execlist_lock);
 
-		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
-		    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
-			if (execlists_check_remove_request(ring, status_id))
-				submit_contexts++;
-		}
-	}
+		while (head++ < tail) {
+			status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
+			status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
 
-	if (disable_lite_restore_wa(ring)) {
-		/* Prevent a ctx to preempt itself */
-		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
-		    (submit_contexts != 0))
-			execlists_context_unqueue(ring);
-	} else if (submit_contexts != 0) {
-		execlists_context_unqueue(ring);
-	}
+			if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
+				continue;
 
-	spin_unlock(&ring->execlist_lock);
+			if (status & GEN8_CTX_STATUS_PREEMPTED) {
+				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
+					if (execlists_check_remove_request(ring, status_id))
+						WARN(1, "Lite Restored request removed from queue\n");
+				} else
+					WARN(1, "Preemption without Lite Restore\n");
+			}
 
-	if (unlikely(submit_contexts > 2))
-		DRM_ERROR("More than two context complete events?\n");
+			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
+			    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
+				if (execlists_check_remove_request(ring, status_id))
+					submit_contexts++;
+			}
+		}
+
+		if (disable_lite_restore_wa(ring)) {
+			/* Prevent a ctx to preempt itself */
+			if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
+					(submit_contexts != 0))
+				execlists_context_unqueue(ring);
+		} else if (submit_contexts != 0) {
+			execlists_context_unqueue(ring);
+		}
 
-	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
+		spin_unlock(&ring->execlist_lock);
 
-	/* Update the read pointer to the old write pointer. Manual ringbuffer
-	 * management ftw </sarcasm> */
-	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
-		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-				 ring->next_context_status_buffer << 8));
+		WARN(submit_contexts > 2, "More than two context complete events?\n");
+		ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
+		I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
+			   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
+					 ring->next_context_status_buffer<<8));
+	} while (1);
 }
 
 static int execlists_context_queue(struct drm_i915_gem_request *request)
@@ -585,7 +582,7 @@  static int execlists_context_queue(struct drm_i915_gem_request *request)
 
 	i915_gem_request_get(request);
 
-	spin_lock_irq(&engine->execlist_lock);
+	spin_lock(&engine->execlist_lock);
 
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
 		if (++num_elements > 2)
@@ -611,7 +608,7 @@  static int execlists_context_queue(struct drm_i915_gem_request *request)
 	if (num_elements == 0)
 		execlists_context_unqueue(engine);
 
-	spin_unlock_irq(&engine->execlist_lock);
+	spin_unlock(&engine->execlist_lock);
 
 	return 0;
 }
@@ -667,19 +664,19 @@  intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 		execlists_context_queue(request);
 }
 
-void intel_execlists_retire_requests(struct intel_engine_cs *ring)
+bool intel_execlists_retire_requests(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *req, *tmp;
 	struct list_head retired_list;
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
 	if (list_empty(&ring->execlist_retired_req_list))
-		return;
+		goto out;
 
 	INIT_LIST_HEAD(&retired_list);
-	spin_lock_irq(&ring->execlist_lock);
+	spin_lock(&ring->execlist_lock);
 	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
-	spin_unlock_irq(&ring->execlist_lock);
+	spin_unlock(&ring->execlist_lock);
 
 	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
 		struct intel_context *ctx = req->ctx;
@@ -691,6 +688,9 @@  void intel_execlists_retire_requests(struct intel_engine_cs *ring)
 		list_del(&req->execlist_link);
 		i915_gem_request_put(req);
 	}
+
+out:
+	return list_empty(&ring->execlist_queue);
 }
 
 void intel_logical_ring_stop(struct intel_engine_cs *ring)
@@ -1525,6 +1525,9 @@  void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 	if (!intel_engine_initialized(ring))
 		return;
 
+	if (ring->execlists_submit)
+		kthread_stop(ring->execlists_submit);
+
 	if (ring->buffer) {
 		struct drm_i915_private *dev_priv = ring->i915;
 		intel_logical_ring_stop(ring);
@@ -1550,13 +1553,15 @@  void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 
 static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
 {
+	struct drm_i915_private *dev_priv = to_i915(dev);
+	struct task_struct *task;
 	int ret;
 
 	/* Intentionally left blank. */
 	ring->buffer = NULL;
 
 	ring->dev = dev;
-	ring->i915 = to_i915(dev);
+	ring->i915 = dev_priv;
 	ring->fence_context = fence_context_alloc(1);
 	INIT_LIST_HEAD(&ring->request_list);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
@@ -1587,6 +1592,15 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 		goto error;
 	}
 
+	ring->next_context_status_buffer =
+			I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
+	task = kthread_run(intel_execlists_submit, ring,
+			   "irq/i915:%de", ring->id);
+	if (IS_ERR(task))
+		goto error;
+
+	ring->execlists_submit = task;
+
 	return 0;
 
 error:
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 87bc9acc4224..33f82a84065a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -81,7 +81,6 @@  uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
-void intel_lrc_irq_handler(struct intel_engine_cs *ring);
-void intel_execlists_retire_requests(struct intel_engine_cs *ring);
+bool intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index bb92d831a100..edaf07b2292e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -291,6 +291,7 @@  struct intel_engine_cs {
 	} semaphore;
 
 	/* Execlists */
+	struct task_struct *execlists_submit;
 	spinlock_t execlist_lock;
 	struct list_head execlist_queue;
 	struct list_head execlist_retired_req_list;