
[31/43] drm/i915/bdw: Handle context switch events

Message ID 1406217891-8912-32-git-send-email-thomas.daniel@intel.com (mailing list archive)
State New, archived

Commit Message

Thomas Daniel July 24, 2014, 4:04 p.m. UTC
Handle all context status events in the context status buffer on every
context switch interrupt. We only remove work from the execlist queue
after a context status buffer reports that it has completed and we only
attempt to schedule new contexts on interrupt when a previously submitted
context completes (unless no contexts are queued, which means the GPU is
free).

We cannot call intel_runtime_pm_get() in an interrupt (or with a spinlock
grabbed, FWIW), because it might sleep, which is not a nice thing to do.
Instead, do the runtime_pm get/put together with the create/destroy request,
and handle the forcewake get/put directly.

Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>

v2: Unreferencing the context when we are freeing the request might free
the backing bo, which requires the struct_mutex to be grabbed, so defer
unreferencing and freeing to a bottom half.

v3:
- Ack the interrupt immediately, before trying to handle it (fix for
missing interrupts by Bob Beckett <robert.beckett@intel.com>).
- Update the Context Status Buffer Read Pointer, just in case (spotted
by Damien Lespiau).

v4: New namespace and multiple rebase changes.

v5: Squash with "drm/i915/bdw: Do not call intel_runtime_pm_get() in an
interrupt", as suggested by Daniel.

Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   35 ++++++---
 drivers/gpu/drm/i915/intel_lrc.c        |  129 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.h        |    3 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
 4 files changed, 151 insertions(+), 17 deletions(-)
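
The context status buffer handled by this patch is a small per-engine ring of six entries, indexed by a software-tracked read pointer and a hardware-reported write pointer. The following is a minimal, standalone C sketch of just the wrap-around arithmetic that intel_execlists_handle_ctx_events() uses below; the six-entry ring and the 3-bit pointer mask come from the patch, while the helper itself and its names are purely illustrative.

#include <stdio.h>

#define CSB_ENTRIES	6	/* six status entries per engine, as in the patch */
#define CSB_PTR_MASK	0x07

/* Consume every status entry between the software read pointer and the
 * hardware write pointer.  If the read pointer is already past the write
 * pointer the hardware has wrapped, so bias the write pointer by the ring
 * size, exactly as the patch does before its while loop.
 */
static void consume_csb(unsigned char *read_pointer, unsigned int status_pointer)
{
	unsigned char write_pointer = status_pointer & CSB_PTR_MASK;

	if (*read_pointer > write_pointer)
		write_pointer += CSB_ENTRIES;

	while (*read_pointer < write_pointer) {
		(*read_pointer)++;
		/* the driver reads the status/status_id registers at
		 * offset (*read_pointer % CSB_ENTRIES) * 8 here */
		printf("consume CSB slot %u\n", *read_pointer % CSB_ENTRIES);
	}

	*read_pointer = write_pointer % CSB_ENTRIES;
}

int main(void)
{
	unsigned char read_pointer = 5;		/* last slot consumed was 5 */

	consume_csb(&read_pointer, 1);		/* hardware write pointer wrapped to 1 */
	printf("next read pointer: %u\n", read_pointer);	/* prints 1 */
	return 0;
}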

Comments

Daniel Vetter Aug. 14, 2014, 8:13 p.m. UTC | #1
On Thu, Jul 24, 2014 at 05:04:39PM +0100, Thomas Daniel wrote:
> Handle all context status events in the context status buffer on every
> context switch interrupt. We only remove work from the execlist queue
> after a context status buffer reports that it has completed and we only
> attempt to schedule new contexts on interrupt when a previously submitted
> context completes (unless no contexts are queued, which means the GPU is
> free).
> 
> We cannot call intel_runtime_pm_get() in an interrupt (or with a spinlock
> grabbed, FWIW), because it might sleep, which is not a nice thing to do.
> Instead, do the runtime_pm get/put together with the create/destroy request,
> and handle the forcewake get/put directly.
> 
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> 
> v2: Unreferencing the context when we are freeing the request might free
> the backing bo, which requires the struct_mutex to be grabbed, so defer
> unreferencing and freeing to a bottom half.
> 
> v3:
> - Ack the interrupt immediately, before trying to handle it (fix for
> missing interrupts by Bob Beckett <robert.beckett@intel.com>).
> - Update the Context Status Buffer Read Pointer, just in case (spotted
> by Damien Lespiau).
> 
> v4: New namespace and multiple rebase changes.
> 
> v5: Squash with "drm/i915/bdw: Do not call intel_runtime_pm_get() in an
> interrupt", as suggested by Daniel.
> 
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   35 ++++++---
>  drivers/gpu/drm/i915/intel_lrc.c        |  129 +++++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_lrc.h        |    3 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
>  4 files changed, 151 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index f77a4ca..e4077d1 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1628,6 +1628,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  				       struct drm_i915_private *dev_priv,
>  				       u32 master_ctl)
>  {
> +	struct intel_engine_cs *ring;
>  	u32 rcs, bcs, vcs;
>  	uint32_t tmp = 0;
>  	irqreturn_t ret = IRQ_NONE;
> @@ -1637,14 +1638,20 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(0), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			rcs = tmp >> GEN8_RCS_IRQ_SHIFT;
> -			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[RCS];
>  			if (rcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[RCS]);
> +				notify_ring(dev, ring);
> +			if (rcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> +				intel_execlists_handle_ctx_events(ring);
> +
> +			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[BCS];
>  			if (bcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[BCS]);
> -			if ((rcs | bcs) & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				notify_ring(dev, ring);
> +			if (bcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> +				intel_execlists_handle_ctx_events(ring);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT0)!\n");
>  	}
> @@ -1654,16 +1661,20 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(1), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			vcs = tmp >> GEN8_VCS1_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VCS];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VCS]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
> +
>  			vcs = tmp >> GEN8_VCS2_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VCS2];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VCS2]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT1)!\n");
>  	}
> @@ -1684,11 +1695,13 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(3), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			vcs = tmp >> GEN8_VECS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VECS];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VECS]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT3)!\n");
>  	}

The above stuff is dropping off the left edge a bit. And it's all so
super-nicely laid out thanks to the hw engineers, so with a bit of code
refactoring we should be able to have one generic ring irq handler and 5
callers for it, and rip all this out.

Can you please do that?
-Daniel
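
As a hedged sketch of the refactoring being asked for here (gen8_cs_irq_handler is a hypothetical name; notify_ring(), intel_execlists_handle_ctx_events() and the interrupt bits are the ones already used in the patch), the shared helper could look roughly like:

/* Hypothetical shared per-engine helper: each GT IIR bank shifts the IIR
 * value down to the per-engine bits and hands it over with its engine.
 */
static void gen8_cs_irq_handler(struct drm_device *dev,
				struct intel_engine_cs *ring, u32 iir)
{
	if (iir & GT_RENDER_USER_INTERRUPT)
		notify_ring(dev, ring);
	if (iir & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
		intel_execlists_handle_ctx_events(ring);
}

/* ...after which each bank of gen8_gt_irq_handler() reduces to calls like: */
gen8_cs_irq_handler(dev, &dev_priv->ring[RCS], tmp >> GEN8_RCS_IRQ_SHIFT);
gen8_cs_irq_handler(dev, &dev_priv->ring[BCS], tmp >> GEN8_BCS_IRQ_SHIFT);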

> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9e91169..65f4f26 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -49,6 +49,22 @@
>  #define RING_ELSP(ring)			((ring)->mmio_base+0x230)
>  #define RING_EXECLIST_STATUS(ring)	((ring)->mmio_base+0x234)
>  #define RING_CONTEXT_CONTROL(ring)	((ring)->mmio_base+0x244)
> +#define RING_CONTEXT_STATUS_BUF(ring)	((ring)->mmio_base+0x370)
> +#define RING_CONTEXT_STATUS_PTR(ring)	((ring)->mmio_base+0x3a0)
> +
> +#define RING_EXECLIST_QFULL		(1 << 0x2)
> +#define RING_EXECLIST1_VALID		(1 << 0x3)
> +#define RING_EXECLIST0_VALID		(1 << 0x4)
> +#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
> +#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
> +#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
> +
> +#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
> +#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
> +#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
> +#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
> +#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
> +#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
>  
>  #define CTX_LRI_HEADER_0		0x01
>  #define CTX_CONTEXT_CONTROL		0x02
> @@ -147,6 +163,7 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	uint64_t temp = 0;
>  	uint32_t desc[4];
> +	unsigned long flags;
>  
>  	/* XXX: You must always write both descriptors in the order below. */
>  	if (ctx_obj1)
> @@ -160,9 +177,17 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	desc[3] = (u32)(temp >> 32);
>  	desc[2] = (u32)temp;
>  
> -	/* Set Force Wakeup bit to prevent GT from entering C6 while
> -	 * ELSP writes are in progress */
> -	gen6_gt_force_wake_get(dev_priv, FORCEWAKE_ALL);
> +	/* Set Force Wakeup bit to prevent GT from entering C6 while ELSP writes
> +	 * are in progress.
> +	 *
> +	 * The other problem is that we can't just call gen6_gt_force_wake_get()
> +	 * because that function calls intel_runtime_pm_get(), which might sleep.
> +	 * Instead, we do the runtime_pm_get/put when creating/destroying requests.
> +	 */
> +	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +	if (dev_priv->uncore.forcewake_count++ == 0)
> +		dev_priv->uncore.funcs.force_wake_get(dev_priv, FORCEWAKE_ALL);
> +	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
>  
>  	I915_WRITE(RING_ELSP(ring), desc[1]);
>  	I915_WRITE(RING_ELSP(ring), desc[0]);
> @@ -173,7 +198,11 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	/* ELSP is a wo register, so use another nearby reg for posting instead */
>  	POSTING_READ(RING_EXECLIST_STATUS(ring));
>  
> -	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
> +	/* Release Force Wakeup (see the big comment above). */
> +	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +	if (--dev_priv->uncore.forcewake_count == 0)
> +		dev_priv->uncore.funcs.force_wake_put(dev_priv, FORCEWAKE_ALL);
> +	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
>  }
>  
>  static int execlists_ctx_write_tail(struct drm_i915_gem_object *ctx_obj, u32 tail)
> @@ -221,6 +250,9 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  {
>  	struct intel_ctx_submit_request *req0 = NULL, *req1 = NULL;
>  	struct intel_ctx_submit_request *cursor = NULL, *tmp = NULL;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> +	assert_spin_locked(&ring->execlist_lock);
>  
>  	if (list_empty(&ring->execlist_queue))
>  		return;
> @@ -233,8 +265,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			/* Same ctx: ignore first request, as second request
>  			 * will update tail past first request's workload */
>  			list_del(&req0->execlist_link);
> -			i915_gem_context_unreference(req0->ctx);
> -			kfree(req0);
> +			queue_work(dev_priv->wq, &req0->work);
>  			req0 = cursor;
>  		} else {
>  			req1 = cursor;
> @@ -246,6 +277,89 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			req1? req1->ctx : NULL, req1? req1->tail : 0));
>  }
>  
> +static bool execlists_check_remove_request(struct intel_engine_cs *ring,
> +					   u32 request_id)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct intel_ctx_submit_request *head_req;
> +
> +	assert_spin_locked(&ring->execlist_lock);
> +
> +	head_req = list_first_entry_or_null(&ring->execlist_queue,
> +			struct intel_ctx_submit_request, execlist_link);
> +	if (head_req != NULL) {
> +		struct drm_i915_gem_object *ctx_obj =
> +				head_req->ctx->engine[ring->id].state;
> +		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
> +			list_del(&head_req->execlist_link);
> +			queue_work(dev_priv->wq, &head_req->work);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	u32 status_pointer;
> +	u8 read_pointer;
> +	u8 write_pointer;
> +	u32 status;
> +	u32 status_id;
> +	u32 submit_contexts = 0;
> +
> +	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> +
> +	read_pointer = ring->next_context_status_buffer;
> +	write_pointer = status_pointer & 0x07;
> +	if (read_pointer > write_pointer)
> +		write_pointer += 6;
> +
> +	spin_lock(&ring->execlist_lock);
> +
> +	while (read_pointer < write_pointer) {
> +		read_pointer++;
> +		status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8);
> +		status_id = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8 + 4);
> +
> +		if (status & GEN8_CTX_STATUS_COMPLETE) {
> +			if (execlists_check_remove_request(ring, status_id))
> +				submit_contexts++;
> +		}
> +	}
> +
> +	if (submit_contexts != 0)
> +		execlists_context_unqueue(ring);
> +
> +	spin_unlock(&ring->execlist_lock);
> +
> +	WARN(submit_contexts > 2, "More than two context complete events?\n");
> +	ring->next_context_status_buffer = write_pointer % 6;
> +
> +	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> +			((u32)ring->next_context_status_buffer & 0x07) << 8);
> +}
> +
> +static void execlists_free_request_task(struct work_struct *work)
> +{
> +	struct intel_ctx_submit_request *req =
> +			container_of(work, struct intel_ctx_submit_request, work);
> +	struct drm_device *dev = req->ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	intel_runtime_pm_put(dev_priv);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	i915_gem_context_unreference(req->ctx);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	kfree(req);
> +}
> +
>  static int execlists_context_queue(struct intel_engine_cs *ring,
>  				   struct intel_context *to,
>  				   u32 tail)
> @@ -261,6 +375,8 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
>  	i915_gem_context_reference(req->ctx);
>  	req->ring = ring;
>  	req->tail = tail;
> +	INIT_WORK(&req->work, execlists_free_request_task);
> +	intel_runtime_pm_get(dev_priv);
>  
>  	spin_lock_irqsave(&ring->execlist_lock, flags);
>  
> @@ -908,6 +1024,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>  
>  	INIT_LIST_HEAD(&ring->execlist_queue);
>  	spin_lock_init(&ring->execlist_lock);
> +	ring->next_context_status_buffer = 0;
>  
>  	ret = intel_lr_context_deferred_create(dctx, ring);
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 14492a9..2e8929f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -66,6 +66,9 @@ struct intel_ctx_submit_request {
>  	u32 tail;
>  
>  	struct list_head execlist_link;
> +	struct work_struct work;
>  };
>  
> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring);
> +
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 6358823..905d1ba 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -225,6 +225,7 @@ struct  intel_engine_cs {
>  	/* Execlists */
>  	spinlock_t execlist_lock;
>  	struct list_head execlist_queue;
> +	u8 next_context_status_buffer;
>  	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>  	int		(*emit_request)(struct intel_ringbuffer *ringbuf);
>  	int		(*emit_flush)(struct intel_ringbuffer *ringbuf,
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniel Vetter Aug. 14, 2014, 8:17 p.m. UTC | #2
On Thu, Jul 24, 2014 at 05:04:39PM +0100, Thomas Daniel wrote:
> Handle all context status events in the context status buffer on every
> context switch interrupt. We only remove work from the execlist queue
> after a context status buffer reports that it has completed and we only
> attempt to schedule new contexts on interrupt when a previously submitted
> context completes (unless no contexts are queued, which means the GPU is
> free).
> 
> We cannot call intel_runtime_pm_get() in an interrupt (or with a spinlock
> grabbed, FWIW), because it might sleep, which is not a nice thing to do.
> Instead, do the runtime_pm get/put together with the create/destroy request,
> and handle the forcewake get/put directly.
> 
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> 
> v2: Unreferencing the context when we are freeing the request might free
> the backing bo, which requires the struct_mutex to be grabbed, so defer
> unreferencing and freeing to a bottom half.
> 
> v3:
> - Ack the interrupt immediately, before trying to handle it (fix for
> missing interrupts by Bob Beckett <robert.beckett@intel.com>).
> - Update the Context Status Buffer Read Pointer, just in case (spotted
> by Damien Lespiau).
> 
> v4: New namespace and multiple rebase changes.
> 
> v5: Squash with "drm/i915/bdw: Do not call intel_runtime_pm_get() in an
> interrupt", as suggested by Daniel.
> 
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   35 ++++++---
>  drivers/gpu/drm/i915/intel_lrc.c        |  129 +++++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_lrc.h        |    3 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
>  4 files changed, 151 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index f77a4ca..e4077d1 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1628,6 +1628,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  				       struct drm_i915_private *dev_priv,
>  				       u32 master_ctl)
>  {
> +	struct intel_engine_cs *ring;
>  	u32 rcs, bcs, vcs;
>  	uint32_t tmp = 0;
>  	irqreturn_t ret = IRQ_NONE;
> @@ -1637,14 +1638,20 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(0), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			rcs = tmp >> GEN8_RCS_IRQ_SHIFT;
> -			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[RCS];
>  			if (rcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[RCS]);
> +				notify_ring(dev, ring);
> +			if (rcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> +				intel_execlists_handle_ctx_events(ring);
> +
> +			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[BCS];
>  			if (bcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[BCS]);
> -			if ((rcs | bcs) & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				notify_ring(dev, ring);
> +			if (bcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> +				intel_execlists_handle_ctx_events(ring);

Also a patch split fail here: the above two cases should have been in an
earlier patch that added the basic irq handling.
-Daniel

>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT0)!\n");
>  	}
> @@ -1654,16 +1661,20 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(1), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			vcs = tmp >> GEN8_VCS1_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VCS];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VCS]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
> +
>  			vcs = tmp >> GEN8_VCS2_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VCS2];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VCS2]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT1)!\n");
>  	}
> @@ -1684,11 +1695,13 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
>  		if (tmp) {
>  			I915_WRITE(GEN8_GT_IIR(3), tmp);
>  			ret = IRQ_HANDLED;
> +
>  			vcs = tmp >> GEN8_VECS_IRQ_SHIFT;
> +			ring = &dev_priv->ring[VECS];
>  			if (vcs & GT_RENDER_USER_INTERRUPT)
> -				notify_ring(dev, &dev_priv->ring[VECS]);
> +				notify_ring(dev, ring);
>  			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
> -				DRM_DEBUG_DRIVER("TODO: Context switch\n");
> +				intel_execlists_handle_ctx_events(ring);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT3)!\n");
>  	}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9e91169..65f4f26 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -49,6 +49,22 @@
>  #define RING_ELSP(ring)			((ring)->mmio_base+0x230)
>  #define RING_EXECLIST_STATUS(ring)	((ring)->mmio_base+0x234)
>  #define RING_CONTEXT_CONTROL(ring)	((ring)->mmio_base+0x244)
> +#define RING_CONTEXT_STATUS_BUF(ring)	((ring)->mmio_base+0x370)
> +#define RING_CONTEXT_STATUS_PTR(ring)	((ring)->mmio_base+0x3a0)
> +
> +#define RING_EXECLIST_QFULL		(1 << 0x2)
> +#define RING_EXECLIST1_VALID		(1 << 0x3)
> +#define RING_EXECLIST0_VALID		(1 << 0x4)
> +#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
> +#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
> +#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
> +
> +#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
> +#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
> +#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
> +#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
> +#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
> +#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
>  
>  #define CTX_LRI_HEADER_0		0x01
>  #define CTX_CONTEXT_CONTROL		0x02
> @@ -147,6 +163,7 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	uint64_t temp = 0;
>  	uint32_t desc[4];
> +	unsigned long flags;
>  
>  	/* XXX: You must always write both descriptors in the order below. */
>  	if (ctx_obj1)
> @@ -160,9 +177,17 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	desc[3] = (u32)(temp >> 32);
>  	desc[2] = (u32)temp;
>  
> -	/* Set Force Wakeup bit to prevent GT from entering C6 while
> -	 * ELSP writes are in progress */
> -	gen6_gt_force_wake_get(dev_priv, FORCEWAKE_ALL);
> +	/* Set Force Wakeup bit to prevent GT from entering C6 while ELSP writes
> +	 * are in progress.
> +	 *
> +	 * The other problem is that we can't just call gen6_gt_force_wake_get()
> +	 * because that function calls intel_runtime_pm_get(), which might sleep.
> +	 * Instead, we do the runtime_pm_get/put when creating/destroying requests.
> +	 */
> +	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +	if (dev_priv->uncore.forcewake_count++ == 0)
> +		dev_priv->uncore.funcs.force_wake_get(dev_priv, FORCEWAKE_ALL);
> +	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
>  
>  	I915_WRITE(RING_ELSP(ring), desc[1]);
>  	I915_WRITE(RING_ELSP(ring), desc[0]);
> @@ -173,7 +198,11 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>  	/* ELSP is a wo register, so use another nearby reg for posting instead */
>  	POSTING_READ(RING_EXECLIST_STATUS(ring));
>  
> -	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
> +	/* Release Force Wakeup (see the big comment above). */
> +	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +	if (--dev_priv->uncore.forcewake_count == 0)
> +		dev_priv->uncore.funcs.force_wake_put(dev_priv, FORCEWAKE_ALL);
> +	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
>  }
>  
>  static int execlists_ctx_write_tail(struct drm_i915_gem_object *ctx_obj, u32 tail)
> @@ -221,6 +250,9 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  {
>  	struct intel_ctx_submit_request *req0 = NULL, *req1 = NULL;
>  	struct intel_ctx_submit_request *cursor = NULL, *tmp = NULL;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> +	assert_spin_locked(&ring->execlist_lock);
>  
>  	if (list_empty(&ring->execlist_queue))
>  		return;
> @@ -233,8 +265,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			/* Same ctx: ignore first request, as second request
>  			 * will update tail past first request's workload */
>  			list_del(&req0->execlist_link);
> -			i915_gem_context_unreference(req0->ctx);
> -			kfree(req0);
> +			queue_work(dev_priv->wq, &req0->work);
>  			req0 = cursor;
>  		} else {
>  			req1 = cursor;
> @@ -246,6 +277,89 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			req1? req1->ctx : NULL, req1? req1->tail : 0));
>  }
>  
> +static bool execlists_check_remove_request(struct intel_engine_cs *ring,
> +					   u32 request_id)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct intel_ctx_submit_request *head_req;
> +
> +	assert_spin_locked(&ring->execlist_lock);
> +
> +	head_req = list_first_entry_or_null(&ring->execlist_queue,
> +			struct intel_ctx_submit_request, execlist_link);
> +	if (head_req != NULL) {
> +		struct drm_i915_gem_object *ctx_obj =
> +				head_req->ctx->engine[ring->id].state;
> +		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
> +			list_del(&head_req->execlist_link);
> +			queue_work(dev_priv->wq, &head_req->work);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	u32 status_pointer;
> +	u8 read_pointer;
> +	u8 write_pointer;
> +	u32 status;
> +	u32 status_id;
> +	u32 submit_contexts = 0;
> +
> +	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> +
> +	read_pointer = ring->next_context_status_buffer;
> +	write_pointer = status_pointer & 0x07;
> +	if (read_pointer > write_pointer)
> +		write_pointer += 6;
> +
> +	spin_lock(&ring->execlist_lock);
> +
> +	while (read_pointer < write_pointer) {
> +		read_pointer++;
> +		status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8);
> +		status_id = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8 + 4);
> +
> +		if (status & GEN8_CTX_STATUS_COMPLETE) {
> +			if (execlists_check_remove_request(ring, status_id))
> +				submit_contexts++;
> +		}
> +	}
> +
> +	if (submit_contexts != 0)
> +		execlists_context_unqueue(ring);
> +
> +	spin_unlock(&ring->execlist_lock);
> +
> +	WARN(submit_contexts > 2, "More than two context complete events?\n");
> +	ring->next_context_status_buffer = write_pointer % 6;
> +
> +	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> +			((u32)ring->next_context_status_buffer & 0x07) << 8);
> +}
> +
> +static void execlists_free_request_task(struct work_struct *work)
> +{
> +	struct intel_ctx_submit_request *req =
> +			container_of(work, struct intel_ctx_submit_request, work);
> +	struct drm_device *dev = req->ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	intel_runtime_pm_put(dev_priv);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	i915_gem_context_unreference(req->ctx);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	kfree(req);
> +}
> +
>  static int execlists_context_queue(struct intel_engine_cs *ring,
>  				   struct intel_context *to,
>  				   u32 tail)
> @@ -261,6 +375,8 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
>  	i915_gem_context_reference(req->ctx);
>  	req->ring = ring;
>  	req->tail = tail;
> +	INIT_WORK(&req->work, execlists_free_request_task);
> +	intel_runtime_pm_get(dev_priv);
>  
>  	spin_lock_irqsave(&ring->execlist_lock, flags);
>  
> @@ -908,6 +1024,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>  
>  	INIT_LIST_HEAD(&ring->execlist_queue);
>  	spin_lock_init(&ring->execlist_lock);
> +	ring->next_context_status_buffer = 0;
>  
>  	ret = intel_lr_context_deferred_create(dctx, ring);
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 14492a9..2e8929f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -66,6 +66,9 @@ struct intel_ctx_submit_request {
>  	u32 tail;
>  
>  	struct list_head execlist_link;
> +	struct work_struct work;
>  };
>  
> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring);
> +
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 6358823..905d1ba 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -225,6 +225,7 @@ struct  intel_engine_cs {
>  	/* Execlists */
>  	spinlock_t execlist_lock;
>  	struct list_head execlist_queue;
> +	u8 next_context_status_buffer;
>  	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>  	int		(*emit_request)(struct intel_ringbuffer *ringbuf);
>  	int		(*emit_flush)(struct intel_ringbuffer *ringbuf,
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniel Vetter Aug. 14, 2014, 8:28 p.m. UTC | #3
On Thu, Jul 24, 2014 at 05:04:39PM +0100, Thomas Daniel wrote:
> Handle all context status events in the context status buffer on every
> context switch interrupt. We only remove work from the execlist queue
> after a context status buffer reports that it has completed and we only
> attempt to schedule new contexts on interrupt when a previously submitted
> context completes (unless no contexts are queued, which means the GPU is
> free).
> 
> We cannot call intel_runtime_pm_get() in an interrupt (or with a spinlock
> grabbed, FWIW), because it might sleep, which is not a nice thing to do.
> Instead, do the runtime_pm get/put together with the create/destroy request,
> and handle the forcewake get/put directly.
> 
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> 
> v2: Unreferencing the context when we are freeing the request might free
> the backing bo, which requires the struct_mutex to be grabbed, so defer
> unreferencing and freeing to a bottom half.
> 
> v3:
> - Ack the interrupt immediately, before trying to handle it (fix for
> missing interrupts by Bob Beckett <robert.beckett@intel.com>).
> - Update the Context Status Buffer Read Pointer, just in case (spotted
> by Damien Lespiau).
> 
> v4: New namespace and multiple rebase changes.
> 
> v5: Squash with "drm/i915/bdw: Do not call intel_runtime_pm_get() in an
> interrupt", as suggested by Daniel.
> 
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>

> +static void execlists_free_request_task(struct work_struct *work)
> +{
> +	struct intel_ctx_submit_request *req =
> +			container_of(work, struct intel_ctx_submit_request, work);
> +	struct drm_device *dev = req->ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	intel_runtime_pm_put(dev_priv);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	i915_gem_context_unreference(req->ctx);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	kfree(req);
> +}

Launching a work item simply to unreference the context looks very
fishy. The context really can't possibly disappear before the last request
on it has completed, since the request already holds a reference.

That you have this additional reference here makes it look a bit like the
relationship and lifetime rules for the execlist queue item are misshapen.

I'd have expected:
- A given execlist queue item is responsible for a list of requests (one
  or more)
- Each request already holds onto the context.

The other thing that's strange here is the runtime pm handling. We already
keep the device awake as long as it's busy, so I wonder why exactly we
need this here in addition.

In any case these kinds of cleanup tasks are imo better done in the retire
request handler that we already have.

Imo needs some cleanup.
-Daniel
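
To illustrate the ownership model the review describes above (hypothetical structure and field names, not the actual i915 types), the expected relationship would be roughly:

/* Sketch only: the execlist queue item owns one or more requests, and each
 * request already pins its context for as long as it is outstanding, so no
 * extra context reference -- and no work item just to drop it -- should be
 * needed on the execlist side.
 */
struct execlist_queue_item {
	struct list_head requests;	/* one or more submitted requests */
	struct list_head execlist_link;
};

struct submitted_request {
	struct intel_context *ctx;	/* reference held for the request's lifetime */
	struct list_head queue_link;	/* membership in the queue item above */
};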
Daniel Vetter Aug. 14, 2014, 8:37 p.m. UTC | #4
On Thu, Jul 24, 2014 at 05:04:39PM +0100, Thomas Daniel wrote:
> Handle all context status events in the context status buffer on every
> context switch interrupt. We only remove work from the execlist queue
> after a context status buffer reports that it has completed and we only
> attempt to schedule new contexts on interrupt when a previously submitted
> context completes (unless no contexts are queued, which means the GPU is
> free).
> 
> We cannot call intel_runtime_pm_get() in an interrupt (or with a spinlock
> grabbed, FWIW), because it might sleep, which is not a nice thing to do.
> Instead, do the runtime_pm get/put together with the create/destroy request,
> and handle the forcewake get/put directly.
> 
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> 
> v2: Unreferencing the context when we are freeing the request might free
> the backing bo, which requires the struct_mutex to be grabbed, so defer
> unreferencing and freeing to a bottom half.
> 
> v3:
> - Ack the interrupt immediately, before trying to handle it (fix for
> missing interrupts by Bob Beckett <robert.beckett@intel.com>).
> - Update the Context Status Buffer Read Pointer, just in case (spotted
> by Damien Lespiau).
> 
> v4: New namespace and multiple rebase changes.
> 
> v5: Squash with "drm/i915/bdw: Do not call intel_runtime_pm_get() in an
> interrupt", as suggested by Daniel.
> 
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>

One more ...

> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring)

Please rename this to intel_execlist_ctx_events_irq_handler or similar for
consistency with all the other irq handler functions in a follow-up patch.
That kind of consistency helps a lot when reviewing the locking of
irq-save spinlocks.
-Daniel

> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	u32 status_pointer;
> +	u8 read_pointer;
> +	u8 write_pointer;
> +	u32 status;
> +	u32 status_id;
> +	u32 submit_contexts = 0;
> +
> +	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
> +
> +	read_pointer = ring->next_context_status_buffer;
> +	write_pointer = status_pointer & 0x07;
> +	if (read_pointer > write_pointer)
> +		write_pointer += 6;
> +
> +	spin_lock(&ring->execlist_lock);
> +
> +	while (read_pointer < write_pointer) {
> +		read_pointer++;
> +		status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8);
> +		status_id = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
> +				(read_pointer % 6) * 8 + 4);
> +
> +		if (status & GEN8_CTX_STATUS_COMPLETE) {
> +			if (execlists_check_remove_request(ring, status_id))
> +				submit_contexts++;
> +		}
> +	}
> +
> +	if (submit_contexts != 0)
> +		execlists_context_unqueue(ring);
> +
> +	spin_unlock(&ring->execlist_lock);
> +
> +	WARN(submit_contexts > 2, "More than two context complete events?\n");
> +	ring->next_context_status_buffer = write_pointer % 6;
> +
> +	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
> +			((u32)ring->next_context_status_buffer & 0x07) << 8);
> +}
> +
> +static void execlists_free_request_task(struct work_struct *work)
> +{
> +	struct intel_ctx_submit_request *req =
> +			container_of(work, struct intel_ctx_submit_request, work);
> +	struct drm_device *dev = req->ring->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	intel_runtime_pm_put(dev_priv);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	i915_gem_context_unreference(req->ctx);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	kfree(req);
> +}
> +
>  static int execlists_context_queue(struct intel_engine_cs *ring,
>  				   struct intel_context *to,
>  				   u32 tail)
> @@ -261,6 +375,8 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
>  	i915_gem_context_reference(req->ctx);
>  	req->ring = ring;
>  	req->tail = tail;
> +	INIT_WORK(&req->work, execlists_free_request_task);
> +	intel_runtime_pm_get(dev_priv);
>  
>  	spin_lock_irqsave(&ring->execlist_lock, flags);
>  
> @@ -908,6 +1024,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>  
>  	INIT_LIST_HEAD(&ring->execlist_queue);
>  	spin_lock_init(&ring->execlist_lock);
> +	ring->next_context_status_buffer = 0;
>  
>  	ret = intel_lr_context_deferred_create(dctx, ring);
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 14492a9..2e8929f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -66,6 +66,9 @@ struct intel_ctx_submit_request {
>  	u32 tail;
>  
>  	struct list_head execlist_link;
> +	struct work_struct work;
>  };
>  
> +void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring);
> +
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 6358823..905d1ba 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -225,6 +225,7 @@ struct  intel_engine_cs {
>  	/* Execlists */
>  	spinlock_t execlist_lock;
>  	struct list_head execlist_queue;
> +	u8 next_context_status_buffer;
>  	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>  	int		(*emit_request)(struct intel_ringbuffer *ringbuf);
>  	int		(*emit_flush)(struct intel_ringbuffer *ringbuf,
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f77a4ca..e4077d1 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1628,6 +1628,7 @@  static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
 				       struct drm_i915_private *dev_priv,
 				       u32 master_ctl)
 {
+	struct intel_engine_cs *ring;
 	u32 rcs, bcs, vcs;
 	uint32_t tmp = 0;
 	irqreturn_t ret = IRQ_NONE;
@@ -1637,14 +1638,20 @@  static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
 		if (tmp) {
 			I915_WRITE(GEN8_GT_IIR(0), tmp);
 			ret = IRQ_HANDLED;
+
 			rcs = tmp >> GEN8_RCS_IRQ_SHIFT;
-			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
+			ring = &dev_priv->ring[RCS];
 			if (rcs & GT_RENDER_USER_INTERRUPT)
-				notify_ring(dev, &dev_priv->ring[RCS]);
+				notify_ring(dev, ring);
+			if (rcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
+				intel_execlists_handle_ctx_events(ring);
+
+			bcs = tmp >> GEN8_BCS_IRQ_SHIFT;
+			ring = &dev_priv->ring[BCS];
 			if (bcs & GT_RENDER_USER_INTERRUPT)
-				notify_ring(dev, &dev_priv->ring[BCS]);
-			if ((rcs | bcs) & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
-				DRM_DEBUG_DRIVER("TODO: Context switch\n");
+				notify_ring(dev, ring);
+			if (bcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
+				intel_execlists_handle_ctx_events(ring);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT0)!\n");
 	}
@@ -1654,16 +1661,20 @@  static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
 		if (tmp) {
 			I915_WRITE(GEN8_GT_IIR(1), tmp);
 			ret = IRQ_HANDLED;
+
 			vcs = tmp >> GEN8_VCS1_IRQ_SHIFT;
+			ring = &dev_priv->ring[VCS];
 			if (vcs & GT_RENDER_USER_INTERRUPT)
-				notify_ring(dev, &dev_priv->ring[VCS]);
+				notify_ring(dev, ring);
 			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
-				DRM_DEBUG_DRIVER("TODO: Context switch\n");
+				intel_execlists_handle_ctx_events(ring);
+
 			vcs = tmp >> GEN8_VCS2_IRQ_SHIFT;
+			ring = &dev_priv->ring[VCS2];
 			if (vcs & GT_RENDER_USER_INTERRUPT)
-				notify_ring(dev, &dev_priv->ring[VCS2]);
+				notify_ring(dev, ring);
 			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
-				DRM_DEBUG_DRIVER("TODO: Context switch\n");
+				intel_execlists_handle_ctx_events(ring);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT1)!\n");
 	}
@@ -1684,11 +1695,13 @@  static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev,
 		if (tmp) {
 			I915_WRITE(GEN8_GT_IIR(3), tmp);
 			ret = IRQ_HANDLED;
+
 			vcs = tmp >> GEN8_VECS_IRQ_SHIFT;
+			ring = &dev_priv->ring[VECS];
 			if (vcs & GT_RENDER_USER_INTERRUPT)
-				notify_ring(dev, &dev_priv->ring[VECS]);
+				notify_ring(dev, ring);
 			if (vcs & GEN8_GT_CONTEXT_SWITCH_INTERRUPT)
-				DRM_DEBUG_DRIVER("TODO: Context switch\n");
+				intel_execlists_handle_ctx_events(ring);
 		} else
 			DRM_ERROR("The master control interrupt lied (GT3)!\n");
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 9e91169..65f4f26 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -49,6 +49,22 @@ 
 #define RING_ELSP(ring)			((ring)->mmio_base+0x230)
 #define RING_EXECLIST_STATUS(ring)	((ring)->mmio_base+0x234)
 #define RING_CONTEXT_CONTROL(ring)	((ring)->mmio_base+0x244)
+#define RING_CONTEXT_STATUS_BUF(ring)	((ring)->mmio_base+0x370)
+#define RING_CONTEXT_STATUS_PTR(ring)	((ring)->mmio_base+0x3a0)
+
+#define RING_EXECLIST_QFULL		(1 << 0x2)
+#define RING_EXECLIST1_VALID		(1 << 0x3)
+#define RING_EXECLIST0_VALID		(1 << 0x4)
+#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
+#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
+#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
+
+#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
+#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
+#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
+#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
+#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
+#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
 
 #define CTX_LRI_HEADER_0		0x01
 #define CTX_CONTEXT_CONTROL		0x02
@@ -147,6 +163,7 @@  static void execlists_elsp_write(struct intel_engine_cs *ring,
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	uint64_t temp = 0;
 	uint32_t desc[4];
+	unsigned long flags;
 
 	/* XXX: You must always write both descriptors in the order below. */
 	if (ctx_obj1)
@@ -160,9 +177,17 @@  static void execlists_elsp_write(struct intel_engine_cs *ring,
 	desc[3] = (u32)(temp >> 32);
 	desc[2] = (u32)temp;
 
-	/* Set Force Wakeup bit to prevent GT from entering C6 while
-	 * ELSP writes are in progress */
-	gen6_gt_force_wake_get(dev_priv, FORCEWAKE_ALL);
+	/* Set Force Wakeup bit to prevent GT from entering C6 while ELSP writes
+	 * are in progress.
+	 *
+	 * The other problem is that we can't just call gen6_gt_force_wake_get()
+	 * because that function calls intel_runtime_pm_get(), which might sleep.
+	 * Instead, we do the runtime_pm_get/put when creating/destroying requests.
+	 */
+	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
+	if (dev_priv->uncore.forcewake_count++ == 0)
+		dev_priv->uncore.funcs.force_wake_get(dev_priv, FORCEWAKE_ALL);
+	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
 
 	I915_WRITE(RING_ELSP(ring), desc[1]);
 	I915_WRITE(RING_ELSP(ring), desc[0]);
@@ -173,7 +198,11 @@  static void execlists_elsp_write(struct intel_engine_cs *ring,
 	/* ELSP is a wo register, so use another nearby reg for posting instead */
 	POSTING_READ(RING_EXECLIST_STATUS(ring));
 
-	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
+	/* Release Force Wakeup (see the big comment above). */
+	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
+	if (--dev_priv->uncore.forcewake_count == 0)
+		dev_priv->uncore.funcs.force_wake_put(dev_priv, FORCEWAKE_ALL);
+	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
 }
 
 static int execlists_ctx_write_tail(struct drm_i915_gem_object *ctx_obj, u32 tail)
@@ -221,6 +250,9 @@  static void execlists_context_unqueue(struct intel_engine_cs *ring)
 {
 	struct intel_ctx_submit_request *req0 = NULL, *req1 = NULL;
 	struct intel_ctx_submit_request *cursor = NULL, *tmp = NULL;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	assert_spin_locked(&ring->execlist_lock);
 
 	if (list_empty(&ring->execlist_queue))
 		return;
@@ -233,8 +265,7 @@  static void execlists_context_unqueue(struct intel_engine_cs *ring)
 			/* Same ctx: ignore first request, as second request
 			 * will update tail past first request's workload */
 			list_del(&req0->execlist_link);
-			i915_gem_context_unreference(req0->ctx);
-			kfree(req0);
+			queue_work(dev_priv->wq, &req0->work);
 			req0 = cursor;
 		} else {
 			req1 = cursor;
@@ -246,6 +277,89 @@  static void execlists_context_unqueue(struct intel_engine_cs *ring)
 			req1? req1->ctx : NULL, req1? req1->tail : 0));
 }
 
+static bool execlists_check_remove_request(struct intel_engine_cs *ring,
+					   u32 request_id)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ctx_submit_request *head_req;
+
+	assert_spin_locked(&ring->execlist_lock);
+
+	head_req = list_first_entry_or_null(&ring->execlist_queue,
+			struct intel_ctx_submit_request, execlist_link);
+	if (head_req != NULL) {
+		struct drm_i915_gem_object *ctx_obj =
+				head_req->ctx->engine[ring->id].state;
+		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
+			list_del(&head_req->execlist_link);
+			queue_work(dev_priv->wq, &head_req->work);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	u32 status_pointer;
+	u8 read_pointer;
+	u8 write_pointer;
+	u32 status;
+	u32 status_id;
+	u32 submit_contexts = 0;
+
+	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
+
+	read_pointer = ring->next_context_status_buffer;
+	write_pointer = status_pointer & 0x07;
+	if (read_pointer > write_pointer)
+		write_pointer += 6;
+
+	spin_lock(&ring->execlist_lock);
+
+	while (read_pointer < write_pointer) {
+		read_pointer++;
+		status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
+				(read_pointer % 6) * 8);
+		status_id = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
+				(read_pointer % 6) * 8 + 4);
+
+		if (status & GEN8_CTX_STATUS_COMPLETE) {
+			if (execlists_check_remove_request(ring, status_id))
+				submit_contexts++;
+		}
+	}
+
+	if (submit_contexts != 0)
+		execlists_context_unqueue(ring);
+
+	spin_unlock(&ring->execlist_lock);
+
+	WARN(submit_contexts > 2, "More than two context complete events?\n");
+	ring->next_context_status_buffer = write_pointer % 6;
+
+	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
+			((u32)ring->next_context_status_buffer & 0x07) << 8);
+}
+
+static void execlists_free_request_task(struct work_struct *work)
+{
+	struct intel_ctx_submit_request *req =
+			container_of(work, struct intel_ctx_submit_request, work);
+	struct drm_device *dev = req->ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	intel_runtime_pm_put(dev_priv);
+
+	mutex_lock(&dev->struct_mutex);
+	i915_gem_context_unreference(req->ctx);
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(req);
+}
+
 static int execlists_context_queue(struct intel_engine_cs *ring,
 				   struct intel_context *to,
 				   u32 tail)
@@ -261,6 +375,8 @@  static int execlists_context_queue(struct intel_engine_cs *ring,
 	i915_gem_context_reference(req->ctx);
 	req->ring = ring;
 	req->tail = tail;
+	INIT_WORK(&req->work, execlists_free_request_task);
+	intel_runtime_pm_get(dev_priv);
 
 	spin_lock_irqsave(&ring->execlist_lock, flags);
 
@@ -908,6 +1024,7 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 
 	INIT_LIST_HEAD(&ring->execlist_queue);
 	spin_lock_init(&ring->execlist_lock);
+	ring->next_context_status_buffer = 0;
 
 	ret = intel_lr_context_deferred_create(dctx, ring);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 14492a9..2e8929f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -66,6 +66,9 @@  struct intel_ctx_submit_request {
 	u32 tail;
 
 	struct list_head execlist_link;
+	struct work_struct work;
 };
 
+void intel_execlists_handle_ctx_events(struct intel_engine_cs *ring);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 6358823..905d1ba 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -225,6 +225,7 @@  struct  intel_engine_cs {
 	/* Execlists */
 	spinlock_t execlist_lock;
 	struct list_head execlist_queue;
+	u8 next_context_status_buffer;
 	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
 	int		(*emit_request)(struct intel_ringbuffer *ringbuf);
 	int		(*emit_flush)(struct intel_ringbuffer *ringbuf,