
[08/10] drm/i915: Prelude to splitting i915_gem_do_execbuffer in two

Message ID 1418129953-1505-9-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived

Commit Message

John Harrison Dec. 9, 2014, 12:59 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The scheduler decouples the submission of batch buffers to the driver from their
submission to the hardware. This basically means splitting the execbuffer()
function in half. This change rearranges some code in preparation for that split.

Change-Id: Icc9c8afaac18821f3eb8a151a49f918f90c068a3
For: VIZ-1587
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   47 +++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_lrc.c           |   17 +++++++---
 2 files changed, 40 insertions(+), 24 deletions(-)

Comments

Daniel Vetter Dec. 10, 2014, 10:58 a.m. UTC | #1
On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> The scheduler decouples the submission of batch buffers to the driver with their
> submission to the hardware. This basically means splitting the execbuffer()
> function in half. This change rearranges some code ready for the split to occur.

Now there's the curious question: Where will the split into top/bottom halves
be? Without that I have no idea whether it makes sense and so can't review
this patch. At least if the goal is to really prep for the scheduler and
not just move a few lines around ;-)
-Daniel

> 
> Change-Id: Icc9c8afaac18821f3eb8a151a49f918f90c068a3
> For: VIZ-1587
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   47 +++++++++++++++++-----------
>  drivers/gpu/drm/i915/intel_lrc.c           |   17 +++++++---
>  2 files changed, 40 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index f09501c..a339556 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -848,10 +848,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
>  	if (flush_domains & I915_GEM_DOMAIN_GTT)
>  		wmb();
>  
> -	/* Unconditionally invalidate gpu caches and ensure that we do flush
> -	 * any residual writes from the previous batch.
> -	 */
> -	return intel_ring_invalidate_all_caches(ring);
> +	return 0;
>  }
>  
>  static bool
> @@ -1123,14 +1120,6 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>  		}
>  	}
>  
> -	ret = i915_gem_execbuffer_move_to_gpu(ring, vmas);
> -	if (ret)
> -		goto error;
> -
> -	ret = i915_switch_context(ring, ctx);
> -	if (ret)
> -		goto error;
> -
>  	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>  	instp_mask = I915_EXEC_CONSTANTS_MASK;
>  	switch (instp_mode) {
> @@ -1168,6 +1157,28 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>  		goto error;
>  	}
>  
> +	ret = i915_gem_execbuffer_move_to_gpu(ring, vmas);
> +	if (ret)
> +		goto error;
> +
> +	i915_gem_execbuffer_move_to_active(vmas, ring);
> +
> +	/* To be split into two functions here... */
> +
> +	intel_runtime_pm_get(dev_priv);
> +
> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
> +	 * any residual writes from the previous batch.
> +	 */
> +	ret = intel_ring_invalidate_all_caches(ring);
> +	if (ret)
> +		goto error;
> +
> +	/* Switch to the correct context for the batch */
> +	ret = i915_switch_context(ring, ctx);
> +	if (ret)
> +		goto error;
> +
>  	if (ring == &dev_priv->ring[RCS] &&
>  			instp_mode != dev_priv->relative_constants_mode) {
>  		ret = intel_ring_begin(ring, 4);
> @@ -1208,15 +1219,18 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>  						exec_start, exec_len,
>  						dispatch_flags);
>  		if (ret)
> -			return ret;
> +			goto error;
>  	}
>  
>  	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
>  
> -	i915_gem_execbuffer_move_to_active(vmas, ring);
>  	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
>  
>  error:
> +	/* intel_gpu_busy should also get a ref, so it will free when the device
> +	 * is really idle. */
> +	intel_runtime_pm_put(dev_priv);
> +
>  	kfree(cliprects);
>  	return ret;
>  }
> @@ -1335,8 +1349,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		return -EINVAL;
>  	}
>  
> -	intel_runtime_pm_get(dev_priv);
> -
>  	ret = i915_mutex_lock_interruptible(dev);
>  	if (ret)
>  		goto pre_mutex_err;
> @@ -1467,9 +1479,6 @@ err:
>  	mutex_unlock(&dev->struct_mutex);
>  
>  pre_mutex_err:
> -	/* intel_gpu_busy should also get a ref, so it will free when the device
> -	 * is really idle. */
> -	intel_runtime_pm_put(dev_priv);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 037cbd5..f16b15d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -630,10 +630,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
>  	if (flush_domains & I915_GEM_DOMAIN_GTT)
>  		wmb();
>  
> -	/* Unconditionally invalidate gpu caches and ensure that we do flush
> -	 * any residual writes from the previous batch.
> -	 */
> -	return logical_ring_invalidate_all_caches(ringbuf);
> +	return 0;
>  }
>  
>  /**
> @@ -717,6 +714,17 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
>  	if (ret)
>  		return ret;
>  
> +	i915_gem_execbuffer_move_to_active(vmas, ring);
> +
> +	/* To be split into two functions here... */
> +
> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
> +	 * any residual writes from the previous batch.
> +	 */
> +	ret = logical_ring_invalidate_all_caches(ringbuf);
> +	if (ret)
> +		return ret;
> +
>  	if (ring == &dev_priv->ring[RCS] &&
>  	    instp_mode != dev_priv->relative_constants_mode) {
>  		ret = intel_logical_ring_begin(ringbuf, 4);
> @@ -738,7 +746,6 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
>  
>  	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
>  
> -	i915_gem_execbuffer_move_to_active(vmas, ring);
>  	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
>  
>  	return 0;
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Dave Gordon Dec. 10, 2014, 4:33 p.m. UTC | #2
On 10/12/14 10:58, Daniel Vetter wrote:
> On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The scheduler decouples the submission of batch buffers to the driver with their
>> submission to the hardware. This basically means splitting the execbuffer()
>> function in half. This change rearranges some code ready for the split to occur.
> 
> Now there's the curios question: Where will the split in top/bottom halves
> be? Without that I have no idea whether it makes sense and so can't review
> this patch. At least if the goal is to really prep for the scheduler and
> not just move a few lines around ;-)
> -Daniel
> 
[snip]
>>  
>> +	i915_gem_execbuffer_move_to_active(vmas, ring);
>> +
>> +	/* To be split into two functions here... */
>> +
>> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
>> +	 * any residual writes from the previous batch.
>> +	 */
>> +	ret = logical_ring_invalidate_all_caches(ringbuf);

It'll be where the marker comment is above. Ahead of that point is stuff
to do with setting up software state; after that we're talking to h/w.
When the scheduler code goes in, it decouples the two by interposing at
this point. Then batches go into it with s/w state set up, but don't get
to talk to the h/w until they're selected for execution, possibly in a
different order.
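
As a rough sketch of the shape we're aiming for (the function and
scheduler names below are made up purely for illustration, they're not
what the patches actually add):

static int execbuffer_submit_prepare(struct intel_engine_cs *ring,
				     struct intel_context *ctx,
				     struct list_head *vmas)
{
	/* Top half: s/w state only -- objects reserved, relocations
	 * applied, objects marked active; nothing has touched the h/w. */
	i915_gem_execbuffer_move_to_active(vmas, ring);

	/* Hand the batch over to the scheduler queue (hypothetical call). */
	return i915_scheduler_queue_execbuffer(ring, ctx, vmas);
}

static int execbuffer_submit_execute(struct intel_engine_cs *ring,
				     struct intel_context *ctx)
{
	int ret;

	/* Bottom half: first h/w access, run whenever the scheduler
	 * decides this batch should execute. */
	ret = intel_ring_invalidate_all_caches(ring);
	if (ret)
		return ret;

	ret = i915_switch_context(ring, ctx);
	if (ret)
		return ret;

	/* ... dispatch the batch and retire commands as today ... */
	return 0;
}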

.Dave.
Daniel Vetter Dec. 10, 2014, 5:15 p.m. UTC | #3
On Wed, Dec 10, 2014 at 04:33:14PM +0000, Dave Gordon wrote:
> On 10/12/14 10:58, Daniel Vetter wrote:
> > On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
> >> From: John Harrison <John.C.Harrison@Intel.com>
> >>
> >> The scheduler decouples the submission of batch buffers to the driver with their
> >> submission to the hardware. This basically means splitting the execbuffer()
> >> function in half. This change rearranges some code ready for the split to occur.
> > 
> > Now there's the curios question: Where will the split in top/bottom halves
> > be? Without that I have no idea whether it makes sense and so can't review
> > this patch. At least if the goal is to really prep for the scheduler and
> > not just move a few lines around ;-)
> > -Daniel
> > 
> [snip]
> >>  
> >> +	i915_gem_execbuffer_move_to_active(vmas, ring);
> >> +
> >> +	/* To be split into two functions here... */
> >> +
> >> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
> >> +	 * any residual writes from the previous batch.
> >> +	 */
> >> +	ret = logical_ring_invalidate_all_caches(ringbuf);
> 
> It'll be where the marker comment is above. Ahead of that point is stuff
> to do with setting up software state; after that we're talking to h/w.
> When the scheduler code goes it, it decouples the two by interposing at
> this point. Then batches go into it with s/w state set up, but don't get
> to talk to the h/w until they're selected for execution, possibly in a
> different order.

Oops, I guess I need to see a doctor to check my eyesight ;-)

Two comments about code that's still in the bottom half:
- There's lots of input validation code still below that cutoff point
  which needs to be moved above to still be able to return -EINVAL. Test
  coverage is the critical part here, but I think we've closed most of our
  gaps. I guess a future patch will address this?

- retire_commands and so add_request are also in the cutoff. For shrinker
  interactions to work seamlessly we need to have both the objects on the
  active list and the request in the lists. So I expect more untangling
  there to separate the request emission to rings from the bookkeeping
  parts in add_request.

Also move_to_active will fall apart once we start to reorder requests I
think. Need to think about this case more, but we definitely need some
testcases with dependencies and reordering by the scheduler.
-Daniel
Dave Gordon Dec. 16, 2014, 2:26 p.m. UTC | #4
On 10/12/14 17:15, Daniel Vetter wrote:
> On Wed, Dec 10, 2014 at 04:33:14PM +0000, Dave Gordon wrote:
>> On 10/12/14 10:58, Daniel Vetter wrote:
>>> On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>> The scheduler decouples the submission of batch buffers to the driver with their
>>>> submission to the hardware. This basically means splitting the execbuffer()
>>>> function in half. This change rearranges some code ready for the split to occur.
>>>
>>> Now there's the curios question: Where will the split in top/bottom halves
>>> be? Without that I have no idea whether it makes sense and so can't review
>>> this patch. At least if the goal is to really prep for the scheduler and
>>> not just move a few lines around ;-)
>>> -Daniel
>>>
>> [snip]
>>>>  
>>>> +	i915_gem_execbuffer_move_to_active(vmas, ring);
>>>> +
>>>> +	/* To be split into two functions here... */
>>>> +
>>>> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
>>>> +	 * any residual writes from the previous batch.
>>>> +	 */
>>>> +	ret = logical_ring_invalidate_all_caches(ringbuf);
>>
>> It'll be where the marker comment is above. Ahead of that point is stuff
>> to do with setting up software state; after that we're talking to h/w.
>> When the scheduler code goes it, it decouples the two by interposing at
>> this point. Then batches go into it with s/w state set up, but don't get
>> to talk to the h/w until they're selected for execution, possibly in a
>> different order.
> 
> Oops, I guess I need see a doctor to check my eyesight ;-)
> 
> Two comments about code that's still in the bottom half:
> - There's lots of input validation code still below that cutoff point
>   which needs to be moved above to still be able to return -EINVAL. Test
>   coverage is the critical part here, but I think we've closed most of our
>   gaps. I guess a future patch will address this?

Yes, all validation will end up before the batch gets pushed into the
scheduler queue. It had already been shuffled there in an earlier
version, but some of it then got relocated into less convenient places
during the legacy-vs-execlist split. So for now, the execbuffer path has
some common code (including early validation), then a split by
submission mechanism (with additional validation that may someday get
reconverged into the common section); and back to common code, and
/then/ the batches get passed to the scheduler, whereafter there won't
be any more EINVAL cases (though there will be other error cases, such
as ENOMEM). Then as and when batches are selected for execution, they're
passed to the specific backend submission mechanism code (which of
course won't contain any further validation code either).
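
So in outline the call path ends up something like this (purely
illustrative, not the literal function names in every case):

/*
 * i915_gem_do_execbuffer()                 common: early arg/flag validation
 *   -> ringbuffer/execlists _submission()  per-mechanism setup, plus some
 *                                          residual validation for now
 *   -> common prep (move_to_active etc.)   s/w state; last point that can
 *                                          return -EINVAL
 *   -> scheduler queue                     batch parked; only non-EINVAL
 *                                          errors (e.g. -ENOMEM) after this
 * ...later, when the scheduler selects the batch...
 *   -> backend submission (legacy / execlists)  h/w access only, no
 *                                               further validation
 */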

> - retire_commands and so add_request are also in the cutoff. For shrinker
>   interactions to work seamlessly we need to have both the objects on the
>   active list and the request in the lists. So I expect more untangling
>   there to separate the request emission to rings from the bookkeeping
>   parts in add_request.
> 
> Also move_to_active will fall apart once we start to reorder requests I
> think. Need to think about this case more, but we definitely need some
> testcases with depencies and reordering by the scheduler.
> -Daniel

I'll leave the rest to John to comment on, as he's still rebasing this
section of the scheduler on top of all the other changes that have been
going on ...

.Dave.
Daniel Vetter Dec. 17, 2014, 8:09 p.m. UTC | #5
On Tue, Dec 16, 2014 at 02:26:55PM +0000, Dave Gordon wrote:
> On 10/12/14 17:15, Daniel Vetter wrote:
> > On Wed, Dec 10, 2014 at 04:33:14PM +0000, Dave Gordon wrote:
> >> On 10/12/14 10:58, Daniel Vetter wrote:
> >>> On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
> >>>> From: John Harrison <John.C.Harrison@Intel.com>
> >>>>
> >>>> The scheduler decouples the submission of batch buffers to the driver with their
> >>>> submission to the hardware. This basically means splitting the execbuffer()
> >>>> function in half. This change rearranges some code ready for the split to occur.
> >>>
> >>> Now there's the curios question: Where will the split in top/bottom halves
> >>> be? Without that I have no idea whether it makes sense and so can't review
> >>> this patch. At least if the goal is to really prep for the scheduler and
> >>> not just move a few lines around ;-)
> >>> -Daniel
> >>>
> >> [snip]
> >>>>  
> >>>> +	i915_gem_execbuffer_move_to_active(vmas, ring);
> >>>> +
> >>>> +	/* To be split into two functions here... */
> >>>> +
> >>>> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
> >>>> +	 * any residual writes from the previous batch.
> >>>> +	 */
> >>>> +	ret = logical_ring_invalidate_all_caches(ringbuf);
> >>
> >> It'll be where the marker comment is above. Ahead of that point is stuff
> >> to do with setting up software state; after that we're talking to h/w.
> >> When the scheduler code goes it, it decouples the two by interposing at
> >> this point. Then batches go into it with s/w state set up, but don't get
> >> to talk to the h/w until they're selected for execution, possibly in a
> >> different order.
> > 
> > Oops, I guess I need see a doctor to check my eyesight ;-)
> > 
> > Two comments about code that's still in the bottom half:
> > - There's lots of input validation code still below that cutoff point
> >   which needs to be moved above to still be able to return -EINVAL. Test
> >   coverage is the critical part here, but I think we've closed most of our
> >   gaps. I guess a future patch will address this?
> 
> Yes, all validation will end up before the batch gets pushed into the
> scheduler queue. It had already been shuffled there in an earlier
> version, but some of it then got relocated into less convenient places
> during the legacy-vs-execlist split. So for now, the execbuffer path has
> some common code (including early validation), then a split by
> submission mechanism (with additional validation that may someday get
> reconverged into the common section); and back to common code, and
> /then/ the batches get passed to the scheduler, whereafter there won't
> be any more EINVAL cases (though there will be other error cases, such
> as ENOMEM). Then as and when batches are selected for execution, they're
> passed to the specific backend submission mechanism code (which of
> course won't contain any further validation code either).

Hm, what can still go ENOMEM in there? We should pin everything we need,
the struct is preallocated and everything else should be in place too. Or
at least that's how I think it should work out. If the scheduler needs any
allocations for its internal tracking then we need to push them all into
other places, most likely requests.
-Daniel
Dave Gordon Dec. 18, 2014, 2:06 p.m. UTC | #6
On 17/12/14 20:09, Daniel Vetter wrote:
> On Tue, Dec 16, 2014 at 02:26:55PM +0000, Dave Gordon wrote:
>> On 10/12/14 17:15, Daniel Vetter wrote:
>>> On Wed, Dec 10, 2014 at 04:33:14PM +0000, Dave Gordon wrote:
>>>> On 10/12/14 10:58, Daniel Vetter wrote:
>>>>> On Tue, Dec 09, 2014 at 12:59:11PM +0000, John.C.Harrison@Intel.com wrote:
>>>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>>>
>>>>>> The scheduler decouples the submission of batch buffers to the driver with their
>>>>>> submission to the hardware. This basically means splitting the execbuffer()
>>>>>> function in half. This change rearranges some code ready for the split to occur.
>>>>>
>>>>> Now there's the curios question: Where will the split in top/bottom halves
>>>>> be? Without that I have no idea whether it makes sense and so can't review
>>>>> this patch. At least if the goal is to really prep for the scheduler and
>>>>> not just move a few lines around ;-)
>>>>> -Daniel
>>>>>
>>>> [snip]
>>>>>>  
>>>>>> +	i915_gem_execbuffer_move_to_active(vmas, ring);
>>>>>> +
>>>>>> +	/* To be split into two functions here... */
>>>>>> +
>>>>>> +	/* Unconditionally invalidate gpu caches and ensure that we do flush
>>>>>> +	 * any residual writes from the previous batch.
>>>>>> +	 */
>>>>>> +	ret = logical_ring_invalidate_all_caches(ringbuf);
>>>>
>>>> It'll be where the marker comment is above. Ahead of that point is stuff
>>>> to do with setting up software state; after that we're talking to h/w.
>>>> When the scheduler code goes it, it decouples the two by interposing at
>>>> this point. Then batches go into it with s/w state set up, but don't get
>>>> to talk to the h/w until they're selected for execution, possibly in a
>>>> different order.
>>>
>>> Oops, I guess I need see a doctor to check my eyesight ;-)
>>>
>>> Two comments about code that's still in the bottom half:
>>> - There's lots of input validation code still below that cutoff point
>>>   which needs to be moved above to still be able to return -EINVAL. Test
>>>   coverage is the critical part here, but I think we've closed most of our
>>>   gaps. I guess a future patch will address this?
>>
>> Yes, all validation will end up before the batch gets pushed into the
>> scheduler queue. It had already been shuffled there in an earlier
>> version, but some of it then got relocated into less convenient places
>> during the legacy-vs-execlist split. So for now, the execbuffer path has
>> some common code (including early validation), then a split by
>> submission mechanism (with additional validation that may someday get
>> reconverged into the common section); and back to common code, and
>> /then/ the batches get passed to the scheduler, whereafter there won't
>> be any more EINVAL cases (though there will be other error cases, such
>> as ENOMEM). Then as and when batches are selected for execution, they're
>> passed to the specific backend submission mechanism code (which of
>> course won't contain any further validation code either).
> 
> Hm, what can still go ENOMEM in there? We should pin everything we need,
> the struct is preallocated and everything else should be in place too. Or
> at least that's how I think it should work out. If the scheduler needs any
> allocations for its internal tracking the we need to push them all into
> other places, most likely requests.
> -Daniel

It used to be the case that the scheduler had to allocate some tracking
data, but John may well have changed that with the advent of the
seqno-request transformation.

I'd like to eventually move all batch-related allocations into a single
chunk per request, done after input validation and before acquiring any
locks :)
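
Something along these lines (hypothetical helper names, just to show the
intended ordering):

	ret = i915_gem_validate_execbuffer_args(dev, args, exec);
	if (ret)
		return ret;

	/* one allocation covering everything this request will need */
	params = i915_gem_execbuffer_alloc_params(dev, args);
	if (params == NULL)
		return -ENOMEM;

	ret = i915_mutex_lock_interruptible(dev);
	if (ret)
		goto err_free_params;

	/* ... no further allocations or -EINVAL paths inside the lock ... */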

.Dave.

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index f09501c..a339556 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -848,10 +848,7 @@  i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
 		wmb();
 
-	/* Unconditionally invalidate gpu caches and ensure that we do flush
-	 * any residual writes from the previous batch.
-	 */
-	return intel_ring_invalidate_all_caches(ring);
+	return 0;
 }
 
 static bool
@@ -1123,14 +1120,6 @@  i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 		}
 	}
 
-	ret = i915_gem_execbuffer_move_to_gpu(ring, vmas);
-	if (ret)
-		goto error;
-
-	ret = i915_switch_context(ring, ctx);
-	if (ret)
-		goto error;
-
 	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
 	switch (instp_mode) {
@@ -1168,6 +1157,28 @@  i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 		goto error;
 	}
 
+	ret = i915_gem_execbuffer_move_to_gpu(ring, vmas);
+	if (ret)
+		goto error;
+
+	i915_gem_execbuffer_move_to_active(vmas, ring);
+
+	/* To be split into two functions here... */
+
+	intel_runtime_pm_get(dev_priv);
+
+	/* Unconditionally invalidate gpu caches and ensure that we do flush
+	 * any residual writes from the previous batch.
+	 */
+	ret = intel_ring_invalidate_all_caches(ring);
+	if (ret)
+		goto error;
+
+	/* Switch to the correct context for the batch */
+	ret = i915_switch_context(ring, ctx);
+	if (ret)
+		goto error;
+
 	if (ring == &dev_priv->ring[RCS] &&
 			instp_mode != dev_priv->relative_constants_mode) {
 		ret = intel_ring_begin(ring, 4);
@@ -1208,15 +1219,18 @@  i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 						exec_start, exec_len,
 						dispatch_flags);
 		if (ret)
-			return ret;
+			goto error;
 	}
 
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
 
-	i915_gem_execbuffer_move_to_active(vmas, ring);
 	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
 
 error:
+	/* intel_gpu_busy should also get a ref, so it will free when the device
+	 * is really idle. */
+	intel_runtime_pm_put(dev_priv);
+
 	kfree(cliprects);
 	return ret;
 }
@@ -1335,8 +1349,6 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	intel_runtime_pm_get(dev_priv);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto pre_mutex_err;
@@ -1467,9 +1479,6 @@  err:
 	mutex_unlock(&dev->struct_mutex);
 
 pre_mutex_err:
-	/* intel_gpu_busy should also get a ref, so it will free when the device
-	 * is really idle. */
-	intel_runtime_pm_put(dev_priv);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 037cbd5..f16b15d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -630,10 +630,7 @@  static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 	if (flush_domains & I915_GEM_DOMAIN_GTT)
 		wmb();
 
-	/* Unconditionally invalidate gpu caches and ensure that we do flush
-	 * any residual writes from the previous batch.
-	 */
-	return logical_ring_invalidate_all_caches(ringbuf);
+	return 0;
 }
 
 /**
@@ -717,6 +714,17 @@  int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 	if (ret)
 		return ret;
 
+	i915_gem_execbuffer_move_to_active(vmas, ring);
+
+	/* To be split into two functions here... */
+
+	/* Unconditionally invalidate gpu caches and ensure that we do flush
+	 * any residual writes from the previous batch.
+	 */
+	ret = logical_ring_invalidate_all_caches(ringbuf);
+	if (ret)
+		return ret;
+
 	if (ring == &dev_priv->ring[RCS] &&
 	    instp_mode != dev_priv->relative_constants_mode) {
 		ret = intel_logical_ring_begin(ringbuf, 4);
@@ -738,7 +746,6 @@  int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
 
-	i915_gem_execbuffer_move_to_active(vmas, ring);
 	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
 
 	return 0;