[09/17,v2] drm/i915: Expose two LRC functions for GuC submission mode
diff mbox

Message ID 1435243213-22308-10-git-send-email-david.s.gordon@intel.com
State New
Headers show

Commit Message

Dave Gordon June 25, 2015, 2:40 p.m. UTC
GuC submission is basically execlist submission, but with the GuC
handling the actual writes to the ELSP and the resulting context
switch interrupts. So to prepare a context for submission via the
GuC, we need some of the same functions used in execlist mode.
This commit exposes two such functions, changing their names to
better describe what they do (they're related to logical ring
contexts rather than to execlists per se).

v2:
    Replaces previous "drm/i915: Move execlists defines from .c to .h"

Issue: VIZ-4884
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c |   27 +++++++++++++--------------
 drivers/gpu/drm/i915/intel_lrc.h |    5 +++++
 2 files changed, 18 insertions(+), 14 deletions(-)

Comments

yu.dai@intel.com June 25, 2015, 8:57 p.m. UTC | #1
On 06/25/2015 07:40 AM, Dave Gordon wrote:
> GuC submission is basically execlist submission, but with the GuC
> handling the actual writes to the ELSP and the resulting context
> switch interrupts. So to prepare a context for submission via the
> GuC, we need some of the same functions used in execlist mode.
> This commit exposes two such functions, changing their names to
> better describe what they do (they're related to logical ring
> contexts rather than to execlists per se).
>
> v2:
>      Replaces previous "drm/i915: Move execlists defines from .c to .h"
>
> Issue: VIZ-4884
> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c |   27 +++++++++++++--------------
>   drivers/gpu/drm/i915/intel_lrc.h |    5 +++++
>   2 files changed, 18 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e5f4040..a77b22d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -264,8 +264,8 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
>   	return lrca >> 12;
>   }
>   
> -static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
> -					 struct drm_i915_gem_object *ctx_obj)
> +uint64_t intel_lr_context_descriptor(struct intel_engine_cs *ring,
> +				     struct drm_i915_gem_object *ctx_obj)
>   {
>   	struct drm_device *dev = ring->dev;
>   	uint64_t desc;
> @@ -306,13 +306,13 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>   
>   	/* XXX: You must always write both descriptors in the order below. */
>   	if (ctx_obj1)
> -		temp = execlists_ctx_descriptor(ring, ctx_obj1);
> +		temp = intel_lr_context_descriptor(ring, ctx_obj1);
>   	else
>   		temp = 0;
>   	desc[1] = (u32)(temp >> 32);
>   	desc[0] = (u32)temp;
>   
> -	temp = execlists_ctx_descriptor(ring, ctx_obj0);
> +	temp = intel_lr_context_descriptor(ring, ctx_obj0);
>   	desc[3] = (u32)(temp >> 32);
>   	desc[2] = (u32)temp;
>   
> @@ -331,10 +331,10 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>   	spin_unlock(&dev_priv->uncore.lock);
>   }
>   
> -static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
> -				    struct drm_i915_gem_object *ring_obj,
> -				    struct i915_hw_ppgtt *ppgtt,
> -				    u32 tail)
> +/* Update the ringbuffer pointer and tail offset in a saved context image */
> +void intel_lr_context_update(struct drm_i915_gem_object *ctx_obj,
> +			     struct drm_i915_gem_object *ring_obj,
> +			     u32 tail)
>   {
>   	struct page *page;
>   	uint32_t *reg_state;
> @@ -342,12 +342,11 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
>   	page = i915_gem_object_get_page(ctx_obj, 1);
>   	reg_state = kmap_atomic(page);
>   
> -	reg_state[CTX_RING_TAIL+1] = tail;
>   	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
> +	if (tail != ~0u)
> +		reg_state[CTX_RING_TAIL+1] = tail;
I actually regret my original copycat of this function for guc. Because 
updating ring tail is moved to guc, there is no need to call this for 
each submission. Maybe we should split this call into two parts. The 
updating of ring buffer base is moved to where ring buffer is newly 
mapped. And the updating of tail is kept here for execlist submission only.

Alex
Chris Wilson June 26, 2015, 7:31 a.m. UTC | #2
On Thu, Jun 25, 2015 at 01:57:16PM -0700, Yu Dai wrote:
> 
> On 06/25/2015 07:40 AM, Dave Gordon wrote:
> >GuC submission is basically execlist submission, but with the GuC
> >handling the actual writes to the ELSP and the resulting context
> >switch interrupts. So to prepare a context for submission via the
> >GuC, we need some of the same functions used in execlist mode.
> >This commit exposes two such functions, changing their names to
> >better describe what they do (they're related to logical ring
> >contexts rather than to execlists per se).
> >
> >v2:
> >     Replaces previous "drm/i915: Move execlists defines from .c to .h"
> >
> >Issue: VIZ-4884
> >Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
> >---
> >  drivers/gpu/drm/i915/intel_lrc.c |   27 +++++++++++++--------------
> >  drivers/gpu/drm/i915/intel_lrc.h |    5 +++++
> >  2 files changed, 18 insertions(+), 14 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >index e5f4040..a77b22d 100644
> >--- a/drivers/gpu/drm/i915/intel_lrc.c
> >+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >@@ -264,8 +264,8 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
> >  	return lrca >> 12;
> >  }
> >-static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
> >-					 struct drm_i915_gem_object *ctx_obj)
> >+uint64_t intel_lr_context_descriptor(struct intel_engine_cs *ring,
> >+				     struct drm_i915_gem_object *ctx_obj)
> >  {
> >  	struct drm_device *dev = ring->dev;
> >  	uint64_t desc;
> >@@ -306,13 +306,13 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
> >  	/* XXX: You must always write both descriptors in the order below. */
> >  	if (ctx_obj1)
> >-		temp = execlists_ctx_descriptor(ring, ctx_obj1);
> >+		temp = intel_lr_context_descriptor(ring, ctx_obj1);
> >  	else
> >  		temp = 0;
> >  	desc[1] = (u32)(temp >> 32);
> >  	desc[0] = (u32)temp;
> >-	temp = execlists_ctx_descriptor(ring, ctx_obj0);
> >+	temp = intel_lr_context_descriptor(ring, ctx_obj0);
> >  	desc[3] = (u32)(temp >> 32);
> >  	desc[2] = (u32)temp;
> >@@ -331,10 +331,10 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
> >  	spin_unlock(&dev_priv->uncore.lock);
> >  }
> >-static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
> >-				    struct drm_i915_gem_object *ring_obj,
> >-				    struct i915_hw_ppgtt *ppgtt,
> >-				    u32 tail)
> >+/* Update the ringbuffer pointer and tail offset in a saved context image */
> >+void intel_lr_context_update(struct drm_i915_gem_object *ctx_obj,
> >+			     struct drm_i915_gem_object *ring_obj,
> >+			     u32 tail)
> >  {
> >  	struct page *page;
> >  	uint32_t *reg_state;
> >@@ -342,12 +342,11 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
> >  	page = i915_gem_object_get_page(ctx_obj, 1);
> >  	reg_state = kmap_atomic(page);
> >-	reg_state[CTX_RING_TAIL+1] = tail;
> >  	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
> >+	if (tail != ~0u)
> >+		reg_state[CTX_RING_TAIL+1] = tail;
> I actually regret my original copycat of this function for guc.
> Because updating ring tail is moved to guc, there is no need to call
> this for each submission. Maybe we should split this call into two
> parts. The updating of ring buffer base is moved to where ring
> buffer is newly mapped. And the updating of tail is kept here for
> execlist submission only.

If you would be so kind to review patches that do just that, it would
make intel_lrc a much nicer place to work, and execlists much faster.
-Chris
Dave Gordon June 26, 2015, 1:14 p.m. UTC | #3
On 26/06/15 08:31, Chris Wilson wrote:
> On Thu, Jun 25, 2015 at 01:57:16PM -0700, Yu Dai wrote:
>>
>> On 06/25/2015 07:40 AM, Dave Gordon wrote:
>>> GuC submission is basically execlist submission, but with the GuC
>>> handling the actual writes to the ELSP and the resulting context
>>> switch interrupts. So to prepare a context for submission via the
>>> GuC, we need some of the same functions used in execlist mode.
>>> This commit exposes two such functions, changing their names to
>>> better describe what they do (they're related to logical ring
>>> contexts rather than to execlists per se).
>>>
>>> v2:
>>>     Replaces previous "drm/i915: Move execlists defines from .c to .h"
>>>
>>> Issue: VIZ-4884
>>> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/intel_lrc.c |   27 +++++++++++++--------------
>>>  drivers/gpu/drm/i915/intel_lrc.h |    5 +++++
>>>  2 files changed, 18 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index e5f4040..a77b22d 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -264,8 +264,8 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
>>>  	return lrca >> 12;
>>>  }
>>> -static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
>>> -					 struct drm_i915_gem_object *ctx_obj)
>>> +uint64_t intel_lr_context_descriptor(struct intel_engine_cs *ring,
>>> +				     struct drm_i915_gem_object *ctx_obj)
>>>  {
>>>  	struct drm_device *dev = ring->dev;
>>>  	uint64_t desc;
>>> @@ -306,13 +306,13 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>>>  	/* XXX: You must always write both descriptors in the order below. */
>>>  	if (ctx_obj1)
>>> -		temp = execlists_ctx_descriptor(ring, ctx_obj1);
>>> +		temp = intel_lr_context_descriptor(ring, ctx_obj1);
>>>  	else
>>>  		temp = 0;
>>>  	desc[1] = (u32)(temp >> 32);
>>>  	desc[0] = (u32)temp;
>>> -	temp = execlists_ctx_descriptor(ring, ctx_obj0);
>>> +	temp = intel_lr_context_descriptor(ring, ctx_obj0);
>>>  	desc[3] = (u32)(temp >> 32);
>>>  	desc[2] = (u32)temp;
>>> @@ -331,10 +331,10 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
>>>  	spin_unlock(&dev_priv->uncore.lock);
>>>  }
>>> -static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
>>> -				    struct drm_i915_gem_object *ring_obj,
>>> -				    struct i915_hw_ppgtt *ppgtt,
>>> -				    u32 tail)
>>> +/* Update the ringbuffer pointer and tail offset in a saved context image */
>>> +void intel_lr_context_update(struct drm_i915_gem_object *ctx_obj,
>>> +			     struct drm_i915_gem_object *ring_obj,
>>> +			     u32 tail)
>>>  {
>>>  	struct page *page;
>>>  	uint32_t *reg_state;
>>> @@ -342,12 +342,11 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
>>>  	page = i915_gem_object_get_page(ctx_obj, 1);
>>>  	reg_state = kmap_atomic(page);
>>> -	reg_state[CTX_RING_TAIL+1] = tail;
>>>  	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
>>> +	if (tail != ~0u)
>>> +		reg_state[CTX_RING_TAIL+1] = tail;
>>
>> I actually regret my original copycat of this function for guc.
>> Because updating ring tail is moved to guc, there is no need to call
>> this for each submission. Maybe we should split this call into two
>> parts. The updating of ring buffer base is moved to where ring
>> buffer is newly mapped. And the updating of tail is kept here for
>> execlist submission only.
> 
> If you would be so kind to review patches that do just that, it would
> make intel_lrc a much nicer place to work, and execlists much faster.
> -Chris

I've deleted Alex's coy of this function in the GuC code; that's why the
execlists/LRC version is exposed so that it can be shared with the GuC
path, and the update of "tail" here is conditional, so the GuC call
doesn't use that part.

I agree it would be nicer still to update the ringbuffer base address
only at the point when it's pinned to the GGTT, rather than on each
batch submission. As you say, that would allow us to remove this call
entirely from the GuC path.

Another improvement would be to kmap the context whenever it's active,
the way we already do for the ringbuffer, so we could get rid of the
kmap_atomic calls here. Since contexts and ringbuffers are one-to-one in
execlist/GuC modes, it should be simple to keep them mapped-and-pinned
in paralell.

Further, perhaps we could allocate them in a single contiguous structure
to further reduce overhead; 4 pages of ringbuffer, one page as a GuC
shared channel, one page for the (PP)HWSP, some number of pages for the
context image? All GGTT-pinned and kmapped together whenever they've got
any work in-flight; all unpinned and unmapped either during retirement
of the last-active-request in that context or lazily some time later.

I expect Chris has already implemented some of these ideas :)

.Dave.

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e5f4040..a77b22d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -264,8 +264,8 @@  u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
 	return lrca >> 12;
 }
 
-static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
-					 struct drm_i915_gem_object *ctx_obj)
+uint64_t intel_lr_context_descriptor(struct intel_engine_cs *ring,
+				     struct drm_i915_gem_object *ctx_obj)
 {
 	struct drm_device *dev = ring->dev;
 	uint64_t desc;
@@ -306,13 +306,13 @@  static void execlists_elsp_write(struct intel_engine_cs *ring,
 
 	/* XXX: You must always write both descriptors in the order below. */
 	if (ctx_obj1)
-		temp = execlists_ctx_descriptor(ring, ctx_obj1);
+		temp = intel_lr_context_descriptor(ring, ctx_obj1);
 	else
 		temp = 0;
 	desc[1] = (u32)(temp >> 32);
 	desc[0] = (u32)temp;
 
-	temp = execlists_ctx_descriptor(ring, ctx_obj0);
+	temp = intel_lr_context_descriptor(ring, ctx_obj0);
 	desc[3] = (u32)(temp >> 32);
 	desc[2] = (u32)temp;
 
@@ -331,10 +331,10 @@  static void execlists_elsp_write(struct intel_engine_cs *ring,
 	spin_unlock(&dev_priv->uncore.lock);
 }
 
-static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
-				    struct drm_i915_gem_object *ring_obj,
-				    struct i915_hw_ppgtt *ppgtt,
-				    u32 tail)
+/* Update the ringbuffer pointer and tail offset in a saved context image */
+void intel_lr_context_update(struct drm_i915_gem_object *ctx_obj,
+			     struct drm_i915_gem_object *ring_obj,
+			     u32 tail)
 {
 	struct page *page;
 	uint32_t *reg_state;
@@ -342,12 +342,11 @@  static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
 	page = i915_gem_object_get_page(ctx_obj, 1);
 	reg_state = kmap_atomic(page);
 
-	reg_state[CTX_RING_TAIL+1] = tail;
 	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
+	if (tail != ~0u)
+		reg_state[CTX_RING_TAIL+1] = tail;
 
 	kunmap_atomic(reg_state);
-
-	return 0;
 }
 
 static void execlists_submit_contexts(struct intel_engine_cs *ring,
@@ -363,7 +362,7 @@  static void execlists_submit_contexts(struct intel_engine_cs *ring,
 	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj0));
 	WARN_ON(!i915_gem_obj_is_pinned(ringbuf0->obj));
 
-	execlists_update_context(ctx_obj0, ringbuf0->obj, to0->ppgtt, tail0);
+	intel_lr_context_update(ctx_obj0, ringbuf0->obj, tail0);
 
 	if (to1) {
 		ringbuf1 = to1->engine[ring->id].ringbuf;
@@ -372,7 +371,7 @@  static void execlists_submit_contexts(struct intel_engine_cs *ring,
 		WARN_ON(!i915_gem_obj_is_pinned(ctx_obj1));
 		WARN_ON(!i915_gem_obj_is_pinned(ringbuf1->obj));
 
-		execlists_update_context(ctx_obj1, ringbuf1->obj, to1->ppgtt, tail1);
+		intel_lr_context_update(ctx_obj1, ringbuf1->obj, tail1);
 	}
 
 	execlists_elsp_write(ring, ctx_obj0, ctx_obj1);
@@ -2029,7 +2028,7 @@  populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_RING_TAIL+1] = 0;
 	reg_state[CTX_RING_BUFFER_START] = RING_START(ring->mmio_base);
 	/* Ring buffer start address is not known until the buffer is pinned.
-	 * It is written to the context image in execlists_update_context()
+	 * It is written to the context image in intel_lr_context_update()
 	 */
 	reg_state[CTX_RING_BUFFER_CONTROL] = RING_CTL(ring->mmio_base);
 	reg_state[CTX_RING_BUFFER_CONTROL+1] =
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index f59940a..b3659a1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -73,6 +73,11 @@  void intel_lr_context_unpin(struct intel_engine_cs *ring,
 		struct intel_context *ctx);
 void intel_lr_context_reset(struct drm_device *dev,
 			struct intel_context *ctx);
+void intel_lr_context_update(struct drm_i915_gem_object *ctx_obj,
+			     struct drm_i915_gem_object *ring_obj,
+			     u32 tail);
+uint64_t intel_lr_context_descriptor(struct intel_engine_cs *ring,
+				     struct drm_i915_gem_object *ctx_obj);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);