[11/13,v4] drm/i915: Integrate GuC-based command submission

Message ID 1436466554-24806-12-git-send-email-david.s.gordon@intel.com (mailing list archive)
State New, archived

Commit Message

Dave Gordon July 9, 2015, 6:29 p.m. UTC
From: Alex Dai <yu.dai@intel.com>

GuC-based submission is mostly the same as execlist mode, up to
intel_logical_ring_advance_and_submit(), where the context being
dispatched would be added to the execlist queue; at this point
we submit the context to the GuC backend instead.
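
At the submission point, the change boils down to this dispatch (the
full hunk is in the patch below):

	if (dev_priv->guc.execbuf_client)
		i915_guc_submit(dev_priv->guc.execbuf_client, request);
	else
		execlists_context_queue(request);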

There are, however, a few other changes required, notably:
1.  Contexts must be pinned at GGTT addresses accessible to the GuC,
    i.e. NOT in the range [0..WOPCM_SIZE), so we have to add the
    PIN_OFFSET_BIAS flag to the relevant GGTT-pinning calls.

2.  The GuC's TLB must be invalidated after a context is pinned at
    a new GGTT address.

3.  The GuC firmware uses the page immediately before the ring
    context as shared data. Therefore, whenever the driver needs
    the base address of an LRC, it must add an offset of one page;
    LRC_PPHWSP_PN is defined as the page number of the LRCA.

4.  In the work queue used to pass requests to the GuC, the GuC
    firmware requires the ring-tail offset to be represented as an
    11-bit value, expressed in QWords. Therefore, the ringbuffer
    size must be reduced to the representable range (4 pages), as
    shown in the sketch below.
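
An illustrative sketch (not part of the patch) of the arithmetic
behind that 4-page limit, using hypothetical #define names:

	/*
	 * A GuC work-queue item encodes the ring tail as an 11-bit
	 * count of QWords, so the largest tail offset it can express is
	 *
	 *	(1 << 11) QWords * 8 bytes/QWord = 16384 bytes = 4 pages
	 */
	#define GUC_WQ_TAIL_BITS	11	/* hypothetical */
	#define GUC_MAX_RING_BYTES	((1 << GUC_WQ_TAIL_BITS) * sizeof(u64))
	/* GUC_MAX_RING_BYTES == 16384 == 4 * PAGE_SIZE with 4K pages */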

v2:
    Defer adding #defines until needed [Chris Wilson]
    Rationalise type declarations [Chris Wilson]

v4:
    Squashed kerneldoc patch into here [Daniel Vetter]

Issue: VIZ-4884
Signed-off-by: Alex Dai <yu.dai@intel.com>
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
---
 Documentation/DocBook/drm.tmpl             | 14 ++++++++
 drivers/gpu/drm/i915/i915_debugfs.c        |  2 +-
 drivers/gpu/drm/i915/i915_guc_submission.c | 52 +++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/intel_guc.h           |  1 +
 drivers/gpu/drm/i915/intel_lrc.c           | 51 ++++++++++++++++++++---------
 drivers/gpu/drm/i915/intel_lrc.h           |  6 ++++
 6 files changed, 106 insertions(+), 20 deletions(-)

Comments

Tom.O'Rourke@intel.com July 27, 2015, 3:57 p.m. UTC | #1
On Thu, Jul 09, 2015 at 07:29:12PM +0100, Dave Gordon wrote:
> From: Alex Dai <yu.dai@intel.com>
> 
> GuC-based submission is mostly the same as execlist mode, up to
> intel_logical_ring_advance_and_submit(), where the context being
> dispatched would be added to the execlist queue; at this point
> we submit the context to the GuC backend instead.

[snip]

>  static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
>  {
> +	struct drm_i915_private *dev_priv = req->i915;
>  	int ret;
> 
> +	/* Invalidate GuC TLB. */
> +	if (i915.enable_guc_submission)
> +		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
> +

[TOR:] This invalidation is in the init_context for the render
ring but not the other rings.  Is this needed for other
rings?  Or, should this invalidation happen at a different
level?  It seems this may depend on the render ring being
initialized first.

Thanks,
Tom

yu.dai@intel.com July 27, 2015, 7:33 p.m. UTC | #2
On 07/27/2015 08:57 AM, O'Rourke, Tom wrote:
> On Thu, Jul 09, 2015 at 07:29:12PM +0100, Dave Gordon wrote:
> > From: Alex Dai <yu.dai@intel.com>
> >
> > GuC-based submission is mostly the same as execlist mode, up to
> > intel_logical_ring_advance_and_submit(), where the context being
> > dispatched would be added to the execlist queue; at this point
> > we submit the context to the GuC backend instead.
>
> [snip]
>
> >  static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
> >  {
> > +	struct drm_i915_private *dev_priv = req->i915;
> >  	int ret;
> >
> > +	/* Invalidate GuC TLB. */
> > +	if (i915.enable_guc_submission)
> > +		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
> > +
> [TOR:] This invalidation is in the init_context for the render
> ring but not the other rings.  Is this needed for other
> rings?  Or, should this invalidation happen at a different
> level?  It seems this may depend on the render ring being
> initialized first.

When an LRC is newly mapped, we should invalidate the GuC TLB before any
submission. This is needed for the golden context init. For other rings,
it can be deferred until a user LRC is mapped into GGTT.

Alex

Dave Gordon July 28, 2015, 1:59 p.m. UTC | #3
On 27/07/15 16:57, O'Rourke, Tom wrote:
> On Thu, Jul 09, 2015 at 07:29:12PM +0100, Dave Gordon wrote:
>> From: Alex Dai <yu.dai@intel.com>
>>
>> GuC-based submission is mostly the same as execlist mode, up to
>> intel_logical_ring_advance_and_submit(), where the context being
>> dispatched would be added to the execlist queue; at this point
>> we submit the context to the GuC backend instead.
>>
>> There are, however, a few other changes also required, notably:
>> 1.  Contexts must be pinned at GGTT addresses accessible by the GuC
>>      i.e. NOT in the range [0..WOPCM_SIZE), so we have to add the
>>      PIN_OFFSET_BIAS flag to the relevant GGTT-pinning calls.
>>
>> 2.  The GuC's TLB must be invalidated after a context is pinned at
>>      a new GGTT address.

[snip]

>>   static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
>>   {
>> +	struct drm_i915_private *dev_priv = req->i915;
>>   	int ret;
>>
>> +	/* Invalidate GuC TLB. */
>> +	if (i915.enable_guc_submission)
>> +		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
>> +
 >
> [TOR:] This invalidation is in the init_context for the render
> ring but not the other rings.  Is this needed for other
> rings?  Or, should this invalidation happen at a different
> level?  It seems this may depend on the render ring being
> initialized first.
>
> Thanks,
> Tom

Hi Tom,

it looks like this is redundant here in the case where it's called from 
the non-default-context case of intel_lr_context_deferred_create(); but 
when called from i915_gem_init_hw() [via i915_gem_context_enable()] it 
wouldn't be, because the GuC TLB wouldn't have been flushed since the 
default context was pinned [which is in a completely different path 
through intel_lr_context_deferred_create()!].

However, if we add a TLB flush just after that point, we can remove this 
one here, with several advantages:
* firstly, that path is taken just once, rather than every time a new 
render context is created, and
* secondly, each flush can be specifically associated with a pin-to-gtt 
call that includes the (PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE) flags, 
showing that the pinned object is of interest to the GuC.

I'll also move the existing TLB flushes in guc_submission.c and 
guc_loader.c so that they're also just after the relevant 'pin' calls.
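
A minimal sketch of that pin-then-flush pairing (the helper name is 
hypothetical; the flags and registers are the ones this patch already 
uses):

	/* Pin an object into GGTT above WOPCM, where the GuC can see it,
	 * then invalidate the GuC's TLB so it picks up the new mapping.
	 */
	static int guc_ggtt_pin_and_flush(struct drm_i915_private *dev_priv,
					  struct drm_i915_gem_object *obj,
					  u32 align)
	{
		int ret;

		ret = i915_gem_obj_ggtt_pin(obj, align,
				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
		if (ret)
			return ret;

		/* Invalidate GuC TLB. */
		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
		return 0;
	}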

Thanks,
.Dave.
yu.dai@intel.com July 28, 2015, 4:47 p.m. UTC | #4
On 07/28/2015 06:59 AM, Dave Gordon wrote:
> [snip]
>
> However, if we add a TLB flush just after that point, we can remove this
> one here, with several advantages:
> * firstly, that path is taken just once, rather than every time a new
> render context is created, and
> * secondly, each flush can be specifically associated with a pin-to-gtt
> call that includes the (PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE) flags,
> showing that the pinned object is of interest to the GuC.
>
> I'll also move the existing TLB flushes in guc_submission.c and
> guc_loader.c so that they're also just after the relevant 'pin' calls.

Aye. -Alex

Patch

diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl
index 596b11d..0ff5fd7 100644
--- a/Documentation/DocBook/drm.tmpl
+++ b/Documentation/DocBook/drm.tmpl
@@ -4223,6 +4223,20 @@  int num_ioctls;</synopsis>
       </sect2>
     </sect1>
     <sect1>
+      <title>GuC-based Command Submission</title>
+      <sect2>
+        <title>GuC</title>
+!Pdrivers/gpu/drm/i915/intel_guc_loader.c GuC-specific firmware loader
+!Idrivers/gpu/drm/i915/intel_guc_loader.c
+      </sect2>
+      <sect2>
+        <title>GuC Client</title>
+!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submission
+!Idrivers/gpu/drm/i915/i915_guc_submission.c
+      </sect2>
+    </sect1>
+
+    <sect1>
       <title> Tracing </title>
       <para>
     This sections covers all things related to the tracepoints implemented in
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 13e37d1..d93732a 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1982,7 +1982,7 @@  static void i915_dump_lrc_obj(struct seq_file *m,
 		return;
 	}
 
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	if (!WARN_ON(page == NULL)) {
 		reg_state = kmap_atomic(page);
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 25d8807..c5c9fbf 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -346,18 +346,58 @@  static void guc_init_proc_desc(struct intel_guc *guc,
 static void guc_init_ctx_desc(struct intel_guc *guc,
 			      struct i915_guc_client *client)
 {
+	struct intel_context *ctx = client->owner;
 	struct guc_context_desc desc;
 	struct sg_table *sg;
+	int i;
 
 	memset(&desc, 0, sizeof(desc));
 
 	desc.attribute = GUC_CTX_DESC_ATTR_ACTIVE | GUC_CTX_DESC_ATTR_KERNEL;
 	desc.context_id = client->ctx_index;
 	desc.priority = client->priority;
-	desc.engines_used = (1 << RCS) | (1 << VCS) | (1 << BCS) |
-			    (1 << VECS) | (1 << VCS2); /* all engines */
 	desc.db_id = client->doorbell_id;
 
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		struct guc_execlist_context *lrc = &desc.lrc[i];
+		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
+		struct intel_engine_cs *ring;
+		struct drm_i915_gem_object *obj;
+		uint64_t ctx_desc;
+
+		/* TODO: We have a design issue to be solved here. Only when we
+		 * receive the first batch do we know which engine the user is
+		 * using. But here the GuC expects the lrc and ring to be pinned.
+		 * This is not an issue for the default context, which is the only
+		 * one that owns a GuC client for now. But any future owner of a
+		 * GuC client must ensure the lrc is pinned before getting here.
+		 */
+		obj = ctx->engine[i].state;
+		if (!obj)
+			break;
+
+		ring = ringbuf->ring;
+		ctx_desc = intel_lr_context_descriptor(ctx, ring);
+		lrc->context_desc = (u32)ctx_desc;
+
+		/* The state page is after PPHWSP */
+		lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
+				LRC_STATE_PN * PAGE_SIZE;
+		lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) |
+				(ring->id << GUC_ELC_ENGINE_OFFSET);
+
+		obj = ringbuf->obj;
+
+		lrc->ring_begin = i915_gem_obj_ggtt_offset(obj);
+		lrc->ring_end = lrc->ring_begin + obj->base.size - 1;
+		lrc->ring_next_free_location = lrc->ring_begin;
+		lrc->ring_current_tail_pointer_value = 0;
+
+		desc.engines_used |= (1 << ring->id);
+	}
+
+	WARN_ON(desc.engines_used == 0);
+
 	/*
 	 * The CPU address is only needed at certain points, so kmap_atomic on
 	 * demand instead of storing it in the ctx descriptor.
@@ -622,11 +662,13 @@  static void guc_client_free(struct drm_device *dev,
  * 		The kernel client to replace ExecList submission is created with
  * 		NORMAL priority. Priority of a client for scheduler can be HIGH,
  * 		while a preemption context can use CRITICAL.
+ * @ctx:	the context that owns the client (we use the default render context)
  *
  * Return:	An i915_guc_client object if success.
  */
 static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
-						uint32_t priority)
+						uint32_t priority,
+						struct intel_context *ctx)
 {
 	struct i915_guc_client *client;
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -639,6 +681,7 @@  static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
 
 	client->doorbell_id = GUC_INVALID_DOORBELL_ID;
 	client->priority = priority;
+	client->owner = ctx;
 
 	client->ctx_index = (uint32_t)ida_simple_get(&guc->ctx_ids, 0,
 			GUC_MAX_GPU_CONTEXTS, GFP_KERNEL);
@@ -772,10 +815,11 @@  int i915_guc_submission_enable(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_guc *guc = &dev_priv->guc;
+	struct intel_context *ctx = dev_priv->ring[RCS].default_context;
 	struct i915_guc_client *client;
 
 	/* client for execbuf submission */
-	client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL);
+	client = guc_client_alloc(dev, GUC_CTX_PRIORITY_NORMAL, ctx);
 	if (!client) {
 		DRM_ERROR("Failed to create execbuf guc_client\n");
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index d249326..9571b56 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -29,6 +29,7 @@ 
 
 struct i915_guc_client {
 	struct drm_i915_gem_object *client_obj;
+	struct intel_context *owner;
 	uint32_t priority;
 	uint32_t ctx_index;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 9e121d3..8294462 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -254,7 +254,8 @@  int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
  */
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
 {
-	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj);
+	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
+			LRC_PPHWSP_PN * PAGE_SIZE;
 
 	/* LRCA is required to be 4K aligned so the more significant 20 bits
 	 * are globally unique */
@@ -267,7 +268,8 @@  uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
 	uint64_t desc;
-	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
+	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
+			LRC_PPHWSP_PN * PAGE_SIZE;
 
 	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
 
@@ -342,7 +344,7 @@  void intel_lr_context_update(struct drm_i915_gem_request *rq)
 	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
 	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
 
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	reg_state = kmap_atomic(page);
 
 	reg_state[CTX_RING_TAIL+1] = rq->tail;
@@ -687,13 +689,17 @@  static void
 intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *ring = request->ring;
+	struct drm_i915_private *dev_priv = request->i915;
 
 	intel_logical_ring_advance(request->ringbuf);
 
 	if (intel_ring_stopped(ring))
 		return;
 
-	execlists_context_queue(request);
+	if (dev_priv->guc.execbuf_client)
+		i915_guc_submit(dev_priv->guc.execbuf_client, request);
+	else
+		execlists_context_queue(request);
 }
 
 static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
@@ -984,6 +990,7 @@  int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
 
 static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 {
+	struct drm_i915_private *dev_priv = rq->i915;
 	struct intel_engine_cs *ring = rq->ring;
 	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
 	struct intel_ringbuffer *ringbuf = rq->ringbuf;
@@ -991,14 +998,18 @@  static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
 	if (rq->ctx->engine[ring->id].pin_count++ == 0) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj,
-				GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
 		if (ret)
 			goto reset_pin_count;
 
 		ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
 		if (ret)
 			goto unpin_ctx_obj;
+
+		/* Invalidate GuC TLB. */
+		if (i915.enable_guc_submission)
+			I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
 	}
 
 	return ret;
@@ -1668,8 +1679,13 @@  out:
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
 {
+	struct drm_i915_private *dev_priv = req->i915;
 	int ret;
 
+	/* Invalidate GuC TLB. */
+	if (i915.enable_guc_submission)
+		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+
 	ret = intel_logical_ring_workarounds_emit(req);
 	if (ret)
 		return ret;
@@ -2026,7 +2042,7 @@  populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 
 	/* The second page of the context object contains some fields which must
 	 * be set up prior to the first execution. */
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	reg_state = kmap_atomic(page);
 
 	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
@@ -2185,12 +2201,13 @@  static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
 		struct drm_i915_gem_object *default_ctx_obj)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct page *page;
 
-	/* The status page is offset 0 from the default context object
-	 * in LRC mode. */
-	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj);
-	ring->status_page.page_addr =
-			kmap(sg_page(default_ctx_obj->pages->sgl));
+	/* The HWSP is part of the default context object in LRC mode. */
+	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
+			+ LRC_PPHWSP_PN * PAGE_SIZE;
+	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
+	ring->status_page.page_addr = kmap(page);
 	ring->status_page.obj = default_ctx_obj;
 
 	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
@@ -2226,6 +2243,9 @@  int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	context_size = round_up(get_lr_context_size(ring), 4096);
 
+	/* One extra page as shared data between the driver and the GuC */
+	context_size += PAGE_SIZE * LRC_PPHWSP_PN;
+
 	ctx_obj = i915_gem_alloc_object(dev, context_size);
 	if (!ctx_obj) {
 		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
@@ -2233,7 +2253,8 @@  int intel_lr_context_deferred_create(struct intel_context *ctx,
 	}
 
 	if (is_global_default_ctx) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
 		if (ret) {
 			DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n",
 					ret);
@@ -2252,7 +2273,7 @@  int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	ringbuf->ring = ring;
 
-	ringbuf->size = 32 * PAGE_SIZE;
+	ringbuf->size = 4 * PAGE_SIZE;
 	ringbuf->effective_size = ringbuf->size;
 	ringbuf->head = 0;
 	ringbuf->tail = 0;
@@ -2352,7 +2373,7 @@  void intel_lr_context_reset(struct drm_device *dev,
 			WARN(1, "Failed get_pages for context obj\n");
 			continue;
 		}
-		page = i915_gem_object_get_page(ctx_obj, 1);
+		page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 		reg_state = kmap_atomic(page);
 
 		reg_state[CTX_RING_HEAD+1] = 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 6ecc0b3..e04b5c2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -67,6 +67,12 @@  static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf,
 }
 
 /* Logical Ring Contexts */
+
+/* One extra page is added before the LRC to hold GuC shared data */
+#define LRC_GUCSHR_PN	(0)
+#define LRC_PPHWSP_PN	(LRC_GUCSHR_PN + 1)
+#define LRC_STATE_PN	(LRC_PPHWSP_PN + 1)
+
 void intel_lr_context_free(struct intel_context *ctx);
 int intel_lr_context_deferred_create(struct intel_context *ctx,
 				     struct intel_engine_cs *ring);