[RFC] drm/i915/lrc: allocate separate page for HWSP
diff mbox

Message ID 1499357440-34688-1-git-send-email-daniele.ceraolospurio@intel.com
State New
Headers show

Commit Message

Daniele Ceraolo Spurio July 6, 2017, 4:10 p.m. UTC
On gen8+ we're currently using the PPHWSP of the kernel ctx as the
global HWSP. However, when the kernel ctx gets submitted (e.g. from
__intel_autoenable_gt_powersave) the HW will use that page as both
HWSP and PPHWSP. Currently we're not seeing any problem because the
conflict happens at offsets below 0x30 in an area we don't access,
but that is not guaranteed to be true for future platform.

To avoid the conflict, instead of re-using the PPHWSP of the kernel
ctx we can allocate a separate page for the HWSP like what happens for
pre-execlists platform.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/intel_engine_cs.c  | 123 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c        |  42 +----------
 drivers/gpu/drm/i915/intel_ringbuffer.c | 125 +-------------------------------
 3 files changed, 127 insertions(+), 163 deletions(-)

Comments

Chris Wilson July 6, 2017, 5:59 p.m. UTC | #1
Quoting Daniele Ceraolo Spurio (2017-07-06 17:10:40)
> On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> global HWSP. However, when the kernel ctx gets submitted (e.g. from
> __intel_autoenable_gt_powersave) the HW will use that page as both
> HWSP and PPHWSP. Currently we're not seeing any problem because the
> conflict happens at offsets below 0x30 in an area we don't access,
> but that is not guaranteed to be true for future platform.
> 
> To avoid the conflict, instead of re-using the PPHWSP of the kernel
> ctx we can allocate a separate page for the HWSP like what happens for
> pre-execlists platform.

If there are no conflicts, then just skip the patch, and really there
shouldn't be since the writes we care about are ordered and the writes
we don't, we don't. We will need to move to per-context hwsp in the near
future which should alleviate these concerns.
-Chris
Michel Thierry July 6, 2017, 6:51 p.m. UTC | #2
On 7/6/2017 10:59 AM, Chris Wilson wrote:
 > If there are no conflicts, then just skip the patch, and really there
 > shouldn't be since the writes we care about are ordered and the writes
 > we don't, we don't. We will need to move to per-context hwsp in the near
 > future which should alleviate these concerns.

It helped me explain the strange data I was seeing in the HSWP during 
driver load came from (that random data wasn't really random, I was 
looking at the PPHWSP which later became the HWSP).

Just a comment/fix for GuC submission below, because as it is, this will 
break it.

-Michel

On 06/07/17 09:10, Daniele Ceraolo Spurio wrote:
> On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> global HWSP. However, when the kernel ctx gets submitted (e.g. from
> __intel_autoenable_gt_powersave) the HW will use that page as both
> HWSP and PPHWSP. Currently we're not seeing any problem because the
> conflict happens at offsets below 0x30 in an area we don't access,
> but that is not guaranteed to be true for future platform.
> 
> To avoid the conflict, instead of re-using the PPHWSP of the kernel
> ctx we can allocate a separate page for the HWSP like what happens for
> pre-execlists platform.
> 
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com> > ---
>   drivers/gpu/drm/i915/intel_engine_cs.c  | 123 +++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.c        |  42 +----------
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 125 +-------------------------------
>   3 files changed, 127 insertions(+), 163 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index a55cd72..70e9d88 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -444,6 +444,114 @@ static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine)
>   	i915_vma_unpin_and_release(&engine->scratch);
>   }
>   
> +static void cleanup_phys_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	if (!dev_priv->status_page_dmah)
> +		return;
> +
> +	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
> +	engine->status_page.page_addr = NULL;
> +}
> +
> +static void cleanup_status_page(struct intel_engine_cs *engine)
> +{
> +	struct i915_vma *vma;
> +	struct drm_i915_gem_object *obj;
> +
> +	vma = fetch_and_zero(&engine->status_page.vma);
> +	if (!vma)
> +		return;
> +
> +	obj = vma->obj;
> +
> +	i915_vma_unpin(vma);
> +	i915_vma_close(vma);
> +
> +	i915_gem_object_unpin_map(obj);
> +	__i915_gem_object_release_unless_active(obj);
> +}
> +
> +static int init_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_gem_object *obj;
> +	struct i915_vma *vma;
> +	unsigned int flags;
> +	void *vaddr;
> +	int ret;
> +
> +	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
> +	if (IS_ERR(obj)) {
> +		DRM_ERROR("Failed to allocate status page\n");
> +		return PTR_ERR(obj);
> +	}
> +
> +	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
> +	if (ret)
> +		goto err;
> +
> +	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto err;
> +	}
> +
> +	flags = PIN_GLOBAL;
> +	if (!HAS_LLC(engine->i915))
> +		/* On g33, we cannot place HWS above 256MiB, so
> +		 * restrict its pinning to the low mappable arena.
> +		 * Though this restriction is not documented for
> +		 * gen4, gen5, or byt, they also behave similarly
> +		 * and hang if the HWS is placed at the top of the
> +		 * GTT. To generalise, it appears that all !llc
> +		 * platforms have issues with us placing the HWS
> +		 * above the mappable region (even though we never
> +		 * actualy map it).

Chance to fix a typo ("actualy"), and proof that it's really just 
reusing code.

> +		 */
> +		flags |= PIN_MAPPABLE;
> +	ret = i915_vma_pin(vma, 0, 4096, flags);
> +	if (ret)
> +		goto err;

This will break GuC submission, the HWSP will be allocated correctly,

[ ] [drm:intel_engine_init_common [i915]] rcs0 hws offset: 0x00001000
[ ] [drm:intel_engine_init_common [i915]] bcs0 hws offset: 0x00002000
[ ] [drm:intel_engine_init_common [i915]] vcs0 hws offset: 0x00003000
[ ] [drm:intel_engine_init_common [i915]] vcs1 hws offset: 0x00004000
[ ] [drm:intel_engine_init_common [i915]] vecs0 hws offset: 0x00005000

But these are below GUC_WOPCM_TOP (0x80000), and our _beloved_ fw must 
be having problems accessing them, because this causes:

[ ] [drm:intel_guc_init_hw [i915]] GuC fw status: fetch SUCCESS, load 
PENDING
[ ] [drm:guc_ucode_xfer_dma [i915]] DMA status 0x10, GuC status 0x800071ec
[ ] [drm:guc_ucode_xfer_dma [i915]] returning -110

So we must need this restriction before pinning it,

@@ -510,6 +510,11 @@ static int init_status_page(struct intel_engine_cs 
*engine)
                  * actualy map it).
                  */
                 flags |= PIN_MAPPABLE;
+
+       /* GuC can only access pages above GUC_WOPCM_TOP ¯\_(ツ)_/¯ */
+       if (HAS_GUC(engine->i915) && i915.enable_guc_loading)
+               flags |= PIN_OFFSET_BIAS | GUC_WOPCM_TOP;
+
         ret = i915_vma_pin(vma, 0, 4096, flags);
         if (ret)
                 goto err;

With that change, GuC is happy:

[ ] [drm:intel_engine_init_common [i915]] rcs0 hws offset: 0x00084000
[ ] [drm:intel_engine_init_common [i915]] bcs0 hws offset: 0x00089000
[ ] [drm:intel_engine_init_common [i915]] vcs0 hws offset: 0x0008e000
[ ] [drm:intel_engine_init_common [i915]] vcs1 hws offset: 0x00093000
[ ] [drm:intel_engine_init_common [i915]] vecs0 hws offset: 0x00098000
...
[ ] [drm:intel_guc_init_hw [i915]] GuC fw status: fetch SUCCESS, load 
PENDING
[ ] [drm:guc_ucode_xfer_dma [i915]] DMA status 0x10, GuC status 0x8002f0ec
[ ] [drm:guc_ucode_xfer_dma [i915]] returning 0
[ ] [drm] GuC submission enabled (firmware i915/skl_guc_ver9_33.bin 
[version 9.33])


> +
> +	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		ret = PTR_ERR(vaddr);
> +		goto err_unpin;
> +	}
> +
> +	engine->status_page.vma = vma;
> +	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
> +	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
> +
> +	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
> +			 engine->name, i915_ggtt_offset(vma));
> +	return 0;
> +
> +err_unpin:
> +	i915_vma_unpin(vma);
> +err:
> +	i915_gem_object_put(obj);
> +	return ret;
> +}
> +
> +static int init_phys_status_page(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	GEM_BUG_ON(engine->id != RCS);
> +
> +	dev_priv->status_page_dmah =
> +		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
> +	if (!dev_priv->status_page_dmah)
> +		return -ENOMEM;
> +
> +	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
> +	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
> +
> +	return 0;
> +}
> +
>   /**
>    * intel_engines_init_common - initialize cengine state which might require hw access
>    * @engine: Engine to initialize.
> @@ -481,8 +589,18 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   	if (ret)
>   		goto err_unpin;
>   
> +	if (HWS_NEEDS_PHYSICAL(engine->i915))
> +		ret = init_phys_status_page(engine);
> +	else
> +		ret = init_status_page(engine);
> +	if (ret)
> +		goto err_rs_fini;
> +
>   	return 0;
>   
> +err_rs_fini:
> +	i915_gem_render_state_fini(engine);
> +
>   err_unpin:
>   	engine->context_unpin(engine, engine->i915->kernel_context);
>   	return ret;
> @@ -499,6 +617,11 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>   {
>   	intel_engine_cleanup_scratch(engine);
>   
> +	if (HWS_NEEDS_PHYSICAL(engine->i915))
> +		cleanup_phys_status_page(engine);
> +	else
> +		cleanup_status_page(engine);
> +
>   	i915_gem_render_state_fini(engine);
>   	intel_engine_fini_breadcrumbs(engine);
>   	intel_engine_cleanup_cmd_parser(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 699868d..d84a36c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1641,11 +1641,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
>   	if (engine->cleanup)
>   		engine->cleanup(engine);
>   
> -	if (engine->status_page.vma) {
> -		i915_gem_object_unpin_map(engine->status_page.vma->obj);
> -		engine->status_page.vma = NULL;
> -	}
> -
>   	intel_engine_cleanup_common(engine);
>   
>   	lrc_destroy_wa_ctx(engine);
> @@ -1692,24 +1687,6 @@ static void execlists_set_default_submission(struct intel_engine_cs *engine)
>   	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
>   }
>   
> -static int
> -lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma)
> -{
> -	const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE;
> -	void *hws;
> -
> -	/* The HWSP is part of the default context object in LRC mode. */
> -	hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
> -	if (IS_ERR(hws))
> -		return PTR_ERR(hws);
> -
> -	engine->status_page.page_addr = hws + hws_offset;
> -	engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset;
> -	engine->status_page.vma = vma;
> -
> -	return 0;
> -}
> -
>   static void
>   logical_ring_setup(struct intel_engine_cs *engine)
>   {
> @@ -1742,27 +1719,14 @@ static void execlists_set_default_submission(struct intel_engine_cs *engine)
>   	logical_ring_default_irqs(engine);
>   }
>   
> -static int
> -logical_ring_init(struct intel_engine_cs *engine)
> +static int logical_ring_init(struct intel_engine_cs *engine)
>   {
> -	struct i915_gem_context *dctx = engine->i915->kernel_context;
> -	int ret;
> +	int ret = 0;
>   
>   	ret = intel_engine_init_common(engine);
>   	if (ret)
> -		goto error;
> -
> -	/* And setup the hardware status page. */
> -	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
> -	if (ret) {
> -		DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret);
> -		goto error;
> -	}
> -
> -	return 0;
> +		intel_logical_ring_cleanup(engine);
>   
> -error:
> -	intel_logical_ring_cleanup(engine);
>   	return ret;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 5224b7a..b258053 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1174,113 +1174,7 @@ static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
>   	return 0;
>   }
>   
> -static void cleanup_phys_status_page(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -
> -	if (!dev_priv->status_page_dmah)
> -		return;
> -
> -	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
> -	engine->status_page.page_addr = NULL;
> -}
> -
> -static void cleanup_status_page(struct intel_engine_cs *engine)
> -{
> -	struct i915_vma *vma;
> -	struct drm_i915_gem_object *obj;
> -
> -	vma = fetch_and_zero(&engine->status_page.vma);
> -	if (!vma)
> -		return;
> -
> -	obj = vma->obj;
> -
> -	i915_vma_unpin(vma);
> -	i915_vma_close(vma);
> -
> -	i915_gem_object_unpin_map(obj);
> -	__i915_gem_object_release_unless_active(obj);
> -}
>   
> -static int init_status_page(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_gem_object *obj;
> -	struct i915_vma *vma;
> -	unsigned int flags;
> -	void *vaddr;
> -	int ret;
> -
> -	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
> -	if (IS_ERR(obj)) {
> -		DRM_ERROR("Failed to allocate status page\n");
> -		return PTR_ERR(obj);
> -	}
> -
> -	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
> -	if (ret)
> -		goto err;
> -
> -	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
> -	if (IS_ERR(vma)) {
> -		ret = PTR_ERR(vma);
> -		goto err;
> -	}
> -
> -	flags = PIN_GLOBAL;
> -	if (!HAS_LLC(engine->i915))
> -		/* On g33, we cannot place HWS above 256MiB, so
> -		 * restrict its pinning to the low mappable arena.
> -		 * Though this restriction is not documented for
> -		 * gen4, gen5, or byt, they also behave similarly
> -		 * and hang if the HWS is placed at the top of the
> -		 * GTT. To generalise, it appears that all !llc
> -		 * platforms have issues with us placing the HWS
> -		 * above the mappable region (even though we never
> -		 * actualy map it).
> -		 */
> -		flags |= PIN_MAPPABLE;
> -	ret = i915_vma_pin(vma, 0, 4096, flags);
> -	if (ret)
> -		goto err;
> -
> -	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
> -	if (IS_ERR(vaddr)) {
> -		ret = PTR_ERR(vaddr);
> -		goto err_unpin;
> -	}
> -
> -	engine->status_page.vma = vma;
> -	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
> -	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
> -
> -	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
> -			 engine->name, i915_ggtt_offset(vma));
> -	return 0;
> -
> -err_unpin:
> -	i915_vma_unpin(vma);
> -err:
> -	i915_gem_object_put(obj);
> -	return ret;
> -}
> -
> -static int init_phys_status_page(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -
> -	GEM_BUG_ON(engine->id != RCS);
> -
> -	dev_priv->status_page_dmah =
> -		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
> -	if (!dev_priv->status_page_dmah)
> -		return -ENOMEM;
> -
> -	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
> -	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
> -
> -	return 0;
> -}
>   
>   int intel_ring_pin(struct intel_ring *ring,
>   		   struct drm_i915_private *i915,
> @@ -1567,17 +1461,10 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   	if (err)
>   		goto err;
>   
> -	if (HWS_NEEDS_PHYSICAL(engine->i915))
> -		err = init_phys_status_page(engine);
> -	else
> -		err = init_status_page(engine);
> -	if (err)
> -		goto err;
> -
>   	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
>   	if (IS_ERR(ring)) {
>   		err = PTR_ERR(ring);
> -		goto err_hws;
> +		goto err;
>   	}
>   
>   	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
> @@ -1592,11 +1479,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   
>   err_ring:
>   	intel_ring_free(ring);
> -err_hws:
> -	if (HWS_NEEDS_PHYSICAL(engine->i915))
> -		cleanup_phys_status_page(engine);
> -	else
> -		cleanup_status_page(engine);
>   err:
>   	intel_engine_cleanup_common(engine);
>   	return err;
> @@ -1615,11 +1497,6 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
>   	if (engine->cleanup)
>   		engine->cleanup(engine);
>   
> -	if (HWS_NEEDS_PHYSICAL(dev_priv))
> -		cleanup_phys_status_page(engine);
> -	else
> -		cleanup_status_page(engine);
> -
>   	intel_engine_cleanup_common(engine);
>   
>   	dev_priv->engine[engine->id] = NULL;
>
Daniele Ceraolo Spurio July 7, 2017, 9:15 p.m. UTC | #3
On 06/07/17 11:51, Michel Thierry wrote:
> On 7/6/2017 10:59 AM, Chris Wilson wrote:
>  > If there are no conflicts, then just skip the patch, and really there
>  > shouldn't be since the writes we care about are ordered and the writes
>  > we don't, we don't. We will need to move to per-context hwsp in the near
>  > future which should alleviate these concerns.
> 
> It helped me explain the strange data I was seeing in the HSWP during 
> driver load came from (that random data wasn't really random, I was 
> looking at the PPHWSP which later became the HWSP).
> 
> Just a comment/fix for GuC submission below, because as it is, this will 
> break it.
> 
> -Michel
> 

I'd argue that since we're doing it wrong we should fix it even if it is 
not currently failing, as things could go wrong in unexpected ways 
(especially on new HW). However, I'm happy to drop this patch now if 
we're going to fix it later with the per-context HWSP.


<snip>

>> +         */
>> +        flags |= PIN_MAPPABLE;
>> +    ret = i915_vma_pin(vma, 0, 4096, flags);
>> +    if (ret)
>> +        goto err;
> 
> This will break GuC submission, the HWSP will be allocated correctly,
> 
> [ ] [drm:intel_engine_init_common [i915]] rcs0 hws offset: 0x00001000
> [ ] [drm:intel_engine_init_common [i915]] bcs0 hws offset: 0x00002000
> [ ] [drm:intel_engine_init_common [i915]] vcs0 hws offset: 0x00003000
> [ ] [drm:intel_engine_init_common [i915]] vcs1 hws offset: 0x00004000
> [ ] [drm:intel_engine_init_common [i915]] vecs0 hws offset: 0x00005000
> 
> But these are below GUC_WOPCM_TOP (0x80000), and our _beloved_ fw must 
> be having problems accessing them, because this causes:
> 
> [ ] [drm:intel_guc_init_hw [i915]] GuC fw status: fetch SUCCESS, load 
> PENDING
> [ ] [drm:guc_ucode_xfer_dma [i915]] DMA status 0x10, GuC status 0x800071ec
> [ ] [drm:guc_ucode_xfer_dma [i915]] returning -110
> 
> So we must need this restriction before pinning it,
> 
> @@ -510,6 +510,11 @@ static int init_status_page(struct intel_engine_cs 
> *engine)
>                   * actualy map it).
>                   */
>                  flags |= PIN_MAPPABLE;
> +
> +       /* GuC can only access pages above GUC_WOPCM_TOP ¯\_(ツ)_/¯ */
> +       if (HAS_GUC(engine->i915) && i915.enable_guc_loading)
> +               flags |= PIN_OFFSET_BIAS | GUC_WOPCM_TOP;
> +
>          ret = i915_vma_pin(vma, 0, 4096, flags);
>          if (ret)
>                  goto err;
> 
> With that change, GuC is happy:
> 
> [ ] [drm:intel_engine_init_common [i915]] rcs0 hws offset: 0x00084000
> [ ] [drm:intel_engine_init_common [i915]] bcs0 hws offset: 0x00089000
> [ ] [drm:intel_engine_init_common [i915]] vcs0 hws offset: 0x0008e000
> [ ] [drm:intel_engine_init_common [i915]] vcs1 hws offset: 0x00093000
> [ ] [drm:intel_engine_init_common [i915]] vecs0 hws offset: 0x00098000
> ...
> [ ] [drm:intel_guc_init_hw [i915]] GuC fw status: fetch SUCCESS, load 
> PENDING
> [ ] [drm:guc_ucode_xfer_dma [i915]] DMA status 0x10, GuC status 0x8002f0ec
> [ ] [drm:guc_ucode_xfer_dma [i915]] returning 0
> [ ] [drm] GuC submission enabled (firmware i915/skl_guc_ver9_33.bin 
> [version 9.33])
> 
> 

After a bit of investigation I've found that the issue is not actually 
with the positioning of the HWSP but with the fact that we use 
status_page.ggtt_offset to point to the lrca offset of the default 
context in guc_ads_create instead of using 
kernel_context->engine[].state. With that fixed GuC works fine even with 
the HWSP below GUC_WOPCM_TOP.

Thanks,
Daniele
Michel Thierry July 7, 2017, 11:53 p.m. UTC | #4
On 07/07/17 14:15, Daniele Ceraolo Spurio wrote:
> After a bit of investigation I've found that the issue is not actually 
> with the positioning of the HWSP but with the fact that we use 
> status_page.ggtt_offset to point to the lrca offset of the default 
> context in guc_ads_create instead of using 
> kernel_context->engine[].state. With that fixed GuC works fine even with 
> the HWSP below GUC_WOPCM_TOP.

Agreed, and it's something worth fixing (regardless of what we end up 
deciding about the HWSP). ads.golden_context_lrca shouldn't be relying 
on the HWSP location.

Thanks for spotting this.

-Michel
Chris Wilson July 12, 2017, 9:23 p.m. UTC | #5
Quoting Daniele Ceraolo Spurio (2017-07-06 17:10:40)
> On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> global HWSP. However, when the kernel ctx gets submitted (e.g. from
> __intel_autoenable_gt_powersave) the HW will use that page as both
> HWSP and PPHWSP. Currently we're not seeing any problem because the
> conflict happens at offsets below 0x30 in an area we don't access,
> but that is not guaranteed to be true for future platform.

More to the point, I've stumbled upon a reason to want a separate HWSP
for execlists, today, as we can access the CSB from within the cacheable
HWSP. I evaluated this patch thinking that there was no need, since the
only use we have for contents of the status_page is the seqno, everything
else is volatile.

I'll look at it again in a fresh light, and at first glance it appears
to be just the simple translation one expects.
-Chris
Chris Wilson July 12, 2017, 9:50 p.m. UTC | #6
Quoting Daniele Ceraolo Spurio (2017-07-06 17:10:40)
> On gen8+ we're currently using the PPHWSP of the kernel ctx as the
> global HWSP. However, when the kernel ctx gets submitted (e.g. from
> __intel_autoenable_gt_powersave) the HW will use that page as both
> HWSP and PPHWSP. Currently we're not seeing any problem because the
> conflict happens at offsets below 0x30 in an area we don't access,
> but that is not guaranteed to be true for future platform.
> 
> To avoid the conflict, instead of re-using the PPHWSP of the kernel
> ctx we can allocate a separate page for the HWSP like what happens for
> pre-execlists platform.

We should add some concrete reason why we want it today...

> +static void cleanup_status_page(struct intel_engine_cs *engine)
> +{
> +       struct i915_vma *vma;
> +       struct drm_i915_gem_object *obj;

This is virtually i915_vma_unpin_and_release(), I think there are enough
users that adding some flags to unpin the map may be a good refactor.

> -static int
> -logical_ring_init(struct intel_engine_cs *engine)
> +static int logical_ring_init(struct intel_engine_cs *engine)
>  {
> -       struct i915_gem_context *dctx = engine->i915->kernel_context;
> -       int ret;
> +       int ret = 0;
>  
>         ret = intel_engine_init_common(engine);
>         if (ret)
> -               goto error;

I prefer if we keep the error:, it makes adding new steps easier.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index a55cd72..70e9d88 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -444,6 +444,114 @@  static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine)
 	i915_vma_unpin_and_release(&engine->scratch);
 }
 
+static void cleanup_phys_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	if (!dev_priv->status_page_dmah)
+		return;
+
+	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
+	engine->status_page.page_addr = NULL;
+}
+
+static void cleanup_status_page(struct intel_engine_cs *engine)
+{
+	struct i915_vma *vma;
+	struct drm_i915_gem_object *obj;
+
+	vma = fetch_and_zero(&engine->status_page.vma);
+	if (!vma)
+		return;
+
+	obj = vma->obj;
+
+	i915_vma_unpin(vma);
+	i915_vma_close(vma);
+
+	i915_gem_object_unpin_map(obj);
+	__i915_gem_object_release_unless_active(obj);
+}
+
+static int init_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *vma;
+	unsigned int flags;
+	void *vaddr;
+	int ret;
+
+	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		DRM_ERROR("Failed to allocate status page\n");
+		return PTR_ERR(obj);
+	}
+
+	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
+	if (ret)
+		goto err;
+
+	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err;
+	}
+
+	flags = PIN_GLOBAL;
+	if (!HAS_LLC(engine->i915))
+		/* On g33, we cannot place HWS above 256MiB, so
+		 * restrict its pinning to the low mappable arena.
+		 * Though this restriction is not documented for
+		 * gen4, gen5, or byt, they also behave similarly
+		 * and hang if the HWS is placed at the top of the
+		 * GTT. To generalise, it appears that all !llc
+		 * platforms have issues with us placing the HWS
+		 * above the mappable region (even though we never
+		 * actualy map it).
+		 */
+		flags |= PIN_MAPPABLE;
+	ret = i915_vma_pin(vma, 0, 4096, flags);
+	if (ret)
+		goto err;
+
+	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
+	if (IS_ERR(vaddr)) {
+		ret = PTR_ERR(vaddr);
+		goto err_unpin;
+	}
+
+	engine->status_page.vma = vma;
+	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
+	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
+
+	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
+			 engine->name, i915_ggtt_offset(vma));
+	return 0;
+
+err_unpin:
+	i915_vma_unpin(vma);
+err:
+	i915_gem_object_put(obj);
+	return ret;
+}
+
+static int init_phys_status_page(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	GEM_BUG_ON(engine->id != RCS);
+
+	dev_priv->status_page_dmah =
+		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
+	if (!dev_priv->status_page_dmah)
+		return -ENOMEM;
+
+	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
+	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
+
+	return 0;
+}
+
 /**
  * intel_engines_init_common - initialize cengine state which might require hw access
  * @engine: Engine to initialize.
@@ -481,8 +589,18 @@  int intel_engine_init_common(struct intel_engine_cs *engine)
 	if (ret)
 		goto err_unpin;
 
+	if (HWS_NEEDS_PHYSICAL(engine->i915))
+		ret = init_phys_status_page(engine);
+	else
+		ret = init_status_page(engine);
+	if (ret)
+		goto err_rs_fini;
+
 	return 0;
 
+err_rs_fini:
+	i915_gem_render_state_fini(engine);
+
 err_unpin:
 	engine->context_unpin(engine, engine->i915->kernel_context);
 	return ret;
@@ -499,6 +617,11 @@  void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 {
 	intel_engine_cleanup_scratch(engine);
 
+	if (HWS_NEEDS_PHYSICAL(engine->i915))
+		cleanup_phys_status_page(engine);
+	else
+		cleanup_status_page(engine);
+
 	i915_gem_render_state_fini(engine);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 699868d..d84a36c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1641,11 +1641,6 @@  void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	if (engine->cleanup)
 		engine->cleanup(engine);
 
-	if (engine->status_page.vma) {
-		i915_gem_object_unpin_map(engine->status_page.vma->obj);
-		engine->status_page.vma = NULL;
-	}
-
 	intel_engine_cleanup_common(engine);
 
 	lrc_destroy_wa_ctx(engine);
@@ -1692,24 +1687,6 @@  static void execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
 }
 
-static int
-lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma)
-{
-	const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE;
-	void *hws;
-
-	/* The HWSP is part of the default context object in LRC mode. */
-	hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
-	if (IS_ERR(hws))
-		return PTR_ERR(hws);
-
-	engine->status_page.page_addr = hws + hws_offset;
-	engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset;
-	engine->status_page.vma = vma;
-
-	return 0;
-}
-
 static void
 logical_ring_setup(struct intel_engine_cs *engine)
 {
@@ -1742,27 +1719,14 @@  static void execlists_set_default_submission(struct intel_engine_cs *engine)
 	logical_ring_default_irqs(engine);
 }
 
-static int
-logical_ring_init(struct intel_engine_cs *engine)
+static int logical_ring_init(struct intel_engine_cs *engine)
 {
-	struct i915_gem_context *dctx = engine->i915->kernel_context;
-	int ret;
+	int ret = 0;
 
 	ret = intel_engine_init_common(engine);
 	if (ret)
-		goto error;
-
-	/* And setup the hardware status page. */
-	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
-	if (ret) {
-		DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret);
-		goto error;
-	}
-
-	return 0;
+		intel_logical_ring_cleanup(engine);
 
-error:
-	intel_logical_ring_cleanup(engine);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 5224b7a..b258053 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1174,113 +1174,7 @@  static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
 	return 0;
 }
 
-static void cleanup_phys_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	if (!dev_priv->status_page_dmah)
-		return;
-
-	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
-	engine->status_page.page_addr = NULL;
-}
-
-static void cleanup_status_page(struct intel_engine_cs *engine)
-{
-	struct i915_vma *vma;
-	struct drm_i915_gem_object *obj;
-
-	vma = fetch_and_zero(&engine->status_page.vma);
-	if (!vma)
-		return;
-
-	obj = vma->obj;
-
-	i915_vma_unpin(vma);
-	i915_vma_close(vma);
-
-	i915_gem_object_unpin_map(obj);
-	__i915_gem_object_release_unless_active(obj);
-}
 
-static int init_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_gem_object *obj;
-	struct i915_vma *vma;
-	unsigned int flags;
-	void *vaddr;
-	int ret;
-
-	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
-	if (IS_ERR(obj)) {
-		DRM_ERROR("Failed to allocate status page\n");
-		return PTR_ERR(obj);
-	}
-
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
-	if (ret)
-		goto err;
-
-	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
-		goto err;
-	}
-
-	flags = PIN_GLOBAL;
-	if (!HAS_LLC(engine->i915))
-		/* On g33, we cannot place HWS above 256MiB, so
-		 * restrict its pinning to the low mappable arena.
-		 * Though this restriction is not documented for
-		 * gen4, gen5, or byt, they also behave similarly
-		 * and hang if the HWS is placed at the top of the
-		 * GTT. To generalise, it appears that all !llc
-		 * platforms have issues with us placing the HWS
-		 * above the mappable region (even though we never
-		 * actualy map it).
-		 */
-		flags |= PIN_MAPPABLE;
-	ret = i915_vma_pin(vma, 0, 4096, flags);
-	if (ret)
-		goto err;
-
-	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
-	if (IS_ERR(vaddr)) {
-		ret = PTR_ERR(vaddr);
-		goto err_unpin;
-	}
-
-	engine->status_page.vma = vma;
-	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
-	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
-
-	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
-			 engine->name, i915_ggtt_offset(vma));
-	return 0;
-
-err_unpin:
-	i915_vma_unpin(vma);
-err:
-	i915_gem_object_put(obj);
-	return ret;
-}
-
-static int init_phys_status_page(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	GEM_BUG_ON(engine->id != RCS);
-
-	dev_priv->status_page_dmah =
-		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
-	if (!dev_priv->status_page_dmah)
-		return -ENOMEM;
-
-	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
-	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
-
-	return 0;
-}
 
 int intel_ring_pin(struct intel_ring *ring,
 		   struct drm_i915_private *i915,
@@ -1567,17 +1461,10 @@  static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	if (err)
 		goto err;
 
-	if (HWS_NEEDS_PHYSICAL(engine->i915))
-		err = init_phys_status_page(engine);
-	else
-		err = init_status_page(engine);
-	if (err)
-		goto err;
-
 	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
 	if (IS_ERR(ring)) {
 		err = PTR_ERR(ring);
-		goto err_hws;
+		goto err;
 	}
 
 	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
@@ -1592,11 +1479,6 @@  static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 
 err_ring:
 	intel_ring_free(ring);
-err_hws:
-	if (HWS_NEEDS_PHYSICAL(engine->i915))
-		cleanup_phys_status_page(engine);
-	else
-		cleanup_status_page(engine);
 err:
 	intel_engine_cleanup_common(engine);
 	return err;
@@ -1615,11 +1497,6 @@  void intel_engine_cleanup(struct intel_engine_cs *engine)
 	if (engine->cleanup)
 		engine->cleanup(engine);
 
-	if (HWS_NEEDS_PHYSICAL(dev_priv))
-		cleanup_phys_status_page(engine);
-	else
-		cleanup_status_page(engine);
-
 	intel_engine_cleanup_common(engine);
 
 	dev_priv->engine[engine->id] = NULL;