[v5,1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
diff mbox

Message ID 1434632855-7080-2-git-send-email-arun.siluvery@linux.intel.com
State New
Headers show

Commit Message

arun.siluvery@linux.intel.com June 18, 2015, 1:07 p.m. UTC
Some of the WA are to be applied during context save but before restore, and
some at the end of context save/restore but before executing the instructions
in the ring. WA batch buffers are created for this purpose, as these WA cannot
be applied using normal means. Each context has two registers to load the
offsets of these batch buffers. If they are non-zero, HW understands that it
needs to execute these batches.

v1: In this version two separate ring_buffer objects were used to load WA
instructions for indirect and per context batch buffers and they were part
of every context.

v2: Chris suggested to include an additional page in context and use it to load
these WA instead of creating separate objects. This will simplify a lot of things
as we need not explicitly pin/unpin them. Thomas Daniel further pointed out that GuC
is planning to use a similar setup to share data between GuC and driver and
WA batch buffers can probably share that page. However after discussions with
Dave who is implementing GuC changes, he suggested to use an independent page
for these reasons - GuC area might grow and these WA are initialized only once and
are not changed afterwards so we can share them across all contexts.

The page is updated with WA during render ring init. This has an advantage of
not adding more special cases to default_context.

We don't know upfront the number of WA we will be applying using these batch buffers.
For this reason the size was fixed earlier but it is not a good idea. To fix this,
the functions that load instructions are modified to report the number of commands
inserted and the size is now calculated after the batch is updated. A macro is
introduced to add commands to these batch buffers which also checks for overflow
and returns error.
We have a full page dedicated for these WA so that should be sufficient for
good number of WA, anything more means we have major issues.
The list for Gen8 is small, same for Gen9 also, maybe few more gets added
going forward but not close to filling entire page. Chris suggested a two-pass
approach but we agreed to go with single page setup as it is a one-off routine
and simpler code wins. Moved around functions to simplify it further, add comments.

One additional option is offset field which is helpful if we would like to
have multiple batches at different offsets within the page and select them
based on some criteria. This is not a requirement at this point but could
help in future (Dave).

(Thanks to Chris, Dave and Thomas for their reviews and inputs)

Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 204 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  18 +++
 2 files changed, 218 insertions(+), 4 deletions(-)

Comments

Chris Wilson June 18, 2015, 4:06 p.m. UTC | #1
I'm pretty happy with the code, I was just confused by the series
changing the setup halfway through

On Thu, Jun 18, 2015 at 02:07:30PM +0100, Arun Siluvery wrote:
> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
> +				    uint32_t **wa_ctx_batch,
> +				    uint32_t offset,
> +				    uint32_t *num_dwords)
> +{
> +	uint32_t index;
> +	uint32_t *batch = *wa_ctx_batch;
> +
> +	index = offset;
> +
> +	/* FIXME: fill one cacheline with NOOPs.
> +	 * Replace these instructions with WA
> +	 */
> +	while (index < (offset + 16))
> +		wa_ctx_emit(batch, MI_NOOP);

If this was

/* Replace me with WA */
wa_ctx_emit(batch, MI_NOOP)

/* Pad to end of cacheline */
while (index % 16)
	wa_ctx_emit(batch, MI_NOOP);

You then don't need to alter the code when you add the real w/a. Note
that using (unsigned long)batch as you do later for cacheline
calculation is wrong, as that is a local physical CPU address (not the
virtual address used by the cache in the GPU) and was page aligned
anyway.

Similarly,

> +static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
> +			       uint32_t **wa_ctx_batch,
> +			       uint32_t offset,
> +			       uint32_t *num_dwords)
> +{
> +	uint32_t index;
> +	uint32_t *batch = *wa_ctx_batch;
> +
> +	index = offset;
> +

If this just did
		wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
rather than insert a cacheline of noops, again you wouldn't need to
touch this infrastructure as you added the w/a.

As it stands, I was a little worried halfway through when the cache
alignment suddenly disappeared - but this patch implied to me that it
was necessary.
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0413b8f..ad0b189 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -211,6 +211,7 @@  enum {
 	FAULT_AND_CONTINUE /* Unsupported */
 };
 #define GEN8_CTX_ID_SHIFT 32
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
 
 static int intel_lr_context_pin(struct intel_engine_cs *ring,
 		struct intel_context *ctx);
@@ -1077,6 +1078,173 @@  static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
 	return 0;
 }
 
+/* Store @cmd at batch[index++]; uses the caller's local 'index' and makes the CALLER return -ENOSPC on page overflow */
+#define wa_ctx_emit(batch, cmd) do {					\
+		if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t))))	\
+			return -ENOSPC;					\
+		(batch)[index++] = (cmd);				\
+	} while (0)
+
+/**
+ * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
+ *
+ * @ring: only applicable for RCS
+ * @wa_ctx_batch: page in which WA are loaded
+ * @offset: dword index in the page where this batch starts; for future use in
+ *          case we want multiple batches at different offsets, chosen by criteria.
+ * @num_dwords: The number of WA applied is not known at the beginning, so it
+ * returns the no of DWORDS written. This batch does not contain MI_BATCH_BUFFER_END
+ * so it adds padding to make it cacheline aligned. MI_BATCH_BUFFER_END will be
+ * added to perctx batch and both of them together makes a complete batch buffer.
+ *
+ * Return: non-zero if we exceed the PAGE_SIZE limit.
+ */
+
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
+				    uint32_t **wa_ctx_batch,
+				    uint32_t offset,
+				    uint32_t *num_dwords)
+{
+	uint32_t index;
+	uint32_t *batch = *wa_ctx_batch;
+
+	index = offset;
+
+	/* FIXME: fill one cacheline with NOOPs.
+	 * Replace these instructions with WA
+	 */
+	while (index < (offset + 16))
+		wa_ctx_emit(batch, MI_NOOP);
+
+	/*
+	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+	 * execution depends on the length specified in terms of cache lines
+	 * in the register CTX_RCS_INDIRECT_CTX
+	 */
+
+	*num_dwords = index - offset;
+
+	return 0;
+}
+
+/*
+ * gen8_init_perctx_bb() - initialize per ctx batch with WA
+ *
+ * Writes the batch into *@wa_ctx_batch starting at dword @offset and reports
+ * the dwords written in @num_dwords. It ends with MI_BATCH_BUFFER_END.
+ */
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
+			       uint32_t **wa_ctx_batch,
+			       uint32_t offset,
+			       uint32_t *num_dwords)
+{
+	uint32_t index;
+	uint32_t *batch = *wa_ctx_batch;
+
+	index = offset;
+
+	/* FIXME: fill one cacheline with NOOPs.
+	 * Replace these instructions with WA
+	 */
+	while (index < (offset + 16))
+		wa_ctx_emit(batch, MI_NOOP);
+
+	batch[index - 1] = MI_BATCH_BUFFER_END; /* replaces the last NOOP emitted above */
+
+	*num_dwords = index - offset;
+
+	return 0;
+}
+
+static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
+{
+	int ret;
+
+	WARN_ON(ring->id != RCS);
+	/* Allocate the object backing the WA batches, rounded up to whole pages. */
+	ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
+	if (!ring->wa_ctx.obj) {
+		DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
+		return -ENOMEM;
+	}
+	/* Pin it in the GGTT so its offset can be written into the context. */
+	ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
+	if (ret) {
+		DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", ret);
+		drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+		ring->wa_ctx.obj = NULL; /* callers test this pointer; don't leave it stale */
+		return ret;
+	}
+
+	return 0;
+}
+
+static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
+{
+	WARN_ON(ring->id != RCS);
+	/* Unpin, drop the reference, then clear the pointer that callers test; obj must be non-NULL here. */
+	i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
+	drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+	ring->wa_ctx.obj = NULL;
+}
+
+/*
+ * Fill the WA page with the Gen8 indirect-ctx and per-ctx batches.
+ * Returns 0 on success or when there is nothing to do, else an error code.
+ */
+static int intel_init_workaround_bb(struct intel_engine_cs *ring)
+{
+	int ret;
+	uint32_t *batch;
+	uint32_t num_dwords;
+	struct page *page;
+	struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
+
+	WARN_ON(ring->id != RCS);
+
+	if (ring->scratch.obj == NULL) {
+		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
+		return -EINVAL;
+	}
+
+	/* Bail out before allocating: populate_lr_context() programs the batch
+	 * pointers whenever wa_ctx.obj is set, so never leave an unfilled page.
+	 */
+	if (WARN(INTEL_INFO(ring->dev)->gen != 8,
+		 "WA batch buffer is not initialized for Gen%d\n",
+		 INTEL_INFO(ring->dev)->gen))
+		return 0;
+
+	ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+		return ret;
+	}
+
+	page = i915_gem_object_get_page(wa_ctx->obj, 0);
+	batch = kmap_atomic(page);
+
+	wa_ctx->indctx_batch_offset = 0;
+	ret = gen8_init_indirectctx_bb(ring, &batch,
+				       wa_ctx->indctx_batch_offset,
+				       &num_dwords);
+	if (ret)
+		goto out;
+
+	wa_ctx->indctx_batch_size = round_up(num_dwords, CACHELINE_DWORDS);
+	wa_ctx->perctx_batch_offset = wa_ctx->indctx_batch_size;
+
+	ret = gen8_init_perctx_bb(ring, &batch,
+				  wa_ctx->perctx_batch_offset, &num_dwords);
+
+out:
+	kunmap_atomic(batch);
+	if (ret)
+		lrc_destroy_wa_ctx_obj(ring);
+
+	return ret;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
@@ -1411,6 +1579,9 @@  void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 		kunmap(sg_page(ring->status_page.obj->pages->sgl));
 		ring->status_page.obj = NULL;
 	}
+
+	if (ring->wa_ctx.obj)
+		lrc_destroy_wa_ctx_obj(ring);
 }
 
 static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
@@ -1474,7 +1645,21 @@  static int logical_render_ring_init(struct drm_device *dev)
 	if (ret)
 		return ret;
 
-	return intel_init_pipe_control(ring);
+	if (INTEL_INFO(ring->dev)->gen >= 8) {
+		ret = intel_init_workaround_bb(ring);
+		if (ret) {
+			DRM_ERROR("WA batch buffers are not initialized: %d\n",
+				  ret);
+		}
+	}
+
+	ret = intel_init_pipe_control(ring);
+	if (ret) {
+		if (ring->wa_ctx.obj)
+			lrc_destroy_wa_ctx_obj(ring);
+	}
+
+	return ret;
 }
 
 static int logical_bsd_ring_init(struct drm_device *dev)
@@ -1754,15 +1939,26 @@  populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
 	reg_state[CTX_SECOND_BB_STATE+1] = 0;
 	if (ring->id == RCS) {
-		/* TODO: according to BSpec, the register state context
-		 * for CHV does not have these. OTOH, these registers do
-		 * exist in CHV. I'm waiting for a clarification */
 		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
 		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
 		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
 		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
 		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
 		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+		if (ring->wa_ctx.obj) {
+			reg_state[CTX_RCS_INDIRECT_CTX+1] =
+				(i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+				 ring->wa_ctx.indctx_batch_offset * sizeof(uint32_t)) |
+				(ring->wa_ctx.indctx_batch_size / CACHELINE_DWORDS);
+
+			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+
+			reg_state[CTX_BB_PER_CTX_PTR+1] =
+				(i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+				 ring->wa_ctx.perctx_batch_offset * sizeof(uint32_t)) |
+				0x01;
+		}
 	}
 	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
 	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39f6dfc..1f38af3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -12,6 +12,7 @@ 
  * workarounds!
  */
 #define CACHELINE_BYTES 64
+#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
 
 /*
  * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
@@ -119,6 +120,22 @@  struct intel_ringbuffer {
 
 struct	intel_context;
 
+/*
+ * We use a single page to load ctx workarounds, so all of these values
+ * are expressed in dwords.
+ *
+ * offset fields - allow several batches at different offsets within the
+ * page, selected on some condition. Not required yet, kept for future use.
+ * indctx_batch_size - stored in dwords, rounded up to whole cachelines; it
+ * is divided by CACHELINE_DWORDS when programmed, as HW wants cachelines.
+ */
+struct  i915_ctx_workarounds {
+	u32 indctx_batch_offset;
+	u32 indctx_batch_size;
+	u32 perctx_batch_offset;
+	struct drm_i915_gem_object *obj;
+};
+
 struct  intel_engine_cs {
 	const char	*name;
 	enum intel_ring_id {
@@ -142,6 +159,7 @@  struct  intel_engine_cs {
 	struct i915_gem_batch_pool batch_pool;
 
 	struct intel_hw_status_page status_page;
+	struct i915_ctx_workarounds wa_ctx;
 
 	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
 	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */