
[6/8] drm/i915: reprogram NOA muxes on context switch when using perf

Message ID 20180425114521.7524-7-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived

Commit Message

Lionel Landwerlin April 25, 2018, 11:45 a.m. UTC
If some of the contexts submitting workloads to the GPU have been
configured to shut down slices/subslices, we might lose the NOA
configurations written in the NOA muxes. We need to reprogram them at
context switch.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  2 +
 drivers/gpu/drm/i915/i915_perf.c | 92 +++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/intel_lrc.c | 71 +++++++++++++++++++++---
 drivers/gpu/drm/i915/intel_lrc.h |  1 +
 4 files changed, 153 insertions(+), 13 deletions(-)

Comments

Chris Wilson April 25, 2018, 11:57 a.m. UTC | #1
Quoting Lionel Landwerlin (2018-04-25 12:45:19)
> If some of the contexts submitting workloads to the GPU have been
> configured to shut down slices/subslices, we might lose the NOA
> configurations written in the NOA muxes. We need to reprogram them at
> context switch.

On every single batchbuffer, no way. That's around a 33% latency hit,
ignoring the cost of what's inside the wa_bb.

If they are not context saved, userspace isn't going to be allowed to
modify them. So why do we need this? If they are not context saved, then
the kernel needs to control them and doesn't need to reset around every
batch, just the ones that want non-default (fancier if we handle
preemption)?
-Chris
Chris Wilson April 25, 2018, 1:23 p.m. UTC | #2
Quoting Chris Wilson (2018-04-25 12:57:24)
> If they are not context saved, userspace isn't going to be allowed to
> modify them. So why do we need this? If they are not context saved, then
> the kernel needs to control them and doesn't need to reset around every
> batch, just the ones that want non-default (fancier if we handle
> preemption)?

Preemption is the killer here. Jump from one preempted batch back into
the middle of another and you skip over all the ring preambles. We could
use the preemption ring itself to set state, though.

Still, I would rather see an alternative to wa_bb than incur the cost
where it is never needed :|
-Chris
Lionel Landwerlin April 25, 2018, 2:35 p.m. UTC | #3
On 25/04/18 12:57, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2018-04-25 12:45:19)
>> If some of the contexts submitting workloads to the GPU have been
>> configured to shut down slices/subslices, we might lose the NOA
>> configurations written in the NOA muxes. We need to reprogram them at
>> context switch.
> On every single batchbuffer, no way. That's around a 33% latency hit,
> ignoring the cost of what's inside the wa_bb.
>
> If they are not context saved, userspace isn't going to be allowed to
> modify them. So why do we need this? If they are not context saved, then
> the kernel needs to control them and doesn't need to reset around every
> batch, just the ones that want non-default (fancier if we handle
> preemption)?
> -Chris
>
These are neither context saved nor power saved :(

I can't tell whether the GuC might schedule things behind our back, but
eventually that will happen anyway. That's why I put the reprogramming
in the per-context workaround batch.
Agreed, it sucks.

I'll look into the fancier things.

Thanks,

-
Lionel
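
[Editor's note] For reference, the per-context workaround batch sizing
used in the patch below boils down to a DIV_ROUND_UP over the number of
NOA mux writes: each MI_LOAD_REGISTER_IMM header is one dword and carries
at most 125 register/value pairs (two dwords each), followed by a 6-dword
PIPE_CONTROL. A minimal standalone sketch of that arithmetic — the helper
name and the example register count are illustrative only; in the patch
this logic lives in i915_oa_get_perctx_bb_size():

#include <stdio.h>

#define MAX_LRI_SIZE 125U	/* register/value pairs per MI_LOAD_REGISTER_IMM */

/* Illustrative helper: bytes needed for the NOA mux reprogramming batch. */
static unsigned int noa_reprogram_bb_bytes(unsigned int mux_regs_len)
{
	/* DIV_ROUND_UP(mux_regs_len, MAX_LRI_SIZE) LRI headers are emitted. */
	unsigned int n_lri = (mux_regs_len + MAX_LRI_SIZE - 1) / MAX_LRI_SIZE;

	/* 4 bytes per LRI header, 8 bytes per register write,
	 * 24 bytes (6 dwords) for the trailing PIPE_CONTROL.
	 */
	return n_lri * 4 + mux_regs_len * 8 + 24;
}

int main(void)
{
	/* e.g. a hypothetical config with 300 mux writes needs 3 LRIs: 2436 bytes */
	printf("%u\n", noa_reprogram_bb_bytes(300));
	return 0;
}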

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fdf71164ee24..b15f1fa4453a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3263,6 +3263,8 @@  int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
 void i915_oa_init_reg_state(struct intel_engine_cs *engine,
 			    struct i915_gem_context *ctx,
 			    uint32_t *reg_state);
+u32 i915_oa_get_perctx_bb_size(struct intel_engine_cs *engine);
+u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 2fc9c85a0d99..d1598f54e63b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1752,6 +1752,71 @@  static int gen8_emit_oa_config(struct i915_request *rq,
 	return 0;
 }
 
+#define MAX_LRI_SIZE (125U)
+
+u32 i915_oa_get_perctx_bb_size(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
+	struct i915_oa_config *oa_config;
+	u32 n_lri;
+
+	/* We only care about RCS. */
+	if (engine->id != RCS)
+		return 0;
+
+	/* Perf not supported. */
+	if (!dev_priv->perf.initialized)
+		return 0;
+
+	/* OA not currently configured. */
+	if (!stream)
+		return 0;
+
+	oa_config = stream->oa_config;
+
+	/* Very unlikely but possible that we have no muxes to configure. */
+	if (!oa_config->mux_regs_len)
+		return 0;
+
+	n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
+		((oa_config->mux_regs_len % MAX_LRI_SIZE) != 0);
+
+	/* Return the size of the MI_LOAD_REGISTER_IMMs + PIPE_CONTROL. */
+	return n_lri * 4 + oa_config->mux_regs_len * 8 + 24;
+}
+
+u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_oa_config *oa_config;
+	u32 i, n_loaded_regs;
+
+	if (i915_oa_get_perctx_bb_size(engine) == 0)
+		return batch;
+
+	oa_config = dev_priv->perf.oa.exclusive_stream->oa_config;
+
+	n_loaded_regs = 0;
+	for (i = 0; i < oa_config->mux_regs_len; i++) {
+		if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
+			u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
+					MAX_LRI_SIZE);
+			*batch++ = MI_LOAD_REGISTER_IMM(n_lri);
+		}
+
+		*batch++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
+		*batch++ = oa_config->mux_regs[i].value;
+		n_loaded_regs++;
+	}
+
+	batch = gen8_emit_pipe_control(batch,
+				       PIPE_CONTROL_MMIO_WRITE,
+				       0);
+
+	return batch;
+}
+
 static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
 						 const struct i915_oa_config *oa_config)
 {
@@ -1829,7 +1894,7 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	/* Switch away from any user context. */
 	ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
 	if (ret)
-		goto out;
+		return ret;
 
 	/*
 	 * The OA register config is setup through the context image. This image
@@ -1846,7 +1911,16 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	 */
 	ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
 	if (ret)
-		goto out;
+		return ret;
+
+	/*
+	 * Reload the workaround batchbuffer to include NOA mux
+	 * reprogramming on context switch, so we don't lose configurations
+	 * after switching away from a context with disabled slices/subslices.
+	 */
+	ret = logical_render_ring_reload_wa_bb(dev_priv->engine[RCS]);
+	if (ret)
+		return ret;
 
 	/* Update all contexts now that we've stalled the submission. */
 	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
@@ -1858,10 +1932,8 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 			continue;
 
 		regs = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
-		if (IS_ERR(regs)) {
-			ret = PTR_ERR(regs);
-			goto out;
-		}
+		if (IS_ERR(regs))
+			return PTR_ERR(regs);
 
 		ce->state->obj->mm.dirty = true;
 		regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
@@ -1871,7 +1943,6 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 		i915_gem_object_unpin_map(ce->state->obj);
 	}
 
- out:
 	return ret;
 }
 
@@ -2213,6 +2284,13 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 	dev_priv->perf.oa.exclusive_stream = stream;
 
+	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+						      stream->oa_config);
+	if (ret)
+		goto err_enable;
+
+	stream->ops = &i915_oa_stream_ops;
+
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 7a5efab3e4fb..a0f72fcda0d9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -169,6 +169,8 @@  static void execlists_init_reg_state(u32 *reg_state,
 				     struct i915_gem_context *ctx,
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
+static void execlists_init_reg_state_wa_bb(u32 *reg_state,
+					   struct intel_engine_cs *engine);
 
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
@@ -1584,16 +1586,28 @@  gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 	return batch;
 }
 
-#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+static u32 *gen_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+	batch = i915_oa_emit_perctx_bb(engine, batch);
+	*batch++ = MI_BATCH_BUFFER_END;
+
+	return batch;
+}
+
+/* Reserve a minimum of 200 dwords for indirect bb */
+#define CTX_WA_BB_MIN_DWORDS (200)
 
 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine,
 			    struct i915_ctx_workarounds *wa_ctx)
 {
 	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
+	u32 n_pages = DIV_ROUND_UP(i915_oa_get_perctx_bb_size(engine) +
+				   4 * CTX_WA_BB_MIN_DWORDS,
+				   PAGE_SIZE);
 	int err;
 
-	obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
+	obj = i915_gem_object_create(engine->i915, n_pages * PAGE_SIZE);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 
@@ -1639,15 +1653,15 @@  static int intel_init_workaround_bb(struct intel_engine_cs *engine,
 	switch (INTEL_GEN(engine->i915)) {
 	case 10:
 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
-		wa_bb_fn[1] = NULL;
+		wa_bb_fn[1] = gen_init_perctx_bb;
 		break;
 	case 9:
 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
-		wa_bb_fn[1] = NULL;
+		wa_bb_fn[1] = gen_init_perctx_bb;
 		break;
 	case 8:
 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
-		wa_bb_fn[1] = NULL;
+		wa_bb_fn[1] = gen_init_perctx_bb;
 		break;
 	default:
 		MISSING_CASE(INTEL_GEN(engine->i915));
@@ -1680,7 +1694,7 @@  static int intel_init_workaround_bb(struct intel_engine_cs *engine,
 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
 	}
 
-	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+	BUG_ON(batch_ptr - batch > wa_ctx->vma->obj->base.size);
 
 	kunmap_atomic(batch);
 	if (ret)
@@ -2321,6 +2335,51 @@  int logical_render_ring_init(struct intel_engine_cs *engine)
 	return logical_ring_init(engine);
 }
 
+int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_ctx_workarounds new_wa_ctx;
+	struct i915_gem_context *ctx;
+	int ret;
+
+	if (WARN_ON(engine->id != RCS))
+		return -EINVAL;
+
+	memset(&new_wa_ctx, 0, sizeof(new_wa_ctx));
+	ret = intel_init_workaround_bb(engine, &new_wa_ctx);
+	if (ret)
+		return ret;
+
+	if (engine->wa_ctx.vma)
+		lrc_destroy_wa_ctx(engine);
+
+	memcpy(&engine->wa_ctx, &new_wa_ctx, sizeof(engine->wa_ctx));
+
+	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
+		struct intel_context *ce = &ctx->engine[RCS];
+		u32 *regs;
+
+		/* Settings will be set upon first use. */
+		if (!ce->state)
+			continue;
+
+		regs = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
+		if (IS_ERR(regs)) {
+			ret = PTR_ERR(regs);
+			break;
+		}
+
+		ce->state->obj->mm.dirty = true;
+		regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
+
+		execlists_init_reg_state_wa_bb(regs, engine);
+
+		i915_gem_object_unpin_map(ce->state->obj);
+	}
+
+	return ret;
+}
+
 int logical_xcs_ring_init(struct intel_engine_cs *engine)
 {
 	logical_ring_setup(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 59d7b86012e9..d91d69a17206 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -71,6 +71,7 @@  enum {
 /* Logical Rings */
 void intel_logical_ring_cleanup(struct intel_engine_cs *engine);
 int logical_render_ring_init(struct intel_engine_cs *engine);
+int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine);
 int logical_xcs_ring_init(struct intel_engine_cs *engine);
 
 /* Logical Ring Contexts */