@@ -3263,6 +3263,8 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
void i915_oa_init_reg_state(struct intel_engine_cs *engine,
struct i915_gem_context *ctx,
uint32_t *reg_state);
+u32 i915_oa_get_perctx_bb_size(struct intel_engine_cs *engine);
+u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch);
/* i915_gem_evict.c */
int __must_check i915_gem_evict_something(struct i915_address_space *vm,
@@ -1752,6 +1752,71 @@ static int gen8_emit_oa_config(struct i915_request *rq,
return 0;
}
+/* Number of register writes grouped under one MI_LOAD_REGISTER_IMM here. */
+#define MAX_LRI_SIZE (125U)
+
+/**
+ * i915_oa_get_perctx_bb_size - size of the NOA mux reprogramming commands
+ * @engine: engine the per-context batch buffer is being built for
+ *
+ * Computes how many bytes i915_oa_emit_perctx_bb() needs for the current OA
+ * configuration: a 4 byte MI_LOAD_REGISTER_IMM header for every group of up
+ * to MAX_LRI_SIZE mux registers, 8 bytes (offset + value) per mux register
+ * and 24 bytes for the trailing PIPE_CONTROL.
+ *
+ * Returns: the size in bytes, or 0 when nothing has to be emitted (engine is
+ * not RCS, perf is not initialized, no exclusive stream is set, or the
+ * stream's OA config has no mux registers).
+ */
+u32 i915_oa_get_perctx_bb_size(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
+	struct i915_oa_config *oa_config;
+	u32 n_lri;
+
+	/* We only care about RCS. */
+	if (engine->id != RCS)
+		return 0;
+
+	/* Perf not supported. */
+	if (!dev_priv->perf.initialized)
+		return 0;
+
+	/* OA not currently configured. */
+	if (!stream)
+		return 0;
+
+	oa_config = stream->oa_config;
+
+	/* Very unlikely but possible that we have no muxes to configure. */
+	if (!oa_config->mux_regs_len)
+		return 0;
+
+	/*
+	 * One LRI header per full group plus one more for a partial group.
+	 * The remainder test must be parenthesised: '+' binds tighter than
+	 * '!=', so without the parentheses the whole sum would be compared
+	 * against zero and n_lri could never exceed 1.
+	 */
+	n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
+		((oa_config->mux_regs_len % MAX_LRI_SIZE) != 0);
+
+	/* Return the size of MI_LOAD_REGISTER_IMMs + PIPE_CONTROL. */
+	return n_lri * 4 + oa_config->mux_regs_len * 8 + 24;
+}
+
+/*
+ * Write the NOA mux reprogramming commands for the current OA config into
+ * @batch and return the advanced write pointer. @batch is returned unchanged
+ * when i915_oa_get_perctx_bb_size() reports that there is nothing to emit.
+ */
+u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+	struct i915_oa_config *oa_config;
+	u32 first, n_regs;
+
+	if (!i915_oa_get_perctx_bb_size(engine))
+		return batch;
+
+	/* A non-zero size implies an exclusive stream with mux registers. */
+	oa_config = engine->i915->perf.oa.exclusive_stream->oa_config;
+	n_regs = oa_config->mux_regs_len;
+
+	/* Emit the registers in groups of at most MAX_LRI_SIZE per LRI. */
+	first = 0;
+	while (first < n_regs) {
+		u32 count = min(n_regs - first, MAX_LRI_SIZE);
+		u32 k;
+
+		*batch++ = MI_LOAD_REGISTER_IMM(count);
+
+		for (k = first; k < first + count; k++) {
+			*batch++ = i915_mmio_reg_offset(oa_config->mux_regs[k].addr);
+			*batch++ = oa_config->mux_regs[k].value;
+		}
+
+		first += count;
+	}
+
+	/* Finish with a PIPE_CONTROL carrying PIPE_CONTROL_MMIO_WRITE. */
+	return gen8_emit_pipe_control(batch, PIPE_CONTROL_MMIO_WRITE, 0);
+}
+
static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
const struct i915_oa_config *oa_config)
{
@@ -1829,7 +1894,7 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
/* Switch away from any user context. */
ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
if (ret)
- goto out;
+ return ret;
/*
* The OA register config is setup through the context image. This image
@@ -1846,7 +1911,16 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
*/
ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
if (ret)
- goto out;
+ return ret;
+
+ /*
+ * Reload the workaround batchbuffer to include NOA muxes
+ * reprogramming on context-switch, so we don't lose configurations
+ * after switch-from a context with disabled slices/subslices.
+ */
+ ret = logical_render_ring_reload_wa_bb(dev_priv->engine[RCS]);
+ if (ret)
+ return ret;
/* Update all contexts now that we've stalled the submission. */
list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
@@ -1858,10 +1932,8 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
continue;
regs = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
- if (IS_ERR(regs)) {
- ret = PTR_ERR(regs);
- goto out;
- }
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
ce->state->obj->mm.dirty = true;
regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
@@ -1871,7 +1943,6 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
i915_gem_object_unpin_map(ce->state->obj);
}
- out:
return ret;
}
@@ -2213,6 +2284,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
dev_priv->perf.oa.exclusive_stream = stream;
+ ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+ stream->oa_config);
+ if (ret)
+ goto err_enable;
+
+ stream->ops = &i915_oa_stream_ops;
+
mutex_unlock(&dev_priv->drm.struct_mutex);
return 0;
@@ -169,6 +169,8 @@ static void execlists_init_reg_state(u32 *reg_state,
struct i915_gem_context *ctx,
struct intel_engine_cs *engine,
struct intel_ring *ring);
+static void execlists_init_reg_state_wa_bb(u32 *reg_state,
+ struct intel_engine_cs *engine);
static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
@@ -1584,16 +1586,28 @@ gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
return batch;
}
-#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+/*
+ * Fill the per-context batch buffer: whatever i915_oa_emit_perctx_bb()
+ * emits, terminated by MI_BATCH_BUFFER_END. Returns the pointer just past
+ * the last dword written.
+ */
+static u32 *gen_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+	u32 *cs = i915_oa_emit_perctx_bb(engine, batch);
+
+	*cs = MI_BATCH_BUFFER_END;
+
+	return cs + 1;
+}
+
+/* Reserve a minimum of 200 dwords for indirect bb */
+#define CTX_WA_BB_MIN_DWORDS (200)
static int lrc_setup_wa_ctx(struct intel_engine_cs *engine,
struct i915_ctx_workarounds *wa_ctx)
{
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
+ u32 n_pages = DIV_ROUND_UP(i915_oa_get_perctx_bb_size(engine) +
+ 4 * CTX_WA_BB_MIN_DWORDS,
+ PAGE_SIZE);
int err;
- obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
+ obj = i915_gem_object_create(engine->i915, n_pages * PAGE_SIZE);
if (IS_ERR(obj))
return PTR_ERR(obj);
@@ -1639,15 +1653,15 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine,
switch (INTEL_GEN(engine->i915)) {
case 10:
wa_bb_fn[0] = gen10_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
+ wa_bb_fn[1] = gen_init_perctx_bb;
break;
case 9:
wa_bb_fn[0] = gen9_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
+ wa_bb_fn[1] = gen_init_perctx_bb;
break;
case 8:
wa_bb_fn[0] = gen8_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
+ wa_bb_fn[1] = gen_init_perctx_bb;
break;
default:
MISSING_CASE(INTEL_GEN(engine->i915));
@@ -1680,7 +1694,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine,
wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
}
- BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+ BUG_ON(batch_ptr - batch > wa_ctx->vma->obj->base.size);
kunmap_atomic(batch);
if (ret)
@@ -2321,6 +2335,51 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
return logical_ring_init(engine);
}
+/*
+ * Rebuild the render engine's workaround batch buffers and repoint every
+ * existing context at them.
+ *
+ * A fresh i915_ctx_workarounds is built first; only once that succeeds is
+ * the previous one destroyed (if it has a vma) and replaced in
+ * engine->wa_ctx. Each context on the device list that already has RCS
+ * state then gets execlists_init_reg_state_wa_bb() applied to its mapped
+ * state page.
+ *
+ * Returns 0 on success, -EINVAL if @engine is not RCS, the error from
+ * intel_init_workaround_bb(), or the error from mapping a context image.
+ * In the last case the contexts not yet visited are left un-updated.
+ */
+int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *i915 = engine->i915;
+	struct i915_ctx_workarounds fresh;
+	struct i915_gem_context *ctx;
+	int err;
+
+	if (WARN_ON(engine->id != RCS))
+		return -EINVAL;
+
+	memset(&fresh, 0, sizeof(fresh));
+	err = intel_init_workaround_bb(engine, &fresh);
+	if (err)
+		return err;
+
+	/* Swap in the new buffers, releasing the old ones if present. */
+	if (engine->wa_ctx.vma)
+		lrc_destroy_wa_ctx(engine);
+	engine->wa_ctx = fresh;
+
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
+		struct intel_context *ce = &ctx->engine[RCS];
+		u32 *image;
+
+		/* Settings will be set upon first use. */
+		if (!ce->state)
+			continue;
+
+		image = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
+		if (IS_ERR(image))
+			return PTR_ERR(image);
+
+		ce->state->obj->mm.dirty = true;
+
+		/* Skip ahead to the register state page of the image. */
+		image += LRC_STATE_PN * PAGE_SIZE / sizeof(*image);
+		execlists_init_reg_state_wa_bb(image, engine);
+
+		i915_gem_object_unpin_map(ce->state->obj);
+	}
+
+	return 0;
+}
+
int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
logical_ring_setup(engine);
@@ -71,6 +71,7 @@ enum {
/* Logical Rings */
void intel_logical_ring_cleanup(struct intel_engine_cs *engine);
int logical_render_ring_init(struct intel_engine_cs *engine);
+int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine);
int logical_xcs_ring_init(struct intel_engine_cs *engine);
/* Logical Ring Contexts */
If some of the contexts submitting workloads to the GPU have been configured to shut down slices/subslices, we might lose the NOA configurations written in the NOA muxes. We need to reprogram them at context switch. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_perf.c | 92 +++++++++++++++++++++++++++++--- drivers/gpu/drm/i915/intel_lrc.c | 71 +++++++++++++++++++++--- drivers/gpu/drm/i915/intel_lrc.h | 1 + 4 files changed, 153 insertions(+), 13 deletions(-)